Skip to content

Commit 675e41e

Browse files
authored
feat: add regexp_instr function (#1382)
* feat: add regexp_instr function. The current implementation of regexp_instr in DataFusion does not support endoption, so None is passed for it in the function that exposes regexp_instr to Python. * chore: add test for all optional arguments * fix: make start truly optional in regexp_count
1 parent 3481904 commit 675e41e

File tree

3 files changed

+91
-3
lines changed

3 files changed

+91
-3
lines changed

python/datafusion/functions.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,7 @@
225225
"range",
226226
"rank",
227227
"regexp_count",
228+
"regexp_instr",
228229
"regexp_like",
229230
"regexp_match",
230231
"regexp_replace",
@@ -816,7 +817,7 @@ def regexp_replace(
816817

817818

818819
def regexp_count(
819-
string: Expr, pattern: Expr, start: Expr, flags: Expr | None = None
820+
string: Expr, pattern: Expr, start: Expr | None = None, flags: Expr | None = None
820821
) -> Expr:
821822
"""Returns the number of matches in a string.
822823
@@ -825,10 +826,42 @@ def regexp_count(
825826
"""
826827
if flags is not None:
827828
flags = flags.expr
828-
start = start.expr if start is not None else Expr.expr
829+
start = start.expr if start is not None else start
829830
return Expr(f.regexp_count(string.expr, pattern.expr, start, flags))
830831

831832

833+
def regexp_instr(
834+
values: Expr,
835+
regex: Expr,
836+
start: Expr | None = None,
837+
n: Expr | None = None,
838+
flags: Expr | None = None,
839+
sub_expr: Expr | None = None,
840+
) -> Expr:
841+
"""Returns the position of a regular expression match in a string.
842+
843+
Searches ``values`` for the ``n``-th occurrence of ``regex``, starting at position
844+
``start`` (the first position is 1), and returns the position where that match
845+
starts. Use ``flags`` to control regex behavior and ``sub_expr`` to return the
846+
position of a specific capture group instead of the entire match.
847+
"""
848+
start = start.expr if start is not None else None
849+
n = n.expr if n is not None else None
850+
flags = flags.expr if flags is not None else None
851+
sub_expr = sub_expr.expr if sub_expr is not None else None
852+
853+
return Expr(
854+
f.regexp_instr(
855+
values.expr,
856+
regex.expr,
857+
start,
858+
n,
859+
flags,
860+
sub_expr,
861+
)
862+
)
863+
864+
832865
def repeat(string: Expr, n: Expr) -> Expr:
833866
"""Returns ``string`` repeated ``n`` times."""
834867
return Expr(f.repeat(string.expr, n.expr))

python/tests/test_functions.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -769,7 +769,38 @@ def test_array_function_obj_tests(stmt, py_expr):
769769
pa.array(["H-o", "W-d", "!"], type=pa.string_view()),
770770
),
771771
(
772-
f.regexp_count(column("a"), literal("(ell|orl)"), literal(1)),
772+
f.regexp_count(column("a"), literal("(ell|orl)"), start=literal(1)),
773+
pa.array([1, 1, 0], type=pa.int64()),
774+
),
775+
(
776+
f.regexp_count(column("a"), literal("(ell|orl)")),
777+
pa.array([1, 1, 0], type=pa.int64()),
778+
),
779+
(
780+
f.regexp_instr(column("a"), literal("(ell|orl)")),
781+
pa.array([2, 2, 0], type=pa.int64()),
782+
),
783+
(
784+
f.regexp_instr(column("a"), literal("([lr])"), n=literal(2)),
785+
pa.array([4, 4, 0], type=pa.int64()),
786+
),
787+
(
788+
f.regexp_instr(
789+
column("a"),
790+
literal("(x)?([hw])"),
791+
start=literal(1),
792+
n=literal(1),
793+
flags=literal("i"),
794+
sub_expr=literal(2),
795+
),
796+
pa.array([1, 1, 0], type=pa.int64()),
797+
),
798+
(
799+
f.regexp_instr(column("a"), literal("([hw])"), flags=literal("i")),
800+
pa.array([1, 1, 0], type=pa.int64()),
801+
),
802+
(
803+
f.regexp_instr(column("a"), literal("(x)?([HW])"), sub_expr=literal(2)),
773804
pa.array([1, 1, 0], type=pa.int64()),
774805
),
775806
],

src/functions.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,29 @@ fn regexp_count(
189189
.into())
190190
}
191191

192+
#[pyfunction]
193+
#[pyo3(signature = (values, regex, start=None, n=None, flags=None, subexpr=None))]
194+
/// Returns the position in a string where the specified occurrence of a regular expression is located. Optional arguments that are omitted are replaced by the literal defaults below; the end option is not exposed and is always passed as `None`.
195+
fn regexp_instr(
196+
values: PyExpr,
197+
regex: PyExpr,
198+
start: Option<PyExpr>,
199+
n: Option<PyExpr>,
200+
flags: Option<PyExpr>,
201+
subexpr: Option<PyExpr>,
202+
) -> PyResult<PyExpr> {
203+
Ok(functions::expr_fn::regexp_instr(
204+
values.into(),
205+
regex.into(),
206+
start.map(|x| x.expr).or(Some(lit(1))), // default: search from position 1
207+
n.map(|x| x.expr).or(Some(lit(1))), // default: first occurrence
208+
None, // end option: absent from the Python-facing signature, so it is always unset
209+
flags.map(|x| x.expr).or(Some(lit(""))), // default: empty flags string
210+
subexpr.map(|x| x.expr).or(Some(lit(0))), // default: 0 (presumably the whole match; confirm against DataFusion docs)
211+
)
212+
.into())
213+
}
214+
192215
/// Creates a new Sort Expr
193216
#[pyfunction]
194217
fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult<PySortExpr> {
@@ -988,6 +1011,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
9881011
m.add_wrapped(wrap_pyfunction!(radians))?;
9891012
m.add_wrapped(wrap_pyfunction!(random))?;
9901013
m.add_wrapped(wrap_pyfunction!(regexp_count))?;
1014+
m.add_wrapped(wrap_pyfunction!(regexp_instr))?;
9911015
m.add_wrapped(wrap_pyfunction!(regexp_like))?;
9921016
m.add_wrapped(wrap_pyfunction!(regexp_match))?;
9931017
m.add_wrapped(wrap_pyfunction!(regexp_replace))?;

0 commit comments

Comments
 (0)