Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions datafusion-examples/examples/builtin_functions/regexp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -340,5 +340,87 @@ pub async fn regexp() -> Result<()> {
&result
);

//
//
// regexp_extract examples
//
//
// The call format is regexp_extract(str, regexp, idx).
// idx=0 returns the entire matched string; idx=N returns the Nth capture group.
// If the regex does not match, or the requested group does not match, an empty string is returned.
//

// extract year from date string (group 1)
let result = ctx
.sql("SELECT regexp_extract('2024-03-16', '(\\d{4})-(\\d{2})-(\\d{2})', 1)")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+-----------------------------------------------------------------------------+",
r#"| regexp_extract(Utf8("2024-03-16"),Utf8("(\d{4})-(\d{2})-(\d{2})"),Int64(1)) |"#,
"+-----------------------------------------------------------------------------+",
"| 2024 |",
"+-----------------------------------------------------------------------------+",
],
&result
);

// idx=0 returns the entire matched string
let result = ctx
.sql("SELECT regexp_extract('2024-03-16', '(\\d{4})-(\\d{2})-(\\d{2})', 0)")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+-----------------------------------------------------------------------------+",
r#"| regexp_extract(Utf8("2024-03-16"),Utf8("(\d{4})-(\d{2})-(\d{2})"),Int64(0)) |"#,
"+-----------------------------------------------------------------------------+",
"| 2024-03-16 |",
"+-----------------------------------------------------------------------------+",
],
&result
);

// no match returns empty string (not NULL)
let result = ctx
.sql("SELECT regexp_extract('no digits here', '(\\d+)', 1)")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+---------------------------------------------------------------+",
r#"| regexp_extract(Utf8("no digits here"),Utf8("(\d+)"),Int64(1)) |"#,
"+---------------------------------------------------------------+",
"| |",
"+---------------------------------------------------------------+",
],
&result
);

// NULL input returns NULL
let result = ctx
.sql("SELECT regexp_extract(NULL, '(\\d+)', 1)")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+---------------------------------------------+",
r#"| regexp_extract(NULL,Utf8("(\d+)"),Int64(1)) |"#,
"+---------------------------------------------+",
"| |",
"+---------------------------------------------+",
],
&result
);

Ok(())
}
8 changes: 8 additions & 0 deletions datafusion/functions/src/regex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@ use std::collections::HashMap;
use std::collections::hash_map::Entry;
use std::sync::Arc;
pub mod regexpcount;
pub mod regexpextract;
pub mod regexpinstr;
pub mod regexplike;
pub mod regexpmatch;
pub mod regexpreplace;

// create UDFs
make_udf_function!(regexpcount::RegexpCountFunc, regexp_count);
make_udf_function!(regexpextract::RegexpExtractFunc, regexp_extract);
make_udf_function!(regexpinstr::RegexpInstrFunc, regexp_instr);
make_udf_function!(regexpmatch::RegexpMatchFunc, regexp_match);
make_udf_function!(regexplike::RegexpLikeFunc, regexp_like);
Expand Down Expand Up @@ -102,6 +104,11 @@ pub mod expr_fn {
super::regexp_like().call(args)
}

/// Extracts the first match of a regular expression capture group from a string.
///
/// Builds an expression that calls the `regexp_extract` UDF with the arguments
/// `(str, regexp, idx)`, in that order.
///
/// * `str` - expression producing the string to search.
/// * `regexp` - expression producing the regular expression pattern.
/// * `idx` - expression producing the group index: `0` selects the entire
///   matched string, `N` selects the Nth capture group.
pub fn regexp_extract(str: Expr, regexp: Expr, idx: Expr) -> Expr {
    // Argument order must match the UDF signature: (str, regexp, idx).
    let args = vec![str, regexp, idx];
    super::regexp_extract().call(args)
}

/// Replaces substrings in a string that match.
pub fn regexp_replace(
string: Expr,
Expand All @@ -121,6 +128,7 @@ pub mod expr_fn {
pub fn functions() -> Vec<Arc<datafusion_expr::ScalarUDF>> {
vec![
regexp_count(),
regexp_extract(),
regexp_match(),
regexp_instr(),
regexp_like(),
Expand Down
Loading