Skip to content

Commit fce331a

Browse files
jonathanc-n and alamb
authored
Migrate Regex Functions from static docs (#12886)
* regex migrate
* small fixes
* update docs

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent f2564b7 commit fce331a

File tree

4 files changed

+178
-102
lines changed

4 files changed

+178
-102
lines changed

datafusion/functions/src/regex/regexpmatch.rs

Lines changed: 47 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -26,10 +26,11 @@ use datafusion_common::{arrow_datafusion_err, plan_err};
2626
use datafusion_common::{
2727
cast::as_generic_string_array, internal_err, DataFusionError, Result,
2828
};
29-
use datafusion_expr::{ColumnarValue, TypeSignature};
29+
use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
30+
use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
3031
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
3132
use std::any::Any;
32-
use std::sync::Arc;
33+
use std::sync::{Arc, OnceLock};
3334

3435
#[derive(Debug)]
3536
pub struct RegexpMatchFunc {
@@ -106,7 +107,51 @@ impl ScalarUDFImpl for RegexpMatchFunc {
106107
result.map(ColumnarValue::Array)
107108
}
108109
}
110+
111+
fn documentation(&self) -> Option<&Documentation> {
112+
Some(get_regexp_match_doc())
113+
}
114+
}
115+
116+
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
117+
118+
fn get_regexp_match_doc() -> &'static Documentation {
119+
DOCUMENTATION.get_or_init(|| {
120+
Documentation::builder()
121+
.with_doc_section(DOC_SECTION_REGEX)
122+
.with_description("Returns a list of [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.")
123+
.with_syntax_example("regexp_match(str, regexp[, flags])")
124+
.with_sql_example(r#"```sql
125+
> select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
126+
+---------------------------------------------------------+
127+
| regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
128+
+---------------------------------------------------------+
129+
| [Köln] |
130+
+---------------------------------------------------------+
131+
SELECT regexp_match('aBc', '(b|d)', 'i');
132+
+---------------------------------------------------+
133+
| regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
134+
+---------------------------------------------------+
135+
| [B] |
136+
+---------------------------------------------------+
137+
```
138+
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
139+
"#)
140+
.with_standard_argument("str", "String")
141+
.with_argument("regexp","Regular expression to match against.
142+
Can be a constant, column, or function.")
143+
.with_argument("flags",
144+
r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
145+
- **i**: case-insensitive: letters match both upper and lower case
146+
- **m**: multi-line mode: ^ and $ match begin/end of line
147+
- **s**: allow . to match \n
148+
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
149+
- **U**: swap the meaning of x* and x*?"#)
150+
.build()
151+
.unwrap()
152+
})
109153
}
154+
110155
fn regexp_match_func(args: &[ArrayRef]) -> Result<ArrayRef> {
111156
match args[0].data_type() {
112157
DataType::Utf8 => regexp_match::<i32>(args),

datafusion/functions/src/regex/regexpreplace.rs

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,15 @@ use datafusion_common::{
3232
cast::as_generic_string_array, internal_err, DataFusionError, Result,
3333
};
3434
use datafusion_expr::function::Hint;
35+
use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
3536
use datafusion_expr::ColumnarValue;
3637
use datafusion_expr::TypeSignature;
37-
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
38+
use datafusion_expr::{Documentation, ScalarUDFImpl, Signature, Volatility};
3839
use regex::Regex;
3940
use std::any::Any;
4041
use std::collections::HashMap;
41-
use std::sync::Arc;
42-
use std::sync::OnceLock;
42+
use std::sync::{Arc, OnceLock};
43+
4344
#[derive(Debug)]
4445
pub struct RegexpReplaceFunc {
4546
signature: Signature,
@@ -123,6 +124,51 @@ impl ScalarUDFImpl for RegexpReplaceFunc {
123124
result.map(ColumnarValue::Array)
124125
}
125126
}
127+
128+
fn documentation(&self) -> Option<&Documentation> {
129+
Some(get_regexp_replace_doc())
130+
}
131+
}
132+
133+
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
134+
135+
fn get_regexp_replace_doc() -> &'static Documentation {
136+
DOCUMENTATION.get_or_init(|| {
137+
Documentation::builder()
138+
.with_doc_section(DOC_SECTION_REGEX)
139+
.with_description("Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax).")
140+
.with_syntax_example("regexp_replace(str, regexp, replacement[, flags])")
141+
.with_sql_example(r#"```sql
142+
> select regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g');
143+
+------------------------------------------------------------------------+
144+
| regexp_replace(Utf8("foobarbaz"),Utf8("b(..)"),Utf8("X\1Y"),Utf8("g")) |
145+
+------------------------------------------------------------------------+
146+
| fooXarYXazY |
147+
+------------------------------------------------------------------------+
148+
SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
149+
+-------------------------------------------------------------------+
150+
| regexp_replace(Utf8("aBc"),Utf8("(b|d)"),Utf8("Ab\1a"),Utf8("i")) |
151+
+-------------------------------------------------------------------+
152+
| aAbBac |
153+
+-------------------------------------------------------------------+
154+
```
155+
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
156+
"#)
157+
.with_standard_argument("str", "String")
158+
.with_argument("regexp","Regular expression to match against.
159+
Can be a constant, column, or function.")
160+
.with_standard_argument("replacement", "Replacement string")
161+
.with_argument("flags",
162+
r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
163+
- **g**: (global) Search globally and don't return after the first match
164+
- **i**: case-insensitive: letters match both upper and lower case
165+
- **m**: multi-line mode: ^ and $ match begin/end of line
166+
- **s**: allow . to match \n
167+
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
168+
- **U**: swap the meaning of x* and x*?"#)
169+
.build()
170+
.unwrap()
171+
})
126172
}
127173

128174
fn regexp_replace_func(args: &[ColumnarValue]) -> Result<ArrayRef> {

docs/source/user-guide/sql/scalar_functions.md

Lines changed: 0 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -563,103 +563,6 @@ See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/
563563

564564
See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/scalar_functions_new.html)
565565

566-
## Regular Expression Functions
567-
568-
Apache DataFusion uses a [PCRE-like] regular expression [syntax]
569-
(minus support for several features including look-around and backreferences).
570-
The following regular expression functions are supported:
571-
572-
- [regexp_match](#regexp_match)
573-
- [regexp_replace](#regexp_replace)
574-
575-
[pcre-like]: https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions
576-
[syntax]: https://docs.rs/regex/latest/regex/#syntax
577-
578-
### `regexp_match`
579-
580-
Returns a list of [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.
581-
582-
```
583-
regexp_match(str, regexp[, flags])
584-
```
585-
586-
#### Arguments
587-
588-
- **str**: String expression to operate on.
589-
Can be a constant, column, or function, and any combination of string operators.
590-
- **regexp**: Regular expression to match against.
591-
Can be a constant, column, or function.
592-
- **flags**: Optional regular expression flags that control the behavior of the
593-
regular expression. The following flags are supported:
594-
- **i**: case-insensitive: letters match both upper and lower case
595-
- **m**: multi-line mode: ^ and $ match begin/end of line
596-
- **s**: allow . to match \n
597-
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
598-
- **U**: swap the meaning of x* and x*?
599-
600-
#### Example
601-
602-
```sql
603-
select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
604-
+---------------------------------------------------------+
605-
| regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
606-
+---------------------------------------------------------+
607-
| [Köln] |
608-
+---------------------------------------------------------+
609-
SELECT regexp_match('aBc', '(b|d)', 'i');
610-
+---------------------------------------------------+
611-
| regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
612-
+---------------------------------------------------+
613-
| [B] |
614-
+---------------------------------------------------+
615-
```
616-
617-
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
618-
619-
### `regexp_replace`
620-
621-
Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax).
622-
623-
```
624-
regexp_replace(str, regexp, replacement[, flags])
625-
```
626-
627-
#### Arguments
628-
629-
- **str**: String expression to operate on.
630-
Can be a constant, column, or function, and any combination of string operators.
631-
- **regexp**: Regular expression to match against.
632-
Can be a constant, column, or function.
633-
- **replacement**: Replacement string expression.
634-
Can be a constant, column, or function, and any combination of string operators.
635-
- **flags**: Optional regular expression flags that control the behavior of the
636-
regular expression. The following flags are supported:
637-
- **g**: (global) Search globally and don't return after the first match
638-
- **i**: case-insensitive: letters match both upper and lower case
639-
- **m**: multi-line mode: ^ and $ match begin/end of line
640-
- **s**: allow . to match \n
641-
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
642-
- **U**: swap the meaning of x* and x*?
643-
644-
#### Example
645-
646-
```sql
647-
SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g');
648-
+------------------------------------------------------------------------+
649-
| regexp_replace(Utf8("foobarbaz"),Utf8("b(..)"),Utf8("X\1Y"),Utf8("g")) |
650-
+------------------------------------------------------------------------+
651-
| fooXarYXazY |
652-
+------------------------------------------------------------------------+
653-
SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
654-
+-------------------------------------------------------------------+
655-
| regexp_replace(Utf8("aBc"),Utf8("(b|d)"),Utf8("Ab\1a"),Utf8("i")) |
656-
+-------------------------------------------------------------------+
657-
| aAbBac |
658-
+-------------------------------------------------------------------+
659-
```
660-
661-
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
662-
663566
### `position`
664567

665568
Returns the position of `substr` in `origstr` (counting from 1). If `substr` does

docs/source/user-guide/sql/scalar_functions_new.md

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1191,6 +1191,8 @@ regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax)
11911191
The following regular expression functions are supported:
11921192

11931193
- [regexp_like](#regexp_like)
1194+
- [regexp_match](#regexp_match)
1195+
- [regexp_replace](#regexp_replace)
11941196

11951197
### `regexp_like`
11961198

@@ -1230,6 +1232,86 @@ SELECT regexp_like('aBc', '(b|d)', 'i');
12301232

12311233
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
12321234

1235+
### `regexp_match`
1236+
1237+
Returns a list of [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.
1238+
1239+
```
1240+
regexp_match(str, regexp[, flags])
1241+
```
1242+
1243+
#### Arguments
1244+
1245+
- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
1246+
- **regexp**: Regular expression to match against.
1247+
Can be a constant, column, or function.
1248+
- **flags**: Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
1249+
- **i**: case-insensitive: letters match both upper and lower case
1250+
- **m**: multi-line mode: ^ and $ match begin/end of line
1251+
- **s**: allow . to match \n
1252+
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
1253+
- **U**: swap the meaning of x* and x*?
1254+
1255+
#### Example
1256+
1257+
```sql
1258+
> select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
1259+
+---------------------------------------------------------+
1260+
| regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
1261+
+---------------------------------------------------------+
1262+
| [Köln] |
1263+
+---------------------------------------------------------+
1264+
SELECT regexp_match('aBc', '(b|d)', 'i');
1265+
+---------------------------------------------------+
1266+
| regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
1267+
+---------------------------------------------------+
1268+
| [B] |
1269+
+---------------------------------------------------+
1270+
```
1271+
1272+
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
1273+
1274+
### `regexp_replace`
1275+
1276+
Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax).
1277+
1278+
```
1279+
regexp_replace(str, regexp, replacement[, flags])
1280+
```
1281+
1282+
#### Arguments
1283+
1284+
- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
1285+
- **regexp**: Regular expression to match against.
1286+
Can be a constant, column, or function.
1287+
- **replacement**: Replacement string expression to operate on. Can be a constant, column, or function, and any combination of operators.
1288+
- **flags**: Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
1289+
- **g**: (global) Search globally and don't return after the first match
1290+
- **i**: case-insensitive: letters match both upper and lower case
1291+
- **m**: multi-line mode: ^ and $ match begin/end of line
1292+
- **s**: allow . to match \n
1293+
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
1294+
- **U**: swap the meaning of x* and x*?
1295+
1296+
#### Example
1297+
1298+
```sql
1299+
> select regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g');
1300+
+------------------------------------------------------------------------+
1301+
| regexp_replace(Utf8("foobarbaz"),Utf8("b(..)"),Utf8("X\1Y"),Utf8("g")) |
1302+
+------------------------------------------------------------------------+
1303+
| fooXarYXazY |
1304+
+------------------------------------------------------------------------+
1305+
SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
1306+
+-------------------------------------------------------------------+
1307+
| regexp_replace(Utf8("aBc"),Utf8("(b|d)"),Utf8("Ab\1a"),Utf8("i")) |
1308+
+-------------------------------------------------------------------+
1309+
| aAbBac |
1310+
+-------------------------------------------------------------------+
1311+
```
1312+
1313+
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
1314+
12331315
## Time and Date Functions
12341316

12351317
- [to_date](#to_date)

0 commit comments

Comments
 (0)