Skip to content

Commit 67e0bd3

Browse files
authored
Move levenshtein, uuid, overlay to datafusion-functions (#9760)
* Fix to_timestamp benchmark * Remove reference to simd and nightly build as simd is no longer an available feature in DataFusion and building with nightly may not be a good recommendation when getting started. * Fixed missing trim() function. * Move levenshtein, uuid, overlay to datafusion-functions
1 parent b1f3774 commit 67e0bd3

File tree

20 files changed

+458
-334
lines changed

20 files changed

+458
-334
lines changed

datafusion-cli/Cargo.lock

Lines changed: 1 addition & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion-examples/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,4 +76,4 @@ tempfile = { workspace = true }
7676
tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] }
7777
tonic = "0.11"
7878
url = { workspace = true }
79-
uuid = "1.2"
79+
uuid = "1.7"

datafusion/core/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ tempfile = { workspace = true }
122122
tokio = { workspace = true }
123123
tokio-util = { version = "0.7.4", features = ["io"], optional = true }
124124
url = { workspace = true }
125-
uuid = { version = "1.0", features = ["v4"] }
125+
uuid = { version = "1.7", features = ["v4"] }
126126
xz2 = { version = "0.1", optional = true, features = ["static"] }
127127
zstd = { version = "0.13", optional = true, default-features = false }
128128

datafusion/expr/src/built_in_function.rs

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -141,12 +141,6 @@ pub enum BuiltinScalarFunction {
141141
Substr,
142142
/// translate
143143
Translate,
144-
/// uuid
145-
Uuid,
146-
/// overlay
147-
OverLay,
148-
/// levenshtein
149-
Levenshtein,
150144
/// substr_index
151145
SubstrIndex,
152146
/// find_in_set
@@ -253,14 +247,11 @@ impl BuiltinScalarFunction {
253247
BuiltinScalarFunction::Strpos => Volatility::Immutable,
254248
BuiltinScalarFunction::Substr => Volatility::Immutable,
255249
BuiltinScalarFunction::Translate => Volatility::Immutable,
256-
BuiltinScalarFunction::OverLay => Volatility::Immutable,
257-
BuiltinScalarFunction::Levenshtein => Volatility::Immutable,
258250
BuiltinScalarFunction::SubstrIndex => Volatility::Immutable,
259251
BuiltinScalarFunction::FindInSet => Volatility::Immutable,
260252

261253
// Volatile builtin functions
262254
BuiltinScalarFunction::Random => Volatility::Volatile,
263-
BuiltinScalarFunction::Uuid => Volatility::Volatile,
264255
}
265256
}
266257

@@ -302,7 +293,6 @@ impl BuiltinScalarFunction {
302293
BuiltinScalarFunction::Lpad => utf8_to_str_type(&input_expr_types[0], "lpad"),
303294
BuiltinScalarFunction::Pi => Ok(Float64),
304295
BuiltinScalarFunction::Random => Ok(Float64),
305-
BuiltinScalarFunction::Uuid => Ok(Utf8),
306296
BuiltinScalarFunction::Repeat => {
307297
utf8_to_str_type(&input_expr_types[0], "repeat")
308298
}
@@ -362,14 +352,6 @@ impl BuiltinScalarFunction {
362352

363353
BuiltinScalarFunction::Iszero => Ok(Boolean),
364354

365-
BuiltinScalarFunction::OverLay => {
366-
utf8_to_str_type(&input_expr_types[0], "overlay")
367-
}
368-
369-
BuiltinScalarFunction::Levenshtein => {
370-
utf8_to_int_type(&input_expr_types[0], "levenshtein")
371-
}
372-
373355
BuiltinScalarFunction::Atan
374356
| BuiltinScalarFunction::Acosh
375357
| BuiltinScalarFunction::Asinh
@@ -490,7 +472,6 @@ impl BuiltinScalarFunction {
490472
}
491473
BuiltinScalarFunction::Pi => Signature::exact(vec![], self.volatility()),
492474
BuiltinScalarFunction::Random => Signature::exact(vec![], self.volatility()),
493-
BuiltinScalarFunction::Uuid => Signature::exact(vec![], self.volatility()),
494475
BuiltinScalarFunction::Power => Signature::one_of(
495476
vec![Exact(vec![Int64, Int64]), Exact(vec![Float64, Float64])],
496477
self.volatility(),
@@ -536,19 +517,6 @@ impl BuiltinScalarFunction {
536517
BuiltinScalarFunction::Gcd | BuiltinScalarFunction::Lcm => {
537518
Signature::uniform(2, vec![Int64], self.volatility())
538519
}
539-
BuiltinScalarFunction::OverLay => Signature::one_of(
540-
vec![
541-
Exact(vec![Utf8, Utf8, Int64, Int64]),
542-
Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]),
543-
Exact(vec![Utf8, Utf8, Int64]),
544-
Exact(vec![LargeUtf8, LargeUtf8, Int64]),
545-
],
546-
self.volatility(),
547-
),
548-
BuiltinScalarFunction::Levenshtein => Signature::one_of(
549-
vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])],
550-
self.volatility(),
551-
),
552520
BuiltinScalarFunction::Atan
553521
| BuiltinScalarFunction::Acosh
554522
| BuiltinScalarFunction::Asinh
@@ -678,11 +646,8 @@ impl BuiltinScalarFunction {
678646
BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"],
679647
BuiltinScalarFunction::Substr => &["substr"],
680648
BuiltinScalarFunction::Translate => &["translate"],
681-
BuiltinScalarFunction::Uuid => &["uuid"],
682-
BuiltinScalarFunction::Levenshtein => &["levenshtein"],
683649
BuiltinScalarFunction::SubstrIndex => &["substr_index", "substring_index"],
684650
BuiltinScalarFunction::FindInSet => &["find_in_set"],
685-
BuiltinScalarFunction::OverLay => &["overlay"],
686651
}
687652
}
688653
}

datafusion/expr/src/expr_fn.rs

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -575,7 +575,6 @@ scalar_expr!(Log10, log10, num, "base 10 logarithm of number");
575575
scalar_expr!(Ln, ln, num, "natural logarithm (base e) of number");
576576
scalar_expr!(Power, power, base exponent, "`base` raised to the power of `exponent`");
577577
scalar_expr!(Atan2, atan2, y x, "inverse tangent of a division given in the argument");
578-
scalar_expr!(Uuid, uuid, , "returns uuid v4 as a string value");
579578
scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`");
580579

581580
// string functions
@@ -628,12 +627,6 @@ nary_scalar_expr!(
628627
"concatenates several strings, placing a seperator between each one"
629628
);
630629
nary_scalar_expr!(Concat, concat_expr, "concatenates several strings");
631-
nary_scalar_expr!(
632-
OverLay,
633-
overlay,
634-
"replace the substring of string that starts at the start'th character and extends for count characters with new substring"
635-
);
636-
637630
scalar_expr!(Nanvl, nanvl, x y, "returns x if x is not NaN otherwise returns y");
638631
scalar_expr!(
639632
Iszero,
@@ -642,7 +635,6 @@ scalar_expr!(
642635
"returns true if a given number is +0.0 or -0.0 otherwise returns false"
643636
);
644637

645-
scalar_expr!(Levenshtein, levenshtein, string1 string2, "Returns the Levenshtein distance between the two given strings");
646638
scalar_expr!(SubstrIndex, substr_index, string delimiter count, "Returns the substring from str before count occurrences of the delimiter");
647639
scalar_expr!(FindInSet, find_in_set, str strlist, "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings");
648640

@@ -1076,25 +1068,7 @@ mod test {
10761068
test_scalar_expr!(Substr, substr, string, position);
10771069
test_scalar_expr!(Substr, substring, string, position, count);
10781070
test_scalar_expr!(Translate, translate, string, from, to);
1079-
test_nary_scalar_expr!(OverLay, overlay, string, characters, position, len);
1080-
test_nary_scalar_expr!(OverLay, overlay, string, characters, position);
1081-
test_scalar_expr!(Levenshtein, levenshtein, string1, string2);
10821071
test_scalar_expr!(SubstrIndex, substr_index, string, delimiter, count);
10831072
test_scalar_expr!(FindInSet, find_in_set, string, stringlist);
10841073
}
1085-
1086-
#[test]
1087-
fn uuid_function_definitions() {
1088-
if let Expr::ScalarFunction(ScalarFunction {
1089-
func_def: ScalarFunctionDefinition::BuiltIn(fun),
1090-
args,
1091-
}) = uuid()
1092-
{
1093-
let name = BuiltinScalarFunction::Uuid;
1094-
assert_eq!(name, fun);
1095-
assert_eq!(0, args.len());
1096-
} else {
1097-
unreachable!();
1098-
}
1099-
}
11001074
}

datafusion/functions/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ log = { workspace = true }
7575
md-5 = { version = "^0.10.0", optional = true }
7676
regex = { version = "1.8", optional = true }
7777
sha2 = { version = "^0.10.1", optional = true }
78+
uuid = { version = "1.7", features = ["v4"] }
79+
7880
[dev-dependencies]
7981
criterion = "0.5"
8082
rand = { workspace = true }
datafusion/functions/src/string/levenshtein.rs

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use std::any::Any;
19+
use std::sync::Arc;
20+
21+
use arrow::array::{ArrayRef, Int32Array, Int64Array, OffsetSizeTrait};
22+
use arrow::datatypes::DataType;
23+
24+
use datafusion_common::cast::as_generic_string_array;
25+
use datafusion_common::utils::datafusion_strsim;
26+
use datafusion_common::{exec_err, Result};
27+
use datafusion_expr::ColumnarValue;
28+
use datafusion_expr::TypeSignature::*;
29+
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
30+
31+
use crate::string::common::{make_scalar_function, utf8_to_int_type};
32+
33+
#[derive(Debug)]
34+
pub(super) struct LevenshteinFunc {
35+
signature: Signature,
36+
}
37+
38+
impl LevenshteinFunc {
39+
pub fn new() -> Self {
40+
use DataType::*;
41+
Self {
42+
signature: Signature::one_of(
43+
vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])],
44+
Volatility::Immutable,
45+
),
46+
}
47+
}
48+
}
49+
50+
impl ScalarUDFImpl for LevenshteinFunc {
51+
fn as_any(&self) -> &dyn Any {
52+
self
53+
}
54+
55+
fn name(&self) -> &str {
56+
"levenshtein"
57+
}
58+
59+
fn signature(&self) -> &Signature {
60+
&self.signature
61+
}
62+
63+
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
64+
utf8_to_int_type(&arg_types[0], "levenshtein")
65+
}
66+
67+
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
68+
match args[0].data_type() {
69+
DataType::Utf8 => make_scalar_function(levenshtein::<i32>, vec![])(args),
70+
DataType::LargeUtf8 => make_scalar_function(levenshtein::<i64>, vec![])(args),
71+
other => {
72+
exec_err!("Unsupported data type {other:?} for function levenshtein")
73+
}
74+
}
75+
}
76+
}
77+
78+
///Returns the Levenshtein distance between the two given strings.
79+
/// LEVENSHTEIN('kitten', 'sitting') = 3
80+
pub fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
81+
if args.len() != 2 {
82+
return exec_err!(
83+
"levenshtein function requires two arguments, got {}",
84+
args.len()
85+
);
86+
}
87+
let str1_array = as_generic_string_array::<T>(&args[0])?;
88+
let str2_array = as_generic_string_array::<T>(&args[1])?;
89+
match args[0].data_type() {
90+
DataType::Utf8 => {
91+
let result = str1_array
92+
.iter()
93+
.zip(str2_array.iter())
94+
.map(|(string1, string2)| match (string1, string2) {
95+
(Some(string1), Some(string2)) => {
96+
Some(datafusion_strsim::levenshtein(string1, string2) as i32)
97+
}
98+
_ => None,
99+
})
100+
.collect::<Int32Array>();
101+
Ok(Arc::new(result) as ArrayRef)
102+
}
103+
DataType::LargeUtf8 => {
104+
let result = str1_array
105+
.iter()
106+
.zip(str2_array.iter())
107+
.map(|(string1, string2)| match (string1, string2) {
108+
(Some(string1), Some(string2)) => {
109+
Some(datafusion_strsim::levenshtein(string1, string2) as i64)
110+
}
111+
_ => None,
112+
})
113+
.collect::<Int64Array>();
114+
Ok(Arc::new(result) as ArrayRef)
115+
}
116+
other => {
117+
exec_err!(
118+
"levenshtein was called with {other} datatype arguments. It requires Utf8 or LargeUtf8."
119+
)
120+
}
121+
}
122+
}
123+
124+
#[cfg(test)]
mod tests {
    use arrow::array::{Int32Array, Int64Array, LargeStringArray, StringArray};

    use datafusion_common::cast::{as_int32_array, as_int64_array};

    use super::*;

    /// `Utf8` inputs produce an `Int32Array` of row-wise distances.
    #[test]
    fn to_levenshtein() -> Result<()> {
        let string1_array: ArrayRef =
            Arc::new(StringArray::from(vec!["123", "abc", "xyz", "kitten"]));
        let string2_array: ArrayRef =
            Arc::new(StringArray::from(vec!["321", "def", "zyx", "sitting"]));
        // The test already returns `Result`, so propagate errors with `?`
        // instead of panicking through `unwrap`/`expect`.
        let res = levenshtein::<i32>(&[string1_array, string2_array])?;
        let result = as_int32_array(&res)?;
        let expected = Int32Array::from(vec![2, 3, 2, 3]);
        assert_eq!(&expected, result);

        Ok(())
    }

    /// `LargeUtf8` inputs take the 64-bit branch and produce an `Int64Array`.
    #[test]
    fn to_levenshtein_large_utf8() -> Result<()> {
        let string1_array: ArrayRef =
            Arc::new(LargeStringArray::from(vec!["123", "abc", "xyz", "kitten"]));
        let string2_array: ArrayRef =
            Arc::new(LargeStringArray::from(vec!["321", "def", "zyx", "sitting"]));
        let res = levenshtein::<i64>(&[string1_array, string2_array])?;
        let result = as_int64_array(&res)?;
        let expected = Int64Array::from(vec![2, 3, 2, 3]);
        assert_eq!(&expected, result);

        Ok(())
    }

    /// A null on either side of a row yields a null distance for that row.
    #[test]
    fn to_levenshtein_propagates_nulls() -> Result<()> {
        let string1_array: ArrayRef =
            Arc::new(StringArray::from(vec![Some("abc"), None, Some("x")]));
        let string2_array: ArrayRef =
            Arc::new(StringArray::from(vec![Some("abd"), Some("x"), None]));
        let res = levenshtein::<i32>(&[string1_array, string2_array])?;
        let result = as_int32_array(&res)?;
        let expected = Int32Array::from(vec![Some(1), None, None]);
        assert_eq!(&expected, result);

        Ok(())
    }

    /// Anything other than exactly two arguments is rejected with an error.
    #[test]
    fn to_levenshtein_rejects_wrong_arity() {
        let only_array: ArrayRef = Arc::new(StringArray::from(vec!["abc"]));
        assert!(levenshtein::<i32>(&[only_array]).is_err());
    }
}

0 commit comments

Comments
 (0)