Skip to content

Commit 3bf06d3

Browse files
Issue-9660 - Extract array_to_string and string_to_array from kernels and udf containers (#9704)
1 parent 1d0171a commit 3bf06d3

File tree

5 files changed

+502
-464
lines changed

5 files changed

+502
-464
lines changed

datafusion/functions-array/src/kernels.rs

Lines changed: 5 additions & 324 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,8 @@
1818
//! implementation kernels for array functions
1919
2020
use arrow::array::{
21-
Array, ArrayRef, BooleanArray, Capacities, Date32Array, Float32Array, Float64Array,
22-
GenericListArray, Int16Array, Int32Array, Int64Array, Int8Array, LargeListArray,
23-
LargeStringArray, ListArray, ListBuilder, MutableArrayData, OffsetSizeTrait,
24-
StringArray, StringBuilder, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
21+
Array, ArrayRef, BooleanArray, Capacities, Date32Array, GenericListArray, Int64Array,
22+
LargeListArray, ListArray, MutableArrayData, OffsetSizeTrait, UInt64Array,
2523
};
2624
use arrow::compute;
2725
use arrow::datatypes::{
@@ -33,335 +31,18 @@ use arrow_schema::FieldRef;
3331
use arrow_schema::SortOptions;
3432

3533
use datafusion_common::cast::{
36-
as_date32_array, as_generic_list_array, as_generic_string_array, as_int64_array,
37-
as_interval_mdn_array, as_large_list_array, as_list_array, as_null_array,
38-
as_string_array,
34+
as_date32_array, as_generic_list_array, as_int64_array, as_interval_mdn_array,
35+
as_large_list_array, as_list_array, as_null_array, as_string_array,
3936
};
4037
use datafusion_common::{
4138
exec_err, internal_datafusion_err, not_impl_datafusion_err, DataFusionError, Result,
4239
ScalarValue,
4340
};
4441

42+
use crate::utils::downcast_arg;
4543
use std::any::type_name;
4644
use std::sync::Arc;
4745

48-
macro_rules! downcast_arg {
49-
($ARG:expr, $ARRAY_TYPE:ident) => {{
50-
$ARG.as_any().downcast_ref::<$ARRAY_TYPE>().ok_or_else(|| {
51-
DataFusionError::Internal(format!(
52-
"could not cast to {}",
53-
type_name::<$ARRAY_TYPE>()
54-
))
55-
})?
56-
}};
57-
}
58-
59-
macro_rules! to_string {
60-
($ARG:expr, $ARRAY:expr, $DELIMITER:expr, $NULL_STRING:expr, $WITH_NULL_STRING:expr, $ARRAY_TYPE:ident) => {{
61-
let arr = downcast_arg!($ARRAY, $ARRAY_TYPE);
62-
for x in arr {
63-
match x {
64-
Some(x) => {
65-
$ARG.push_str(&x.to_string());
66-
$ARG.push_str($DELIMITER);
67-
}
68-
None => {
69-
if $WITH_NULL_STRING {
70-
$ARG.push_str($NULL_STRING);
71-
$ARG.push_str($DELIMITER);
72-
}
73-
}
74-
}
75-
}
76-
Ok($ARG)
77-
}};
78-
}
79-
80-
macro_rules! call_array_function {
81-
($DATATYPE:expr, false) => {
82-
match $DATATYPE {
83-
DataType::Utf8 => array_function!(StringArray),
84-
DataType::LargeUtf8 => array_function!(LargeStringArray),
85-
DataType::Boolean => array_function!(BooleanArray),
86-
DataType::Float32 => array_function!(Float32Array),
87-
DataType::Float64 => array_function!(Float64Array),
88-
DataType::Int8 => array_function!(Int8Array),
89-
DataType::Int16 => array_function!(Int16Array),
90-
DataType::Int32 => array_function!(Int32Array),
91-
DataType::Int64 => array_function!(Int64Array),
92-
DataType::UInt8 => array_function!(UInt8Array),
93-
DataType::UInt16 => array_function!(UInt16Array),
94-
DataType::UInt32 => array_function!(UInt32Array),
95-
DataType::UInt64 => array_function!(UInt64Array),
96-
_ => unreachable!(),
97-
}
98-
};
99-
($DATATYPE:expr, $INCLUDE_LIST:expr) => {{
100-
match $DATATYPE {
101-
DataType::List(_) => array_function!(ListArray),
102-
DataType::Utf8 => array_function!(StringArray),
103-
DataType::LargeUtf8 => array_function!(LargeStringArray),
104-
DataType::Boolean => array_function!(BooleanArray),
105-
DataType::Float32 => array_function!(Float32Array),
106-
DataType::Float64 => array_function!(Float64Array),
107-
DataType::Int8 => array_function!(Int8Array),
108-
DataType::Int16 => array_function!(Int16Array),
109-
DataType::Int32 => array_function!(Int32Array),
110-
DataType::Int64 => array_function!(Int64Array),
111-
DataType::UInt8 => array_function!(UInt8Array),
112-
DataType::UInt16 => array_function!(UInt16Array),
113-
DataType::UInt32 => array_function!(UInt32Array),
114-
DataType::UInt64 => array_function!(UInt64Array),
115-
_ => unreachable!(),
116-
}
117-
}};
118-
}
119-
120-
/// Array_to_string SQL function
121-
pub(super) fn array_to_string(args: &[ArrayRef]) -> Result<ArrayRef> {
122-
if args.len() < 2 || args.len() > 3 {
123-
return exec_err!("array_to_string expects two or three arguments");
124-
}
125-
126-
let arr = &args[0];
127-
128-
let delimiters = as_string_array(&args[1])?;
129-
let delimiters: Vec<Option<&str>> = delimiters.iter().collect();
130-
131-
let mut null_string = String::from("");
132-
let mut with_null_string = false;
133-
if args.len() == 3 {
134-
null_string = as_string_array(&args[2])?.value(0).to_string();
135-
with_null_string = true;
136-
}
137-
138-
fn compute_array_to_string(
139-
arg: &mut String,
140-
arr: ArrayRef,
141-
delimiter: String,
142-
null_string: String,
143-
with_null_string: bool,
144-
) -> datafusion_common::Result<&mut String> {
145-
match arr.data_type() {
146-
DataType::List(..) => {
147-
let list_array = as_list_array(&arr)?;
148-
for i in 0..list_array.len() {
149-
compute_array_to_string(
150-
arg,
151-
list_array.value(i),
152-
delimiter.clone(),
153-
null_string.clone(),
154-
with_null_string,
155-
)?;
156-
}
157-
158-
Ok(arg)
159-
}
160-
DataType::LargeList(..) => {
161-
let list_array = as_large_list_array(&arr)?;
162-
for i in 0..list_array.len() {
163-
compute_array_to_string(
164-
arg,
165-
list_array.value(i),
166-
delimiter.clone(),
167-
null_string.clone(),
168-
with_null_string,
169-
)?;
170-
}
171-
172-
Ok(arg)
173-
}
174-
DataType::Null => Ok(arg),
175-
data_type => {
176-
macro_rules! array_function {
177-
($ARRAY_TYPE:ident) => {
178-
to_string!(
179-
arg,
180-
arr,
181-
&delimiter,
182-
&null_string,
183-
with_null_string,
184-
$ARRAY_TYPE
185-
)
186-
};
187-
}
188-
call_array_function!(data_type, false)
189-
}
190-
}
191-
}
192-
193-
fn generate_string_array<O: OffsetSizeTrait>(
194-
list_arr: &GenericListArray<O>,
195-
delimiters: Vec<Option<&str>>,
196-
null_string: String,
197-
with_null_string: bool,
198-
) -> datafusion_common::Result<StringArray> {
199-
let mut res: Vec<Option<String>> = Vec::new();
200-
for (arr, &delimiter) in list_arr.iter().zip(delimiters.iter()) {
201-
if let (Some(arr), Some(delimiter)) = (arr, delimiter) {
202-
let mut arg = String::from("");
203-
let s = compute_array_to_string(
204-
&mut arg,
205-
arr,
206-
delimiter.to_string(),
207-
null_string.clone(),
208-
with_null_string,
209-
)?
210-
.clone();
211-
212-
if let Some(s) = s.strip_suffix(delimiter) {
213-
res.push(Some(s.to_string()));
214-
} else {
215-
res.push(Some(s));
216-
}
217-
} else {
218-
res.push(None);
219-
}
220-
}
221-
222-
Ok(StringArray::from(res))
223-
}
224-
225-
let arr_type = arr.data_type();
226-
let string_arr = match arr_type {
227-
DataType::List(_) | DataType::FixedSizeList(_, _) => {
228-
let list_array = as_list_array(&arr)?;
229-
generate_string_array::<i32>(
230-
list_array,
231-
delimiters,
232-
null_string,
233-
with_null_string,
234-
)?
235-
}
236-
DataType::LargeList(_) => {
237-
let list_array = as_large_list_array(&arr)?;
238-
generate_string_array::<i64>(
239-
list_array,
240-
delimiters,
241-
null_string,
242-
with_null_string,
243-
)?
244-
}
245-
_ => {
246-
let mut arg = String::from("");
247-
let mut res: Vec<Option<String>> = Vec::new();
248-
// delimiter length is 1
249-
assert_eq!(delimiters.len(), 1);
250-
let delimiter = delimiters[0].unwrap();
251-
let s = compute_array_to_string(
252-
&mut arg,
253-
arr.clone(),
254-
delimiter.to_string(),
255-
null_string,
256-
with_null_string,
257-
)?
258-
.clone();
259-
260-
if !s.is_empty() {
261-
let s = s.strip_suffix(delimiter).unwrap().to_string();
262-
res.push(Some(s));
263-
} else {
264-
res.push(Some(s));
265-
}
266-
StringArray::from(res)
267-
}
268-
};
269-
270-
Ok(Arc::new(string_arr))
271-
}
272-
273-
/// Splits string at occurrences of delimiter and returns an array of parts
274-
/// string_to_array('abc~@~def~@~ghi', '~@~') = '["abc", "def", "ghi"]'
275-
pub fn string_to_array<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
276-
if args.len() < 2 || args.len() > 3 {
277-
return exec_err!("string_to_array expects two or three arguments");
278-
}
279-
let string_array = as_generic_string_array::<T>(&args[0])?;
280-
let delimiter_array = as_generic_string_array::<T>(&args[1])?;
281-
282-
let mut list_builder = ListBuilder::new(StringBuilder::with_capacity(
283-
string_array.len(),
284-
string_array.get_buffer_memory_size(),
285-
));
286-
287-
match args.len() {
288-
2 => {
289-
string_array.iter().zip(delimiter_array.iter()).for_each(
290-
|(string, delimiter)| {
291-
match (string, delimiter) {
292-
(Some(string), Some("")) => {
293-
list_builder.values().append_value(string);
294-
list_builder.append(true);
295-
}
296-
(Some(string), Some(delimiter)) => {
297-
string.split(delimiter).for_each(|s| {
298-
list_builder.values().append_value(s);
299-
});
300-
list_builder.append(true);
301-
}
302-
(Some(string), None) => {
303-
string.chars().map(|c| c.to_string()).for_each(|c| {
304-
list_builder.values().append_value(c);
305-
});
306-
list_builder.append(true);
307-
}
308-
_ => list_builder.append(false), // null value
309-
}
310-
},
311-
);
312-
}
313-
314-
3 => {
315-
let null_value_array = as_generic_string_array::<T>(&args[2])?;
316-
string_array
317-
.iter()
318-
.zip(delimiter_array.iter())
319-
.zip(null_value_array.iter())
320-
.for_each(|((string, delimiter), null_value)| {
321-
match (string, delimiter) {
322-
(Some(string), Some("")) => {
323-
if Some(string) == null_value {
324-
list_builder.values().append_null();
325-
} else {
326-
list_builder.values().append_value(string);
327-
}
328-
list_builder.append(true);
329-
}
330-
(Some(string), Some(delimiter)) => {
331-
string.split(delimiter).for_each(|s| {
332-
if Some(s) == null_value {
333-
list_builder.values().append_null();
334-
} else {
335-
list_builder.values().append_value(s);
336-
}
337-
});
338-
list_builder.append(true);
339-
}
340-
(Some(string), None) => {
341-
string.chars().map(|c| c.to_string()).for_each(|c| {
342-
if Some(c.as_str()) == null_value {
343-
list_builder.values().append_null();
344-
} else {
345-
list_builder.values().append_value(c);
346-
}
347-
});
348-
list_builder.append(true);
349-
}
350-
_ => list_builder.append(false), // null value
351-
}
352-
});
353-
}
354-
_ => {
355-
return exec_err!(
356-
"Expect string_to_array function to take two or three parameters"
357-
)
358-
}
359-
}
360-
361-
let list_array = list_builder.finish();
362-
Ok(Arc::new(list_array) as ArrayRef)
363-
}
364-
36546
/// Generates an array of integers from start to stop with a given step.
36647
///
36748
/// This function takes 1 to 3 ArrayRefs as arguments, representing start, stop, and step values.

datafusion/functions-array/src/lib.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ mod remove;
3939
mod replace;
4040
mod rewrite;
4141
mod set_ops;
42+
mod string;
4243
mod udf;
4344
mod utils;
4445

@@ -73,6 +74,8 @@ pub mod expr_fn {
7374
pub use super::set_ops::array_distinct;
7475
pub use super::set_ops::array_intersect;
7576
pub use super::set_ops::array_union;
77+
pub use super::string::array_to_string;
78+
pub use super::string::string_to_array;
7679
pub use super::udf::array_dims;
7780
pub use super::udf::array_empty;
7881
pub use super::udf::array_length;
@@ -81,19 +84,17 @@ pub mod expr_fn {
8184
pub use super::udf::array_resize;
8285
pub use super::udf::array_reverse;
8386
pub use super::udf::array_sort;
84-
pub use super::udf::array_to_string;
8587
pub use super::udf::cardinality;
8688
pub use super::udf::flatten;
8789
pub use super::udf::gen_series;
8890
pub use super::udf::range;
89-
pub use super::udf::string_to_array;
9091
}
9192

9293
/// Registers all enabled packages with a [`FunctionRegistry`]
9394
pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> {
9495
let functions: Vec<Arc<ScalarUDF>> = vec![
95-
udf::array_to_string_udf(),
96-
udf::string_to_array_udf(),
96+
string::array_to_string_udf(),
97+
string::string_to_array_udf(),
9798
udf::range_udf(),
9899
udf::gen_series_udf(),
99100
udf::array_dims_udf(),

0 commit comments

Comments
 (0)