Skip to content

Commit cf566d0

Browse files
gstvgalamb
authored andcommitted
Support custom struct field names with new scalar function named_struct (apache#9743)
* Support custom struct field names with new scalar function named_struct * add tests and corretly handle mixed arrray and scalar values * fix slt * fmt * port test to slt --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent ab88cac commit cf566d0

File tree

6 files changed

+343
-27
lines changed

6 files changed

+343
-27
lines changed

datafusion/functions/src/core/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
mod arrow_cast;
2121
mod arrowtypeof;
2222
mod getfield;
23+
mod named_struct;
2324
mod nullif;
2425
mod nvl;
2526
mod nvl2;
@@ -32,6 +33,7 @@ make_udf_function!(nvl::NVLFunc, NVL, nvl);
3233
make_udf_function!(nvl2::NVL2Func, NVL2, nvl2);
3334
make_udf_function!(arrowtypeof::ArrowTypeOfFunc, ARROWTYPEOF, arrow_typeof);
3435
make_udf_function!(r#struct::StructFunc, STRUCT, r#struct);
36+
make_udf_function!(named_struct::NamedStructFunc, NAMED_STRUCT, named_struct);
3537
make_udf_function!(getfield::GetFieldFunc, GET_FIELD, get_field);
3638

3739
// Export the functions out of this package, both as expr_fn as well as a list of functions
@@ -42,5 +44,6 @@ export_functions!(
4244
(nvl2, arg_1 arg_2 arg_3, "Returns value2 if value1 is not NULL; otherwise, it returns value3."),
4345
(arrow_typeof, arg_1, "Returns the Arrow type of the input expression."),
4446
(r#struct, args, "Returns a struct with the given arguments"),
47+
(named_struct, args, "Returns a struct with the given names and arguments pairs"),
4548
(get_field, arg_1 arg_2, "Returns the value of the field with the given name from the struct")
4649
);
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::StructArray;
19+
use arrow::datatypes::{DataType, Field, Fields};
20+
use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
21+
use datafusion_expr::{ColumnarValue, Expr, ExprSchemable};
22+
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
23+
use std::any::Any;
24+
use std::sync::Arc;
25+
26+
/// put values in a struct array.
27+
fn named_struct_expr(args: &[ColumnarValue]) -> Result<ColumnarValue> {
28+
// do not accept 0 arguments.
29+
if args.is_empty() {
30+
return exec_err!(
31+
"named_struct requires at least one pair of arguments, got 0 instead"
32+
);
33+
}
34+
35+
if args.len() % 2 != 0 {
36+
return exec_err!(
37+
"named_struct requires an even number of arguments, got {} instead",
38+
args.len()
39+
);
40+
}
41+
42+
let (names, values): (Vec<_>, Vec<_>) = args
43+
.chunks_exact(2)
44+
.enumerate()
45+
.map(|(i, chunk)| {
46+
47+
let name_column = &chunk[0];
48+
49+
let name = match name_column {
50+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(name_scalar))) => name_scalar,
51+
_ => return exec_err!("named_struct even arguments must be string literals, got {name_column:?} instead at position {}", i * 2)
52+
};
53+
54+
Ok((name, chunk[1].clone()))
55+
})
56+
.collect::<Result<Vec<_>>>()?
57+
.into_iter()
58+
.unzip();
59+
60+
let arrays = ColumnarValue::values_to_arrays(&values)?;
61+
62+
let fields = names
63+
.into_iter()
64+
.zip(arrays)
65+
.map(|(name, value)| {
66+
(
67+
Arc::new(Field::new(name, value.data_type().clone(), true)),
68+
value,
69+
)
70+
})
71+
.collect::<Vec<_>>();
72+
73+
Ok(ColumnarValue::Array(Arc::new(StructArray::from(fields))))
74+
}
75+
76+
#[derive(Debug)]
77+
pub(super) struct NamedStructFunc {
78+
signature: Signature,
79+
}
80+
81+
impl NamedStructFunc {
82+
pub fn new() -> Self {
83+
Self {
84+
signature: Signature::variadic_any(Volatility::Immutable),
85+
}
86+
}
87+
}
88+
89+
impl ScalarUDFImpl for NamedStructFunc {
90+
fn as_any(&self) -> &dyn Any {
91+
self
92+
}
93+
94+
fn name(&self) -> &str {
95+
"named_struct"
96+
}
97+
98+
fn signature(&self) -> &Signature {
99+
&self.signature
100+
}
101+
102+
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
103+
internal_err!(
104+
"named_struct: return_type called instead of return_type_from_exprs"
105+
)
106+
}
107+
108+
fn return_type_from_exprs(
109+
&self,
110+
args: &[datafusion_expr::Expr],
111+
schema: &dyn datafusion_common::ExprSchema,
112+
_arg_types: &[DataType],
113+
) -> Result<DataType> {
114+
// do not accept 0 arguments.
115+
if args.is_empty() {
116+
return exec_err!(
117+
"named_struct requires at least one pair of arguments, got 0 instead"
118+
);
119+
}
120+
121+
if args.len() % 2 != 0 {
122+
return exec_err!(
123+
"named_struct requires an even number of arguments, got {} instead",
124+
args.len()
125+
);
126+
}
127+
128+
let return_fields = args
129+
.chunks_exact(2)
130+
.enumerate()
131+
.map(|(i, chunk)| {
132+
let name = &chunk[0];
133+
let value = &chunk[1];
134+
135+
if let Expr::Literal(ScalarValue::Utf8(Some(name))) = name {
136+
Ok(Field::new(name, value.get_type(schema)?, true))
137+
} else {
138+
exec_err!("named_struct even arguments must be string literals, got {name} instead at position {}", i * 2)
139+
}
140+
})
141+
.collect::<Result<Vec<Field>>>()?;
142+
Ok(DataType::Struct(Fields::from(return_fields)))
143+
}
144+
145+
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
146+
named_struct_expr(args)
147+
}
148+
}

datafusion/sql/src/expr/mod.rs

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ use datafusion_expr::expr::InList;
2929
use datafusion_expr::expr::ScalarFunction;
3030
use datafusion_expr::{
3131
col, expr, lit, AggregateFunction, Between, BinaryExpr, BuiltinScalarFunction, Cast,
32-
Expr, ExprSchemable, GetFieldAccess, GetIndexedField, Like, Operator, TryCast,
32+
Expr, ExprSchemable, GetFieldAccess, GetIndexedField, Like, Literal, Operator,
33+
TryCast,
3334
};
3435

3536
use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
@@ -604,18 +605,44 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
604605
}
605606
let args = values
606607
.into_iter()
607-
.map(|value| {
608-
self.sql_expr_to_logical_expr(value, input_schema, planner_context)
608+
.enumerate()
609+
.map(|(i, value)| {
610+
let args = if let SQLExpr::Named { expr, name } = value {
611+
[
612+
name.value.lit(),
613+
self.sql_expr_to_logical_expr(
614+
*expr,
615+
input_schema,
616+
planner_context,
617+
)?,
618+
]
619+
} else {
620+
[
621+
format!("c{i}").lit(),
622+
self.sql_expr_to_logical_expr(
623+
value,
624+
input_schema,
625+
planner_context,
626+
)?,
627+
]
628+
};
629+
630+
Ok(args)
609631
})
610-
.collect::<Result<Vec<_>>>()?;
611-
let struct_func = self
632+
.collect::<Result<Vec<_>>>()?
633+
.into_iter()
634+
.flatten()
635+
.collect();
636+
637+
let named_struct_func = self
612638
.context_provider
613-
.get_function_meta("struct")
639+
.get_function_meta("named_struct")
614640
.ok_or_else(|| {
615-
internal_datafusion_err!("Unable to find expected 'struct' function")
616-
})?;
641+
internal_datafusion_err!("Unable to find expected 'named_struct' function")
642+
})?;
643+
617644
Ok(Expr::ScalarFunction(ScalarFunction::new_udf(
618-
struct_func,
645+
named_struct_func,
619646
args,
620647
)))
621648
}

datafusion/sqllogictest/test_files/explain.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -390,8 +390,8 @@ query TT
390390
explain select struct(1, 2.3, 'abc');
391391
----
392392
logical_plan
393-
Projection: Struct({c0:1,c1:2.3,c2:abc}) AS struct(Int64(1),Float64(2.3),Utf8("abc"))
393+
Projection: Struct({c0:1,c1:2.3,c2:abc}) AS named_struct(Utf8("c0"),Int64(1),Utf8("c1"),Float64(2.3),Utf8("c2"),Utf8("abc"))
394394
--EmptyRelation
395395
physical_plan
396-
ProjectionExec: expr=[{c0:1,c1:2.3,c2:abc} as struct(Int64(1),Float64(2.3),Utf8("abc"))]
396+
ProjectionExec: expr=[{c0:1,c1:2.3,c2:abc} as named_struct(Utf8("c0"),Int64(1),Utf8("c1"),Float64(2.3),Utf8("c2"),Utf8("abc"))]
397397
--PlaceholderRowExec

datafusion/sqllogictest/test_files/struct.slt

Lines changed: 106 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,12 @@ statement ok
2323
CREATE TABLE values(
2424
a INT,
2525
b FLOAT,
26-
c VARCHAR
26+
c VARCHAR,
27+
n VARCHAR,
2728
) AS VALUES
28-
(1, 1.1, 'a'),
29-
(2, 2.2, 'b'),
30-
(3, 3.3, 'c')
29+
(1, 1.1, 'a', NULL),
30+
(2, 2.2, 'b', NULL),
31+
(3, 3.3, 'c', NULL)
3132
;
3233

3334
# struct[i]
@@ -50,6 +51,18 @@ select struct(1, 3.14, 'e');
5051
----
5152
{c0: 1, c1: 3.14, c2: e}
5253

54+
# struct scalar function with named values
55+
query ?
56+
select struct(1 as "name0", 3.14 as name1, 'e', true as 'name3');
57+
----
58+
{name0: 1, name1: 3.14, c2: e, name3: true}
59+
60+
# struct scalar function with mixed named and unnamed values
61+
query ?
62+
select struct(1, 3.14 as name1, 'e', true);
63+
----
64+
{c0: 1, name1: 3.14, c2: e, c3: true}
65+
5366
# struct scalar function with columns #1
5467
query ?
5568
select struct(a, b, c) from values;
@@ -72,11 +85,98 @@ query TT
7285
explain select struct(a, b, c) from values;
7386
----
7487
logical_plan
75-
Projection: struct(values.a, values.b, values.c)
88+
Projection: named_struct(Utf8("c0"), values.a, Utf8("c1"), values.b, Utf8("c2"), values.c)
7689
--TableScan: values projection=[a, b, c]
7790
physical_plan
78-
ProjectionExec: expr=[struct(a@0, b@1, c@2) as struct(values.a,values.b,values.c)]
91+
ProjectionExec: expr=[named_struct(c0, a@0, c1, b@1, c2, c@2) as named_struct(Utf8("c0"),values.a,Utf8("c1"),values.b,Utf8("c2"),values.c)]
7992
--MemoryExec: partitions=1, partition_sizes=[1]
8093

94+
# error on 0 arguments
95+
query error DataFusion error: Error during planning: No function matches the given name and argument types 'named_struct\(\)'. You might need to add explicit type casts.
96+
select named_struct();
97+
98+
# error on odd number of arguments #1
99+
query error DataFusion error: Execution error: named_struct requires an even number of arguments, got 1 instead
100+
select named_struct('a');
101+
102+
# error on odd number of arguments #2
103+
query error DataFusion error: Execution error: named_struct requires an even number of arguments, got 1 instead
104+
select named_struct(1);
105+
106+
# error on odd number of arguments #3
107+
query error DataFusion error: Execution error: named_struct requires an even number of arguments, got 1 instead
108+
select named_struct(values.a) from values;
109+
110+
# error on odd number of arguments #4
111+
query error DataFusion error: Execution error: named_struct requires an even number of arguments, got 3 instead
112+
select named_struct('a', 1, 'b');
113+
114+
# error on even argument not a string literal #1
115+
query error DataFusion error: Execution error: named_struct even arguments must be string literals, got Int64\(1\) instead at position 0
116+
select named_struct(1, 'a');
117+
118+
# error on even argument not a string literal #2
119+
query error DataFusion error: Execution error: named_struct even arguments must be string literals, got Int64\(0\) instead at position 2
120+
select named_struct('corret', 1, 0, 'wrong');
121+
122+
# error on even argument not a string literal #3
123+
query error DataFusion error: Execution error: named_struct even arguments must be string literals, got values\.a instead at position 0
124+
select named_struct(values.a, 'a') from values;
125+
126+
# error on even argument not a string literal #4
127+
query error DataFusion error: Execution error: named_struct even arguments must be string literals, got values\.c instead at position 0
128+
select named_struct(values.c, 'c') from values;
129+
130+
# named_struct with mixed scalar and array values #1
131+
query ?
132+
select named_struct('scalar', 27, 'array', values.a, 'null', NULL) from values;
133+
----
134+
{scalar: 27, array: 1, null: }
135+
{scalar: 27, array: 2, null: }
136+
{scalar: 27, array: 3, null: }
137+
138+
# named_struct with mixed scalar and array values #2
139+
query ?
140+
select named_struct('array', values.a, 'scalar', 27, 'null', NULL) from values;
141+
----
142+
{array: 1, scalar: 27, null: }
143+
{array: 2, scalar: 27, null: }
144+
{array: 3, scalar: 27, null: }
145+
146+
# named_struct with mixed scalar and array values #3
147+
query ?
148+
select named_struct('null', NULL, 'array', values.a, 'scalar', 27) from values;
149+
----
150+
{null: , array: 1, scalar: 27}
151+
{null: , array: 2, scalar: 27}
152+
{null: , array: 3, scalar: 27}
153+
154+
# named_struct with mixed scalar and array values #4
155+
query ?
156+
select named_struct('null_array', values.n, 'array', values.a, 'scalar', 27, 'null', NULL) from values;
157+
----
158+
{null_array: , array: 1, scalar: 27, null: }
159+
{null_array: , array: 2, scalar: 27, null: }
160+
{null_array: , array: 3, scalar: 27, null: }
161+
162+
# named_struct arrays only
163+
query ?
164+
select named_struct('field_a', a, 'field_b', b) from values;
165+
----
166+
{field_a: 1, field_b: 1.1}
167+
{field_a: 2, field_b: 2.2}
168+
{field_a: 3, field_b: 3.3}
169+
170+
# named_struct scalars only
171+
query ?
172+
select named_struct('field_a', 1, 'field_b', 2);
173+
----
174+
{field_a: 1, field_b: 2}
175+
81176
statement ok
82177
drop table values;
178+
179+
query T
180+
select arrow_typeof(named_struct('first', 1, 'second', 2, 'third', 3));
181+
----
182+
Struct([Field { name: "first", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "second", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "third", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }])

0 commit comments

Comments
 (0)