
Commit 9400d72

findepigabotechs authored and committed
Add support for DISTINCT and ORDER BY in ARRAY_AGG
1 parent 48a28af commit 9400d72

File tree

6 files changed: +653 -165 lines changed


datafusion/expr/src/logical_plan/builder.rs

Lines changed: 5 additions & 23 deletions
@@ -54,8 +54,7 @@ use datafusion_common::file_options::file_type::FileType;
 use datafusion_common::{
     exec_err, get_target_functional_dependencies, internal_err, not_impl_err,
     plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, DataFusionError,
-    FunctionalDependencies, Result, ScalarValue, TableReference, ToDFSchema,
-    UnnestOptions,
+    Result, ScalarValue, TableReference, ToDFSchema, UnnestOptions,
 };
 use datafusion_expr_common::type_coercion::binary::type_union_resolution;

@@ -1518,27 +1517,10 @@ pub fn validate_unique_names<'a>(
 /// [`TypeCoercionRewriter::coerce_union`]: https://docs.rs/datafusion-optimizer/latest/datafusion_optimizer/analyzer/type_coercion/struct.TypeCoercionRewriter.html#method.coerce_union
 /// [`coerce_union_schema`]: https://docs.rs/datafusion-optimizer/latest/datafusion_optimizer/analyzer/type_coercion/fn.coerce_union_schema.html
 pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result<LogicalPlan> {
-    if left_plan.schema().fields().len() != right_plan.schema().fields().len() {
-        return plan_err!(
-            "UNION queries have different number of columns: \
-            left has {} columns whereas right has {} columns",
-            left_plan.schema().fields().len(),
-            right_plan.schema().fields().len()
-        );
-    }
-
-    // Temporarily use the schema from the left input and later rely on the analyzer to
-    // coerce the two schemas into a common one.
-
-    // Functional Dependencies doesn't preserve after UNION operation
-    let schema = (**left_plan.schema()).clone();
-    let schema =
-        Arc::new(schema.with_functional_dependencies(FunctionalDependencies::empty())?);
-
-    Ok(LogicalPlan::Union(Union {
-        inputs: vec![Arc::new(left_plan), Arc::new(right_plan)],
-        schema,
-    }))
+    Ok(LogicalPlan::Union(Union::try_new_with_loose_types(vec![
+        Arc::new(left_plan),
+        Arc::new(right_plan),
+    ])?))
 }
 
 /// Create Projection
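For context on the constructor used above: the builder no longer assembles the union schema inline; it delegates to `Union::try_new_with_loose_types`, which only checks that all inputs have the same column count and takes column names and types from the first input, deferring type coercion to the analyzer. Below is a minimal standalone sketch of that behavior (illustrative only; `derive_loose_schema` and its `(name, type)` tuples are made up here and are not the DataFusion `DFSchema` API):

```rust
// Sketch of "loose types" union schema derivation: column counts must match,
// while column names and types are simply taken from the first input.
fn derive_loose_schema(
    inputs: &[Vec<(&'static str, &'static str)>], // one (name, type) list per input
) -> Result<Vec<(&'static str, &'static str)>, String> {
    if inputs.len() < 2 {
        return Err("UNION requires at least two inputs".to_string());
    }
    let first = &inputs[0];
    for input in &inputs[1..] {
        if input.len() != first.len() {
            return Err(format!(
                "UNION queries have different number of columns: \
                 left has {} columns whereas right has {} columns",
                first.len(),
                input.len()
            ));
        }
    }
    // Names and types come from the first input only; coercion happens later.
    Ok(first.clone())
}

fn main() {
    let left = vec![("a", "Int64"), ("b", "Utf8")];
    let right = vec![("a", "Int32"), ("b", "Utf8")];
    // Column counts match, so the derived schema uses the left input's types.
    let schema = derive_loose_schema(&[left, right]).unwrap();
    assert_eq!(schema, vec![("a", "Int64"), ("b", "Utf8")]);
}
```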

datafusion/expr/src/logical_plan/plan.rs

Lines changed: 107 additions & 8 deletions
@@ -699,15 +699,13 @@ impl LogicalPlan {
                 }))
             }
             LogicalPlan::Union(Union { inputs, schema }) => {
-                let input_schema = inputs[0].schema();
-                // If inputs are not pruned do not change schema
-                // TODO this seems wrong (shouldn't we always use the schema of the input?)
-                let schema = if schema.fields().len() == input_schema.fields().len() {
-                    Arc::clone(&schema)
+                let first_input_schema = inputs[0].schema();
+                if schema.fields().len() == first_input_schema.fields().len() {
+                    // If inputs are not pruned do not change schema
+                    Ok(LogicalPlan::Union(Union { inputs, schema }))
                 } else {
-                    Arc::clone(input_schema)
-                };
-                Ok(LogicalPlan::Union(Union { inputs, schema }))
+                    Ok(LogicalPlan::Union(Union::try_new(inputs)?))
+                }
             }
             LogicalPlan::Distinct(distinct) => {
                 let distinct = match distinct {
@@ -2645,6 +2643,107 @@ pub struct Union {
     pub schema: DFSchemaRef,
 }
 
+impl Union {
+    /// Constructs new Union instance deriving schema from inputs.
+    fn try_new(inputs: Vec<Arc<LogicalPlan>>) -> Result<Self> {
+        let schema = Self::derive_schema_from_inputs(&inputs, false)?;
+        Ok(Union { inputs, schema })
+    }
+
+    /// Constructs new Union instance deriving schema from inputs.
+    /// Inputs do not have to have matching types and produced schema will
+    /// take type from the first input.
+    // TODO (https://github.com/apache/datafusion/issues/14380): Avoid creating uncoerced union at all.
+    pub fn try_new_with_loose_types(inputs: Vec<Arc<LogicalPlan>>) -> Result<Self> {
+        let schema = Self::derive_schema_from_inputs(&inputs, true)?;
+        Ok(Union { inputs, schema })
+    }
+
+    /// Constructs new Union instance deriving schema from inputs.
+    ///
+    /// `loose_types` if true, inputs do not have to have matching types and produced schema will
+    /// take type from the first input. TODO (<https://github.com/apache/datafusion/issues/14380>) this is not necessarily reasonable behavior.
+    fn derive_schema_from_inputs(
+        inputs: &[Arc<LogicalPlan>],
+        loose_types: bool,
+    ) -> Result<DFSchemaRef> {
+        if inputs.len() < 2 {
+            return plan_err!("UNION requires at least two inputs");
+        }
+        let first_schema = inputs[0].schema();
+        let fields_count = first_schema.fields().len();
+        for input in inputs.iter().skip(1) {
+            if fields_count != input.schema().fields().len() {
+                return plan_err!(
+                    "UNION queries have different number of columns: \
+                    left has {} columns whereas right has {} columns",
+                    fields_count,
+                    input.schema().fields().len()
+                );
+            }
+        }
+
+        let union_fields = (0..fields_count)
+            .map(|i| {
+                let fields = inputs
+                    .iter()
+                    .map(|input| input.schema().field(i))
+                    .collect::<Vec<_>>();
+                let first_field = fields[0];
+                let name = first_field.name();
+                let data_type = if loose_types {
+                    // TODO apply type coercion here, or document why it's better to defer
+                    // temporarily use the data type from the left input and later rely on the analyzer to
+                    // coerce the two schemas into a common one.
+                    first_field.data_type()
+                } else {
+                    fields.iter().skip(1).try_fold(
+                        first_field.data_type(),
+                        |acc, field| {
+                            if acc != field.data_type() {
+                                return plan_err!(
+                                    "UNION field {i} have different type in inputs: \
+                                    left has {} whereas right has {}",
+                                    first_field.data_type(),
+                                    field.data_type()
+                                );
+                            }
+                            Ok(acc)
+                        },
+                    )?
+                };
+                let nullable = fields.iter().any(|field| field.is_nullable());
+                let mut field = Field::new(name, data_type.clone(), nullable);
+                let field_metadata =
+                    intersect_maps(fields.iter().map(|field| field.metadata()));
+                field.set_metadata(field_metadata);
+                // TODO reusing table reference from the first schema is probably wrong
+                let table_reference = first_schema.qualified_field(i).0.cloned();
+                Ok((table_reference, Arc::new(field)))
+            })
+            .collect::<Result<_>>()?;
+        let union_schema_metadata =
+            intersect_maps(inputs.iter().map(|input| input.schema().metadata()));
+
+        // Functional Dependencies doesn't preserve after UNION operation
+        let schema = DFSchema::new_with_metadata(union_fields, union_schema_metadata)?;
+        let schema = Arc::new(schema);
+
+        Ok(schema)
+    }
+}
+
+fn intersect_maps<'a>(
+    inputs: impl IntoIterator<Item = &'a HashMap<String, String>>,
+) -> HashMap<String, String> {
+    let mut inputs = inputs.into_iter();
+    let mut merged: HashMap<String, String> = inputs.next().cloned().unwrap_or_default();
+    for input in inputs {
+        merged.retain(|k, v| input.get(k) == Some(v));
+    }
+    merged
+}
+
 // Manual implementation needed because of `schema` field. Comparison excludes this field.
 impl PartialOrd for Union {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
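The field and schema metadata handling in `derive_schema_from_inputs` relies on the new `intersect_maps` helper: an entry survives only if every input carries the same key with the same value. Below is a self-contained illustration of that behavior (the `unit`/`origin` metadata keys are hypothetical, chosen only for the example):

```rust
use std::collections::HashMap;

// Same logic as the helper in the diff: keep only key/value pairs that
// appear, with equal values, in every input map.
fn intersect_maps<'a>(
    inputs: impl IntoIterator<Item = &'a HashMap<String, String>>,
) -> HashMap<String, String> {
    let mut inputs = inputs.into_iter();
    let mut merged: HashMap<String, String> = inputs.next().cloned().unwrap_or_default();
    for input in inputs {
        // Drop any entry the current input disagrees with or lacks.
        merged.retain(|k, v| input.get(k) == Some(v));
    }
    merged
}

fn main() {
    let left = HashMap::from([
        ("origin".to_string(), "left".to_string()),
        ("unit".to_string(), "ms".to_string()),
    ]);
    let right = HashMap::from([
        ("origin".to_string(), "right".to_string()),
        ("unit".to_string(), "ms".to_string()),
    ]);
    let merged = intersect_maps([&left, &right]);
    // Only the entry both inputs agree on survives.
    assert_eq!(merged, HashMap::from([("unit".to_string(), "ms".to_string())]));
}
```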

datafusion/functions-aggregate-common/src/merge_arrays.rs

Lines changed: 146 additions & 0 deletions
@@ -193,3 +193,149 @@ pub fn merge_ordered_arrays(
 
     Ok((merged_values, merged_orderings))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::collections::VecDeque;
+    use std::sync::Arc;
+
+    use arrow::array::{ArrayRef, Int64Array};
+
+    use datafusion_common::utils::get_row_at_idx;
+    use datafusion_common::{Result, ScalarValue};
+
+    #[test]
+    fn test_merge_asc() -> Result<()> {
+        let lhs_arrays: Vec<ArrayRef> = vec![
+            Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2])),
+            Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4])),
+        ];
+        let n_row = lhs_arrays[0].len();
+        let lhs_orderings = (0..n_row)
+            .map(|idx| get_row_at_idx(&lhs_arrays, idx))
+            .collect::<Result<VecDeque<_>>>()?;
+
+        let rhs_arrays: Vec<ArrayRef> = vec![
+            Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2])),
+            Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4])),
+        ];
+        let n_row = rhs_arrays[0].len();
+        let rhs_orderings = (0..n_row)
+            .map(|idx| get_row_at_idx(&rhs_arrays, idx))
+            .collect::<Result<VecDeque<_>>>()?;
+        let sort_options = vec![
+            SortOptions {
+                descending: false,
+                nulls_first: false,
+            },
+            SortOptions {
+                descending: false,
+                nulls_first: false,
+            },
+        ];
+
+        let lhs_vals_arr = Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4])) as ArrayRef;
+        let lhs_vals = (0..lhs_vals_arr.len())
+            .map(|idx| ScalarValue::try_from_array(&lhs_vals_arr, idx))
+            .collect::<Result<VecDeque<_>>>()?;
+
+        let rhs_vals_arr = Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4])) as ArrayRef;
+        let rhs_vals = (0..rhs_vals_arr.len())
+            .map(|idx| ScalarValue::try_from_array(&rhs_vals_arr, idx))
+            .collect::<Result<VecDeque<_>>>()?;
+        let expected =
+            Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2, 2, 3, 3, 4, 4])) as ArrayRef;
+        let expected_ts = vec![
+            Arc::new(Int64Array::from(vec![0, 0, 0, 0, 1, 1, 1, 1, 2, 2])) as ArrayRef,
+            Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2, 2, 3, 3, 4, 4])) as ArrayRef,
+        ];
+
+        let (merged_vals, merged_ts) = merge_ordered_arrays(
+            &mut [lhs_vals, rhs_vals],
+            &mut [lhs_orderings, rhs_orderings],
+            &sort_options,
+        )?;
+        let merged_vals = ScalarValue::iter_to_array(merged_vals.into_iter())?;
+        let merged_ts = (0..merged_ts[0].len())
+            .map(|col_idx| {
+                ScalarValue::iter_to_array(
+                    (0..merged_ts.len())
+                        .map(|row_idx| merged_ts[row_idx][col_idx].clone()),
+                )
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        assert_eq!(&merged_vals, &expected);
+        assert_eq!(&merged_ts, &expected_ts);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_merge_desc() -> Result<()> {
+        let lhs_arrays: Vec<ArrayRef> = vec![
+            Arc::new(Int64Array::from(vec![2, 1, 1, 0, 0])),
+            Arc::new(Int64Array::from(vec![4, 3, 2, 1, 0])),
+        ];
+        let n_row = lhs_arrays[0].len();
+        let lhs_orderings = (0..n_row)
+            .map(|idx| get_row_at_idx(&lhs_arrays, idx))
+            .collect::<Result<VecDeque<_>>>()?;
+
+        let rhs_arrays: Vec<ArrayRef> = vec![
+            Arc::new(Int64Array::from(vec![2, 1, 1, 0, 0])),
+            Arc::new(Int64Array::from(vec![4, 3, 2, 1, 0])),
+        ];
+        let n_row = rhs_arrays[0].len();
+        let rhs_orderings = (0..n_row)
+            .map(|idx| get_row_at_idx(&rhs_arrays, idx))
+            .collect::<Result<VecDeque<_>>>()?;
+        let sort_options = vec![
+            SortOptions {
+                descending: true,
+                nulls_first: false,
+            },
+            SortOptions {
+                descending: true,
+                nulls_first: false,
+            },
+        ];
+
+        // Values (which will be merged) doesn't have to be ordered.
+        let lhs_vals_arr = Arc::new(Int64Array::from(vec![0, 1, 2, 1, 2])) as ArrayRef;
+        let lhs_vals = (0..lhs_vals_arr.len())
+            .map(|idx| ScalarValue::try_from_array(&lhs_vals_arr, idx))
+            .collect::<Result<VecDeque<_>>>()?;
+
+        let rhs_vals_arr = Arc::new(Int64Array::from(vec![0, 1, 2, 1, 2])) as ArrayRef;
+        let rhs_vals = (0..rhs_vals_arr.len())
+            .map(|idx| ScalarValue::try_from_array(&rhs_vals_arr, idx))
+            .collect::<Result<VecDeque<_>>>()?;
+        let expected =
+            Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2, 2, 1, 1, 2, 2])) as ArrayRef;
+        let expected_ts = vec![
+            Arc::new(Int64Array::from(vec![2, 2, 1, 1, 1, 1, 0, 0, 0, 0])) as ArrayRef,
+            Arc::new(Int64Array::from(vec![4, 4, 3, 3, 2, 2, 1, 1, 0, 0])) as ArrayRef,
+        ];
+        let (merged_vals, merged_ts) = merge_ordered_arrays(
+            &mut [lhs_vals, rhs_vals],
+            &mut [lhs_orderings, rhs_orderings],
+            &sort_options,
+        )?;
+        let merged_vals = ScalarValue::iter_to_array(merged_vals.into_iter())?;
+        let merged_ts = (0..merged_ts[0].len())
+            .map(|col_idx| {
+                ScalarValue::iter_to_array(
+                    (0..merged_ts.len())
+                        .map(|row_idx| merged_ts[row_idx][col_idx].clone()),
+                )
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        assert_eq!(&merged_vals, &expected);
+        assert_eq!(&merged_ts, &expected_ts);
+        Ok(())
+    }
+}
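The tests above feed pre-sorted (values, ordering columns) runs into `merge_ordered_arrays` and expect a single run that stays sorted on the ordering columns, which is what ARRAY_AGG with ORDER BY needs when combining partial aggregate states. A simplified sketch of that kind of merge over plain tuples, with a single ordering key instead of multiple columns and sort options (not the actual `merge_ordered_arrays` implementation):

```rust
// Two already-sorted runs of (value, ordering key) pairs are combined into
// one run that remains sorted by the ordering key.
fn merge_two_sorted_runs(
    lhs: &[(i64, i64)], // (value, ordering key), sorted ascending by key
    rhs: &[(i64, i64)],
) -> Vec<(i64, i64)> {
    let (mut i, mut j) = (0, 0);
    let mut out = Vec::with_capacity(lhs.len() + rhs.len());
    while i < lhs.len() && j < rhs.len() {
        // Take from the run whose next ordering key is smaller.
        if lhs[i].1 <= rhs[j].1 {
            out.push(lhs[i]);
            i += 1;
        } else {
            out.push(rhs[j]);
            j += 1;
        }
    }
    out.extend_from_slice(&lhs[i..]);
    out.extend_from_slice(&rhs[j..]);
    out
}

fn main() {
    // Mirrors the shape of test_merge_asc: equal keys interleave, values follow.
    let lhs = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)];
    let rhs = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)];
    let merged = merge_two_sorted_runs(&lhs, &rhs);
    assert_eq!(merged.len(), 10);
    // The merged run is still sorted by the ordering key.
    assert!(merged.windows(2).all(|w| w[0].1 <= w[1].1));
}
```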
