Skip to content

Commit b3c6f07

Browse files
committed
Fix UNION field nullability tracking
This commit fixes two bugs related to UNION handling - when constructing union plan nullability of the other union branch was ignored, thus resulting field could easily have incorrect nullability - when pruning/simplifying projects, in `recompute_schema` function there was similar logic, thus loosing nullability information even for correctly constructed Union plan node As a result, other optimizer logic (e.g. `expr_simplifier.rs`) could draw incorrect conclusions and thus lead to incorrect query results, as demonstrated with the attached SLT test.
1 parent 9185640 commit b3c6f07

File tree

3 files changed

+58
-13
lines changed

3 files changed

+58
-13
lines changed

datafusion/expr/src/logical_plan/builder.rs

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1529,9 +1529,47 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result<LogicalP
15291529
);
15301530
}
15311531

1532-
// Temporarily use the schema from the left input and later rely on the analyzer to
1533-
// coerce the two schemas into a common one.
1534-
let schema = (**left_schema).clone();
1532+
let schema = DFSchema::from_unqualified_fields(
1533+
left_schema
1534+
.fields()
1535+
.iter()
1536+
.zip(right_schema.fields()).enumerate()
1537+
.map(|(i, (left_field, right_field))| {
1538+
let name = left_field.name();
1539+
// TODO apply type coercion here, or document why it's better to defer
1540+
// temporarily use the data type from the left input and later rely on the analyzer to
1541+
// coerce the two schemas into a common one.
1542+
let data_type = left_field.data_type();
1543+
let nullable = left_field.is_nullable() || right_field.is_nullable();
1544+
let mut field = Field::new(
1545+
name,
1546+
data_type.clone(),
1547+
nullable,
1548+
);
1549+
println!("Constructed union field {} nullable {}", field.name(), field.is_nullable());
1550+
if left_field.metadata() != right_field.metadata() {
1551+
return plan_err!(
1552+
"UNION field {i} have different metadata: left has {:?} whereas right has {:?}",
1553+
left_field.metadata(),
1554+
right_field.metadata()
1555+
);
1556+
}
1557+
field.set_metadata(left_field.metadata().clone());
1558+
Ok(field)
1559+
})
1560+
.collect::<Result<_>>()?,
1561+
{
1562+
// Metadata is application specific. It's unclear how it should be merged/reconciled.
1563+
if left_schema.inner().metadata() != right_schema.inner().metadata() {
1564+
return plan_err!(
1565+
"UNION queries have different metadata: left has {:?} whereas right has {:?}",
1566+
left_schema.inner().metadata(),
1567+
right_schema.inner().metadata()
1568+
);
1569+
}
1570+
left_schema.inner().metadata().clone()
1571+
},
1572+
) ? ;
15351573

15361574
// Functional Dependencies doesn't preserve after UNION operation
15371575
let schema =

datafusion/expr/src/logical_plan/plan.rs

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -698,16 +698,8 @@ impl LogicalPlan {
698698
node: node.with_exprs_and_inputs(expr, inputs)?,
699699
}))
700700
}
701-
LogicalPlan::Union(Union { inputs, schema }) => {
702-
let input_schema = inputs[0].schema();
703-
// If inputs are not pruned do not change schema
704-
// TODO this seems wrong (shouldn't we always use the schema of the input?)
705-
let schema = if schema.fields().len() == input_schema.fields().len() {
706-
Arc::clone(&schema)
707-
} else {
708-
Arc::clone(input_schema)
709-
};
710-
Ok(LogicalPlan::Union(Union { inputs, schema }))
701+
LogicalPlan::Union(_) => {
702+
Ok(self)
711703
}
712704
LogicalPlan::Distinct(distinct) => {
713705
let distinct = match distinct {

datafusion/sqllogictest/test_files/union.slt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -836,3 +836,18 @@ physical_plan
836836
# Clean up after the test
837837
statement ok
838838
drop table aggregate_test_100;
839+
840+
# test for https://github.com/apache/datafusion/issues/14352
841+
query TB rowsort
842+
SELECT
843+
a,
844+
a IS NOT NULL
845+
FROM (
846+
-- second column, even though it's not selected, was necessary to reproduce the bug linked above
847+
SELECT 'foo' AS a, 3 AS b
848+
UNION ALL
849+
SELECT NULL AS a, 4 AS b
850+
)
851+
----
852+
NULL false
853+
foo true

0 commit comments

Comments
 (0)