Skip to content

Commit a11aa42

Browse files
authored
enhance: simplify x=x (#15387) (#15589)
- if x is not nullable, x=x -> true - else, x=x -> x is NOT NULL OR NULL
1 parent 6f64b0f commit a11aa42

File tree

3 files changed

+53
-9
lines changed

3 files changed

+53
-9
lines changed

datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,25 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
760760
None => lit_bool_null(),
761761
})
762762
}
763+
// According to SQL's null semantics, NULL = NULL evaluates to NULL
764+
// Both sides are the same expression (A = A) and A is non-volatile expression
765+
// A = A --> A IS NOT NULL OR NULL
766+
// A = A --> true (if A not nullable)
767+
Expr::BinaryExpr(BinaryExpr {
768+
left,
769+
op: Eq,
770+
right,
771+
}) if (left == right) & !left.is_volatile() => {
772+
Transformed::yes(match !info.nullable(&left)? {
773+
true => lit(true),
774+
false => Expr::BinaryExpr(BinaryExpr {
775+
left: Box::new(Expr::IsNotNull(left)),
776+
op: Or,
777+
right: Box::new(lit_bool_null()),
778+
}),
779+
})
780+
}
781+
763782
// Rules for NotEq
764783
//
765784

@@ -2243,6 +2262,21 @@ mod tests {
22432262
}
22442263
}
22452264

2265+
#[test]
2266+
fn test_simplify_eq_not_self() {
2267+
// `expr_a`: column `c2` is nullable, so `c2 = c2` simplifies to `c2 IS NOT NULL OR NULL`
2268+
// This ensures the expression is only true when `c2` is not NULL, accounting for SQL's NULL semantics.
2269+
let expr_a = col("c2").eq(col("c2"));
2270+
let expected_a = col("c2").is_not_null().or(lit_bool_null());
2271+
2272+
// `expr_b`: column `c2_non_null` is explicitly non-nullable, so `c2_non_null = c2_non_null` is always true
2273+
let expr_b = col("c2_non_null").eq(col("c2_non_null"));
2274+
let expected_b = lit(true);
2275+
2276+
assert_eq!(simplify(expr_a), expected_a);
2277+
assert_eq!(simplify(expr_b), expected_b);
2278+
}
2279+
22462280
#[test]
22472281
fn test_simplify_or_true() {
22482282
let expr_a = col("c2").or(lit(true));

datafusion/sqllogictest/test_files/array.slt

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6130,7 +6130,8 @@ select count(*) from test WHERE array_has([needle], needle);
61306130
----
61316131
100000
61326132

6133-
# TODO: this should probably be possible to completely remove the filter as always true?
6133+
# The optimizer does not currently eliminate the filter;
6134+
# Instead, it's rewritten as `IS NULL OR NOT NULL` due to SQL null semantics
61346135
query TT
61356136
explain with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
61366137
select count(*) from test WHERE array_has([needle], needle);
@@ -6140,21 +6141,19 @@ logical_plan
61406141
02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
61416142
03)----SubqueryAlias: test
61426143
04)------SubqueryAlias: t
6143-
05)--------Projection:
6144-
06)----------Filter: __common_expr_3 = __common_expr_3
6145-
07)------------Projection: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) AS __common_expr_3
6146-
08)--------------TableScan: tmp_table projection=[value]
6144+
05)--------Projection:
6145+
06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IS NOT NULL OR Boolean(NULL)
6146+
07)------------TableScan: tmp_table projection=[value]
61476147
physical_plan
61486148
01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
61496149
02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
61506150
03)----CoalescePartitionsExec
61516151
04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
61526152
05)--------ProjectionExec: expr=[]
61536153
06)----------CoalesceBatchesExec: target_batch_size=8192
6154-
07)------------FilterExec: __common_expr_3@0 = __common_expr_3@0
6155-
08)--------------ProjectionExec: expr=[substr(md5(CAST(value@0 AS Utf8)), 1, 32) as __common_expr_3]
6156-
09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6157-
10)------------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
6154+
07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IS NOT NULL OR NULL
6155+
08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6156+
09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
61586157

61596158
# any operator
61606159
query ?

datafusion/sqllogictest/test_files/simplify_expr.slt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,17 @@ query T
6363
select b from t where b !~ '.*'
6464
----
6565

66+
query TT
67+
explain select * from t where a = a;
68+
----
69+
logical_plan
70+
01)Filter: t.a IS NOT NULL OR Boolean(NULL)
71+
02)--TableScan: t projection=[a, b]
72+
physical_plan
73+
01)CoalesceBatchesExec: target_batch_size=8192
74+
02)--FilterExec: a@0 IS NOT NULL OR NULL
75+
03)----DataSourceExec: partitions=1, partition_sizes=[1]
76+
6677
statement ok
6778
drop table t;
6879

0 commit comments

Comments
 (0)