Skip to content

Commit dcaab5a

Browse files
committed
Drop support for intermediate column updates on context
1 parent 0916b3d commit dcaab5a

File tree

4 files changed

+30
-105
lines changed

4 files changed

+30
-105
lines changed

datafusion/physical-expr/src/expressions/binary.rs

Lines changed: 18 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -657,10 +657,8 @@ impl PhysicalExpr for BinaryExpr {
657657
)))
658658
}
659659

660-
/// Return the boundaries of this binary expression's result. If the expression itself
661-
/// is a comparison which changes the boundaries of one of its inputs (a = 20 would pin
662-
/// a to 20), then it might update the input's boundaries directly on the context.
663-
fn analyze(&self, context: &mut AnalysisContext) -> Option<ExprBoundaries> {
660+
/// Return the boundaries of this binary expression's result.
661+
fn boundaries(&self, context: &AnalysisContext) -> Option<ExprBoundaries> {
664662
match &self.op {
665663
Operator::Eq
666664
| Operator::Gt
@@ -669,8 +667,8 @@ impl PhysicalExpr for BinaryExpr {
669667
| Operator::GtEq => {
670668
// We currently only support comparison when we know at least one of the sides are
671669
// a known value (a scalar). This includes predicates like a > 20 or 5 > a.
672-
let left_boundaries = self.left.analyze(context)?;
673-
let right_boundaries = self.right.analyze(context)?;
670+
let left_boundaries = self.left.boundaries(context)?;
671+
let right_boundaries = self.right.boundaries(context)?;
674672
let (op, left, right) = match right_boundaries.reduce() {
675673
Some(right_value) => {
676674
// We know the right side is a scalar, so we can use the operator as is
@@ -707,13 +705,13 @@ impl PartialEq<dyn Any> for BinaryExpr {
707705
// false, or unknown (with a probablistic selectivity value attached).
708706
fn analyze_expr_scalar_comparison(
709707
op: &Operator,
710-
context: &mut AnalysisContext,
708+
context: &AnalysisContext,
711709
left: &Arc<dyn PhysicalExpr>,
712710
right: ScalarValue,
713711
) -> Option<ExprBoundaries> {
714-
let left_bounds = left.analyze(context)?;
715-
let left_min = left_bounds.min_value.clone();
716-
let left_max = left_bounds.max_value.clone();
712+
let left_bounds = left.boundaries(context)?;
713+
let left_min = left_bounds.min_value;
714+
let left_max = left_bounds.max_value;
717715

718716
// Direct selectivity is applicable when we can determine that this comparison will
719717
// always be true or false (e.g. `x > 10` where the `x`'s min value is 11 or `a < 5`
@@ -762,43 +760,6 @@ fn analyze_expr_scalar_comparison(
762760
}
763761
}?;
764762

765-
let (left_min, left_max) = match op {
766-
// TODO: for lt/gt, we technically should shrink the possibility space
767-
// by one since a < 5 means that 5 is not a possible value for `a`. However,
768-
// it is currently tricky to do so (e.g. for floats, we can get away with 4.999
769-
// so we need a smarter logic to find out what is the closest value that is
770-
// different from the scalar_value).
771-
Operator::Lt | Operator::LtEq => {
772-
// We only want to update the upper bound when we know it will help us (e.g.
773-
// it is actually smaller than what we have right now) and it is a valid
774-
// value (e.g. [0, 100] < -100 would update the boundaries to [0, -100] if
775-
// there weren't the selectivity check).
776-
if right < left_max && selectivity > 0.0 {
777-
(left_min, right)
778-
} else {
779-
(left_min, left_max)
780-
}
781-
}
782-
Operator::Gt | Operator::GtEq => {
783-
// Same as above, but this time we want to limit the lower bound.
784-
if right > left_min && selectivity > 0.0 {
785-
(right, left_max)
786-
} else {
787-
(left_min, left_max)
788-
}
789-
}
790-
// For equality, we don't have the range problem so even if the selectivity
791-
// is 0.0, we can still update the boundaries.
792-
Operator::Eq => (right.clone(), right),
793-
_ => unreachable!(),
794-
};
795-
796-
// The context represents all the knowledge we have gathered during the
797-
// analysis process, which we can now add more since the expression's upper
798-
// and lower boundaries might have changed.
799-
let left_bounds = ExprBoundaries::new(left_min, left_max, left_bounds.distinct_count);
800-
left.apply(context, &left_bounds);
801-
802763
// The selectivity can't be be greater than 1.0.
803764
assert!(selectivity <= 1.0);
804765

@@ -3007,13 +2968,12 @@ mod tests {
30072968
((Operator::GtEq, 200), (0.0, 1, 100)),
30082969
];
30092970

3010-
for ((operator, rhs), (exp_selectivity, exp_min, exp_max)) in cases {
3011-
let mut context =
3012-
AnalysisContext::from_statistics(&schema, statistics.clone());
2971+
for ((operator, rhs), (exp_selectivity, _, _)) in cases {
2972+
let context = AnalysisContext::from_statistics(&schema, statistics.clone());
30132973
let left = col("a", &schema).unwrap();
30142974
let right = ScalarValue::Int64(Some(rhs));
30152975
let boundaries =
3016-
analyze_expr_scalar_comparison(&operator, &mut context, &left, right)
2976+
analyze_expr_scalar_comparison(&operator, &context, &left, right)
30172977
.expect("this case should not return None");
30182978

30192979
assert_eq!(
@@ -3036,12 +2996,6 @@ mod tests {
30362996
assert_eq!(boundaries.min_value, ScalarValue::Boolean(Some(false)));
30372997
assert_eq!(boundaries.max_value, ScalarValue::Boolean(Some(true)));
30382998
}
3039-
3040-
let left_boundaries = left
3041-
.analyze(&mut context)
3042-
.expect("this case should not return None");
3043-
assert_eq!(left_boundaries.min_value, ScalarValue::Int64(Some(exp_min)));
3044-
assert_eq!(left_boundaries.max_value, ScalarValue::Int64(Some(exp_max)));
30452999
}
30463000
Ok(())
30473001
}
@@ -3083,13 +3037,12 @@ mod tests {
30833037
((Operator::GtEq, 50.7), (1.0 / distance, 50.7, 50.7)),
30843038
];
30853039

3086-
for ((operator, rhs), (exp_selectivity, exp_min, exp_max)) in cases {
3087-
let mut context =
3088-
AnalysisContext::from_statistics(&schema, statistics.clone());
3040+
for ((operator, rhs), (exp_selectivity, _, _)) in cases {
3041+
let context = AnalysisContext::from_statistics(&schema, statistics.clone());
30893042
let left = col("a", &schema).unwrap();
30903043
let right = ScalarValue::from(rhs);
30913044
let boundaries =
3092-
analyze_expr_scalar_comparison(&operator, &mut context, &left, right)
3045+
analyze_expr_scalar_comparison(&operator, &context, &left, right)
30933046
.expect("this case should not return None");
30943047

30953048
assert_eq!(
@@ -3112,12 +3065,6 @@ mod tests {
31123065
assert_eq!(boundaries.min_value, ScalarValue::from(false));
31133066
assert_eq!(boundaries.max_value, ScalarValue::from(true));
31143067
}
3115-
3116-
let left_boundaries = left
3117-
.analyze(&mut context)
3118-
.expect("this case should not return None");
3119-
assert_eq!(left_boundaries.min_value, ScalarValue::from(exp_min));
3120-
assert_eq!(left_boundaries.max_value, ScalarValue::from(exp_max));
31213068
}
31223069
Ok(())
31233070
}
@@ -3137,18 +3084,12 @@ mod tests {
31373084
&schema,
31383085
);
31393086

3140-
let mut context = AnalysisContext::from_statistics(&schema, statistics);
3087+
let context = AnalysisContext::from_statistics(&schema, statistics);
31413088
let predicate_boundaries = gt
3142-
.analyze(&mut context)
3089+
.boundaries(&context)
31433090
.expect("boundaries should not be None");
31443091
assert_eq!(predicate_boundaries.selectivity, Some(0.76));
31453092

3146-
let col_boundaries = a
3147-
.analyze(&mut context)
3148-
.expect("boundaries should not be None");
3149-
assert_eq!(col_boundaries.min_value, ScalarValue::from(25));
3150-
assert_eq!(col_boundaries.max_value, ScalarValue::from(100));
3151-
31523093
Ok(())
31533094
}
31543095

@@ -3171,18 +3112,12 @@ mod tests {
31713112
&schema,
31723113
);
31733114

3174-
let mut context = AnalysisContext::from_statistics(&schema, statistics);
3115+
let context = AnalysisContext::from_statistics(&schema, statistics);
31753116
let predicate_boundaries = gt
3176-
.analyze(&mut context)
3117+
.boundaries(&context)
31773118
.expect("boundaries should not be None");
31783119
assert_eq!(predicate_boundaries.selectivity, Some(0.5));
31793120

3180-
let col_boundaries = a
3181-
.analyze(&mut context)
3182-
.expect("boundaries should not be None");
3183-
assert_eq!(col_boundaries.min_value, ScalarValue::from(1));
3184-
assert_eq!(col_boundaries.max_value, ScalarValue::from(50));
3185-
31863121
Ok(())
31873122
}
31883123
}

datafusion/physical-expr/src/expressions/column.rs

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -104,16 +104,10 @@ impl PhysicalExpr for Column {
104104
}
105105

106106
/// Return the boundaries of this column, if known.
107-
fn analyze(&self, context: &mut AnalysisContext) -> Option<ExprBoundaries> {
107+
fn boundaries(&self, context: &AnalysisContext) -> Option<ExprBoundaries> {
108108
assert!(self.index < context.column_boundaries.len());
109109
context.column_boundaries[self.index].clone()
110110
}
111-
112-
/// Update the boundaries of this column on the given context.
113-
fn apply(&self, context: &mut AnalysisContext, boundaries: &ExprBoundaries) {
114-
assert!(self.index < context.column_boundaries.len());
115-
context.column_boundaries[self.index] = Some(boundaries.clone());
116-
}
117111
}
118112

119113
impl PartialEq<dyn Any> for Column {
@@ -223,10 +217,11 @@ mod test {
223217

224218
(schema, statistics)
225219
}
220+
226221
#[test]
227222
fn stats_bounds_analysis() -> Result<()> {
228223
let (schema, statistics) = get_test_table_stats();
229-
let mut context = AnalysisContext::from_statistics(&schema, statistics);
224+
let context = AnalysisContext::from_statistics(&schema, statistics);
230225

231226
let cases = [
232227
// (name, index, expected boundaries)
@@ -253,7 +248,7 @@ mod test {
253248

254249
for (name, index, expected) in cases {
255250
let col = Column::new(name, index);
256-
let boundaries = col.analyze(&mut context);
251+
let boundaries = col.boundaries(&context);
257252
assert_eq!(boundaries, expected);
258253
}
259254

datafusion/physical-expr/src/expressions/literal.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ impl PhysicalExpr for Literal {
8787
#[allow(unused_variables)]
8888
/// Return the boundaries of this literal expression (which is the same as
8989
/// the value it represents).
90-
fn analyze(&self, context: &mut AnalysisContext) -> Option<ExprBoundaries> {
90+
fn boundaries(&self, context: &AnalysisContext) -> Option<ExprBoundaries> {
9191
Some(ExprBoundaries::new(
9292
self.value.clone(),
9393
self.value.clone(),
@@ -146,10 +146,10 @@ mod tests {
146146
#[test]
147147
fn literal_bounds_analysis() -> Result<()> {
148148
let schema = Schema::new(vec![]);
149-
let mut context = AnalysisContext::new(&schema, vec![]);
149+
let context = AnalysisContext::new(&schema, vec![]);
150150

151151
let literal_expr = lit(42i32);
152-
let boundaries = literal_expr.analyze(&mut context).unwrap();
152+
let boundaries = literal_expr.boundaries(&context).unwrap();
153153
assert_eq!(boundaries.min_value, ScalarValue::Int32(Some(42)));
154154
assert_eq!(boundaries.max_value, ScalarValue::Int32(Some(42)));
155155
assert_eq!(boundaries.distinct_count, Some(1));

datafusion/physical-expr/src/physical_expr.rs

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -79,22 +79,17 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + PartialEq<dyn Any> {
7979
#[allow(unused_variables)]
8080
/// Return the boundaries of this expression. This method (and all the
8181
/// related APIs) are experimental and subject to change.
82-
fn analyze(&self, context: &mut AnalysisContext) -> Option<ExprBoundaries> {
82+
fn boundaries(&self, context: &AnalysisContext) -> Option<ExprBoundaries> {
8383
None
8484
}
85-
86-
#[allow(unused_variables)]
87-
/// Apply the given boundaries to this expression (and its child expressions,
88-
/// if they are relevant to the boundaries).
89-
fn apply(&self, context: &mut AnalysisContext, boundaries: &ExprBoundaries) {}
9085
}
9186

92-
/// A context for collecting and aggregating known boundaries of an expression
93-
/// tree.
87+
/// The shared context used during the analysis of an expression. Includes
88+
/// the boundaries for all known columns.
9489
#[derive(Clone, Debug, PartialEq)]
9590
pub struct AnalysisContext {
96-
/// A list of known column boundaries, ordered by column index
97-
/// in the current schema.
91+
/// A list of known column boundaries, ordered by the index
92+
/// of the column in the current schema.
9893
pub column_boundaries: Vec<Option<ExprBoundaries>>,
9994
}
10095

0 commit comments

Comments
 (0)