apache
diff --git a/‎datafusion/core/src/physical_plan/filter.rs
Lines changed: 16 additions & 14 deletions b/‎datafusion/core/src/physical_plan/filter.rs
Lines changed: 16 additions & 14 deletions
diff --git a/‎datafusion/physical-expr/src/expressions/binary.rs
Lines changed: 113 additions & 81 deletions b/‎datafusion/physical-expr/src/expressions/binary.rs
Lines changed: 113 additions & 81 deletions
@@ -172,16 +172,13 @@ impl ExecutionPlan for FilterExec {
     /// predicate's selectivity value can be determined for the incoming data.
     fn statistics(&self) -> Statistics {
         let input_stats = self.input.statistics();
-        let mut analysis_ctx =
+        let starter_ctx =
             AnalysisContext::from_statistics(self.input.schema().as_ref(), &input_stats);
 
-        let predicate_selectivity = self
-            .predicate
-            .boundaries(&mut analysis_ctx)
-            .and_then(|bounds| bounds.selectivity);
+        let analysis_ctx = self.predicate.analyze(starter_ctx);
 
-        match predicate_selectivity {
-            Some(selectivity) => {
+        match analysis_ctx.boundaries {
+            Some(boundaries) => {
                 // Build back the column level statistics from the boundaries inside the
                 // analysis context. It is possible that these are going to be different
                 // than the input statistics, especially when a comparison is made inside
@@ -200,14 +197,19 @@ impl ExecutionPlan for FilterExec {
                     .collect();
 
                 Statistics {
-                    num_rows: input_stats
-                        .num_rows
-                        .map(|num_rows| (num_rows as f64 * selectivity).ceil() as usize),
+                    num_rows: input_stats.num_rows.zip(boundaries.selectivity).map(
+                        |(num_rows, selectivity)| {
+                            (num_rows as f64 * selectivity).ceil() as usize
+                        },
+                    ),
+                    total_byte_size: input_stats
+                        .total_byte_size
+                        .zip(boundaries.selectivity)
+                        .map(|(num_rows, selectivity)| {
+                            (num_rows as f64 * selectivity).ceil() as usize
+                        }),
                     column_statistics: Some(column_statistics),
-                    total_byte_size: input_stats.total_byte_size.map(|total_byte_size| {
-                        (total_byte_size as f64 * selectivity).ceil() as usize
-                    }),
-                    ..Default::default(),
+                    ..Default::default()
                 }
             }
             None => Statistics::default(),
 
@@ -73,7 +73,7 @@ use arrow::record_batch::RecordBatch;
 
 use super::column::Column;
 use crate::physical_expr::down_cast_any_ref;
-use crate::{AnalysisContext, ExprBoundaries, PhysicalExpr};
+use crate::{analysis_expect, AnalysisContext, ExprBoundaries, PhysicalExpr};
 use datafusion_common::cast::{as_boolean_array, as_decimal128_array};
 use datafusion_common::ScalarValue;
 use datafusion_common::{DataFusionError, Result};
@@ -659,7 +659,7 @@ impl PhysicalExpr for BinaryExpr {
     }
 
     /// Return the boundaries of this binary expression's result.
-    fn boundaries(&self, context: &mut AnalysisContext) -> Option<ExprBoundaries> {
+    fn analyze(&self, context: AnalysisContext) -> AnalysisContext {
         match &self.op {
             Operator::Eq
             | Operator::Gt
@@ -668,26 +668,42 @@ impl PhysicalExpr for BinaryExpr {
             | Operator::GtEq => {
                 // We currently only support comparison when we know at least one of the sides are
                 // a known value (a scalar). This includes predicates like a > 20 or 5 > a.
-                let left_boundaries = self.left.boundaries(context)?;
-                let right_boundaries = self.right.boundaries(context)?;
-                let (op, left, right) = match right_boundaries.reduce() {
-                    Some(right_value) => {
+                let context = self.left.analyze(context);
+                let left_boundaries =
+                    analysis_expect!(context, context.boundaries()).clone();
+
+                let context = self.right.analyze(context);
+                let right_boundaries =
+                    analysis_expect!(context, context.boundaries.clone());
+
+                match (left_boundaries.reduce(), right_boundaries.reduce()) {
+                    (_, Some(right_value)) => {
                         // We know the right side is a scalar, so we can use the operator as is
-                        (self.op, self.left.clone(), right_value)
+                        analyze_expr_scalar_comparison(
+                            context,
+                            &self.op,
+                            &self.left,
+                            right_value,
+                        )
                     }
-                    None => {
+                    (Some(left_value), _) => {
                         // If not, we have to swap the operator and left/right (since this means
                         // left has to be a scalar).
-                        (
-                            self.op.swap()?,
-                            self.right.clone(),
-                            left_boundaries.reduce()?,
+                        let swapped_op = analysis_expect!(context, self.op.swap());
+                        analyze_expr_scalar_comparison(
+                            context,
+                            &swapped_op,
+                            &self.right,
+                            left_value,
                         )
                     }
-                };
-                analyze_expr_scalar_comparison(&op, context, &left, right)
+                    _ => {
+                        // Both sides are columns, so we give up.
+                        context.with_boundaries(None)
+                    }
+                }
             }
-            _ => None,
+            _ => context.with_boundaries(None),
         }
     }
 }
@@ -705,12 +721,12 @@ impl PartialEq<dyn Any> for BinaryExpr {
 // (on the right). The new boundaries will indicate whether it is always true, always
 // false, or unknown (with a probablistic selectivity value attached).
 fn analyze_expr_scalar_comparison(
+    context: AnalysisContext,
     op: &Operator,
-    context: &mut AnalysisContext,
     left: &Arc<dyn PhysicalExpr>,
     right: ScalarValue,
-) -> Option<ExprBoundaries> {
-    let left_bounds = left.boundaries(context)?;
+) -> AnalysisContext {
+    let left_bounds = analysis_expect!(context, left.analyze(context.clone()).boundaries);
     let left_min = left_bounds.min_value;
     let left_max = left_bounds.max_value;
 
@@ -738,68 +754,74 @@ fn analyze_expr_scalar_comparison(
     assert!(!(always_selects && never_selects));
 
     let selectivity = match (always_selects, never_selects) {
-        (true, _) => Some(1.0),
-        (_, true) => Some(0.0),
+        (true, _) => 1.0,
+        (_, true) => 0.0,
         (false, false) => {
             // If there is a partial overlap, then we can estimate the selectivity
             // by computing the ratio of the existing overlap to the total range. Since we
             // currently don't have access to a value distribution histogram, the part below
             // assumes a uniform distribution by default.
 
             // Our [min, max] is inclusive, so we need to add 1 to the difference.
-            let total_range = left_max.distance(&left_min)? + 1;
-            let overlap_between_boundaries = match op {
-                Operator::Lt => right.distance(&left_min)?,
-                Operator::Gt => left_max.distance(&right)?,
-                Operator::LtEq => right.distance(&left_min)? + 1,
-                Operator::GtEq => left_max.distance(&right)? + 1,
-                Operator::Eq => 1,
-                _ => unreachable!(),
-            };
+            let total_range = analysis_expect!(context, left_max.distance(&left_min)) + 1;
+            let overlap_between_boundaries = analysis_expect!(
+                context,
+                match op {
+                    Operator::Lt => right.distance(&left_min),
+                    Operator::Gt => left_max.distance(&right),
+                    Operator::LtEq => right.distance(&left_min).map(|dist| dist + 1),
+                    Operator::GtEq => left_max.distance(&right).map(|dist| dist + 1),
+                    Operator::Eq => Some(1),
+                    _ => None,
+                }
+            );
 
-            Some(overlap_between_boundaries as f64 / total_range as f64)
+            overlap_between_boundaries as f64 / total_range as f64
         }
-    }?;
+    };
 
     // The context represents all the knowledge we have gathered during the
     // analysis process, which we can now add more since the expression's upper
     // and lower boundaries might have changed.
-    if let Some(column_expr) = left.as_any().downcast_ref::<Column>() {
-        let (left_min, left_max) = match op {
-            // TODO: for lt/gt, we technically should shrink the possibility space
-            // by one since a < 5 means that 5 is not a possible value for `a`. However,
-            // it is currently tricky to do so (e.g. for floats, we can get away with 4.999
-            // so we need a smarter logic to find out what is the closest value that is
-            // different from the scalar_value).
-            Operator::Lt | Operator::LtEq => {
-                // We only want to update the upper bound when we know it will help us (e.g.
-                // it is actually smaller than what we have right now) and it is a valid
-                // value (e.g. [0, 100] < -100 would update the boundaries to [0, -100] if
-                // there weren't the selectivity check).
-                if right < left_max && selectivity > 0.0 {
-                    (left_min, right)
-                } else {
-                    (left_min, left_max)
+    let context = match left.as_any().downcast_ref::<Column>() {
+        Some(column_expr) => {
+            let (left_min, left_max) = match op {
+                // TODO: for lt/gt, we technically should shrink the possibility space
+                // by one since a < 5 means that 5 is not a possible value for `a`. However,
+                // it is currently tricky to do so (e.g. for floats, we can get away with 4.999
+                // so we need a smarter logic to find out what is the closest value that is
+                // different from the scalar_value).
+                Operator::Lt | Operator::LtEq => {
+                    // We only want to update the upper bound when we know it will help us (e.g.
+                    // it is actually smaller than what we have right now) and it is a valid
+                    // value (e.g. [0, 100] < -100 would update the boundaries to [0, -100] if
+                    // there weren't the selectivity check).
+                    if right < left_max && selectivity > 0.0 {
+                        (left_min, right)
+                    } else {
+                        (left_min, left_max)
+                    }
                 }
-            }
-            Operator::Gt | Operator::GtEq => {
-                // Same as above, but this time we want to limit the lower bound.
-                if right > left_min && selectivity > 0.0 {
-                    (right, left_max)
-                } else {
-                    (left_min, left_max)
+                Operator::Gt | Operator::GtEq => {
+                    // Same as above, but this time we want to limit the lower bound.
+                    if right > left_min && selectivity > 0.0 {
+                        (right, left_max)
+                    } else {
+                        (left_min, left_max)
+                    }
                 }
-            }
-            // For equality, we don't have the range problem so even if the selectivity
-            // is 0.0, we can still update the boundaries.
-            Operator::Eq => (right.clone(), right),
-            _ => unreachable!(),
-        };
+                // For equality, we don't have the range problem so even if the selectivity
+                // is 0.0, we can still update the boundaries.
+                Operator::Eq => (right.clone(), right),
+                _ => unreachable!(),
+            };
 
-        let left_bounds =
-            ExprBoundaries::new(left_min, left_max, left_bounds.distinct_count);
-        context.update_column(column_expr.index(), left_bounds);
-    }
+            let left_bounds =
+                ExprBoundaries::new(left_min, left_max, left_bounds.distinct_count);
+            context.with_column_update(column_expr.index(), left_bounds)
+        }
+        None => context,
+    };
 
     // The selectivity can't be be greater than 1.0.
     assert!(selectivity <= 1.0);
@@ -810,12 +832,13 @@ fn analyze_expr_scalar_comparison(
         _ => (false, true, 2),
     };
 
-    Some(ExprBoundaries::new_with_selectivity(
+    let result_boundaries = Some(ExprBoundaries::new_with_selectivity(
         ScalarValue::Boolean(Some(pred_min)),
         ScalarValue::Boolean(Some(pred_max)),
         Some(pred_distinct),
         Some(selectivity),
-    ))
+    ));
+    context.with_boundaries(result_boundaries)
 }
 
 /// unwrap underlying (non dictionary) value, if any, to pass to a scalar kernel
@@ -3151,12 +3174,15 @@ mod tests {
         ];
 
         for ((operator, rhs), (exp_selectivity, exp_min, exp_max)) in cases {
-            let mut context = AnalysisContext::from_statistics(&schema, &statistics);
+            let context = AnalysisContext::from_statistics(&schema, &statistics);
             let left = col("a", &schema).unwrap();
             let right = ScalarValue::Int64(Some(rhs));
-            let boundaries =
-                analyze_expr_scalar_comparison(&operator, &mut context, &left, right)
-                    .expect("this case should not return None");
+            let analysis_ctx =
+                analyze_expr_scalar_comparison(context, &operator, &left, right);
+            let boundaries = analysis_ctx
+                .clone()
+                .boundaries
+                .expect("Analysis must complete for this test!");
 
             assert_eq!(
                 boundaries
@@ -3182,14 +3208,14 @@ mod tests {
             // For getting the updated boundaries, we can simply analyze the LHS
             // with the existing context.
             let left_boundaries = left
-                .boundaries(&mut context)
+                .analyze(analysis_ctx)
+                .boundaries
                 .expect("this case should not return None");
             assert_eq!(left_boundaries.min_value, ScalarValue::Int64(Some(exp_min)));
             assert_eq!(left_boundaries.max_value, ScalarValue::Int64(Some(exp_max)));
         }
         Ok(())
     }
-
     #[test]
     fn test_comparison_result_estimate_different_type() -> Result<()> {
         // A table where the column 'a' has a min of 1.3, a max of 50.7.
@@ -3228,12 +3254,15 @@ mod tests {
         ];
 
         for ((operator, rhs), (exp_selectivity, exp_min, exp_max)) in cases {
-            let mut context = AnalysisContext::from_statistics(&schema, &statistics);
+            let context = AnalysisContext::from_statistics(&schema, &statistics);
             let left = col("a", &schema).unwrap();
             let right = ScalarValue::from(rhs);
-            let boundaries =
-                analyze_expr_scalar_comparison(&operator, &mut context, &left, right)
-                    .expect("this case should not return None");
+            let analysis_ctx =
+                analyze_expr_scalar_comparison(context, &operator, &left, right);
+            let boundaries = analysis_ctx
+                .clone()
+                .boundaries
+                .expect("Analysis must complete for this test!");
 
             assert_eq!(
                 boundaries
@@ -3257,7 +3286,8 @@ mod tests {
             }
 
             let left_boundaries = left
-                .boundaries(&mut context)
+                .analyze(analysis_ctx)
+                .boundaries
                 .expect("this case should not return None");
             assert_eq!(
                 left_boundaries.min_value,
@@ -3286,9 +3316,10 @@ mod tests {
             &schema,
         );
 
-        let mut context = AnalysisContext::from_statistics(&schema, &statistics);
+        let context = AnalysisContext::from_statistics(&schema, &statistics);
         let predicate_boundaries = gt
-            .boundaries(&mut context)
+            .analyze(context)
+            .boundaries
             .expect("boundaries should not be None");
         assert_eq!(predicate_boundaries.selectivity, Some(0.76));
 
@@ -3314,9 +3345,10 @@ mod tests {
             &schema,
         );
 
-        let mut context = AnalysisContext::from_statistics(&schema, &statistics);
+        let context = AnalysisContext::from_statistics(&schema, &statistics);
         let predicate_boundaries = gt
-            .boundaries(&mut context)
+            .analyze(context)
+            .boundaries
             .expect("boundaries should not be None");
         assert_eq!(predicate_boundaries.selectivity, Some(0.5));