
Commit 58330b6

Examples: boundary analysis example for AND/OR conjunctions (#14735)
* feat(examples): boundary analysis example for the case of conjunctions
* feat(docs): supplement the guide with an overview of boundary analysis

  This change adds a short section in the Query Optimizer page of the library
  guide that gives a brief overview of boundary analysis and cardinality
  estimation and their role during query optimization.

* fix(docs): add eol to address prettier issue
* fix(docs): run prettier on doc
* fix(docs): fix doc test example
* fix(docs): fix doc test example
* fix: address typo in example function name
1 parent 1ae06a4 commit 58330b6

2 files changed: +240 -13 lines changed

datafusion-examples/examples/expr_api.rs

Lines changed: 124 additions & 13 deletions
@@ -84,6 +84,9 @@ async fn main() -> Result<()> {
     // See how to analyze boundaries in different kinds of expressions.
     boundary_analysis_and_selectivity_demo()?;

+    // See how boundary analysis works for `AND` & `OR` conjunctions.
+    boundary_analysis_in_conjuctions_demo()?;
+
     // See how to determine the data types of expressions
     expression_type_demo()?;

@@ -279,15 +282,15 @@ fn range_analysis_demo() -> Result<()> {
     Ok(())
 }

-// DataFusion's analysis can infer boundary statistics and selectivity in
-// various situations which can be helpful in building more efficient
-// query plans.
+/// DataFusion's analysis can infer boundary statistics and selectivity in
+/// various situations which can be helpful in building more efficient
+/// query plans.
 fn boundary_analysis_and_selectivity_demo() -> Result<()> {
     // Consider the example where we want all rows with an `id` greater than
     // 5000.
     let id_greater_5000 = col("id").gt_eq(lit(5000i64));

-    // As in most examples we must tell DaataFusion the type of the column.
+    // As in most examples we must tell DataFusion the type of the column.
     let schema = Arc::new(Schema::new(vec![make_field("id", DataType::Int64)]));

     // DataFusion is able to do cardinality estimation on various column types
@@ -312,10 +315,10 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> {
     let df_schema = DFSchema::try_from(schema.clone())?;

     // Analysis case id >= 5000
-    let physical_expr1 =
+    let physical_expr =
         SessionContext::new().create_physical_expr(id_greater_5000, &df_schema)?;
     let analysis = analyze(
-        &physical_expr1,
+        &physical_expr,
         AnalysisContext::new(initial_boundaries.clone()),
         df_schema.as_ref(),
     )?;
@@ -347,14 +350,112 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> {
     Ok(())
 }

-fn make_field(name: &str, data_type: DataType) -> Field {
-    let nullable = false;
-    Field::new(name, data_type, nullable)
-}
+/// This function shows how to think about and leverage the analysis API
+/// to infer boundaries in `AND` & `OR` conjunctions.
+fn boundary_analysis_in_conjuctions_demo() -> Result<()> {
+    // Let us consider the more common case of AND & OR conjunctions.
+    //
+    // age > 18 AND age <= 25
+    let age_between_18_25 = col("age").gt(lit(18i64)).and(col("age").lt_eq(lit(25)));

-fn make_ts_field(name: &str) -> Field {
-    let tz = None;
-    make_field(name, DataType::Timestamp(TimeUnit::Nanosecond, tz))
+    // As always we need to tell DataFusion the type of the column.
+    let schema = Arc::new(Schema::new(vec![make_field("age", DataType::Int64)]));
+
+    // Similarly to the example in `boundary_analysis_and_selectivity_demo` we
+    // can establish column statistics that can be used to describe certain
+    // column properties.
+    let column_stats = ColumnStatistics {
+        null_count: Precision::Exact(0),
+        max_value: Precision::Exact(ScalarValue::Int64(Some(79))),
+        min_value: Precision::Exact(ScalarValue::Int64(Some(14))),
+        sum_value: Precision::Absent,
+        distinct_count: Precision::Absent,
+    };
+
+    let initial_boundaries =
+        vec![ExprBoundaries::try_from_column(&schema, &column_stats, 0)?];
+
+    // Before we run the analysis pass; let us describe what we can infer from
+    // the initial information.
+    //
+    // To recap, the expression is `age > 18 AND age <= 25`.
+    //
+    // The column `age` can take any value in the `Int64` range.
+    //
+    // But using the `min`, `max` statistics we can reduce that initial range
+    // to `[min_value, max_value]` which is [14, 79].
+    //
+    // During analysis, when evaluating, let's say the left-hand side of the `AND`
+    // expression, we know that `age` must be greater than 18. Therefore our range
+    // is now [19, 79].
+    // And by evaluating the right-hand side we can get an upper bound, allowing
+    // us to infer that `age` must be in the range [19, 25] inclusive.
+    let df_schema = DFSchema::try_from(schema.clone())?;
+
+    let physical_expr =
+        SessionContext::new().create_physical_expr(age_between_18_25, &df_schema)?;
+    let analysis = analyze(
+        &physical_expr,
+        // We re-use initial_boundaries elsewhere so we must clone it.
+        AnalysisContext::new(initial_boundaries.clone()),
+        df_schema.as_ref(),
+    )?;
+
+    // We can check that DataFusion's analysis inferred the same bounds.
+    assert_eq!(
+        analysis.boundaries.first().map(|boundary| boundary
+            .interval
+            .clone()
+            .unwrap()
+            .into_bounds()),
+        Some((ScalarValue::Int64(Some(19)), ScalarValue::Int64(Some(25))))
+    );
+
+    // We can also infer the selectivity using the same approach as before.
+    //
+    // Granted a column such as age will more likely follow a Normal distribution
+    // as such our selectivity estimation will not be as good as it can.
+    assert!(analysis
+        .selectivity
+        .is_some_and(|selectivity| (0.1..=0.2).contains(&selectivity)));
+
+    // The above example was a good way to look at how we can derive better
+    // interval and get a lower selectivity during boundary analysis.
+    //
+    // But `AND` conjunctions are easier to reason with because their interval
+    // arithmetic follows naturally from set intersection operations, let us
+    // now look at an example that is a tad more complicated `OR` conjunctions.
+
+    // The expression we will look at is `age > 60 OR age <= 18`.
+    let age_greater_than_60_less_than_18 =
+        col("age").gt(lit(64i64)).or(col("age").lt_eq(lit(18i64)));
+
+    // We can re-use the same schema, initial boundaries and column statistics
+    // described above. So let's think about this for a bit.
+    //
+    // Initial range: [14, 79] as described in our column statistics.
+    //
+    // From the left-hand side and right-hand side of our `OR` conjunctions
+    // we end up with two ranges, instead of just one.
+    //
+    // - age > 60: [61, 79]
+    // - age <= 18: [14, 18]
+    //
+    // Thus the range of possible values the `age` column might take is a
+    // union of both sets [14, 18] U [61, 79].
+    let physical_expr = SessionContext::new()
+        .create_physical_expr(age_greater_than_60_less_than_18, &df_schema)?;
+
+    // Since we don't handle interval arithmetic for `OR` operator this will error out.
+    let analysis = analyze(
+        &physical_expr,
+        AnalysisContext::new(initial_boundaries),
+        df_schema.as_ref(),
+    );
+
+    assert!(analysis.is_err());
+
+    Ok(())
 }

 /// This function shows how to use `Expr::get_type` to retrieve the DataType
@@ -494,3 +595,13 @@ fn type_coercion_demo() -> Result<()> {

     Ok(())
 }
+
+fn make_field(name: &str, data_type: DataType) -> Field {
+    let nullable = false;
+    Field::new(name, data_type, nullable)
+}
+
+fn make_ts_field(name: &str) -> Field {
+    let tz = None;
+    make_field(name, DataType::Timestamp(TimeUnit::Nanosecond, tz))
+}
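A note on the `OR` case above: the demo deliberately ends with `assert!(analysis.is_err())` because `analyze` does not yet propagate intervals through an `OR` conjunction. As a rough, hypothetical sketch of the workaround implied by the comments (a union of two disjoint ranges), one could analyze each side of the disjunction separately against the same initial boundaries and combine the two estimates by hand. This is not part of the commit and not a DataFusion API for interval unions; the helper name `estimate_or_selectivity` is invented here, and summing the selectivities is only a reasonable approximation because the two ranges, `[14, 18]` and `[61, 79]`, are disjoint.

```rust
use std::sync::Arc;

use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::common::stats::Precision;
use datafusion::common::{ColumnStatistics, DFSchema, ScalarValue};
use datafusion::error::Result;
use datafusion::physical_expr::{analyze, AnalysisContext, ExprBoundaries};
use datafusion::prelude::*;

/// Hypothetical helper: approximate the selectivity of `left OR right` by
/// analyzing each side on its own and summing the two selectivities. This is
/// only a sensible approximation when the predicates select disjoint ranges,
/// as in `age > 60 OR age <= 18`.
fn estimate_or_selectivity(
    left: Expr,
    right: Expr,
    schema: &Arc<Schema>,
    stats: &ColumnStatistics,
) -> Result<Option<f64>> {
    let df_schema = DFSchema::try_from(schema.clone())?;
    let ctx = SessionContext::new();

    let mut total = 0.0;
    for expr in [left, right] {
        // Each branch starts from the same column statistics.
        let boundaries = vec![ExprBoundaries::try_from_column(schema, stats, 0)?];
        let physical_expr = ctx.create_physical_expr(expr, &df_schema)?;
        let analysis = analyze(
            &physical_expr,
            AnalysisContext::new(boundaries),
            df_schema.as_ref(),
        )?;
        match analysis.selectivity {
            Some(s) => total += s,
            None => return Ok(None),
        }
    }
    // Clamp in case the disjointness assumption does not hold exactly.
    Ok(Some(total.min(1.0)))
}

fn main() -> Result<()> {
    // Same column and statistics as the example: age in [14, 79], no nulls.
    let schema = Arc::new(Schema::new(vec![Field::new("age", DataType::Int64, false)]));
    let stats = ColumnStatistics {
        null_count: Precision::Exact(0),
        max_value: Precision::Exact(ScalarValue::Int64(Some(79))),
        min_value: Precision::Exact(ScalarValue::Int64(Some(14))),
        sum_value: Precision::Absent,
        distinct_count: Precision::Absent,
    };

    // age > 60 OR age <= 18, analyzed branch by branch.
    let selectivity = estimate_or_selectivity(
        col("age").gt(lit(60i64)),
        col("age").lt_eq(lit(18i64)),
        &schema,
        &stats,
    )?;
    println!("approximate OR selectivity: {selectivity:?}");
    Ok(())
}
```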

docs/source/library-user-guide/query-optimizer.md

Lines changed: 116 additions & 0 deletions
@@ -388,3 +388,119 @@ In the following example, the `type_coercion` and `simplify_expressions` passes
 ```

 [df]: https://crates.io/crates/datafusion
+
+## Thinking about Query Optimization
+
+Query optimization in DataFusion uses a cost-based model. The cost-based model
+relies on table and column level statistics to estimate selectivity; selectivity
+estimates are an important piece in cost analysis for filters and projections
+as they allow estimating the cost of joins and filters.
+
+An important piece of building these estimates is _boundary analysis_ which uses
+interval arithmetic to take an expression such as `a > 2500 AND a <= 5000` and
+build an accurate selectivity estimate that can then be used to find more efficient
+plans.
+
+#### `AnalysisContext` API
+
+The `AnalysisContext` serves as a shared knowledge base during expression evaluation
+and boundary analysis. Think of it as a dynamic repository that maintains information about:
+
+1. Current known boundaries for columns and expressions
+2. Statistics that have been gathered or inferred
+3. A mutable state that can be updated as analysis progresses
+
+What makes `AnalysisContext` particularly powerful is its ability to propagate information
+through the expression tree. As each node in the expression tree is analyzed, it can both
+read from and write to this shared context, allowing for sophisticated boundary analysis and inference.
+
+#### `ColumnStatistics` for Cardinality Estimation
+
+Column statistics form the foundation of optimization decisions. Rather than just tracking
+simple metrics, DataFusion's `ColumnStatistics` provides a rich set of information including:
+
+- Null value counts
+- Maximum and minimum values
+- Value sums (for numeric columns)
+- Distinct value counts
+
+Each of these statistics is wrapped in a `Precision` type that indicates whether the value is
+exact or estimated, allowing the optimizer to make informed decisions about the reliability
+of its cardinality estimates.
+
+### Boundary Analysis Flow
+
+The boundary analysis process flows through several stages, with each stage building
+upon the information gathered in previous stages. The `AnalysisContext` is continuously
+updated as the analysis progresses through the expression tree.
+
+#### Expression Boundary Analysis
+
+When analyzing expressions, DataFusion runs boundary analysis using interval arithmetic.
+Consider a simple predicate like `age > 18 AND age <= 25`. The analysis flows as follows:
+
+1. Context Initialization
+
+   - Begin with known column statistics
+   - Set up initial boundaries based on column constraints
+   - Initialize the shared analysis context
+
+2. Expression Tree Walk
+
+   - Analyze each node in the expression tree
+   - Propagate boundary information upward
+   - Allow child nodes to influence parent boundaries
+
+3. Boundary Updates
+   - Each expression can update the shared context
+   - Changes flow through the entire expression tree
+   - Final boundaries inform optimization decisions
+
+### Working with the analysis API
+
+The following example shows how you can run an analysis pass on a physical expression
+to infer the selectivity of the expression and the space of possible values it can
+take.
+
+```rust
+# use std::sync::Arc;
+# use datafusion::prelude::*;
+# use datafusion::physical_expr::{analyze, AnalysisContext, ExprBoundaries};
+# use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+# use datafusion::common::stats::Precision;
+#
+# use datafusion::common::{ColumnStatistics, DFSchema};
+# use datafusion::common::{ScalarValue, ToDFSchema};
+# use datafusion::error::Result;
+fn analyze_filter_example() -> Result<()> {
+    // Create a schema with an 'age' column
+    let age = Field::new("age", DataType::Int64, false);
+    let schema = Arc::new(Schema::new(vec![age]));
+
+    // Define column statistics
+    let column_stats = ColumnStatistics {
+        null_count: Precision::Exact(0),
+        max_value: Precision::Exact(ScalarValue::Int64(Some(79))),
+        min_value: Precision::Exact(ScalarValue::Int64(Some(14))),
+        distinct_count: Precision::Absent,
+        sum_value: Precision::Absent,
+    };
+
+    // Create expression: age > 18 AND age <= 25
+    let expr = col("age")
+        .gt(lit(18i64))
+        .and(col("age").lt_eq(lit(25i64)));
+
+    // Initialize analysis context
+    let initial_boundaries = vec![ExprBoundaries::try_from_column(
+        &schema, &column_stats, 0)?];
+    let context = AnalysisContext::new(initial_boundaries);
+
+    // Analyze expression
+    let df_schema = DFSchema::try_from(schema)?;
+    let physical_expr = SessionContext::new().create_physical_expr(expr, &df_schema)?;
+    let analysis = analyze(&physical_expr, context, df_schema.as_ref())?;
+
+    Ok(())
+}
+```
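The documentation example above stops right after calling `analyze` and discards the result. As a hedged continuation that is not part of the commit, the sketch below shows how that result could be inspected, assuming (as the imports above and the `expr_api.rs` demo suggest) that `analyze` returns an `AnalysisContext` whose public `boundaries` and `selectivity` fields carry the inferred per-column intervals and the estimated fraction of matching rows. The helper name `describe_analysis` is invented for illustration.

```rust
use datafusion::physical_expr::AnalysisContext;

/// Hypothetical helper: print what an analysis pass inferred. `analysis` is
/// the kind of value returned by `analyze` in `analyze_filter_example` above.
fn describe_analysis(analysis: &AnalysisContext) {
    // One boundary entry per column of the input schema.
    for (i, boundary) in analysis.boundaries.iter().enumerate() {
        if let Some(interval) = &boundary.interval {
            let (lower, upper) = interval.clone().into_bounds();
            println!("column {i}: inferred range [{lower}, {upper}]");
        }
    }
    // The estimated fraction of input rows that satisfy the predicate.
    if let Some(selectivity) = analysis.selectivity {
        println!("estimated selectivity: {selectivity:.3}");
    }
}
```

Called as `describe_analysis(&analysis)` just before the final `Ok(())`, it would be expected to report a range of roughly `[19, 25]` for the `age > 18 AND age <= 25` predicate and a selectivity somewhere around 0.1–0.2, consistent with the assertions in the `expr_api.rs` example.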
