@@ -84,6 +84,9 @@ async fn main() -> Result<()> {
    // See how to analyze boundaries in different kinds of expressions.
    boundary_analysis_and_selectivity_demo()?;

+    // See how boundary analysis works for `AND` & `OR` conjunctions.
+    boundary_analysis_in_conjuctions_demo()?;
+
    // See how to determine the data types of expressions
    expression_type_demo()?;
@@ -279,15 +282,15 @@ fn range_analysis_demo() -> Result<()> {
    Ok(())
}

-// DataFusion's analysis can infer boundary statistics and selectivity in
-// various situations which can be helpful in building more efficient
-// query plans.
+/// DataFusion's analysis can infer boundary statistics and selectivity in
+/// various situations which can be helpful in building more efficient
+/// query plans.
fn boundary_analysis_and_selectivity_demo() -> Result<()> {
    // Consider the example where we want all rows with an `id` greater than
    // 5000.
    let id_greater_5000 = col("id").gt_eq(lit(5000i64));

-    // As in most examples we must tell DaataFusion the type of the column.
+    // As in most examples we must tell DataFusion the type of the column.
    let schema = Arc::new(Schema::new(vec![make_field("id", DataType::Int64)]));

    // DataFusion is able to do cardinality estimation on various column types
@@ -312,10 +315,10 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> {
    let df_schema = DFSchema::try_from(schema.clone())?;

    // Analysis case id >= 5000
-    let physical_expr1 =
+    let physical_expr =
        SessionContext::new().create_physical_expr(id_greater_5000, &df_schema)?;
    let analysis = analyze(
-        &physical_expr1,
+        &physical_expr,
        AnalysisContext::new(initial_boundaries.clone()),
        df_schema.as_ref(),
    )?;
@@ -347,14 +350,112 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> {
    Ok(())
}

-fn make_field(name: &str, data_type: DataType) -> Field {
-    let nullable = false;
-    Field::new(name, data_type, nullable)
-}
+/// This function shows how to think about and leverage the analysis API
+/// to infer boundaries in `AND` & `OR` conjunctions.
+fn boundary_analysis_in_conjuctions_demo() -> Result<()> {
+    // Let us consider the more common case of AND & OR conjunctions.
+    //
+    // age > 18 AND age <= 25
+    let age_between_18_25 = col("age").gt(lit(18i64)).and(col("age").lt_eq(lit(25)));

-fn make_ts_field(name: &str) -> Field {
-    let tz = None;
-    make_field(name, DataType::Timestamp(TimeUnit::Nanosecond, tz))
+    // As always we need to tell DataFusion the type of the column.
+    let schema = Arc::new(Schema::new(vec![make_field("age", DataType::Int64)]));
+
+    // Similarly to the example in `boundary_analysis_and_selectivity_demo` we
+    // can establish column statistics that can be used to describe certain
+    // column properties.
+    let column_stats = ColumnStatistics {
+        null_count: Precision::Exact(0),
+        max_value: Precision::Exact(ScalarValue::Int64(Some(79))),
+        min_value: Precision::Exact(ScalarValue::Int64(Some(14))),
+        sum_value: Precision::Absent,
+        distinct_count: Precision::Absent,
+    };
+
+    let initial_boundaries =
+        vec![ExprBoundaries::try_from_column(&schema, &column_stats, 0)?];
+
+    // Before we run the analysis pass, let us describe what we can infer from
+    // the initial information.
+    //
+    // To recap, the expression is `age > 18 AND age <= 25`.
+    //
+    // The column `age` can take any value in the `Int64` range.
+    //
+    // But using the `min` and `max` statistics we can reduce that initial range
+    // to `[min_value, max_value]`, which is [14, 79].
+    //
+    // During analysis, evaluating the left-hand side of the `AND` expression
+    // tells us that `age` must be greater than 18, so our range is now [19, 79].
+    // Evaluating the right-hand side gives us an upper bound, allowing us to
+    // infer that `age` must be in the range [19, 25] inclusive.
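+    //
+    // Viewed as interval arithmetic over integers, the `AND` is an intersection:
+    //   [19, +inf) ∩ (-inf, 25] ∩ [14, 79] = [19, 25]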
+    let df_schema = DFSchema::try_from(schema.clone())?;
+
+    let physical_expr =
+        SessionContext::new().create_physical_expr(age_between_18_25, &df_schema)?;
+    let analysis = analyze(
+        &physical_expr,
+        // We re-use initial_boundaries elsewhere so we must clone it.
+        AnalysisContext::new(initial_boundaries.clone()),
+        df_schema.as_ref(),
+    )?;
+
+    // We can check that DataFusion's analysis inferred the same bounds.
+    assert_eq!(
+        analysis.boundaries.first().map(|boundary| boundary
+            .interval
+            .clone()
+            .unwrap()
+            .into_bounds()),
+        Some((ScalarValue::Int64(Some(19)), ScalarValue::Int64(Some(25))))
+    );
+
+    // We can also infer the selectivity using the same approach as before.
+    //
+    // Granted, a column such as `age` is more likely to follow a Normal
+    // distribution, so our selectivity estimate will not be as accurate as it
+    // could be.
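+    //
+    // As a rough sanity check (assuming uniformly distributed values): the
+    // inferred range [19, 25] covers 7 of the 66 possible values in [14, 79],
+    // i.e. a selectivity of roughly 0.106, which is why the assertion below
+    // checks the 0.1..=0.2 range.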
+    assert!(analysis
+        .selectivity
+        .is_some_and(|selectivity| (0.1..=0.2).contains(&selectivity)));
+
+    // The above example showed how boundary analysis can derive a tighter
+    // interval and a lower selectivity estimate.
+    //
+    // But `AND` conjunctions are easier to reason about because their interval
+    // arithmetic follows naturally from set intersection. Let us now look at
+    // something a tad more complicated: `OR` conjunctions.
+
+    // The expression we will look at is `age > 60 OR age <= 18`.
+    let age_greater_than_60_less_than_18 =
+        col("age").gt(lit(60i64)).or(col("age").lt_eq(lit(18i64)));
+
+    // We can re-use the same schema, initial boundaries and column statistics
+    // described above. So let's think about this for a bit.
+    //
+    // Initial range: [14, 79], as described by our column statistics.
+    //
+    // From the left-hand side and right-hand side of our `OR` conjunction
+    // we end up with two ranges instead of just one:
+    //
+    // - age > 60: [61, 79]
+    // - age <= 18: [14, 18]
+    //
+    // Thus the range of possible values the `age` column might take is the
+    // union of both sets, [14, 18] U [61, 79].
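+    //
+    // Note that no single contiguous interval can represent this union exactly;
+    // the tightest single interval covering both sets is [14, 79], which is no
+    // better than the initial range.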
+    let physical_expr = SessionContext::new()
+        .create_physical_expr(age_greater_than_60_less_than_18, &df_schema)?;
+
+    // Since interval arithmetic is not implemented for the `OR` operator, this
+    // analysis will error out.
+    let analysis = analyze(
+        &physical_expr,
+        AnalysisContext::new(initial_boundaries),
+        df_schema.as_ref(),
+    );
+
+    assert!(analysis.is_err());
+
+    Ok(())
}

/// This function shows how to use `Expr::get_type` to retrieve the DataType
@@ -494,3 +595,13 @@ fn type_coercion_demo() -> Result<()> {

    Ok(())
}
+
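+/// Creates a non-nullable `Field` with the given name and data type.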
+fn make_field(name: &str, data_type: DataType) -> Field {
+    let nullable = false;
+    Field::new(name, data_type, nullable)
+}
+
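+/// Creates a non-nullable nanosecond-precision `Timestamp` field with no time zone.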
+fn make_ts_field(name: &str) -> Field {
+    let tz = None;
+    make_field(name, DataType::Timestamp(TimeUnit::Nanosecond, tz))
+}