Skip to content

Commit f8063e8

Browse files
gatesnalamb
andauthored
Add ColumnStatistics::Sum (#14074)
* Add sum statistic * Add sum statistic * Add sum statistic * Add sum statistic * Add sum statistic * Add sum statistic * Add tests and Cargo fmt * fix up --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent d051731 commit f8063e8

File tree

17 files changed

+269
-18
lines changed

17 files changed

+269
-18
lines changed

datafusion/common/src/stats.rs

Lines changed: 161 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use std::fmt::{self, Debug, Display};
2121

2222
use crate::{Result, ScalarValue};
2323

24-
use arrow_schema::{Schema, SchemaRef};
24+
use arrow_schema::{DataType, Schema, SchemaRef};
2525

2626
/// Represents a value with a degree of certainty. `Precision` is used to
2727
/// propagate information the precision of statistical values.
@@ -170,24 +170,63 @@ impl Precision<ScalarValue> {
170170
pub fn add(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
171171
match (self, other) {
172172
(Precision::Exact(a), Precision::Exact(b)) => {
173-
if let Ok(result) = a.add(b) {
174-
Precision::Exact(result)
175-
} else {
176-
Precision::Absent
177-
}
173+
a.add(b).map(Precision::Exact).unwrap_or(Precision::Absent)
178174
}
179175
(Precision::Inexact(a), Precision::Exact(b))
180176
| (Precision::Exact(a), Precision::Inexact(b))
181-
| (Precision::Inexact(a), Precision::Inexact(b)) => {
182-
if let Ok(result) = a.add(b) {
183-
Precision::Inexact(result)
184-
} else {
185-
Precision::Absent
186-
}
177+
| (Precision::Inexact(a), Precision::Inexact(b)) => a
178+
.add(b)
179+
.map(Precision::Inexact)
180+
.unwrap_or(Precision::Absent),
181+
(_, _) => Precision::Absent,
182+
}
183+
}
184+
185+
/// Calculates the difference of two (possibly inexact) [`ScalarValue`] values,
186+
/// conservatively propagating exactness information. If one of the input
187+
/// values is [`Precision::Absent`], the result is `Absent` too.
188+
pub fn sub(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
189+
match (self, other) {
190+
(Precision::Exact(a), Precision::Exact(b)) => {
191+
a.sub(b).map(Precision::Exact).unwrap_or(Precision::Absent)
187192
}
193+
(Precision::Inexact(a), Precision::Exact(b))
194+
| (Precision::Exact(a), Precision::Inexact(b))
195+
| (Precision::Inexact(a), Precision::Inexact(b)) => a
196+
.sub(b)
197+
.map(Precision::Inexact)
198+
.unwrap_or(Precision::Absent),
199+
(_, _) => Precision::Absent,
200+
}
201+
}
202+
203+
/// Calculates the multiplication of two (possibly inexact) [`ScalarValue`] values,
204+
/// conservatively propagating exactness information. If one of the input
205+
/// values is [`Precision::Absent`], the result is `Absent` too.
206+
pub fn multiply(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
207+
match (self, other) {
208+
(Precision::Exact(a), Precision::Exact(b)) => a
209+
.mul_checked(b)
210+
.map(Precision::Exact)
211+
.unwrap_or(Precision::Absent),
212+
(Precision::Inexact(a), Precision::Exact(b))
213+
| (Precision::Exact(a), Precision::Inexact(b))
214+
| (Precision::Inexact(a), Precision::Inexact(b)) => a
215+
.mul_checked(b)
216+
.map(Precision::Inexact)
217+
.unwrap_or(Precision::Absent),
188218
(_, _) => Precision::Absent,
189219
}
190220
}
221+
222+
/// Casts the value to the given data type, propagating exactness information.
223+
pub fn cast_to(&self, data_type: &DataType) -> Result<Precision<ScalarValue>> {
224+
match self {
225+
Precision::Exact(value) => value.cast_to(data_type).map(Precision::Exact),
226+
Precision::Inexact(value) => value.cast_to(data_type).map(Precision::Inexact),
227+
Precision::Absent => Ok(Precision::Absent),
228+
}
229+
}
191230
}
192231

193232
impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Debug for Precision<T> {
@@ -210,6 +249,18 @@ impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Display for Precision<T> {
210249
}
211250
}
212251

252+
impl From<Precision<usize>> for Precision<ScalarValue> {
253+
fn from(value: Precision<usize>) -> Self {
254+
match value {
255+
Precision::Exact(v) => Precision::Exact(ScalarValue::UInt64(Some(v as u64))),
256+
Precision::Inexact(v) => {
257+
Precision::Inexact(ScalarValue::UInt64(Some(v as u64)))
258+
}
259+
Precision::Absent => Precision::Absent,
260+
}
261+
}
262+
}
263+
213264
/// Statistics for a relation
214265
/// Fields are optional and can be inexact because the sources
215266
/// sometimes provide approximate estimates for performance reasons
@@ -401,6 +452,11 @@ impl Display for Statistics {
401452
} else {
402453
s
403454
};
455+
let s = if cs.sum_value != Precision::Absent {
456+
format!("{} Sum={}", s, cs.sum_value)
457+
} else {
458+
s
459+
};
404460
let s = if cs.null_count != Precision::Absent {
405461
format!("{} Null={}", s, cs.null_count)
406462
} else {
@@ -436,6 +492,8 @@ pub struct ColumnStatistics {
436492
pub max_value: Precision<ScalarValue>,
437493
/// Minimum value of column
438494
pub min_value: Precision<ScalarValue>,
495+
/// Sum value of a column
496+
pub sum_value: Precision<ScalarValue>,
439497
/// Number of distinct values
440498
pub distinct_count: Precision<usize>,
441499
}
@@ -458,6 +516,7 @@ impl ColumnStatistics {
458516
null_count: Precision::Absent,
459517
max_value: Precision::Absent,
460518
min_value: Precision::Absent,
519+
sum_value: Precision::Absent,
461520
distinct_count: Precision::Absent,
462521
}
463522
}
@@ -469,6 +528,7 @@ impl ColumnStatistics {
469528
self.null_count = self.null_count.to_inexact();
470529
self.max_value = self.max_value.to_inexact();
471530
self.min_value = self.min_value.to_inexact();
531+
self.sum_value = self.sum_value.to_inexact();
472532
self.distinct_count = self.distinct_count.to_inexact();
473533
self
474534
}
@@ -563,6 +623,26 @@ mod tests {
563623
assert_eq!(precision1.add(&absent_precision), Precision::Absent);
564624
}
565625

626+
#[test]
627+
fn test_add_scalar() {
628+
let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
629+
630+
assert_eq!(
631+
precision.add(&Precision::Exact(ScalarValue::Int32(Some(23)))),
632+
Precision::Exact(ScalarValue::Int32(Some(65))),
633+
);
634+
assert_eq!(
635+
precision.add(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
636+
Precision::Inexact(ScalarValue::Int32(Some(65))),
637+
);
638+
assert_eq!(
639+
precision.add(&Precision::Exact(ScalarValue::Int32(None))),
640+
// As per behavior of ScalarValue::add
641+
Precision::Exact(ScalarValue::Int32(None)),
642+
);
643+
assert_eq!(precision.add(&Precision::Absent), Precision::Absent);
644+
}
645+
566646
#[test]
567647
fn test_sub() {
568648
let precision1 = Precision::Exact(42);
@@ -575,6 +655,26 @@ mod tests {
575655
assert_eq!(precision1.sub(&absent_precision), Precision::Absent);
576656
}
577657

658+
#[test]
659+
fn test_sub_scalar() {
660+
let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
661+
662+
assert_eq!(
663+
precision.sub(&Precision::Exact(ScalarValue::Int32(Some(23)))),
664+
Precision::Exact(ScalarValue::Int32(Some(19))),
665+
);
666+
assert_eq!(
667+
precision.sub(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
668+
Precision::Inexact(ScalarValue::Int32(Some(19))),
669+
);
670+
assert_eq!(
671+
precision.sub(&Precision::Exact(ScalarValue::Int32(None))),
672+
// As per behavior of ScalarValue::sub
673+
Precision::Exact(ScalarValue::Int32(None)),
674+
);
675+
assert_eq!(precision.sub(&Precision::Absent), Precision::Absent);
676+
}
677+
578678
#[test]
579679
fn test_multiply() {
580680
let precision1 = Precision::Exact(6);
@@ -588,6 +688,54 @@ mod tests {
588688
assert_eq!(precision1.multiply(&absent_precision), Precision::Absent);
589689
}
590690

691+
#[test]
692+
fn test_multiply_scalar() {
693+
let precision = Precision::Exact(ScalarValue::Int32(Some(6)));
694+
695+
assert_eq!(
696+
precision.multiply(&Precision::Exact(ScalarValue::Int32(Some(5)))),
697+
Precision::Exact(ScalarValue::Int32(Some(30))),
698+
);
699+
assert_eq!(
700+
precision.multiply(&Precision::Inexact(ScalarValue::Int32(Some(5)))),
701+
Precision::Inexact(ScalarValue::Int32(Some(30))),
702+
);
703+
assert_eq!(
704+
precision.multiply(&Precision::Exact(ScalarValue::Int32(None))),
705+
// As per behavior of ScalarValue::mul_checked
706+
Precision::Exact(ScalarValue::Int32(None)),
707+
);
708+
assert_eq!(precision.multiply(&Precision::Absent), Precision::Absent);
709+
}
710+
711+
#[test]
712+
fn test_cast_to() {
713+
// Valid
714+
assert_eq!(
715+
Precision::Exact(ScalarValue::Int32(Some(42)))
716+
.cast_to(&DataType::Int64)
717+
.unwrap(),
718+
Precision::Exact(ScalarValue::Int64(Some(42))),
719+
);
720+
assert_eq!(
721+
Precision::Inexact(ScalarValue::Int32(Some(42)))
722+
.cast_to(&DataType::Int64)
723+
.unwrap(),
724+
Precision::Inexact(ScalarValue::Int64(Some(42))),
725+
);
726+
// Null
727+
assert_eq!(
728+
Precision::Exact(ScalarValue::Int32(None))
729+
.cast_to(&DataType::Int64)
730+
.unwrap(),
731+
Precision::Exact(ScalarValue::Int64(None)),
732+
);
733+
// Overflow returns error
734+
assert!(Precision::Exact(ScalarValue::Int32(Some(256)))
735+
.cast_to(&DataType::Int8)
736+
.is_err());
737+
}
738+
591739
#[test]
592740
fn test_precision_cloning() {
593741
// Precision<usize> is copy
@@ -646,6 +794,7 @@ mod tests {
646794
null_count: Precision::Exact(null_count),
647795
max_value: Precision::Exact(ScalarValue::Int64(Some(42))),
648796
min_value: Precision::Exact(ScalarValue::Int64(Some(64))),
797+
sum_value: Precision::Exact(ScalarValue::Int64(Some(4600))),
649798
distinct_count: Precision::Exact(100),
650799
}
651800
}

datafusion/core/src/datasource/statistics.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ pub async fn get_statistics_with_limit(
7676
col_stats_set[index].null_count = file_column.null_count;
7777
col_stats_set[index].max_value = file_column.max_value;
7878
col_stats_set[index].min_value = file_column.min_value;
79+
col_stats_set[index].sum_value = file_column.sum_value;
7980
}
8081

8182
// If the number of rows exceeds the limit, we can stop processing
@@ -113,12 +114,14 @@ pub async fn get_statistics_with_limit(
113114
null_count: file_nc,
114115
max_value: file_max,
115116
min_value: file_min,
117+
sum_value: file_sum,
116118
distinct_count: _,
117119
} = file_col_stats;
118120

119121
col_stats.null_count = add_row_stats(*file_nc, col_stats.null_count);
120122
set_max_if_greater(file_max, &mut col_stats.max_value);
121-
set_min_if_lesser(file_min, &mut col_stats.min_value)
123+
set_min_if_lesser(file_min, &mut col_stats.min_value);
124+
col_stats.sum_value = file_sum.add(&col_stats.sum_value);
122125
}
123126

124127
// If the number of rows exceeds the limit, we can stop processing
@@ -204,6 +207,7 @@ pub(crate) fn get_col_stats(
204207
null_count: null_counts[i],
205208
max_value: max_value.map(Precision::Exact).unwrap_or(Precision::Absent),
206209
min_value: min_value.map(Precision::Exact).unwrap_or(Precision::Absent),
210+
sum_value: Precision::Absent,
207211
distinct_count: Precision::Absent,
208212
}
209213
})

datafusion/core/tests/custom_sources_cases/statistics.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,12 +200,14 @@ fn fully_defined() -> (Statistics, Schema) {
200200
distinct_count: Precision::Exact(2),
201201
max_value: Precision::Exact(ScalarValue::Int32(Some(1023))),
202202
min_value: Precision::Exact(ScalarValue::Int32(Some(-24))),
203+
sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
203204
null_count: Precision::Exact(0),
204205
},
205206
ColumnStatistics {
206207
distinct_count: Precision::Exact(13),
207208
max_value: Precision::Exact(ScalarValue::Int64(Some(5486))),
208209
min_value: Precision::Exact(ScalarValue::Int64(Some(-6783))),
210+
sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
209211
null_count: Precision::Exact(5),
210212
},
211213
],

datafusion/physical-plan/src/common.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,12 +333,14 @@ mod tests {
333333
distinct_count: Precision::Absent,
334334
max_value: Precision::Absent,
335335
min_value: Precision::Absent,
336+
sum_value: Precision::Absent,
336337
null_count: Precision::Exact(0),
337338
},
338339
ColumnStatistics {
339340
distinct_count: Precision::Absent,
340341
max_value: Precision::Absent,
341342
min_value: Precision::Absent,
343+
sum_value: Precision::Absent,
342344
null_count: Precision::Exact(0),
343345
},
344346
],
@@ -371,6 +373,7 @@ mod tests {
371373
distinct_count: Precision::Absent,
372374
max_value: Precision::Absent,
373375
min_value: Precision::Absent,
376+
sum_value: Precision::Absent,
374377
null_count: Precision::Exact(3),
375378
}],
376379
};

datafusion/physical-plan/src/filter.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,6 +476,7 @@ fn collect_new_statistics(
476476
null_count: input_column_stats[idx].null_count.to_inexact(),
477477
max_value,
478478
min_value,
479+
sum_value: Precision::Absent,
479480
distinct_count: distinct_count.to_inexact(),
480481
}
481482
},
@@ -1196,6 +1197,7 @@ mod tests {
11961197
null_count: Precision::Absent,
11971198
min_value: Precision::Inexact(ScalarValue::Int32(Some(5))),
11981199
max_value: Precision::Inexact(ScalarValue::Int32(Some(10))),
1200+
sum_value: Precision::Absent,
11991201
distinct_count: Precision::Absent,
12001202
}],
12011203
};

0 commit comments

Comments
 (0)