Skip to content

Commit cdaaf50

Browse files
committed
Add test hitting the former overflow panic
1 parent 81781ff commit cdaaf50

File tree

2 files changed

+114
-90
lines changed

2 files changed

+114
-90
lines changed

datafusion/common/src/stats.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,15 @@ impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Precision<T> {
4848

4949
/// Transform the value in this [`Precision`] object, if one exists, using
5050
/// the given function. Preserves the exactness state.
51-
pub fn map<F>(self, f: F) -> Precision<T>
51+
pub fn map<U, F>(self, f: F) -> Precision<U>
5252
where
53-
F: Fn(T) -> T,
53+
F: Fn(T) -> U,
54+
U: Debug + Clone + PartialEq + Eq + PartialOrd,
5455
{
5556
match self {
5657
Precision::Exact(val) => Precision::Exact(f(val)),
5758
Precision::Inexact(val) => Precision::Inexact(f(val)),
58-
_ => self,
59+
_ => Precision::<U>::Absent,
5960
}
6061
}
6162

datafusion/physical-plan/src/joins/utils.rs

Lines changed: 110 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1473,6 +1473,7 @@ mod tests {
14731473
use arrow::error::{ArrowError, Result as ArrowResult};
14741474
use arrow_schema::SortOptions;
14751475

1476+
use datafusion_common::stats::Precision::{Absent, Exact, Inexact};
14761477
use datafusion_common::{arrow_datafusion_err, arrow_err, ScalarValue};
14771478

14781479
fn check(left: &[Column], right: &[Column], on: &[(Column, Column)]) -> Result<()> {
@@ -1640,25 +1641,26 @@ mod tests {
16401641
}
16411642

16421643
fn create_column_stats(
1643-
min: Option<i64>,
1644-
max: Option<i64>,
1645-
distinct_count: Option<usize>,
1644+
min: Precision<i64>,
1645+
max: Precision<i64>,
1646+
distinct_count: Precision<usize>,
1647+
null_count: Precision<usize>,
16461648
) -> ColumnStatistics {
16471649
ColumnStatistics {
1648-
distinct_count: distinct_count
1649-
.map(Precision::Inexact)
1650-
.unwrap_or(Precision::Absent),
1651-
min_value: min
1652-
.map(|size| Precision::Inexact(ScalarValue::from(size)))
1653-
.unwrap_or(Precision::Absent),
1654-
max_value: max
1655-
.map(|size| Precision::Inexact(ScalarValue::from(size)))
1656-
.unwrap_or(Precision::Absent),
1657-
..Default::default()
1650+
distinct_count,
1651+
min_value: min.map(|size| ScalarValue::from(size)),
1652+
max_value: max.map(|size| ScalarValue::from(size)),
1653+
null_count,
16581654
}
16591655
}
16601656

1661-
type PartialStats = (usize, Option<i64>, Option<i64>, Option<usize>);
1657+
type PartialStats = (
1658+
usize,
1659+
Precision<i64>,
1660+
Precision<i64>,
1661+
Precision<usize>,
1662+
Precision<usize>,
1663+
);
16621664

16631665
// This is mainly for validating all the edge cases of the estimation, but
16641666
// more advanced (and real world test cases) are below where we need some control
@@ -1675,133 +1677,156 @@ mod tests {
16751677
//
16761678
// distinct(left) == NaN, distinct(right) == NaN
16771679
(
1678-
(10, Some(1), Some(10), None),
1679-
(10, Some(1), Some(10), None),
1680-
Some(Precision::Inexact(10)),
1680+
(10, Inexact(1), Inexact(10), Absent, Absent),
1681+
(10, Inexact(1), Inexact(10), Absent, Absent),
1682+
Some(Inexact(10)),
16811683
),
16821684
// range(left) > range(right)
16831685
(
1684-
(10, Some(6), Some(10), None),
1685-
(10, Some(8), Some(10), None),
1686-
Some(Precision::Inexact(20)),
1686+
(10, Inexact(6), Inexact(10), Absent, Absent),
1687+
(10, Inexact(8), Inexact(10), Absent, Absent),
1688+
Some(Inexact(20)),
16871689
),
16881690
// range(right) > range(left)
16891691
(
1690-
(10, Some(8), Some(10), None),
1691-
(10, Some(6), Some(10), None),
1692-
Some(Precision::Inexact(20)),
1692+
(10, Inexact(8), Inexact(10), Absent, Absent),
1693+
(10, Inexact(6), Inexact(10), Absent, Absent),
1694+
Some(Inexact(20)),
16931695
),
16941696
// range(left) > len(left), range(right) > len(right)
16951697
(
1696-
(10, Some(1), Some(15), None),
1697-
(20, Some(1), Some(40), None),
1698-
Some(Precision::Inexact(10)),
1698+
(10, Inexact(1), Inexact(15), Absent, Absent),
1699+
(20, Inexact(1), Inexact(40), Absent, Absent),
1700+
Some(Inexact(10)),
16991701
),
17001702
// When we have distinct count.
17011703
(
1702-
(10, Some(1), Some(10), Some(10)),
1703-
(10, Some(1), Some(10), Some(10)),
1704-
Some(Precision::Inexact(10)),
1704+
(10, Inexact(1), Inexact(10), Inexact(10), Absent),
1705+
(10, Inexact(1), Inexact(10), Inexact(10), Absent),
1706+
Some(Inexact(10)),
17051707
),
17061708
// distinct(left) > distinct(right)
17071709
(
1708-
(10, Some(1), Some(10), Some(5)),
1709-
(10, Some(1), Some(10), Some(2)),
1710-
Some(Precision::Inexact(20)),
1710+
(10, Inexact(1), Inexact(10), Inexact(5), Absent),
1711+
(10, Inexact(1), Inexact(10), Inexact(2), Absent),
1712+
Some(Inexact(20)),
17111713
),
17121714
// distinct(right) > distinct(left)
17131715
(
1714-
(10, Some(1), Some(10), Some(2)),
1715-
(10, Some(1), Some(10), Some(5)),
1716-
Some(Precision::Inexact(20)),
1716+
(10, Inexact(1), Inexact(10), Inexact(2), Absent),
1717+
(10, Inexact(1), Inexact(10), Inexact(5), Absent),
1718+
Some(Inexact(20)),
17171719
),
17181720
// min(left) < 0 (range(left) > range(right))
17191721
(
1720-
(10, Some(-5), Some(5), None),
1721-
(10, Some(1), Some(5), None),
1722-
Some(Precision::Inexact(10)),
1722+
(10, Inexact(-5), Inexact(5), Absent, Absent),
1723+
(10, Inexact(1), Inexact(5), Absent, Absent),
1724+
Some(Inexact(10)),
17231725
),
17241726
// min(right) < 0, max(right) < 0 (range(right) > range(left))
17251727
(
1726-
(10, Some(-25), Some(-20), None),
1727-
(10, Some(-25), Some(-15), None),
1728-
Some(Precision::Inexact(10)),
1728+
(10, Inexact(-25), Inexact(-20), Absent, Absent),
1729+
(10, Inexact(-25), Inexact(-15), Absent, Absent),
1730+
Some(Inexact(10)),
17291731
),
17301732
// range(left) < 0, range(right) >= 0
17311733
// (there isn't a case where both left and right ranges are negative
17321734
// so one of them is always going to work, this just proves negative
17331735
// ranges with bigger absolute values are not accidentally used).
17341736
(
1735-
(10, Some(-10), Some(0), None),
1736-
(10, Some(0), Some(10), Some(5)),
1737-
Some(Precision::Inexact(10)),
1737+
(10, Inexact(-10), Inexact(0), Absent, Absent),
1738+
(10, Inexact(0), Inexact(10), Inexact(5), Absent),
1739+
Some(Inexact(10)),
17381740
),
17391741
// range(left) = 1, range(right) = 1
17401742
(
1741-
(10, Some(1), Some(1), None),
1742-
(10, Some(1), Some(1), None),
1743-
Some(Precision::Inexact(100)),
1743+
(10, Inexact(1), Inexact(1), Absent, Absent),
1744+
(10, Inexact(1), Inexact(1), Absent, Absent),
1745+
Some(Inexact(100)),
17441746
),
17451747
//
17461748
// Edge cases
17471749
// ==========
17481750
//
17491751
// No column level stats.
1750-
((10, None, None, None), (10, None, None, None), None),
1752+
(
1753+
(10, Absent, Absent, Absent, Absent),
1754+
(10, Absent, Absent, Absent, Absent),
1755+
None,
1756+
),
17511757
// No min or max (or both).
1752-
((10, None, None, Some(3)), (10, None, None, Some(3)), None),
17531758
(
1754-
(10, Some(2), None, Some(3)),
1755-
(10, None, Some(5), Some(3)),
1759+
(10, Absent, Absent, Inexact(3), Absent),
1760+
(10, Absent, Absent, Inexact(3), Absent),
1761+
None,
1762+
),
1763+
(
1764+
(10, Inexact(2), Absent, Inexact(3), Absent),
1765+
(10, Absent, Inexact(5), Inexact(3), Absent),
17561766
None,
17571767
),
17581768
(
1759-
(10, None, Some(3), Some(3)),
1760-
(10, Some(1), None, Some(3)),
1769+
(10, Absent, Inexact(3), Inexact(3), Absent),
1770+
(10, Inexact(1), Absent, Inexact(3), Absent),
1771+
None,
1772+
),
1773+
(
1774+
(10, Absent, Inexact(3), Absent, Absent),
1775+
(10, Inexact(1), Absent, Absent, Absent),
17611776
None,
17621777
),
1763-
((10, None, Some(3), None), (10, Some(1), None, None), None),
17641778
// Non overlapping min/max (when exact=False).
17651779
(
1766-
(10, Some(0), Some(10), None),
1767-
(10, Some(11), Some(20), None),
1768-
Some(Precision::Inexact(0)),
1780+
(10, Inexact(0), Inexact(10), Absent, Absent),
1781+
(10, Inexact(11), Inexact(20), Absent, Absent),
1782+
Some(Inexact(0)),
17691783
),
17701784
(
1771-
(10, Some(11), Some(20), None),
1772-
(10, Some(0), Some(10), None),
1773-
Some(Precision::Inexact(0)),
1785+
(10, Inexact(11), Inexact(20), Absent, Absent),
1786+
(10, Inexact(0), Inexact(10), Absent, Absent),
1787+
Some(Inexact(0)),
17741788
),
17751789
// distinct(left) = 0, distinct(right) = 0
17761790
(
1777-
(10, Some(1), Some(10), Some(0)),
1778-
(10, Some(1), Some(10), Some(0)),
1791+
(10, Inexact(1), Inexact(10), Inexact(0), Absent),
1792+
(10, Inexact(1), Inexact(10), Inexact(0), Absent),
17791793
None,
17801794
),
1795+
// Inexact row count < exact null count with absent distinct count
1796+
(
1797+
(0, Inexact(1), Inexact(10), Absent, Exact(5)),
1798+
(10, Inexact(1), Inexact(10), Absent, Absent),
1799+
Some(Inexact(0)),
1800+
),
17811801
];
17821802

17831803
for (left_info, right_info, expected_cardinality) in cases {
17841804
let left_num_rows = left_info.0;
1785-
let left_col_stats =
1786-
vec![create_column_stats(left_info.1, left_info.2, left_info.3)];
1805+
let left_col_stats = vec![create_column_stats(
1806+
left_info.1,
1807+
left_info.2,
1808+
left_info.3,
1809+
left_info.4,
1810+
)];
17871811

17881812
let right_num_rows = right_info.0;
17891813
let right_col_stats = vec![create_column_stats(
17901814
right_info.1,
17911815
right_info.2,
17921816
right_info.3,
1817+
right_info.4,
17931818
)];
17941819

17951820
assert_eq!(
17961821
estimate_inner_join_cardinality(
17971822
Statistics {
1798-
num_rows: Precision::Inexact(left_num_rows),
1799-
total_byte_size: Precision::Absent,
1823+
num_rows: Inexact(left_num_rows),
1824+
total_byte_size: Absent,
18001825
column_statistics: left_col_stats.clone(),
18011826
},
18021827
Statistics {
1803-
num_rows: Precision::Inexact(right_num_rows),
1804-
total_byte_size: Precision::Absent,
1828+
num_rows: Inexact(right_num_rows),
1829+
total_byte_size: Absent,
18051830
column_statistics: right_col_stats.clone(),
18061831
},
18071832
),
@@ -1819,9 +1844,7 @@ mod tests {
18191844
);
18201845

18211846
assert_eq!(
1822-
partial_join_stats
1823-
.clone()
1824-
.map(|s| Precision::Inexact(s.num_rows)),
1847+
partial_join_stats.clone().map(|s| Inexact(s.num_rows)),
18251848
expected_cardinality.clone()
18261849
);
18271850
assert_eq!(
@@ -1837,13 +1860,13 @@ mod tests {
18371860
#[test]
18381861
fn test_inner_join_cardinality_multiple_column() -> Result<()> {
18391862
let left_col_stats = vec![
1840-
create_column_stats(Some(0), Some(100), Some(100)),
1841-
create_column_stats(Some(100), Some(500), Some(150)),
1863+
create_column_stats(Inexact(0), Inexact(100), Inexact(100), Absent),
1864+
create_column_stats(Inexact(100), Inexact(500), Inexact(150), Absent),
18421865
];
18431866

18441867
let right_col_stats = vec![
1845-
create_column_stats(Some(0), Some(100), Some(50)),
1846-
create_column_stats(Some(100), Some(500), Some(200)),
1868+
create_column_stats(Inexact(0), Inexact(100), Inexact(50), Absent),
1869+
create_column_stats(Inexact(100), Inexact(500), Inexact(200), Absent),
18471870
];
18481871

18491872
// We have statistics about 4 columns, where the highest distinct
@@ -1921,15 +1944,15 @@ mod tests {
19211944
];
19221945

19231946
let left_col_stats = vec![
1924-
create_column_stats(Some(0), Some(100), Some(100)),
1925-
create_column_stats(Some(0), Some(500), Some(500)),
1926-
create_column_stats(Some(1000), Some(10000), None),
1947+
create_column_stats(Inexact(0), Inexact(100), Inexact(100), Absent),
1948+
create_column_stats(Inexact(0), Inexact(500), Inexact(500), Absent),
1949+
create_column_stats(Inexact(1000), Inexact(10000), Absent, Absent),
19271950
];
19281951

19291952
let right_col_stats = vec![
1930-
create_column_stats(Some(0), Some(100), Some(50)),
1931-
create_column_stats(Some(0), Some(2000), Some(2500)),
1932-
create_column_stats(Some(0), Some(100), None),
1953+
create_column_stats(Inexact(0), Inexact(100), Inexact(50), Absent),
1954+
create_column_stats(Inexact(0), Inexact(2000), Inexact(2500), Absent),
1955+
create_column_stats(Inexact(0), Inexact(100), Absent, Absent),
19331956
];
19341957

19351958
for (join_type, expected_num_rows) in cases {
@@ -1970,15 +1993,15 @@ mod tests {
19701993
// Join on a=c, x=y (ignores b/d) where x and y do not intersect
19711994

19721995
let left_col_stats = vec![
1973-
create_column_stats(Some(0), Some(100), Some(100)),
1974-
create_column_stats(Some(0), Some(500), Some(500)),
1975-
create_column_stats(Some(1000), Some(10000), None),
1996+
create_column_stats(Inexact(0), Inexact(100), Inexact(100), Absent),
1997+
create_column_stats(Inexact(0), Inexact(500), Inexact(500), Absent),
1998+
create_column_stats(Inexact(1000), Inexact(10000), Absent, Absent),
19761999
];
19772000

19782001
let right_col_stats = vec![
1979-
create_column_stats(Some(0), Some(100), Some(50)),
1980-
create_column_stats(Some(0), Some(2000), Some(2500)),
1981-
create_column_stats(Some(0), Some(100), None),
2002+
create_column_stats(Inexact(0), Inexact(100), Inexact(50), Absent),
2003+
create_column_stats(Inexact(0), Inexact(2000), Inexact(2500), Absent),
2004+
create_column_stats(Inexact(0), Inexact(100), Absent, Absent),
19822005
];
19832006

19842007
let join_on = vec![

0 commit comments

Comments
 (0)