@@ -955,7 +955,12 @@ fn max_distinct_count(
955
955
let result = match num_rows {
956
956
Precision :: Absent => Precision :: Absent ,
957
957
Precision :: Inexact ( count) => {
958
- Precision :: Inexact ( count - stats. null_count . get_value ( ) . unwrap_or ( & 0 ) )
958
+ // To safeguard against inexact number of rows (e.g. 0) being smaller than
959
+ // an exact null count we need to do a checked subtraction.
960
+ match count. checked_sub ( * stats. null_count . get_value ( ) . unwrap_or ( & 0 ) ) {
961
+ None => Precision :: Inexact ( 0 ) ,
962
+ Some ( non_null_count) => Precision :: Inexact ( non_null_count) ,
963
+ }
959
964
}
960
965
Precision :: Exact ( count) => {
961
966
let count = count - stats. null_count . get_value ( ) . unwrap_or ( & 0 ) ;
@@ -1468,6 +1473,7 @@ mod tests {
1468
1473
use arrow:: error:: { ArrowError , Result as ArrowResult } ;
1469
1474
use arrow_schema:: SortOptions ;
1470
1475
1476
+ use datafusion_common:: stats:: Precision :: { Absent , Exact , Inexact } ;
1471
1477
use datafusion_common:: { arrow_datafusion_err, arrow_err, ScalarValue } ;
1472
1478
1473
1479
fn check ( left : & [ Column ] , right : & [ Column ] , on : & [ ( Column , Column ) ] ) -> Result < ( ) > {
@@ -1635,25 +1641,26 @@ mod tests {
1635
1641
}
1636
1642
1637
1643
fn create_column_stats (
1638
- min : Option < i64 > ,
1639
- max : Option < i64 > ,
1640
- distinct_count : Option < usize > ,
1644
+ min : Precision < i64 > ,
1645
+ max : Precision < i64 > ,
1646
+ distinct_count : Precision < usize > ,
1647
+ null_count : Precision < usize > ,
1641
1648
) -> ColumnStatistics {
1642
1649
ColumnStatistics {
1643
- distinct_count : distinct_count
1644
- . map ( Precision :: Inexact )
1645
- . unwrap_or ( Precision :: Absent ) ,
1646
- min_value : min
1647
- . map ( |size| Precision :: Inexact ( ScalarValue :: from ( size) ) )
1648
- . unwrap_or ( Precision :: Absent ) ,
1649
- max_value : max
1650
- . map ( |size| Precision :: Inexact ( ScalarValue :: from ( size) ) )
1651
- . unwrap_or ( Precision :: Absent ) ,
1652
- ..Default :: default ( )
1650
+ distinct_count,
1651
+ min_value : min. map ( ScalarValue :: from) ,
1652
+ max_value : max. map ( ScalarValue :: from) ,
1653
+ null_count,
1653
1654
}
1654
1655
}
1655
1656
1656
- type PartialStats = ( usize , Option < i64 > , Option < i64 > , Option < usize > ) ;
1657
+ type PartialStats = (
1658
+ usize ,
1659
+ Precision < i64 > ,
1660
+ Precision < i64 > ,
1661
+ Precision < usize > ,
1662
+ Precision < usize > ,
1663
+ ) ;
1657
1664
1658
1665
// This is mainly for validating all the edge cases of the estimation, but
1659
1666
// more advanced (and real world test cases) are below where we need some control
@@ -1670,133 +1677,156 @@ mod tests {
1670
1677
//
1671
1678
// distinct(left) == NaN, distinct(right) == NaN
1672
1679
(
1673
- ( 10 , Some ( 1 ) , Some ( 10 ) , None ) ,
1674
- ( 10 , Some ( 1 ) , Some ( 10 ) , None ) ,
1675
- Some ( Precision :: Inexact ( 10 ) ) ,
1680
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Absent ) ,
1681
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Absent ) ,
1682
+ Some ( Inexact ( 10 ) ) ,
1676
1683
) ,
1677
1684
// range(left) > range(right)
1678
1685
(
1679
- ( 10 , Some ( 6 ) , Some ( 10 ) , None ) ,
1680
- ( 10 , Some ( 8 ) , Some ( 10 ) , None ) ,
1681
- Some ( Precision :: Inexact ( 20 ) ) ,
1686
+ ( 10 , Inexact ( 6 ) , Inexact ( 10 ) , Absent , Absent ) ,
1687
+ ( 10 , Inexact ( 8 ) , Inexact ( 10 ) , Absent , Absent ) ,
1688
+ Some ( Inexact ( 20 ) ) ,
1682
1689
) ,
1683
1690
// range(right) > range(left)
1684
1691
(
1685
- ( 10 , Some ( 8 ) , Some ( 10 ) , None ) ,
1686
- ( 10 , Some ( 6 ) , Some ( 10 ) , None ) ,
1687
- Some ( Precision :: Inexact ( 20 ) ) ,
1692
+ ( 10 , Inexact ( 8 ) , Inexact ( 10 ) , Absent , Absent ) ,
1693
+ ( 10 , Inexact ( 6 ) , Inexact ( 10 ) , Absent , Absent ) ,
1694
+ Some ( Inexact ( 20 ) ) ,
1688
1695
) ,
1689
1696
// range(left) > len(left), range(right) > len(right)
1690
1697
(
1691
- ( 10 , Some ( 1 ) , Some ( 15 ) , None ) ,
1692
- ( 20 , Some ( 1 ) , Some ( 40 ) , None ) ,
1693
- Some ( Precision :: Inexact ( 10 ) ) ,
1698
+ ( 10 , Inexact ( 1 ) , Inexact ( 15 ) , Absent , Absent ) ,
1699
+ ( 20 , Inexact ( 1 ) , Inexact ( 40 ) , Absent , Absent ) ,
1700
+ Some ( Inexact ( 10 ) ) ,
1694
1701
) ,
1695
1702
// When we have distinct count.
1696
1703
(
1697
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 10 ) ) ,
1698
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 10 ) ) ,
1699
- Some ( Precision :: Inexact ( 10 ) ) ,
1704
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 10 ) , Absent ) ,
1705
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 10 ) , Absent ) ,
1706
+ Some ( Inexact ( 10 ) ) ,
1700
1707
) ,
1701
1708
// distinct(left) > distinct(right)
1702
1709
(
1703
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 5 ) ) ,
1704
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 2 ) ) ,
1705
- Some ( Precision :: Inexact ( 20 ) ) ,
1710
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 5 ) , Absent ) ,
1711
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 2 ) , Absent ) ,
1712
+ Some ( Inexact ( 20 ) ) ,
1706
1713
) ,
1707
1714
// distinct(right) > distinct(left)
1708
1715
(
1709
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 2 ) ) ,
1710
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 5 ) ) ,
1711
- Some ( Precision :: Inexact ( 20 ) ) ,
1716
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 2 ) , Absent ) ,
1717
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 5 ) , Absent ) ,
1718
+ Some ( Inexact ( 20 ) ) ,
1712
1719
) ,
1713
1720
// min(left) < 0 (range(left) > range(right))
1714
1721
(
1715
- ( 10 , Some ( -5 ) , Some ( 5 ) , None ) ,
1716
- ( 10 , Some ( 1 ) , Some ( 5 ) , None ) ,
1717
- Some ( Precision :: Inexact ( 10 ) ) ,
1722
+ ( 10 , Inexact ( -5 ) , Inexact ( 5 ) , Absent , Absent ) ,
1723
+ ( 10 , Inexact ( 1 ) , Inexact ( 5 ) , Absent , Absent ) ,
1724
+ Some ( Inexact ( 10 ) ) ,
1718
1725
) ,
1719
1726
// min(right) < 0, max(right) < 0 (range(right) > range(left))
1720
1727
(
1721
- ( 10 , Some ( -25 ) , Some ( -20 ) , None ) ,
1722
- ( 10 , Some ( -25 ) , Some ( -15 ) , None ) ,
1723
- Some ( Precision :: Inexact ( 10 ) ) ,
1728
+ ( 10 , Inexact ( -25 ) , Inexact ( -20 ) , Absent , Absent ) ,
1729
+ ( 10 , Inexact ( -25 ) , Inexact ( -15 ) , Absent , Absent ) ,
1730
+ Some ( Inexact ( 10 ) ) ,
1724
1731
) ,
1725
1732
// range(left) < 0, range(right) >= 0
1726
1733
// (there isn't a case where both left and right ranges are negative
1727
1734
// so one of them is always going to work, this just proves negative
1728
1735
// ranges with bigger absolute values are not accidentally used).
1729
1736
(
1730
- ( 10 , Some ( -10 ) , Some ( 0 ) , None ) ,
1731
- ( 10 , Some ( 0 ) , Some ( 10 ) , Some ( 5 ) ) ,
1732
- Some ( Precision :: Inexact ( 10 ) ) ,
1737
+ ( 10 , Inexact ( -10 ) , Inexact ( 0 ) , Absent , Absent ) ,
1738
+ ( 10 , Inexact ( 0 ) , Inexact ( 10 ) , Inexact ( 5 ) , Absent ) ,
1739
+ Some ( Inexact ( 10 ) ) ,
1733
1740
) ,
1734
1741
// range(left) = 1, range(right) = 1
1735
1742
(
1736
- ( 10 , Some ( 1 ) , Some ( 1 ) , None ) ,
1737
- ( 10 , Some ( 1 ) , Some ( 1 ) , None ) ,
1738
- Some ( Precision :: Inexact ( 100 ) ) ,
1743
+ ( 10 , Inexact ( 1 ) , Inexact ( 1 ) , Absent , Absent ) ,
1744
+ ( 10 , Inexact ( 1 ) , Inexact ( 1 ) , Absent , Absent ) ,
1745
+ Some ( Inexact ( 100 ) ) ,
1739
1746
) ,
1740
1747
//
1741
1748
// Edge cases
1742
1749
// ==========
1743
1750
//
1744
1751
// No column level stats.
1745
- ( ( 10 , None , None , None ) , ( 10 , None , None , None ) , None ) ,
1752
+ (
1753
+ ( 10 , Absent , Absent , Absent , Absent ) ,
1754
+ ( 10 , Absent , Absent , Absent , Absent ) ,
1755
+ None ,
1756
+ ) ,
1746
1757
// No min or max (or both).
1747
- ( ( 10 , None , None , Some ( 3 ) ) , ( 10 , None , None , Some ( 3 ) ) , None ) ,
1748
1758
(
1749
- ( 10 , Some ( 2 ) , None , Some ( 3 ) ) ,
1750
- ( 10 , None , Some ( 5 ) , Some ( 3 ) ) ,
1759
+ ( 10 , Absent , Absent , Inexact ( 3 ) , Absent ) ,
1760
+ ( 10 , Absent , Absent , Inexact ( 3 ) , Absent ) ,
1761
+ None ,
1762
+ ) ,
1763
+ (
1764
+ ( 10 , Inexact ( 2 ) , Absent , Inexact ( 3 ) , Absent ) ,
1765
+ ( 10 , Absent , Inexact ( 5 ) , Inexact ( 3 ) , Absent ) ,
1751
1766
None ,
1752
1767
) ,
1753
1768
(
1754
- ( 10 , None , Some ( 3 ) , Some ( 3 ) ) ,
1755
- ( 10 , Some ( 1 ) , None , Some ( 3 ) ) ,
1769
+ ( 10 , Absent , Inexact ( 3 ) , Inexact ( 3 ) , Absent ) ,
1770
+ ( 10 , Inexact ( 1 ) , Absent , Inexact ( 3 ) , Absent ) ,
1771
+ None ,
1772
+ ) ,
1773
+ (
1774
+ ( 10 , Absent , Inexact ( 3 ) , Absent , Absent ) ,
1775
+ ( 10 , Inexact ( 1 ) , Absent , Absent , Absent ) ,
1756
1776
None ,
1757
1777
) ,
1758
- ( ( 10 , None , Some ( 3 ) , None ) , ( 10 , Some ( 1 ) , None , None ) , None ) ,
1759
1778
// Non overlapping min/max (when exact=False).
1760
1779
(
1761
- ( 10 , Some ( 0 ) , Some ( 10 ) , None ) ,
1762
- ( 10 , Some ( 11 ) , Some ( 20 ) , None ) ,
1763
- Some ( Precision :: Inexact ( 0 ) ) ,
1780
+ ( 10 , Inexact ( 0 ) , Inexact ( 10 ) , Absent , Absent ) ,
1781
+ ( 10 , Inexact ( 11 ) , Inexact ( 20 ) , Absent , Absent ) ,
1782
+ Some ( Inexact ( 0 ) ) ,
1764
1783
) ,
1765
1784
(
1766
- ( 10 , Some ( 11 ) , Some ( 20 ) , None ) ,
1767
- ( 10 , Some ( 0 ) , Some ( 10 ) , None ) ,
1768
- Some ( Precision :: Inexact ( 0 ) ) ,
1785
+ ( 10 , Inexact ( 11 ) , Inexact ( 20 ) , Absent , Absent ) ,
1786
+ ( 10 , Inexact ( 0 ) , Inexact ( 10 ) , Absent , Absent ) ,
1787
+ Some ( Inexact ( 0 ) ) ,
1769
1788
) ,
1770
1789
// distinct(left) = 0, distinct(right) = 0
1771
1790
(
1772
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 0 ) ) ,
1773
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 0 ) ) ,
1791
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 0 ) , Absent ) ,
1792
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 0 ) , Absent ) ,
1774
1793
None ,
1775
1794
) ,
1795
+ // Inexact row count < exact null count with absent distinct count
1796
+ (
1797
+ ( 0 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Exact ( 5 ) ) ,
1798
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Absent ) ,
1799
+ Some ( Inexact ( 0 ) ) ,
1800
+ ) ,
1776
1801
] ;
1777
1802
1778
1803
for ( left_info, right_info, expected_cardinality) in cases {
1779
1804
let left_num_rows = left_info. 0 ;
1780
- let left_col_stats =
1781
- vec ! [ create_column_stats( left_info. 1 , left_info. 2 , left_info. 3 ) ] ;
1805
+ let left_col_stats = vec ! [ create_column_stats(
1806
+ left_info. 1 ,
1807
+ left_info. 2 ,
1808
+ left_info. 3 ,
1809
+ left_info. 4 ,
1810
+ ) ] ;
1782
1811
1783
1812
let right_num_rows = right_info. 0 ;
1784
1813
let right_col_stats = vec ! [ create_column_stats(
1785
1814
right_info. 1 ,
1786
1815
right_info. 2 ,
1787
1816
right_info. 3 ,
1817
+ right_info. 4 ,
1788
1818
) ] ;
1789
1819
1790
1820
assert_eq ! (
1791
1821
estimate_inner_join_cardinality(
1792
1822
Statistics {
1793
- num_rows: Precision :: Inexact ( left_num_rows) ,
1794
- total_byte_size: Precision :: Absent ,
1823
+ num_rows: Inexact ( left_num_rows) ,
1824
+ total_byte_size: Absent ,
1795
1825
column_statistics: left_col_stats. clone( ) ,
1796
1826
} ,
1797
1827
Statistics {
1798
- num_rows: Precision :: Inexact ( right_num_rows) ,
1799
- total_byte_size: Precision :: Absent ,
1828
+ num_rows: Inexact ( right_num_rows) ,
1829
+ total_byte_size: Absent ,
1800
1830
column_statistics: right_col_stats. clone( ) ,
1801
1831
} ,
1802
1832
) ,
@@ -1814,9 +1844,7 @@ mod tests {
1814
1844
) ;
1815
1845
1816
1846
assert_eq ! (
1817
- partial_join_stats
1818
- . clone( )
1819
- . map( |s| Precision :: Inexact ( s. num_rows) ) ,
1847
+ partial_join_stats. clone( ) . map( |s| Inexact ( s. num_rows) ) ,
1820
1848
expected_cardinality. clone( )
1821
1849
) ;
1822
1850
assert_eq ! (
@@ -1832,13 +1860,13 @@ mod tests {
1832
1860
#[ test]
1833
1861
fn test_inner_join_cardinality_multiple_column ( ) -> Result < ( ) > {
1834
1862
let left_col_stats = vec ! [
1835
- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 100 ) ) ,
1836
- create_column_stats( Some ( 100 ) , Some ( 500 ) , Some ( 150 ) ) ,
1863
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 100 ) , Absent ) ,
1864
+ create_column_stats( Inexact ( 100 ) , Inexact ( 500 ) , Inexact ( 150 ) , Absent ) ,
1837
1865
] ;
1838
1866
1839
1867
let right_col_stats = vec ! [
1840
- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 50 ) ) ,
1841
- create_column_stats( Some ( 100 ) , Some ( 500 ) , Some ( 200 ) ) ,
1868
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 50 ) , Absent ) ,
1869
+ create_column_stats( Inexact ( 100 ) , Inexact ( 500 ) , Inexact ( 200 ) , Absent ) ,
1842
1870
] ;
1843
1871
1844
1872
// We have statistics about 4 columns, where the highest distinct
@@ -1916,15 +1944,15 @@ mod tests {
1916
1944
] ;
1917
1945
1918
1946
let left_col_stats = vec ! [
1919
- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 100 ) ) ,
1920
- create_column_stats( Some ( 0 ) , Some ( 500 ) , Some ( 500 ) ) ,
1921
- create_column_stats( Some ( 1000 ) , Some ( 10000 ) , None ) ,
1947
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 100 ) , Absent ) ,
1948
+ create_column_stats( Inexact ( 0 ) , Inexact ( 500 ) , Inexact ( 500 ) , Absent ) ,
1949
+ create_column_stats( Inexact ( 1000 ) , Inexact ( 10000 ) , Absent , Absent ) ,
1922
1950
] ;
1923
1951
1924
1952
let right_col_stats = vec ! [
1925
- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 50 ) ) ,
1926
- create_column_stats( Some ( 0 ) , Some ( 2000 ) , Some ( 2500 ) ) ,
1927
- create_column_stats( Some ( 0 ) , Some ( 100 ) , None ) ,
1953
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 50 ) , Absent ) ,
1954
+ create_column_stats( Inexact ( 0 ) , Inexact ( 2000 ) , Inexact ( 2500 ) , Absent ) ,
1955
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Absent , Absent ) ,
1928
1956
] ;
1929
1957
1930
1958
for ( join_type, expected_num_rows) in cases {
@@ -1965,15 +1993,15 @@ mod tests {
1965
1993
// Join on a=c, x=y (ignores b/d) where x and y do not intersect
1966
1994
1967
1995
let left_col_stats = vec ! [
1968
- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 100 ) ) ,
1969
- create_column_stats( Some ( 0 ) , Some ( 500 ) , Some ( 500 ) ) ,
1970
- create_column_stats( Some ( 1000 ) , Some ( 10000 ) , None ) ,
1996
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 100 ) , Absent ) ,
1997
+ create_column_stats( Inexact ( 0 ) , Inexact ( 500 ) , Inexact ( 500 ) , Absent ) ,
1998
+ create_column_stats( Inexact ( 1000 ) , Inexact ( 10000 ) , Absent , Absent ) ,
1971
1999
] ;
1972
2000
1973
2001
let right_col_stats = vec ! [
1974
- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 50 ) ) ,
1975
- create_column_stats( Some ( 0 ) , Some ( 2000 ) , Some ( 2500 ) ) ,
1976
- create_column_stats( Some ( 0 ) , Some ( 100 ) , None ) ,
2002
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 50 ) , Absent ) ,
2003
+ create_column_stats( Inexact ( 0 ) , Inexact ( 2000 ) , Inexact ( 2500 ) , Absent ) ,
2004
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Absent , Absent ) ,
1977
2005
] ;
1978
2006
1979
2007
let join_on = vec ! [
0 commit comments