@@ -1473,6 +1473,7 @@ mod tests {
1473
1473
use arrow:: error:: { ArrowError , Result as ArrowResult } ;
1474
1474
use arrow_schema:: SortOptions ;
1475
1475
1476
+ use datafusion_common:: stats:: Precision :: { Absent , Exact , Inexact } ;
1476
1477
use datafusion_common:: { arrow_datafusion_err, arrow_err, ScalarValue } ;
1477
1478
1478
1479
fn check ( left : & [ Column ] , right : & [ Column ] , on : & [ ( Column , Column ) ] ) -> Result < ( ) > {
@@ -1640,25 +1641,26 @@ mod tests {
1640
1641
}
1641
1642
1642
1643
// Test helper that builds the `ColumnStatistics` for a single column.
//
// This span is a diff view: lines marked `-` are the removed version and
// lines marked `+` are the added version of the same helper.
//
// Added version: takes `min`, `max`, `distinct_count` and `null_count`
// already wrapped in `Precision`. `distinct_count` and `null_count` are
// stored as given; `min` and `max` have their inner `i64` converted with
// `ScalarValue::from` through `.map` (presumably `Precision::map` keeps the
// Exact/Inexact/Absent variant -- confirm against its definition).
//
// Removed version: took `Option`s, wrapped every `Some` value in
// `Precision::Inexact`, turned `None` into `Precision::Absent`, and filled
// the remaining fields with `..Default::default()`.
fn create_column_stats (
1643
- min : Option < i64 > ,
1644
- max : Option < i64 > ,
1645
- distinct_count : Option < usize > ,
1644
+ min : Precision < i64 > ,
1645
+ max : Precision < i64 > ,
1646
+ distinct_count : Precision < usize > ,
1647
+ null_count : Precision < usize > ,
1646
1648
) -> ColumnStatistics {
1647
1649
ColumnStatistics {
1648
- distinct_count : distinct_count
1649
- . map ( Precision :: Inexact )
1650
- . unwrap_or ( Precision :: Absent ) ,
1651
- min_value : min
1652
- . map ( |size| Precision :: Inexact ( ScalarValue :: from ( size) ) )
1653
- . unwrap_or ( Precision :: Absent ) ,
1654
- max_value : max
1655
- . map ( |size| Precision :: Inexact ( ScalarValue :: from ( size) ) )
1656
- . unwrap_or ( Precision :: Absent ) ,
1657
- ..Default :: default ( )
1650
+ distinct_count,
1651
+ min_value : min. map ( |size| ScalarValue :: from ( size) ) ,
1652
+ max_value : max. map ( |size| ScalarValue :: from ( size) ) ,
1653
+ null_count,
1658
1654
}
1659
1655
}
1660
1656
1661
// Per-side input of one cardinality test case.
//
// Added version (`+` lines), by position:
//   .0 row count, used as the side's `num_rows`
//   .1 min, .2 max, .3 distinct count, .4 null count -- passed in this
//      order to `create_column_stats`
//
// Removed version (`-` line): the same layout without the trailing null
// count, with `Option` in place of `Precision`.
- type PartialStats = ( usize , Option < i64 > , Option < i64 > , Option < usize > ) ;
1657
+ type PartialStats = (
1658
+ usize ,
1659
+ Precision < i64 > ,
1660
+ Precision < i64 > ,
1661
+ Precision < usize > ,
1662
+ Precision < usize > ,
1663
+ ) ;
1662
1664
1663
1665
// This is mainly for validating all the edge cases of the estimation, but
1664
1666
// more advanced (and real world test cases) are below where we need some control
@@ -1675,133 +1677,156 @@ mod tests {
1675
1677
//
1676
1678
// distinct(left) == NaN, distinct(right) == NaN
1677
1679
(
1678
- ( 10 , Some ( 1 ) , Some ( 10 ) , None ) ,
1679
- ( 10 , Some ( 1 ) , Some ( 10 ) , None ) ,
1680
- Some ( Precision :: Inexact ( 10 ) ) ,
1680
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Absent ) ,
1681
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Absent ) ,
1682
+ Some ( Inexact ( 10 ) ) ,
1681
1683
) ,
1682
1684
// range(left) > range(right)
1683
1685
(
1684
- ( 10 , Some ( 6 ) , Some ( 10 ) , None ) ,
1685
- ( 10 , Some ( 8 ) , Some ( 10 ) , None ) ,
1686
- Some ( Precision :: Inexact ( 20 ) ) ,
1686
+ ( 10 , Inexact ( 6 ) , Inexact ( 10 ) , Absent , Absent ) ,
1687
+ ( 10 , Inexact ( 8 ) , Inexact ( 10 ) , Absent , Absent ) ,
1688
+ Some ( Inexact ( 20 ) ) ,
1687
1689
) ,
1688
1690
// range(right) > range(left)
1689
1691
(
1690
- ( 10 , Some ( 8 ) , Some ( 10 ) , None ) ,
1691
- ( 10 , Some ( 6 ) , Some ( 10 ) , None ) ,
1692
- Some ( Precision :: Inexact ( 20 ) ) ,
1692
+ ( 10 , Inexact ( 8 ) , Inexact ( 10 ) , Absent , Absent ) ,
1693
+ ( 10 , Inexact ( 6 ) , Inexact ( 10 ) , Absent , Absent ) ,
1694
+ Some ( Inexact ( 20 ) ) ,
1693
1695
) ,
1694
1696
// range(left) > len(left), range(right) > len(right)
1695
1697
(
1696
- ( 10 , Some ( 1 ) , Some ( 15 ) , None ) ,
1697
- ( 20 , Some ( 1 ) , Some ( 40 ) , None ) ,
1698
- Some ( Precision :: Inexact ( 10 ) ) ,
1698
+ ( 10 , Inexact ( 1 ) , Inexact ( 15 ) , Absent , Absent ) ,
1699
+ ( 20 , Inexact ( 1 ) , Inexact ( 40 ) , Absent , Absent ) ,
1700
+ Some ( Inexact ( 10 ) ) ,
1699
1701
) ,
1700
1702
// When we have distinct count.
1701
1703
(
1702
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 10 ) ) ,
1703
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 10 ) ) ,
1704
- Some ( Precision :: Inexact ( 10 ) ) ,
1704
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 10 ) , Absent ) ,
1705
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 10 ) , Absent ) ,
1706
+ Some ( Inexact ( 10 ) ) ,
1705
1707
) ,
1706
1708
// distinct(left) > distinct(right)
1707
1709
(
1708
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 5 ) ) ,
1709
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 2 ) ) ,
1710
- Some ( Precision :: Inexact ( 20 ) ) ,
1710
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 5 ) , Absent ) ,
1711
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 2 ) , Absent ) ,
1712
+ Some ( Inexact ( 20 ) ) ,
1711
1713
) ,
1712
1714
// distinct(right) > distinct(left)
1713
1715
(
1714
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 2 ) ) ,
1715
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 5 ) ) ,
1716
- Some ( Precision :: Inexact ( 20 ) ) ,
1716
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 2 ) , Absent ) ,
1717
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 5 ) , Absent ) ,
1718
+ Some ( Inexact ( 20 ) ) ,
1717
1719
) ,
1718
1720
// min(left) < 0 (range(left) > range(right))
1719
1721
(
1720
- ( 10 , Some ( -5 ) , Some ( 5 ) , None ) ,
1721
- ( 10 , Some ( 1 ) , Some ( 5 ) , None ) ,
1722
- Some ( Precision :: Inexact ( 10 ) ) ,
1722
+ ( 10 , Inexact ( -5 ) , Inexact ( 5 ) , Absent , Absent ) ,
1723
+ ( 10 , Inexact ( 1 ) , Inexact ( 5 ) , Absent , Absent ) ,
1724
+ Some ( Inexact ( 10 ) ) ,
1723
1725
) ,
1724
1726
// min(right) < 0, max(right) < 0 (range(right) > range(left))
1725
1727
(
1726
- ( 10 , Some ( -25 ) , Some ( -20 ) , None ) ,
1727
- ( 10 , Some ( -25 ) , Some ( -15 ) , None ) ,
1728
- Some ( Precision :: Inexact ( 10 ) ) ,
1728
+ ( 10 , Inexact ( -25 ) , Inexact ( -20 ) , Absent , Absent ) ,
1729
+ ( 10 , Inexact ( -25 ) , Inexact ( -15 ) , Absent , Absent ) ,
1730
+ Some ( Inexact ( 10 ) ) ,
1729
1731
) ,
1730
1732
// range(left) < 0, range(right) >= 0
1731
1733
// (there isn't a case where both left and right ranges are negative
1732
1734
// so one of them is always going to work, this just proves negative
1733
1735
// ranges with bigger absolute values are not accidentally used).
1734
1736
(
1735
- ( 10 , Some ( -10 ) , Some ( 0 ) , None ) ,
1736
- ( 10 , Some ( 0 ) , Some ( 10 ) , Some ( 5 ) ) ,
1737
- Some ( Precision :: Inexact ( 10 ) ) ,
1737
+ ( 10 , Inexact ( -10 ) , Inexact ( 0 ) , Absent , Absent ) ,
1738
+ ( 10 , Inexact ( 0 ) , Inexact ( 10 ) , Inexact ( 5 ) , Absent ) ,
1739
+ Some ( Inexact ( 10 ) ) ,
1738
1740
) ,
1739
1741
// range(left) = 1, range(right) = 1
1740
1742
(
1741
- ( 10 , Some ( 1 ) , Some ( 1 ) , None ) ,
1742
- ( 10 , Some ( 1 ) , Some ( 1 ) , None ) ,
1743
- Some ( Precision :: Inexact ( 100 ) ) ,
1743
+ ( 10 , Inexact ( 1 ) , Inexact ( 1 ) , Absent , Absent ) ,
1744
+ ( 10 , Inexact ( 1 ) , Inexact ( 1 ) , Absent , Absent ) ,
1745
+ Some ( Inexact ( 100 ) ) ,
1744
1746
) ,
1745
1747
//
1746
1748
// Edge cases
1747
1749
// ==========
1748
1750
//
1749
1751
// No column level stats.
1750
- ( ( 10 , None , None , None ) , ( 10 , None , None , None ) , None ) ,
1752
+ (
1753
+ ( 10 , Absent , Absent , Absent , Absent ) ,
1754
+ ( 10 , Absent , Absent , Absent , Absent ) ,
1755
+ None ,
1756
+ ) ,
1751
1757
// No min or max (or both).
1752
- ( ( 10 , None , None , Some ( 3 ) ) , ( 10 , None , None , Some ( 3 ) ) , None ) ,
1753
1758
(
1754
- ( 10 , Some ( 2 ) , None , Some ( 3 ) ) ,
1755
- ( 10 , None , Some ( 5 ) , Some ( 3 ) ) ,
1759
+ ( 10 , Absent , Absent , Inexact ( 3 ) , Absent ) ,
1760
+ ( 10 , Absent , Absent , Inexact ( 3 ) , Absent ) ,
1761
+ None ,
1762
+ ) ,
1763
+ (
1764
+ ( 10 , Inexact ( 2 ) , Absent , Inexact ( 3 ) , Absent ) ,
1765
+ ( 10 , Absent , Inexact ( 5 ) , Inexact ( 3 ) , Absent ) ,
1756
1766
None ,
1757
1767
) ,
1758
1768
(
1759
- ( 10 , None , Some ( 3 ) , Some ( 3 ) ) ,
1760
- ( 10 , Some ( 1 ) , None , Some ( 3 ) ) ,
1769
+ ( 10 , Absent , Inexact ( 3 ) , Inexact ( 3 ) , Absent ) ,
1770
+ ( 10 , Inexact ( 1 ) , Absent , Inexact ( 3 ) , Absent ) ,
1771
+ None ,
1772
+ ) ,
1773
+ (
1774
+ ( 10 , Absent , Inexact ( 3 ) , Absent , Absent ) ,
1775
+ ( 10 , Inexact ( 1 ) , Absent , Absent , Absent ) ,
1761
1776
None ,
1762
1777
) ,
1763
- ( ( 10 , None , Some ( 3 ) , None ) , ( 10 , Some ( 1 ) , None , None ) , None ) ,
1764
1778
// Non overlapping min/max (when exact=False).
1765
1779
(
1766
- ( 10 , Some ( 0 ) , Some ( 10 ) , None ) ,
1767
- ( 10 , Some ( 11 ) , Some ( 20 ) , None ) ,
1768
- Some ( Precision :: Inexact ( 0 ) ) ,
1780
+ ( 10 , Inexact ( 0 ) , Inexact ( 10 ) , Absent , Absent ) ,
1781
+ ( 10 , Inexact ( 11 ) , Inexact ( 20 ) , Absent , Absent ) ,
1782
+ Some ( Inexact ( 0 ) ) ,
1769
1783
) ,
1770
1784
(
1771
- ( 10 , Some ( 11 ) , Some ( 20 ) , None ) ,
1772
- ( 10 , Some ( 0 ) , Some ( 10 ) , None ) ,
1773
- Some ( Precision :: Inexact ( 0 ) ) ,
1785
+ ( 10 , Inexact ( 11 ) , Inexact ( 20 ) , Absent , Absent ) ,
1786
+ ( 10 , Inexact ( 0 ) , Inexact ( 10 ) , Absent , Absent ) ,
1787
+ Some ( Inexact ( 0 ) ) ,
1774
1788
) ,
1775
1789
// distinct(left) = 0, distinct(right) = 0
1776
1790
(
1777
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 0 ) ) ,
1778
- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 0 ) ) ,
1791
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 0 ) , Absent ) ,
1792
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 0 ) , Absent ) ,
1779
1793
None ,
1780
1794
) ,
1795
+ // Inexact row count < exact null count with absent distinct count
1796
+ (
1797
+ ( 0 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Exact ( 5 ) ) ,
1798
+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Absent ) ,
1799
+ Some ( Inexact ( 0 ) ) ,
1800
+ ) ,
1781
1801
] ;
1782
1802
1783
1803
for ( left_info, right_info, expected_cardinality) in cases {
1784
1804
let left_num_rows = left_info. 0 ;
1785
- let left_col_stats =
1786
- vec ! [ create_column_stats( left_info. 1 , left_info. 2 , left_info. 3 ) ] ;
1805
+ let left_col_stats = vec ! [ create_column_stats(
1806
+ left_info. 1 ,
1807
+ left_info. 2 ,
1808
+ left_info. 3 ,
1809
+ left_info. 4 ,
1810
+ ) ] ;
1787
1811
1788
1812
let right_num_rows = right_info. 0 ;
1789
1813
let right_col_stats = vec ! [ create_column_stats(
1790
1814
right_info. 1 ,
1791
1815
right_info. 2 ,
1792
1816
right_info. 3 ,
1817
+ right_info. 4 ,
1793
1818
) ] ;
1794
1819
1795
1820
assert_eq ! (
1796
1821
estimate_inner_join_cardinality(
1797
1822
Statistics {
1798
- num_rows: Precision :: Inexact ( left_num_rows) ,
1799
- total_byte_size: Precision :: Absent ,
1823
+ num_rows: Inexact ( left_num_rows) ,
1824
+ total_byte_size: Absent ,
1800
1825
column_statistics: left_col_stats. clone( ) ,
1801
1826
} ,
1802
1827
Statistics {
1803
- num_rows: Precision :: Inexact ( right_num_rows) ,
1804
- total_byte_size: Precision :: Absent ,
1828
+ num_rows: Inexact ( right_num_rows) ,
1829
+ total_byte_size: Absent ,
1805
1830
column_statistics: right_col_stats. clone( ) ,
1806
1831
} ,
1807
1832
) ,
@@ -1819,9 +1844,7 @@ mod tests {
1819
1844
) ;
1820
1845
1821
1846
assert_eq ! (
1822
- partial_join_stats
1823
- . clone( )
1824
- . map( |s| Precision :: Inexact ( s. num_rows) ) ,
1847
+ partial_join_stats. clone( ) . map( |s| Inexact ( s. num_rows) ) ,
1825
1848
expected_cardinality. clone( )
1826
1849
) ;
1827
1850
assert_eq ! (
@@ -1837,13 +1860,13 @@ mod tests {
1837
1860
#[ test]
1838
1861
fn test_inner_join_cardinality_multiple_column ( ) -> Result < ( ) > {
1839
1862
let left_col_stats = vec ! [
1840
- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 100 ) ) ,
1841
- create_column_stats( Some ( 100 ) , Some ( 500 ) , Some ( 150 ) ) ,
1863
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 100 ) , Absent ) ,
1864
+ create_column_stats( Inexact ( 100 ) , Inexact ( 500 ) , Inexact ( 150 ) , Absent ) ,
1842
1865
] ;
1843
1866
1844
1867
let right_col_stats = vec ! [
1845
- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 50 ) ) ,
1846
- create_column_stats( Some ( 100 ) , Some ( 500 ) , Some ( 200 ) ) ,
1868
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 50 ) , Absent ) ,
1869
+ create_column_stats( Inexact ( 100 ) , Inexact ( 500 ) , Inexact ( 200 ) , Absent ) ,
1847
1870
] ;
1848
1871
1849
1872
// We have statistics about 4 columns, where the highest distinct
@@ -1921,15 +1944,15 @@ mod tests {
1921
1944
] ;
1922
1945
1923
1946
let left_col_stats = vec ! [
1924
- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 100 ) ) ,
1925
- create_column_stats( Some ( 0 ) , Some ( 500 ) , Some ( 500 ) ) ,
1926
- create_column_stats( Some ( 1000 ) , Some ( 10000 ) , None ) ,
1947
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 100 ) , Absent ) ,
1948
+ create_column_stats( Inexact ( 0 ) , Inexact ( 500 ) , Inexact ( 500 ) , Absent ) ,
1949
+ create_column_stats( Inexact ( 1000 ) , Inexact ( 10000 ) , Absent , Absent ) ,
1927
1950
] ;
1928
1951
1929
1952
let right_col_stats = vec ! [
1930
- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 50 ) ) ,
1931
- create_column_stats( Some ( 0 ) , Some ( 2000 ) , Some ( 2500 ) ) ,
1932
- create_column_stats( Some ( 0 ) , Some ( 100 ) , None ) ,
1953
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 50 ) , Absent ) ,
1954
+ create_column_stats( Inexact ( 0 ) , Inexact ( 2000 ) , Inexact ( 2500 ) , Absent ) ,
1955
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Absent , Absent ) ,
1933
1956
] ;
1934
1957
1935
1958
for ( join_type, expected_num_rows) in cases {
@@ -1970,15 +1993,15 @@ mod tests {
1970
1993
// Join on a=c, x=y (ignores b/d) where x and y do not intersect
1971
1994
1972
1995
let left_col_stats = vec ! [
1973
- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 100 ) ) ,
1974
- create_column_stats( Some ( 0 ) , Some ( 500 ) , Some ( 500 ) ) ,
1975
- create_column_stats( Some ( 1000 ) , Some ( 10000 ) , None ) ,
1996
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 100 ) , Absent ) ,
1997
+ create_column_stats( Inexact ( 0 ) , Inexact ( 500 ) , Inexact ( 500 ) , Absent ) ,
1998
+ create_column_stats( Inexact ( 1000 ) , Inexact ( 10000 ) , Absent , Absent ) ,
1976
1999
] ;
1977
2000
1978
2001
let right_col_stats = vec ! [
1979
- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 50 ) ) ,
1980
- create_column_stats( Some ( 0 ) , Some ( 2000 ) , Some ( 2500 ) ) ,
1981
- create_column_stats( Some ( 0 ) , Some ( 100 ) , None ) ,
2002
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 50 ) , Absent ) ,
2003
+ create_column_stats( Inexact ( 0 ) , Inexact ( 2000 ) , Inexact ( 2500 ) , Absent ) ,
2004
+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Absent , Absent ) ,
1982
2005
] ;
1983
2006
1984
2007
let join_on = vec ! [
0 commit comments