@@ -636,8 +636,16 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
636
636
Type :: BYTE_ARRAY | Type :: FIXED_LEN_BYTE_ARRAY => {
637
637
self . column_index_builder . append (
638
638
null_page,
639
- self . truncate_min_value ( stat. min_bytes ( ) ) ,
640
- self . truncate_max_value ( stat. max_bytes ( ) ) ,
639
+ self . truncate_min_value (
640
+ self . props . column_index_truncate_length ( ) ,
641
+ stat. min_bytes ( ) ,
642
+ )
643
+ . 0 ,
644
+ self . truncate_max_value (
645
+ self . props . column_index_truncate_length ( ) ,
646
+ stat. max_bytes ( ) ,
647
+ )
648
+ . 0 ,
641
649
self . page_metrics . num_page_nulls as i64 ,
642
650
) ;
643
651
}
@@ -658,26 +666,26 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
658
666
. append_row_count ( self . page_metrics . num_buffered_rows as i64 ) ;
659
667
}
660
668
661
- fn truncate_min_value ( & self , data : & [ u8 ] ) -> Vec < u8 > {
662
- self . props
663
- . column_index_truncate_length ( )
669
+ fn truncate_min_value ( & self , truncation_length : Option < usize > , data : & [ u8 ] ) -> ( Vec < u8 > , bool ) {
670
+ truncation_length
664
671
. filter ( |l| data. len ( ) > * l)
665
672
. and_then ( |l| match str:: from_utf8 ( data) {
666
673
Ok ( str_data) => truncate_utf8 ( str_data, l) ,
667
674
Err ( _) => Some ( data[ ..l] . to_vec ( ) ) ,
668
675
} )
669
- . unwrap_or_else ( || data. to_vec ( ) )
676
+ . map ( |truncated| ( truncated, true ) )
677
+ . unwrap_or_else ( || ( data. to_vec ( ) , false ) )
670
678
}
671
679
672
- fn truncate_max_value ( & self , data : & [ u8 ] ) -> Vec < u8 > {
673
- self . props
674
- . column_index_truncate_length ( )
680
+ fn truncate_max_value ( & self , truncation_length : Option < usize > , data : & [ u8 ] ) -> ( Vec < u8 > , bool ) {
681
+ truncation_length
675
682
. filter ( |l| data. len ( ) > * l)
676
683
. and_then ( |l| match str:: from_utf8 ( data) {
677
684
Ok ( str_data) => truncate_utf8 ( str_data, l) . and_then ( increment_utf8) ,
678
685
Err ( _) => increment ( data[ ..l] . to_vec ( ) ) ,
679
686
} )
680
- . unwrap_or_else ( || data. to_vec ( ) )
687
+ . map ( |truncated| ( truncated, true ) )
688
+ . unwrap_or_else ( || ( data. to_vec ( ) , false ) )
681
689
}
682
690
683
691
/// Adds data page.
@@ -856,20 +864,64 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
856
864
. set_dictionary_page_offset ( dict_page_offset) ;
857
865
858
866
if self . statistics_enabled != EnabledStatistics :: None {
867
+ let backwards_compatible_min_max = self . descr . sort_order ( ) . is_signed ( ) ;
868
+
859
869
let statistics = ValueStatistics :: < E :: T > :: new (
860
870
self . column_metrics . min_column_value . clone ( ) ,
861
871
self . column_metrics . max_column_value . clone ( ) ,
862
872
self . column_metrics . column_distinct_count ,
863
873
self . column_metrics . num_column_nulls ,
864
874
false ,
865
- ) ;
875
+ )
876
+ . with_backwards_compatible_min_max ( backwards_compatible_min_max)
877
+ . into ( ) ;
878
+
879
+ let statistics = match statistics {
880
+ Statistics :: ByteArray ( stats) if stats. has_min_max_set ( ) => {
881
+ let ( min, did_truncate_min) = self . truncate_min_value (
882
+ self . props . statistics_truncate_length ( ) ,
883
+ stats. min_bytes ( ) ,
884
+ ) ;
885
+ let ( max, did_truncate_max) = self . truncate_max_value (
886
+ self . props . statistics_truncate_length ( ) ,
887
+ stats. max_bytes ( ) ,
888
+ ) ;
889
+ Statistics :: ByteArray (
890
+ ValueStatistics :: new (
891
+ Some ( min. into ( ) ) ,
892
+ Some ( max. into ( ) ) ,
893
+ stats. distinct_count ( ) ,
894
+ stats. null_count ( ) ,
895
+ backwards_compatible_min_max,
896
+ )
897
+ . with_max_is_exact ( !did_truncate_max)
898
+ . with_min_is_exact ( !did_truncate_min) ,
899
+ )
900
+ }
901
+ Statistics :: FixedLenByteArray ( stats) if stats. has_min_max_set ( ) => {
902
+ let ( min, did_truncate_min) = self . truncate_min_value (
903
+ self . props . statistics_truncate_length ( ) ,
904
+ stats. min_bytes ( ) ,
905
+ ) ;
906
+ let ( max, did_truncate_max) = self . truncate_max_value (
907
+ self . props . statistics_truncate_length ( ) ,
908
+ stats. max_bytes ( ) ,
909
+ ) ;
910
+ Statistics :: FixedLenByteArray (
911
+ ValueStatistics :: new (
912
+ Some ( min. into ( ) ) ,
913
+ Some ( max. into ( ) ) ,
914
+ stats. distinct_count ( ) ,
915
+ stats. null_count ( ) ,
916
+ backwards_compatible_min_max,
917
+ )
918
+ . with_max_is_exact ( !did_truncate_max)
919
+ . with_min_is_exact ( !did_truncate_min) ,
920
+ )
921
+ }
922
+ stats => stats,
923
+ } ;
866
924
867
- // Some common readers only support the deprecated statistics
868
- // format so we also write them out if possible
869
- // See https://github.com/apache/arrow-rs/issues/799
870
- let statistics = statistics
871
- . with_backwards_compatible_min_max ( self . descr . sort_order ( ) . is_signed ( ) )
872
- . into ( ) ;
873
925
builder = builder. set_statistics ( statistics) ;
874
926
}
875
927
@@ -2612,6 +2664,148 @@ mod tests {
2612
2664
}
2613
2665
}
2614
2666
2667
/// Writes one BYTE_ARRAY value with a 1-byte statistics truncation length and
/// checks that the chunk statistics are truncated and flagged as inexact.
#[test]
fn test_statistics_truncating_byte_array() {
    let page_writer = get_test_page_writer();

    // Truncate statistics values at 1 byte
    const TEST_TRUNCATE_LENGTH: usize = 1;

    let props = Arc::new(
        WriterProperties::builder()
            .set_statistics_truncate_length(Some(TEST_TRUNCATE_LENGTH))
            .build(),
    );
    let mut writer = get_test_column_writer::<ByteArrayType>(page_writer, 0, 0, props);

    // Only one value is written, so it is both the column minimum and maximum.
    let mut only_value = ByteArray::default();
    only_value.set_data(Bytes::from(String::from("Blart Versenwald III")));
    let data = vec![only_value];

    writer.write_batch(&data, None, None).unwrap();
    writer.flush_data_pages().unwrap();

    let r = writer.close().unwrap();
    assert_eq!(1, r.rows_written);

    let stats = r.metadata.statistics().expect("statistics");
    assert!(stats.has_min_max_set());
    assert_eq!(stats.null_count(), 0);
    assert_eq!(stats.distinct_count(), None);

    let byte_stats = match stats {
        Statistics::ByteArray(inner) => inner,
        _ => panic!("expecting Statistics::ByteArray"),
    };

    let min_value = byte_stats.min();
    let max_value = byte_stats.max();

    // Both bounds were shortened, so neither may be reported as exact.
    assert!(!byte_stats.min_is_exact());
    assert!(!byte_stats.max_is_exact());

    assert_eq!(min_value.len(), TEST_TRUNCATE_LENGTH);
    assert_eq!(max_value.len(), TEST_TRUNCATE_LENGTH);

    // Min is the first byte of the value; max is that byte incremented.
    assert_eq!("B".as_bytes(), min_value.as_bytes());
    assert_eq!("C".as_bytes(), max_value.as_bytes());
}
2711
+
2712
/// Writes one FIXED_LEN_BYTE_ARRAY value (the big-endian bytes of an `i128`)
/// with a 1-byte statistics truncation length and checks that the truncated
/// min/max are flagged as inexact and still bracket the written value.
#[test]
fn test_statistics_truncating_fixed_len_byte_array() {
    let page_writer = get_test_page_writer();

    // Truncate statistics values at 1 byte
    const TEST_TRUNCATE_LENGTH: usize = 1;

    const PSEUDO_DECIMAL_VALUE: i128 = 6541894651216648486512564456564654;
    const PSEUDO_DECIMAL_BYTES: [u8; 16] = PSEUDO_DECIMAL_VALUE.to_be_bytes();

    // parquet specifies big-endian order for decimals
    const EXPECTED_MIN: [u8; TEST_TRUNCATE_LENGTH] = [PSEUDO_DECIMAL_BYTES[0]];
    const EXPECTED_MAX: [u8; TEST_TRUNCATE_LENGTH] =
        [PSEUDO_DECIMAL_BYTES[0].overflowing_add(1).0];

    let props = Arc::new(
        WriterProperties::builder()
            .set_statistics_truncate_length(Some(TEST_TRUNCATE_LENGTH))
            .build(),
    );
    let mut writer =
        get_test_column_writer::<FixedLenByteArrayType>(page_writer, 0, 0, props);

    // Only one value is written, so it is both the column minimum and maximum.
    let mut only_value = FixedLenByteArray::default();
    only_value.set_data(Bytes::from(PSEUDO_DECIMAL_BYTES.as_slice()));
    let data = vec![only_value];

    writer.write_batch(&data, None, None).unwrap();
    writer.flush_data_pages().unwrap();

    let r = writer.close().unwrap();
    assert_eq!(1, r.rows_written);

    let stats = r.metadata.statistics().expect("statistics");
    assert!(stats.has_min_max_set());
    assert_eq!(stats.null_count(), 0);
    assert_eq!(stats.distinct_count(), None);

    let flba_stats = match stats {
        Statistics::FixedLenByteArray(inner) => inner,
        _ => panic!("expecting Statistics::FixedLenByteArray"),
    };

    let min_value = flba_stats.min();
    let max_value = flba_stats.max();

    // Both bounds were shortened, so neither may be reported as exact.
    assert!(!flba_stats.min_is_exact());
    assert!(!flba_stats.max_is_exact());

    assert_eq!(min_value.len(), TEST_TRUNCATE_LENGTH);
    assert_eq!(max_value.len(), TEST_TRUNCATE_LENGTH);

    assert_eq!(EXPECTED_MIN.as_slice(), min_value.as_bytes());
    assert_eq!(EXPECTED_MAX.as_slice(), max_value.as_bytes());

    // Rebuild a 16-byte big-endian integer from a single leading byte,
    // zero-filling the 15 bytes that truncation dropped.
    let widen = |leading: u8| -> i128 {
        let mut padded = [0u8; 16];
        padded[0] = leading;
        i128::from_be_bytes(padded)
    };

    let reconstructed_min = widen(min_value.as_bytes()[0]);
    let reconstructed_max = widen(max_value.as_bytes()[0]);

    // check that the inner value is correctly bounded by the min/max
    println!("min: {reconstructed_min} {PSEUDO_DECIMAL_VALUE}");
    assert!(reconstructed_min <= PSEUDO_DECIMAL_VALUE);
    println!("max {reconstructed_max} {PSEUDO_DECIMAL_VALUE}");
    assert!(reconstructed_max >= PSEUDO_DECIMAL_VALUE);
}
2808
+
2615
2809
#[ test]
2616
2810
fn test_send ( ) {
2617
2811
fn test < T : Send > ( ) { }
0 commit comments