Skip to content

Commit 7941577

Browse files
emcake
and
Matthew Kemp
authored
Enable truncation of binary statistics columns (#5076)
* changes needed to introduce min/max exactness
* implement truncation property and logic, tests
* format lints
* change min/max exact to be with... methods
* reduce code noise
* remove redundant clone

---------

Co-authored-by: Matthew Kemp <[email protected]>
1 parent 7ba36b0 commit 7941577

File tree

4 files changed

+401
-74
lines changed

4 files changed

+401
-74
lines changed

parquet/src/column/writer/mod.rs

Lines changed: 211 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -636,8 +636,16 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
636636
Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => {
637637
self.column_index_builder.append(
638638
null_page,
639-
self.truncate_min_value(stat.min_bytes()),
640-
self.truncate_max_value(stat.max_bytes()),
639+
self.truncate_min_value(
640+
self.props.column_index_truncate_length(),
641+
stat.min_bytes(),
642+
)
643+
.0,
644+
self.truncate_max_value(
645+
self.props.column_index_truncate_length(),
646+
stat.max_bytes(),
647+
)
648+
.0,
641649
self.page_metrics.num_page_nulls as i64,
642650
);
643651
}
@@ -658,26 +666,26 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
658666
.append_row_count(self.page_metrics.num_buffered_rows as i64);
659667
}
660668

661-
fn truncate_min_value(&self, data: &[u8]) -> Vec<u8> {
662-
self.props
663-
.column_index_truncate_length()
669+
fn truncate_min_value(&self, truncation_length: Option<usize>, data: &[u8]) -> (Vec<u8>, bool) {
670+
truncation_length
664671
.filter(|l| data.len() > *l)
665672
.and_then(|l| match str::from_utf8(data) {
666673
Ok(str_data) => truncate_utf8(str_data, l),
667674
Err(_) => Some(data[..l].to_vec()),
668675
})
669-
.unwrap_or_else(|| data.to_vec())
676+
.map(|truncated| (truncated, true))
677+
.unwrap_or_else(|| (data.to_vec(), false))
670678
}
671679

672-
fn truncate_max_value(&self, data: &[u8]) -> Vec<u8> {
673-
self.props
674-
.column_index_truncate_length()
680+
fn truncate_max_value(&self, truncation_length: Option<usize>, data: &[u8]) -> (Vec<u8>, bool) {
681+
truncation_length
675682
.filter(|l| data.len() > *l)
676683
.and_then(|l| match str::from_utf8(data) {
677684
Ok(str_data) => truncate_utf8(str_data, l).and_then(increment_utf8),
678685
Err(_) => increment(data[..l].to_vec()),
679686
})
680-
.unwrap_or_else(|| data.to_vec())
687+
.map(|truncated| (truncated, true))
688+
.unwrap_or_else(|| (data.to_vec(), false))
681689
}
682690

683691
/// Adds data page.
@@ -856,20 +864,64 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
856864
.set_dictionary_page_offset(dict_page_offset);
857865

858866
if self.statistics_enabled != EnabledStatistics::None {
867+
let backwards_compatible_min_max = self.descr.sort_order().is_signed();
868+
859869
let statistics = ValueStatistics::<E::T>::new(
860870
self.column_metrics.min_column_value.clone(),
861871
self.column_metrics.max_column_value.clone(),
862872
self.column_metrics.column_distinct_count,
863873
self.column_metrics.num_column_nulls,
864874
false,
865-
);
875+
)
876+
.with_backwards_compatible_min_max(backwards_compatible_min_max)
877+
.into();
878+
879+
let statistics = match statistics {
880+
Statistics::ByteArray(stats) if stats.has_min_max_set() => {
881+
let (min, did_truncate_min) = self.truncate_min_value(
882+
self.props.statistics_truncate_length(),
883+
stats.min_bytes(),
884+
);
885+
let (max, did_truncate_max) = self.truncate_max_value(
886+
self.props.statistics_truncate_length(),
887+
stats.max_bytes(),
888+
);
889+
Statistics::ByteArray(
890+
ValueStatistics::new(
891+
Some(min.into()),
892+
Some(max.into()),
893+
stats.distinct_count(),
894+
stats.null_count(),
895+
backwards_compatible_min_max,
896+
)
897+
.with_max_is_exact(!did_truncate_max)
898+
.with_min_is_exact(!did_truncate_min),
899+
)
900+
}
901+
Statistics::FixedLenByteArray(stats) if stats.has_min_max_set() => {
902+
let (min, did_truncate_min) = self.truncate_min_value(
903+
self.props.statistics_truncate_length(),
904+
stats.min_bytes(),
905+
);
906+
let (max, did_truncate_max) = self.truncate_max_value(
907+
self.props.statistics_truncate_length(),
908+
stats.max_bytes(),
909+
);
910+
Statistics::FixedLenByteArray(
911+
ValueStatistics::new(
912+
Some(min.into()),
913+
Some(max.into()),
914+
stats.distinct_count(),
915+
stats.null_count(),
916+
backwards_compatible_min_max,
917+
)
918+
.with_max_is_exact(!did_truncate_max)
919+
.with_min_is_exact(!did_truncate_min),
920+
)
921+
}
922+
stats => stats,
923+
};
866924

867-
// Some common readers only support the deprecated statistics
868-
// format so we also write them out if possible
869-
// See https://github.com/apache/arrow-rs/issues/799
870-
let statistics = statistics
871-
.with_backwards_compatible_min_max(self.descr.sort_order().is_signed())
872-
.into();
873925
builder = builder.set_statistics(statistics);
874926
}
875927

@@ -2612,6 +2664,148 @@ mod tests {
26122664
}
26132665
}
26142666

2667+
#[test]
2668+
fn test_statistics_truncating_byte_array() {
2669+
let page_writer = get_test_page_writer();
2670+
2671+
const TEST_TRUNCATE_LENGTH: usize = 1;
2672+
2673+
// Truncate values at 1 byte
2674+
let builder =
2675+
WriterProperties::builder().set_statistics_truncate_length(Some(TEST_TRUNCATE_LENGTH));
2676+
let props = Arc::new(builder.build());
2677+
let mut writer = get_test_column_writer::<ByteArrayType>(page_writer, 0, 0, props);
2678+
2679+
let mut data = vec![ByteArray::default(); 1];
2680+
// The only value written, so it is both the column min and max before truncation
2681+
data[0].set_data(Bytes::from(String::from("Blart Versenwald III")));
2682+
2683+
writer.write_batch(&data, None, None).unwrap();
2684+
2685+
writer.flush_data_pages().unwrap();
2686+
2687+
let r = writer.close().unwrap();
2688+
2689+
assert_eq!(1, r.rows_written);
2690+
2691+
let stats = r.metadata.statistics().expect("statistics");
2692+
assert!(stats.has_min_max_set());
2693+
assert_eq!(stats.null_count(), 0);
2694+
assert_eq!(stats.distinct_count(), None);
2695+
if let Statistics::ByteArray(_stats) = stats {
2696+
let min_value = _stats.min();
2697+
let max_value = _stats.max();
2698+
2699+
assert!(!_stats.min_is_exact());
2700+
assert!(!_stats.max_is_exact());
2701+
2702+
assert_eq!(min_value.len(), TEST_TRUNCATE_LENGTH);
2703+
assert_eq!(max_value.len(), TEST_TRUNCATE_LENGTH);
2704+
2705+
assert_eq!("B".as_bytes(), min_value.as_bytes());
2706+
assert_eq!("C".as_bytes(), max_value.as_bytes());
2707+
} else {
2708+
panic!("expecting Statistics::ByteArray");
2709+
}
2710+
}
2711+
2712+
#[test]
2713+
fn test_statistics_truncating_fixed_len_byte_array() {
2714+
let page_writer = get_test_page_writer();
2715+
2716+
const TEST_TRUNCATE_LENGTH: usize = 1;
2717+
2718+
// Truncate values at 1 byte
2719+
let builder =
2720+
WriterProperties::builder().set_statistics_truncate_length(Some(TEST_TRUNCATE_LENGTH));
2721+
let props = Arc::new(builder.build());
2722+
let mut writer = get_test_column_writer::<FixedLenByteArrayType>(page_writer, 0, 0, props);
2723+
2724+
let mut data = vec![FixedLenByteArray::default(); 1];
2725+
2726+
const PSEUDO_DECIMAL_VALUE: i128 = 6541894651216648486512564456564654;
2727+
const PSEUDO_DECIMAL_BYTES: [u8; 16] = PSEUDO_DECIMAL_VALUE.to_be_bytes();
2728+
2729+
const EXPECTED_MIN: [u8; TEST_TRUNCATE_LENGTH] = [PSEUDO_DECIMAL_BYTES[0]]; // parquet specifies big-endian order for decimals
2730+
const EXPECTED_MAX: [u8; TEST_TRUNCATE_LENGTH] =
2731+
[PSEUDO_DECIMAL_BYTES[0].overflowing_add(1).0];
2732+
2733+
// The only value written, so it is both the column min and max before truncation
2734+
data[0].set_data(Bytes::from(PSEUDO_DECIMAL_BYTES.as_slice()));
2735+
2736+
writer.write_batch(&data, None, None).unwrap();
2737+
2738+
writer.flush_data_pages().unwrap();
2739+
2740+
let r = writer.close().unwrap();
2741+
2742+
assert_eq!(1, r.rows_written);
2743+
2744+
let stats = r.metadata.statistics().expect("statistics");
2745+
assert!(stats.has_min_max_set());
2746+
assert_eq!(stats.null_count(), 0);
2747+
assert_eq!(stats.distinct_count(), None);
2748+
if let Statistics::FixedLenByteArray(_stats) = stats {
2749+
let min_value = _stats.min();
2750+
let max_value = _stats.max();
2751+
2752+
assert!(!_stats.min_is_exact());
2753+
assert!(!_stats.max_is_exact());
2754+
2755+
assert_eq!(min_value.len(), TEST_TRUNCATE_LENGTH);
2756+
assert_eq!(max_value.len(), TEST_TRUNCATE_LENGTH);
2757+
2758+
assert_eq!(EXPECTED_MIN.as_slice(), min_value.as_bytes());
2759+
assert_eq!(EXPECTED_MAX.as_slice(), max_value.as_bytes());
2760+
2761+
let reconstructed_min = i128::from_be_bytes([
2762+
min_value.as_bytes()[0],
2763+
0,
2764+
0,
2765+
0,
2766+
0,
2767+
0,
2768+
0,
2769+
0,
2770+
0,
2771+
0,
2772+
0,
2773+
0,
2774+
0,
2775+
0,
2776+
0,
2777+
0,
2778+
]);
2779+
2780+
let reconstructed_max = i128::from_be_bytes([
2781+
max_value.as_bytes()[0],
2782+
0,
2783+
0,
2784+
0,
2785+
0,
2786+
0,
2787+
0,
2788+
0,
2789+
0,
2790+
0,
2791+
0,
2792+
0,
2793+
0,
2794+
0,
2795+
0,
2796+
0,
2797+
]);
2798+
2799+
// check that the inner value is correctly bounded by the min/max
2800+
println!("min: {reconstructed_min} {PSEUDO_DECIMAL_VALUE}");
2801+
assert!(reconstructed_min <= PSEUDO_DECIMAL_VALUE);
2802+
println!("max {reconstructed_max} {PSEUDO_DECIMAL_VALUE}");
2803+
assert!(reconstructed_max >= PSEUDO_DECIMAL_VALUE);
2804+
} else {
2805+
panic!("expecting Statistics::FixedLenByteArray");
2806+
}
2807+
}
2808+
26152809
#[test]
26162810
fn test_send() {
26172811
fn test<T: Send>() {}

parquet/src/file/properties.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
5151
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
5252
/// Default value for [`BloomFilterProperties::ndv`]
5353
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
54+
/// Default value for [`WriterProperties::statistics_truncate_length`]
55+
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = None;
5456

5557
/// Parquet writer version.
5658
///
@@ -136,6 +138,7 @@ pub struct WriterProperties {
136138
column_properties: HashMap<ColumnPath, ColumnProperties>,
137139
sorting_columns: Option<Vec<SortingColumn>>,
138140
column_index_truncate_length: Option<usize>,
141+
statistics_truncate_length: Option<usize>,
139142
}
140143

141144
impl Default for WriterProperties {
@@ -241,6 +244,13 @@ impl WriterProperties {
241244
self.column_index_truncate_length
242245
}
243246

247+
/// Returns the maximum length of truncated min/max values in statistics.
248+
///
249+
/// `None` if truncation is disabled; otherwise the value is greater than 0.
250+
pub fn statistics_truncate_length(&self) -> Option<usize> {
251+
self.statistics_truncate_length
252+
}
253+
244254
/// Returns encoding for a data page, when dictionary encoding is enabled.
245255
/// This is not configurable.
246256
#[inline]
@@ -334,6 +344,7 @@ pub struct WriterPropertiesBuilder {
334344
column_properties: HashMap<ColumnPath, ColumnProperties>,
335345
sorting_columns: Option<Vec<SortingColumn>>,
336346
column_index_truncate_length: Option<usize>,
347+
statistics_truncate_length: Option<usize>,
337348
}
338349

339350
impl WriterPropertiesBuilder {
@@ -352,6 +363,7 @@ impl WriterPropertiesBuilder {
352363
column_properties: HashMap::new(),
353364
sorting_columns: None,
354365
column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
366+
statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
355367
}
356368
}
357369

@@ -370,6 +382,7 @@ impl WriterPropertiesBuilder {
370382
column_properties: self.column_properties,
371383
sorting_columns: self.sorting_columns,
372384
column_index_truncate_length: self.column_index_truncate_length,
385+
statistics_truncate_length: self.statistics_truncate_length,
373386
}
374387
}
375388

@@ -643,6 +656,17 @@ impl WriterPropertiesBuilder {
643656
self.column_index_truncate_length = max_length;
644657
self
645658
}
659+
660+
/// Sets the max length of min/max value fields in statistics. Must be greater than 0.
661+
/// If set to `None`, no limit is applied. Panics if set to `Some(0)`.
662+
pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
663+
if let Some(value) = max_length {
664+
assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
665+
}
666+
667+
self.statistics_truncate_length = max_length;
668+
self
669+
}
646670
}
647671

648672
/// Controls the level of statistics to be computed by the writer

0 commit comments

Comments
 (0)