Skip to content

Commit f621d28

Browse files
authored
Parquet: omit min/max for interval columns when writing stats (#5147)
* Parquet: omit min/max for interval columns when writing stats
* Trigger
1 parent 6d4b8bb commit f621d28

File tree

2 files changed

+55
-11
lines changed

2 files changed

+55
-11
lines changed

parquet/src/column/writer/encoder.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
use bytes::Bytes;
1919
use half::f16;
2020

21-
use crate::basic::{Encoding, LogicalType, Type};
21+
use crate::basic::{ConvertedType, Encoding, LogicalType, Type};
2222
use crate::bloom_filter::Sbbf;
2323
use crate::column::writer::{
2424
compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, update_min,
@@ -137,7 +137,10 @@ pub struct ColumnValueEncoderImpl<T: DataType> {
137137

138138
impl<T: DataType> ColumnValueEncoderImpl<T> {
139139
fn write_slice(&mut self, slice: &[T::T]) -> Result<()> {
140-
if self.statistics_enabled == EnabledStatistics::Page {
140+
if self.statistics_enabled == EnabledStatistics::Page
141+
// INTERVAL has undefined sort order, so don't write min/max stats for it
142+
&& self.descr.converted_type() != ConvertedType::INTERVAL
143+
{
141144
if let Some((min, max)) = self.min_max(slice, None) {
142145
update_min(&self.descr, &min, &mut self.min_value);
143146
update_max(&self.descr, &max, &mut self.max_value);

parquet/src/column/writer/mod.rs

Lines changed: 50 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
332332
// If only computing chunk-level statistics compute them here, page-level statistics
333333
// are computed in [`Self::write_mini_batch`] and used to update chunk statistics in
334334
// [`Self::add_data_page`]
335-
if self.statistics_enabled == EnabledStatistics::Chunk {
335+
if self.statistics_enabled == EnabledStatistics::Chunk
336+
// INTERVAL has undefined sort order, so don't write min/max stats for it
337+
&& self.descr.converted_type() != ConvertedType::INTERVAL
338+
{
336339
match (min, max) {
337340
(Some(min), Some(max)) => {
338341
update_min(&self.descr, min, &mut self.column_metrics.min_column_value);
@@ -1093,7 +1096,6 @@ fn is_nan<T: ParquetValueType>(descr: &ColumnDescriptor, val: &T) -> bool {
10931096
///
10941097
/// If `cur` is `None`, sets `cur` to `Some(val)`, otherwise calls `should_update` with
10951098
/// the value of `cur`, and updates `cur` to `Some(val)` if it returns `true`
1096-
10971099
fn update_stat<T: ParquetValueType, F>(
10981100
descr: &ColumnDescriptor,
10991101
val: &T,
@@ -3066,6 +3068,30 @@ mod tests {
30663068
Ok(())
30673069
}
30683070

3071+
/// Checks that writing an INTERVAL column yields chunk statistics with no min/max set.
///
/// The writer changes in this commit skip min/max tracking for `ConvertedType::INTERVAL`
/// because its sort order is undefined; this test pins that behaviour end to end.
#[test]
3072+
fn test_interval_stats_should_not_have_min_max() {
3073+
// Three distinct 12-byte values (matching the descriptor's `with_length(12)`), so a
// min and a max would be observable if the writer were still tracking them.
let input = [
3074+
vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
3075+
vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
3076+
vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
3077+
]
3078+
.into_iter()
3079+
// Go through `ByteArray` and then `.into()` to reach the value type `write_batch` expects.
.map(|s| ByteArray::from(s).into())
3080+
.collect::<Vec<_>>();
3081+
3082+
let page_writer = get_test_page_writer();
3083+
let mut writer = get_test_interval_column_writer(page_writer);
3084+
// No definition or repetition levels are supplied.
writer.write_batch(&input, None, None).unwrap();
3085+
3086+
let metadata = writer.close().unwrap().metadata;
3087+
// A statistics object must still be present; only its min/max are expected to be absent.
let stats = if let Some(Statistics::FixedLenByteArray(stats)) = metadata.statistics() {
3088+
stats.clone()
3089+
} else {
3090+
panic!("metadata missing statistics");
3091+
};
3092+
assert!(!stats.has_min_max_set());
3093+
}
3094+
30693095
fn write_multiple_pages<T: DataType>(
30703096
column_descr: &Arc<ColumnDescriptor>,
30713097
pages: &[&[Option<T::T>]],
@@ -3395,8 +3421,7 @@ mod tests {
33953421
values: &[FixedLenByteArray],
33963422
) -> ValueStatistics<FixedLenByteArray> {
33973423
let page_writer = get_test_page_writer();
3398-
let props = Default::default();
3399-
let mut writer = get_test_float16_column_writer(page_writer, 0, 0, props);
3424+
let mut writer = get_test_float16_column_writer(page_writer);
34003425
writer.write_batch(values, None, None).unwrap();
34013426

34023427
let metadata = writer.close().unwrap().metadata;
@@ -3409,12 +3434,9 @@ mod tests {
34093434

34103435
/// Test helper: wraps `page_writer` in a typed `FixedLenByteArrayType` column writer
/// built on the float16 test column descriptor.
///
/// This hunk shows both sides of the change: the `-` lines are the old signature, which
/// took `max_def_level`, `max_rep_level` and `props`; the `+` lines hard-code levels
/// `0, 0` and `Default::default()` properties, leaving `page_writer` as the only parameter.
fn get_test_float16_column_writer(
34113436
page_writer: Box<dyn PageWriter>,
3412-
max_def_level: i16,
3413-
max_rep_level: i16,
3414-
props: WriterPropertiesPtr,
34153437
) -> ColumnWriterImpl<'static, FixedLenByteArrayType> {
3416-
let descr = Arc::new(get_test_float16_column_descr(max_def_level, max_rep_level));
3417-
let column_writer = get_column_writer(descr, props, page_writer);
3438+
let descr = Arc::new(get_test_float16_column_descr(0, 0));
3439+
let column_writer = get_column_writer(descr, Default::default(), page_writer);
34183440
// Narrow the generic column writer to the fixed-length byte array implementation.
get_typed_column_writer::<FixedLenByteArrayType>(column_writer)
34193441
}
34203442

@@ -3429,6 +3451,25 @@ mod tests {
34293451
ColumnDescriptor::new(Arc::new(tpe), max_def_level, max_rep_level, path)
34303452
}
34313453

3454+
/// Test helper: wraps `page_writer` in a typed `FixedLenByteArrayType` column writer
/// for the INTERVAL test column built by `get_test_interval_column_descr`.
///
/// Writer properties are taken from `Default::default()`.
fn get_test_interval_column_writer(
3455+
page_writer: Box<dyn PageWriter>,
3456+
) -> ColumnWriterImpl<'static, FixedLenByteArrayType> {
3457+
let descr = Arc::new(get_test_interval_column_descr());
3458+
let column_writer = get_column_writer(descr, Default::default(), page_writer);
3459+
// Narrow the generic column writer to the fixed-length byte array implementation.
get_typed_column_writer::<FixedLenByteArrayType>(column_writer)
3460+
}
3461+
3462+
/// Test helper: builds the descriptor for a column named "col" whose physical type is
/// that of `FixedLenByteArrayType`, with length 12 and `ConvertedType::INTERVAL`.
///
/// Both level arguments passed to `ColumnDescriptor::new` are 0. Panics (via `unwrap`)
/// if the primitive type builder rejects the schema.
fn get_test_interval_column_descr() -> ColumnDescriptor {
3463+
let path = ColumnPath::from("col");
3464+
let tpe =
3465+
SchemaType::primitive_type_builder("col", FixedLenByteArrayType::get_physical_type())
3466+
// 12 bytes per value, the same width as the values written by the interval stats test.
.with_length(12)
3467+
.with_converted_type(ConvertedType::INTERVAL)
3468+
.build()
3469+
.unwrap();
3470+
ColumnDescriptor::new(Arc::new(tpe), 0, 0, path)
3471+
}
3472+
34323473
/// Returns column writer for UINT32 Column provided as ConvertedType only
34333474
fn get_test_unsigned_int_given_as_converted_column_writer<'a, T: DataType>(
34343475
page_writer: Box<dyn PageWriter + 'a>,

0 commit comments

Comments
 (0)