Skip to content

Commit ce14fbc

Browse files
akoshchiy and alamb authored
Add statistics_truncate_length parquet writer config (#14782)
* Add parquet writer config * test fixes --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 2d57a0b commit ce14fbc

File tree

11 files changed

+71
-0
lines changed

11 files changed

+71
-0
lines changed

datafusion/common/src/config.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,10 @@ config_namespace! {
503503
/// (writing) Sets column index truncate length
504504
pub column_index_truncate_length: Option<usize>, default = Some(64)
505505

506+
/// (writing) Sets statistics truncate length. If NULL, uses
507+
/// default parquet writer setting
508+
pub statistics_truncate_length: Option<usize>, default = None
509+
506510
/// (writing) Sets best effort maximum number of rows in data page
507511
pub data_page_row_count_limit: usize, default = 20_000
508512

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ impl ParquetOptions {
219219
max_row_group_size,
220220
created_by,
221221
column_index_truncate_length,
222+
statistics_truncate_length,
222223
data_page_row_count_limit,
223224
encoding,
224225
bloom_filter_on_write,
@@ -255,6 +256,7 @@ impl ParquetOptions {
255256
.set_max_row_group_size(*max_row_group_size)
256257
.set_created_by(created_by.clone())
257258
.set_column_index_truncate_length(*column_index_truncate_length)
259+
.set_statistics_truncate_length(*statistics_truncate_length)
258260
.set_data_page_row_count_limit(*data_page_row_count_limit)
259261
.set_bloom_filter_enabled(*bloom_filter_on_write);
260262

@@ -491,6 +493,7 @@ mod tests {
491493
max_row_group_size: 42,
492494
created_by: "wordy".into(),
493495
column_index_truncate_length: Some(42),
496+
statistics_truncate_length: Some(42),
494497
data_page_row_count_limit: 42,
495498
encoding: Some("BYTE_STREAM_SPLIT".into()),
496499
bloom_filter_on_write: !defaults.bloom_filter_on_write,
@@ -587,6 +590,7 @@ mod tests {
587590
max_row_group_size: props.max_row_group_size(),
588591
created_by: props.created_by().to_string(),
589592
column_index_truncate_length: props.column_index_truncate_length(),
593+
statistics_truncate_length: props.statistics_truncate_length(),
590594
data_page_row_count_limit: props.data_page_row_count_limit(),
591595

592596
// global options which set the default column props

datafusion/proto-common/proto/datafusion_common.proto

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,6 +522,10 @@ message ParquetOptions {
522522
uint64 column_index_truncate_length = 17;
523523
}
524524

525+
oneof statistics_truncate_length_opt {
526+
uint64 statistics_truncate_length = 31;
527+
}
528+
525529
oneof encoding_opt {
526530
string encoding = 19;
527531
}

datafusion/proto-common/src/from_proto/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -952,6 +952,12 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
952952
protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v) => Some(*v as usize),
953953
})
954954
.unwrap_or(None),
955+
statistics_truncate_length: value
956+
.statistics_truncate_length_opt.as_ref()
957+
.map(|opt| match opt {
958+
protobuf::parquet_options::StatisticsTruncateLengthOpt::StatisticsTruncateLength(v) => Some(*v as usize),
959+
})
960+
.unwrap_or(None),
955961
data_page_row_count_limit: value.data_page_row_count_limit as usize,
956962
encoding: value
957963
.encoding_opt.clone()

datafusion/proto-common/src/generated/pbjson.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3133,6 +3133,7 @@ impl serde::Serialize for Field {
31333133
}
31343134
}
31353135
impl<'de> serde::Deserialize<'de> for Field {
3136+
#[allow(deprecated)]
31363137
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
31373138
where
31383139
D: serde::Deserializer<'de>,
@@ -4968,6 +4969,9 @@ impl serde::Serialize for ParquetOptions {
49684969
if self.column_index_truncate_length_opt.is_some() {
49694970
len += 1;
49704971
}
4972+
if self.statistics_truncate_length_opt.is_some() {
4973+
len += 1;
4974+
}
49714975
if self.encoding_opt.is_some() {
49724976
len += 1;
49734977
}
@@ -5100,6 +5104,15 @@ impl serde::Serialize for ParquetOptions {
51005104
}
51015105
}
51025106
}
5107+
if let Some(v) = self.statistics_truncate_length_opt.as_ref() {
5108+
match v {
5109+
parquet_options::StatisticsTruncateLengthOpt::StatisticsTruncateLength(v) => {
5110+
#[allow(clippy::needless_borrow)]
5111+
#[allow(clippy::needless_borrows_for_generic_args)]
5112+
struct_ser.serialize_field("statisticsTruncateLength", ToString::to_string(&v).as_str())?;
5113+
}
5114+
}
5115+
}
51035116
if let Some(v) = self.encoding_opt.as_ref() {
51045117
match v {
51055118
parquet_options::EncodingOpt::Encoding(v) => {
@@ -5183,6 +5196,8 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
51835196
"maxStatisticsSize",
51845197
"column_index_truncate_length",
51855198
"columnIndexTruncateLength",
5199+
"statistics_truncate_length",
5200+
"statisticsTruncateLength",
51865201
"encoding",
51875202
"bloom_filter_fpp",
51885203
"bloomFilterFpp",
@@ -5218,6 +5233,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
52185233
StatisticsEnabled,
52195234
MaxStatisticsSize,
52205235
ColumnIndexTruncateLength,
5236+
StatisticsTruncateLength,
52215237
Encoding,
52225238
BloomFilterFpp,
52235239
BloomFilterNdv,
@@ -5268,6 +5284,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
52685284
"statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled),
52695285
"maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize),
52705286
"columnIndexTruncateLength" | "column_index_truncate_length" => Ok(GeneratedField::ColumnIndexTruncateLength),
5287+
"statisticsTruncateLength" | "statistics_truncate_length" => Ok(GeneratedField::StatisticsTruncateLength),
52715288
"encoding" => Ok(GeneratedField::Encoding),
52725289
"bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp),
52735290
"bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv),
@@ -5316,6 +5333,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
53165333
let mut statistics_enabled_opt__ = None;
53175334
let mut max_statistics_size_opt__ = None;
53185335
let mut column_index_truncate_length_opt__ = None;
5336+
let mut statistics_truncate_length_opt__ = None;
53195337
let mut encoding_opt__ = None;
53205338
let mut bloom_filter_fpp_opt__ = None;
53215339
let mut bloom_filter_ndv_opt__ = None;
@@ -5491,6 +5509,12 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
54915509
}
54925510
column_index_truncate_length_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(x.0));
54935511
}
5512+
GeneratedField::StatisticsTruncateLength => {
5513+
if statistics_truncate_length_opt__.is_some() {
5514+
return Err(serde::de::Error::duplicate_field("statisticsTruncateLength"));
5515+
}
5516+
statistics_truncate_length_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::StatisticsTruncateLengthOpt::StatisticsTruncateLength(x.0));
5517+
}
54945518
GeneratedField::Encoding => {
54955519
if encoding_opt__.is_some() {
54965520
return Err(serde::de::Error::duplicate_field("encoding"));
@@ -5538,6 +5562,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
55385562
statistics_enabled_opt: statistics_enabled_opt__,
55395563
max_statistics_size_opt: max_statistics_size_opt__,
55405564
column_index_truncate_length_opt: column_index_truncate_length_opt__,
5565+
statistics_truncate_length_opt: statistics_truncate_length_opt__,
55415566
encoding_opt: encoding_opt__,
55425567
bloom_filter_fpp_opt: bloom_filter_fpp_opt__,
55435568
bloom_filter_ndv_opt: bloom_filter_ndv_opt__,

datafusion/proto-common/src/generated/prost.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,10 @@ pub struct ParquetOptions {
794794
pub column_index_truncate_length_opt: ::core::option::Option<
795795
parquet_options::ColumnIndexTruncateLengthOpt,
796796
>,
797+
#[prost(oneof = "parquet_options::StatisticsTruncateLengthOpt", tags = "31")]
798+
pub statistics_truncate_length_opt: ::core::option::Option<
799+
parquet_options::StatisticsTruncateLengthOpt,
800+
>,
797801
#[prost(oneof = "parquet_options::EncodingOpt", tags = "19")]
798802
pub encoding_opt: ::core::option::Option<parquet_options::EncodingOpt>,
799803
#[prost(oneof = "parquet_options::BloomFilterFppOpt", tags = "21")]
@@ -833,6 +837,11 @@ pub mod parquet_options {
833837
#[prost(uint64, tag = "17")]
834838
ColumnIndexTruncateLength(u64),
835839
}
840+
#[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
841+
pub enum StatisticsTruncateLengthOpt {
842+
#[prost(uint64, tag = "31")]
843+
StatisticsTruncateLength(u64),
844+
}
836845
#[derive(Clone, PartialEq, ::prost::Oneof)]
837846
pub enum EncodingOpt {
838847
#[prost(string, tag = "19")]

datafusion/proto-common/src/to_proto/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -823,6 +823,7 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
823823
max_row_group_size: value.max_row_group_size as u64,
824824
created_by: value.created_by.clone(),
825825
column_index_truncate_length_opt: value.column_index_truncate_length.map(|v| protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v as u64)),
826+
statistics_truncate_length_opt: value.statistics_truncate_length.map(|v| protobuf::parquet_options::StatisticsTruncateLengthOpt::StatisticsTruncateLength(v as u64)),
826827
data_page_row_count_limit: value.data_page_row_count_limit as u64,
827828
encoding_opt: value.encoding.clone().map(protobuf::parquet_options::EncodingOpt::Encoding),
828829
bloom_filter_on_read: value.bloom_filter_on_read,

datafusion/proto/src/generated/datafusion_proto_common.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,10 @@ pub struct ParquetOptions {
794794
pub column_index_truncate_length_opt: ::core::option::Option<
795795
parquet_options::ColumnIndexTruncateLengthOpt,
796796
>,
797+
#[prost(oneof = "parquet_options::StatisticsTruncateLengthOpt", tags = "31")]
798+
pub statistics_truncate_length_opt: ::core::option::Option<
799+
parquet_options::StatisticsTruncateLengthOpt,
800+
>,
797801
#[prost(oneof = "parquet_options::EncodingOpt", tags = "19")]
798802
pub encoding_opt: ::core::option::Option<parquet_options::EncodingOpt>,
799803
#[prost(oneof = "parquet_options::BloomFilterFppOpt", tags = "21")]
@@ -833,6 +837,11 @@ pub mod parquet_options {
833837
#[prost(uint64, tag = "17")]
834838
ColumnIndexTruncateLength(u64),
835839
}
840+
#[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
841+
pub enum StatisticsTruncateLengthOpt {
842+
#[prost(uint64, tag = "31")]
843+
StatisticsTruncateLength(u64),
844+
}
836845
#[derive(Clone, PartialEq, ::prost::Oneof)]
837846
pub enum EncodingOpt {
838847
#[prost(string, tag = "19")]

datafusion/proto/src/logical_plan/file_formats.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,9 @@ impl TableParquetOptionsProto {
394394
column_index_truncate_length_opt: global_options.global.column_index_truncate_length.map(|length| {
395395
parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(length as u64)
396396
}),
397+
statistics_truncate_length_opt: global_options.global.statistics_truncate_length.map(|length| {
398+
parquet_options::StatisticsTruncateLengthOpt::StatisticsTruncateLength(length as u64)
399+
}),
397400
data_page_row_count_limit: global_options.global.data_page_row_count_limit as u64,
398401
encoding_opt: global_options.global.encoding.map(|encoding| {
399402
parquet_options::EncodingOpt::Encoding(encoding)
@@ -487,6 +490,9 @@ impl From<&ParquetOptionsProto> for ParquetOptions {
487490
column_index_truncate_length: proto.column_index_truncate_length_opt.as_ref().map(|opt| match opt {
488491
parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(length) => *length as usize,
489492
}),
493+
statistics_truncate_length: proto.statistics_truncate_length_opt.as_ref().map(|opt| match opt {
494+
parquet_options::StatisticsTruncateLengthOpt::StatisticsTruncateLength(length) => *length as usize,
495+
}),
490496
data_page_row_count_limit: proto.data_page_row_count_limit as usize,
491497
encoding: proto.encoding_opt.as_ref().map(|opt| match opt {
492498
parquet_options::EncodingOpt::Encoding(encoding) => encoding.clone(),

datafusion/sqllogictest/test_files/information_schema.slt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ datafusion.execution.parquet.schema_force_view_types true
218218
datafusion.execution.parquet.skip_arrow_metadata false
219219
datafusion.execution.parquet.skip_metadata true
220220
datafusion.execution.parquet.statistics_enabled page
221+
datafusion.execution.parquet.statistics_truncate_length NULL
221222
datafusion.execution.parquet.write_batch_size 1024
222223
datafusion.execution.parquet.writer_version 1.0
223224
datafusion.execution.planning_concurrency 13
@@ -313,6 +314,7 @@ datafusion.execution.parquet.schema_force_view_types true (reading) If true, par
313314
datafusion.execution.parquet.skip_arrow_metadata false (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to <https://docs.rs/parquet/53.3.0/parquet/arrow/arrow_writer/struct.ArrowWriterOptions.html#method.with_skip_arrow_metadata>
314315
datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata
315316
datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting
317+
datafusion.execution.parquet.statistics_truncate_length NULL (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting
316318
datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in bytes
317319
datafusion.execution.parquet.writer_version 1.0 (writing) Sets parquet writer version valid values are "1.0" and "2.0"
318320
datafusion.execution.planning_concurrency 13 Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system

docs/source/user-guide/configs.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
7070
| datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. |
7171
| datafusion.execution.parquet.created_by | datafusion version 45.0.0 | (writing) Sets "created by" property |
7272
| datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length |
73+
| datafusion.execution.parquet.statistics_truncate_length | NULL | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting |
7374
| datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page |
7475
| datafusion.execution.parquet.encoding | NULL | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting |
7576
| datafusion.execution.parquet.bloom_filter_on_read | true | (reading) Use any available bloom filters when reading parquet files |

0 commit comments

Comments
 (0)