Skip to content

Commit c819b16

Browse files
committed
enhance: Remove redundant statistics from FileScanConfig
Signed-off-by: Alan Tang <[email protected]>
1 parent 0867086 commit c819b16

File tree

3 files changed

+31
-31
lines changed

3 files changed

+31
-31
lines changed

datafusion/datasource/src/file_scan_config.rs

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,6 @@ pub struct FileScanConfig {
148148
pub file_groups: Vec<Vec<PartitionedFile>>,
149149
/// Table constraints
150150
pub constraints: Constraints,
151-
/// Estimated overall statistics of the files, taking `filters` into account.
152-
/// Defaults to [`Statistics::new_unknown`].
153-
pub statistics: Statistics,
154151
/// Columns on which to project the data. Indexes that are higher than the
155152
/// number of columns of `file_schema` refer to `table_partition_cols`.
156153
pub projection: Option<Vec<usize>>,
@@ -308,7 +305,6 @@ impl FileScanConfig {
308305
file_schema,
309306
file_groups: vec![],
310307
constraints: Constraints::empty(),
311-
statistics,
312308
projection: None,
313309
limit: None,
314310
table_partition_cols: vec![],
@@ -317,14 +313,15 @@ impl FileScanConfig {
317313
new_lines_in_values: false,
318314
file_source: Arc::clone(&file_source),
319315
};
316+
config = config.with_statistics(statistics);
320317

321-
config = config.with_source(Arc::clone(&file_source));
322318
config
323319
}
324320

325321
/// Set the file source
326322
pub fn with_source(mut self, file_source: Arc<dyn FileSource>) -> Self {
327-
self.file_source = file_source.with_statistics(self.statistics.clone());
323+
self.file_source =
324+
file_source.with_statistics(Statistics::new_unknown(&self.file_schema));
328325
self
329326
}
330327

@@ -336,7 +333,6 @@ impl FileScanConfig {
336333

337334
/// Set the statistics of the files
338335
pub fn with_statistics(mut self, statistics: Statistics) -> Self {
339-
self.statistics = statistics.clone();
340336
self.file_source = self.file_source.with_statistics(statistics);
341337
self
342338
}
@@ -351,10 +347,7 @@ impl FileScanConfig {
351347
}
352348

353349
fn projected_stats(&self) -> Statistics {
354-
let statistics = self
355-
.file_source
356-
.statistics()
357-
.unwrap_or(self.statistics.clone());
350+
let statistics = self.file_source.statistics().unwrap();
358351

359352
let table_cols_stats = self
360353
.projection_indices()
@@ -487,7 +480,7 @@ impl FileScanConfig {
487480
return (
488481
Arc::clone(&self.file_schema),
489482
self.constraints.clone(),
490-
self.statistics.clone(),
483+
self.file_source.statistics().unwrap().clone(),
491484
self.output_ordering.clone(),
492485
);
493486
}
@@ -630,7 +623,11 @@ impl Debug for FileScanConfig {
630623
write!(f, "FileScanConfig {{")?;
631624
write!(f, "object_store_url={:?}, ", self.object_store_url)?;
632625

633-
write!(f, "statistics={:?}, ", self.statistics)?;
626+
write!(
627+
f,
628+
"statistics={:?}, ",
629+
self.file_source.statistics().unwrap()
630+
)?;
634631

635632
DisplayAs::fmt_as(self, DisplayFormatType::Verbose, f)?;
636633
write!(f, "}}")

datafusion/proto/src/physical_plan/to_proto.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,7 @@ pub fn serialize_file_scan_config(
507507

508508
Ok(protobuf::FileScanExecConf {
509509
file_groups,
510-
statistics: Some((&conf.statistics).into()),
510+
statistics: Some((&conf.file_source.statistics().unwrap()).into()),
511511
limit: conf.limit.map(|l| protobuf::ScanLimit { limit: l as u32 }),
512512
projection: conf
513513
.projection

datafusion/proto/tests/cases/roundtrip_physical_plan.rs

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -741,22 +741,22 @@ fn roundtrip_parquet_exec_with_pruning_predicate() -> Result<()> {
741741
let source = Arc::new(
742742
ParquetSource::new(options).with_predicate(Arc::clone(&file_schema), predicate),
743743
);
744+
let statistics = Statistics {
745+
num_rows: Precision::Inexact(100),
746+
total_byte_size: Precision::Inexact(1024),
747+
column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
748+
Field::new("col", DataType::Utf8, false),
749+
]))),
750+
};
744751

745-
let scan_config = FileScanConfig {
752+
let mut scan_config = FileScanConfig {
746753
object_store_url: ObjectStoreUrl::local_filesystem(),
747754
file_schema,
748755
file_groups: vec![vec![PartitionedFile::new(
749756
"/path/to/file.parquet".to_string(),
750757
1024,
751758
)]],
752759
constraints: Constraints::empty(),
753-
statistics: Statistics {
754-
num_rows: Precision::Inexact(100),
755-
total_byte_size: Precision::Inexact(1024),
756-
column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
757-
Field::new("col", DataType::Utf8, false),
758-
]))),
759-
},
760760
projection: None,
761761
limit: None,
762762
table_partition_cols: vec![],
@@ -765,6 +765,7 @@ fn roundtrip_parquet_exec_with_pruning_predicate() -> Result<()> {
765765
new_lines_in_values: false,
766766
file_source: source,
767767
};
768+
scan_config = scan_config.with_statistics(statistics);
768769

769770
roundtrip_test(scan_config.build())
770771
}
@@ -806,21 +807,23 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> {
806807
.with_predicate(Arc::clone(&file_schema), custom_predicate_expr),
807808
);
808809

809-
let scan_config = FileScanConfig {
810+
let statistics = Statistics {
811+
num_rows: Precision::Inexact(100),
812+
total_byte_size: Precision::Inexact(1024),
813+
column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
814+
Field::new("col", DataType::Utf8, false),
815+
]))),
816+
};
817+
818+
let mut scan_config = FileScanConfig {
810819
object_store_url: ObjectStoreUrl::local_filesystem(),
811820
file_schema,
812821
file_groups: vec![vec![PartitionedFile::new(
813822
"/path/to/file.parquet".to_string(),
814823
1024,
815824
)]],
816825
constraints: Constraints::empty(),
817-
statistics: Statistics {
818-
num_rows: Precision::Inexact(100),
819-
total_byte_size: Precision::Inexact(1024),
820-
column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
821-
Field::new("col", DataType::Utf8, false),
822-
]))),
823-
},
826+
824827
projection: None,
825828
limit: None,
826829
table_partition_cols: vec![],
@@ -829,6 +832,7 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> {
829832
new_lines_in_values: false,
830833
file_source: source,
831834
};
835+
scan_config = scan_config.with_statistics(statistics);
832836

833837
#[derive(Debug, Clone, Eq)]
834838
struct CustomPredicateExpr {
@@ -1616,7 +1620,6 @@ async fn roundtrip_projection_source() -> Result<()> {
16161620
1024,
16171621
)]],
16181622
constraints: Constraints::empty(),
1619-
statistics,
16201623
file_schema: schema.clone(),
16211624
projection: Some(vec![0, 1, 2]),
16221625
limit: None,

0 commit comments

Comments
 (0)