|
24 | 24 | use arrow::array::{new_null_array, RecordBatch, RecordBatchOptions};
|
25 | 25 | use arrow::compute::{can_cast_types, cast};
|
26 | 26 | use arrow::datatypes::{Schema, SchemaRef};
|
27 |
| -use datafusion_common::plan_err; |
| 27 | +use datafusion_common::{plan_err, ColumnStatistics}; |
28 | 28 | use std::fmt::Debug;
|
29 | 29 | use std::sync::Arc;
|
30 | 30 |
|
@@ -96,6 +96,12 @@ pub trait SchemaAdapter: Send + Sync {
|
96 | 96 | pub trait SchemaMapper: Debug + Send + Sync {
|
97 | 97 | /// Adapts a `RecordBatch` to match the `table_schema`
|
98 | 98 | fn map_batch(&self, batch: RecordBatch) -> datafusion_common::Result<RecordBatch>;
|
| 99 | + |
| 100 | + /// Adapts file-level `ColumnStatistics` to match the `table_schema` |
| 101 | + fn map_column_statistics( |
| 102 | + &self, |
| 103 | + file_col_statistics: &[ColumnStatistics], |
| 104 | + ) -> datafusion_common::Result<Vec<ColumnStatistics>>; |
99 | 105 | }
|
100 | 106 |
|
101 | 107 | /// Default [`SchemaAdapterFactory`] for mapping schemas.
|
@@ -334,4 +340,126 @@ impl SchemaMapper for SchemaMapping {
|
334 | 340 | let record_batch = RecordBatch::try_new_with_options(schema, cols, &options)?;
|
335 | 341 | Ok(record_batch)
|
336 | 342 | }
|
| 343 | + |
| 344 | + /// Adapts file-level `ColumnStatistics` to match the `table_schema` |
| 345 | + fn map_column_statistics( |
| 346 | + &self, |
| 347 | + file_col_statistics: &[ColumnStatistics], |
| 348 | + ) -> datafusion_common::Result<Vec<ColumnStatistics>> { |
| 349 | + let mut table_col_statistics = vec![]; |
| 350 | + |
| 351 | + // For each field in the projected table schema, use the statistics of the file column it maps to. |
| 352 | + // If a field is not present in the file schema, fill it with `ColumnStatistics::new_unknown`. |
| 353 | + for (_, file_col_idx) in self |
| 354 | + .projected_table_schema |
| 355 | + .fields() |
| 356 | + .iter() |
| 357 | + .zip(&self.field_mappings) |
| 358 | + { |
| 359 | + if let Some(file_col_idx) = file_col_idx { |
| 360 | + table_col_statistics.push( |
| 361 | + file_col_statistics |
| 362 | + .get(*file_col_idx) |
| 363 | + .cloned() |
| 364 | + .unwrap_or_default(), |
| 365 | + ); |
| 366 | + } else { |
| 367 | + table_col_statistics.push(ColumnStatistics::new_unknown()); |
| 368 | + } |
| 369 | + } |
| 370 | + |
| 371 | + Ok(table_col_statistics) |
| 372 | + } |
| 373 | +} |
| 374 | + |
| 375 | +#[cfg(test)] |
| 376 | +mod tests { |
| 377 | + use arrow::datatypes::{DataType, Field}; |
| 378 | + use datafusion_common::{stats::Precision, Statistics}; |
| 379 | + |
| 380 | + use super::*; |
| 381 | + |
| 382 | + #[test] |
| 383 | + fn test_schema_mapping_map_statistics_basic() { |
| 384 | + // Create table schema (a, b, c) |
| 385 | + let table_schema = Arc::new(Schema::new(vec![ |
| 386 | + Field::new("a", DataType::Int32, true), |
| 387 | + Field::new("b", DataType::Utf8, true), |
| 388 | + Field::new("c", DataType::Float64, true), |
| 389 | + ])); |
| 390 | + |
| 391 | + // Create file schema (b, a) - different order, missing c |
| 392 | + let file_schema = Schema::new(vec![ |
| 393 | + Field::new("b", DataType::Utf8, true), |
| 394 | + Field::new("a", DataType::Int32, true), |
| 395 | + ]); |
| 396 | + |
| 397 | + // Create SchemaAdapter |
| 398 | + let adapter = DefaultSchemaAdapter { |
| 399 | + projected_table_schema: Arc::clone(&table_schema), |
| 400 | + }; |
| 401 | + |
| 402 | + // Get mapper and projection |
| 403 | + let (mapper, projection) = adapter.map_schema(&file_schema).unwrap(); |
| 404 | + |
| 405 | + // Should project columns 0,1 from file |
| 406 | + assert_eq!(projection, vec![0, 1]); |
| 407 | + |
| 408 | + // Create file statistics |
| 409 | + let mut file_stats = Statistics::default(); |
| 410 | + |
| 411 | + // Statistics for column b (index 0 in file) |
| 412 | + let b_stats = ColumnStatistics { |
| 413 | + null_count: Precision::Exact(5), |
| 414 | + ..Default::default() |
| 415 | + }; |
| 416 | + |
| 417 | + // Statistics for column a (index 1 in file) |
| 418 | + let a_stats = ColumnStatistics { |
| 419 | + null_count: Precision::Exact(10), |
| 420 | + ..Default::default() |
| 421 | + }; |
| 422 | + |
| 423 | + file_stats.column_statistics = vec![b_stats, a_stats]; |
| 424 | + |
| 425 | + // Map statistics |
| 426 | + let table_col_stats = mapper |
| 427 | + .map_column_statistics(&file_stats.column_statistics) |
| 428 | + .unwrap(); |
| 429 | + |
| 430 | + // Verify stats |
| 431 | + assert_eq!(table_col_stats.len(), 3); |
| 432 | + assert_eq!(table_col_stats[0].null_count, Precision::Exact(10)); // a from file idx 1 |
| 433 | + assert_eq!(table_col_stats[1].null_count, Precision::Exact(5)); // b from file idx 0 |
| 434 | + assert_eq!(table_col_stats[2].null_count, Precision::Absent); // c (unknown) |
| 435 | + } |
| 436 | + |
| 437 | + #[test] |
| 438 | + fn test_schema_mapping_map_statistics_empty() { |
| 439 | + // Create schemas |
| 440 | + let table_schema = Arc::new(Schema::new(vec![ |
| 441 | + Field::new("a", DataType::Int32, true), |
| 442 | + Field::new("b", DataType::Utf8, true), |
| 443 | + ])); |
| 444 | + let file_schema = Schema::new(vec![ |
| 445 | + Field::new("a", DataType::Int32, true), |
| 446 | + Field::new("b", DataType::Utf8, true), |
| 447 | + ]); |
| 448 | + |
| 449 | + let adapter = DefaultSchemaAdapter { |
| 450 | + projected_table_schema: Arc::clone(&table_schema), |
| 451 | + }; |
| 452 | + let (mapper, _) = adapter.map_schema(&file_schema).unwrap(); |
| 453 | + |
| 454 | + // Empty file statistics |
| 455 | + let file_stats = Statistics::default(); |
| 456 | + let table_col_stats = mapper |
| 457 | + .map_column_statistics(&file_stats.column_statistics) |
| 458 | + .unwrap(); |
| 459 | + |
| 460 | + // All stats should be unknown |
| 461 | + assert_eq!(table_col_stats.len(), 2); |
| 462 | + assert_eq!(table_col_stats[0], ColumnStatistics::new_unknown(),); |
| 463 | + assert_eq!(table_col_stats[1], ColumnStatistics::new_unknown(),); |
| 464 | + } |
337 | 465 | }
|
0 commit comments