Skip to content

Commit 5c025cc

Browse files
tustvold and alamb authored
Don't use parquet file offset for file range pruning (#5997)
* Don't use parquet file offset for file range pruning
* Update datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs

  Co-authored-by: Andrew Lamb <[email protected]>
* Format
* Tweak logic
* Update test

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 59fd93c commit 5c025cc

File tree

2 files changed

+11
-5
lines changed

2 files changed

+11
-5
lines changed

datafusion/core/src/physical_plan/file_format/parquet.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1619,11 +1619,11 @@ mod tests {
16191619
.infer_schema(&state, &store, &[meta.clone()])
16201620
.await?;
16211621

1622-
let group_empty = vec![vec![file_range(&meta, 0, 5)]];
1623-
let group_contain = vec![vec![file_range(&meta, 5, i64::MAX)]];
1622+
let group_empty = vec![vec![file_range(&meta, 0, 2)]];
1623+
let group_contain = vec![vec![file_range(&meta, 2, i64::MAX)]];
16241624
let group_all = vec![vec![
1625-
file_range(&meta, 0, 5),
1626-
file_range(&meta, 5, i64::MAX),
1625+
file_range(&meta, 0, 2),
1626+
file_range(&meta, 2, i64::MAX),
16271627
]];
16281628

16291629
assert_parquet_read(&state, group_empty, None, file_schema.clone()).await?;

datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,13 @@ pub(crate) fn prune_row_groups(
5353
let mut filtered = Vec::with_capacity(groups.len());
5454
for (idx, metadata) in groups.iter().enumerate() {
5555
if let Some(range) = &range {
56-
let offset = metadata.column(0).file_offset();
56+
// figure out where the first dictionary page (or, failing that, the first data page) is
57+
// note don't use the location of metadata
58+
// <https://github.com/apache/arrow-datafusion/issues/5995>
59+
let col = metadata.column(0);
60+
let offset = col
61+
.dictionary_page_offset()
62+
.unwrap_or_else(|| col.data_page_offset());
5763
if offset < range.start || offset >= range.end {
5864
continue;
5965
}

0 commit comments

Comments
 (0)