Skip to content

Commit 9503456

Browse files
alamb and Jefffrey authored
Support user defined ParquetAccessPlan in ParquetExec, validation to ParquetAccessPlan::select (#10813)
* Allow `ParquetAccessPlan` to be passed in to `ParquetExec`, add validation to ParquetAccessPlan::select * Add test for filtering and user supplied access plan * fix on windows * Apply suggestions from code review Co-authored-by: Jeffrey Vo <[email protected]> --------- Co-authored-by: Jeffrey Vo <[email protected]>
1 parent ad0dc2f commit 9503456

File tree

8 files changed

+692
-38
lines changed

8 files changed

+692
-38
lines changed

datafusion/core/src/datasource/listing/mod.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,17 @@ impl PartitionedFile {
134134
self.range = Some(FileRange { start, end });
135135
self
136136
}
137+
138+
/// Update the user defined extensions for this file.
139+
///
140+
/// This can be used to pass reader specific information.
141+
pub fn with_extensions(
142+
mut self,
143+
extensions: Arc<dyn std::any::Any + Send + Sync>,
144+
) -> Self {
145+
self.extensions = Some(extensions);
146+
self
147+
}
137148
}
138149

139150
impl From<ObjectMeta> for PartitionedFile {

datafusion/core/src/datasource/physical_plan/parquet/access_plan.rs

Lines changed: 111 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use datafusion_common::{internal_err, Result};
1819
use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
1920
use parquet::file::metadata::RowGroupMetaData;
2021

@@ -182,6 +183,11 @@ impl ParquetAccessPlan {
182183
/// is returned for *all* the rows in the row groups that are not skipped.
183184
/// Thus it includes a `Select` selection for any [`RowGroupAccess::Scan`].
184185
///
186+
/// # Errors
187+
///
188+
/// Returns an error if any specified row selection does not specify
189+
/// the same number of rows as in its corresponding `row_group_metadata`.
190+
///
185191
/// # Example: No Selections
186192
///
187193
/// Given an access plan like this
@@ -228,7 +234,7 @@ impl ParquetAccessPlan {
228234
pub fn into_overall_row_selection(
229235
self,
230236
row_group_meta_data: &[RowGroupMetaData],
231-
) -> Option<RowSelection> {
237+
) -> Result<Option<RowSelection>> {
232238
assert_eq!(row_group_meta_data.len(), self.row_groups.len());
233239
// Intuition: entire row groups are filtered out using
234240
// `row_group_indexes` which come from Skip and Scan. An overall
@@ -239,7 +245,32 @@ impl ParquetAccessPlan {
239245
.iter()
240246
.any(|rg| matches!(rg, RowGroupAccess::Selection(_)))
241247
{
242-
return None;
248+
return Ok(None);
249+
}
250+
251+
// validate all Selections
252+
for (idx, (rg, rg_meta)) in self
253+
.row_groups
254+
.iter()
255+
.zip(row_group_meta_data.iter())
256+
.enumerate()
257+
{
258+
let RowGroupAccess::Selection(selection) = rg else {
259+
continue;
260+
};
261+
let rows_in_selection = selection
262+
.iter()
263+
.map(|selection| selection.row_count)
264+
.sum::<usize>();
265+
266+
let row_group_row_count = rg_meta.num_rows();
267+
if rows_in_selection as i64 != row_group_row_count {
268+
return internal_err!(
269+
"Invalid ParquetAccessPlan Selection. Row group {idx} has {row_group_row_count} rows \
270+
but selection only specifies {rows_in_selection} rows. \
271+
Selection: {selection:?}"
272+
);
273+
}
243274
}
244275

245276
let total_selection: RowSelection = self
@@ -261,7 +292,7 @@ impl ParquetAccessPlan {
261292
})
262293
.collect();
263294

264-
Some(total_selection)
295+
Ok(Some(total_selection))
265296
}
266297

267298
/// Return an iterator over the row group indexes that should be scanned
@@ -305,6 +336,7 @@ impl ParquetAccessPlan {
305336
#[cfg(test)]
306337
mod test {
307338
use super::*;
339+
use datafusion_common::assert_contains;
308340
use parquet::basic::LogicalType;
309341
use parquet::file::metadata::ColumnChunkMetaData;
310342
use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor};
@@ -320,7 +352,9 @@ mod test {
320352
]);
321353

322354
let row_group_indexes = access_plan.row_group_indexes();
323-
let row_selection = access_plan.into_overall_row_selection(row_group_metadata());
355+
let row_selection = access_plan
356+
.into_overall_row_selection(row_group_metadata())
357+
.unwrap();
324358

325359
// scan all row groups, no selection
326360
assert_eq!(row_group_indexes, vec![0, 1, 2, 3]);
@@ -337,7 +371,9 @@ mod test {
337371
]);
338372

339373
let row_group_indexes = access_plan.row_group_indexes();
340-
let row_selection = access_plan.into_overall_row_selection(row_group_metadata());
374+
let row_selection = access_plan
375+
.into_overall_row_selection(row_group_metadata())
376+
.unwrap();
341377

342378
// skip all row groups, no selection
343379
assert_eq!(row_group_indexes, vec![] as Vec<usize>);
@@ -348,14 +384,22 @@ mod test {
348384
let access_plan = ParquetAccessPlan::new(vec![
349385
RowGroupAccess::Scan,
350386
RowGroupAccess::Selection(
351-
vec![RowSelector::select(5), RowSelector::skip(7)].into(),
387+
// select / skip all 20 rows in row group 1
388+
vec![
389+
RowSelector::select(5),
390+
RowSelector::skip(7),
391+
RowSelector::select(8),
392+
]
393+
.into(),
352394
),
353395
RowGroupAccess::Skip,
354396
RowGroupAccess::Skip,
355397
]);
356398

357399
let row_group_indexes = access_plan.row_group_indexes();
358-
let row_selection = access_plan.into_overall_row_selection(row_group_metadata());
400+
let row_selection = access_plan
401+
.into_overall_row_selection(row_group_metadata())
402+
.unwrap();
359403

360404
assert_eq!(row_group_indexes, vec![0, 1]);
361405
assert_eq!(
@@ -366,7 +410,8 @@ mod test {
366410
RowSelector::select(10),
367411
// selectors from the second row group
368412
RowSelector::select(5),
369-
RowSelector::skip(7)
413+
RowSelector::skip(7),
414+
RowSelector::select(8)
370415
]
371416
.into()
372417
)
@@ -379,13 +424,21 @@ mod test {
379424
RowGroupAccess::Skip,
380425
RowGroupAccess::Scan,
381426
RowGroupAccess::Selection(
382-
vec![RowSelector::select(5), RowSelector::skip(7)].into(),
427+
// specify all 30 rows in row group 2
428+
vec![
429+
RowSelector::select(5),
430+
RowSelector::skip(7),
431+
RowSelector::select(18),
432+
]
433+
.into(),
383434
),
384435
RowGroupAccess::Scan,
385436
]);
386437

387438
let row_group_indexes = access_plan.row_group_indexes();
388-
let row_selection = access_plan.into_overall_row_selection(row_group_metadata());
439+
let row_selection = access_plan
440+
.into_overall_row_selection(row_group_metadata())
441+
.unwrap();
389442

390443
assert_eq!(row_group_indexes, vec![1, 2, 3]);
391444
assert_eq!(
@@ -397,6 +450,7 @@ mod test {
397450
// selectors from the third row group
398451
RowSelector::select(5),
399452
RowSelector::skip(7),
453+
RowSelector::select(18),
400454
// select the entire fourth row group
401455
RowSelector::select(40),
402456
]
@@ -405,6 +459,53 @@ mod test {
405459
);
406460
}
407461

462+
#[test]
463+
fn test_invalid_too_few() {
464+
let access_plan = ParquetAccessPlan::new(vec![
465+
RowGroupAccess::Scan,
466+
// select 12 rows, but row group 1 has 20
467+
RowGroupAccess::Selection(
468+
vec![RowSelector::select(5), RowSelector::skip(7)].into(),
469+
),
470+
RowGroupAccess::Scan,
471+
RowGroupAccess::Scan,
472+
]);
473+
474+
let row_group_indexes = access_plan.row_group_indexes();
475+
let err = access_plan
476+
.into_overall_row_selection(row_group_metadata())
477+
.unwrap_err()
478+
.to_string();
479+
assert_eq!(row_group_indexes, vec![0, 1, 2, 3]);
480+
assert_contains!(err, "Internal error: Invalid ParquetAccessPlan Selection. Row group 1 has 20 rows but selection only specifies 12 rows");
481+
}
482+
483+
#[test]
484+
fn test_invalid_too_many() {
485+
let access_plan = ParquetAccessPlan::new(vec![
486+
RowGroupAccess::Scan,
487+
// select 22 rows, but row group 1 has only 20
488+
RowGroupAccess::Selection(
489+
vec![
490+
RowSelector::select(10),
491+
RowSelector::skip(2),
492+
RowSelector::select(10),
493+
]
494+
.into(),
495+
),
496+
RowGroupAccess::Scan,
497+
RowGroupAccess::Scan,
498+
]);
499+
500+
let row_group_indexes = access_plan.row_group_indexes();
501+
let err = access_plan
502+
.into_overall_row_selection(row_group_metadata())
503+
.unwrap_err()
504+
.to_string();
505+
assert_eq!(row_group_indexes, vec![0, 1, 2, 3]);
506+
assert_contains!(err, "Invalid ParquetAccessPlan Selection. Row group 1 has 20 rows but selection only specifies 22 rows");
507+
}
508+
408509
static ROW_GROUP_METADATA: OnceLock<Vec<RowGroupMetaData>> = OnceLock::new();
409510

410511
/// [`RowGroupMetaData`] that returns 4 row groups with 10, 20, 30, 40 rows

datafusion/core/src/datasource/physical_plan/parquet/mod.rs

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,52 @@ pub use writer::plan_to_parquet;
145145
/// custom reader is used, it supplies the metadata directly and this parameter
146146
/// is ignored. [`ParquetExecBuilder::with_metadata_size_hint`] for more details.
147147
///
148+
/// * User provided [`ParquetAccessPlan`]s to skip row groups and/or pages
149+
/// based on external information. See "Implementing External Indexes" below
150+
///
151+
/// # Implementing External Indexes
152+
///
153+
/// It is possible to restrict the row groups and selections within those row
154+
/// groups that the ParquetExec will consider by providing an initial
155+
/// [`ParquetAccessPlan`] as `extensions` on [`PartitionedFile`]. This can be
156+
/// used to implement external indexes on top of parquet files and select only
157+
/// portions of the files.
158+
///
159+
/// The `ParquetExec` will try to reduce any provided
160+
/// `ParquetAccessPlan` further based on the contents of `ParquetMetadata` and
161+
/// other settings.
162+
///
163+
/// ## Example of providing a ParquetAccessPlan
164+
///
165+
/// ```
166+
/// # use std::sync::Arc;
167+
/// # use arrow_schema::{Schema, SchemaRef};
168+
/// # use datafusion::datasource::listing::PartitionedFile;
169+
/// # use datafusion::datasource::physical_plan::parquet::ParquetAccessPlan;
170+
/// # use datafusion::datasource::physical_plan::{FileScanConfig, ParquetExec};
171+
/// # use datafusion_execution::object_store::ObjectStoreUrl;
172+
/// # fn schema() -> SchemaRef {
173+
/// # Arc::new(Schema::empty())
174+
/// # }
175+
/// // create an access plan to scan row group 0, 1 and 3 and skip row groups 2 and 4
176+
/// let mut access_plan = ParquetAccessPlan::new_all(5);
177+
/// access_plan.skip(2);
178+
/// access_plan.skip(4);
179+
/// // provide the plan as extension to the PartitionedFile
180+
/// let partitioned_file = PartitionedFile::new("my_file.parquet", 1234)
181+
/// .with_extensions(Arc::new(access_plan));
182+
/// // create a ParquetExec to scan this file
183+
/// let file_scan_config = FileScanConfig::new(ObjectStoreUrl::local_filesystem(), schema())
184+
/// .with_file(partitioned_file);
185+
/// // this parquet exec will not even try to read row groups 2 and 4. Additional
186+
/// // pruning based on predicates may also happen
187+
/// let exec = ParquetExec::builder(file_scan_config).build();
188+
/// ```
189+
///
190+
/// For a complete example, see the [`parquet_index_advanced` example].
191+
///
192+
/// [`parquet_index_advanced` example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/parquet_index_advanced.rs
193+
///
148194
/// # Execution Overview
149195
///
150196
/// * Step 1: [`ParquetExec::execute`] is called, returning a [`FileStream`]

datafusion/core/src/datasource/physical_plan/parquet/opener.rs

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ use crate::datasource::physical_plan::{
2828
use crate::datasource::schema_adapter::SchemaAdapterFactory;
2929
use crate::physical_optimizer::pruning::PruningPredicate;
3030
use arrow_schema::{ArrowError, SchemaRef};
31+
use datafusion_common::{exec_err, Result};
3132
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
3233
use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
3334
use futures::{StreamExt, TryStreamExt};
@@ -60,11 +61,10 @@ pub(super) struct ParquetOpener {
6061
impl FileOpener for ParquetOpener {
6162
fn open(&self, file_meta: FileMeta) -> datafusion_common::Result<FileOpenFuture> {
6263
let file_range = file_meta.range.clone();
63-
let file_metrics = ParquetFileMetrics::new(
64-
self.partition_index,
65-
file_meta.location().as_ref(),
66-
&self.metrics,
67-
);
64+
let extensions = file_meta.extensions.clone();
65+
let file_name = file_meta.location().to_string();
66+
let file_metrics =
67+
ParquetFileMetrics::new(self.partition_index, &file_name, &self.metrics);
6868

6969
let reader: Box<dyn AsyncFileReader> =
7070
self.parquet_file_reader_factory.create_reader(
@@ -139,7 +139,8 @@ impl FileOpener for ParquetOpener {
139139
let predicate = pruning_predicate.as_ref().map(|p| p.as_ref());
140140
let rg_metadata = file_metadata.row_groups();
141141
// track which row groups to actually read
142-
let access_plan = ParquetAccessPlan::new_all(rg_metadata.len());
142+
let access_plan =
143+
create_initial_plan(&file_name, extensions, rg_metadata.len())?;
143144
let mut row_groups = RowGroupAccessPlanFilter::new(access_plan);
144145
// if there is a range restricting what parts of the file to read
145146
if let Some(range) = file_range.as_ref() {
@@ -186,7 +187,7 @@ impl FileOpener for ParquetOpener {
186187

187188
let row_group_indexes = access_plan.row_group_indexes();
188189
if let Some(row_selection) =
189-
access_plan.into_overall_row_selection(rg_metadata)
190+
access_plan.into_overall_row_selection(rg_metadata)?
190191
{
191192
builder = builder.with_row_selection(row_selection);
192193
}
@@ -212,3 +213,34 @@ impl FileOpener for ParquetOpener {
212213
}))
213214
}
214215
}
216+
217+
/// Return the initial [`ParquetAccessPlan`]
218+
///
219+
/// If the user has supplied one as an extension, use that
220+
/// otherwise return a plan that scans all row groups
221+
///
222+
/// Returns an error if an invalid `ParquetAccessPlan` is provided
223+
///
224+
/// Note: file_name is only used for error messages
225+
fn create_initial_plan(
226+
file_name: &str,
227+
extensions: Option<Arc<dyn std::any::Any + Send + Sync>>,
228+
row_group_count: usize,
229+
) -> Result<ParquetAccessPlan> {
230+
if let Some(extensions) = extensions {
231+
if let Some(access_plan) = extensions.downcast_ref::<ParquetAccessPlan>() {
232+
let plan_len = access_plan.len();
233+
if plan_len != row_group_count {
234+
return exec_err!(
235+
"Invalid ParquetAccessPlan for {file_name}. Specified {plan_len} row groups, but file has {row_group_count}"
236+
);
237+
}
238+
239+
// check row group count matches the plan
240+
return Ok(access_plan.clone());
241+
}
242+
}
243+
244+
// default to scanning all row groups
245+
Ok(ParquetAccessPlan::new_all(row_group_count))
246+
}

0 commit comments

Comments
 (0)