15
15
// specific language governing permissions and limitations
16
16
// under the License.
17
17
18
+ use datafusion_common:: { internal_err, Result } ;
18
19
use parquet:: arrow:: arrow_reader:: { RowSelection , RowSelector } ;
19
20
use parquet:: file:: metadata:: RowGroupMetaData ;
20
21
@@ -182,6 +183,11 @@ impl ParquetAccessPlan {
182
183
/// is returned for *all* the rows in the row groups that are not skipped.
183
184
/// Thus it includes a `Select` selection for any [`RowGroupAccess::Scan`].
184
185
///
186
+ /// # Errors
187
+ ///
188
+ /// Returns an error if any specified row selection does not specify
189
+ /// the same number of rows as in its corresponding `row_group_metadata`.
190
+ ///
185
191
/// # Example: No Selections
186
192
///
187
193
/// Given an access plan like this
@@ -228,7 +234,7 @@ impl ParquetAccessPlan {
228
234
pub fn into_overall_row_selection (
229
235
self ,
230
236
row_group_meta_data : & [ RowGroupMetaData ] ,
231
- ) -> Option < RowSelection > {
237
+ ) -> Result < Option < RowSelection > > {
232
238
assert_eq ! ( row_group_meta_data. len( ) , self . row_groups. len( ) ) ;
233
239
// Intuition: entire row groups are filtered out using
234
240
// `row_group_indexes` which come from Skip and Scan. An overall
@@ -239,7 +245,32 @@ impl ParquetAccessPlan {
239
245
. iter ( )
240
246
. any ( |rg| matches ! ( rg, RowGroupAccess :: Selection ( _) ) )
241
247
{
242
- return None ;
248
+ return Ok ( None ) ;
249
+ }
250
+
251
+ // validate all Selections
252
+ for ( idx, ( rg, rg_meta) ) in self
253
+ . row_groups
254
+ . iter ( )
255
+ . zip ( row_group_meta_data. iter ( ) )
256
+ . enumerate ( )
257
+ {
258
+ let RowGroupAccess :: Selection ( selection) = rg else {
259
+ continue ;
260
+ } ;
261
+ let rows_in_selection = selection
262
+ . iter ( )
263
+ . map ( |selection| selection. row_count )
264
+ . sum :: < usize > ( ) ;
265
+
266
+ let row_group_row_count = rg_meta. num_rows ( ) ;
267
+ if rows_in_selection as i64 != row_group_row_count {
268
+ return internal_err ! (
269
+ "Invalid ParquetAccessPlan Selection. Row group {idx} has {row_group_row_count} rows \
270
+ but selection only specifies {rows_in_selection} rows. \
271
+ Selection: {selection:?}"
272
+ ) ;
273
+ }
243
274
}
244
275
245
276
let total_selection: RowSelection = self
@@ -261,7 +292,7 @@ impl ParquetAccessPlan {
261
292
} )
262
293
. collect ( ) ;
263
294
264
- Some ( total_selection)
295
+ Ok ( Some ( total_selection) )
265
296
}
266
297
267
298
/// Return an iterator over the row group indexes that should be scanned
@@ -305,6 +336,7 @@ impl ParquetAccessPlan {
305
336
#[ cfg( test) ]
306
337
mod test {
307
338
use super :: * ;
339
+ use datafusion_common:: assert_contains;
308
340
use parquet:: basic:: LogicalType ;
309
341
use parquet:: file:: metadata:: ColumnChunkMetaData ;
310
342
use parquet:: schema:: types:: { SchemaDescPtr , SchemaDescriptor } ;
@@ -320,7 +352,9 @@ mod test {
320
352
] ) ;
321
353
322
354
let row_group_indexes = access_plan. row_group_indexes ( ) ;
323
- let row_selection = access_plan. into_overall_row_selection ( row_group_metadata ( ) ) ;
355
+ let row_selection = access_plan
356
+ . into_overall_row_selection ( row_group_metadata ( ) )
357
+ . unwrap ( ) ;
324
358
325
359
// scan all row groups, no selection
326
360
assert_eq ! ( row_group_indexes, vec![ 0 , 1 , 2 , 3 ] ) ;
@@ -337,7 +371,9 @@ mod test {
337
371
] ) ;
338
372
339
373
let row_group_indexes = access_plan. row_group_indexes ( ) ;
340
- let row_selection = access_plan. into_overall_row_selection ( row_group_metadata ( ) ) ;
374
+ let row_selection = access_plan
375
+ . into_overall_row_selection ( row_group_metadata ( ) )
376
+ . unwrap ( ) ;
341
377
342
378
// skip all row groups, no selection
343
379
assert_eq ! ( row_group_indexes, vec![ ] as Vec <usize >) ;
@@ -348,14 +384,22 @@ mod test {
348
384
let access_plan = ParquetAccessPlan :: new ( vec ! [
349
385
RowGroupAccess :: Scan ,
350
386
RowGroupAccess :: Selection (
351
- vec![ RowSelector :: select( 5 ) , RowSelector :: skip( 7 ) ] . into( ) ,
387
+ // select / skip all 20 rows in row group 1
388
+ vec![
389
+ RowSelector :: select( 5 ) ,
390
+ RowSelector :: skip( 7 ) ,
391
+ RowSelector :: select( 8 ) ,
392
+ ]
393
+ . into( ) ,
352
394
) ,
353
395
RowGroupAccess :: Skip ,
354
396
RowGroupAccess :: Skip ,
355
397
] ) ;
356
398
357
399
let row_group_indexes = access_plan. row_group_indexes ( ) ;
358
- let row_selection = access_plan. into_overall_row_selection ( row_group_metadata ( ) ) ;
400
+ let row_selection = access_plan
401
+ . into_overall_row_selection ( row_group_metadata ( ) )
402
+ . unwrap ( ) ;
359
403
360
404
assert_eq ! ( row_group_indexes, vec![ 0 , 1 ] ) ;
361
405
assert_eq ! (
@@ -366,7 +410,8 @@ mod test {
366
410
RowSelector :: select( 10 ) ,
367
411
// selectors from the second row group
368
412
RowSelector :: select( 5 ) ,
369
- RowSelector :: skip( 7 )
413
+ RowSelector :: skip( 7 ) ,
414
+ RowSelector :: select( 8 )
370
415
]
371
416
. into( )
372
417
)
@@ -379,13 +424,21 @@ mod test {
379
424
RowGroupAccess :: Skip ,
380
425
RowGroupAccess :: Scan ,
381
426
RowGroupAccess :: Selection (
382
- vec![ RowSelector :: select( 5 ) , RowSelector :: skip( 7 ) ] . into( ) ,
427
+ // specify all 30 rows in row group 2
428
+ vec![
429
+ RowSelector :: select( 5 ) ,
430
+ RowSelector :: skip( 7 ) ,
431
+ RowSelector :: select( 18 ) ,
432
+ ]
433
+ . into( ) ,
383
434
) ,
384
435
RowGroupAccess :: Scan ,
385
436
] ) ;
386
437
387
438
let row_group_indexes = access_plan. row_group_indexes ( ) ;
388
- let row_selection = access_plan. into_overall_row_selection ( row_group_metadata ( ) ) ;
439
+ let row_selection = access_plan
440
+ . into_overall_row_selection ( row_group_metadata ( ) )
441
+ . unwrap ( ) ;
389
442
390
443
assert_eq ! ( row_group_indexes, vec![ 1 , 2 , 3 ] ) ;
391
444
assert_eq ! (
@@ -397,6 +450,7 @@ mod test {
397
450
// selectors from the third row group
398
451
RowSelector :: select( 5 ) ,
399
452
RowSelector :: skip( 7 ) ,
453
+ RowSelector :: select( 18 ) ,
400
454
// select the entire fourth row group
401
455
RowSelector :: select( 40 ) ,
402
456
]
@@ -405,6 +459,53 @@ mod test {
405
459
) ;
406
460
}
407
461
462
+ #[ test]
463
+ fn test_invalid_too_few ( ) {
464
+ let access_plan = ParquetAccessPlan :: new ( vec ! [
465
+ RowGroupAccess :: Scan ,
466
+ // select 12 rows, but row group 1 has 20
467
+ RowGroupAccess :: Selection (
468
+ vec![ RowSelector :: select( 5 ) , RowSelector :: skip( 7 ) ] . into( ) ,
469
+ ) ,
470
+ RowGroupAccess :: Scan ,
471
+ RowGroupAccess :: Scan ,
472
+ ] ) ;
473
+
474
+ let row_group_indexes = access_plan. row_group_indexes ( ) ;
475
+ let err = access_plan
476
+ . into_overall_row_selection ( row_group_metadata ( ) )
477
+ . unwrap_err ( )
478
+ . to_string ( ) ;
479
+ assert_eq ! ( row_group_indexes, vec![ 0 , 1 , 2 , 3 ] ) ;
480
+ assert_contains ! ( err, "Internal error: Invalid ParquetAccessPlan Selection. Row group 1 has 20 rows but selection only specifies 12 rows" ) ;
481
+ }
482
+
483
+ #[ test]
484
+ fn test_invalid_too_many ( ) {
485
+ let access_plan = ParquetAccessPlan :: new ( vec ! [
486
+ RowGroupAccess :: Scan ,
487
+ // select 22 rows, but row group 1 has only 20
488
+ RowGroupAccess :: Selection (
489
+ vec![
490
+ RowSelector :: select( 10 ) ,
491
+ RowSelector :: skip( 2 ) ,
492
+ RowSelector :: select( 10 ) ,
493
+ ]
494
+ . into( ) ,
495
+ ) ,
496
+ RowGroupAccess :: Scan ,
497
+ RowGroupAccess :: Scan ,
498
+ ] ) ;
499
+
500
+ let row_group_indexes = access_plan. row_group_indexes ( ) ;
501
+ let err = access_plan
502
+ . into_overall_row_selection ( row_group_metadata ( ) )
503
+ . unwrap_err ( )
504
+ . to_string ( ) ;
505
+ assert_eq ! ( row_group_indexes, vec![ 0 , 1 , 2 , 3 ] ) ;
506
+ assert_contains ! ( err, "Invalid ParquetAccessPlan Selection. Row group 1 has 20 rows but selection only specifies 22 rows" ) ;
507
+ }
508
+
408
509
static ROW_GROUP_METADATA : OnceLock < Vec < RowGroupMetaData > > = OnceLock :: new ( ) ;
409
510
410
511
/// [`RowGroupMetaData`] that returns 4 row groups with 10, 20, 30, 40 rows
0 commit comments