Commit e7b08ed

yjshen and alamb authored
Range scan support for ParquetExec (#1990)
* Filter parquet row groups by range as well
* fix
* WIP: case when expr works
* short-circuit case_when
* else
* only range part
* Update datafusion/core/src/datasource/listing/mod.rs
  Co-authored-by: Andrew Lamb <[email protected]>
* test
* Update parquet.rs
* fix

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 774b91b commit e7b08ed

File tree

6 files changed: +149 −3 lines changed

ballista/rust/core/proto/ballista.proto (+6)

@@ -72,11 +72,17 @@ message Statistics {
   bool is_exact = 4;
 }

+message FileRange {
+  int64 start = 1;
+  int64 end = 2;
+}
+
 message PartitionedFile {
   string path = 1;
   uint64 size = 2;
   uint64 last_modified_ns = 3;
   repeated datafusion.ScalarValue partition_values = 4;
+  FileRange range = 5;
 }

 message CsvFormat {

ballista/rust/core/src/serde/physical_plan/from_proto.rs (+13 −1)

@@ -30,7 +30,7 @@ use chrono::{TimeZone, Utc};
 use datafusion::datafusion_data_access::{
     object_store::local::LocalFileSystem, FileMeta, SizedFile,
 };
-use datafusion::datasource::listing::PartitionedFile;
+use datafusion::datasource::listing::{FileRange, PartitionedFile};
 use datafusion::execution::context::ExecutionProps;
 use datafusion::logical_plan::FunctionRegistry;

@@ -301,6 +301,18 @@ impl TryFrom<&protobuf::PartitionedFile> for PartitionedFile {
                 .iter()
                 .map(|v| v.try_into())
                 .collect::<Result<Vec<_>, _>>()?,
+            range: val.range.as_ref().map(|v| v.try_into()).transpose()?,
+        })
+    }
+}
+
+impl TryFrom<&protobuf::FileRange> for FileRange {
+    type Error = BallistaError;
+
+    fn try_from(value: &protobuf::FileRange) -> Result<Self, Self::Error> {
+        Ok(FileRange {
+            start: value.start,
+            end: value.end,
         })
     }
 }

ballista/rust/core/src/serde/physical_plan/to_proto.rs (+13 −1)

@@ -35,7 +35,7 @@ use datafusion::physical_plan::{
     Statistics,
 };

-use datafusion::datasource::listing::PartitionedFile;
+use datafusion::datasource::listing::{FileRange, PartitionedFile};
 use datafusion::physical_plan::file_format::FileScanConfig;

 use datafusion::physical_plan::expressions::{Count, Literal};

@@ -354,6 +354,18 @@ impl TryFrom<&PartitionedFile> for protobuf::PartitionedFile {
                 .iter()
                 .map(|v| v.try_into())
                 .collect::<Result<Vec<_>, _>>()?,
+            range: pf.range.as_ref().map(|r| r.try_into()).transpose()?,
+        })
+    }
+}
+
+impl TryFrom<&FileRange> for protobuf::FileRange {
+    type Error = BallistaError;
+
+    fn try_from(value: &FileRange) -> Result<Self, Self::Error> {
+        Ok(protobuf::FileRange {
+            start: value.start,
+            end: value.end,
        })
    }
}
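Taken together, the two impls above give a lossless round trip between the DataFusion and protobuf representations of FileRange. Below is a minimal sketch of that round trip; it assumes ballista-core is a dependency, and the module paths (ballista_core::serde::protobuf, ballista_core::error::BallistaError) are assumptions based on the imports in these files, not verified against this exact revision.

// Hypothetical round-trip check for the conversions above. The crate and
// module paths are assumptions; the conversion logic mirrors the two
// TryFrom impls added in this commit.
use std::convert::TryInto;

use ballista_core::error::BallistaError;
use ballista_core::serde::protobuf;
use datafusion::datasource::listing::FileRange;

fn main() -> Result<(), BallistaError> {
    let original = FileRange { start: 0, end: 4096 };

    // DataFusion -> protobuf, via the impl in to_proto.rs ...
    let encoded: protobuf::FileRange = (&original).try_into()?;
    // ... and protobuf -> DataFusion, via the impl in from_proto.rs.
    let decoded: FileRange = (&encoded).try_into()?;

    assert_eq!(original.start, decoded.start);
    assert_eq!(original.end, decoded.end);
    Ok(())
}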

datafusion/core/src/datasource/listing/helpers.rs (+3)

@@ -174,6 +174,7 @@ pub async fn pruned_partition_list(
             Ok(PartitionedFile {
                 partition_values: vec![],
                 file_meta: f?,
+                range: None,
             })
         }),
     ));

@@ -217,6 +218,7 @@ pub async fn pruned_partition_list(
             Ok(PartitionedFile {
                 partition_values,
                 file_meta,
+                range: None,
             })
         })
 }

@@ -358,6 +360,7 @@ fn batches_to_paths(batches: &[RecordBatch]) -> Vec<PartitionedFile> {
                     ScalarValue::try_from_array(batch.column(col), row).unwrap()
                 })
                 .collect(),
+            range: None,
         })
     })
     .collect()

datafusion/core/src/datasource/listing/mod.rs (+28 −1)

@@ -32,15 +32,28 @@ pub use table::{ListingOptions, ListingTable, ListingTableConfig};
 pub type PartitionedFileStream =
     Pin<Box<dyn Stream<Item = Result<PartitionedFile>> + Send + Sync + 'static>>;

+/// Only scan a subset of Row Groups from the Parquet file whose data "midpoint"
+/// lies within the [start, end) byte offsets. This option can be used to scan non-overlapping
+/// sections of a Parquet file in parallel.
 #[derive(Debug, Clone)]
+pub struct FileRange {
+    /// Range start
+    pub start: i64,
+    /// Range end
+    pub end: i64,
+}
+
+#[derive(Debug, Clone)]
+/// A single file or part of a file that should be read, along with its schema, statistics
 /// A single file that should be read, along with its schema, statistics
 /// and partition column values that need to be appended to each row.
 pub struct PartitionedFile {
     /// Path for the file (e.g. URL, filesystem path, etc)
     pub file_meta: FileMeta,
     /// Values of partition columns to be appended to each row
     pub partition_values: Vec<ScalarValue>,
-    // We may include row group range here for a more fine-grained parallel execution
+    /// An optional file range for a more fine-grained parallel execution
+    pub range: Option<FileRange>,
 }

 impl PartitionedFile {

@@ -52,6 +65,19 @@ impl PartitionedFile {
                 last_modified: None,
             },
             partition_values: vec![],
+            range: None,
+        }
+    }
+
+    /// Create a file range without metadata or partition
+    pub fn new_with_range(path: String, size: u64, start: i64, end: i64) -> Self {
+        Self {
+            file_meta: FileMeta {
+                sized_file: SizedFile { path, size },
+                last_modified: None,
+            },
+            partition_values: vec![],
+            range: Some(FileRange { start, end }),
         }
     }
 }

@@ -67,5 +93,6 @@ pub fn local_unpartitioned_file(file: String) -> PartitionedFile {
     PartitionedFile {
         file_meta: local::local_unpartitioned_file(file),
         partition_values: vec![],
+        range: None,
     }
 }
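The new constructor makes it straightforward to split one Parquet file into non-overlapping scan units. A minimal usage sketch follows; the path and file size are made-up example values, and only the new_with_range signature comes from this commit.

// Sketch: split one file into two non-overlapping byte ranges so the two
// halves can be scanned as separate partitions. Path and size are hypothetical.
use datafusion::datasource::listing::PartitionedFile;

fn main() {
    let path = "/tmp/data/example.parquet".to_string(); // hypothetical file
    let size: u64 = 8192; // hypothetical file size in bytes

    // Each half covers [start, end); a row group is scanned by whichever
    // range contains the midpoint of its byte span, so no row group is
    // read twice.
    let first_half = PartitionedFile::new_with_range(path.clone(), size, 0, 4096);
    let second_half = PartitionedFile::new_with_range(path, size, 4096, size as i64);

    // One file group per range yields one scan partition per range.
    let file_groups = vec![vec![first_half], vec![second_half]];
    assert_eq!(file_groups.len(), 2);
}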

datafusion/core/src/physical_plan/file_format/parquet.rs (+86)

@@ -337,6 +337,14 @@ impl ParquetExecStream {
                 file_metrics,
             ));
         }
+        if let Some(range) = &file.range {
+            assert!(
+                range.start >= 0 && range.end > 0 && range.end > range.start,
+                "invalid range specified: {:?}",
+                range
+            );
+            opt = opt.with_range(range.start, range.end);
+        }

         let file_reader = SerializedFileReader::new_with_options(
             ChunkObjectReader(object_reader),

@@ -649,13 +657,15 @@ mod tests {
     };

     use super::*;
+    use crate::datasource::listing::FileRange;
     use crate::execution::options::CsvReadOptions;
     use crate::prelude::{ParquetReadOptions, SessionConfig, SessionContext};
     use arrow::array::Float32Array;
     use arrow::{
         array::{Int64Array, Int8Array, StringArray},
         datatypes::{DataType, Field},
     };
+    use datafusion_data_access::object_store::local;
     use datafusion_expr::{col, lit};
     use futures::StreamExt;
     use parquet::{

@@ -1099,6 +1109,81 @@ mod tests {
         Ok(())
     }

+    #[tokio::test]
+    async fn parquet_exec_with_range() -> Result<()> {
+        fn file_range(file: String, start: i64, end: i64) -> PartitionedFile {
+            PartitionedFile {
+                file_meta: local::local_unpartitioned_file(file),
+                partition_values: vec![],
+                range: Some(FileRange { start, end }),
+            }
+        }
+
+        async fn assert_parquet_read(
+            file_groups: Vec<Vec<PartitionedFile>>,
+            expected_row_num: Option<usize>,
+            task_ctx: Arc<TaskContext>,
+            file_schema: SchemaRef,
+        ) -> Result<()> {
+            let parquet_exec = ParquetExec::new(
+                FileScanConfig {
+                    object_store: Arc::new(LocalFileSystem {}),
+                    file_groups,
+                    file_schema,
+                    statistics: Statistics::default(),
+                    projection: None,
+                    limit: None,
+                    table_partition_cols: vec![],
+                },
+                None,
+            );
+            assert_eq!(parquet_exec.output_partitioning().partition_count(), 1);
+            let results = parquet_exec.execute(0, task_ctx).await?.next().await;
+
+            if let Some(expected_row_num) = expected_row_num {
+                let batch = results.unwrap()?;
+                assert_eq!(expected_row_num, batch.num_rows());
+            } else {
+                assert!(results.is_none());
+            }
+
+            Ok(())
+        }
+
+        let session_ctx = SessionContext::new();
+        let testdata = crate::test_util::parquet_test_data();
+        let filename = format!("{}/alltypes_plain.parquet", testdata);
+        let file_schema = ParquetFormat::default()
+            .infer_schema(local_object_reader_stream(vec![filename.clone()]))
+            .await?;
+
+        let group_empty = vec![vec![file_range(filename.clone(), 0, 5)]];
+        let group_contain = vec![vec![file_range(filename.clone(), 5, i64::MAX)]];
+        let group_all = vec![vec![
+            file_range(filename.clone(), 0, 5),
+            file_range(filename.clone(), 5, i64::MAX),
+        ]];
+
+        assert_parquet_read(
+            group_empty,
+            None,
+            session_ctx.task_ctx(),
+            file_schema.clone(),
+        )
+        .await?;
+        assert_parquet_read(
+            group_contain,
+            Some(8),
+            session_ctx.task_ctx(),
+            file_schema.clone(),
+        )
+        .await?;
+        assert_parquet_read(group_all, Some(8), session_ctx.task_ctx(), file_schema)
+            .await?;
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn parquet_exec_with_partition() -> Result<()> {
         let session_ctx = SessionContext::new();

@@ -1171,6 +1256,7 @@ mod tests {
                 last_modified: None,
             },
             partition_values: vec![],
+            range: None,
         };

         let parquet_exec = ParquetExec::new(
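For intuition about why the ranges [0, 5) and [5, i64::MAX) in the test above read the file exactly once between them, here is a standalone sketch of the midpoint rule documented on FileRange. It is an illustration only, not the parquet crate's actual row-group filter, and the row-group offsets and lengths are made-up values.

// Sketch of the "midpoint" selection rule: a row group belongs to the range
// that contains the midpoint of its byte span, so adjacent [start, end)
// ranges never share a row group and together cover the whole file.
struct FileRange {
    start: i64,
    end: i64,
}

fn row_group_selected(rg_offset: i64, rg_len: i64, range: &FileRange) -> bool {
    let midpoint = rg_offset + rg_len / 2;
    range.start <= midpoint && midpoint < range.end
}

fn main() {
    let first = FileRange { start: 0, end: 5 };
    let rest = FileRange { start: 5, end: i64::MAX };

    // A hypothetical row group starting at byte 4 with length 100 has
    // midpoint 54, so it is picked up by `rest` and skipped by `first`,
    // which is the behavior parquet_exec_with_range exercises.
    assert!(!row_group_selected(4, 100, &first));
    assert!(row_group_selected(4, 100, &rest));
}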
