Skip to content

Commit 7ccc6d7

Browse files
logan-keede and alamb authored
some dependency removals and setup for refactor of FileScanConfig (#14543)
* some dependency removals and setup for refactor
* fix: CI for linux build
* move FileGroupsPartitioner
* remove old
* Fix supports_repartition
* fix

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent d5f19f3 commit 7ccc6d7

File tree

18 files changed

+97
-73
lines changed

18 files changed

+97
-73
lines changed

datafusion-examples/examples/custom_file_format.rs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,14 @@ use arrow::{
2121
array::{AsArray, RecordBatch, StringArray, UInt8Array},
2222
datatypes::{DataType, Field, Schema, SchemaRef, UInt64Type},
2323
};
24-
use datafusion::common::{GetExt, Statistics};
2524
use datafusion::datasource::data_source::FileSource;
2625
use datafusion::execution::session_state::SessionStateBuilder;
2726
use datafusion::physical_expr::LexRequirement;
2827
use datafusion::physical_expr::PhysicalExpr;
28+
use datafusion::{
29+
catalog::Session,
30+
common::{GetExt, Statistics},
31+
};
2932
use datafusion::{
3033
datasource::{
3134
file_format::{
@@ -36,7 +39,6 @@ use datafusion::{
3639
MemTable,
3740
},
3841
error::Result,
39-
execution::context::SessionState,
4042
physical_plan::ExecutionPlan,
4143
prelude::SessionContext,
4244
};
@@ -84,7 +86,7 @@ impl FileFormat for TSVFileFormat {
8486

8587
async fn infer_schema(
8688
&self,
87-
state: &SessionState,
89+
state: &dyn Session,
8890
store: &Arc<dyn ObjectStore>,
8991
objects: &[ObjectMeta],
9092
) -> Result<SchemaRef> {
@@ -95,7 +97,7 @@ impl FileFormat for TSVFileFormat {
9597

9698
async fn infer_stats(
9799
&self,
98-
state: &SessionState,
100+
state: &dyn Session,
99101
store: &Arc<dyn ObjectStore>,
100102
table_schema: SchemaRef,
101103
object: &ObjectMeta,
@@ -107,7 +109,7 @@ impl FileFormat for TSVFileFormat {
107109

108110
async fn create_physical_plan(
109111
&self,
110-
state: &SessionState,
112+
state: &dyn Session,
111113
conf: FileScanConfig,
112114
filters: Option<&Arc<dyn PhysicalExpr>>,
113115
) -> Result<Arc<dyn ExecutionPlan>> {
@@ -119,7 +121,7 @@ impl FileFormat for TSVFileFormat {
119121
async fn create_writer_physical_plan(
120122
&self,
121123
input: Arc<dyn ExecutionPlan>,
122-
state: &SessionState,
124+
state: &dyn Session,
123125
conf: FileSinkConfig,
124126
order_requirements: Option<LexRequirement>,
125127
) -> Result<Arc<dyn ExecutionPlan>> {
@@ -153,7 +155,7 @@ impl TSVFileFactory {
153155
impl FileFormatFactory for TSVFileFactory {
154156
fn create(
155157
&self,
156-
state: &SessionState,
158+
state: &dyn Session,
157159
format_options: &std::collections::HashMap<String, String>,
158160
) -> Result<Arc<dyn FileFormat>> {
159161
let mut new_options = format_options.clone();

datafusion/core/src/datasource/physical_plan/file_groups.rs renamed to datafusion/catalog-listing/src/file_groups.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
//! Logic for managing groups of [`PartitionedFile`]s in DataFusion
1919
20-
use crate::datasource::listing::{FileRange, PartitionedFile};
20+
use crate::{FileRange, PartitionedFile};
2121
use itertools::Itertools;
2222
use std::cmp::min;
2323
use std::collections::BinaryHeap;

datafusion/catalog-listing/src/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818
//! A table that uses the `ObjectStore` listing capability
1919
//! to get the list of files to process.
2020
21+
pub mod file_groups;
2122
pub mod helpers;
2223
pub mod url;
23-
2424
use chrono::TimeZone;
2525
use datafusion_common::Result;
2626
use datafusion_common::{ScalarValue, Statistics};

datafusion/core/src/datasource/data_source.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,4 +62,9 @@ pub trait FileSource: Send + Sync {
6262
fn fmt_extra(&self, _t: DisplayFormatType, _f: &mut Formatter) -> fmt::Result {
6363
Ok(())
6464
}
65+
/// Return true if the file format supports repartition
66+
///
67+
/// If this returns true, the DataSourceExec may repartition the data
68+
/// by breaking up the input files into multiple smaller groups.
69+
fn supports_repartition(&self, config: &FileScanConfig) -> bool;
6570
}

datafusion/core/src/datasource/file_format/arrow.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,14 @@ use crate::datasource::physical_plan::{
3535
ArrowSource, FileGroupDisplay, FileScanConfig, FileSink, FileSinkConfig,
3636
};
3737
use crate::error::Result;
38-
use crate::execution::context::SessionState;
3938
use crate::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan};
4039

4140
use arrow::ipc::convert::fb_to_schema;
4241
use arrow::ipc::reader::FileReader;
4342
use arrow::ipc::writer::IpcWriteOptions;
4443
use arrow::ipc::{root_as_message, CompressionType};
4544
use arrow_schema::{ArrowError, Schema, SchemaRef};
45+
use datafusion_catalog::Session;
4646
use datafusion_common::parsers::CompressionTypeVariant;
4747
use datafusion_common::{
4848
not_impl_err, DataFusionError, GetExt, Statistics, DEFAULT_ARROW_EXTENSION,
@@ -84,7 +84,7 @@ impl ArrowFormatFactory {
8484
impl FileFormatFactory for ArrowFormatFactory {
8585
fn create(
8686
&self,
87-
_state: &SessionState,
87+
_state: &dyn Session,
8888
_format_options: &HashMap<String, String>,
8989
) -> Result<Arc<dyn FileFormat>> {
9090
Ok(Arc::new(ArrowFormat))
@@ -135,7 +135,7 @@ impl FileFormat for ArrowFormat {
135135

136136
async fn infer_schema(
137137
&self,
138-
_state: &SessionState,
138+
_state: &dyn Session,
139139
store: &Arc<dyn ObjectStore>,
140140
objects: &[ObjectMeta],
141141
) -> Result<SchemaRef> {
@@ -159,7 +159,7 @@ impl FileFormat for ArrowFormat {
159159

160160
async fn infer_stats(
161161
&self,
162-
_state: &SessionState,
162+
_state: &dyn Session,
163163
_store: &Arc<dyn ObjectStore>,
164164
table_schema: SchemaRef,
165165
_object: &ObjectMeta,
@@ -169,7 +169,7 @@ impl FileFormat for ArrowFormat {
169169

170170
async fn create_physical_plan(
171171
&self,
172-
_state: &SessionState,
172+
_state: &dyn Session,
173173
mut conf: FileScanConfig,
174174
_filters: Option<&Arc<dyn PhysicalExpr>>,
175175
) -> Result<Arc<dyn ExecutionPlan>> {
@@ -180,7 +180,7 @@ impl FileFormat for ArrowFormat {
180180
async fn create_writer_physical_plan(
181181
&self,
182182
input: Arc<dyn ExecutionPlan>,
183-
_state: &SessionState,
183+
_state: &dyn Session,
184184
conf: FileSinkConfig,
185185
order_requirements: Option<LexRequirement>,
186186
) -> Result<Arc<dyn ExecutionPlan>> {

datafusion/core/src/datasource/file_format/avro.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,14 @@ use super::FileFormatFactory;
2828
use crate::datasource::avro_to_arrow::read_avro_schema_from_reader;
2929
use crate::datasource::physical_plan::{AvroSource, FileScanConfig};
3030
use crate::error::Result;
31-
use crate::execution::context::SessionState;
3231
use crate::physical_plan::ExecutionPlan;
3332
use crate::physical_plan::Statistics;
3433

3534
use crate::datasource::data_source::FileSource;
3635
use arrow::datatypes::Schema;
3736
use arrow::datatypes::SchemaRef;
3837
use async_trait::async_trait;
38+
use datafusion_catalog::Session;
3939
use datafusion_common::internal_err;
4040
use datafusion_common::parsers::CompressionTypeVariant;
4141
use datafusion_common::GetExt;
@@ -57,7 +57,7 @@ impl AvroFormatFactory {
5757
impl FileFormatFactory for AvroFormatFactory {
5858
fn create(
5959
&self,
60-
_state: &SessionState,
60+
_state: &dyn Session,
6161
_format_options: &HashMap<String, String>,
6262
) -> Result<Arc<dyn FileFormat>> {
6363
Ok(Arc::new(AvroFormat))
@@ -112,7 +112,7 @@ impl FileFormat for AvroFormat {
112112

113113
async fn infer_schema(
114114
&self,
115-
_state: &SessionState,
115+
_state: &dyn Session,
116116
store: &Arc<dyn ObjectStore>,
117117
objects: &[ObjectMeta],
118118
) -> Result<SchemaRef> {
@@ -137,7 +137,7 @@ impl FileFormat for AvroFormat {
137137

138138
async fn infer_stats(
139139
&self,
140-
_state: &SessionState,
140+
_state: &dyn Session,
141141
_store: &Arc<dyn ObjectStore>,
142142
table_schema: SchemaRef,
143143
_object: &ObjectMeta,
@@ -147,7 +147,7 @@ impl FileFormat for AvroFormat {
147147

148148
async fn create_physical_plan(
149149
&self,
150-
_state: &SessionState,
150+
_state: &dyn Session,
151151
mut conf: FileScanConfig,
152152
_filters: Option<&Arc<dyn PhysicalExpr>>,
153153
) -> Result<Arc<dyn ExecutionPlan>> {
@@ -505,7 +505,7 @@ mod tests {
505505
}
506506

507507
async fn get_exec(
508-
state: &SessionState,
508+
state: &dyn Session,
509509
file_name: &str,
510510
projection: Option<Vec<usize>>,
511511
limit: Option<usize>,

datafusion/core/src/datasource/file_format/csv.rs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ use arrow::array::RecordBatch;
4444
use arrow::csv::WriterBuilder;
4545
use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
4646
use arrow_schema::ArrowError;
47+
use datafusion_catalog::Session;
4748
use datafusion_common::config::{ConfigField, ConfigFileType, CsvOptions};
4849
use datafusion_common::file_options::csv_writer::CsvWriterOptions;
4950
use datafusion_common::{
@@ -95,9 +96,10 @@ impl Debug for CsvFormatFactory {
9596
impl FileFormatFactory for CsvFormatFactory {
9697
fn create(
9798
&self,
98-
state: &SessionState,
99+
state: &dyn Session,
99100
format_options: &HashMap<String, String>,
100101
) -> Result<Arc<dyn FileFormat>> {
102+
let state = state.as_any().downcast_ref::<SessionState>().unwrap();
101103
let csv_options = match &self.options {
102104
None => {
103105
let mut table_options = state.default_table_options();
@@ -365,7 +367,7 @@ impl FileFormat for CsvFormat {
365367

366368
async fn infer_schema(
367369
&self,
368-
state: &SessionState,
370+
state: &dyn Session,
369371
store: &Arc<dyn ObjectStore>,
370372
objects: &[ObjectMeta],
371373
) -> Result<SchemaRef> {
@@ -400,7 +402,7 @@ impl FileFormat for CsvFormat {
400402

401403
async fn infer_stats(
402404
&self,
403-
_state: &SessionState,
405+
_state: &dyn Session,
404406
_store: &Arc<dyn ObjectStore>,
405407
table_schema: SchemaRef,
406408
_object: &ObjectMeta,
@@ -410,7 +412,7 @@ impl FileFormat for CsvFormat {
410412

411413
async fn create_physical_plan(
412414
&self,
413-
state: &SessionState,
415+
state: &dyn Session,
414416
mut conf: FileScanConfig,
415417
_filters: Option<&Arc<dyn PhysicalExpr>>,
416418
) -> Result<Arc<dyn ExecutionPlan>> {
@@ -440,7 +442,7 @@ impl FileFormat for CsvFormat {
440442
async fn create_writer_physical_plan(
441443
&self,
442444
input: Arc<dyn ExecutionPlan>,
443-
state: &SessionState,
445+
state: &dyn Session,
444446
conf: FileSinkConfig,
445447
order_requirements: Option<LexRequirement>,
446448
) -> Result<Arc<dyn ExecutionPlan>> {
@@ -485,7 +487,7 @@ impl CsvFormat {
485487
/// number of lines that were read
486488
async fn infer_schema_from_stream(
487489
&self,
488-
state: &SessionState,
490+
state: &dyn Session,
489491
mut records_to_read: usize,
490492
stream: impl Stream<Item = Result<Bytes>>,
491493
) -> Result<(Schema, usize)> {
@@ -1147,7 +1149,7 @@ mod tests {
11471149
}
11481150

11491151
async fn get_exec(
1150-
state: &SessionState,
1152+
state: &dyn Session,
11511153
file_name: &str,
11521154
projection: Option<Vec<usize>>,
11531155
limit: Option<usize>,

datafusion/core/src/datasource/file_format/json.rs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ use crate::datasource::physical_plan::{
3636
FileGroupDisplay, FileSink, FileSinkConfig, JsonSource,
3737
};
3838
use crate::error::Result;
39-
use crate::execution::context::SessionState;
39+
use crate::execution::SessionState;
4040
use crate::physical_plan::insert::{DataSink, DataSinkExec};
4141
use crate::physical_plan::{
4242
DisplayAs, DisplayFormatType, SendableRecordBatchStream, Statistics,
@@ -48,6 +48,7 @@ use arrow::json;
4848
use arrow::json::reader::{infer_json_schema_from_iterator, ValueIter};
4949
use arrow_array::RecordBatch;
5050
use arrow_schema::ArrowError;
51+
use datafusion_catalog::Session;
5152
use datafusion_common::config::{ConfigField, ConfigFileType, JsonOptions};
5253
use datafusion_common::file_options::json_writer::JsonWriterOptions;
5354
use datafusion_common::{not_impl_err, GetExt, DEFAULT_JSON_EXTENSION};
@@ -87,9 +88,10 @@ impl JsonFormatFactory {
8788
impl FileFormatFactory for JsonFormatFactory {
8889
fn create(
8990
&self,
90-
state: &SessionState,
91+
state: &dyn Session,
9192
format_options: &HashMap<String, String>,
9293
) -> Result<Arc<dyn FileFormat>> {
94+
let state = state.as_any().downcast_ref::<SessionState>().unwrap();
9395
let json_options = match &self.options {
9496
None => {
9597
let mut table_options = state.default_table_options();
@@ -189,7 +191,7 @@ impl FileFormat for JsonFormat {
189191

190192
async fn infer_schema(
191193
&self,
192-
_state: &SessionState,
194+
_state: &dyn Session,
193195
store: &Arc<dyn ObjectStore>,
194196
objects: &[ObjectMeta],
195197
) -> Result<SchemaRef> {
@@ -237,7 +239,7 @@ impl FileFormat for JsonFormat {
237239

238240
async fn infer_stats(
239241
&self,
240-
_state: &SessionState,
242+
_state: &dyn Session,
241243
_store: &Arc<dyn ObjectStore>,
242244
table_schema: SchemaRef,
243245
_object: &ObjectMeta,
@@ -247,7 +249,7 @@ impl FileFormat for JsonFormat {
247249

248250
async fn create_physical_plan(
249251
&self,
250-
_state: &SessionState,
252+
_state: &dyn Session,
251253
mut conf: FileScanConfig,
252254
_filters: Option<&Arc<dyn PhysicalExpr>>,
253255
) -> Result<Arc<dyn ExecutionPlan>> {
@@ -261,7 +263,7 @@ impl FileFormat for JsonFormat {
261263
async fn create_writer_physical_plan(
262264
&self,
263265
input: Arc<dyn ExecutionPlan>,
264-
_state: &SessionState,
266+
_state: &dyn Session,
265267
conf: FileSinkConfig,
266268
order_requirements: Option<LexRequirement>,
267269
) -> Result<Arc<dyn ExecutionPlan>> {
@@ -538,7 +540,7 @@ mod tests {
538540
}
539541

540542
async fn get_exec(
541-
state: &SessionState,
543+
state: &dyn Session,
542544
projection: Option<Vec<usize>>,
543545
limit: Option<usize>,
544546
) -> Result<Arc<dyn ExecutionPlan>> {

0 commit comments

Comments
 (0)