@@ -62,22 +62,25 @@ use itertools::Itertools;
62
62
use object_store:: ObjectStore ;
63
63
64
64
/// Configuration for creating a [`ListingTable`]
65
+ ///
66
+ ///
65
67
#[ derive( Debug , Clone ) ]
66
68
pub struct ListingTableConfig {
67
69
/// Paths on the `ObjectStore` for creating `ListingTable`.
68
70
/// They should share the same schema and object store.
69
71
pub table_paths : Vec < ListingTableUrl > ,
70
72
/// Optional `SchemaRef` for the to-be-created `ListingTable`.
73
+ ///
74
+ /// See details on [`ListingTableConfig::with_schema`]
71
75
pub file_schema : Option < SchemaRef > ,
72
- /// Optional `ListingOptions` for the to be created `ListingTable`.
76
+ /// Optional [`ListingOptions`] for the to-be-created [`ListingTable`].
77
+ ///
78
+ /// See details on [`ListingTableConfig::with_listing_options`]
73
79
pub options : Option < ListingOptions > ,
74
80
}
75
81
76
82
impl ListingTableConfig {
77
- /// Creates new [`ListingTableConfig`].
78
- ///
79
- /// The [`SchemaRef`] and [`ListingOptions`] are inferred based on
80
- /// the suffix of the provided `table_paths` first element.
83
+ /// Creates new [`ListingTableConfig`] for reading the specified URL
81
84
pub fn new ( table_path : ListingTableUrl ) -> Self {
82
85
let table_paths = vec ! [ table_path] ;
83
86
Self {
@@ -89,16 +92,24 @@ impl ListingTableConfig {
89
92
90
93
/// Creates new [`ListingTableConfig`] with multiple table paths.
91
94
///
92
- /// The [`SchemaRef`] and [`ListingOptions`] are inferred based on
93
- /// the suffix of the provided `table_paths` first element.
95
+ /// See [`Self::infer_options`] for details on what happens with multiple paths
94
96
pub fn new_with_multi_paths ( table_paths : Vec < ListingTableUrl > ) -> Self {
95
97
// Only the paths are recorded here; the schema and listing options start
// out unset (`None`) and must be supplied or inferred afterwards.
Self {
96
98
table_paths,
97
99
file_schema : None ,
98
100
options : None ,
99
101
}
100
102
}
101
- /// Add `schema` to [`ListingTableConfig`]
103
+ /// Set the `schema` for the overall [`ListingTable`]
104
+ ///
105
+ /// [`ListingTable`] will automatically coerce, when possible, the schema
106
+ /// for individual files to match this schema.
107
+ ///
108
+ /// If a schema is not provided, it is inferred using
109
+ /// [`Self::infer_schema`].
110
+ ///
111
+ /// If the schema is provided, it must contain only the fields in the file
112
+ /// without the table partitioning columns.
102
113
pub fn with_schema ( self , schema : SchemaRef ) -> Self {
103
114
Self {
104
115
table_paths : self . table_paths ,
@@ -108,6 +119,9 @@ impl ListingTableConfig {
108
119
}
109
120
110
121
/// Add `listing_options` to [`ListingTableConfig`]
122
+ ///
123
+ /// If not provided, format and other options are inferred via
124
+ /// [`Self::infer_options`].
111
125
pub fn with_listing_options ( self , listing_options : ListingOptions ) -> Self {
112
126
Self {
113
127
table_paths : self . table_paths ,
@@ -116,7 +130,7 @@ impl ListingTableConfig {
116
130
}
117
131
}
118
132
119
- ///Returns a tupe of (file_extension, optional compression_extension)
133
+ /// Returns a tuple of `(file_extension, optional compression_extension)`
120
134
///
121
135
/// For example a path ending with blah.test.csv.gz returns `("csv", Some("gz"))`
122
136
/// For example a path ending with blah.test.csv returns `("csv", None)`
@@ -138,7 +152,9 @@ impl ListingTableConfig {
138
152
}
139
153
}
140
154
141
- /// Infer `ListingOptions` based on `table_path` suffix.
155
+ /// Infer `ListingOptions` based on `table_path` and file suffix.
156
+ ///
157
+ /// The format is inferred based on the first `table_path`.
142
158
pub async fn infer_options ( self , state : & dyn Session ) -> Result < Self > {
143
159
let store = if let Some ( url) = self . table_paths . first ( ) {
144
160
state. runtime_env ( ) . object_store ( url) ?
@@ -192,7 +208,13 @@ impl ListingTableConfig {
192
208
} )
193
209
}
194
210
195
- /// Infer the [`SchemaRef`] based on `table_path` suffix. Requires `self.options` to be set prior to using.
211
+ /// Infer the [`SchemaRef`] based on `table_path`s.
212
+ ///
213
+ /// This method infers the table schema using the first `table_path`.
214
+ /// See [`ListingOptions::infer_schema`] for more details
215
+ ///
216
+ /// # Errors
217
+ /// * if `self.options` is not set. See [`Self::with_listing_options`]
196
218
pub async fn infer_schema ( self , state : & dyn Session ) -> Result < Self > {
197
219
match self . options {
198
220
Some ( options) => {
@@ -212,12 +234,15 @@ impl ListingTableConfig {
212
234
}
213
235
}
214
236
215
- /// Convenience wrapper for calling ` infer_options` and ` infer_schema`
237
+ /// Convenience method to call both [`Self:: infer_options`] and [`Self:: infer_schema`]
216
238
pub async fn infer ( self , state : & dyn Session ) -> Result < Self > {
217
239
// Options are inferred first: `infer_schema` is documented to return an
// error when `self.options` has not been set.
self . infer_options ( state) . await ?. infer_schema ( state) . await
218
240
}
219
241
220
- /// Infer the partition columns from the path. Requires `self.options` to be set prior to using.
242
+ /// Infer the partition columns from `table_paths`.
243
+ ///
244
+ /// # Errors
245
+ /// * if `self.options` is not set. See [`Self::with_listing_options`]
221
246
pub async fn infer_partitions_from_path ( self , state : & dyn Session ) -> Result < Self > {
222
247
match self . options {
223
248
Some ( options) => {
@@ -277,6 +302,7 @@ pub struct ListingOptions {
277
302
/// parquet metadata.
278
303
///
279
304
/// See <https://github.com/apache/datafusion/issues/4177>
305
+ ///
280
306
/// NOTE: This attribute stores all equivalent orderings (the outer `Vec`)
281
307
/// where each ordering consists of an individual lexicographic
282
308
/// ordering (encapsulated by a `Vec<Expr>`). If there aren't
@@ -479,11 +505,13 @@ impl ListingOptions {
479
505
}
480
506
481
507
/// Infer the schema of the files at the given path on the provided object store.
482
- /// The inferred schema does not include the partitioning columns.
483
508
///
484
- /// This method will not be called by the table itself but before creating it.
485
- /// This way when creating the logical plan we can decide to resolve the schema
486
- /// locally or ask a remote service to do it (e.g a scheduler).
509
+ /// If the `table_path` contains one or more files (i.e. it is a directory /
510
+ /// prefix of files), their schemas are merged by calling [`FileFormat::infer_schema`].
511
+ ///
512
+ /// Note: The inferred schema does not include any partitioning columns.
513
+ ///
514
+ /// This method is called as part of creating a [`ListingTable`].
487
515
pub async fn infer_schema < ' a > (
488
516
& ' a self ,
489
517
state : & dyn Session ,
@@ -656,16 +684,14 @@ impl ListingOptions {
656
684
/// `ListingTable` also supports limit, filter and projection pushdown for formats that
657
685
/// support it, such as Parquet.
658
686
///
659
- /// # Implementation
687
+ /// # See Also
660
688
///
661
- /// `ListingTable` Uses [`DataSourceExec`] to execute the data. See that struct
662
- /// for more details.
689
+ /// 1. [`ListingTableConfig`]: Configuration options
690
+ /// 1. [`DataSourceExec`]: `ExecutionPlan` used by `ListingTable`
663
691
///
664
692
/// [`DataSourceExec`]: crate::datasource::source::DataSourceExec
665
693
///
666
- /// # Example
667
- ///
668
- /// To read a directory of parquet files using a [`ListingTable`]:
694
+ /// # Example: Read a directory of parquet files using a [`ListingTable`]
669
695
///
670
696
/// ```no_run
671
697
/// # use datafusion::prelude::SessionContext;
@@ -731,16 +757,9 @@ pub struct ListingTable {
731
757
}
732
758
733
759
impl ListingTable {
734
- /// Create new [`ListingTable`] that lists the FS to get the files
735
- /// to scan. See [`ListingTable`] for and example.
736
- ///
737
- /// Takes a `ListingTableConfig` as input which requires an `ObjectStore` and `table_path`.
738
- /// `ListingOptions` and `SchemaRef` are optional. If they are not
739
- /// provided the file type is inferred based on the file suffix.
740
- /// If the schema is provided then it must be resolved before creating the table
741
- /// and should contain the fields of the file without the table
742
- /// partitioning columns.
760
+ /// Create new [`ListingTable`]
743
761
///
762
+ /// See documentation and example on [`ListingTable`] and [`ListingTableConfig`]
744
763
pub fn try_new ( config : ListingTableConfig ) -> Result < Self > {
745
764
let file_schema = config
746
765
. file_schema
0 commit comments