Skip to content

Commit 81b4c07

Browse files
alamb and Weijun-H authored
Improve ListingTable / ListingTableOptions docs (#15767)
* Improve `ListingTable` / `ListingTableOptions` docs * Update datafusion/core/src/datasource/listing/table.rs Co-authored-by: Alex Huang <[email protected]> --------- Co-authored-by: Alex Huang <[email protected]>
1 parent cc65b72 commit 81b4c07

File tree

2 files changed

+62
-40
lines changed

2 files changed

+62
-40
lines changed

datafusion/core/src/datasource/listing/table.rs

Lines changed: 51 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -62,22 +62,25 @@ use itertools::Itertools;
6262
use object_store::ObjectStore;
6363

6464
/// Configuration for creating a [`ListingTable`]
65+
///
66+
///
6567
#[derive(Debug, Clone)]
6668
pub struct ListingTableConfig {
6769
/// Paths on the `ObjectStore` for creating `ListingTable`.
6870
/// They should share the same schema and object store.
6971
pub table_paths: Vec<ListingTableUrl>,
7072
/// Optional `SchemaRef` for the to be created `ListingTable`.
73+
///
74+
/// See details on [`ListingTableConfig::with_schema`]
7175
pub file_schema: Option<SchemaRef>,
72-
/// Optional `ListingOptions` for the to be created `ListingTable`.
76+
/// Optional [`ListingOptions`] for the to be created [`ListingTable`].
77+
///
78+
/// See details on [`ListingTableConfig::with_listing_options`]
7379
pub options: Option<ListingOptions>,
7480
}
7581

7682
impl ListingTableConfig {
77-
/// Creates new [`ListingTableConfig`].
78-
///
79-
/// The [`SchemaRef`] and [`ListingOptions`] are inferred based on
80-
/// the suffix of the provided `table_paths` first element.
83+
/// Creates new [`ListingTableConfig`] for reading the specified URL
8184
pub fn new(table_path: ListingTableUrl) -> Self {
8285
let table_paths = vec![table_path];
8386
Self {
@@ -89,16 +92,24 @@ impl ListingTableConfig {
8992

9093
/// Creates new [`ListingTableConfig`] with multiple table paths.
9194
///
92-
/// The [`SchemaRef`] and [`ListingOptions`] are inferred based on
93-
/// the suffix of the provided `table_paths` first element.
95+
/// See [`Self::infer_options`] for details on what happens with multiple paths
9496
pub fn new_with_multi_paths(table_paths: Vec<ListingTableUrl>) -> Self {
9597
Self {
9698
table_paths,
9799
file_schema: None,
98100
options: None,
99101
}
100102
}
101-
/// Add `schema` to [`ListingTableConfig`]
103+
/// Set the `schema` for the overall [`ListingTable`]
104+
///
105+
/// [`ListingTable`] will automatically coerce, when possible, the schema
106+
/// for individual files to match this schema.
107+
///
108+
/// If a schema is not provided, it is inferred using
109+
/// [`Self::infer_schema`].
110+
///
111+
/// If the schema is provided, it must contain only the fields in the file
112+
/// without the table partitioning columns.
102113
pub fn with_schema(self, schema: SchemaRef) -> Self {
103114
Self {
104115
table_paths: self.table_paths,
@@ -108,6 +119,9 @@ impl ListingTableConfig {
108119
}
109120

110121
/// Add `listing_options` to [`ListingTableConfig`]
122+
///
123+
/// If not provided, format and other options are inferred via
124+
/// [`Self::infer_options`].
111125
pub fn with_listing_options(self, listing_options: ListingOptions) -> Self {
112126
Self {
113127
table_paths: self.table_paths,
@@ -116,7 +130,7 @@ impl ListingTableConfig {
116130
}
117131
}
118132

119-
///Returns a tupe of (file_extension, optional compression_extension)
133+
/// Returns a tuple of `(file_extension, optional compression_extension)`
120134
///
121135
/// For example a path ending with blah.test.csv.gz returns `("csv", Some("gz"))`
122136
/// For example a path ending with blah.test.csv returns `("csv", None)`
@@ -138,7 +152,9 @@ impl ListingTableConfig {
138152
}
139153
}
140154

141-
/// Infer `ListingOptions` based on `table_path` suffix.
155+
/// Infer `ListingOptions` based on `table_path` and file suffix.
156+
///
157+
/// The format is inferred based on the first `table_path`.
142158
pub async fn infer_options(self, state: &dyn Session) -> Result<Self> {
143159
let store = if let Some(url) = self.table_paths.first() {
144160
state.runtime_env().object_store(url)?
@@ -192,7 +208,13 @@ impl ListingTableConfig {
192208
})
193209
}
194210

195-
/// Infer the [`SchemaRef`] based on `table_path` suffix. Requires `self.options` to be set prior to using.
211+
/// Infer the [`SchemaRef`] based on `table_path`s.
212+
///
213+
/// This method infers the table schema using the first `table_path`.
214+
/// See [`ListingOptions::infer_schema`] for more details
215+
///
216+
/// # Errors
217+
/// * if `self.options` is not set. See [`Self::with_listing_options`]
196218
pub async fn infer_schema(self, state: &dyn Session) -> Result<Self> {
197219
match self.options {
198220
Some(options) => {
@@ -212,12 +234,15 @@ impl ListingTableConfig {
212234
}
213235
}
214236

215-
/// Convenience wrapper for calling `infer_options` and `infer_schema`
237+
/// Convenience method to call both [`Self::infer_options`] and [`Self::infer_schema`]
216238
pub async fn infer(self, state: &dyn Session) -> Result<Self> {
217239
self.infer_options(state).await?.infer_schema(state).await
218240
}
219241

220-
/// Infer the partition columns from the path. Requires `self.options` to be set prior to using.
242+
/// Infer the partition columns from `table_paths`.
243+
///
244+
/// # Errors
245+
/// * if `self.options` is not set. See [`Self::with_listing_options`]
221246
pub async fn infer_partitions_from_path(self, state: &dyn Session) -> Result<Self> {
222247
match self.options {
223248
Some(options) => {
@@ -277,6 +302,7 @@ pub struct ListingOptions {
277302
/// parquet metadata.
278303
///
279304
/// See <https://github.com/apache/datafusion/issues/4177>
305+
///
280306
/// NOTE: This attribute stores all equivalent orderings (the outer `Vec`)
281307
/// where each ordering consists of an individual lexicographic
282308
/// ordering (encapsulated by a `Vec<Expr>`). If there aren't
@@ -479,11 +505,13 @@ impl ListingOptions {
479505
}
480506

481507
/// Infer the schema of the files at the given path on the provided object store.
482-
/// The inferred schema does not include the partitioning columns.
483508
///
484-
/// This method will not be called by the table itself but before creating it.
485-
/// This way when creating the logical plan we can decide to resolve the schema
486-
/// locally or ask a remote service to do it (e.g a scheduler).
509+
/// If the table_path contains one or more files (i.e. it is a directory /
510+
/// prefix of files) their schema is merged by calling [`FileFormat::infer_schema`]
511+
///
512+
/// Note: The inferred schema does not include any partitioning columns.
513+
///
514+
/// This method is called as part of creating a [`ListingTable`].
487515
pub async fn infer_schema<'a>(
488516
&'a self,
489517
state: &dyn Session,
@@ -656,16 +684,14 @@ impl ListingOptions {
656684
/// `ListingTable` also supports limit, filter and projection pushdown for formats that
657685
/// support it, such as Parquet.
658686
///
659-
/// # Implementation
687+
/// # See Also
660688
///
661-
/// `ListingTable` Uses [`DataSourceExec`] to execute the data. See that struct
662-
/// for more details.
689+
/// 1. [`ListingTableConfig`]: Configuration options
690+
/// 1. [`DataSourceExec`]: `ExecutionPlan` used by `ListingTable`
663691
///
664692
/// [`DataSourceExec`]: crate::datasource::source::DataSourceExec
665693
///
666-
/// # Example
667-
///
668-
/// To read a directory of parquet files using a [`ListingTable`]:
694+
/// # Example: Read a directory of parquet files using a [`ListingTable`]
669695
///
670696
/// ```no_run
671697
/// # use datafusion::prelude::SessionContext;
@@ -731,16 +757,9 @@ pub struct ListingTable {
731757
}
732758

733759
impl ListingTable {
734-
/// Create new [`ListingTable`] that lists the FS to get the files
735-
/// to scan. See [`ListingTable`] for and example.
736-
///
737-
/// Takes a `ListingTableConfig` as input which requires an `ObjectStore` and `table_path`.
738-
/// `ListingOptions` and `SchemaRef` are optional. If they are not
739-
/// provided the file type is inferred based on the file suffix.
740-
/// If the schema is provided then it must be resolved before creating the table
741-
/// and should contain the fields of the file without the table
742-
/// partitioning columns.
760+
/// Create new [`ListingTable`]
743761
///
762+
/// See documentation and example on [`ListingTable`] and [`ListingTableConfig`]
744763
pub fn try_new(config: ListingTableConfig) -> Result<Self> {
745764
let file_schema = config
746765
.file_schema

datafusion/core/src/lib.rs

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -311,14 +311,17 @@
311311
//! ```
312312
//!
313313
//! A [`TableProvider`] provides information for planning and
314-
//! an [`ExecutionPlan`]s for execution. DataFusion includes [`ListingTable`]
315-
//! which supports reading several common file formats, and you can support any
316-
//! new file format by implementing the [`TableProvider`] trait. See also:
317-
//!
318-
//! 1. [`ListingTable`]: Reads data from Parquet, JSON, CSV, or AVRO
319-
//! files. Supports single files or multiple files with HIVE style
320-
//! partitioning, optional compression, directly reading from remote
321-
//! object store and more.
314+
//! [`ExecutionPlan`]s for execution. DataFusion includes [`ListingTable`],
315+
//! a [`TableProvider`] which reads individual files or directories of files
316+
//! ("partitioned datasets") of several common file formats. Users can add
317+
//! support for new file formats by implementing the [`TableProvider`]
318+
//! trait.
319+
//!
320+
//! See also:
321+
//!
322+
//! 1. [`ListingTable`]: Reads data from one or more Parquet, JSON, CSV, or AVRO
323+
//! files supporting HIVE style partitioning, optional compression, directly
324+
//! reading from remote object store and more.
322325
//!
323326
//! 2. [`MemTable`]: Reads data from in memory [`RecordBatch`]es.
324327
//!

0 commit comments

Comments
 (0)