diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 41e939d60b08..a983f0696e83 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -616,6 +616,7 @@ impl ListingOptions { /// using an [`ObjectStore`] instance, for example from local files or objects /// from AWS S3. /// +/// # Reading Directories /// For example, given the `table1` directory (or object store prefix) /// /// ```text @@ -651,13 +652,19 @@ impl ListingOptions { /// If the query has a predicate like `WHERE date = '2024-06-01'` /// only the corresponding directory will be read. /// -/// `ListingTable` also supports filter and projection pushdown for formats that +/// `ListingTable` also supports limit, filter and projection pushdown for formats that /// support it as such as Parquet. /// +/// # Implementation +/// +/// `ListingTable` uses [`DataSourceExec`] to read the data. See that struct +/// for more details. +/// +/// [`DataSourceExec`]: crate::datasource::source::DataSourceExec +/// /// # Example /// -/// Here is an example of reading a directory of parquet files using a -/// [`ListingTable`]: +/// To read a directory of parquet files using a [`ListingTable`]: /// /// ```no_run /// # use datafusion::prelude::SessionContext; diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 9a0d0157c1ae..dfd171082f02 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -298,10 +298,10 @@ //! (built in or user provided) ExecutionPlan //! ``` //! -//! DataFusion includes several built in data sources for common use -//! cases, and can be extended by implementing the [`TableProvider`] -//! trait. A [`TableProvider`] provides information for planning and -//! an [`ExecutionPlan`]s for execution. +//! A [`TableProvider`] provides information for planning and +//! an [`ExecutionPlan`] for execution. DataFusion includes [`ListingTable`] +//! 
which supports reading several common file formats, and you can support any +//! new file format by implementing the [`TableProvider`] trait. See also: //! //! 1. [`ListingTable`]: Reads data from Parquet, JSON, CSV, or AVRO //! files. Supports single files or multiple files with HIVE style @@ -314,7 +314,7 @@ //! //! [`ListingTable`]: crate::datasource::listing::ListingTable //! [`MemTable`]: crate::datasource::memory::MemTable -//! [`StreamingTable`]: datafusion_catalog::streaming::StreamingTable +//! [`StreamingTable`]: crate::catalog::streaming::StreamingTable //! //! ## Plan Representations //! diff --git a/datafusion/datasource/src/file.rs b/datafusion/datasource/src/file.rs index 8d8cbbc67b9a..0066f39801a1 100644 --- a/datafusion/datasource/src/file.rs +++ b/datafusion/datasource/src/file.rs @@ -33,9 +33,9 @@ use datafusion_physical_plan::DisplayFormatType; use object_store::ObjectStore; -/// Common behaviors that every file format needs to implement. +/// Common file format behaviors that each file source needs to implement. /// -/// See initialization examples on `ParquetSource`, `CsvSource` +/// See implementation examples such as `ParquetSource`, `CsvSource` pub trait FileSource: Send + Sync { /// Creates a `dyn FileOpener` based on given parameters fn create_file_opener( diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index b3089a6e59fe..07cee7fba00e 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! [`DataSource`] and [`DataSourceExec`] + use std::any::Any; use std::fmt; use std::fmt::{Debug, Formatter}; @@ -34,9 +36,15 @@ use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datafusion_physical_expr_common::sort_expr::LexOrdering; /// Common behaviors in Data Sources for both from Files and Memory. 
-/// See `DataSourceExec` for physical plan implementation /// +/// # See Also +/// * [`DataSourceExec`] for physical plan implementation +/// * [`FileSource`] for file format implementations (Parquet, JSON, etc) +/// +/// # Notes /// Requires `Debug` to assist debugging +/// +/// [`FileSource`]: crate::file::FileSource pub trait DataSource: Send + Sync + Debug { fn open( &self, @@ -71,10 +79,21 @@ pub trait DataSource: Send + Sync + Debug { ) -> datafusion_common::Result>>; } -/// Unified data source for file formats like JSON, CSV, AVRO, ARROW, PARQUET +/// [`ExecutionPlan`] that handles different file formats like JSON, CSV, AVRO, ARROW, PARQUET +/// +/// `DataSourceExec` implements common functionality such as applying projections, +/// and caching plan properties. +/// +/// The [`DataSource`] trait describes where to find the data for this data +/// source (for example, which files or which in-memory partitions). Format +/// specifics are implemented with the [`FileSource`] trait. +/// +/// [`FileSource`]: crate::file::FileSource #[derive(Clone, Debug)] pub struct DataSourceExec { + /// The source of the data -- for example, `FileScanConfig` or `MemorySourceConfig` data_source: Arc, + /// Cached plan properties such as sort order cache: PlanProperties, }