From d3942e4ec3d9bd80ea6f503ec75bda72d951d6ad Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 26 Feb 2025 06:39:57 -0500 Subject: [PATCH 01/17] Add Upgrade Guide for DataFusion 46.0.0 --- datafusion/core/src/lib.rs | 6 +++ docs/source/index.rst | 4 +- docs/source/library-user-guide/upgrading.md | 45 +++++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 docs/source/library-user-guide/upgrading.md diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 9a0d0157c1ae..3635909fe70a 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -1097,3 +1097,9 @@ doc_comment::doctest!( "../../../docs/source/library-user-guide/working-with-exprs.md", library_user_guide_working_with_exprs ); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/library-user-guide/upgrading.md", + library_user_guide_upgrading +); diff --git a/docs/source/index.rst b/docs/source/index.rst index d9b0c126ab12..2aff284cd614 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -133,7 +133,9 @@ To get started, see library-user-guide/profiling library-user-guide/query-optimizer library-user-guide/api-health -.. _toc.contributor-guide: + library-user-guide/upgrading + +.. .. _toc.contributor-guide: .. toctree:: :maxdepth: 1 diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md new file mode 100644 index 000000000000..0c00dda20d8a --- /dev/null +++ b/docs/source/library-user-guide/upgrading.md @@ -0,0 +1,45 @@ + + +# Upgrade Guides + +## DataFusion `46.0.0` + +See more information +- Change PR [PR #14224](https://github.com/apache/datafusion/pull/14224) +- Example of an Upgrade [PR in delta-rs](https://github.com/delta-io/delta-rs/pull/3261) + +### `ParquetExec`, `AvroExec`, `CsvExec`, `JsonExec` deprecated + +DataFusion 46 has a major change to how the built in DataSources are organized. 
The + +### Cookbook: Changes to `ParquetExecBuilder` + +#### Old pattern: +```test +TODO +``` + +#### New Pattern + + +```test +TODO +``` + From ced0fbfb6d74285a1890af56793d46b6a6eeb30a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 26 Feb 2025 06:51:56 -0500 Subject: [PATCH 02/17] Add a note about invoke vs invoke_with_batch --- docs/source/library-user-guide/upgrading.md | 24 +++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 0c00dda20d8a..e67efb2938f1 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -21,12 +21,32 @@ ## DataFusion `46.0.0` + +### Changes to `invoke()` and `invoke_batch()` deprecated + +We are migrating away from `ScalarUDFImpl::invoke()` and +`ScalarUDFImpl::invoke_batch()` in favor of `ScalarUDFImpl::invoke_with_args()`. (TODO get code links) + +If you see errors such as +```text +Example +``` + +You can resolve them by replacing all .invoke() and .invoke_batch()calls with .invoke_with_args(). +```text +TODO example +``` + +Example of changes: +- [PR XXXX] TODO + + +### `ParquetExec`, `AvroExec`, `CsvExec`, `JsonExec` deprecated + See more information - Change PR [PR #14224](https://github.com/apache/datafusion/pull/14224) - Example of an Upgrade [PR in delta-rs](https://github.com/delta-io/delta-rs/pull/3261) -### `ParquetExec`, `AvroExec`, `CsvExec`, `JsonExec` deprecated - DataFusion 46 has a major change to how the built in DataSources are organized. 
The ### Cookbook: Changes to `ParquetExecBuilder` From 62b6a8ca2aa1f795aa764be9beec43ac82e0829e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 26 Feb 2025 07:14:14 -0500 Subject: [PATCH 03/17] add examples --- docs/source/library-user-guide/upgrading.md | 69 +++++++++++++++++++-- 1 file changed, 63 insertions(+), 6 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index e67efb2938f1..2184e44cb96d 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -52,14 +52,71 @@ DataFusion 46 has a major change to how the built in DataSources are organized. ### Cookbook: Changes to `ParquetExecBuilder` #### Old pattern: -```test -TODO -``` +```rust + let mut exec_plan_builder = ParquetExecBuilder::new( + FileScanConfig::new(self.log_store.object_store_url(), file_schema) + .with_file_groups( + // If all files were filtered out, we still need to emit at least one partition to + // pass datafusion sanity checks. 
+ // + // See https://github.com/apache/datafusion/issues/11322 + if file_groups.is_empty() { + vec![vec![]] + } else { + file_groups.into_values().collect() + }, + ) + .with_statistics(stats) + .with_projection(self.projection.cloned()) + .with_limit(self.limit) + .with_table_partition_cols(table_partition_cols), + ) + .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})) + .with_table_parquet_options(parquet_options); + + // Sometimes (i.e Merge) we want to prune files that don't make the + // filter and read the entire contents for files that do match the + // filter + if let Some(predicate) = logical_filter { + if config.enable_parquet_pushdown { + exec_plan_builder = exec_plan_builder.with_predicate(predicate); + } + };``` #### New Pattern -```test -TODO -``` +```rust + let mut file_source = ParquetSource::new(parquet_options) + .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})); + + // Sometimes (i.e Merge) we want to prune files that don't make the + // filter and read the entire contents for files that do match the + // filter + if let Some(predicate) = logical_filter { + if config.enable_parquet_pushdown { + file_source = file_source.with_predicate(Arc::clone(&file_schema), predicate); + } + }; + + let file_scan_config = FileScanConfig::new( + self.log_store.object_store_url(), + file_schema, + Arc::new(file_source), + ) + .with_file_groups( + // If all files were filtered out, we still need to emit at least one partition to + // pass datafusion sanity checks. 
+ // + // See https://github.com/apache/datafusion/issues/11322 + if file_groups.is_empty() { + vec![vec![]] + } else { + file_groups.into_values().collect() + }, + ) + .with_statistics(stats) + .with_projection(self.projection.cloned()) + .with_limit(self.limit) + .with_table_partition_cols(table_partition_cols);``` From 0854dfa3a5c06a4908322ca3129fb5aac1df5fc6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 26 Feb 2025 07:14:51 -0500 Subject: [PATCH 04/17] add examples --- docs/source/library-user-guide/upgrading.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 2184e44cb96d..83332aae9755 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -120,3 +120,7 @@ DataFusion 46 has a major change to how the built in DataSources are organized. .with_limit(self.limit) .with_table_partition_cols(table_partition_cols);``` +// Build the actual scan like this +parquet_scan: file_scan_config.build(), + +``` From b26ad16b936c33fe16333f419583839f9151daed Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 26 Feb 2025 09:00:08 -0500 Subject: [PATCH 05/17] Add more examples --- docs/source/library-user-guide/upgrading.md | 30 +++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 83332aae9755..138d4c6940b1 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -49,9 +49,38 @@ See more information DataFusion 46 has a major change to how the built in DataSources are organized. 
The +### Cookbook: Changes to `ParquetExecBuilder` +#### Old pattern: + +When writing optimizer passes, some code treats ParquetExec specially like this: + +```rust + if let Some(parquet_exec) = plan.as_any().downcast_ref::() { + // Do something with ParquetExec here + } + } +``` + +#### New Pattern +With the new DataSource exec, most information is now on `FileScanConfig` and `ParquetSource` + +```rust + +if let Some(datasource_exec) = plan.as_any().downcast_ref::() { + if let Some(scan_config) = datasource_exec.source().as_any().downcast_ref::() { + // FileGroups, and other information is on the FileScanConfig + // parquet + if let Some(parquet_source) = scan_config.source.as_any().downcast_ref::() + { + // Information on PruningPredicates and parquet options are here + } +} +``` + ### Cookbook: Changes to `ParquetExecBuilder` #### Old pattern: + ```rust let mut exec_plan_builder = ParquetExecBuilder::new( FileScanConfig::new(self.log_store.object_store_url(), file_schema) @@ -124,3 +153,4 @@ DataFusion 46 has a major change to how the built in DataSources are organized. parquet_scan: file_scan_config.build(), ``` + From 074f5ecf4ffa860d3d53b72649b54dbd61f21f9b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Feb 2025 07:29:42 -0500 Subject: [PATCH 06/17] Add note about escaping --- docs/source/library-user-guide/upgrading.md | 31 +++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 138d4c6940b1..0ea885a68f14 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -154,3 +154,34 @@ parquet_scan: file_scan_config.build(), ``` + +### `datafusion-cli` no longer automatically unescapes strings + +`datafusion-cli` previously would incorrectly unescape string literals (see [ticket] for more details). 
+ +To escape `'` in SQL literals, use `''`: + +```sql +> select 'it''s escaped'; ++----------------------+ +| Utf8("it's escaped") | ++----------------------+ +| it's escaped | ++----------------------+ +1 row(s) fetched. +``` + +To include special characters (such as newlines via `\n`) you can use an `E` literal string. For example + +``` +> select 'foo\nbar'; ++------------------+ +| Utf8("foo\nbar") | ++------------------+ +| foo\nbar | ++------------------+ +1 row(s) fetched. +Elapsed 0.005 seconds. +``` + +[ticket]: https://github.com/apache/datafusion/issues/13286 From 9c658c7395eacc5c4e2d3a10d349d917376b40ff Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Feb 2025 08:35:33 -0500 Subject: [PATCH 07/17] Cleanup ScalarUDFImpl docs --- docs/source/library-user-guide/upgrading.md | 78 ++++++++++++++++----- 1 file changed, 61 insertions(+), 17 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 0ea885a68f14..1c5869de5b0c 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -21,35 +21,79 @@ ## DataFusion `46.0.0` +### Use `invoke_with_args` instead of `invoke()` and `invoke_batch()` -### Changes to `invoke()` and `invoke_batch()` deprecated +DataFusion is moving to a consistent API for invoking ScalarUDFs, +[`ScalarUDFImpl::invoke_with_args()`], and deprecating +[`ScalarUDFImpl::invoke()`], [`ScalarUDFImpl::invoke_batch()`], and [`ScalarUDFImpl::invoke_no_args()`] -We are migrating away from `ScalarUDFImpl::invoke()` and -`ScalarUDFImpl::invoke_batch()` in favor of `ScalarUDFImpl::invoke_with_args()`. (TODO get code links) +If you see errors such as the following it means the older APIs are being used -If you see errors such as ```text -Example +This feature is not implemented: Function concat does not implement invoke but called ``` -You can resolve them by replacing all .invoke() and .invoke_batch()calls with .invoke_with_args(). 
-```text -TODO example -``` +To fix this error, change your functions to use +[`ScalarUDFImpl::invoke_with_args()`] instea, as shown below. See [PR 14876] for +an example. + +Given existing code like this: + +````rust +# /* +impl ScalarUDFImpl for SparkConcat { +... + fn invoke_batch(&self, args: &[ColumnarValue], number_rows: usize) -> Result { + if args + .iter() + .any(|arg| matches!(arg.data_type(), DataType::List(_))) + { + ArrayConcat::new().invoke_batch(args, number_rows) + } else { + ConcatFunc::new().invoke_batch(args, number_rows) + } + } + # */ +}``` -Example of changes: -- [PR XXXX] TODO +To +```rust +# /* comment out so they don't run +impl ScalarUDFImpl for SparkConcat { + ... + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + if args + .args + .iter() + .any(|arg| matches!(arg.data_type(), DataType::List(_))) + { + ArrayConcat::new().invoke_with_args(args) + } else { + ConcatFunc::new().invoke_with_args(args) + } + } +} + # */ +```` + +[`scalarudfimpl::invoke()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke +[`scalarudfimpl::invoke_batch()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke_batch +[`scalarudfimpl::invoke_no_args()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke_no_args +[`scalarudfimpl::invoke_with_args()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke_with_args +[pr 14876]: https://github.com/apache/datafusion/pull/14876 ### `ParquetExec`, `AvroExec`, `CsvExec`, `JsonExec` deprecated See more information + - Change PR [PR #14224](https://github.com/apache/datafusion/pull/14224) - Example of an Upgrade [PR in delta-rs](https://github.com/delta-io/delta-rs/pull/3261) -DataFusion 46 has a major change to how the built in DataSources are organized. 
The +DataFusion 46 has a major change to how the built in DataSources are organized. The ### Cookbook: Changes to `ParquetExecBuilder` + #### Old pattern: When writing optimizer passes, some code treats ParquetExec specially like this: @@ -62,14 +106,15 @@ When writing optimizer passes, some code treats ParquetExec specially like this: ``` #### New Pattern -With the new DataSource exec, most information is now on `FileScanConfig` and `ParquetSource` + +With the new DataSource exec, most information is now on `FileScanConfig` and `ParquetSource` ```rust if let Some(datasource_exec) = plan.as_any().downcast_ref::() { if let Some(scan_config) = datasource_exec.source().as_any().downcast_ref::() { // FileGroups, and other information is on the FileScanConfig - // parquet + // parquet if let Some(parquet_source) = scan_config.source.as_any().downcast_ref::() { // Information on PruningPredicates and parquet options are here @@ -81,7 +126,7 @@ if let Some(datasource_exec) = plan.as_any().downcast_ref::() { #### Old pattern: -```rust +````rust let mut exec_plan_builder = ParquetExecBuilder::new( FileScanConfig::new(self.log_store.object_store_url(), file_schema) .with_file_groups( @@ -152,8 +197,7 @@ if let Some(datasource_exec) = plan.as_any().downcast_ref::() { // Build the actual scan like this parquet_scan: file_scan_config.build(), -``` - +```` ### `datafusion-cli` no longer automatically unescapes strings From b1e5010c7e281228816356d46b235ae082be466f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Feb 2025 08:36:57 -0500 Subject: [PATCH 08/17] tweaks --- docs/source/library-user-guide/upgrading.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 1c5869de5b0c..caa6931735d4 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -27,15 +27,14 @@ DataFusion is moving to a consistent API 
for invoking ScalarUDFs, [`ScalarUDFImpl::invoke_with_args()`], and deprecating [`ScalarUDFImpl::invoke()`], [`ScalarUDFImpl::invoke_batch()`], and [`ScalarUDFImpl::invoke_no_args()`] -If you see errors such as the following it means the older APIs are being used +If you see errors such as the following it means the older APIs are being used: ```text This feature is not implemented: Function concat does not implement invoke but called ``` -To fix this error, change your functions to use -[`ScalarUDFImpl::invoke_with_args()`] instea, as shown below. See [PR 14876] for -an example. +To fix this error, use [`ScalarUDFImpl::invoke_with_args()`] instead, as shown +below. See [PR 14876] for an example. Given existing code like this: From 2c4a33b6077f5c60b57bd1ccd625ffdf7456d11e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Feb 2025 08:48:04 -0500 Subject: [PATCH 09/17] Add documentation about adding the upgrade guide --- datafusion/core/src/lib.rs | 12 ++++++------ .../api-health.md | 8 ++++++++ docs/source/index.rst | 2 +- 3 files changed, 15 insertions(+), 7 deletions(-) rename docs/source/{library-user-guide => contributor-guide}/api-health.md (92%) diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 3635909fe70a..3a66e22de7c3 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -1026,12 +1026,6 @@ doc_comment::doctest!( library_user_guide_adding_udfs ); -#[cfg(doctest)] -doc_comment::doctest!( - "../../../docs/source/library-user-guide/api-health.md", - library_user_guide_api_health -); - #[cfg(doctest)] doc_comment::doctest!( "../../../docs/source/library-user-guide/building-logical-plans.md", @@ -1103,3 +1097,9 @@ doc_comment::doctest!( "../../../docs/source/library-user-guide/upgrading.md", library_user_guide_upgrading ); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/contributor-guide/api-health.md", + contributor_guide_api_health +); diff --git 
a/docs/source/library-user-guide/api-health.md b/docs/source/contributor-guide/api-health.md similarity index 92% rename from docs/source/library-user-guide/api-health.md rename to docs/source/contributor-guide/api-health.md index 87d3754b21a7..d811bc357445 100644 --- a/docs/source/library-user-guide/api-health.md +++ b/docs/source/contributor-guide/api-health.md @@ -26,6 +26,14 @@ breaking API changes, but they are sometimes necessary. When possible, rather than making breaking API changes, we prefer to deprecate APIs to give users time to adjust to the changes. +## Upgrade Guides + +When making changes that require DataFusion users to make changes to their code +as part of an upgrade please consider adding documentation to the version +specific [Upgrade Guide] + +[upgrade guide]: ../library-user-guide/upgrading.md + ## Breaking Changes In general, a function is part of the public API if it appears on the [docs.rs page] diff --git a/docs/source/index.rst b/docs/source/index.rst index 49f1334e9620..d618e00f68b9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -132,7 +132,6 @@ To get started, see library-user-guide/extending-operators library-user-guide/profiling library-user-guide/query-optimizer - library-user-guide/api-health library-user-guide/upgrading .. .. 
_toc.contributor-guide: @@ -146,6 +145,7 @@ To get started, see contributor-guide/development_environment contributor-guide/architecture contributor-guide/testing + contributor-guide/api-health contributor-guide/howtos contributor-guide/roadmap contributor-guide/governance From caaa97bcc5bd7281697c2b618f4dc55f4fac017a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Feb 2025 08:57:46 -0500 Subject: [PATCH 10/17] cleanup examples --- docs/source/library-user-guide/upgrading.md | 152 +++++++++----------- 1 file changed, 70 insertions(+), 82 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index caa6931735d4..77ef34068217 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -84,32 +84,38 @@ impl ScalarUDFImpl for SparkConcat { ### `ParquetExec`, `AvroExec`, `CsvExec`, `JsonExec` deprecated -See more information +DataFusion 46 has a major change to how the built in DataSources are organized. +Instead of individual `ExecutionPlan`s for the different file formats they now +all use `DataSourceExec` and the format specific information is embodied in new +traits `DataSource` and `FileSource`. -- Change PR [PR #14224](https://github.com/apache/datafusion/pull/14224) -- Example of an Upgrade [PR in delta-rs](https://github.com/delta-io/delta-rs/pull/3261) +Here is more information about +- [Design Ticket] +- Change PR [PR #14224] +- Example of an Upgrade [PR in delta-rs] -DataFusion 46 has a major change to how the built in DataSources are organized. 
The +[Design Ticket]: https://github.com/apache/datafusion/issues/13838 +[PR #14224]: https://github.com/apache/datafusion/pull/14224 +[PR in delta-rs]: https://github.com/delta-io/delta-rs/pull/3261 -### Cookbook: Changes to `ParquetExecBuilder` -#### Old pattern: +### Cookbook: Changes to `ParquetExecBuilder` -When writing optimizer passes, some code treats ParquetExec specially like this: +Code that looks for `ParquetExec` like this will no longer work: ```rust - if let Some(parquet_exec) = plan.as_any().downcast_ref::() { - // Do something with ParquetExec here - } - } +# /* comment to avoid running + if let Some(parquet_exec) = plan.as_any().downcast_ref::() { + // Do something with ParquetExec here + } +# */ ``` -#### New Pattern - -With the new DataSource exec, most information is now on `FileScanConfig` and `ParquetSource` +Instead, with `DataSourceExec`, the same information is now on `FileScanConfig` and +`ParquetSource`. The equivalent code is ```rust - +# /* comment to avoid running if let Some(datasource_exec) = plan.as_any().downcast_ref::() { if let Some(scan_config) = datasource_exec.source().as_any().downcast_ref::() { // FileGroups, and other information is on the FileScanConfig @@ -119,84 +125,66 @@ if let Some(datasource_exec) = plan.as_any().downcast_ref::() { // Information on PruningPredicates and parquet options are here } } +# */ ``` ### Cookbook: Changes to `ParquetExecBuilder` -#### Old pattern: - -````rust - let mut exec_plan_builder = ParquetExecBuilder::new( - FileScanConfig::new(self.log_store.object_store_url(), file_schema) - .with_file_groups( - // If all files were filtered out, we still need to emit at least one partition to - // pass datafusion sanity checks. 
- // - // See https://github.com/apache/datafusion/issues/11322 - if file_groups.is_empty() { - vec![vec![]] - } else { - file_groups.into_values().collect() - }, - ) - .with_statistics(stats) - .with_projection(self.projection.cloned()) - .with_limit(self.limit) - .with_table_partition_cols(table_partition_cols), - ) - .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})) - .with_table_parquet_options(parquet_options); - - // Sometimes (i.e Merge) we want to prune files that don't make the - // filter and read the entire contents for files that do match the - // filter - if let Some(predicate) = logical_filter { - if config.enable_parquet_pushdown { - exec_plan_builder = exec_plan_builder.with_predicate(predicate); - } - };``` - -#### New Pattern - +Likewise code that builds `ParquetExec` using the `ParquetExecBuilder` such as +the following must be changed: ```rust - let mut file_source = ParquetSource::new(parquet_options) - .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})); - - // Sometimes (i.e Merge) we want to prune files that don't make the - // filter and read the entire contents for files that do match the - // filter - if let Some(predicate) = logical_filter { - if config.enable_parquet_pushdown { - file_source = file_source.with_predicate(Arc::clone(&file_schema), predicate); - } - }; - - let file_scan_config = FileScanConfig::new( - self.log_store.object_store_url(), - file_schema, - Arc::new(file_source), - ) - .with_file_groups( - // If all files were filtered out, we still need to emit at least one partition to - // pass datafusion sanity checks. 
- // - // See https://github.com/apache/datafusion/issues/11322 - if file_groups.is_empty() { - vec![vec![]] - } else { - file_groups.into_values().collect() - }, - ) - .with_statistics(stats) +# /* comment to avoid running +let mut exec_plan_builder = ParquetExecBuilder::new( + FileScanConfig::new(self.log_store.object_store_url(), file_schema) .with_projection(self.projection.cloned()) .with_limit(self.limit) - .with_table_partition_cols(table_partition_cols);``` + .with_table_partition_cols(table_partition_cols), +) +.with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})) +.with_table_parquet_options(parquet_options); + +// Sometimes (i.e Merge) we want to prune files that don't make the +// filter and read the entire contents for files that do match the +// filter +if let Some(predicate) = logical_filter { + if config.enable_parquet_pushdown { + exec_plan_builder = exec_plan_builder.with_predicate(predicate); + } +}; +# */ +``` + +New code should use `FileScanConfig` to build the appropriate `DataSourceExec`: + +```rust +# /* comment to avoid running +let mut file_source = ParquetSource::new(parquet_options) + .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})); + +// Sometimes (i.e Merge) we want to prune files that don't make the +// filter and read the entire contents for files that do match the +// filter +if let Some(predicate) = logical_filter { + if config.enable_parquet_pushdown { + file_source = file_source.with_predicate(Arc::clone(&file_schema), predicate); + } +}; + +let file_scan_config = FileScanConfig::new( + self.log_store.object_store_url(), + file_schema, + Arc::new(file_source), +) +.with_statistics(stats) +.with_projection(self.projection.cloned()) +.with_limit(self.limit) +.with_table_partition_cols(table_partition_cols); // Build the actual scan like this parquet_scan: file_scan_config.build(), - -```` +# */ +``` ### `datafusion-cli` no longer automatically unescapes strings From 
e30e9bff210dd6f43e8b31026a79bc6ed1cd7788 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Feb 2025 10:49:25 -0500 Subject: [PATCH 11/17] source --- docs/source/library-user-guide/upgrading.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 77ef34068217..791b8d1bf6eb 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -120,7 +120,7 @@ if let Some(datasource_exec) = plan.as_any().downcast_ref::() { if let Some(scan_config) = datasource_exec.source().as_any().downcast_ref::() { // FileGroups, and other information is on the FileScanConfig // parquet - if let Some(parquet_source) = scan_config.source.as_any().downcast_ref::() + if let Some(parquet_source) = scan_config.file_source.as_any().downcast_ref::() { // Information on PruningPredicates and parquet options are here } From 6ae2af214292ec1450c689c327d964f782b1b411 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Feb 2025 10:49:53 -0500 Subject: [PATCH 12/17] correct source --- docs/source/library-user-guide/upgrading.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 791b8d1bf6eb..843e6578c593 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -120,7 +120,7 @@ if let Some(datasource_exec) = plan.as_any().downcast_ref::() { if let Some(scan_config) = datasource_exec.source().as_any().downcast_ref::() { // FileGroups, and other information is on the FileScanConfig // parquet - if let Some(parquet_source) = scan_config.file_source.as_any().downcast_ref::() + if let Some(parquet_source) = scan_config.data_source.as_any().downcast_ref::() { // Information on PruningPredicates and parquet options are here } From 7cf18ff6362eff6e8fb8561e5484e26b1a9b73c0 Mon Sep 17 
00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Feb 2025 10:50:40 -0500 Subject: [PATCH 13/17] more --- docs/source/library-user-guide/upgrading.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 843e6578c593..ab0b8bf5f186 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -117,10 +117,10 @@ Instead, with `DataSourceExec`, the same information is now on `FileScanConfig` ```rust # /* comment to avoid running if let Some(datasource_exec) = plan.as_any().downcast_ref::() { - if let Some(scan_config) = datasource_exec.source().as_any().downcast_ref::() { + if let Some(scan_config) = datasource_exec.data_source().as_any().downcast_ref::() { // FileGroups, and other information is on the FileScanConfig // parquet - if let Some(parquet_source) = scan_config.data_source.as_any().downcast_ref::() + if let Some(parquet_source) = scan_config.file_source.as_any().downcast_ref::() { // Information on PruningPredicates and parquet options are here } From 4490051c3b46711f4e1322ef265554122ac92492 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Feb 2025 13:47:08 -0500 Subject: [PATCH 14/17] prettier --- docs/source/library-user-guide/upgrading.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index ab0b8bf5f186..de19160aa484 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -90,14 +90,14 @@ all use `DataSourceExec` and the format specific information is embodied in new traits `DataSource` and `FileSource`. 
Here is more information about + - [Design Ticket] - Change PR [PR #14224] - Example of an Upgrade [PR in delta-rs] -[Design Ticket]: https://github.com/apache/datafusion/issues/13838 -[PR #14224]: https://github.com/apache/datafusion/pull/14224 -[PR in delta-rs]: https://github.com/delta-io/delta-rs/pull/3261 - +[design ticket]: https://github.com/apache/datafusion/issues/13838 +[pr #14224]: https://github.com/apache/datafusion/pull/14224 +[pr in delta-rs]: https://github.com/delta-io/delta-rs/pull/3261 ### Cookbook: Changes to `ParquetExecBuilder` From 1ebcc8e731861b7525b5b6d78252958752c7698c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Feb 2025 13:58:06 -0500 Subject: [PATCH 15/17] Fix formatting --- docs/source/library-user-guide/upgrading.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index de19160aa484..9d26fbc84198 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -38,7 +38,7 @@ below. See [PR 14876] for an example. Given existing code like this: -````rust +```rust # /* impl ScalarUDFImpl for SparkConcat { ... @@ -52,8 +52,9 @@ impl ScalarUDFImpl for SparkConcat { ConcatFunc::new().invoke_batch(args, number_rows) } } - # */ -}``` +} +# */ +``` To @@ -74,7 +75,7 @@ impl ScalarUDFImpl for SparkConcat { } } # */ -```` +``` [`scalarudfimpl::invoke()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke [`scalarudfimpl::invoke_batch()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke_batch @@ -204,7 +205,7 @@ To escape `'` in SQL literals, use `''`: To include special characters (such as newlines via `\n`) you can use an `E` literal string. 
For example -``` +```sql > select 'foo\nbar'; +------------------+ | Utf8("foo\nbar") | From 06c82455c5b52b5c2b52e60f0c2853808a1c3241 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Feb 2025 13:59:02 -0500 Subject: [PATCH 16/17] cleanup --- docs/source/library-user-guide/upgrading.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 9d26fbc84198..782a3bcc0be9 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -145,9 +145,7 @@ let mut exec_plan_builder = ParquetExecBuilder::new( .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})) .with_table_parquet_options(parquet_options); -// Sometimes (i.e Merge) we want to prune files that don't make the -// filter and read the entire contents for files that do match the -// filter +// Add filter if let Some(predicate) = logical_filter { if config.enable_parquet_pushdown { exec_plan_builder = exec_plan_builder.with_predicate(predicate); @@ -163,9 +161,7 @@ New code should use `FileScanConfig` to build the appropriate `DataSourceExec`: let mut file_source = ParquetSource::new(parquet_options) .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})); -// Sometimes (i.e Merge) we want to prune files that don't make the -// filter and read the entire contents for files that do match the -// filter +// Add filter if let Some(predicate) = logical_filter { if config.enable_parquet_pushdown { file_source = file_source.with_predicate(Arc::clone(&file_schema), predicate); From 696d48d8a56c2da717d3e712ad1e646b58aeab6e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Feb 2025 14:11:29 -0500 Subject: [PATCH 17/17] prettier --- docs/source/library-user-guide/upgrading.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md 
b/docs/source/library-user-guide/upgrading.md index 782a3bcc0be9..a6679cbea9ad 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -145,7 +145,7 @@ let mut exec_plan_builder = ParquetExecBuilder::new( .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})) .with_table_parquet_options(parquet_options); -// Add filter +// Add filter if let Some(predicate) = logical_filter { if config.enable_parquet_pushdown { exec_plan_builder = exec_plan_builder.with_predicate(predicate); @@ -161,7 +161,7 @@ New code should use `FileScanConfig` to build the appropriate `DataSourceExec`: let mut file_source = ParquetSource::new(parquet_options) .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})); -// Add filter +// Add filter if let Some(predicate) = logical_filter { if config.enable_parquet_pushdown { file_source = file_source.with_predicate(Arc::clone(&file_schema), predicate);