diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 9a0d0157c1ae..3a66e22de7c3 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -1026,12 +1026,6 @@ doc_comment::doctest!( library_user_guide_adding_udfs ); -#[cfg(doctest)] -doc_comment::doctest!( - "../../../docs/source/library-user-guide/api-health.md", - library_user_guide_api_health -); - #[cfg(doctest)] doc_comment::doctest!( "../../../docs/source/library-user-guide/building-logical-plans.md", @@ -1097,3 +1091,15 @@ doc_comment::doctest!( "../../../docs/source/library-user-guide/working-with-exprs.md", library_user_guide_working_with_exprs ); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/library-user-guide/upgrading.md", + library_user_guide_upgrading +); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/contributor-guide/api-health.md", + contributor_guide_api_health +); diff --git a/docs/source/library-user-guide/api-health.md b/docs/source/contributor-guide/api-health.md similarity index 92% rename from docs/source/library-user-guide/api-health.md rename to docs/source/contributor-guide/api-health.md index 87d3754b21a7..d811bc357445 100644 --- a/docs/source/library-user-guide/api-health.md +++ b/docs/source/contributor-guide/api-health.md @@ -26,6 +26,14 @@ breaking API changes, but they are sometimes necessary. When possible, rather than making breaking API changes, we prefer to deprecate APIs to give users time to adjust to the changes. 
+## Upgrade Guides + +When making changes that require DataFusion users to make changes to their code +as part of an upgrade, please consider adding documentation to the version +specific [Upgrade Guide]. + +[upgrade guide]: ../library-user-guide/upgrading.md + ## Breaking Changes In general, a function is part of the public API if it appears on the [docs.rs page] diff --git a/docs/source/index.rst b/docs/source/index.rst index 839c896d0b4c..d618e00f68b9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -132,8 +132,9 @@ To get started, see library-user-guide/extending-operators library-user-guide/profiling library-user-guide/query-optimizer - library-user-guide/api-health -.. _toc.contributor-guide: + library-user-guide/upgrading + +.. _toc.contributor-guide: .. toctree:: :maxdepth: 1 @@ -144,6 +145,7 @@ To get started, see contributor-guide/development_environment contributor-guide/architecture contributor-guide/testing + contributor-guide/api-health contributor-guide/howtos contributor-guide/roadmap contributor-guide/governance diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md new file mode 100644 index 000000000000..a6679cbea9ad --- /dev/null +++ b/docs/source/library-user-guide/upgrading.md @@ -0,0 +1,215 @@ + + +# Upgrade Guides + +## DataFusion `46.0.0` + +### Use `invoke_with_args` instead of `invoke()` and `invoke_batch()` + +DataFusion is moving to a consistent API for invoking ScalarUDFs, +[`ScalarUDFImpl::invoke_with_args()`], and deprecating +[`ScalarUDFImpl::invoke()`], [`ScalarUDFImpl::invoke_batch()`], and [`ScalarUDFImpl::invoke_no_args()`]. + +If you see errors such as the following, it means the older APIs are being used: + +```text +This feature is not implemented: Function concat does not implement invoke but called +``` + +To fix this error, use [`ScalarUDFImpl::invoke_with_args()`] instead, as shown +below. See [PR 14876] for an example. 
+ +Given existing code like this: + +```rust +# /* +impl ScalarUDFImpl for SparkConcat { +... + fn invoke_batch(&self, args: &[ColumnarValue], number_rows: usize) -> Result<ColumnarValue> { + if args + .iter() + .any(|arg| matches!(arg.data_type(), DataType::List(_))) + { + ArrayConcat::new().invoke_batch(args, number_rows) + } else { + ConcatFunc::new().invoke_batch(args, number_rows) + } + } +} +# */ +``` + +To + +```rust +# /* comment out so they don't run +impl ScalarUDFImpl for SparkConcat { + ... + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { + if args + .args + .iter() + .any(|arg| matches!(arg.data_type(), DataType::List(_))) + { + ArrayConcat::new().invoke_with_args(args) + } else { + ConcatFunc::new().invoke_with_args(args) + } + } +} + # */ +``` + +[`scalarudfimpl::invoke()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke +[`scalarudfimpl::invoke_batch()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke_batch +[`scalarudfimpl::invoke_no_args()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke_no_args +[`scalarudfimpl::invoke_with_args()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke_with_args +[pr 14876]: https://github.com/apache/datafusion/pull/14876 + +### `ParquetExec`, `AvroExec`, `CsvExec`, `JsonExec` deprecated + +DataFusion 46 has a major change to how the built in DataSources are organized. +Instead of individual `ExecutionPlan`s for the different file formats they now +all use `DataSourceExec` and the format specific information is embodied in new +traits `DataSource` and `FileSource`. 
+ +Here is more information about + +- [Design Ticket] +- Change PR [PR #14224] +- Example of an Upgrade [PR in delta-rs] + +[design ticket]: https://github.com/apache/datafusion/issues/13838 +[pr #14224]: https://github.com/apache/datafusion/pull/14224 +[pr in delta-rs]: https://github.com/delta-io/delta-rs/pull/3261 + +### Cookbook: Changes to `ParquetExec` + +Code that looks for `ParquetExec` like this will no longer work: + +```rust +# /* comment to avoid running + if let Some(parquet_exec) = plan.as_any().downcast_ref::<ParquetExec>() { + // Do something with ParquetExec here + } +# */ +``` + +Instead, with `DataSourceExec`, the same information is now on `FileScanConfig` and +`ParquetSource`. The equivalent code is + +```rust +# /* comment to avoid running +if let Some(datasource_exec) = plan.as_any().downcast_ref::<DataSourceExec>() { + if let Some(scan_config) = datasource_exec.data_source().as_any().downcast_ref::<FileScanConfig>() { + // FileGroups, and other information is on the FileScanConfig + // parquet + if let Some(parquet_source) = scan_config.file_source.as_any().downcast_ref::<ParquetSource>() + { + // Information on PruningPredicates and parquet options are here + } +} +# */ +``` + +### Cookbook: Changes to `ParquetExecBuilder` + +Likewise code that builds `ParquetExec` using the `ParquetExecBuilder` such as +the following must be changed: + +```rust +# /* comment to avoid running +let mut exec_plan_builder = ParquetExecBuilder::new( + FileScanConfig::new(self.log_store.object_store_url(), file_schema) + .with_projection(self.projection.cloned()) + .with_limit(self.limit) + .with_table_partition_cols(table_partition_cols), +) +.with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})) +.with_table_parquet_options(parquet_options); + +// Add filter +if let Some(predicate) = logical_filter { + if config.enable_parquet_pushdown { + exec_plan_builder = exec_plan_builder.with_predicate(predicate); + } +}; +# */ +``` + +New code should use `FileScanConfig` to build the appropriate 
`DataSourceExec`: + +```rust +# /* comment to avoid running +let mut file_source = ParquetSource::new(parquet_options) + .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})); + +// Add filter +if let Some(predicate) = logical_filter { + if config.enable_parquet_pushdown { + file_source = file_source.with_predicate(Arc::clone(&file_schema), predicate); + } +}; + +let file_scan_config = FileScanConfig::new( + self.log_store.object_store_url(), + file_schema, + Arc::new(file_source), +) +.with_statistics(stats) +.with_projection(self.projection.cloned()) +.with_limit(self.limit) +.with_table_partition_cols(table_partition_cols); + +// Build the actual scan like this +parquet_scan: file_scan_config.build(), +# */ +``` + +### `datafusion-cli` no longer automatically unescapes strings + +`datafusion-cli` previously would incorrectly unescape string literals (see [ticket] for more details). + +To escape `'` in SQL literals, use `''`: + +```sql +> select 'it''s escaped'; ++----------------------+ +| Utf8("it's escaped") | ++----------------------+ +| it's escaped | ++----------------------+ +1 row(s) fetched. +``` + +To include special characters (such as newlines via `\n`) you can use an `E` literal string. For example + +```sql +> select 'foo\nbar'; ++------------------+ +| Utf8("foo\nbar") | ++------------------+ +| foo\nbar | ++------------------+ +1 row(s) fetched. +Elapsed 0.005 seconds. +``` + +[ticket]: https://github.com/apache/datafusion/issues/13286