|
| 1 | +<!--- |
| 2 | + Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + or more contributor license agreements. See the NOTICE file |
| 4 | + distributed with this work for additional information |
| 5 | + regarding copyright ownership. The ASF licenses this file |
| 6 | + to you under the Apache License, Version 2.0 (the |
| 7 | + "License"); you may not use this file except in compliance |
| 8 | + with the License. You may obtain a copy of the License at |
| 9 | +
|
| 10 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | +
|
| 12 | + Unless required by applicable law or agreed to in writing, |
| 13 | + software distributed under the License is distributed on an |
| 14 | + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + KIND, either express or implied. See the License for the |
| 16 | + specific language governing permissions and limitations |
| 17 | + under the License. |
| 18 | +--> |
| 19 | + |
| 20 | +# Upgrade Guides |
| 21 | + |
| 22 | +## DataFusion `46.0.0` |
| 23 | + |
| 24 | +### Use `invoke_with_args` instead of `invoke()` and `invoke_batch()` |
| 25 | + |
| 26 | +DataFusion is moving to a consistent API for invoking ScalarUDFs, |
| 27 | +[`ScalarUDFImpl::invoke_with_args()`], and deprecating |
| 28 | +[`ScalarUDFImpl::invoke()`], [`ScalarUDFImpl::invoke_batch()`], and [`ScalarUDFImpl::invoke_no_args()`]. |
| 29 | + |
| 30 | +If you see errors such as the following it means the older APIs are being used: |
| 31 | + |
| 32 | +```text |
| 33 | +This feature is not implemented: Function concat does not implement invoke but called |
| 34 | +``` |
| 35 | + |
| 36 | +To fix this error, use [`ScalarUDFImpl::invoke_with_args()`] instead, as shown |
| 37 | +below. See [PR 14876] for an example. |
| 38 | + |
| 39 | +Given existing code like this: |
| 40 | + |
| 41 | +```rust |
| 42 | +# /* |
| 43 | +impl ScalarUDFImpl for SparkConcat { |
| 44 | +... |
| 45 | + fn invoke_batch(&self, args: &[ColumnarValue], number_rows: usize) -> Result<ColumnarValue> { |
| 46 | + if args |
| 47 | + .iter() |
| 48 | + .any(|arg| matches!(arg.data_type(), DataType::List(_))) |
| 49 | + { |
| 50 | + ArrayConcat::new().invoke_batch(args, number_rows) |
| 51 | + } else { |
| 52 | + ConcatFunc::new().invoke_batch(args, number_rows) |
| 53 | + } |
| 54 | + } |
| 55 | +} |
| 56 | +# */ |
| 57 | +``` |
| 58 | + |
| 59 | +To |
| 60 | + |
| 61 | +```rust |
| 62 | +# /* comment out so they don't run |
| 63 | +impl ScalarUDFImpl for SparkConcat { |
| 64 | + ... |
| 65 | + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { |
| 66 | + if args |
| 67 | + .args |
| 68 | + .iter() |
| 69 | + .any(|arg| matches!(arg.data_type(), DataType::List(_))) |
| 70 | + { |
| 71 | + ArrayConcat::new().invoke_with_args(args) |
| 72 | + } else { |
| 73 | + ConcatFunc::new().invoke_with_args(args) |
| 74 | + } |
| 75 | + } |
| 76 | +} |
| 77 | +# */ |
| 78 | +``` |
| 79 | + |
| 80 | +[`scalarudfimpl::invoke()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke |
| 81 | +[`scalarudfimpl::invoke_batch()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke_batch |
| 82 | +[`scalarudfimpl::invoke_no_args()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke_no_args |
| 83 | +[`scalarudfimpl::invoke_with_args()`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html#method.invoke_with_args |
| 84 | +[pr 14876]: https://github.com/apache/datafusion/pull/14876 |
| 85 | + |
| 86 | +### `ParquetExec`, `AvroExec`, `CsvExec`, `JsonExec` deprecated |
| 87 | + |
| 88 | +DataFusion 46 has a major change to how the built in DataSources are organized. |
| 89 | +Instead of individual `ExecutionPlan`s for the different file formats they now |
| 90 | +all use `DataSourceExec` and the format specific information is embodied in new |
| 91 | +traits `DataSource` and `FileSource`. |
| 92 | + |
| 93 | +Here is more information about this change: |
| 94 | + |
| 95 | +- [Design Ticket] |
| 96 | +- Change PR [PR #14224] |
| 97 | +- Example of an Upgrade [PR in delta-rs] |
| 98 | + |
| 99 | +[design ticket]: https://github.com/apache/datafusion/issues/13838 |
| 100 | +[pr #14224]: https://github.com/apache/datafusion/pull/14224 |
| 101 | +[pr in delta-rs]: https://github.com/delta-io/delta-rs/pull/3261 |
| 102 | + |
| 103 | +### Cookbook: Finding `ParquetExec` |
| 104 | + |
| 105 | +Code that looks for `ParquetExec` like this will no longer work: |
| 106 | + |
| 107 | +```rust |
| 108 | +# /* comment to avoid running |
| 109 | + if let Some(parquet_exec) = plan.as_any().downcast_ref::<ParquetExec>() { |
| 110 | + // Do something with ParquetExec here |
| 111 | + } |
| 112 | +# */ |
| 113 | +``` |
| 114 | + |
| 115 | +Instead, with `DataSourceExec`, the same information is now on `FileScanConfig` and |
| 116 | +`ParquetSource`. The equivalent code is |
| 117 | + |
| 118 | +```rust |
| 119 | +# /* comment to avoid running |
| 120 | +if let Some(datasource_exec) = plan.as_any().downcast_ref::<DataSourceExec>() { |
| 121 | +    if let Some(scan_config) = datasource_exec.data_source().as_any().downcast_ref::<FileScanConfig>() { |
| 122 | +        // FileGroups, and other information is on the FileScanConfig |
| 123 | +        // parquet |
| 124 | +        if let Some(parquet_source) = scan_config.file_source.as_any().downcast_ref::<ParquetSource>() { |
| 125 | +            // Information on PruningPredicates and parquet options are here |
| 126 | +        } |
| 127 | +    } |
| 128 | +} |
| 129 | +# */ |
| 130 | +``` |
| 131 | + |
| 132 | +### Cookbook: Changes to `ParquetExecBuilder` |
| 133 | + |
| 134 | +Likewise code that builds `ParquetExec` using the `ParquetExecBuilder` such as |
| 135 | +the following must be changed: |
| 136 | + |
| 137 | +```rust |
| 138 | +# /* comment to avoid running |
| 139 | +let mut exec_plan_builder = ParquetExecBuilder::new( |
| 140 | + FileScanConfig::new(self.log_store.object_store_url(), file_schema) |
| 141 | + .with_projection(self.projection.cloned()) |
| 142 | + .with_limit(self.limit) |
| 143 | + .with_table_partition_cols(table_partition_cols), |
| 144 | +) |
| 145 | +.with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})) |
| 146 | +.with_table_parquet_options(parquet_options); |
| 147 | +
|
| 148 | +// Add filter |
| 149 | +if let Some(predicate) = logical_filter { |
| 150 | + if config.enable_parquet_pushdown { |
| 151 | + exec_plan_builder = exec_plan_builder.with_predicate(predicate); |
| 152 | + } |
| 153 | +}; |
| 154 | +# */ |
| 155 | +``` |
| 156 | + |
| 157 | +New code should use `FileScanConfig` to build the appropriate `DataSourceExec`: |
| 158 | + |
| 159 | +```rust |
| 160 | +# /* comment to avoid running |
| 161 | +let mut file_source = ParquetSource::new(parquet_options) |
| 162 | + .with_schema_adapter_factory(Arc::new(DeltaSchemaAdapterFactory {})); |
| 163 | +
|
| 164 | +// Add filter |
| 165 | +if let Some(predicate) = logical_filter { |
| 166 | + if config.enable_parquet_pushdown { |
| 167 | + file_source = file_source.with_predicate(Arc::clone(&file_schema), predicate); |
| 168 | + } |
| 169 | +}; |
| 170 | +
|
| 171 | +let file_scan_config = FileScanConfig::new( |
| 172 | + self.log_store.object_store_url(), |
| 173 | + file_schema, |
| 174 | + Arc::new(file_source), |
| 175 | +) |
| 176 | +.with_statistics(stats) |
| 177 | +.with_projection(self.projection.cloned()) |
| 178 | +.with_limit(self.limit) |
| 179 | +.with_table_partition_cols(table_partition_cols); |
| 180 | +
|
| 181 | +// Build the actual scan like this |
| 182 | +parquet_scan: file_scan_config.build(), |
| 183 | +# */ |
| 184 | +``` |
| 185 | + |
| 186 | +### `datafusion-cli` no longer automatically unescapes strings |
| 187 | + |
| 188 | +`datafusion-cli` previously would incorrectly unescape string literals (see [ticket] for more details). |
| 189 | + |
| 190 | +To escape `'` in SQL literals, use `''`: |
| 191 | + |
| 192 | +```sql |
| 193 | +> select 'it''s escaped'; |
| 194 | ++----------------------+ |
| 195 | +| Utf8("it's escaped") | |
| 196 | ++----------------------+ |
| 197 | +| it's escaped | |
| 198 | ++----------------------+ |
| 199 | +1 row(s) fetched. |
| 200 | +``` |
| 201 | +
|
| 202 | +To include special characters (such as newlines via `\n`) use an `E` literal string such as `E'foo\nbar'`. A regular literal now keeps the characters exactly as typed, as the following example shows: |
| 203 | +
|
| 204 | +```sql |
| 205 | +> select 'foo\nbar'; |
| 206 | ++------------------+ |
| 207 | +| Utf8("foo\nbar") | |
| 208 | ++------------------+ |
| 209 | +| foo\nbar | |
| 210 | ++------------------+ |
| 211 | +1 row(s) fetched. |
| 212 | +Elapsed 0.005 seconds. |
| 213 | +``` |
| 214 | +
|
| 215 | +[ticket]: https://github.com/apache/datafusion/issues/13286 |
0 commit comments