Skip to content

Commit 2f1c3ab

Browse files
authored
Move First Value UDAF and builtin first / last function to aggregate-functions (#9960)
* backup Signed-off-by: jayzhan211 <[email protected]> * move PhysicalExpr Signed-off-by: jayzhan211 <[email protected]> * cleanup Signed-off-by: jayzhan211 <[email protected]> * move physical sort Signed-off-by: jayzhan211 <[email protected]> * cleanup dependencies Signed-off-by: jayzhan211 <[email protected]> * add readme Signed-off-by: jayzhan211 <[email protected]> * disable doc test Signed-off-by: jayzhan211 <[email protected]> * move column Signed-off-by: jayzhan211 <[email protected]> * fmt Signed-off-by: jayzhan211 <[email protected]> * move aggregatexp Signed-off-by: jayzhan211 <[email protected]> * move other two utils Signed-off-by: jayzhan211 <[email protected]> * license Signed-off-by: jayzhan211 <[email protected]> * switch to ignore Signed-off-by: jayzhan211 <[email protected]> * move reverse order Signed-off-by: jayzhan211 <[email protected]> * rename to common Signed-off-by: jayzhan211 <[email protected]> * cleanup Signed-off-by: jayzhan211 <[email protected]> * backup Signed-off-by: jayzhan211 <[email protected]> * move acc to first value Signed-off-by: jayzhan211 <[email protected]> * move builtin expr too Signed-off-by: jayzhan211 <[email protected]> * use macro Signed-off-by: jayzhan211 <[email protected]> * fmt Signed-off-by: jayzhan211 <[email protected]> * fix doc Signed-off-by: jayzhan211 <[email protected]> * add todo Signed-off-by: jayzhan211 <[email protected]> * rm comments Signed-off-by: jayzhan211 <[email protected]> * rm unused Signed-off-by: jayzhan211 <[email protected]> * rm unused code Signed-off-by: jayzhan211 <[email protected]> * change to private Signed-off-by: jayzhan211 <[email protected]> * fix lock Signed-off-by: jayzhan211 <[email protected]> * cleanup Signed-off-by: jayzhan211 <[email protected]> * cleanup Signed-off-by: jayzhan211 <[email protected]> * support roundtrip Signed-off-by: jayzhan211 <[email protected]> * remmove old format state Signed-off-by: jayzhan211 <[email protected]> * move aggregate related things 
to aggr crate Signed-off-by: jayzhan211 <[email protected]> * move back to common Signed-off-by: jayzhan211 <[email protected]> * taplo Signed-off-by: jayzhan211 <[email protected]> * rm comment Signed-off-by: jayzhan211 <[email protected]> * cleanup Signed-off-by: jayzhan211 <[email protected]> * lock Signed-off-by: jayzhan211 <[email protected]> --------- Signed-off-by: jayzhan211 <[email protected]>
1 parent 98ba11f commit 2f1c3ab

File tree

23 files changed

+720
-570
lines changed

23 files changed

+720
-570
lines changed

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ members = [
2323
"datafusion/core",
2424
"datafusion/expr",
2525
"datafusion/execution",
26+
"datafusion/functions-aggregate",
2627
"datafusion/functions",
2728
"datafusion/functions-array",
2829
"datafusion/optimizer",
@@ -78,6 +79,7 @@ datafusion-common-runtime = { path = "datafusion/common-runtime", version = "37.
7879
datafusion-execution = { path = "datafusion/execution", version = "37.0.0" }
7980
datafusion-expr = { path = "datafusion/expr", version = "37.0.0" }
8081
datafusion-functions = { path = "datafusion/functions", version = "37.0.0" }
82+
datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "37.0.0" }
8183
datafusion-functions-array = { path = "datafusion/functions-array", version = "37.0.0" }
8284
datafusion-optimizer = { path = "datafusion/optimizer", version = "37.0.0", default-features = false }
8385
datafusion-physical-expr = { path = "datafusion/physical-expr", version = "37.0.0", default-features = false }

datafusion-cli/Cargo.lock

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/core/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ datafusion-common-runtime = { workspace = true }
9898
datafusion-execution = { workspace = true }
9999
datafusion-expr = { workspace = true }
100100
datafusion-functions = { workspace = true }
101+
datafusion-functions-aggregate = { workspace = true }
101102
datafusion-functions-array = { workspace = true, optional = true }
102103
datafusion-optimizer = { workspace = true }
103104
datafusion-physical-expr = { workspace = true }

datafusion/core/src/execution/context/mod.rs

Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ use crate::{
4444
datasource::{provider_as_source, MemTable, TableProvider, ViewTable},
4545
error::{DataFusionError, Result},
4646
execution::{options::ArrowReadOptions, runtime_env::RuntimeEnv, FunctionRegistry},
47+
logical_expr::AggregateUDF,
4748
logical_expr::{
4849
CreateCatalog, CreateCatalogSchema, CreateExternalTable, CreateFunction,
4950
CreateMemoryTable, CreateView, DropCatalogSchema, DropFunction, DropTable,
@@ -53,10 +54,11 @@ use crate::{
5354
optimizer::analyzer::{Analyzer, AnalyzerRule},
5455
optimizer::optimizer::{Optimizer, OptimizerConfig, OptimizerRule},
5556
physical_optimizer::optimizer::{PhysicalOptimizer, PhysicalOptimizerRule},
56-
physical_plan::{udaf::AggregateUDF, udf::ScalarUDF, ExecutionPlan},
57+
physical_plan::{udf::ScalarUDF, ExecutionPlan},
5758
physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner},
5859
variable::{VarProvider, VarType},
5960
};
61+
use crate::{functions, functions_aggregate, functions_array};
6062

6163
use arrow::datatypes::{DataType, SchemaRef};
6264
use arrow::record_batch::RecordBatch;
@@ -69,14 +71,11 @@ use datafusion_common::{
6971
SchemaReference, TableReference,
7072
};
7173
use datafusion_execution::registry::SerializerRegistry;
72-
use datafusion_expr::type_coercion::aggregates::NUMERICS;
73-
use datafusion_expr::{create_first_value, Signature, Volatility};
7474
use datafusion_expr::{
7575
logical_plan::{DdlStatement, Statement},
7676
var_provider::is_system_variables,
7777
Expr, StringifiedPlan, UserDefinedLogicalNode, WindowUDF,
7878
};
79-
use datafusion_physical_expr::create_first_value_accumulator;
8079
use datafusion_sql::{
8180
parser::{CopyToSource, CopyToStatement, DFParser},
8281
planner::{object_name_to_table_reference, ContextProvider, ParserOptions, SqlToRel},
@@ -85,7 +84,6 @@ use datafusion_sql::{
8584

8685
use async_trait::async_trait;
8786
use chrono::{DateTime, Utc};
88-
use log::debug;
8987
use parking_lot::RwLock;
9088
use sqlparser::dialect::dialect_from_str;
9189
use url::Url;
@@ -1452,29 +1450,16 @@ impl SessionState {
14521450
};
14531451

14541452
// register built in functions
1455-
datafusion_functions::register_all(&mut new_self)
1453+
functions::register_all(&mut new_self)
14561454
.expect("can not register built in functions");
14571455

14581456
// register crate of array expressions (if enabled)
14591457
#[cfg(feature = "array_expressions")]
1460-
datafusion_functions_array::register_all(&mut new_self)
1458+
functions_array::register_all(&mut new_self)
14611459
.expect("can not register array expressions");
14621460

1463-
let first_value = create_first_value(
1464-
"FIRST_VALUE",
1465-
Signature::uniform(1, NUMERICS.to_vec(), Volatility::Immutable),
1466-
Arc::new(create_first_value_accumulator),
1467-
);
1468-
1469-
match new_self.register_udaf(Arc::new(first_value)) {
1470-
Ok(Some(existing_udaf)) => {
1471-
debug!("Overwrite existing UDAF: {}", existing_udaf.name());
1472-
}
1473-
Ok(None) => {}
1474-
Err(err) => {
1475-
panic!("Failed to register UDAF: {}", err);
1476-
}
1477-
}
1461+
functions_aggregate::register_all(&mut new_self)
1462+
.expect("can not register aggregate functions");
14781463

14791464
new_self
14801465
}

datafusion/core/src/lib.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,11 @@ pub mod functions_array {
541541
pub use datafusion_functions_array::*;
542542
}
543543

544+
/// re-export of [`datafusion_functions_aggregate`] crate
545+
pub mod functions_aggregate {
546+
pub use datafusion_functions_aggregate::*;
547+
}
548+
544549
#[cfg(test)]
545550
pub mod test;
546551
pub mod test_util;

datafusion/expr/src/expr_fn.rs

Lines changed: 0 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ use crate::expr::{
2424
use crate::function::{
2525
AccumulatorArgs, AccumulatorFactoryFunction, PartitionEvaluatorFactory,
2626
};
27-
use crate::udaf::format_state_name;
2827
use crate::{
2928
aggregate_function, built_in_function, conditional_expressions::CaseBuilder,
3029
logical_plan::Subquery, AggregateUDF, BuiltinScalarFunction, Expr, LogicalPlan,
@@ -708,17 +707,6 @@ pub fn create_udaf(
708707
))
709708
}
710709

711-
/// Creates a new UDAF with a specific signature, state type and return type.
712-
/// The signature and state type must match the [`Accumulator`]'s implementation.
713-
/// TODO: We plan to move aggregate function to its own crate. This function will be deprecated then.
714-
pub fn create_first_value(
715-
name: &str,
716-
signature: Signature,
717-
accumulator: AccumulatorFactoryFunction,
718-
) -> AggregateUDF {
719-
AggregateUDF::from(FirstValue::new(name, signature, accumulator))
720-
}
721-
722710
/// Implements [`AggregateUDFImpl`] for functions that have a single signature and
723711
/// return type.
724712
pub struct SimpleAggregateUDF {
@@ -813,78 +801,6 @@ impl AggregateUDFImpl for SimpleAggregateUDF {
813801
}
814802
}
815803

816-
pub struct FirstValue {
817-
name: String,
818-
signature: Signature,
819-
accumulator: AccumulatorFactoryFunction,
820-
}
821-
822-
impl Debug for FirstValue {
823-
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
824-
f.debug_struct("FirstValue")
825-
.field("name", &self.name)
826-
.field("signature", &self.signature)
827-
.field("accumulator", &"<FUNC>")
828-
.finish()
829-
}
830-
}
831-
832-
impl FirstValue {
833-
pub fn new(
834-
name: impl Into<String>,
835-
signature: Signature,
836-
accumulator: AccumulatorFactoryFunction,
837-
) -> Self {
838-
let name = name.into();
839-
Self {
840-
name,
841-
signature,
842-
accumulator,
843-
}
844-
}
845-
}
846-
847-
impl AggregateUDFImpl for FirstValue {
848-
fn as_any(&self) -> &dyn Any {
849-
self
850-
}
851-
852-
fn name(&self) -> &str {
853-
&self.name
854-
}
855-
856-
fn signature(&self) -> &Signature {
857-
&self.signature
858-
}
859-
860-
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
861-
Ok(arg_types[0].clone())
862-
}
863-
864-
fn accumulator(
865-
&self,
866-
acc_args: AccumulatorArgs,
867-
) -> Result<Box<dyn crate::Accumulator>> {
868-
(self.accumulator)(acc_args)
869-
}
870-
871-
fn state_fields(
872-
&self,
873-
name: &str,
874-
value_type: DataType,
875-
ordering_fields: Vec<Field>,
876-
) -> Result<Vec<Field>> {
877-
let mut fields = vec![Field::new(
878-
format_state_name(name, "first_value"),
879-
value_type,
880-
true,
881-
)];
882-
fields.extend(ordering_fields);
883-
fields.push(Field::new("is_set", DataType::Boolean, true));
884-
Ok(fields)
885-
}
886-
}
887-
888804
/// Creates a new UDWF with a specific signature, state type and return type.
889805
///
890806
/// The signature and state type must match the [`PartitionEvaluator`]'s implementation.

datafusion/expr/src/udaf.rs

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
2020
use crate::function::AccumulatorArgs;
2121
use crate::groups_accumulator::GroupsAccumulator;
22+
use crate::utils::format_state_name;
2223
use crate::{Accumulator, Expr};
2324
use crate::{AccumulatorFactoryFunction, ReturnTypeFunction, Signature};
2425
use arrow::datatypes::{DataType, Field};
@@ -447,9 +448,3 @@ impl AggregateUDFImpl for AggregateUDFLegacyWrapper {
447448
(self.accumulator)(acc_args)
448449
}
449450
}
450-
451-
/// returns the name of the state
452-
/// TODO: Remove duplicated function in physical-expr
453-
pub(crate) fn format_state_name(name: &str, state_name: &str) -> String {
454-
format!("{name}[{state_name}]")
455-
}

datafusion/expr/src/utils.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,6 +1240,11 @@ pub fn merge_schema(inputs: Vec<&LogicalPlan>) -> DFSchema {
12401240
}
12411241
}
12421242

1243+
/// Build state name. State is the intermediate state of the aggregate function.
1244+
pub fn format_state_name(name: &str, state_name: &str) -> String {
1245+
format!("{name}[{state_name}]")
1246+
}
1247+
12431248
#[cfg(test)]
12441249
mod tests {
12451250
use super::*;
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
[package]
19+
name = "datafusion-functions-aggregate"
20+
description = "Aggregate function packages for the DataFusion query engine"
21+
keywords = ["datafusion", "logical", "plan", "expressions"]
22+
readme = "README.md"
23+
version = { workspace = true }
24+
edition = { workspace = true }
25+
homepage = { workspace = true }
26+
repository = { workspace = true }
27+
license = { workspace = true }
28+
authors = { workspace = true }
29+
rust-version = { workspace = true }
30+
31+
[lib]
32+
name = "datafusion_functions_aggregate"
33+
path = "src/lib.rs"
34+
35+
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
36+
37+
[dependencies]
38+
arrow = { workspace = true }
39+
datafusion-common = { workspace = true }
40+
datafusion-execution = { workspace = true }
41+
datafusion-expr = { workspace = true }
42+
datafusion-physical-expr-common = { workspace = true }
43+
log = { workspace = true }
44+
paste = "1.0.14"

0 commit comments

Comments
 (0)