From 9fd307f5be095f10997786ff559eace3b92831f8 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Fri, 28 Feb 2025 11:49:21 +0100 Subject: [PATCH 01/14] Add first draft for sorting test --- datafusion/core/tests/dataframe/mod.rs | 60 +++++++++++++++++++++----- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index b134ec54b13d..6295fafe7741 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -19,17 +19,9 @@ mod dataframe_functions; mod describe; -use arrow::array::{ - record_batch, Array, ArrayRef, BooleanArray, DictionaryArray, FixedSizeListArray, - FixedSizeListBuilder, Float32Array, Float64Array, Int32Array, Int32Builder, - Int8Array, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray, - StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray, -}; +use arrow::array::{record_batch, Array, ArrayRef, BooleanArray, DictionaryArray, FixedSizeListArray, FixedSizeListBuilder, Float32Array, Float64Array, Int32Array, Int32Builder, Int8Array, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray, StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray, UnionBuilder}; use arrow::buffer::ScalarBuffer; -use arrow::datatypes::{ - DataType, Field, Float32Type, Int32Type, Schema, SchemaRef, UInt64Type, UnionFields, - UnionMode, -}; +use arrow::datatypes::{DataType, Field, Float32Type, Float64Type, Int32Type, Schema, SchemaRef, UInt64Type, UnionFields, UnionMode}; use arrow::error::ArrowError; use arrow::util::pretty::pretty_format_batches; use datafusion_expr::utils::COUNT_STAR_EXPANSION; @@ -44,6 +36,7 @@ use sqlparser::ast::NullTreatment; use std::collections::HashMap; use std::fs; use std::sync::Arc; +use futures::StreamExt; use tempfile::TempDir; use url::Url; @@ -2846,6 +2839,53 @@ async fn sort_on_ambiguous_column() -> Result<()> { Ok(()) } +#[tokio::test] +async fn sort_on_union_with_logical_type() -> Result<()> { + let fields = [ + (0, Arc::new(Field::new("A", DataType::Int32, false))), + (1, Arc::new(Field::new("B", DataType::Float64, false))), + ] + .into_iter() + .collect(); + let schema = Schema::new(vec![Field::new( + "my_union", + DataType::Union(fields, UnionMode::Dense), + false, + )]); + + let mut builder = UnionBuilder::new_dense(); + builder.append::("A", 1)?; + builder.append::("B", 3.0)?; + builder.append::("A", 1)?; + builder.append::("B", 3.0)?; + let union = builder.build()?; + + let ctx = SessionContext::new(); + ctx.register_table( + "test_table", + Arc::new(MemTable::try_new( + Arc::new(schema.clone()), + vec![vec![RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(union)], + )?]], + )?), + )?; + + ctx.table("test_table") + .await? + .sort_by(vec![Expr::from(datafusion::common::Column::from( + "my_union", + ))])? + .execute_stream() + .await? + .next() + .await + .unwrap()?; + + Ok(()) +} + #[tokio::test] async fn group_by_ambiguous_column() -> Result<()> { let err = create_test_table("t1") From f77203c357d6d95671055b95e7a3975664604a84 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Fri, 28 Feb 2025 12:48:18 +0100 Subject: [PATCH 02/14] Add extension type registry --- datafusion/catalog/src/session.rs | 5 ++ datafusion/common/src/types/logical.rs | 23 ++++-- .../core/src/execution/session_state.rs | 9 +++ datafusion/execution/src/task.rs | 26 ++++++ datafusion/expr/src/registry.rs | 79 ++++++++++++++++++- 5 files changed, 134 insertions(+), 8 deletions(-) diff --git a/datafusion/catalog/src/session.rs b/datafusion/catalog/src/session.rs index db49529ac43f..997260787770 100644 --- a/datafusion/catalog/src/session.rs +++ b/datafusion/catalog/src/session.rs @@ -28,6 +28,7 @@ use parking_lot::{Mutex, RwLock}; use std::any::Any; use std::collections::HashMap; use std::sync::{Arc, Weak}; +use datafusion_common::types::LogicalTypeRef; /// Interface for accessing [`SessionState`] from the catalog. /// @@ -113,6 +114,9 @@ pub trait Session: Send + Sync { /// Return reference to window functions fn window_functions(&self) -> &HashMap>; + /// Return reference to extension types + fn extension_types(&self) -> &HashMap; + /// Return the runtime env fn runtime_env(&self) -> &Arc; @@ -133,6 +137,7 @@ impl From<&dyn Session> for TaskContext { state.scalar_functions().clone(), state.aggregate_functions().clone(), state.window_functions().clone(), + state.extension_types().clone(), state.runtime_env().clone(), ) } diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index 884ce20fd9e2..efffa46a41f8 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -27,13 +27,26 @@ pub enum TypeSignature<'a> { /// Represents a built-in native type. Native(&'a NativeType), /// Represents an arrow-compatible extension type. - /// () + Extension(ExtensionTypeSignature<'a>), +} + +/// Represents an arrow-compatible extension type. +/// () +/// +/// The `name` should contain the same value as 'ARROW:extension:name'. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct ExtensionTypeSignature<'a> { + name: &'a str, + parameters: &'a [TypeParameter<'a>], +} + +impl ExtensionTypeSignature<'_> { + /// Returns the name of the extension type. /// /// The `name` should contain the same value as 'ARROW:extension:name'. - Extension { - name: &'a str, - parameters: &'a [TypeParameter<'a>], - }, + pub fn name(&self) -> &str { + &self.name + } } #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index bdaae4f6985b..51bf3cc3e805 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -80,6 +80,7 @@ use sqlparser::ast::{Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias}; use sqlparser::dialect::dialect_from_str; use url::Url; use uuid::Uuid; +use datafusion_common::types::LogicalTypeRef; /// `SessionState` contains all the necessary state to plan and execute queries, /// such as configuration, functions, and runtime environment. Please see the @@ -148,6 +149,8 @@ pub struct SessionState { aggregate_functions: HashMap>, /// Window functions registered in the context window_functions: HashMap>, + /// Extension types registered in the context + extension_types: HashMap, /// Deserializer registry for extensions. serializer_registry: Arc, /// Holds registered external FileFormat implementations @@ -247,6 +250,10 @@ impl Session for SessionState { &self.window_functions } + fn extension_types(&self) -> &HashMap { + &self.extension_types + } + fn runtime_env(&self) -> &Arc { self.runtime_env() } @@ -1400,6 +1407,7 @@ impl SessionStateBuilder { scalar_functions: HashMap::new(), aggregate_functions: HashMap::new(), window_functions: HashMap::new(), + extension_types: HashMap::new(), serializer_registry: serializer_registry .unwrap_or(Arc::new(EmptySerializerRegistry)), file_formats: HashMap::new(), @@ -1924,6 +1932,7 @@ impl From<&SessionState> for TaskContext { state.scalar_functions.clone(), state.aggregate_functions.clone(), state.window_functions.clone(), + state.extension_types.clone(), Arc::clone(&state.runtime_env), ) } diff --git a/datafusion/execution/src/task.rs b/datafusion/execution/src/task.rs index b11596c4a30f..1f1ee5dc03d8 100644 --- a/datafusion/execution/src/task.rs +++ b/datafusion/execution/src/task.rs @@ -19,8 +19,10 @@ use crate::{ config::SessionConfig, memory_pool::MemoryPool, registry::FunctionRegistry, runtime_env::RuntimeEnv, }; +use datafusion_common::types::LogicalTypeRef; use datafusion_common::{plan_datafusion_err, DataFusionError, Result}; use datafusion_expr::planner::ExprPlanner; +use datafusion_expr::registry::{ExtensionTypeRegistry, MemoryExtensionTypeRegistry}; use datafusion_expr::{AggregateUDF, ScalarUDF, WindowUDF}; use std::collections::HashSet; use std::{collections::HashMap, sync::Arc}; @@ -46,6 +48,8 @@ pub struct TaskContext { aggregate_functions: HashMap>, /// Window functions associated with this task context window_functions: HashMap>, + /// Extension types associated with this task context + extension_types: MemoryExtensionTypeRegistry, /// Runtime environment associated with this task context runtime: Arc, } @@ -62,6 +66,7 @@ impl Default for TaskContext { scalar_functions: HashMap::new(), aggregate_functions: HashMap::new(), window_functions: HashMap::new(), + extension_types: MemoryExtensionTypeRegistry::new(), runtime, } } @@ -80,6 +85,7 @@ impl TaskContext { scalar_functions: HashMap>, aggregate_functions: HashMap>, window_functions: HashMap>, + extension_types: HashMap, runtime: Arc, ) -> Self { Self { @@ -89,6 +95,7 @@ impl TaskContext { scalar_functions, aggregate_functions, window_functions, + extension_types: MemoryExtensionTypeRegistry::from(extension_types), runtime, } } @@ -203,6 +210,23 @@ impl FunctionRegistry for TaskContext { } } +impl ExtensionTypeRegistry for TaskContext { + fn get(&self, name: &str) -> Result { + self.extension_types.get(name) + } + + fn register_type( + &mut self, + logical_type: LogicalTypeRef, + ) -> Result> { + self.extension_types.register_type(logical_type) + } + + fn deregister_type(&mut self, name: &str) -> Result> { + self.extension_types.deregister_type(name) + } +} + #[cfg(test)] mod tests { use super::*; @@ -240,6 +264,7 @@ mod tests { HashMap::default(), HashMap::default(), HashMap::default(), + HashMap::default(), runtime, ); @@ -272,6 +297,7 @@ mod tests { HashMap::default(), HashMap::default(), HashMap::default(), + HashMap::default(), runtime, ); diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index 4eb49710bcf8..41a86c0fa361 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -15,13 +15,16 @@ // specific language governing permissions and limitations // under the License. -//! FunctionRegistry trait +//! [FunctionRegistry] and [ExtensionTypeRegistry] traits use crate::expr_rewriter::FunctionRewrite; use crate::planner::ExprPlanner; use crate::{AggregateUDF, ScalarUDF, UserDefinedLogicalNode, WindowUDF}; -use datafusion_common::{not_impl_err, plan_datafusion_err, HashMap, Result}; -use std::collections::HashSet; +use datafusion_common::types::{LogicalTypeRef, TypeSignature}; +use datafusion_common::{ + internal_err, not_impl_err, plan_datafusion_err, Result, +}; +use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::sync::Arc; @@ -201,3 +204,73 @@ impl FunctionRegistry for MemoryFunctionRegistry { vec![] } } + +pub trait ExtensionTypeRegistry { + /// Returns a reference to the logical type named `name`. + fn get(&self, name: &str) -> Result; + + /// Registers a new [LogicalTypeRef], returning any previously registered implementation. + /// + /// Returns an error if the type cannot be registered, for example if the registry is read only. + fn register_type( + &mut self, + logical_type: LogicalTypeRef, + ) -> Result>; + + /// Deregisters a logical type with the name `name`, returning the implementation that was + /// deregistered. + /// + /// Returns an error if the type cannot be deregistered, for example if the registry is read + /// only. + fn deregister_type(&mut self, name: &str) -> Result>; +} + +/// An [`ExtensionTypeRegistry`] that uses in memory [`HashMap`]s. +#[derive(Default, Debug)] +pub struct MemoryExtensionTypeRegistry { + /// Holds a mapping between the name of an extension type and its logical type. + extension_types: HashMap, +} + +impl MemoryExtensionTypeRegistry { + /// Creates an empty [MemoryExtensionTypeRegistry]. + pub fn new() -> Self { + Self::default() + } +} + +impl ExtensionTypeRegistry for MemoryExtensionTypeRegistry { + fn get(&self, name: &str) -> Result { + self.extension_types + .get(name) + .ok_or_else(|| plan_datafusion_err!("Extension type not found.")) + .cloned() + } + + fn register_type( + &mut self, + logical_type: LogicalTypeRef, + ) -> Result> { + let signature = match logical_type.signature() { + TypeSignature::Native(_) => { + return internal_err!("Cannot register a native type") + } + TypeSignature::Extension(sig) => sig, + }; + Ok(self + .extension_types + .insert(signature.name().into(), logical_type)) + } + + fn deregister_type(&mut self, name: &str) -> Result> { + Ok(self.extension_types.remove(name)) + } +} + +impl From> for MemoryExtensionTypeRegistry { + fn from(value: HashMap) -> Self { + Self { + extension_types: value, + } + } +} From 7e421c31e66437c4331d291ac8f33ac085e1cf34 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Fri, 28 Feb 2025 13:40:50 +0100 Subject: [PATCH 03/14] Add LogicalTypePlanningInformation --- datafusion/common/src/types/logical.rs | 45 +++++++++++++++++++++++++- datafusion/common/src/types/native.rs | 9 +++++- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index efffa46a41f8..212ad9e8a84c 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -20,6 +20,9 @@ use crate::error::Result; use arrow::datatypes::DataType; use core::fmt; use std::{cmp::Ordering, hash::Hash, sync::Arc}; +use std::fmt::Debug; +use arrow::array::ArrayRef; +use arrow::compute::SortOptions; /// Signature that uniquely identifies a type among other types. #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] @@ -91,6 +94,7 @@ pub type LogicalTypeRef = Arc; pub trait LogicalType: Sync + Send { /// Get the native backing type of this logical type. fn native(&self) -> &NativeType; + /// Get the unique type signature for this logical type. Logical types with identical /// signatures are considered equal. fn signature(&self) -> TypeSignature<'_>; @@ -100,13 +104,21 @@ pub trait LogicalType: Sync + Send { fn default_cast_for(&self, origin: &DataType) -> Result { self.native().default_cast_for(origin) } + + /// Returns a [LogicalTypePlanningInformation] for this logical type. + /// + /// The default implementation returns the planning information of the underlying [NativeType]. + fn planning_information(&self) -> LogicalTypePlanningInformation { + self.native().planning_information() + } } -impl fmt::Debug for dyn LogicalType { +impl Debug for dyn LogicalType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple("LogicalType") .field(&self.signature()) .field(&self.native()) + .field(&self.planning_information()) .finish() } } @@ -145,3 +157,34 @@ impl Hash for dyn LogicalType { self.native().hash(state); } } + +/// Encapsulates information on how planning should be done in the presence of a logical type. +#[derive(Clone, Debug)] +pub struct LogicalTypePlanningInformation { + /// Specifies an ordering on elements of this logical type. + pub ordering: OrderingInformation +} + +/// Specifies how a logical type should be sorted. +#[derive(Clone, Debug)] +pub enum OrderingInformation { + /// Use the default arrow comparison. + Default, + /// Use a custom comparison. + /// + /// Using a custom sorting allows users to override the default order of elements or implement + /// ordering for values that do not have a natural order (e.g., unions). It is expected that + /// the custom ordering handles all native types for the [LogicalType]. + Custom(Arc) +} + +/// A [CustomOrdering] can implement non-standard comparisons between values. This ability can be +/// used to customize algorithms that must compare elements. The most prominent example is sorting. +pub trait CustomOrdering: Debug + Send + Sync { + /// TODO + fn execute( + &self, + array_ref: ArrayRef, + sort_options: SortOptions, + ) -> Result; +} diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 39c79b4b9974..974525097760 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -16,7 +16,8 @@ // under the License. use super::{ - LogicalField, LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields, + LogicalField, LogicalFieldRef, LogicalFields, LogicalType, + LogicalTypePlanningInformation, LogicalUnionFields, OrderingInformation, TypeSignature, }; use crate::error::{Result, _internal_err}; @@ -359,6 +360,12 @@ impl LogicalType for NativeType { } }) } + + fn planning_information(&self) -> LogicalTypePlanningInformation { + LogicalTypePlanningInformation { + ordering: OrderingInformation::Default, + } + } } // The following From, From, ... implementations are temporary From 4957b3c980422ad40b84814aebf2f1a487bfc245 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Mon, 3 Mar 2025 21:20:07 +0100 Subject: [PATCH 04/14] Add OrderingInformation to PhysicalSortExpr and PhysicalSortRequirement --- datafusion/common/src/lib.rs | 1 + datafusion/common/src/types/logical.rs | 65 +++++++++++--- datafusion/common/src/types/native.rs | 4 +- datafusion/common/src/utils/mod.rs | 11 +-- datafusion/core/src/physical_planner.rs | 2 +- .../tests/fuzz_cases/equivalence/ordering.rs | 2 +- .../tests/fuzz_cases/equivalence/utils.rs | 4 +- .../sort_preserving_repartition_fuzz.rs | 4 +- datafusion/core/tests/memory_limit/mod.rs | 4 +- .../replace_with_order_preserving_variants.rs | 2 +- .../tests/physical_optimizer/test_utils.rs | 2 +- datafusion/datasource/src/file_scan_config.rs | 2 +- datafusion/datasource/src/statistics.rs | 8 +- datafusion/expr-common/src/sort_properties.rs | 86 ++++++++++--------- datafusion/expr/src/udf.rs | 2 +- datafusion/expr/src/udwf.rs | 2 +- datafusion/expr/src/window_state.rs | 18 ++-- .../src/merge_arrays.rs | 16 ++-- .../functions-aggregate-common/src/utils.rs | 4 +- .../functions-aggregate/src/array_agg.rs | 2 +- .../functions-aggregate/src/first_last.rs | 50 ++++++----- .../functions-aggregate/src/nth_value.rs | 2 +- datafusion/functions-window/src/rank.rs | 4 +- datafusion/functions-window/src/row_number.rs | 4 +- datafusion/functions/src/datetime/date_bin.rs | 2 +- .../functions/src/datetime/date_trunc.rs | 2 +- datafusion/functions/src/math/abs.rs | 4 +- datafusion/functions/src/math/log.rs | 17 ++-- datafusion/functions/src/math/monotonicity.rs | 40 ++++----- datafusion/functions/src/math/round.rs | 2 +- datafusion/functions/src/math/signum.rs | 2 +- datafusion/functions/src/math/trunc.rs | 2 +- .../physical-expr-common/src/sort_expr.rs | 59 +++++++------ datafusion/physical-expr-common/src/utils.rs | 7 +- datafusion/physical-expr/src/aggregate.rs | 9 +- .../physical-expr/src/equivalence/ordering.rs | 6 +- .../src/equivalence/properties.rs | 72 +++++++++------- .../physical-expr/src/expressions/binary.rs | 4 +- .../physical-expr/src/expressions/negative.rs | 2 +- .../src/window/sliding_aggregate.rs | 2 +- .../physical-expr/src/window/standard.rs | 17 ++-- .../physical-expr/src/window/window_expr.rs | 9 +- .../src/enforce_distribution.rs | 6 +- .../src/enforce_sorting/sort_pushdown.rs | 4 +- .../src/output_requirements.rs | 2 +- .../src/aggregates/order/partial.rs | 8 +- .../physical-plan/src/aggregates/row_hash.rs | 2 +- .../src/joins/sort_merge_join.rs | 7 +- .../src/joins/symmetric_hash_join.rs | 2 +- datafusion/physical-plan/src/joins/utils.rs | 32 +++---- .../physical-plan/src/repartition/mod.rs | 2 +- datafusion/physical-plan/src/sorts/cursor.rs | 2 +- datafusion/physical-plan/src/sorts/sort.rs | 7 +- .../src/sorts/sort_preserving_merge.rs | 2 +- datafusion/physical-plan/src/sorts/stream.rs | 9 +- datafusion/physical-plan/src/topk/mod.rs | 4 +- datafusion/physical-plan/src/windows/mod.rs | 47 +++++++--- .../proto/src/physical_plan/from_proto.rs | 2 +- .../proto/src/physical_plan/to_proto.rs | 2 +- 59 files changed, 408 insertions(+), 291 deletions(-) diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index d5b7c22a546c..f3bb017bc8ea 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -54,6 +54,7 @@ pub mod test_util; pub mod tree_node; pub mod types; pub mod utils; +pub mod sort; /// Reexport arrow crate pub use arrow; diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index 212ad9e8a84c..99aa85d38a67 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -17,12 +17,14 @@ use super::NativeType; use crate::error::Result; +use arrow::array::ArrayRef; +use arrow::compute::SortOptions; use arrow::datatypes::DataType; use core::fmt; -use std::{cmp::Ordering, hash::Hash, sync::Arc}; use std::fmt::Debug; -use arrow::array::ArrayRef; -use arrow::compute::SortOptions; +use std::hash::Hasher; +use std::{cmp::Ordering, hash::Hash, sync::Arc}; +use crate::ScalarValue; /// Signature that uniquely identifies a type among other types. #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] @@ -38,7 +40,7 @@ pub enum TypeSignature<'a> { /// /// The `name` should contain the same value as 'ARROW:extension:name'. #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub struct ExtensionTypeSignature<'a> { +pub struct ExtensionTypeSignature<'a> { name: &'a str, parameters: &'a [TypeParameter<'a>], } @@ -152,7 +154,7 @@ impl Ord for dyn LogicalType { } impl Hash for dyn LogicalType { - fn hash(&self, state: &mut H) { + fn hash(&self, state: &mut H) { self.signature().hash(state); self.native().hash(state); } @@ -162,29 +164,64 @@ impl Hash for dyn LogicalType { #[derive(Clone, Debug)] pub struct LogicalTypePlanningInformation { /// Specifies an ordering on elements of this logical type. - pub ordering: OrderingInformation + pub ordering: SortOrdering, } /// Specifies how a logical type should be sorted. -#[derive(Clone, Debug)] -pub enum OrderingInformation { +#[derive(Clone, Debug, Default)] +pub enum SortOrdering { /// Use the default arrow comparison. + #[default] Default, /// Use a custom comparison. /// /// Using a custom sorting allows users to override the default order of elements or implement /// ordering for values that do not have a natural order (e.g., unions). It is expected that /// the custom ordering handles all native types for the [LogicalType]. - Custom(Arc) + Custom(Arc), +} + +impl SortOrdering { + pub fn partial_cmp(&self, lhs: &ScalarValue, rhs: &ScalarValue) -> Option { + match self { + SortOrdering::Default => lhs.partial_cmp(rhs), + SortOrdering::Custom(_) => todo!("custom order") + } + } +} + +impl PartialEq for SortOrdering { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (SortOrdering::Default, SortOrdering::Default) => true, + (SortOrdering::Custom(c1), SortOrdering::Custom(c2)) => { + c1.ordering_id() == c2.ordering_id() + } + _ => false, + } + } +} + +impl Eq for SortOrdering {} + +impl Hash for SortOrdering { + fn hash(&self, state: &mut H) { + match self { + SortOrdering::Default => state.write_u8(1), + SortOrdering::Custom(ordering) => ordering.ordering_id().hash(state), + } + } } /// A [CustomOrdering] can implement non-standard comparisons between values. This ability can be /// used to customize algorithms that must compare elements. The most prominent example is sorting. pub trait CustomOrdering: Debug + Send + Sync { + /// Returns the ordering id. + /// + /// The ordering id is used to establish equality between instances of [CustomOrdering]. + fn ordering_id(&self) -> &str; + /// TODO - fn execute( - &self, - array_ref: ArrayRef, - sort_options: SortOptions, - ) -> Result; + fn execute(&self, array_ref: ArrayRef, sort_options: SortOptions) + -> Result; } diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 974525097760..7172803a9c99 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -17,7 +17,7 @@ use super::{ LogicalField, LogicalFieldRef, LogicalFields, LogicalType, - LogicalTypePlanningInformation, LogicalUnionFields, OrderingInformation, + LogicalTypePlanningInformation, LogicalUnionFields, SortOrdering, TypeSignature, }; use crate::error::{Result, _internal_err}; @@ -363,7 +363,7 @@ impl LogicalType for NativeType { fn planning_information(&self) -> LogicalTypePlanningInformation { LogicalTypePlanningInformation { - ordering: OrderingInformation::Default, + ordering: SortOrdering::Default, } } } diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index ff9cdedab8b1..c954f525db0b 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -29,7 +29,7 @@ use arrow::array::{ OffsetSizeTrait, }; use arrow::buffer::OffsetBuffer; -use arrow::compute::{partition, SortColumn, SortOptions}; +use arrow::compute::{partition}; use arrow::datatypes::{DataType, Field, SchemaRef}; use sqlparser::ast::Ident; use sqlparser::dialect::GenericDialect; @@ -41,6 +41,7 @@ use std::num::NonZero; use std::ops::Range; use std::sync::Arc; use std::thread::available_parallelism; +use crate::sort::{SortColumn, SortOptions}; /// Applies an optional projection to a [`SchemaRef`], returning the /// projected schema @@ -100,13 +101,13 @@ pub fn compare_rows( // Preserving lexical ordering. for ((lhs, rhs), sort_options) in zip_it { // Consider all combinations of NULLS FIRST/LAST and ASC/DESC configurations. - let result = match (lhs.is_null(), rhs.is_null(), sort_options.nulls_first) { + let result = match (lhs.is_null(), rhs.is_null(), sort_options.nulls_first()) { (true, false, false) | (false, true, true) => Ordering::Greater, (true, false, true) | (false, true, false) => Ordering::Less, - (false, false, _) => if sort_options.descending { - rhs.partial_cmp(lhs) + (false, false, _) => if sort_options.descending() { + sort_options.ordering().partial_cmp(rhs, lhs) } else { - lhs.partial_cmp(rhs) + sort_options.ordering().partial_cmp(lhs, rhs) } .ok_or_else(|| { _internal_datafusion_err!("Column array shouldn't be empty") diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index a74cdcc5920b..26d265033513 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -60,7 +60,6 @@ use datafusion_physical_plan::empty::EmptyExec; use datafusion_physical_plan::recursive_query::RecursiveQueryExec; use arrow::array::{builder::StringBuilder, RecordBatch}; -use arrow::compute::SortOptions; use arrow::datatypes::{Schema, SchemaRef}; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; @@ -95,6 +94,7 @@ use itertools::{multiunzip, Itertools}; use log::{debug, trace}; use sqlparser::ast::NullTreatment; use tokio::sync::Mutex; +use datafusion_common::sort::SortOptions; /// Physical query planner that converts a `LogicalPlan` to an /// `ExecutionPlan` suitable for execution. diff --git a/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs b/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs index 769deef1187d..47243b9b4a92 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs @@ -309,7 +309,7 @@ fn test_ordering_satisfy_with_equivalence() -> Result<()> { .into_iter() .map(|(expr, options)| PhysicalSortExpr { expr: Arc::clone(expr), - options, + options: options, }) .collect::(); diff --git a/datafusion/core/tests/fuzz_cases/equivalence/utils.rs b/datafusion/core/tests/fuzz_cases/equivalence/utils.rs index d4b41b686631..040155358118 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/utils.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/utils.rs @@ -24,7 +24,7 @@ use std::sync::Arc; use arrow::array::{ArrayRef, Float32Array, Float64Array, RecordBatch, UInt32Array}; use arrow::compute::SortOptions; -use arrow::compute::{lexsort_to_indices, take_record_batch, SortColumn}; +use arrow::compute::{lexsort_to_indices, take_record_batch}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::utils::{compare_rows, get_row_at_idx}; use datafusion_common::{exec_err, plan_datafusion_err, DataFusionError, Result}; @@ -387,7 +387,7 @@ pub fn generate_table_for_eq_properties( for ordering in eq_properties.oeq_class().iter() { let (sort_columns, indices): (Vec<_>, Vec<_>) = ordering .iter() - .map(|PhysicalSortExpr { expr, options }| { + .map(|PhysicalSortExpr { expr, options: options }| { let col = expr.as_any().downcast_ref::().unwrap(); let (idx, _field) = schema.column_with_name(col.name()).unwrap(); let arr = generate_random_array(n_elem, n_distinct); diff --git a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs index 06b93d41af36..35fb208a7f6e 100644 --- a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs @@ -20,7 +20,7 @@ mod sp_repartition_fuzz_tests { use std::sync::Arc; use arrow::array::{ArrayRef, Int64Array, RecordBatch, UInt64Array}; - use arrow::compute::{concat_batches, lexsort, SortColumn, SortOptions}; + use arrow::compute::{concat_batches, lexsort}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::physical_plan::{ @@ -162,7 +162,7 @@ mod sp_repartition_fuzz_tests { for ordering in eq_properties.oeq_class().iter() { let (sort_columns, indices): (Vec<_>, Vec<_>) = ordering .iter() - .map(|PhysicalSortExpr { expr, options }| { + .map(|PhysicalSortExpr { expr, options: options }| { let col = expr.as_any().downcast_ref::().unwrap(); let (idx, _field) = schema.column_with_name(col.name()).unwrap(); let arr = generate_random_array(n_elem, n_distinct); diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index 2deb8fde2da6..2ced21c3171a 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -731,11 +731,11 @@ impl Scenario { let sort_information = vec![LexOrdering::new(vec![ PhysicalSortExpr { expr: col("a", &schema).unwrap(), - options, + options: options, }, PhysicalSortExpr { expr: col("b", &schema).unwrap(), - options, + options: options, }, ])]; diff --git a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs index 58eb866c590c..64a19e1a9a67 100644 --- a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs +++ b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs @@ -1141,7 +1141,7 @@ fn sort_expr_options( ) -> PhysicalSortExpr { PhysicalSortExpr { expr: col(name, schema).unwrap(), - options, + options: options, } } diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index 0b9c3b80bb93..73e54f6e5df1 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -199,7 +199,7 @@ pub fn sort_expr_options( ) -> PhysicalSortExpr { PhysicalSortExpr { expr: col(name, schema).unwrap(), - options, + options: options, } } diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index df38464f1b00..55e79eb10d1f 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -965,7 +965,7 @@ fn get_projected_output_ordering( // Compute the new sort expression (with correct index) after projection: new_ordering.push(PhysicalSortExpr { expr: Arc::new(Column::new(name, idx)), - options: *options, + options: options.clone(), }); continue; } diff --git a/datafusion/datasource/src/statistics.rs b/datafusion/datasource/src/statistics.rs index 9df5aa993d43..9cd0225853c2 100644 --- a/datafusion/datasource/src/statistics.rs +++ b/datafusion/datasource/src/statistics.rs @@ -27,10 +27,10 @@ use crate::PartitionedFile; use arrow::array::RecordBatch; use arrow::datatypes::SchemaRef; use arrow::{ - compute::SortColumn, row::{Row, Rows}, }; use datafusion_common::{plan_err, DataFusionError, Result}; +use datafusion_common::sort::SortColumn; use datafusion_physical_expr::{expressions::Column, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; @@ -122,7 +122,7 @@ impl MinMaxStatistics { .enumerate() .map(|(i, (col, sort))| PhysicalSortExpr { expr: Arc::new(Column::new(col.name(), i)), - options: sort.options, + options: sort.options.clone(), }) .collect::>(), ); @@ -177,7 +177,7 @@ impl MinMaxStatistics { .map(|expr| { expr.expr .data_type(schema) - .map(|data_type| SortField::new_with_options(data_type, expr.options)) + .map(|data_type| SortField::new_with_options(data_type, expr.options.to_arrow().expect("TODO"))) }) .collect::>>() .map_err(|e| e.context("create sort fields"))?; @@ -233,7 +233,7 @@ impl MinMaxStatistics { Ok(SortColumn { values: Arc::clone(values.column(idx)), - options: Some(sort_expr.options), + options: Some(sort_expr.options.clone()), }) }) .collect::>>() diff --git a/datafusion/expr-common/src/sort_properties.rs b/datafusion/expr-common/src/sort_properties.rs index 5d17a34a96fb..f7205122e237 100644 --- a/datafusion/expr-common/src/sort_properties.rs +++ b/datafusion/expr-common/src/sort_properties.rs @@ -19,8 +19,8 @@ use std::ops::Neg; use crate::interval_arithmetic::Interval; -use arrow::compute::SortOptions; use arrow::datatypes::DataType; +use datafusion_common::sort::SortOptions; /// To propagate [`SortOptions`] across the `PhysicalExpr`, it is insufficient /// to simply use `Option`: There must be a differentiation between @@ -33,29 +33,30 @@ use arrow::datatypes::DataType; /// sorted data; however the ((a_ordered + 999) + c_ordered) expression can. Therefore, /// we need two different variants for literals and unordered columns as literals are /// often more ordering-friendly under most mathematical operations. -#[derive(PartialEq, Debug, Clone, Copy, Default)] +#[derive(PartialEq, Debug, Clone, Default)] pub enum SortProperties { - /// Use the ordinary [`SortOptions`] struct to represent ordered data: + /// Use the ordinary [`SortOptions`] struct to represent ordered data Ordered(SortOptions), - // This alternative represents unordered data: + /// This alternative represents unordered data: #[default] Unordered, - // Singleton is used for single-valued literal numbers: + /// Singleton is used for single-valued literal numbers: Singleton, } impl SortProperties { pub fn add(&self, rhs: &Self) -> Self { match (self, rhs) { - (Self::Singleton, _) => *rhs, - (_, Self::Singleton) => *self, + (Self::Singleton, _) => rhs.clone(), + (_, Self::Singleton) => self.clone(), (Self::Ordered(lhs), Self::Ordered(rhs)) - if lhs.descending == rhs.descending => + if lhs.ordering() == rhs.ordering() + && lhs.descending() == rhs.descending() => { - Self::Ordered(SortOptions { - descending: lhs.descending, - nulls_first: lhs.nulls_first || rhs.nulls_first, - }) + Self::Ordered( + lhs.clone() + .with_nulls_first(lhs.nulls_first() || rhs.nulls_first()), + ) } _ => Self::Unordered, } @@ -64,18 +65,18 @@ impl SortProperties { pub fn sub(&self, rhs: &Self) -> Self { match (self, rhs) { (Self::Singleton, Self::Singleton) => Self::Singleton, - (Self::Singleton, Self::Ordered(rhs)) => Self::Ordered(SortOptions { - descending: !rhs.descending, - nulls_first: rhs.nulls_first, - }), - (_, Self::Singleton) => *self, + (Self::Singleton, Self::Ordered(rhs)) => { + Self::Ordered(rhs.clone().with_descending(!rhs.descending())) + } + (_, Self::Singleton) => self.clone(), (Self::Ordered(lhs), Self::Ordered(rhs)) - if lhs.descending != rhs.descending => + if lhs.ordering() == rhs.ordering() + && lhs.descending() != rhs.descending() => { - Self::Ordered(SortOptions { - descending: lhs.descending, - nulls_first: lhs.nulls_first || rhs.nulls_first, - }) + Self::Ordered( + lhs.clone() + .with_nulls_first(lhs.nulls_first() || rhs.nulls_first()), + ) } _ => Self::Unordered, } @@ -83,15 +84,15 @@ impl SortProperties { pub fn gt_or_gteq(&self, rhs: &Self) -> Self { match (self, rhs) { - (Self::Singleton, Self::Ordered(rhs)) => Self::Ordered(SortOptions { - descending: !rhs.descending, - nulls_first: rhs.nulls_first, - }), - (_, Self::Singleton) => *self, + (Self::Singleton, Self::Ordered(rhs)) => { + Self::Ordered(rhs.clone().with_descending(!rhs.descending())) + } + (_, Self::Singleton) => self.clone(), (Self::Ordered(lhs), Self::Ordered(rhs)) - if lhs.descending != rhs.descending => + if lhs.ordering() == rhs.ordering() + && lhs.descending() != rhs.descending() => { - *self + self.clone() } _ => Self::Unordered, } @@ -100,18 +101,16 @@ impl SortProperties { pub fn and_or(&self, rhs: &Self) -> Self { match (self, rhs) { (Self::Ordered(lhs), Self::Ordered(rhs)) - if lhs.descending == rhs.descending => + if lhs.ordering() == rhs.ordering() + && lhs.descending() == rhs.descending() => { - Self::Ordered(SortOptions { - descending: lhs.descending, - nulls_first: lhs.nulls_first || rhs.nulls_first, - }) + Self::Ordered( + lhs.clone() + .with_nulls_first(lhs.nulls_first() || rhs.nulls_first()), + ) } (Self::Ordered(opt), Self::Singleton) - | (Self::Singleton, Self::Ordered(opt)) => Self::Ordered(SortOptions { - descending: opt.descending, - nulls_first: opt.nulls_first, - }), + | (Self::Singleton, Self::Ordered(opt)) => Self::Ordered(opt.clone()), (Self::Singleton, Self::Singleton) => Self::Singleton, _ => Self::Unordered, } @@ -121,11 +120,14 @@ impl SortProperties { impl Neg for SortProperties { type Output = Self; - fn neg(mut self) -> Self::Output { - if let SortProperties::Ordered(SortOptions { descending, .. }) = &mut self { - *descending = !*descending; + fn neg(self) -> Self::Output { + match self { + SortProperties::Ordered(sort_definition) => { + SortProperties::Ordered(sort_definition.with_reversed_order()) + } + SortProperties::Unordered => self, + SortProperties::Singleton => self, } - self } } diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 8215b671a379..35aef98ea8c7 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -767,7 +767,7 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { .skip(1) .all(|input| &input.sort_properties == first_order) { - Ok(*first_order) + Ok(first_order.clone()) } else { Ok(SortProperties::Unordered) } diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs index 4da63d7955f5..f886e6048ff4 100644 --- a/datafusion/expr/src/udwf.rs +++ b/datafusion/expr/src/udwf.rs @@ -17,7 +17,6 @@ //! [`WindowUDF`]: User Defined Window Functions -use arrow::compute::SortOptions; use std::cmp::Ordering; use std::hash::{DefaultHasher, Hash, Hasher}; use std::{ @@ -33,6 +32,7 @@ use crate::{ function::WindowFunctionSimplification, Expr, PartitionEvaluator, Signature, }; use datafusion_common::{not_impl_err, Result}; +use datafusion_common::sort::SortOptions; use datafusion_doc::Documentation; use datafusion_functions_window_common::expr::ExpressionArgs; use datafusion_functions_window_common::field::WindowUDFFieldArgs; diff --git a/datafusion/expr/src/window_state.rs b/datafusion/expr/src/window_state.rs index f1d0ead23ab1..455758d6376b 100644 --- a/datafusion/expr/src/window_state.rs +++ b/datafusion/expr/src/window_state.rs @@ -23,7 +23,7 @@ use crate::{WindowFrame, WindowFrameBound, WindowFrameUnits}; use arrow::{ array::ArrayRef, - compute::{concat, concat_batches, SortOptions}, + compute::{concat, concat_batches}, datatypes::{DataType, SchemaRef}, record_batch::RecordBatch, }; @@ -32,6 +32,7 @@ use datafusion_common::{ utils::{compare_rows, get_row_at_idx, search_in_slice}, DataFusionError, Result, ScalarValue, }; +use datafusion_common::sort::SortOptions; /// Holds the state of evaluating a window function #[derive(Debug)] @@ -134,12 +135,12 @@ pub enum WindowFrameContext { impl WindowFrameContext { /// Create a new state object for the given window frame. - pub fn new(window_frame: Arc, sort_options: Vec) -> Self { + pub fn new(window_frame: Arc, sort_definition: Vec) -> Self { match window_frame.units { WindowFrameUnits::Rows => WindowFrameContext::Rows(window_frame), WindowFrameUnits::Range => WindowFrameContext::Range { window_frame, - state: WindowFrameStateRange::new(sort_options), + state: WindowFrameStateRange::new(sort_definition), }, WindowFrameUnits::Groups => WindowFrameContext::Groups { window_frame, @@ -288,13 +289,13 @@ impl PartitionBatchState { /// BY clause. This information is used to calculate the range. #[derive(Debug, Default)] pub struct WindowFrameStateRange { - sort_options: Vec, + sort_definitions: Vec, } impl WindowFrameStateRange { /// Create a new object to store the search state. fn new(sort_options: Vec) -> Self { - Self { sort_options } + Self { sort_definitions: sort_options } } /// This function calculates beginning/ending indices for the frame of the current row. @@ -389,14 +390,14 @@ impl WindowFrameStateRange { let current_row_values = get_row_at_idx(range_columns, idx)?; let end_range = if let Some(delta) = delta { let is_descending: bool = self - .sort_options + .sort_definitions .first() .ok_or_else(|| { DataFusionError::Internal( "Sort options unexpectedly absent in a window frame".to_string(), ) })? - .descending; + .descending(); current_row_values .iter() @@ -427,7 +428,7 @@ impl WindowFrameStateRange { last_range.end }; let compare_fn = |current: &[ScalarValue], target: &[ScalarValue]| { - let cmp = compare_rows(current, target, &self.sort_options)?; + let cmp = compare_rows(current, target, &self.sort_definitions)?; Ok(if SIDE { cmp.is_lt() } else { cmp.is_le() }) }; search_in_slice(range_columns, &end_range, compare_fn, search_start, length) @@ -670,6 +671,7 @@ mod tests { use super::*; use arrow::array::Float64Array; + use arrow::compute::SortOptions; fn get_test_data() -> (Vec, Vec) { let range_columns: Vec = vec![Arc::new(Float64Array::from(vec![ diff --git a/datafusion/functions-aggregate-common/src/merge_arrays.rs b/datafusion/functions-aggregate-common/src/merge_arrays.rs index 9b9a1240c1a1..79737b40c985 100644 --- a/datafusion/functions-aggregate-common/src/merge_arrays.rs +++ b/datafusion/functions-aggregate-common/src/merge_arrays.rs @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. -use arrow::compute::SortOptions; use datafusion_common::utils::compare_rows; use datafusion_common::{exec_err, ScalarValue}; use std::cmp::Ordering; use std::collections::{BinaryHeap, VecDeque}; +use datafusion_common::sort::SortOptions; /// This is a wrapper struct to be able to correctly merge `ARRAY_AGG` data from /// multiple partitions using `BinaryHeap`. When used inside `BinaryHeap`, this @@ -34,7 +34,7 @@ struct CustomElement<'a> { // Comparison "key" ordering: Vec, /// Options defining the ordering semantics - sort_options: &'a [SortOptions], + sort_definitions: &'a [SortOptions], } impl<'a> CustomElement<'a> { @@ -42,13 +42,13 @@ impl<'a> CustomElement<'a> { branch_idx: usize, value: ScalarValue, ordering: Vec, - sort_options: &'a [SortOptions], + sort_definitions: &'a [SortOptions], ) -> Self { Self { branch_idx, value, ordering, - sort_options, + sort_definitions, } } @@ -58,7 +58,7 @@ impl<'a> CustomElement<'a> { target: &[ScalarValue], ) -> datafusion_common::Result { // Calculate ordering according to `sort_options` - compare_rows(current, target, self.sort_options) + compare_rows(current, target, self.sort_definitions) } } @@ -116,7 +116,7 @@ pub fn merge_ordered_arrays( // each `ScalarValue` in the values`. ordering_values: &mut [VecDeque>], // Defines according to which ordering comparisons should be done. - sort_options: &[SortOptions], + sort_definitions: &[SortOptions], ) -> datafusion_common::Result<(Vec, Vec>)> { // Keep track the most recent data of each branch, in binary heap data structure. let mut heap = BinaryHeap::::new(); @@ -149,7 +149,7 @@ pub fn merge_ordered_arrays( branch_idx, value, orderings, - sort_options, + sort_definitions, )); } // If None, we consumed this branch, skip it. @@ -186,7 +186,7 @@ pub fn merge_ordered_arrays( branch_idx, value, orderings, - sort_options, + sort_definitions, )); } } diff --git a/datafusion/functions-aggregate-common/src/utils.rs b/datafusion/functions-aggregate-common/src/utils.rs index 083dac615b5d..5e1d184cd0c1 100644 --- a/datafusion/functions-aggregate-common/src/utils.rs +++ b/datafusion/functions-aggregate-common/src/utils.rs @@ -21,7 +21,6 @@ use arrow::array::{ArrayRef, AsArray}; use arrow::datatypes::ArrowNativeType; use arrow::{ array::ArrowNativeTypeOp, - compute::SortOptions, datatypes::{ DataType, Decimal128Type, DecimalType, Field, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, @@ -29,6 +28,7 @@ use arrow::{ }, }; use datafusion_common::{exec_err, DataFusionError, Result}; +use datafusion_common::sort::SortOptions; use datafusion_expr_common::accumulator::Accumulator; use datafusion_physical_expr_common::sort_expr::LexOrdering; @@ -109,7 +109,7 @@ pub fn ordering_fields( /// Selects the sort option attribute from all the given `PhysicalSortExpr`s. pub fn get_sort_options(ordering_req: &LexOrdering) -> Vec { - ordering_req.iter().map(|item| item.options).collect() + ordering_req.iter().map(|item| item.options.clone()).collect() } /// A wrapper around a type to provide hash for floats diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index 0f12ac34bfd2..b96bdf68f3c4 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -504,7 +504,7 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator { let sort_options = self .ordering_req .iter() - .map(|sort_expr| sort_expr.options) + .map(|sort_expr| sort_expr.options.clone()) .collect::>(); (self.values, self.ordering_values) = merge_ordered_arrays( diff --git a/datafusion/functions-aggregate/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs index 6df8ede4fc77..2fd024c3c614 100644 --- a/datafusion/functions-aggregate/src/first_last.rs +++ b/datafusion/functions-aggregate/src/first_last.rs @@ -23,11 +23,12 @@ use std::mem::size_of_val; use std::sync::Arc; use arrow::array::{ArrayRef, AsArray, BooleanArray}; -use arrow::compute::{self, LexicographicalComparator, SortColumn}; +use arrow::compute::{self, LexicographicalComparator}; use arrow::datatypes::{DataType, Field}; use datafusion_common::utils::{compare_rows, get_row_at_idx}; use datafusion_common::{ arrow_datafusion_err, internal_err, DataFusionError, Result, ScalarValue, + internal_datafusion_err, }; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity}; @@ -254,11 +255,16 @@ impl FirstValueAccumulator { let sort_columns = ordering_values .iter() .zip(self.ordering_req.iter()) - .map(|(values, req)| SortColumn { - values: Arc::clone(values), - options: Some(req.options), + .map(|(values, req)| { + let options = req.options.to_arrow().map_err(|_| { + internal_datafusion_err!("FirstValue does not support custom sorts") + })?; + Ok(compute::SortColumn { + values: Arc::clone(values), + options: Some(options), + }) }) - .collect::>(); + .collect::>>()?; let comparator = LexicographicalComparator::try_new(&sort_columns)?; @@ -317,7 +323,7 @@ impl Accumulator for FirstValueAccumulator { let sort_columns = convert_to_sort_cols( &filtered_states[1..is_set_idx], self.ordering_req.as_ref(), - ); + )?; let comparator = LexicographicalComparator::try_new(&sort_columns)?; let min = (0..filtered_states[0].len()).min_by(|&a, &b| comparator.compare(a, b)); @@ -552,14 +558,10 @@ impl LastValueAccumulator { return Ok((!value.is_empty()).then_some(value.len() - 1)); } } - let sort_columns = ordering_values - .iter() - .zip(self.ordering_req.iter()) - .map(|(values, req)| SortColumn { - values: Arc::clone(values), - options: Some(req.options), - }) - .collect::>(); + let sort_columns = convert_to_sort_cols( + &ordering_values, + self.ordering_req.as_ref(), + )?; let comparator = LexicographicalComparator::try_new(&sort_columns)?; let max_ind = if self.ignore_nulls { @@ -622,7 +624,7 @@ impl Accumulator for LastValueAccumulator { let sort_columns = convert_to_sort_cols( &filtered_states[1..is_set_idx], self.ordering_req.as_ref(), - ); + )?; let comparator = LexicographicalComparator::try_new(&sort_columns)?; let max = (0..filtered_states[0].len()).max_by(|&a, &b| comparator.compare(a, b)); @@ -672,14 +674,22 @@ fn filter_states_according_to_is_set( } /// Combines array refs and their corresponding orderings to construct `SortColumn`s. -fn convert_to_sort_cols(arrs: &[ArrayRef], sort_exprs: &LexOrdering) -> Vec { +fn convert_to_sort_cols( + arrs: &[ArrayRef], + sort_exprs: &LexOrdering, +) -> Result> { arrs.iter() .zip(sort_exprs.iter()) - .map(|(item, sort_expr)| SortColumn { - values: Arc::clone(item), - options: Some(sort_expr.options), + .map(|(item, sort_expr)| { + let options = sort_expr.options.to_arrow().map_err(|_| { + internal_datafusion_err!("FirstValue and LastValue does not support custom sorts") + })?; + Ok(compute::SortColumn { + values: Arc::clone(item), + options: Some(options), + }) }) - .collect::>() + .collect() } #[cfg(test)] diff --git a/datafusion/functions-aggregate/src/nth_value.rs b/datafusion/functions-aggregate/src/nth_value.rs index d84bd02a6baf..079c8c8eeed9 100644 --- a/datafusion/functions-aggregate/src/nth_value.rs +++ b/datafusion/functions-aggregate/src/nth_value.rs @@ -333,7 +333,7 @@ impl Accumulator for NthValueAccumulator { let sort_options = self .ordering_req .iter() - .map(|sort_expr| sort_expr.options) + .map(|sort_expr| sort_expr.options.clone()) .collect::>(); let (new_values, new_orderings) = merge_ordered_arrays( &mut partition_values, diff --git a/datafusion/functions-window/src/rank.rs b/datafusion/functions-window/src/rank.rs index bd2edc5722eb..7fd1fc99c96b 100644 --- a/datafusion/functions-window/src/rank.rs +++ b/datafusion/functions-window/src/rank.rs @@ -27,11 +27,12 @@ use std::sync::{Arc, LazyLock}; use crate::define_udwf_and_expr; use datafusion_common::arrow::array::ArrayRef; use datafusion_common::arrow::array::{Float64Array, UInt64Array}; -use datafusion_common::arrow::compute::SortOptions; use datafusion_common::arrow::datatypes::DataType; use datafusion_common::arrow::datatypes::Field; use datafusion_common::utils::get_row_at_idx; use datafusion_common::{exec_err, Result, ScalarValue}; +use datafusion_common::sort::SortOptions; +use datafusion_common::types::SortOrdering; use datafusion_expr::window_doc_sections::DOC_SECTION_RANKING; use datafusion_expr::{ Documentation, PartitionEvaluator, Signature, Volatility, WindowUDFImpl, @@ -173,6 +174,7 @@ impl WindowUDFImpl for Rank { fn sort_options(&self) -> Option { Some(SortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }) diff --git a/datafusion/functions-window/src/row_number.rs b/datafusion/functions-window/src/row_number.rs index 8f462528dbed..6899375554b1 100644 --- a/datafusion/functions-window/src/row_number.rs +++ b/datafusion/functions-window/src/row_number.rs @@ -19,7 +19,6 @@ use datafusion_common::arrow::array::ArrayRef; use datafusion_common::arrow::array::UInt64Array; -use datafusion_common::arrow::compute::SortOptions; use datafusion_common::arrow::datatypes::DataType; use datafusion_common::arrow::datatypes::Field; use datafusion_common::{Result, ScalarValue}; @@ -33,6 +32,8 @@ use field::WindowUDFFieldArgs; use std::any::Any; use std::fmt::Debug; use std::ops::Range; +use datafusion_common::sort::SortOptions; +use datafusion_common::types::SortOrdering; define_udwf_and_expr!( RowNumber, @@ -92,6 +93,7 @@ impl WindowUDFImpl for RowNumber { fn sort_options(&self) -> Option { Some(SortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }) diff --git a/datafusion/functions/src/datetime/date_bin.rs b/datafusion/functions/src/datetime/date_bin.rs index 5ffae46dde48..3bcd13936a2e 100644 --- a/datafusion/functions/src/datetime/date_bin.rs +++ b/datafusion/functions/src/datetime/date_bin.rs @@ -217,7 +217,7 @@ impl ScalarUDFImpl for DateBinFunc { .map(|r| r.sort_properties.eq(&SortProperties::Singleton)) .unwrap_or(true) { - Ok(date_value.sort_properties) + Ok(date_value.sort_properties.clone()) } else { Ok(SortProperties::Unordered) } diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs index ed3eb228bf03..26a2c0b60422 100644 --- a/datafusion/functions/src/datetime/date_trunc.rs +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -273,7 +273,7 @@ impl ScalarUDFImpl for DateTruncFunc { let date_value = &input[1]; if precision.sort_properties.eq(&SortProperties::Singleton) { - Ok(date_value.sort_properties) + Ok(date_value.sort_properties.clone()) } else { Ok(SortProperties::Unordered) } diff --git a/datafusion/functions/src/math/abs.rs b/datafusion/functions/src/math/abs.rs index 0c686a59016a..4c18986dde7a 100644 --- a/datafusion/functions/src/math/abs.rs +++ b/datafusion/functions/src/math/abs.rs @@ -186,9 +186,9 @@ impl ScalarUDFImpl for AbsFunc { let zero_point = Interval::make_zero(&range.lower().data_type())?; if range.gt_eq(&zero_point)? == Interval::CERTAINLY_TRUE { - Ok(arg.sort_properties) + Ok(arg.sort_properties.clone()) } else if range.lt_eq(&zero_point)? == Interval::CERTAINLY_TRUE { - Ok(-arg.sort_properties) + Ok(-arg.sort_properties.clone()) } else { Ok(SortProperties::Unordered) } diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs index fd135f4c5ec0..d6fed00efa3e 100644 --- a/datafusion/functions/src/math/log.rs +++ b/datafusion/functions/src/math/log.rs @@ -95,23 +95,24 @@ impl ScalarUDFImpl for LogFunc { fn output_ordering(&self, input: &[ExprProperties]) -> Result { let (base_sort_properties, num_sort_properties) = if input.len() == 1 { // log(x) defaults to log(10, x) - (SortProperties::Singleton, input[0].sort_properties) + (SortProperties::Singleton, input[0].sort_properties.clone()) } else { - (input[0].sort_properties, input[1].sort_properties) + (input[0].sort_properties.clone(), input[1].sort_properties.clone()) }; - match (num_sort_properties, base_sort_properties) { + match (&num_sort_properties, &base_sort_properties) { (first @ SortProperties::Ordered(num), SortProperties::Ordered(base)) - if num.descending != base.descending - && num.nulls_first == base.nulls_first => + if num.ordering() == base.ordering() + && num.descending() != base.descending() + && num.nulls_first() == base.nulls_first() => { - Ok(first) + Ok(first.clone()) } ( first @ (SortProperties::Ordered(_) | SortProperties::Singleton), SortProperties::Singleton, - ) => Ok(first), + ) => Ok(first.clone()), (SortProperties::Singleton, second @ SortProperties::Ordered(_)) => { - Ok(-second) + Ok(-second.clone()) } _ => Ok(SortProperties::Unordered), } diff --git a/datafusion/functions/src/math/monotonicity.rs b/datafusion/functions/src/math/monotonicity.rs index baa3147f6258..6301b5c03056 100644 --- a/datafusion/functions/src/math/monotonicity.rs +++ b/datafusion/functions/src/math/monotonicity.rs @@ -32,7 +32,7 @@ pub fn acos_order(input: &[ExprProperties]) -> Result { Interval::make_symmetric_unit_interval(&range.lower().data_type())?; if valid_domain.contains(range)? == Interval::CERTAINLY_TRUE { - Ok(-arg.sort_properties) + Ok(-arg.sort_properties.clone()) } else { exec_err!("Input range of ACOS contains out-of-domain values") } @@ -63,7 +63,7 @@ pub fn acosh_order(input: &[ExprProperties]) -> Result { )?; if valid_domain.contains(range)? == Interval::CERTAINLY_TRUE { - Ok(arg.sort_properties) + Ok(arg.sort_properties.clone()) } else { exec_err!("Input range of ACOSH contains out-of-domain values") } @@ -92,7 +92,7 @@ pub fn asin_order(input: &[ExprProperties]) -> Result { Interval::make_symmetric_unit_interval(&range.lower().data_type())?; if valid_domain.contains(range)? == Interval::CERTAINLY_TRUE { - Ok(arg.sort_properties) + Ok(arg.sort_properties.clone()) } else { exec_err!("Input range of ASIN contains out-of-domain values") } @@ -114,7 +114,7 @@ pub fn get_asin_doc() -> &'static Documentation { /// Non-decreasing for all real numbers. pub fn asinh_order(input: &[ExprProperties]) -> Result { - Ok(input[0].sort_properties) + Ok(input[0].sort_properties.clone()) } static DOCUMENTATION_ASINH: LazyLock = LazyLock::new(|| { @@ -133,7 +133,7 @@ pub fn get_asinh_doc() -> &'static Documentation { /// Non-decreasing for all real numbers. pub fn atan_order(input: &[ExprProperties]) -> Result { - Ok(input[0].sort_properties) + Ok(input[0].sort_properties.clone()) } static DOCUMENTATION_ATAN: LazyLock = LazyLock::new(|| { @@ -159,7 +159,7 @@ pub fn atanh_order(input: &[ExprProperties]) -> Result { Interval::make_symmetric_unit_interval(&range.lower().data_type())?; if valid_domain.contains(range)? == Interval::CERTAINLY_TRUE { - Ok(arg.sort_properties) + Ok(arg.sort_properties.clone()) } else { exec_err!("Input range of ATANH contains out-of-domain values") } @@ -210,7 +210,7 @@ pub fn get_atan2_doc() -> &'static Documentation { /// Non-decreasing for all real numbers. pub fn cbrt_order(input: &[ExprProperties]) -> Result { - Ok(input[0].sort_properties) + Ok(input[0].sort_properties.clone()) } static DOCUMENTATION_CBRT: LazyLock = LazyLock::new(|| { @@ -229,7 +229,7 @@ pub fn get_cbrt_doc() -> &'static Documentation { /// Non-decreasing for all real numbers. pub fn ceil_order(input: &[ExprProperties]) -> Result { - Ok(input[0].sort_properties) + Ok(input[0].sort_properties.clone()) } static DOCUMENTATION_CEIL: LazyLock = LazyLock::new(|| { @@ -275,9 +275,9 @@ pub fn cosh_order(input: &[ExprProperties]) -> Result { let zero_point = Interval::make_zero(&range.lower().data_type())?; if range.gt_eq(&zero_point)? == Interval::CERTAINLY_TRUE { - Ok(arg.sort_properties) + Ok(arg.sort_properties.clone()) } else if range.lt_eq(&zero_point)? == Interval::CERTAINLY_TRUE { - Ok(-arg.sort_properties) + Ok(-arg.sort_properties.clone()) } else { Ok(SortProperties::Unordered) } @@ -299,7 +299,7 @@ pub fn get_cosh_doc() -> &'static Documentation { /// Non-decreasing function that converts radians to degrees. pub fn degrees_order(input: &[ExprProperties]) -> Result { - Ok(input[0].sort_properties) + Ok(input[0].sort_properties.clone()) } static DOCUMENTATION_DEGREES: LazyLock = LazyLock::new(|| { @@ -318,7 +318,7 @@ pub fn get_degrees_doc() -> &'static Documentation { /// Non-decreasing for all real numbers. pub fn exp_order(input: &[ExprProperties]) -> Result { - Ok(input[0].sort_properties) + Ok(input[0].sort_properties.clone()) } static DOCUMENTATION_EXP: LazyLock = LazyLock::new(|| { @@ -337,7 +337,7 @@ pub fn get_exp_doc() -> &'static Documentation { /// Non-decreasing for all real numbers. pub fn floor_order(input: &[ExprProperties]) -> Result { - Ok(input[0].sort_properties) + Ok(input[0].sort_properties.clone()) } static DOCUMENTATION_FLOOR: LazyLock = LazyLock::new(|| { @@ -362,7 +362,7 @@ pub fn ln_order(input: &[ExprProperties]) -> Result { let zero_point = Interval::make_zero(&range.lower().data_type())?; if range.gt_eq(&zero_point)? == Interval::CERTAINLY_TRUE { - Ok(arg.sort_properties) + Ok(arg.sort_properties.clone()) } else { exec_err!("Input range of LN contains out-of-domain values") } @@ -390,7 +390,7 @@ pub fn log2_order(input: &[ExprProperties]) -> Result { let zero_point = Interval::make_zero(&range.lower().data_type())?; if range.gt_eq(&zero_point)? == Interval::CERTAINLY_TRUE { - Ok(arg.sort_properties) + Ok(arg.sort_properties.clone()) } else { exec_err!("Input range of LOG2 contains out-of-domain values") } @@ -418,7 +418,7 @@ pub fn log10_order(input: &[ExprProperties]) -> Result { let zero_point = Interval::make_zero(&range.lower().data_type())?; if range.gt_eq(&zero_point)? == Interval::CERTAINLY_TRUE { - Ok(arg.sort_properties) + Ok(arg.sort_properties.clone()) } else { exec_err!("Input range of LOG10 contains out-of-domain values") } @@ -440,7 +440,7 @@ pub fn get_log10_doc() -> &'static Documentation { /// Non-decreasing for all real numbers x. pub fn radians_order(input: &[ExprProperties]) -> Result { - Ok(input[0].sort_properties) + Ok(input[0].sort_properties.clone()) } static DOCUMENTATION_RADIONS: LazyLock = LazyLock::new(|| { @@ -480,7 +480,7 @@ pub fn get_sin_doc() -> &'static Documentation { /// Non-decreasing for all real numbers. pub fn sinh_order(input: &[ExprProperties]) -> Result { - Ok(input[0].sort_properties) + Ok(input[0].sort_properties.clone()) } static DOCUMENTATION_SINH: LazyLock = LazyLock::new(|| { @@ -505,7 +505,7 @@ pub fn sqrt_order(input: &[ExprProperties]) -> Result { let zero_point = Interval::make_zero(&range.lower().data_type())?; if range.gt_eq(&zero_point)? == Interval::CERTAINLY_TRUE { - Ok(arg.sort_properties) + Ok(arg.sort_properties.clone()) } else { exec_err!("Input range of SQRT contains out-of-domain values") } @@ -548,7 +548,7 @@ pub fn get_tan_doc() -> &'static Documentation { /// Non-decreasing for all real numbers. pub fn tanh_order(input: &[ExprProperties]) -> Result { - Ok(input[0].sort_properties) + Ok(input[0].sort_properties.clone()) } static DOCUMENTATION_TANH: LazyLock = LazyLock::new(|| { diff --git a/datafusion/functions/src/math/round.rs b/datafusion/functions/src/math/round.rs index fc87b7e63a62..9a5c36692915 100644 --- a/datafusion/functions/src/math/round.rs +++ b/datafusion/functions/src/math/round.rs @@ -104,7 +104,7 @@ impl ScalarUDFImpl for RoundFunc { .map(|r| r.sort_properties.eq(&SortProperties::Singleton)) .unwrap_or(true) { - Ok(value.sort_properties) + Ok(value.sort_properties.clone()) } else { Ok(SortProperties::Unordered) } diff --git a/datafusion/functions/src/math/signum.rs b/datafusion/functions/src/math/signum.rs index ba5422afa768..19780758082d 100644 --- a/datafusion/functions/src/math/signum.rs +++ b/datafusion/functions/src/math/signum.rs @@ -86,7 +86,7 @@ impl ScalarUDFImpl for SignumFunc { fn output_ordering(&self, input: &[ExprProperties]) -> Result { // Non-decreasing for all real numbers x. - Ok(input[0].sort_properties) + Ok(input[0].sort_properties.clone()) } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { diff --git a/datafusion/functions/src/math/trunc.rs b/datafusion/functions/src/math/trunc.rs index 2ac291204a0b..6f5ddf4cd41f 100644 --- a/datafusion/functions/src/math/trunc.rs +++ b/datafusion/functions/src/math/trunc.rs @@ -113,7 +113,7 @@ impl ScalarUDFImpl for TruncFunc { .map(|r| r.sort_properties.eq(&SortProperties::Singleton)) .unwrap_or(true) { - Ok(value.sort_properties) + Ok(value.sort_properties.clone()) } else { Ok(SortProperties::Unordered) } diff --git a/datafusion/physical-expr-common/src/sort_expr.rs b/datafusion/physical-expr-common/src/sort_expr.rs index 38b820edc544..043d2b02316c 100644 --- a/datafusion/physical-expr-common/src/sort_expr.rs +++ b/datafusion/physical-expr-common/src/sort_expr.rs @@ -18,6 +18,12 @@ //! Sort expressions use crate::physical_expr::PhysicalExpr; +use arrow::array::RecordBatch; +use arrow::datatypes::Schema; +use datafusion_common::sort::{SortColumn, SortOptions}; +use datafusion_common::Result; +use datafusion_expr_common::columnar_value::ColumnarValue; +use itertools::Itertools; use std::fmt; use std::fmt::{Display, Formatter}; use std::hash::{Hash, Hasher}; @@ -25,13 +31,6 @@ use std::ops::{Deref, Index, Range, RangeFrom, RangeTo}; use std::sync::{Arc, LazyLock}; use std::vec::IntoIter; -use arrow::compute::kernels::sort::{SortColumn, SortOptions}; -use arrow::datatypes::Schema; -use arrow::record_batch::RecordBatch; -use datafusion_common::Result; -use datafusion_expr_common::columnar_value::ColumnarValue; -use itertools::Itertools; - /// Represents Sort operation for a column in a RecordBatch /// /// Example: @@ -95,25 +94,25 @@ impl PhysicalSortExpr { /// Set the sort sort options to ASC pub fn asc(mut self) -> Self { - self.options.descending = false; + self.options = self.options.with_descending(false); self } /// Set the sort sort options to DESC pub fn desc(mut self) -> Self { - self.options.descending = true; + self.options = self.options.with_descending(true); self } /// Set the sort sort options to NULLS FIRST pub fn nulls_first(mut self) -> Self { - self.options.nulls_first = true; + self.options = self.options.with_nulls_first(true); self } /// Set the sort sort options to NULLS LAST pub fn nulls_last(mut self) -> Self { - self.options.nulls_first = false; + self.options = self.options.with_nulls_first(false); self } } @@ -156,7 +155,7 @@ impl PhysicalSortExpr { }; Ok(SortColumn { values: array_to_sort, - options: Some(self.options), + options: Some(self.options.clone()), }) } @@ -172,11 +171,15 @@ impl PhysicalSortExpr { let nullable = self.expr.nullable(schema).unwrap_or(true); self.expr.eq(&requirement.expr) && if nullable { - requirement.options.is_none_or(|opts| self.options == opts) - } else { requirement .options - .is_none_or(|opts| self.options.descending == opts.descending) + .as_ref() + .is_none_or(|opts| &self.options == opts) + } else { + requirement.options.as_ref().is_none_or(|opts| { + self.options.ordering() == opts.ordering() + && self.options.descending() == opts.descending() + }) } } } @@ -215,17 +218,14 @@ impl From for PhysicalSortExpr { /// The default is picked to be consistent with /// PostgreSQL: fn from(value: PhysicalSortRequirement) -> Self { - let options = value.options.unwrap_or(SortOptions { - descending: false, - nulls_first: false, - }); - PhysicalSortExpr::new(value.expr, options) + let sort_definition = value.options.unwrap_or_default(); + PhysicalSortExpr::new(value.expr, sort_definition) } } impl From for PhysicalSortRequirement { fn from(value: PhysicalSortExpr) -> Self { - PhysicalSortRequirement::new(value.expr, Some(value.options)) + PhysicalSortRequirement::new(value.expr, Some(value.options.clone())) } } @@ -276,8 +276,14 @@ impl PhysicalSortRequirement { /// which must match only `expr`. /// /// See [`PhysicalSortRequirement`] for examples. - pub fn new(expr: Arc, options: Option) -> Self { - Self { expr, options } + pub fn new( + expr: Arc, + sort_definition: Option, + ) -> Self { + Self { + expr, + options: sort_definition, + } } /// Replace the required expression for this requirement with the new one @@ -291,7 +297,8 @@ impl PhysicalSortRequirement { self.expr.eq(&other.expr) && other .options - .is_none_or(|other_opts| self.options == Some(other_opts)) + .as_ref() + .is_none_or(|other_opts| self.options.as_ref() == Some(other_opts)) } #[deprecated(since = "43.0.0", note = "use LexRequirement::from_lex_ordering")] @@ -312,8 +319,8 @@ impl PhysicalSortRequirement { /// Returns the SQL string representation of the given [SortOptions] object. #[inline] -fn to_str(options: &SortOptions) -> &str { - match (options.descending, options.nulls_first) { +fn to_str(options: &SortOptions) -> &'static str { + match (options.descending(), options.nulls_first()) { (true, true) => "DESC", (true, false) => "DESC NULLS LAST", (false, true) => "ASC", diff --git a/datafusion/physical-expr-common/src/utils.rs b/datafusion/physical-expr-common/src/utils.rs index 114007bfa6af..04a93aa65e6b 100644 --- a/datafusion/physical-expr-common/src/utils.rs +++ b/datafusion/physical-expr-common/src/utils.rs @@ -99,7 +99,12 @@ pub fn scatter(mask: &BooleanArray, truthy: &dyn Array) -> Result { pub fn reverse_order_bys(order_bys: &LexOrdering) -> LexOrdering { order_bys .iter() - .map(|e| PhysicalSortExpr::new(Arc::clone(&e.expr), !e.options)) + .map(|e| { + PhysicalSortExpr::new( + Arc::clone(&e.expr), + e.options.clone().with_reversed_order(), + ) + }) .collect() } diff --git a/datafusion/physical-expr/src/aggregate.rs b/datafusion/physical-expr/src/aggregate.rs index 07a98340dbe7..10d2ab14316c 100644 --- a/datafusion/physical-expr/src/aggregate.rs +++ b/datafusion/physical-expr/src/aggregate.rs @@ -40,9 +40,9 @@ use std::sync::Arc; use crate::expressions::Column; -use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::{internal_err, not_impl_err, Result, ScalarValue}; +use datafusion_common::sort::SortOptions; use datafusion_expr::{AggregateUDF, ReversedUDAF, SetMonotonicity}; use datafusion_expr_common::accumulator::Accumulator; use datafusion_expr_common::groups_accumulator::GroupsAccumulator; @@ -553,9 +553,10 @@ impl AggregateFunctionExpr { return None; } let expr = Arc::new(Column::new(self.name(), aggr_func_idx)); - let options = - SortOptions::new(monotonicity == SetMonotonicity::Decreasing, false); - Some(PhysicalSortExpr { expr, options }) + todo!("Sort?") + // let options = + // SortOptions::new(monotonicity == SetMonotonicity::Decreasing, false); + // Some(PhysicalSortExpr { expr, options }) } } diff --git a/datafusion/physical-expr/src/equivalence/ordering.rs b/datafusion/physical-expr/src/equivalence/ordering.rs index 0efd46ad912e..b0303e879969 100644 --- a/datafusion/physical-expr/src/equivalence/ordering.rs +++ b/datafusion/physical-expr/src/equivalence/ordering.rs @@ -23,8 +23,8 @@ use std::vec::IntoIter; use crate::equivalence::add_offset_to_expr; use crate::{LexOrdering, PhysicalExpr}; -use arrow::compute::SortOptions; use datafusion_common::HashSet; +use datafusion_common::sort::SortOptions; /// An `OrderingEquivalenceClass` object keeps track of different alternative /// orderings than can describe a schema. For example, consider the following table: @@ -227,11 +227,11 @@ impl OrderingEquivalenceClass { /// Gets sort options associated with this expression if it is a leading /// ordering expression. Otherwise, returns `None`. - pub fn get_options(&self, expr: &Arc) -> Option { + pub fn get_options(&self, expr: &Arc) -> Option<&SortOptions> { for ordering in self.iter() { let leading_ordering = &ordering[0]; if leading_ordering.expr.eq(expr) { - return Some(leading_ordering.options); + return Some(&leading_ordering.options); } } None diff --git a/datafusion/physical-expr/src/equivalence/properties.rs b/datafusion/physical-expr/src/equivalence/properties.rs index 042256951250..a8911a6b4cbf 100755 --- a/datafusion/physical-expr/src/equivalence/properties.rs +++ b/datafusion/physical-expr/src/equivalence/properties.rs @@ -32,7 +32,6 @@ use crate::{ PhysicalExprRef, PhysicalSortExpr, PhysicalSortRequirement, }; -use arrow::compute::SortOptions; use arrow::datatypes::SchemaRef; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::{ @@ -42,6 +41,7 @@ use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion_physical_expr_common::utils::ExprPropertiesNode; +use datafusion_common::sort::SortOptions; use indexmap::{IndexMap, IndexSet}; use itertools::Itertools; @@ -371,7 +371,7 @@ impl EquivalenceProperties { continue; } - let leading_ordering_options = ordering[0].options; + let leading_ordering_options = ordering[0].options.clone(); for equivalent_expr in &eq_class { let children = equivalent_expr.children(); @@ -391,7 +391,9 @@ impl EquivalenceProperties { break; } child_properties.push(ExprProperties { - sort_properties: SortProperties::Ordered(next.options), + sort_properties: SortProperties::Ordered( + next.options.clone(), + ), range: Interval::make_unbounded( &child.data_type(&self.schema)?, )?, @@ -408,17 +410,21 @@ impl EquivalenceProperties { if let Ok(expr_properties) = equivalent_expr.get_properties(&child_properties) { - if expr_properties.preserves_lex_ordering - && SortProperties::Ordered(leading_ordering_options) - == expr_properties.sort_properties + if let SortProperties::Ordered(sort_properties) = + expr_properties.sort_properties { - // Assume existing ordering is [c ASC, a ASC, b ASC] - // When equality c = f(a,b) is given, if we know that given ordering `[a ASC, b ASC]`, - // ordering `[f(a,b) ASC]` is valid, then we can deduce that ordering `[a ASC, b ASC]` is also valid. - // Hence, ordering `[a ASC, b ASC]` can be added to the state as a valid ordering. - // (e.g. existing ordering where leading ordering is removed) - new_orderings.push(LexOrdering::new(ordering[1..].to_vec())); - break; + if expr_properties.preserves_lex_ordering + && sort_properties == leading_ordering_options + { + // Assume existing ordering is [c ASC, a ASC, b ASC] + // When equality c = f(a,b) is given, if we know that given ordering `[a ASC, b ASC]`, + // ordering `[f(a,b) ASC]` is valid, then we can deduce that ordering `[a ASC, b ASC]` is also valid. + // Hence, ordering `[a ASC, b ASC]` can be added to the state as a valid ordering. + // (e.g. existing ordering where leading ordering is removed) + new_orderings + .push(LexOrdering::new(ordering[1..].to_vec())); + break; + } } } } @@ -638,9 +644,9 @@ impl EquivalenceProperties { && normalized_reqs[..ordering_len].iter().zip(ordering).all( |(req, existing)| { req.expr.eq(&existing.expr) - && req - .options - .is_none_or(|req_opts| req_opts == existing.options) + && req.options.as_ref().is_none_or(|req_opts| { + req_opts == &existing.options + }) }, ) }) @@ -666,7 +672,7 @@ impl EquivalenceProperties { SortProperties::Ordered(options) => { let sort_expr = PhysicalSortExpr { expr: Arc::clone(&req.expr), - options, + options: options, }; sort_expr.satisfy(req, self.schema()) } @@ -734,14 +740,15 @@ impl EquivalenceProperties { .zip(rhs.inner.iter_mut()) .all(|(lhs, rhs)| { lhs.expr.eq(&rhs.expr) - && match (lhs.options, rhs.options) { + && match (lhs.options.as_ref(), rhs.options.as_ref()) + { (Some(lhs_opt), Some(rhs_opt)) => lhs_opt == rhs_opt, (Some(options), None) => { - rhs.options = Some(options); + rhs.options = Some(options.clone()); true } (None, Some(options)) => { - lhs.options = Some(options); + lhs.options = Some(options.clone()); true } (None, None) => true, @@ -784,7 +791,7 @@ impl EquivalenceProperties { { res.push(PhysicalSortExpr { expr: Arc::clone(&r_expr), - options: sort_expr.options, + options: sort_expr.options.clone(), }); } } @@ -874,7 +881,7 @@ impl EquivalenceProperties { self.project_expr(&sort_expr.expr, mapping).map(|expr| { PhysicalSortExpr { expr, - options: sort_expr.options, + options: sort_expr.options.clone(), } }); let is_projected = target_sort_expr.is_some(); @@ -961,16 +968,16 @@ impl EquivalenceProperties { get_expr_properties(source, &relevant_deps, &self.schema) .map(|prop| prop.sort_properties) { - Some((options, relevant_deps)) + Some((options.clone(), relevant_deps)) } else { // Do not consider unordered cases None } }) - .flat_map(|(options, relevant_deps)| { + .flat_map(|(sort_definition, relevant_deps)| { let sort_expr = PhysicalSortExpr { expr: Arc::clone(target), - options, + options: sort_definition, }; // Generate dependent orderings (i.e. prefixes for `sort_expr`): let mut dependency_orderings = @@ -1144,7 +1151,7 @@ impl EquivalenceProperties { SortProperties::Ordered(options) => Some(( PhysicalSortExpr { expr: Arc::clone(&exprs[idx]), - options, + options: options, }, idx, )), @@ -1154,7 +1161,7 @@ impl EquivalenceProperties { Some(( PhysicalSortExpr { expr: Arc::clone(&exprs[idx]), - options, + options: options, }, idx, )) @@ -1470,7 +1477,7 @@ fn update_properties( { node.data.sort_properties = SortProperties::Singleton; } else if let Some(options) = oeq_class.get_options(&normalized_expr) { - node.data.sort_properties = SortProperties::Ordered(options); + node.data.sort_properties = SortProperties::Ordered(options.clone()); } Ok(Transformed::yes(node)) } @@ -1682,7 +1689,9 @@ fn get_expr_properties( if let Some(column_order) = dependencies.iter().find(|&order| expr.eq(&order.expr)) { // If exact match is found, return its ordering. Ok(ExprProperties { - sort_properties: SortProperties::Ordered(column_order.options), + sort_properties: SortProperties::Ordered( + column_order.options.clone(), + ), range: Interval::make_unbounded(&expr.data_type(schema)?)?, preserves_lex_ordering: false, }) @@ -2386,7 +2395,10 @@ fn advance_if_matches_constant( ) -> Option { let expr = iter.peek()?; let const_expr = constants.iter().find(|c| c.eq_expr(expr))?; - let found_expr = PhysicalSortExpr::new(Arc::clone(const_expr.expr()), expr.options); + let found_expr = PhysicalSortExpr::new( + Arc::clone(const_expr.expr()), + expr.options.clone(), + ); iter.next(); Some(found_expr) } diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 1f16c5471ed7..af735b4535a5 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -525,8 +525,8 @@ impl PhysicalExpr for BinaryExpr { /// For each operator, [`BinaryExpr`] has distinct rules. /// TODO: There may be rules specific to some data types and expression ranges. fn get_properties(&self, children: &[ExprProperties]) -> Result { - let (l_order, l_range) = (children[0].sort_properties, &children[0].range); - let (r_order, r_range) = (children[1].sort_properties, &children[1].range); + let (l_order, l_range) = (children[0].sort_properties.clone(), &children[0].range); + let (r_order, r_range) = (children[1].sort_properties.clone(), &children[1].range); match self.op() { Operator::Plus => Ok(ExprProperties { sort_properties: l_order.add(&r_order), diff --git a/datafusion/physical-expr/src/expressions/negative.rs b/datafusion/physical-expr/src/expressions/negative.rs index 8795545274a2..4d8e83172b28 100644 --- a/datafusion/physical-expr/src/expressions/negative.rs +++ b/datafusion/physical-expr/src/expressions/negative.rs @@ -162,7 +162,7 @@ impl PhysicalExpr for NegativeExpr { /// The ordering of a [`NegativeExpr`] is simply the reverse of its child. fn get_properties(&self, children: &[ExprProperties]) -> Result { Ok(ExprProperties { - sort_properties: -children[0].sort_properties, + sort_properties: -children[0].sort_properties.clone(), range: children[0].range.clone().arithmetic_negate()?, preserves_lex_ordering: false, }) diff --git a/datafusion/physical-expr/src/window/sliding_aggregate.rs b/datafusion/physical-expr/src/window/sliding_aggregate.rs index 23967e78f07a..4055b70bc783 100644 --- a/datafusion/physical-expr/src/window/sliding_aggregate.rs +++ b/datafusion/physical-expr/src/window/sliding_aggregate.rs @@ -155,7 +155,7 @@ impl WindowExpr for SlidingAggregateWindowExpr { .zip(order_by_exprs) .map(|(req, new_expr)| PhysicalSortExpr { expr: new_expr, - options: req.options, + options: req.options.clone(), }) .collect::(); Some(Arc::new(SlidingAggregateWindowExpr { diff --git a/datafusion/physical-expr/src/window/standard.rs b/datafusion/physical-expr/src/window/standard.rs index 22e8aea83fe7..86625e643a07 100644 --- a/datafusion/physical-expr/src/window/standard.rs +++ b/datafusion/physical-expr/src/window/standard.rs @@ -26,9 +26,9 @@ use crate::window::window_expr::{get_orderby_values, WindowFn}; use crate::window::{PartitionBatches, PartitionWindowAggStates, WindowState}; use crate::{reverse_order_bys, EquivalenceProperties, PhysicalExpr}; use arrow::array::{new_empty_array, ArrayRef}; -use arrow::compute::SortOptions; use arrow::datatypes::Field; use arrow::record_batch::RecordBatch; +use datafusion_common::sort::SortOptions; use datafusion_common::utils::evaluate_partition_ranges; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::window_state::{WindowAggState, WindowFrameContext}; @@ -112,8 +112,11 @@ impl WindowExpr for StandardWindowExpr { let mut evaluator = self.expr.create_evaluator()?; let num_rows = batch.num_rows(); if evaluator.uses_window_frame() { - let sort_options: Vec = - self.order_by.iter().map(|o| o.options).collect(); + let sort_options: Vec = self + .order_by + .iter() + .map(|o| o.options.clone()) + .collect(); let mut row_wise_results = vec![]; let mut values = self.evaluate_args(batch)?; @@ -157,7 +160,11 @@ impl WindowExpr for StandardWindowExpr { ) -> Result<()> { let field = self.expr.field()?; let out_type = field.data_type(); - let sort_options = self.order_by.iter().map(|o| o.options).collect::>(); + let sort_definitions = self + .order_by + .iter() + .map(|o| o.options.clone()) + .collect::>(); for (partition_row, partition_batch_state) in partition_batches.iter() { let window_state = if let Some(window_state) = window_agg_state.get_mut(partition_row) { @@ -204,7 +211,7 @@ impl WindowExpr for StandardWindowExpr { .get_or_insert_with(|| { WindowFrameContext::new( Arc::clone(&self.window_frame), - sort_options.clone(), + sort_definitions.clone(), ) }) .calculate_range( diff --git a/datafusion/physical-expr/src/window/window_expr.rs b/datafusion/physical-expr/src/window/window_expr.rs index 793f2e5ee586..930b93447bc1 100644 --- a/datafusion/physical-expr/src/window/window_expr.rs +++ b/datafusion/physical-expr/src/window/window_expr.rs @@ -23,8 +23,6 @@ use std::sync::Arc; use crate::{LexOrdering, PhysicalExpr}; use arrow::array::{new_empty_array, Array, ArrayRef}; -use arrow::compute::kernels::sort::SortColumn; -use arrow::compute::SortOptions; use arrow::datatypes::Field; use arrow::record_batch::RecordBatch; use datafusion_common::utils::compare_rows; @@ -35,6 +33,7 @@ use datafusion_expr::window_state::{ use datafusion_expr::{Accumulator, PartitionEvaluator, WindowFrame, WindowFrameBound}; use indexmap::IndexMap; +use datafusion_common::sort::{SortColumn, SortOptions}; /// Common trait for [window function] implementations /// @@ -191,7 +190,7 @@ pub trait AggregateWindowExpr: WindowExpr { let mut accumulator = self.get_accumulator()?; let mut last_range = Range { start: 0, end: 0 }; let sort_options: Vec = - self.order_by().iter().map(|o| o.options).collect(); + self.order_by().iter().map(|o| o.options.clone()).collect(); let mut window_frame_ctx = WindowFrameContext::new(Arc::clone(self.get_window_frame()), sort_options); self.get_result_column( @@ -240,7 +239,7 @@ pub trait AggregateWindowExpr: WindowExpr { // If there is no window state context, initialize it. let window_frame_ctx = state.window_frame_ctx.get_or_insert_with(|| { let sort_options: Vec = - self.order_by().iter().map(|o| o.options).collect(); + self.order_by().iter().map(|o| o.options.clone()).collect(); WindowFrameContext::new(Arc::clone(self.get_window_frame()), sort_options) }); let out_col = self.get_result_column( @@ -515,7 +514,7 @@ fn is_row_ahead( } let last_value = ScalarValue::try_from_array(old_col, old_col.len() - 1)?; let current_value = ScalarValue::try_from_array(current_col, 0)?; - let cmp = compare_rows(&[current_value], &[last_value], &[*sort_options])?; + let cmp = compare_rows(&[current_value], &[last_value], &[sort_options.clone()])?; Ok(cmp.is_gt()) } diff --git a/datafusion/physical-optimizer/src/enforce_distribution.rs b/datafusion/physical-optimizer/src/enforce_distribution.rs index 5e76edad1f56..4a1dc66e10a4 100644 --- a/datafusion/physical-optimizer/src/enforce_distribution.rs +++ b/datafusion/physical-optimizer/src/enforce_distribution.rs @@ -31,7 +31,6 @@ use crate::utils::{ is_sort_preserving_merge, }; -use arrow::compute::SortOptions; use datafusion_common::config::ConfigOptions; use datafusion_common::error::Result; use datafusion_common::stats::Precision; @@ -62,6 +61,7 @@ use datafusion_physical_plan::ExecutionPlanProperties; use datafusion_physical_plan::{Distribution, ExecutionPlan, Partitioning}; use itertools::izip; +use datafusion_common::sort::SortOptions; /// The `EnforceDistribution` rule ensures that distribution requirements are /// met. In doing so, this rule will increase the parallelism in the plan by @@ -459,7 +459,7 @@ where if !positions.is_empty() { let new_join_on = new_join_conditions(&left_keys, &right_keys); let new_sort_options = (0..sort_options.len()) - .map(|idx| sort_options[positions[idx]]) + .map(|idx| sort_options[positions[idx]].clone()) .collect(); join_plan.plan = join_constructor((new_join_on, new_sort_options))?; } @@ -673,7 +673,7 @@ pub fn reorder_join_keys_to_inputs( } = join_keys; let new_join_on = new_join_conditions(&left_keys, &right_keys); let new_sort_options = (0..sort_options.len()) - .map(|idx| sort_options[positions[idx]]) + .map(|idx| sort_options[positions[idx]].clone()) .collect(); return SortMergeJoinExec::try_new( Arc::clone(left), diff --git a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs index 17acb6272938..4e74a0b672a1 100644 --- a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs +++ b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs @@ -613,7 +613,7 @@ fn handle_custom_pushdown( } })? .data; - Ok(PhysicalSortRequirement::new(updated_columns, req.options)) + Ok(PhysicalSortRequirement::new(updated_columns, req.options.clone())) }) .collect::>>()?; @@ -691,7 +691,7 @@ fn handle_hash_join( } })? .data; - Ok(PhysicalSortRequirement::new(updated_columns, req.options)) + Ok(PhysicalSortRequirement::new(updated_columns, req.options.clone())) }) .collect::>>()?; diff --git a/datafusion/physical-optimizer/src/output_requirements.rs b/datafusion/physical-optimizer/src/output_requirements.rs index 90a570894a44..d93752388807 100644 --- a/datafusion/physical-optimizer/src/output_requirements.rs +++ b/datafusion/physical-optimizer/src/output_requirements.rs @@ -214,7 +214,7 @@ impl ExecutionPlan for OutputRequirementExec { }; updated_sort_reqs.push(PhysicalSortRequirement { expr: new_expr, - options: req.options, + options: req.options.clone(), }); } } diff --git a/datafusion/physical-plan/src/aggregates/order/partial.rs b/datafusion/physical-plan/src/aggregates/order/partial.rs index aff69277a4ce..8b94d1b8cd9b 100644 --- a/datafusion/physical-plan/src/aggregates/order/partial.rs +++ b/datafusion/physical-plan/src/aggregates/order/partial.rs @@ -16,9 +16,10 @@ // under the License. use arrow::array::ArrayRef; -use arrow::compute::SortOptions; use arrow::datatypes::Schema; use arrow_ord::partition::partition; +use datafusion_common::sort::SortOptions; +use datafusion_common::types::SortOrdering; use datafusion_common::utils::{compare_rows, get_row_at_idx}; use datafusion_common::{Result, ScalarValue}; use datafusion_execution::memory_pool::proxy::VecAllocExt; @@ -200,7 +201,10 @@ impl GroupOrderingPartial { range_sort_key: Vec, ) -> Result<(usize, Vec)> { if let Some(sort_key) = sort_key { - let sort_options = vec![SortOptions::new(false, false); sort_key.len()]; + let sort_options = vec![ + SortOptions::new(SortOrdering::Default, false, false); + sort_key.len() + ]; let ordering = compare_rows(&sort_key, &range_sort_key, &sort_options)?; if ordering == Ordering::Equal { return Ok((current_sort, sort_key)); diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index 05122d5a5403..2525e353fbf6 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -36,7 +36,6 @@ use crate::{aggregates, metrics, ExecutionPlan, PhysicalExpr}; use crate::{RecordBatchStream, SendableRecordBatchStream}; use arrow::array::*; -use arrow::compute::SortOptions; use arrow::datatypes::SchemaRef; use datafusion_common::{internal_err, DataFusionError, Result}; use datafusion_execution::disk_manager::RefCountedTempFile; @@ -55,6 +54,7 @@ use datafusion_physical_expr_common::sort_expr::LexOrdering; use futures::ready; use futures::stream::{Stream, StreamExt}; use log::debug; +use datafusion_common::sort::SortOptions; #[derive(Debug, Clone)] /// This object tracks the aggregation phase (input/output) diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index 6c933ca21807..059aaa318621 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -55,7 +55,7 @@ use crate::{ use arrow::array::{types::UInt64Type, *}; use arrow::compute::{ - self, concat_batches, filter_record_batch, is_not_null, take, SortOptions, + self, concat_batches, filter_record_batch, is_not_null, take, }; use arrow::datatypes::{DataType, SchemaRef, TimeUnit}; use arrow::error::ArrowError; @@ -73,6 +73,7 @@ use datafusion_physical_expr::PhysicalExprRef; use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement}; use futures::{Stream, StreamExt}; +use datafusion_common::sort::SortOptions; /// Join execution plan that executes equi-join predicates on multiple partitions using Sort-Merge /// join algorithm and applies an optional filter post join. Can be used to join arbitrarily large @@ -188,11 +189,11 @@ impl SortMergeJoinExec { .map(|((l, r), sort_op)| { let left = PhysicalSortExpr { expr: Arc::clone(l), - options: *sort_op, + options: sort_op.clone(), }; let right = PhysicalSortExpr { expr: Arc::clone(r), - options: *sort_op, + options: sort_op.clone(), }; (left, right) }) diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index 47af4ab9a765..27557f96c504 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -730,7 +730,7 @@ fn determine_prune_length( }; // Perform binary search on the array to determine the length of the record batch to be pruned - bisect::(&[batch_arr], &[target], &[origin_sorted_expr.options]) + bisect::(&[batch_arr], &[target], &[origin_sorted_expr.options.clone()]) } /// This method determines if the result of the join should be produced in the final step or not. diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index cffc4b4bff8e..555ef8d57206 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -489,7 +489,7 @@ fn offset_ordering( .iter() .map(|sort_expr| PhysicalSortExpr { expr: add_offset_to_expr(Arc::clone(&sort_expr.expr), offset), - options: sort_expr.options, + options: sort_expr.options.clone(), }) .collect(), _ => ordering.clone(), @@ -2646,25 +2646,25 @@ mod tests { let left_ordering = LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options, + options: options, }, PhysicalSortExpr { expr: Arc::new(Column::new("c", 2)), - options, + options: options, }, PhysicalSortExpr { expr: Arc::new(Column::new("d", 3)), - options, + options: options, }, ]); let right_ordering = LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("z", 2)), - options, + options: options, }, PhysicalSortExpr { expr: Arc::new(Column::new("y", 1)), - options, + options: options, }, ]); let join_type = JoinType::Inner; @@ -2680,45 +2680,45 @@ mod tests { Some(LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options, + options: options, }, PhysicalSortExpr { expr: Arc::new(Column::new("c", 2)), - options, + options: options, }, PhysicalSortExpr { expr: Arc::new(Column::new("d", 3)), - options, + options: options, }, PhysicalSortExpr { expr: Arc::new(Column::new("z", 7)), - options, + options: options, }, PhysicalSortExpr { expr: Arc::new(Column::new("y", 6)), - options, + options: options, }, ])), Some(LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("z", 7)), - options, + options: options, }, PhysicalSortExpr { expr: Arc::new(Column::new("y", 6)), - options, + options: options, }, PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options, + options: options, }, PhysicalSortExpr { expr: Arc::new(Column::new("c", 2)), - options, + options: options, }, PhysicalSortExpr { expr: Arc::new(Column::new("d", 3)), - options, + options: options, }, ])), ]; diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 40e68cfcae83..9d11b205e02e 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -1700,7 +1700,7 @@ mod test { let options = SortOptions::default(); LexOrdering::new(vec![PhysicalSortExpr { expr: col("c0", schema).unwrap(), - options, + options: options, }]) } diff --git a/datafusion/physical-plan/src/sorts/cursor.rs b/datafusion/physical-plan/src/sorts/cursor.rs index 8ea7c43d2613..e97f8dcf21c7 100644 --- a/datafusion/physical-plan/src/sorts/cursor.rs +++ b/datafusion/physical-plan/src/sorts/cursor.rs @@ -22,9 +22,9 @@ use arrow::array::{ PrimitiveArray, }; use arrow::buffer::{Buffer, OffsetBuffer, ScalarBuffer}; -use arrow::compute::SortOptions; use arrow::datatypes::ArrowNativeTypeOp; use arrow::row::Rows; +use datafusion_common::sort::SortOptions; use datafusion_execution::memory_pool::MemoryReservation; /// A comparable collection of values for use with [`Cursor`] diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index d84068527a64..159898566987 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -47,7 +47,7 @@ use crate::{ use arrow::array::{ Array, RecordBatch, RecordBatchOptions, StringViewArray, UInt32Array, }; -use arrow::compute::{concat_batches, lexsort_to_indices, take_arrays, SortColumn}; +use arrow::compute::{concat_batches, take_arrays}; use arrow::datatypes::{DataType, SchemaRef}; use arrow::row::{RowConverter, SortField}; use datafusion_common::{internal_err, Result}; @@ -60,6 +60,7 @@ use datafusion_physical_expr_common::sort_expr::LexRequirement; use futures::{StreamExt, TryStreamExt}; use log::{debug, trace}; +use datafusion_common::sort::{lexsort_to_indices, SortColumn}; struct ExternalSorterMetrics { /// metrics @@ -788,7 +789,7 @@ pub(crate) fn lexsort_to_indices_multi_columns( |(mut fields, mut columns), sort_column| { fields.push(SortField::new_with_options( sort_column.values.data_type().clone(), - sort_column.options.unwrap_or_default(), + sort_column.options.map(|o| o.to_arrow().expect("TODO")).unwrap_or_default(), )); columns.push(sort_column.values); (fields, columns) @@ -1162,7 +1163,7 @@ impl ExecutionPlan for SortExec { }; updated_exprs.push(PhysicalSortExpr { expr: new_expr, - options: sort.options, + options: sort.options.clone(), }); } diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index 454a06855175..22a126131f80 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -357,7 +357,7 @@ impl ExecutionPlan for SortPreservingMergeExec { }; updated_exprs.push(PhysicalSortExpr { expr: updated_expr, - options: sort.options, + options: sort.options.clone(), }); } diff --git a/datafusion/physical-plan/src/sorts/stream.rs b/datafusion/physical-plan/src/sorts/stream.rs index e029c60b285b..9fe21f829759 100644 --- a/datafusion/physical-plan/src/sorts/stream.rs +++ b/datafusion/physical-plan/src/sorts/stream.rs @@ -22,7 +22,7 @@ use arrow::array::Array; use arrow::datatypes::Schema; use arrow::record_batch::RecordBatch; use arrow::row::{RowConverter, SortField}; -use datafusion_common::Result; +use datafusion_common::{internal_err, Result}; use datafusion_execution::memory_pool::MemoryReservation; use datafusion_physical_expr_common::sort_expr::LexOrdering; use futures::stream::{Fuse, StreamExt}; @@ -101,7 +101,10 @@ impl RowCursorStream { .iter() .map(|expr| { let data_type = expr.expr.data_type(schema)?; - Ok(SortField::new_with_options(data_type, expr.options)) + match expr.options.to_arrow() { + Ok(options) => Ok(SortField::new_with_options(data_type, options)), + Err(_) => internal_err!("Custom orderings not supported in RowCursorStream."), + } }) .collect::>>()?; @@ -195,7 +198,7 @@ impl FieldCursorStream { let mut array_reservation = self.reservation.new_empty(); array_reservation.try_grow(size_in_mem)?; Ok(ArrayValues::new( - self.sort.options, + self.sort.options.clone(), array, array_reservation, )) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 85de1eefce2e..004600fcd0b9 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -29,7 +29,7 @@ use crate::spill::get_record_batch_memory_size; use crate::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream}; use arrow::array::{Array, ArrayRef, RecordBatch}; use arrow::datatypes::SchemaRef; -use datafusion_common::HashMap; +use datafusion_common::{HashMap, _internal_datafusion_err}; use datafusion_common::Result; use datafusion_execution::{ memory_pool::{MemoryConsumer, MemoryReservation}, @@ -115,7 +115,7 @@ impl TopK { .map(|e| { Ok(SortField::new_with_options( e.expr.data_type(&schema)?, - e.options, + e.options.to_arrow().map_err(|_| _internal_datafusion_err!("Custom sorts not supported in TopK"))?, )) }) .collect::>()?; diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index d38bf2a186a8..d1fdb1cd27f0 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -31,7 +31,6 @@ use crate::{ }; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use arrow_schema::SortOptions; use datafusion_common::{exec_err, Result}; use datafusion_expr::{ PartitionEvaluator, ReversedUDWF, SetMonotonicity, WindowFrame, @@ -53,6 +52,8 @@ use itertools::Itertools; // Public interface: pub use bounded_window_agg_exec::BoundedWindowAggExec; +use datafusion_common::sort::SortOptions; +use datafusion_common::types::SortOrdering; pub use datafusion_physical_expr::window::{ PlainAggregateWindowExpr, StandardWindowExpr, WindowExpr, }; @@ -268,7 +269,10 @@ impl StandardWindowFunctionExpr for WindowUDFExpr { .zip(schema.column_with_name(self.name())) .map(|(options, (idx, field))| { let expr = Arc::new(Column::new(field.name(), idx)); - PhysicalSortExpr { expr, options } + PhysicalSortExpr { + expr, + options: options.clone(), + } }) } } @@ -289,11 +293,14 @@ pub(crate) fn calc_requirements< .collect::>(), ); for element in orderby_sort_exprs.into_iter() { - let PhysicalSortExpr { expr, options } = element.borrow(); + let PhysicalSortExpr { + expr, + options, + } = element.borrow(); if !sort_reqs.iter().any(|e| e.expr.eq(expr)) { sort_reqs.push(PhysicalSortRequirement::new( Arc::clone(expr), - Some(*options), + Some(options.clone()), )); } } @@ -428,7 +435,7 @@ pub(crate) fn window_equivalence_properties( let new_ordering = vec![LexOrdering::new(vec![PhysicalSortExpr::new( Arc::new(window_col), - SortOptions::new(increasing, true), + SortOptions::new(SortOrdering::Default, increasing, true), )])]; window_eq_properties.add_new_orderings(new_ordering); } else { @@ -437,7 +444,7 @@ pub(crate) fn window_equivalence_properties( let mut existing = lex.take_exprs(); existing.push(PhysicalSortExpr::new( Arc::new(window_col.clone()), - SortOptions::new(increasing, true), + SortOptions::new(SortOrdering::Default, increasing, true), )); window_eq_properties .add_new_ordering(LexOrdering::new(existing)); @@ -472,14 +479,14 @@ pub(crate) fn window_equivalence_properties( let new_ordering = LexOrdering::new(vec![PhysicalSortExpr::new( Arc::new(window_col), - SortOptions::new(false, false), + SortOptions::new(SortOrdering::Default, false, false), )]); window_eq_properties.add_new_ordering(new_ordering); } else if !increasing && (!asc || no_partitioning) { let new_ordering = LexOrdering::new(vec![PhysicalSortExpr::new( Arc::new(window_col), - SortOptions::new(true, false), + SortOptions::new(SortOrdering::Default, true, false), )]); window_eq_properties.add_new_ordering(new_ordering); }; @@ -626,8 +633,11 @@ pub fn get_window_mode( fn sort_options_resolving_constant(expr: Arc) -> Vec { vec![ - PhysicalSortExpr::new(Arc::clone(&expr), SortOptions::new(false, false)), - PhysicalSortExpr::new(expr, SortOptions::new(true, true)), + PhysicalSortExpr::new( + Arc::clone(&expr), + SortOptions::new(SortOrdering::Default, false, false), + ), + PhysicalSortExpr::new(expr, SortOptions::new(SortOrdering::Default, true, true)), ] } @@ -689,7 +699,7 @@ mod tests { ) -> PhysicalSortExpr { PhysicalSortExpr { expr: col(name, schema).unwrap(), - options, + options: options, } } @@ -753,7 +763,10 @@ mod tests { descending, nulls_first, }; - orderbys.push(PhysicalSortExpr { expr, options }); + orderbys.push(PhysicalSortExpr { + expr, + options: options, + }); } let mut expected: Option = None; @@ -992,7 +1005,10 @@ mod tests { // Give default ordering, this is same with input ordering direction // In this test we do check for reversibility. let options = SortOptions::default(); - order_by_exprs.push(PhysicalSortExpr { expr, options }); + order_by_exprs.push(PhysicalSortExpr { + expr, + options: options, + }); } let res = get_window_mode( &partition_by_exprs, @@ -1158,7 +1174,10 @@ mod tests { descending: *descending, nulls_first: *nulls_first, }; - order_by_exprs.push(PhysicalSortExpr { expr, options }); + order_by_exprs.push(PhysicalSortExpr { + expr, + options: options, + }); } assert_eq!( diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 6331b7fb3114..0fd554ee5af5 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -79,7 +79,7 @@ pub fn parse_physical_sort_expr( descending: !proto.asc, nulls_first: proto.nulls_first, }; - Ok(PhysicalSortExpr { expr, options }) + Ok(PhysicalSortExpr { expr, options: options }) } else { Err(proto_error("Unexpected empty physical expression")) } diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index 3f67842fe625..01a1c5bd2a4e 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -180,7 +180,7 @@ pub fn serialize_physical_sort_expr( sort_expr: PhysicalSortExpr, codec: &dyn PhysicalExtensionCodec, ) -> Result { - let PhysicalSortExpr { expr, options } = sort_expr; + let PhysicalSortExpr { expr, options: options } = sort_expr; let expr = serialize_physical_expr(&expr, codec)?; Ok(PhysicalSortExprNode { expr: Some(Box::new(expr)), From 7a02df760977c6fa50ce3ecdabd0d8f6380ca7d0 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Mon, 3 Mar 2025 22:27:38 +0100 Subject: [PATCH 05/14] Separate AdvSortOption and SortOption and use the latter if custom sorting is not supported. --- datafusion/catalog/src/session.rs | 2 +- datafusion/common/src/lib.rs | 2 +- datafusion/common/src/sort.rs | 156 ++++++++++++++++++ datafusion/common/src/types/logical.rs | 27 +-- datafusion/common/src/types/native.rs | 3 +- datafusion/common/src/utils/mod.rs | 11 +- .../core/src/execution/session_state.rs | 2 +- datafusion/core/src/physical_planner.rs | 5 +- datafusion/core/tests/dataframe/mod.rs | 18 +- .../tests/fuzz_cases/equivalence/ordering.rs | 8 +- .../tests/fuzz_cases/equivalence/utils.rs | 29 ++-- .../sort_preserving_repartition_fuzz.rs | 8 +- datafusion/core/tests/memory_limit/mod.rs | 4 +- .../replace_with_order_preserving_variants.rs | 2 +- .../tests/physical_optimizer/test_utils.rs | 2 +- datafusion/datasource/src/statistics.rs | 13 +- datafusion/expr-common/src/sort_properties.rs | 30 ++-- datafusion/expr/src/registry.rs | 10 +- datafusion/expr/src/udwf.rs | 5 +- datafusion/expr/src/window_state.rs | 18 +- .../src/merge_arrays.rs | 16 +- .../functions-aggregate-common/src/utils.rs | 9 +- .../functions-aggregate/src/array_agg.rs | 4 +- .../functions-aggregate/src/first_last.rs | 44 ++--- .../functions-aggregate/src/nth_value.rs | 8 +- datafusion/functions-window/src/rank.rs | 4 +- datafusion/functions-window/src/row_number.rs | 4 +- datafusion/functions/src/math/log.rs | 11 +- .../physical-expr-common/src/sort_expr.rs | 50 +++--- datafusion/physical-expr/src/aggregate.rs | 13 +- .../physical-expr/src/equivalence/ordering.rs | 4 +- .../src/equivalence/properties.rs | 86 +++++----- .../physical-expr/src/expressions/binary.rs | 4 +- .../physical-expr/src/window/standard.rs | 43 +++-- .../physical-expr/src/window/window_expr.rs | 23 ++- .../src/enforce_distribution.rs | 7 +- .../src/enforce_sorting/sort_pushdown.rs | 10 +- .../src/aggregates/order/partial.rs | 8 +- .../physical-plan/src/aggregates/row_hash.rs | 4 +- .../src/joins/sort_merge_join.rs | 17 +- .../src/joins/symmetric_hash_join.rs | 6 +- datafusion/physical-plan/src/joins/utils.rs | 30 ++-- .../physical-plan/src/repartition/mod.rs | 2 +- datafusion/physical-plan/src/sorts/cursor.rs | 2 +- datafusion/physical-plan/src/sorts/sort.rs | 5 +- datafusion/physical-plan/src/sorts/stream.rs | 6 +- datafusion/physical-plan/src/topk/mod.rs | 6 +- .../src/windows/bounded_window_agg_exec.rs | 6 +- datafusion/physical-plan/src/windows/mod.rs | 71 ++++---- .../proto/src/physical_plan/from_proto.rs | 2 +- .../proto/src/physical_plan/to_proto.rs | 5 +- .../tests/cases/roundtrip_physical_plan.rs | 6 +- 52 files changed, 531 insertions(+), 340 deletions(-) create mode 100644 datafusion/common/src/sort.rs diff --git a/datafusion/catalog/src/session.rs b/datafusion/catalog/src/session.rs index 997260787770..0f8436064ed6 100644 --- a/datafusion/catalog/src/session.rs +++ b/datafusion/catalog/src/session.rs @@ -17,6 +17,7 @@ use async_trait::async_trait; use datafusion_common::config::ConfigOptions; +use datafusion_common::types::LogicalTypeRef; use datafusion_common::{DFSchema, Result}; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; @@ -28,7 +29,6 @@ use parking_lot::{Mutex, RwLock}; use std::any::Any; use std::collections::HashMap; use std::sync::{Arc, Weak}; -use datafusion_common::types::LogicalTypeRef; /// Interface for accessing [`SessionState`] from the catalog. /// diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index f3bb017bc8ea..acb9c14c0b24 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -48,13 +48,13 @@ pub mod instant; pub mod parsers; pub mod rounding; pub mod scalar; +pub mod sort; pub mod spans; pub mod stats; pub mod test_util; pub mod tree_node; pub mod types; pub mod utils; -pub mod sort; /// Reexport arrow crate pub use arrow; diff --git a/datafusion/common/src/sort.rs b/datafusion/common/src/sort.rs new file mode 100644 index 000000000000..be056f80efdd --- /dev/null +++ b/datafusion/common/src/sort.rs @@ -0,0 +1,156 @@ +use crate::error::_internal_err; +use crate::types::SortOrdering; +use crate::Result; +use arrow::array::{ArrayRef, DynComparator, UInt32Array}; +use arrow::compute::SortOptions; +use arrow::error::ArrowError; + +/// TODO +#[derive(Clone, Debug, Default, Hash, PartialEq, Eq)] +pub struct AdvSortOptions { + /// Specifies the ordering that is used for sorting. This enables implementing user-defined + /// sorting. + pub ordering: SortOrdering, + /// Whether to sort in descending order + pub descending: bool, + /// Whether to sort nulls first + pub nulls_first: bool, +} + +impl AdvSortOptions { + /// Creates a new [AdvSortOptions]. + pub fn new(ordering: SortOrdering, descending: bool, nulls_first: bool) -> Self { + Self { + ordering, + descending, + nulls_first, + } + } + + /// Creates a new [AdvSortOptions] with a default ordering from the arrow [SortOption]. + pub fn with_default_ordering(options: SortOptions) -> Self { + Self::new( + SortOrdering::Default, + options.descending, + options.nulls_first, + ) + } + + /// Tries to create an [SortOptions] with the same `descending` and `nulls_first`. + /// + /// # Errors + /// + /// This method will return an error if a custom [SortOrdering] is used. + pub fn to_arrow(&self) -> Result { + match self.ordering { + SortOrdering::Default => Ok(SortOptions { + descending: self.descending, + nulls_first: self.nulls_first, + }), + SortOrdering::Custom(_) => { + _internal_err!("Cannot create arrow SortOptions with custom ordering") + } + } + } + + /// Returns a [AdvSortOptions] with a flipped descending. + /// + /// This does not change the order of nulls. + pub fn with_reversed_order(mut self) -> Self { + self.descending = !self.descending; + self + } + + /// Returns a [AdvSortOptions] with the given `value` for `descending`. + pub fn with_descending(mut self, value: bool) -> Self { + self.descending = value; + self + } + + /// Returns a [AdvSortOptions] with the given `value` for `nulls_first`. + pub fn with_nulls_first(mut self, value: bool) -> Self { + self.nulls_first = value; + self + } +} + +/// TODO +#[derive(Clone, Debug)] +pub struct AdvSortColumn { + pub values: ArrayRef, + pub options: Option, +} + +impl AdvSortColumn { + pub fn dyn_compartor(&self) -> DynComparator { + todo!() + } + + pub fn to_arrow(&self) -> Result { + let has_custom_sort = self + .options + .as_ref() + .map(|opt| opt.ordering != SortOrdering::Default) + .unwrap_or(false); + match has_custom_sort { + true => _internal_err!("Cannot create arrow SortColumn with custom sort"), + false => Ok(arrow::compute::SortColumn { + values: self.values.clone(), + options: self.options.as_ref().map(|o| o.to_arrow().unwrap()), + }), + } + } +} + +/// Sort elements lexicographically from a list of `ArrayRef` into an unsigned integer +/// (`UInt32Array`) of indices. +pub fn lexsort_to_indices( + columns: &[AdvSortColumn], + limit: Option, +) -> std::result::Result { + if columns.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Sort requires at least one column".to_string(), + )); + } + + let all_columns_default_ordering = columns + .iter() + .map(|c| c.to_arrow()) + .collect::>>(); + if let Ok(columns) = all_columns_default_ordering { + return arrow::compute::lexsort_to_indices(&columns, limit); + } + + todo!("Custom sorting not yet implemented.") + // + // if columns.len() == 1 && can_sort_to_indices(columns[0].values.data_type()) { + // // fallback to non-lexical sort + // let column = &columns[0]; + // return sort_to_indices(&column.values, column.options, limit); + // } + // + // let row_count = columns[0].values.len(); + // if columns.iter().any(|item| item.values.len() != row_count) { + // return Err(ArrowError::ComputeError( + // "lexical sort columns have different row counts".to_string(), + // )); + // }; + // + // let mut value_indices = (0..row_count).collect::>(); + // let mut len = value_indices.len(); + // + // if let Some(limit) = limit { + // len = limit.min(len); + // } + // + // let lexicographical_comparator = LexicographicalComparator::try_new(columns)?; + // // uint32 can be sorted unstably + // sort_unstable_by(&mut value_indices, len, |a, b| { + // lexicographical_comparator.compare(*a, *b) + // }); + // + // Ok(UInt32Array::from_iter_values( + // value_indices.iter().take(len).map(|i| *i as u32), + // )) +} diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index 99aa85d38a67..85d20ccd18bf 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -17,6 +17,7 @@ use super::NativeType; use crate::error::Result; +use crate::ScalarValue; use arrow::array::ArrayRef; use arrow::compute::SortOptions; use arrow::datatypes::DataType; @@ -24,7 +25,6 @@ use core::fmt; use std::fmt::Debug; use std::hash::Hasher; use std::{cmp::Ordering, hash::Hash, sync::Arc}; -use crate::ScalarValue; /// Signature that uniquely identifies a type among other types. #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] @@ -32,26 +32,13 @@ pub enum TypeSignature<'a> { /// Represents a built-in native type. Native(&'a NativeType), /// Represents an arrow-compatible extension type. - Extension(ExtensionTypeSignature<'a>), -} - -/// Represents an arrow-compatible extension type. -/// () -/// -/// The `name` should contain the same value as 'ARROW:extension:name'. -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub struct ExtensionTypeSignature<'a> { - name: &'a str, - parameters: &'a [TypeParameter<'a>], -} - -impl ExtensionTypeSignature<'_> { - /// Returns the name of the extension type. + /// () /// /// The `name` should contain the same value as 'ARROW:extension:name'. - pub fn name(&self) -> &str { - &self.name - } + Extension { + name: &'a str, + parameters: &'a [TypeParameter<'a>], + }, } #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] @@ -185,7 +172,7 @@ impl SortOrdering { pub fn partial_cmp(&self, lhs: &ScalarValue, rhs: &ScalarValue) -> Option { match self { SortOrdering::Default => lhs.partial_cmp(rhs), - SortOrdering::Custom(_) => todo!("custom order") + SortOrdering::Custom(_) => todo!("custom order"), } } } diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 7172803a9c99..0355f9868b1a 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -17,8 +17,7 @@ use super::{ LogicalField, LogicalFieldRef, LogicalFields, LogicalType, - LogicalTypePlanningInformation, LogicalUnionFields, SortOrdering, - TypeSignature, + LogicalTypePlanningInformation, LogicalUnionFields, SortOrdering, TypeSignature, }; use crate::error::{Result, _internal_err}; use arrow::compute::can_cast_types; diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index c954f525db0b..ff9cdedab8b1 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -29,7 +29,7 @@ use arrow::array::{ OffsetSizeTrait, }; use arrow::buffer::OffsetBuffer; -use arrow::compute::{partition}; +use arrow::compute::{partition, SortColumn, SortOptions}; use arrow::datatypes::{DataType, Field, SchemaRef}; use sqlparser::ast::Ident; use sqlparser::dialect::GenericDialect; @@ -41,7 +41,6 @@ use std::num::NonZero; use std::ops::Range; use std::sync::Arc; use std::thread::available_parallelism; -use crate::sort::{SortColumn, SortOptions}; /// Applies an optional projection to a [`SchemaRef`], returning the /// projected schema @@ -101,13 +100,13 @@ pub fn compare_rows( // Preserving lexical ordering. for ((lhs, rhs), sort_options) in zip_it { // Consider all combinations of NULLS FIRST/LAST and ASC/DESC configurations. - let result = match (lhs.is_null(), rhs.is_null(), sort_options.nulls_first()) { + let result = match (lhs.is_null(), rhs.is_null(), sort_options.nulls_first) { (true, false, false) | (false, true, true) => Ordering::Greater, (true, false, true) | (false, true, false) => Ordering::Less, - (false, false, _) => if sort_options.descending() { - sort_options.ordering().partial_cmp(rhs, lhs) + (false, false, _) => if sort_options.descending { + rhs.partial_cmp(lhs) } else { - sort_options.ordering().partial_cmp(lhs, rhs) + lhs.partial_cmp(rhs) } .ok_or_else(|| { _internal_datafusion_err!("Column array shouldn't be empty") diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 51bf3cc3e805..5265d00558bc 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -73,6 +73,7 @@ use datafusion_sql::planner::{ContextProvider, ParserOptions, PlannerContext, Sq use async_trait::async_trait; use chrono::{DateTime, Utc}; +use datafusion_common::types::LogicalTypeRef; use itertools::Itertools; use log::{debug, info}; use object_store::ObjectStore; @@ -80,7 +81,6 @@ use sqlparser::ast::{Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias}; use sqlparser::dialect::dialect_from_str; use url::Url; use uuid::Uuid; -use datafusion_common::types::LogicalTypeRef; /// `SessionState` contains all the necessary state to plan and execute queries, /// such as configuration, functions, and runtime environment. Please see the diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 26d265033513..699723f0e68a 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -61,6 +61,7 @@ use datafusion_physical_plan::recursive_query::RecursiveQueryExec; use arrow::array::{builder::StringBuilder, RecordBatch}; use arrow::datatypes::{Schema, SchemaRef}; +use arrow_schema::SortOptions; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; use datafusion_common::{ @@ -89,12 +90,12 @@ use datafusion_physical_plan::unnest::ListUnnest; use crate::schema_equivalence::schema_satisfied_by; use async_trait::async_trait; +use datafusion_common::sort::AdvSortOptions; use futures::{StreamExt, TryStreamExt}; use itertools::{multiunzip, Itertools}; use log::{debug, trace}; use sqlparser::ast::NullTreatment; use tokio::sync::Mutex; -use datafusion_common::sort::SortOptions; /// Physical query planner that converts a `LogicalPlan` to an /// `ExecutionPlan` suitable for execution. @@ -1688,7 +1689,7 @@ pub fn create_physical_sort_expr( } = e; Ok(PhysicalSortExpr { expr: create_physical_expr(expr, input_dfschema, execution_props)?, - options: SortOptions { + options: AdvSortOptions { descending: !asc, nulls_first: *nulls_first, }, diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 6295fafe7741..b471a2e11281 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -19,9 +19,17 @@ mod dataframe_functions; mod describe; -use arrow::array::{record_batch, Array, ArrayRef, BooleanArray, DictionaryArray, FixedSizeListArray, FixedSizeListBuilder, Float32Array, Float64Array, Int32Array, Int32Builder, Int8Array, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray, StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray, UnionBuilder}; +use arrow::array::{ + record_batch, Array, ArrayRef, BooleanArray, DictionaryArray, FixedSizeListArray, + FixedSizeListBuilder, Float32Array, Float64Array, Int32Array, Int32Builder, + Int8Array, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray, + StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray, UnionBuilder, +}; use arrow::buffer::ScalarBuffer; -use arrow::datatypes::{DataType, Field, Float32Type, Float64Type, Int32Type, Schema, SchemaRef, UInt64Type, UnionFields, UnionMode}; +use arrow::datatypes::{ + DataType, Field, Float32Type, Float64Type, Int32Type, Schema, SchemaRef, UInt64Type, + UnionFields, UnionMode, +}; use arrow::error::ArrowError; use arrow::util::pretty::pretty_format_batches; use datafusion_expr::utils::COUNT_STAR_EXPANSION; @@ -31,12 +39,12 @@ use datafusion_functions_aggregate::expr_fn::{ }; use datafusion_functions_nested::make_array::make_array_udf; use datafusion_functions_window::expr_fn::{first_value, row_number}; +use futures::StreamExt; use object_store::local::LocalFileSystem; use sqlparser::ast::NullTreatment; use std::collections::HashMap; use std::fs; use std::sync::Arc; -use futures::StreamExt; use tempfile::TempDir; use url::Url; @@ -2845,8 +2853,8 @@ async fn sort_on_union_with_logical_type() -> Result<()> { (0, Arc::new(Field::new("A", DataType::Int32, false))), (1, Arc::new(Field::new("B", DataType::Float64, false))), ] - .into_iter() - .collect(); + .into_iter() + .collect(); let schema = Schema::new(vec![Field::new( "my_union", DataType::Union(fields, UnionMode::Dense), diff --git a/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs b/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs index 47243b9b4a92..cc38d1674176 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs @@ -21,6 +21,8 @@ use crate::fuzz_cases::equivalence::utils::{ is_table_same_after_sort, TestScalarUDF, }; use arrow::compute::SortOptions; +use datafusion_common::sort::AdvSortOptions; +use datafusion_common::types::SortOrdering; use datafusion_common::Result; use datafusion_expr::{Operator, ScalarUDF}; use datafusion_physical_expr::expressions::{col, BinaryExpr}; @@ -309,7 +311,11 @@ fn test_ordering_satisfy_with_equivalence() -> Result<()> { .into_iter() .map(|(expr, options)| PhysicalSortExpr { expr: Arc::clone(expr), - options: options, + options: AdvSortOptions::new( + SortOrdering::Default, + options.descending, + options.nulls_first, + ), }) .collect::(); diff --git a/datafusion/core/tests/fuzz_cases/equivalence/utils.rs b/datafusion/core/tests/fuzz_cases/equivalence/utils.rs index 040155358118..f89cfb8150f3 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/utils.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/utils.rs @@ -387,18 +387,23 @@ pub fn generate_table_for_eq_properties( for ordering in eq_properties.oeq_class().iter() { let (sort_columns, indices): (Vec<_>, Vec<_>) = ordering .iter() - .map(|PhysicalSortExpr { expr, options: options }| { - let col = expr.as_any().downcast_ref::().unwrap(); - let (idx, _field) = schema.column_with_name(col.name()).unwrap(); - let arr = generate_random_array(n_elem, n_distinct); - ( - SortColumn { - values: arr, - options: Some(*options), - }, - idx, - ) - }) + .map( + |PhysicalSortExpr { + expr, + options: options, + }| { + let col = expr.as_any().downcast_ref::().unwrap(); + let (idx, _field) = schema.column_with_name(col.name()).unwrap(); + let arr = generate_random_array(n_elem, n_distinct); + ( + SortColumn { + values: arr, + options: Some(*options), + }, + idx, + ) + }, + ) .unzip(); let sort_arrs = arrow::compute::lexsort(&sort_columns, None)?; diff --git a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs index 35fb208a7f6e..b18fb3008b4c 100644 --- a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs @@ -20,9 +20,9 @@ mod sp_repartition_fuzz_tests { use std::sync::Arc; use arrow::array::{ArrayRef, Int64Array, RecordBatch, UInt64Array}; - use arrow::compute::{concat_batches, lexsort}; + use arrow::compute::{concat_batches, lexsort, SortColumn}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; - + use arrow_schema::SortOptions; use datafusion::physical_plan::{ collect, metrics::{BaselineMetrics, ExecutionPlanMetricsSet}, @@ -162,14 +162,14 @@ mod sp_repartition_fuzz_tests { for ordering in eq_properties.oeq_class().iter() { let (sort_columns, indices): (Vec<_>, Vec<_>) = ordering .iter() - .map(|PhysicalSortExpr { expr, options: options }| { + .map(|PhysicalSortExpr { expr, options }| { let col = expr.as_any().downcast_ref::().unwrap(); let (idx, _field) = schema.column_with_name(col.name()).unwrap(); let arr = generate_random_array(n_elem, n_distinct); ( SortColumn { values: arr, - options: Some(*options), + options: Some(options.to_arrow().unwrap()), }, idx, ) diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index 2ced21c3171a..2deb8fde2da6 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -731,11 +731,11 @@ impl Scenario { let sort_information = vec![LexOrdering::new(vec![ PhysicalSortExpr { expr: col("a", &schema).unwrap(), - options: options, + options, }, PhysicalSortExpr { expr: col("b", &schema).unwrap(), - options: options, + options, }, ])]; diff --git a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs index 64a19e1a9a67..58eb866c590c 100644 --- a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs +++ b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs @@ -1141,7 +1141,7 @@ fn sort_expr_options( ) -> PhysicalSortExpr { PhysicalSortExpr { expr: col(name, schema).unwrap(), - options: options, + options, } } diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index 73e54f6e5df1..0b9c3b80bb93 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -199,7 +199,7 @@ pub fn sort_expr_options( ) -> PhysicalSortExpr { PhysicalSortExpr { expr: col(name, schema).unwrap(), - options: options, + options, } } diff --git a/datafusion/datasource/src/statistics.rs b/datafusion/datasource/src/statistics.rs index 9cd0225853c2..4ff6f59b3822 100644 --- a/datafusion/datasource/src/statistics.rs +++ b/datafusion/datasource/src/statistics.rs @@ -27,10 +27,10 @@ use crate::PartitionedFile; use arrow::array::RecordBatch; use arrow::datatypes::SchemaRef; use arrow::{ + compute::SortColumn, row::{Row, Rows}, }; use datafusion_common::{plan_err, DataFusionError, Result}; -use datafusion_common::sort::SortColumn; use datafusion_physical_expr::{expressions::Column, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; @@ -175,9 +175,12 @@ impl MinMaxStatistics { let sort_fields = sort_order .iter() .map(|expr| { - expr.expr - .data_type(schema) - .map(|data_type| SortField::new_with_options(data_type, expr.options.to_arrow().expect("TODO"))) + expr.expr.data_type(schema).and_then(|data_type| { + Ok(SortField::new_with_options( + data_type, + expr.options.to_arrow()?, + )) + }) }) .collect::>>() .map_err(|e| e.context("create sort fields"))?; @@ -233,7 +236,7 @@ impl MinMaxStatistics { Ok(SortColumn { values: Arc::clone(values.column(idx)), - options: Some(sort_expr.options.clone()), + options: Some(sort_expr.options.to_arrow()?), }) }) .collect::>>() diff --git a/datafusion/expr-common/src/sort_properties.rs b/datafusion/expr-common/src/sort_properties.rs index f7205122e237..1a5ee26f5005 100644 --- a/datafusion/expr-common/src/sort_properties.rs +++ b/datafusion/expr-common/src/sort_properties.rs @@ -20,9 +20,9 @@ use std::ops::Neg; use crate::interval_arithmetic::Interval; use arrow::datatypes::DataType; -use datafusion_common::sort::SortOptions; +use datafusion_common::sort::AdvSortOptions; -/// To propagate [`SortOptions`] across the `PhysicalExpr`, it is insufficient +/// To propagate [`AdvSortOptions`] across the `PhysicalExpr`, it is insufficient /// to simply use `Option`: There must be a differentiation between /// unordered columns and literal values, since literals may not break the ordering /// when they are used as a child of some binary expression when the other child has @@ -35,8 +35,8 @@ use datafusion_common::sort::SortOptions; /// often more ordering-friendly under most mathematical operations. #[derive(PartialEq, Debug, Clone, Default)] pub enum SortProperties { - /// Use the ordinary [`SortOptions`] struct to represent ordered data - Ordered(SortOptions), + /// Use the ordinary [`AdvSortOptions`] struct to represent ordered data + Ordered(AdvSortOptions), /// This alternative represents unordered data: #[default] Unordered, @@ -50,12 +50,11 @@ impl SortProperties { (Self::Singleton, _) => rhs.clone(), (_, Self::Singleton) => self.clone(), (Self::Ordered(lhs), Self::Ordered(rhs)) - if lhs.ordering() == rhs.ordering() - && lhs.descending() == rhs.descending() => + if lhs.ordering == rhs.ordering && lhs.descending == rhs.descending => { Self::Ordered( lhs.clone() - .with_nulls_first(lhs.nulls_first() || rhs.nulls_first()), + .with_nulls_first(lhs.nulls_first || rhs.nulls_first), ) } _ => Self::Unordered, @@ -66,16 +65,15 @@ impl SortProperties { match (self, rhs) { (Self::Singleton, Self::Singleton) => Self::Singleton, (Self::Singleton, Self::Ordered(rhs)) => { - Self::Ordered(rhs.clone().with_descending(!rhs.descending())) + Self::Ordered(rhs.clone().with_descending(!rhs.descending)) } (_, Self::Singleton) => self.clone(), (Self::Ordered(lhs), Self::Ordered(rhs)) - if lhs.ordering() == rhs.ordering() - && lhs.descending() != rhs.descending() => + if lhs.ordering == rhs.ordering && lhs.descending != rhs.descending => { Self::Ordered( lhs.clone() - .with_nulls_first(lhs.nulls_first() || rhs.nulls_first()), + .with_nulls_first(lhs.nulls_first || rhs.nulls_first), ) } _ => Self::Unordered, @@ -85,12 +83,11 @@ impl SortProperties { pub fn gt_or_gteq(&self, rhs: &Self) -> Self { match (self, rhs) { (Self::Singleton, Self::Ordered(rhs)) => { - Self::Ordered(rhs.clone().with_descending(!rhs.descending())) + Self::Ordered(rhs.clone().with_descending(!rhs.descending)) } (_, Self::Singleton) => self.clone(), (Self::Ordered(lhs), Self::Ordered(rhs)) - if lhs.ordering() == rhs.ordering() - && lhs.descending() != rhs.descending() => + if lhs.ordering == rhs.ordering && lhs.descending != rhs.descending => { self.clone() } @@ -101,12 +98,11 @@ impl SortProperties { pub fn and_or(&self, rhs: &Self) -> Self { match (self, rhs) { (Self::Ordered(lhs), Self::Ordered(rhs)) - if lhs.ordering() == rhs.ordering() - && lhs.descending() == rhs.descending() => + if lhs.ordering == rhs.ordering && lhs.descending == rhs.descending => { Self::Ordered( lhs.clone() - .with_nulls_first(lhs.nulls_first() || rhs.nulls_first()), + .with_nulls_first(lhs.nulls_first || rhs.nulls_first), ) } (Self::Ordered(opt), Self::Singleton) diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index 41a86c0fa361..a27d39277faa 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -21,9 +21,7 @@ use crate::expr_rewriter::FunctionRewrite; use crate::planner::ExprPlanner; use crate::{AggregateUDF, ScalarUDF, UserDefinedLogicalNode, WindowUDF}; use datafusion_common::types::{LogicalTypeRef, TypeSignature}; -use datafusion_common::{ - internal_err, not_impl_err, plan_datafusion_err, Result, -}; +use datafusion_common::{internal_err, not_impl_err, plan_datafusion_err, Result}; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::sync::Arc; @@ -255,11 +253,9 @@ impl ExtensionTypeRegistry for MemoryExtensionTypeRegistry { TypeSignature::Native(_) => { return internal_err!("Cannot register a native type") } - TypeSignature::Extension(sig) => sig, + TypeSignature::Extension { name, .. } => name, }; - Ok(self - .extension_types - .insert(signature.name().into(), logical_type)) + Ok(self.extension_types.insert(signature.into(), logical_type)) } fn deregister_type(&mut self, name: &str) -> Result> { diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs index f886e6048ff4..e62b1c827908 100644 --- a/datafusion/expr/src/udwf.rs +++ b/datafusion/expr/src/udwf.rs @@ -17,6 +17,8 @@ //! [`WindowUDF`]: User Defined Window Functions +use arrow::compute::SortOptions; +use arrow::datatypes::{DataType, Field}; use std::cmp::Ordering; use std::hash::{DefaultHasher, Hash, Hasher}; use std::{ @@ -25,14 +27,11 @@ use std::{ sync::Arc, }; -use arrow::datatypes::{DataType, Field}; - use crate::expr::WindowFunction; use crate::{ function::WindowFunctionSimplification, Expr, PartitionEvaluator, Signature, }; use datafusion_common::{not_impl_err, Result}; -use datafusion_common::sort::SortOptions; use datafusion_doc::Documentation; use datafusion_functions_window_common::expr::ExpressionArgs; use datafusion_functions_window_common::field::WindowUDFFieldArgs; diff --git a/datafusion/expr/src/window_state.rs b/datafusion/expr/src/window_state.rs index 455758d6376b..f1d0ead23ab1 100644 --- a/datafusion/expr/src/window_state.rs +++ b/datafusion/expr/src/window_state.rs @@ -23,7 +23,7 @@ use crate::{WindowFrame, WindowFrameBound, WindowFrameUnits}; use arrow::{ array::ArrayRef, - compute::{concat, concat_batches}, + compute::{concat, concat_batches, SortOptions}, datatypes::{DataType, SchemaRef}, record_batch::RecordBatch, }; @@ -32,7 +32,6 @@ use datafusion_common::{ utils::{compare_rows, get_row_at_idx, search_in_slice}, DataFusionError, Result, ScalarValue, }; -use datafusion_common::sort::SortOptions; /// Holds the state of evaluating a window function #[derive(Debug)] @@ -135,12 +134,12 @@ pub enum WindowFrameContext { impl WindowFrameContext { /// Create a new state object for the given window frame. - pub fn new(window_frame: Arc, sort_definition: Vec) -> Self { + pub fn new(window_frame: Arc, sort_options: Vec) -> Self { match window_frame.units { WindowFrameUnits::Rows => WindowFrameContext::Rows(window_frame), WindowFrameUnits::Range => WindowFrameContext::Range { window_frame, - state: WindowFrameStateRange::new(sort_definition), + state: WindowFrameStateRange::new(sort_options), }, WindowFrameUnits::Groups => WindowFrameContext::Groups { window_frame, @@ -289,13 +288,13 @@ impl PartitionBatchState { /// BY clause. This information is used to calculate the range. #[derive(Debug, Default)] pub struct WindowFrameStateRange { - sort_definitions: Vec, + sort_options: Vec, } impl WindowFrameStateRange { /// Create a new object to store the search state. fn new(sort_options: Vec) -> Self { - Self { sort_definitions: sort_options } + Self { sort_options } } /// This function calculates beginning/ending indices for the frame of the current row. @@ -390,14 +389,14 @@ impl WindowFrameStateRange { let current_row_values = get_row_at_idx(range_columns, idx)?; let end_range = if let Some(delta) = delta { let is_descending: bool = self - .sort_definitions + .sort_options .first() .ok_or_else(|| { DataFusionError::Internal( "Sort options unexpectedly absent in a window frame".to_string(), ) })? - .descending(); + .descending; current_row_values .iter() @@ -428,7 +427,7 @@ impl WindowFrameStateRange { last_range.end }; let compare_fn = |current: &[ScalarValue], target: &[ScalarValue]| { - let cmp = compare_rows(current, target, &self.sort_definitions)?; + let cmp = compare_rows(current, target, &self.sort_options)?; Ok(if SIDE { cmp.is_lt() } else { cmp.is_le() }) }; search_in_slice(range_columns, &end_range, compare_fn, search_start, length) @@ -671,7 +670,6 @@ mod tests { use super::*; use arrow::array::Float64Array; - use arrow::compute::SortOptions; fn get_test_data() -> (Vec, Vec) { let range_columns: Vec = vec![Arc::new(Float64Array::from(vec![ diff --git a/datafusion/functions-aggregate-common/src/merge_arrays.rs b/datafusion/functions-aggregate-common/src/merge_arrays.rs index 79737b40c985..9b9a1240c1a1 100644 --- a/datafusion/functions-aggregate-common/src/merge_arrays.rs +++ b/datafusion/functions-aggregate-common/src/merge_arrays.rs @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. +use arrow::compute::SortOptions; use datafusion_common::utils::compare_rows; use datafusion_common::{exec_err, ScalarValue}; use std::cmp::Ordering; use std::collections::{BinaryHeap, VecDeque}; -use datafusion_common::sort::SortOptions; /// This is a wrapper struct to be able to correctly merge `ARRAY_AGG` data from /// multiple partitions using `BinaryHeap`. When used inside `BinaryHeap`, this @@ -34,7 +34,7 @@ struct CustomElement<'a> { // Comparison "key" ordering: Vec, /// Options defining the ordering semantics - sort_definitions: &'a [SortOptions], + sort_options: &'a [SortOptions], } impl<'a> CustomElement<'a> { @@ -42,13 +42,13 @@ impl<'a> CustomElement<'a> { branch_idx: usize, value: ScalarValue, ordering: Vec, - sort_definitions: &'a [SortOptions], + sort_options: &'a [SortOptions], ) -> Self { Self { branch_idx, value, ordering, - sort_definitions, + sort_options, } } @@ -58,7 +58,7 @@ impl<'a> CustomElement<'a> { target: &[ScalarValue], ) -> datafusion_common::Result { // Calculate ordering according to `sort_options` - compare_rows(current, target, self.sort_definitions) + compare_rows(current, target, self.sort_options) } } @@ -116,7 +116,7 @@ pub fn merge_ordered_arrays( // each `ScalarValue` in the values`. ordering_values: &mut [VecDeque>], // Defines according to which ordering comparisons should be done. - sort_definitions: &[SortOptions], + sort_options: &[SortOptions], ) -> datafusion_common::Result<(Vec, Vec>)> { // Keep track the most recent data of each branch, in binary heap data structure. let mut heap = BinaryHeap::::new(); @@ -149,7 +149,7 @@ pub fn merge_ordered_arrays( branch_idx, value, orderings, - sort_definitions, + sort_options, )); } // If None, we consumed this branch, skip it. @@ -186,7 +186,7 @@ pub fn merge_ordered_arrays( branch_idx, value, orderings, - sort_definitions, + sort_options, )); } } diff --git a/datafusion/functions-aggregate-common/src/utils.rs b/datafusion/functions-aggregate-common/src/utils.rs index 5e1d184cd0c1..0a5043bcf899 100644 --- a/datafusion/functions-aggregate-common/src/utils.rs +++ b/datafusion/functions-aggregate-common/src/utils.rs @@ -21,6 +21,7 @@ use arrow::array::{ArrayRef, AsArray}; use arrow::datatypes::ArrowNativeType; use arrow::{ array::ArrowNativeTypeOp, + compute::SortOptions, datatypes::{ DataType, Decimal128Type, DecimalType, Field, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, @@ -28,7 +29,6 @@ use arrow::{ }, }; use datafusion_common::{exec_err, DataFusionError, Result}; -use datafusion_common::sort::SortOptions; use datafusion_expr_common::accumulator::Accumulator; use datafusion_physical_expr_common::sort_expr::LexOrdering; @@ -108,8 +108,11 @@ pub fn ordering_fields( } /// Selects the sort option attribute from all the given `PhysicalSortExpr`s. -pub fn get_sort_options(ordering_req: &LexOrdering) -> Vec { - ordering_req.iter().map(|item| item.options.clone()).collect() +pub fn get_sort_options(ordering_req: &LexOrdering) -> Result> { + ordering_req + .iter() + .map(|item| item.options.to_arrow()) + .collect() } /// A wrapper around a type to provide hash for floats diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index b96bdf68f3c4..6930eeea0c23 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -504,8 +504,8 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator { let sort_options = self .ordering_req .iter() - .map(|sort_expr| sort_expr.options.clone()) - .collect::>(); + .map(|sort_expr| sort_expr.options.to_arrow()) + .collect::>>()?; (self.values, self.ordering_values) = merge_ordered_arrays( &mut partition_values, diff --git a/datafusion/functions-aggregate/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs index 2fd024c3c614..a5f89528ee2a 100644 --- a/datafusion/functions-aggregate/src/first_last.rs +++ b/datafusion/functions-aggregate/src/first_last.rs @@ -23,12 +23,11 @@ use std::mem::size_of_val; use std::sync::Arc; use arrow::array::{ArrayRef, AsArray, BooleanArray}; -use arrow::compute::{self, LexicographicalComparator}; +use arrow::compute::{self, LexicographicalComparator, SortColumn}; use arrow::datatypes::{DataType, Field}; use datafusion_common::utils::{compare_rows, get_row_at_idx}; use datafusion_common::{ arrow_datafusion_err, internal_err, DataFusionError, Result, ScalarValue, - internal_datafusion_err, }; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity}; @@ -256,12 +255,9 @@ impl FirstValueAccumulator { .iter() .zip(self.ordering_req.iter()) .map(|(values, req)| { - let options = req.options.to_arrow().map_err(|_| { - internal_datafusion_err!("FirstValue does not support custom sorts") - })?; - Ok(compute::SortColumn { + Ok(SortColumn { values: Arc::clone(values), - options: Some(options), + options: Some(req.options.to_arrow()?), }) }) .collect::>>()?; @@ -301,7 +297,7 @@ impl Accumulator for FirstValueAccumulator { if compare_rows( &self.orderings, orderings, - &get_sort_options(self.ordering_req.as_ref()), + &get_sort_options(self.ordering_req.as_ref())?, )? .is_gt() { @@ -332,7 +328,7 @@ impl Accumulator for FirstValueAccumulator { let first_row = get_row_at_idx(&filtered_states, first_idx)?; // When collecting orderings, we exclude the is_set flag from the state. let first_ordering = &first_row[1..is_set_idx]; - let sort_options = get_sort_options(self.ordering_req.as_ref()); + let sort_options = get_sort_options(self.ordering_req.as_ref())?; // Either there is no existing value, or there is an earlier version in new data. if !self.is_set || compare_rows(&self.orderings, first_ordering, &sort_options)?.is_gt() @@ -558,10 +554,8 @@ impl LastValueAccumulator { return Ok((!value.is_empty()).then_some(value.len() - 1)); } } - let sort_columns = convert_to_sort_cols( - &ordering_values, - self.ordering_req.as_ref(), - )?; + let sort_columns = + convert_to_sort_cols(&ordering_values, self.ordering_req.as_ref())?; let comparator = LexicographicalComparator::try_new(&sort_columns)?; let max_ind = if self.ignore_nulls { @@ -602,7 +596,7 @@ impl Accumulator for LastValueAccumulator { if compare_rows( &self.orderings, orderings, - &get_sort_options(self.ordering_req.as_ref()), + &get_sort_options(self.ordering_req.as_ref())?, )? .is_lt() { @@ -633,7 +627,7 @@ impl Accumulator for LastValueAccumulator { let last_row = get_row_at_idx(&filtered_states, last_idx)?; // When collecting orderings, we exclude the is_set flag from the state. let last_ordering = &last_row[1..is_set_idx]; - let sort_options = get_sort_options(self.ordering_req.as_ref()); + let sort_options = get_sort_options(self.ordering_req.as_ref())?; // Either there is no existing value, or there is a newer (latest) // version in the new data: if !self.is_set @@ -677,19 +671,15 @@ fn filter_states_according_to_is_set( fn convert_to_sort_cols( arrs: &[ArrayRef], sort_exprs: &LexOrdering, -) -> Result> { - arrs.iter() - .zip(sort_exprs.iter()) - .map(|(item, sort_expr)| { - let options = sort_expr.options.to_arrow().map_err(|_| { - internal_datafusion_err!("FirstValue and LastValue does not support custom sorts") - })?; - Ok(compute::SortColumn { - values: Arc::clone(item), - options: Some(options), - }) +) -> Result> { + Ok(arrs + .iter() + .zip(get_sort_options(sort_exprs)?) + .map(|(item, options)| SortColumn { + values: Arc::clone(item), + options: Some(options), }) - .collect() + .collect::>()) } #[cfg(test)] diff --git a/datafusion/functions-aggregate/src/nth_value.rs b/datafusion/functions-aggregate/src/nth_value.rs index 079c8c8eeed9..e4f0a9db5525 100644 --- a/datafusion/functions-aggregate/src/nth_value.rs +++ b/datafusion/functions-aggregate/src/nth_value.rs @@ -35,7 +35,7 @@ use datafusion_expr::{ Signature, SortExpr, Volatility, }; use datafusion_functions_aggregate_common::merge_arrays::merge_ordered_arrays; -use datafusion_functions_aggregate_common::utils::ordering_fields; +use datafusion_functions_aggregate_common::utils::{get_sort_options, ordering_fields}; use datafusion_macros::user_doc; use datafusion_physical_expr::expressions::Literal; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; @@ -330,11 +330,7 @@ impl Accumulator for NthValueAccumulator { partition_ordering_values.push(ordering_values.into()); } - let sort_options = self - .ordering_req - .iter() - .map(|sort_expr| sort_expr.options.clone()) - .collect::>(); + let sort_options = get_sort_options(&self.ordering_req)?; let (new_values, new_orderings) = merge_ordered_arrays( &mut partition_values, &mut partition_ordering_values, diff --git a/datafusion/functions-window/src/rank.rs b/datafusion/functions-window/src/rank.rs index 7fd1fc99c96b..bd2edc5722eb 100644 --- a/datafusion/functions-window/src/rank.rs +++ b/datafusion/functions-window/src/rank.rs @@ -27,12 +27,11 @@ use std::sync::{Arc, LazyLock}; use crate::define_udwf_and_expr; use datafusion_common::arrow::array::ArrayRef; use datafusion_common::arrow::array::{Float64Array, UInt64Array}; +use datafusion_common::arrow::compute::SortOptions; use datafusion_common::arrow::datatypes::DataType; use datafusion_common::arrow::datatypes::Field; use datafusion_common::utils::get_row_at_idx; use datafusion_common::{exec_err, Result, ScalarValue}; -use datafusion_common::sort::SortOptions; -use datafusion_common::types::SortOrdering; use datafusion_expr::window_doc_sections::DOC_SECTION_RANKING; use datafusion_expr::{ Documentation, PartitionEvaluator, Signature, Volatility, WindowUDFImpl, @@ -174,7 +173,6 @@ impl WindowUDFImpl for Rank { fn sort_options(&self) -> Option { Some(SortOptions { - ordering: SortOrdering::Default, descending: false, nulls_first: false, }) diff --git a/datafusion/functions-window/src/row_number.rs b/datafusion/functions-window/src/row_number.rs index 6899375554b1..8f462528dbed 100644 --- a/datafusion/functions-window/src/row_number.rs +++ b/datafusion/functions-window/src/row_number.rs @@ -19,6 +19,7 @@ use datafusion_common::arrow::array::ArrayRef; use datafusion_common::arrow::array::UInt64Array; +use datafusion_common::arrow::compute::SortOptions; use datafusion_common::arrow::datatypes::DataType; use datafusion_common::arrow::datatypes::Field; use datafusion_common::{Result, ScalarValue}; @@ -32,8 +33,6 @@ use field::WindowUDFFieldArgs; use std::any::Any; use std::fmt::Debug; use std::ops::Range; -use datafusion_common::sort::SortOptions; -use datafusion_common::types::SortOrdering; define_udwf_and_expr!( RowNumber, @@ -93,7 +92,6 @@ impl WindowUDFImpl for RowNumber { fn sort_options(&self) -> Option { Some(SortOptions { - ordering: SortOrdering::Default, descending: false, nulls_first: false, }) diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs index d6fed00efa3e..e2f15d547f2a 100644 --- a/datafusion/functions/src/math/log.rs +++ b/datafusion/functions/src/math/log.rs @@ -97,13 +97,16 @@ impl ScalarUDFImpl for LogFunc { // log(x) defaults to log(10, x) (SortProperties::Singleton, input[0].sort_properties.clone()) } else { - (input[0].sort_properties.clone(), input[1].sort_properties.clone()) + ( + input[0].sort_properties.clone(), + input[1].sort_properties.clone(), + ) }; match (&num_sort_properties, &base_sort_properties) { (first @ SortProperties::Ordered(num), SortProperties::Ordered(base)) - if num.ordering() == base.ordering() - && num.descending() != base.descending() - && num.nulls_first() == base.nulls_first() => + if num.ordering == base.ordering + && num.descending != base.descending + && num.nulls_first == base.nulls_first => { Ok(first.clone()) } diff --git a/datafusion/physical-expr-common/src/sort_expr.rs b/datafusion/physical-expr-common/src/sort_expr.rs index 043d2b02316c..3d95a9cf3306 100644 --- a/datafusion/physical-expr-common/src/sort_expr.rs +++ b/datafusion/physical-expr-common/src/sort_expr.rs @@ -19,8 +19,9 @@ use crate::physical_expr::PhysicalExpr; use arrow::array::RecordBatch; +use arrow::compute::SortColumn; use arrow::datatypes::Schema; -use datafusion_common::sort::{SortColumn, SortOptions}; +use datafusion_common::sort::{AdvSortColumn, AdvSortOptions}; use datafusion_common::Result; use datafusion_expr_common::columnar_value::ColumnarValue; use itertools::Itertools; @@ -78,18 +79,18 @@ pub struct PhysicalSortExpr { /// Physical expression representing the column to sort pub expr: Arc, /// Option to specify how the given column should be sorted - pub options: SortOptions, + pub options: AdvSortOptions, } impl PhysicalSortExpr { /// Create a new PhysicalSortExpr - pub fn new(expr: Arc, options: SortOptions) -> Self { + pub fn new(expr: Arc, options: AdvSortOptions) -> Self { Self { expr, options } } - /// Create a new PhysicalSortExpr with default [`SortOptions`] + /// Create a new PhysicalSortExpr with default [`AdvSortOptions`] pub fn new_default(expr: Arc) -> Self { - Self::new(expr, SortOptions::default()) + Self::new(expr, AdvSortOptions::default()) } /// Set the sort sort options to ASC @@ -148,12 +149,21 @@ impl Display for PhysicalSortExpr { impl PhysicalSortExpr { /// evaluate the sort expression into SortColumn that can be passed into arrow sort kernel pub fn evaluate_to_sort_column(&self, batch: &RecordBatch) -> Result { + self.evaluate_to_adv_sort_column(batch)?.to_arrow() + } + + /// Evaluate the sort expression into an [AdvSortColumn] that can be passed to the DataFusion + /// sorting algorithms that supporting user-defined sorting. + pub fn evaluate_to_adv_sort_column( + &self, + batch: &RecordBatch, + ) -> Result { let value_to_sort = self.expr.evaluate(batch)?; let array_to_sort = match value_to_sort { ColumnarValue::Array(array) => array, ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(batch.num_rows())?, }; - Ok(SortColumn { + Ok(AdvSortColumn { values: array_to_sort, options: Some(self.options.clone()), }) @@ -177,8 +187,8 @@ impl PhysicalSortExpr { .is_none_or(|opts| &self.options == opts) } else { requirement.options.as_ref().is_none_or(|opts| { - self.options.ordering() == opts.ordering() - && self.options.descending() == opts.descending() + self.options.ordering == opts.ordering + && self.options.descending == opts.descending }) } } @@ -186,10 +196,10 @@ impl PhysicalSortExpr { /// Represents sort requirement associated with a plan /// -/// If the requirement includes [`SortOptions`] then both the +/// If the requirement includes [`AdvSortOptions`] then both the /// expression *and* the sort options must match. /// -/// If the requirement does not include [`SortOptions`]) then only the +/// If the requirement does not include [`AdvSortOptions`]) then only the /// expressions must match. /// /// # Examples @@ -209,7 +219,7 @@ pub struct PhysicalSortRequirement { pub expr: Arc, /// Option to specify how the given column should be sorted. /// If unspecified, there are no constraints on sort options. - pub options: Option, + pub options: Option, } impl From for PhysicalSortExpr { @@ -276,14 +286,8 @@ impl PhysicalSortRequirement { /// which must match only `expr`. /// /// See [`PhysicalSortRequirement`] for examples. - pub fn new( - expr: Arc, - sort_definition: Option, - ) -> Self { - Self { - expr, - options: sort_definition, - } + pub fn new(expr: Arc, options: Option) -> Self { + Self { expr, options } } /// Replace the required expression for this requirement with the new one @@ -317,10 +321,10 @@ impl PhysicalSortRequirement { } } -/// Returns the SQL string representation of the given [SortOptions] object. +/// Returns the SQL string representation of the given [AdvSortOptions] object. #[inline] -fn to_str(options: &SortOptions) -> &'static str { - match (options.descending(), options.nulls_first()) { +fn to_str(options: &AdvSortOptions) -> &str { + match (options.descending, options.nulls_first) { (true, true) => "DESC", (true, false) => "DESC NULLS LAST", (false, true) => "ASC", @@ -445,7 +449,7 @@ impl LexOrdering { /// Collapse a `LexOrdering` into a new duplicate-free `LexOrdering` based on expression. /// /// This function filters duplicate entries that have same physical - /// expression inside, ignoring [`SortOptions`]. For example: + /// expression inside, ignoring [`AdvSortOptions`]. For example: /// /// `vec![a ASC, a DESC]` collapses to `vec![a ASC]`. pub fn collapse(self) -> Self { diff --git a/datafusion/physical-expr/src/aggregate.rs b/datafusion/physical-expr/src/aggregate.rs index 10d2ab14316c..751783bf3e34 100644 --- a/datafusion/physical-expr/src/aggregate.rs +++ b/datafusion/physical-expr/src/aggregate.rs @@ -41,8 +41,9 @@ use std::sync::Arc; use crate::expressions::Column; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion_common::sort::AdvSortOptions; +use datafusion_common::types::SortOrdering; use datafusion_common::{internal_err, not_impl_err, Result, ScalarValue}; -use datafusion_common::sort::SortOptions; use datafusion_expr::{AggregateUDF, ReversedUDAF, SetMonotonicity}; use datafusion_expr_common::accumulator::Accumulator; use datafusion_expr_common::groups_accumulator::GroupsAccumulator; @@ -553,10 +554,12 @@ impl AggregateFunctionExpr { return None; } let expr = Arc::new(Column::new(self.name(), aggr_func_idx)); - todo!("Sort?") - // let options = - // SortOptions::new(monotonicity == SetMonotonicity::Decreasing, false); - // Some(PhysicalSortExpr { expr, options }) + let options = AdvSortOptions::new( + SortOrdering::Default, + monotonicity == SetMonotonicity::Decreasing, + false, + ); + Some(PhysicalSortExpr { expr, options }) } } diff --git a/datafusion/physical-expr/src/equivalence/ordering.rs b/datafusion/physical-expr/src/equivalence/ordering.rs index b0303e879969..da978dec6df6 100644 --- a/datafusion/physical-expr/src/equivalence/ordering.rs +++ b/datafusion/physical-expr/src/equivalence/ordering.rs @@ -23,8 +23,8 @@ use std::vec::IntoIter; use crate::equivalence::add_offset_to_expr; use crate::{LexOrdering, PhysicalExpr}; +use datafusion_common::sort::AdvSortOptions; use datafusion_common::HashSet; -use datafusion_common::sort::SortOptions; /// An `OrderingEquivalenceClass` object keeps track of different alternative /// orderings than can describe a schema. For example, consider the following table: @@ -227,7 +227,7 @@ impl OrderingEquivalenceClass { /// Gets sort options associated with this expression if it is a leading /// ordering expression. Otherwise, returns `None`. - pub fn get_options(&self, expr: &Arc) -> Option<&SortOptions> { + pub fn get_options(&self, expr: &Arc) -> Option<&AdvSortOptions> { for ordering in self.iter() { let leading_ordering = &ordering[0]; if leading_ordering.expr.eq(expr) { diff --git a/datafusion/physical-expr/src/equivalence/properties.rs b/datafusion/physical-expr/src/equivalence/properties.rs index a8911a6b4cbf..3b7d266bbe1e 100755 --- a/datafusion/physical-expr/src/equivalence/properties.rs +++ b/datafusion/physical-expr/src/equivalence/properties.rs @@ -41,7 +41,7 @@ use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion_physical_expr_common::utils::ExprPropertiesNode; -use datafusion_common::sort::SortOptions; +use datafusion_common::sort::AdvSortOptions; use indexmap::{IndexMap, IndexSet}; use itertools::Itertools; @@ -644,9 +644,10 @@ impl EquivalenceProperties { && normalized_reqs[..ordering_len].iter().zip(ordering).all( |(req, existing)| { req.expr.eq(&existing.expr) - && req.options.as_ref().is_none_or(|req_opts| { - req_opts == &existing.options - }) + && req + .options + .as_ref() + .is_none_or(|req_opts| *req_opts == existing.options) }, ) }) @@ -672,7 +673,7 @@ impl EquivalenceProperties { SortProperties::Ordered(options) => { let sort_expr = PhysicalSortExpr { expr: Arc::clone(&req.expr), - options: options, + options, }; sort_expr.satisfy(req, self.schema()) } @@ -740,8 +741,7 @@ impl EquivalenceProperties { .zip(rhs.inner.iter_mut()) .all(|(lhs, rhs)| { lhs.expr.eq(&rhs.expr) - && match (lhs.options.as_ref(), rhs.options.as_ref()) - { + && match (lhs.options.as_ref(), rhs.options.as_ref()) { (Some(lhs_opt), Some(rhs_opt)) => lhs_opt == rhs_opt, (Some(options), None) => { rhs.options = Some(options.clone()); @@ -974,10 +974,10 @@ impl EquivalenceProperties { None } }) - .flat_map(|(sort_definition, relevant_deps)| { + .flat_map(|(options, relevant_deps)| { let sort_expr = PhysicalSortExpr { expr: Arc::clone(target), - options: sort_definition, + options, }; // Generate dependent orderings (i.e. prefixes for `sort_expr`): let mut dependency_orderings = @@ -1157,7 +1157,7 @@ impl EquivalenceProperties { )), SortProperties::Singleton => { // Assign default ordering to constant expressions - let options = SortOptions::default(); + let options = AdvSortOptions::default(); Some(( PhysicalSortExpr { expr: Arc::clone(&exprs[idx]), @@ -1689,9 +1689,7 @@ fn get_expr_properties( if let Some(column_order) = dependencies.iter().find(|&order| expr.eq(&order.expr)) { // If exact match is found, return its ordering. Ok(ExprProperties { - sort_properties: SortProperties::Ordered( - column_order.options.clone(), - ), + sort_properties: SortProperties::Ordered(column_order.options.clone()), range: Interval::make_unbounded(&expr.data_type(schema)?)?, preserves_lex_ordering: false, }) @@ -2395,10 +2393,8 @@ fn advance_if_matches_constant( ) -> Option { let expr = iter.peek()?; let const_expr = constants.iter().find(|c| c.eq_expr(expr))?; - let found_expr = PhysicalSortExpr::new( - Arc::clone(const_expr.expr()), - expr.options.clone(), - ); + let found_expr = + PhysicalSortExpr::new(Arc::clone(const_expr.expr()), expr.options.clone()); iter.next(); Some(found_expr) } @@ -2525,7 +2521,7 @@ mod tests { let offset = schema.fields.len(); let col_a2 = &add_offset_to_expr(Arc::clone(col_a), offset); let col_b2 = &add_offset_to_expr(Arc::clone(col_b), offset); - let option_asc = SortOptions { + let option_asc = AdvSortOptions { descending: false, nulls_first: false, }; @@ -2640,7 +2636,7 @@ mod tests { let col_y = &col("y", &child_schema)?; let col_z = &col("z", &child_schema)?; let col_w = &col("w", &child_schema)?; - let option_asc = SortOptions { + let option_asc = AdvSortOptions { descending: false, nulls_first: false, }; @@ -2697,7 +2693,7 @@ mod tests { #[test] fn test_normalize_ordering_equivalence_classes() -> Result<()> { - let sort_options = SortOptions::default(); + let sort_options = AdvSortOptions::default(); let schema = Schema::new(vec![ Field::new("a", DataType::Int32, true), @@ -2743,8 +2739,8 @@ mod tests { #[test] fn test_get_indices_of_matching_sort_exprs_with_order_eq() -> Result<()> { - let sort_options = SortOptions::default(); - let sort_options_not = SortOptions::default().not(); + let sort_options = AdvSortOptions::default(); + let sort_options_not = AdvSortOptions::default().not(); let schema = Schema::new(vec![ Field::new("a", DataType::Int32, true), @@ -2867,7 +2863,7 @@ mod tests { let col_b = &col("b", &schema)?; let col_c = &col("c", &schema)?; let col_d = &col("d", &schema)?; - let option_asc = SortOptions { + let option_asc = AdvSortOptions { descending: false, nulls_first: false, }; @@ -2948,11 +2944,11 @@ mod tests { Arc::clone(col_d), )) as Arc; - let option_asc = SortOptions { + let option_asc = AdvSortOptions { descending: false, nulls_first: false, }; - let option_desc = SortOptions { + let option_desc = AdvSortOptions { descending: true, nulls_first: true, }; @@ -3058,7 +3054,7 @@ mod tests { // TEST CASE 1 // ordering of the constants are treated as default ordering. // This is the convention currently used. - (vec![col_h], vec![(col_h, SortOptions::default())]), + (vec![col_h], vec![(col_h, AdvSortOptions::default())]), ]; for (exprs, expected) in test_cases { let exprs = exprs.into_iter().cloned().collect::>(); @@ -3077,11 +3073,11 @@ mod tests { let col_b = &col("b", &schema)?; let col_c = &col("c", &schema)?; let eq_properties = EquivalenceProperties::new(schema); - let option_asc = SortOptions { + let option_asc = AdvSortOptions { descending: false, nulls_first: false, }; - let option_desc = SortOptions { + let option_desc = AdvSortOptions { descending: true, nulls_first: true, }; @@ -3142,11 +3138,11 @@ mod tests { let col_d = &col("d", &test_schema)?; let col_e = &col("e", &test_schema)?; let col_f = &col("f", &test_schema)?; - let option_asc = SortOptions { + let option_asc = AdvSortOptions { descending: false, nulls_first: false, }; - let option_desc = SortOptions { + let option_desc = AdvSortOptions { descending: true, nulls_first: true, }; @@ -3206,7 +3202,7 @@ mod tests { #[test] fn test_schema_normalize_sort_requirement_with_equivalence() -> Result<()> { - let option1 = SortOptions { + let option1 = AdvSortOptions { descending: false, nulls_first: false, }; @@ -3254,7 +3250,7 @@ mod tests { .map(|c| { col(c, schema.as_ref()).map(|expr| PhysicalSortExpr { expr, - options: SortOptions { + options: AdvSortOptions { descending: false, nulls_first: true, }, @@ -3347,7 +3343,7 @@ mod tests { .map(|&name| { col(name, &schema).map(|col| PhysicalSortExpr { expr: col, - options: SortOptions::default(), + options: AdvSortOptions::default(), }) }) .collect::>()?; @@ -3932,7 +3928,7 @@ mod tests { let name = parts.next().expect("empty sort expression"); let mut sort_expr = PhysicalSortExpr::new( col(name, schema).expect("invalid column name"), - SortOptions::default(), + AdvSortOptions::default(), ); if let Some(options) = parts.next() { @@ -4108,11 +4104,11 @@ mod tests { let sort_exprs = LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::clone(&col_a), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, PhysicalSortExpr { expr: Arc::clone(&col_b), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, ]); @@ -4136,8 +4132,8 @@ mod tests { let col_b = col("b", &schema)?; let col_c = col("c", &schema)?; - let asc = SortOptions::default(); - let desc = SortOptions { + let asc = AdvSortOptions::default(); + let desc = AdvSortOptions { descending: true, nulls_first: true, }; @@ -4192,7 +4188,7 @@ mod tests { // Make a and b equivalent eq_properties.add_equal_conditions(&col_a, &col_b)?; - let asc = SortOptions::default(); + let asc = AdvSortOptions::default(); // Initial ordering: [a ASC, c ASC] eq_properties.add_new_orderings([LexOrdering::new(vec![ @@ -4236,8 +4232,8 @@ mod tests { let col_a = col("a", &schema)?; let col_b = col("b", &schema)?; - let asc = SortOptions::default(); - let desc = SortOptions { + let asc = AdvSortOptions::default(); + let desc = AdvSortOptions { descending: true, nulls_first: true, }; @@ -4281,7 +4277,7 @@ mod tests { let col_d = col("d", &schema)?; let col_e = col("e", &schema)?; - let asc = SortOptions::default(); + let asc = AdvSortOptions::default(); // Constants: c is constant eq_properties = eq_properties.with_constants([ConstExpr::from(&col_c)]); @@ -4479,7 +4475,7 @@ mod tests { .iter() .map(|col_name| PhysicalSortExpr { expr: col(col_name, schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }) .collect(), ); @@ -4492,7 +4488,7 @@ mod tests { cols.iter() .map(|col_name| PhysicalSortExpr { expr: col(col_name, schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }) .collect(), ) @@ -4506,7 +4502,7 @@ mod tests { cols.iter() .map(|col_name| PhysicalSortExpr { expr: col(col_name, schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }) .collect(), ) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index af735b4535a5..47b4d30ba974 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -525,8 +525,8 @@ impl PhysicalExpr for BinaryExpr { /// For each operator, [`BinaryExpr`] has distinct rules. /// TODO: There may be rules specific to some data types and expression ranges. fn get_properties(&self, children: &[ExprProperties]) -> Result { - let (l_order, l_range) = (children[0].sort_properties.clone(), &children[0].range); - let (r_order, r_range) = (children[1].sort_properties.clone(), &children[1].range); + let (l_order, l_range) = (&children[0].sort_properties, &children[0].range); + let (r_order, r_range) = (&children[1].sort_properties, &children[1].range); match self.op() { Operator::Plus => Ok(ExprProperties { sort_properties: l_order.add(&r_order), diff --git a/datafusion/physical-expr/src/window/standard.rs b/datafusion/physical-expr/src/window/standard.rs index 86625e643a07..d3b34efa4232 100644 --- a/datafusion/physical-expr/src/window/standard.rs +++ b/datafusion/physical-expr/src/window/standard.rs @@ -26,11 +26,12 @@ use crate::window::window_expr::{get_orderby_values, WindowFn}; use crate::window::{PartitionBatches, PartitionWindowAggStates, WindowState}; use crate::{reverse_order_bys, EquivalenceProperties, PhysicalExpr}; use arrow::array::{new_empty_array, ArrayRef}; +use arrow::compute::SortOptions; use arrow::datatypes::Field; use arrow::record_batch::RecordBatch; -use datafusion_common::sort::SortOptions; +use datafusion_common::types::SortOrdering; use datafusion_common::utils::evaluate_partition_ranges; -use datafusion_common::{Result, ScalarValue}; +use datafusion_common::{internal_err, Result, ScalarValue}; use datafusion_expr::window_state::{WindowAggState, WindowFrameContext}; use datafusion_expr::WindowFrame; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; @@ -46,18 +47,25 @@ pub struct StandardWindowExpr { impl StandardWindowExpr { /// create a new standard window function expression - pub fn new( + pub fn try_new( expr: Arc, partition_by: &[Arc], order_by: &LexOrdering, window_frame: Arc, - ) -> Self { - Self { + ) -> Result { + let has_unsupported_ordering = order_by + .iter() + .any(|expr| expr.options.ordering != SortOrdering::Default); + if has_unsupported_ordering { + return internal_err!("Unsupported ordering for window expressions."); + } + + Ok(Self { expr, partition_by: partition_by.to_vec(), order_by: order_by.clone(), window_frame, - } + }) } /// Get StandardWindowFunction expr of StandardWindowExpr @@ -115,7 +123,7 @@ impl WindowExpr for StandardWindowExpr { let sort_options: Vec = self .order_by .iter() - .map(|o| o.options.clone()) + .map(|o| o.options.to_arrow().expect("Checked in try_new")) .collect(); let mut row_wise_results = vec![]; @@ -160,10 +168,10 @@ impl WindowExpr for StandardWindowExpr { ) -> Result<()> { let field = self.expr.field()?; let out_type = field.data_type(); - let sort_definitions = self + let sort_options = self .order_by .iter() - .map(|o| o.options.clone()) + .map(|o| o.options.to_arrow().expect("Checked in try_new")) .collect::>(); for (partition_row, partition_batch_state) in partition_batches.iter() { let window_state = @@ -211,7 +219,7 @@ impl WindowExpr for StandardWindowExpr { .get_or_insert_with(|| { WindowFrameContext::new( Arc::clone(&self.window_frame), - sort_definitions.clone(), + sort_options.clone(), ) }) .calculate_range( @@ -257,12 +265,15 @@ impl WindowExpr for StandardWindowExpr { fn get_reverse_expr(&self) -> Option> { self.expr.reverse_expr().map(|reverse_expr| { - Arc::new(StandardWindowExpr::new( - reverse_expr, - &self.partition_by.clone(), - reverse_order_bys(self.order_by.as_ref()).as_ref(), - Arc::new(self.window_frame.reverse()), - )) as _ + Arc::new( + StandardWindowExpr::try_new( + reverse_expr, + &self.partition_by.clone(), + reverse_order_bys(self.order_by.as_ref()).as_ref(), + Arc::new(self.window_frame.reverse()), + ) + .expect("self has no custom sorts"), + ) as _ }) } diff --git a/datafusion/physical-expr/src/window/window_expr.rs b/datafusion/physical-expr/src/window/window_expr.rs index 930b93447bc1..1c181031bc09 100644 --- a/datafusion/physical-expr/src/window/window_expr.rs +++ b/datafusion/physical-expr/src/window/window_expr.rs @@ -23,6 +23,8 @@ use std::sync::Arc; use crate::{LexOrdering, PhysicalExpr}; use arrow::array::{new_empty_array, Array, ArrayRef}; +use arrow::compute::kernels::sort::SortColumn; +use arrow::compute::SortOptions; use arrow::datatypes::Field; use arrow::record_batch::RecordBatch; use datafusion_common::utils::compare_rows; @@ -33,7 +35,6 @@ use datafusion_expr::window_state::{ use datafusion_expr::{Accumulator, PartitionEvaluator, WindowFrame, WindowFrameBound}; use indexmap::IndexMap; -use datafusion_common::sort::{SortColumn, SortOptions}; /// Common trait for [window function] implementations /// @@ -189,8 +190,11 @@ pub trait AggregateWindowExpr: WindowExpr { fn aggregate_evaluate(&self, batch: &RecordBatch) -> Result { let mut accumulator = self.get_accumulator()?; let mut last_range = Range { start: 0, end: 0 }; - let sort_options: Vec = - self.order_by().iter().map(|o| o.options.clone()).collect(); + let sort_options: Vec = self + .order_by() + .iter() + .map(|o| o.options.to_arrow()) + .collect::>>()?; let mut window_frame_ctx = WindowFrameContext::new(Arc::clone(self.get_window_frame()), sort_options); self.get_result_column( @@ -237,9 +241,12 @@ pub trait AggregateWindowExpr: WindowExpr { let most_recent_row = partition_batch_state.most_recent_row.as_ref(); // If there is no window state context, initialize it. + let sort_options: Vec = self + .order_by() + .iter() + .map(|o| o.options.to_arrow()) + .collect::>>()?; let window_frame_ctx = state.window_frame_ctx.get_or_insert_with(|| { - let sort_options: Vec = - self.order_by().iter().map(|o| o.options.clone()).collect(); WindowFrameContext::new(Arc::clone(self.get_window_frame()), sort_options) }); let out_col = self.get_result_column( @@ -359,7 +366,7 @@ pub(crate) fn is_end_bound_safe( &window_frame.end_bound, &order_bys[0], most_recent_order_bys.map(|items| &items[0]), - &sort_exprs[0].options, + &sort_exprs[0].options.to_arrow()?, idx, ), WindowFrameContext::Groups { @@ -370,7 +377,7 @@ pub(crate) fn is_end_bound_safe( state, &order_bys[0], most_recent_order_bys.map(|items| &items[0]), - &sort_exprs[0].options, + &sort_exprs[0].options.to_arrow()?, ), } } @@ -514,7 +521,7 @@ fn is_row_ahead( } let last_value = ScalarValue::try_from_array(old_col, old_col.len() - 1)?; let current_value = ScalarValue::try_from_array(current_col, 0)?; - let cmp = compare_rows(&[current_value], &[last_value], &[sort_options.clone()])?; + let cmp = compare_rows(&[current_value], &[last_value], &[*sort_options])?; Ok(cmp.is_gt()) } diff --git a/datafusion/physical-optimizer/src/enforce_distribution.rs b/datafusion/physical-optimizer/src/enforce_distribution.rs index 4a1dc66e10a4..7c838da945c3 100644 --- a/datafusion/physical-optimizer/src/enforce_distribution.rs +++ b/datafusion/physical-optimizer/src/enforce_distribution.rs @@ -21,15 +21,15 @@ //! according to the configuration), this rule increases partition counts in //! the physical plan. -use std::fmt::Debug; -use std::sync::Arc; - use crate::optimizer::PhysicalOptimizerRule; use crate::output_requirements::OutputRequirementExec; use crate::utils::{ add_sort_above_with_check, is_coalesce_partitions, is_repartition, is_sort_preserving_merge, }; +use arrow::compute::SortOptions; +use std::fmt::Debug; +use std::sync::Arc; use datafusion_common::config::ConfigOptions; use datafusion_common::error::Result; @@ -61,7 +61,6 @@ use datafusion_physical_plan::ExecutionPlanProperties; use datafusion_physical_plan::{Distribution, ExecutionPlan, Partitioning}; use itertools::izip; -use datafusion_common::sort::SortOptions; /// The `EnforceDistribution` rule ensures that distribution requirements are /// met. In doing so, this rule will increase the parallelism in the plan by diff --git a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs index 4e74a0b672a1..9a869211404e 100644 --- a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs +++ b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs @@ -613,7 +613,10 @@ fn handle_custom_pushdown( } })? .data; - Ok(PhysicalSortRequirement::new(updated_columns, req.options.clone())) + Ok(PhysicalSortRequirement::new( + updated_columns, + req.options.clone(), + )) }) .collect::>>()?; @@ -691,7 +694,10 @@ fn handle_hash_join( } })? .data; - Ok(PhysicalSortRequirement::new(updated_columns, req.options.clone())) + Ok(PhysicalSortRequirement::new( + updated_columns, + req.options.clone(), + )) }) .collect::>>()?; diff --git a/datafusion/physical-plan/src/aggregates/order/partial.rs b/datafusion/physical-plan/src/aggregates/order/partial.rs index 8b94d1b8cd9b..aff69277a4ce 100644 --- a/datafusion/physical-plan/src/aggregates/order/partial.rs +++ b/datafusion/physical-plan/src/aggregates/order/partial.rs @@ -16,10 +16,9 @@ // under the License. use arrow::array::ArrayRef; +use arrow::compute::SortOptions; use arrow::datatypes::Schema; use arrow_ord::partition::partition; -use datafusion_common::sort::SortOptions; -use datafusion_common::types::SortOrdering; use datafusion_common::utils::{compare_rows, get_row_at_idx}; use datafusion_common::{Result, ScalarValue}; use datafusion_execution::memory_pool::proxy::VecAllocExt; @@ -201,10 +200,7 @@ impl GroupOrderingPartial { range_sort_key: Vec, ) -> Result<(usize, Vec)> { if let Some(sort_key) = sort_key { - let sort_options = vec![ - SortOptions::new(SortOrdering::Default, false, false); - sort_key.len() - ]; + let sort_options = vec![SortOptions::new(false, false); sort_key.len()]; let ordering = compare_rows(&sort_key, &range_sort_key, &sort_options)?; if ordering == Ordering::Equal { return Ok((current_sort, sort_key)); diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index 2525e353fbf6..ecd2e0ca91e1 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -49,12 +49,12 @@ use datafusion_physical_expr::{GroupsAccumulatorAdapter, PhysicalSortExpr}; use super::order::GroupOrdering; use super::AggregateExec; +use datafusion_common::sort::AdvSortOptions; use datafusion_physical_expr::aggregate::AggregateFunctionExpr; use datafusion_physical_expr_common::sort_expr::LexOrdering; use futures::ready; use futures::stream::{Stream, StreamExt}; use log::debug; -use datafusion_common::sort::SortOptions; #[derive(Debug, Clone)] /// This object tracks the aggregation phase (input/output) @@ -521,7 +521,7 @@ impl GroupedHashAggregateStream { .enumerate() .map(|(idx, field)| PhysicalSortExpr { expr: Arc::new(Column::new(field.name().as_str(), idx)) as _, - options: SortOptions::default(), + options: AdvSortOptions::default(), }) .collect(); diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index 059aaa318621..0582c87c9ada 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -55,7 +55,7 @@ use crate::{ use arrow::array::{types::UInt64Type, *}; use arrow::compute::{ - self, concat_batches, filter_record_batch, is_not_null, take, + self, concat_batches, filter_record_batch, is_not_null, take, SortOptions, }; use arrow::datatypes::{DataType, SchemaRef, TimeUnit}; use arrow::error::ArrowError; @@ -72,8 +72,9 @@ use datafusion_physical_expr::equivalence::join_equivalence_properties; use datafusion_physical_expr::PhysicalExprRef; use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement}; +use datafusion_common::sort::AdvSortOptions; +use datafusion_common::types::SortOrdering; use futures::{Stream, StreamExt}; -use datafusion_common::sort::SortOptions; /// Join execution plan that executes equi-join predicates on multiple partitions using Sort-Merge /// join algorithm and applies an optional filter post join. Can be used to join arbitrarily large @@ -189,11 +190,19 @@ impl SortMergeJoinExec { .map(|((l, r), sort_op)| { let left = PhysicalSortExpr { expr: Arc::clone(l), - options: sort_op.clone(), + options: AdvSortOptions::new( + SortOrdering::Default, + sort_op.descending, + sort_op.nulls_first, + ), }; let right = PhysicalSortExpr { expr: Arc::clone(r), - options: sort_op.clone(), + options: AdvSortOptions::new( + SortOrdering::Default, + sort_op.descending, + sort_op.nulls_first, + ), }; (left, right) }) diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index 27557f96c504..8f4a441cab9a 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -730,7 +730,11 @@ fn determine_prune_length( }; // Perform binary search on the array to determine the length of the record batch to be pruned - bisect::(&[batch_arr], &[target], &[origin_sorted_expr.options.clone()]) + bisect::( + &[batch_arr], + &[target], + &[origin_sorted_expr.options.to_arrow()?], + ) } /// This method determines if the result of the join should be produced in the final step or not. diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 555ef8d57206..380d2d832d9d 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -2646,25 +2646,25 @@ mod tests { let left_ordering = LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options: options, + options, }, PhysicalSortExpr { expr: Arc::new(Column::new("c", 2)), - options: options, + options, }, PhysicalSortExpr { expr: Arc::new(Column::new("d", 3)), - options: options, + options, }, ]); let right_ordering = LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("z", 2)), - options: options, + options, }, PhysicalSortExpr { expr: Arc::new(Column::new("y", 1)), - options: options, + options, }, ]); let join_type = JoinType::Inner; @@ -2680,45 +2680,45 @@ mod tests { Some(LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options: options, + options, }, PhysicalSortExpr { expr: Arc::new(Column::new("c", 2)), - options: options, + options, }, PhysicalSortExpr { expr: Arc::new(Column::new("d", 3)), - options: options, + options, }, PhysicalSortExpr { expr: Arc::new(Column::new("z", 7)), - options: options, + options, }, PhysicalSortExpr { expr: Arc::new(Column::new("y", 6)), - options: options, + options, }, ])), Some(LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("z", 7)), - options: options, + options, }, PhysicalSortExpr { expr: Arc::new(Column::new("y", 6)), - options: options, + options, }, PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options: options, + options, }, PhysicalSortExpr { expr: Arc::new(Column::new("c", 2)), - options: options, + options, }, PhysicalSortExpr { expr: Arc::new(Column::new("d", 3)), - options: options, + options, }, ])), ]; diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 9d11b205e02e..40e68cfcae83 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -1700,7 +1700,7 @@ mod test { let options = SortOptions::default(); LexOrdering::new(vec![PhysicalSortExpr { expr: col("c0", schema).unwrap(), - options: options, + options, }]) } diff --git a/datafusion/physical-plan/src/sorts/cursor.rs b/datafusion/physical-plan/src/sorts/cursor.rs index e97f8dcf21c7..8ea7c43d2613 100644 --- a/datafusion/physical-plan/src/sorts/cursor.rs +++ b/datafusion/physical-plan/src/sorts/cursor.rs @@ -22,9 +22,9 @@ use arrow::array::{ PrimitiveArray, }; use arrow::buffer::{Buffer, OffsetBuffer, ScalarBuffer}; +use arrow::compute::SortOptions; use arrow::datatypes::ArrowNativeTypeOp; use arrow::row::Rows; -use datafusion_common::sort::SortOptions; use datafusion_execution::memory_pool::MemoryReservation; /// A comparable collection of values for use with [`Cursor`] diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 159898566987..41b39226c9ba 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -47,7 +47,7 @@ use crate::{ use arrow::array::{ Array, RecordBatch, RecordBatchOptions, StringViewArray, UInt32Array, }; -use arrow::compute::{concat_batches, take_arrays}; +use arrow::compute::{concat_batches, lexsort_to_indices, take_arrays, SortColumn}; use arrow::datatypes::{DataType, SchemaRef}; use arrow::row::{RowConverter, SortField}; use datafusion_common::{internal_err, Result}; @@ -60,7 +60,6 @@ use datafusion_physical_expr_common::sort_expr::LexRequirement; use futures::{StreamExt, TryStreamExt}; use log::{debug, trace}; -use datafusion_common::sort::{lexsort_to_indices, SortColumn}; struct ExternalSorterMetrics { /// metrics @@ -789,7 +788,7 @@ pub(crate) fn lexsort_to_indices_multi_columns( |(mut fields, mut columns), sort_column| { fields.push(SortField::new_with_options( sort_column.values.data_type().clone(), - sort_column.options.map(|o| o.to_arrow().expect("TODO")).unwrap_or_default(), + sort_column.options.unwrap_or_default(), )); columns.push(sort_column.values); (fields, columns) diff --git a/datafusion/physical-plan/src/sorts/stream.rs b/datafusion/physical-plan/src/sorts/stream.rs index 9fe21f829759..d15e1b34aae9 100644 --- a/datafusion/physical-plan/src/sorts/stream.rs +++ b/datafusion/physical-plan/src/sorts/stream.rs @@ -103,7 +103,9 @@ impl RowCursorStream { let data_type = expr.expr.data_type(schema)?; match expr.options.to_arrow() { Ok(options) => Ok(SortField::new_with_options(data_type, options)), - Err(_) => internal_err!("Custom orderings not supported in RowCursorStream."), + Err(_) => internal_err!( + "Custom orderings not supported in RowCursorStream." + ), } }) .collect::>>()?; @@ -198,7 +200,7 @@ impl FieldCursorStream { let mut array_reservation = self.reservation.new_empty(); array_reservation.try_grow(size_in_mem)?; Ok(ArrayValues::new( - self.sort.options.clone(), + self.sort.options.to_arrow()?, array, array_reservation, )) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 004600fcd0b9..a36d3c314fae 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -29,8 +29,8 @@ use crate::spill::get_record_batch_memory_size; use crate::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream}; use arrow::array::{Array, ArrayRef, RecordBatch}; use arrow::datatypes::SchemaRef; -use datafusion_common::{HashMap, _internal_datafusion_err}; use datafusion_common::Result; +use datafusion_common::{internal_datafusion_err, HashMap}; use datafusion_execution::{ memory_pool::{MemoryConsumer, MemoryReservation}, runtime_env::RuntimeEnv, @@ -115,7 +115,9 @@ impl TopK { .map(|e| { Ok(SortField::new_with_options( e.expr.data_type(&schema)?, - e.options.to_arrow().map_err(|_| _internal_datafusion_err!("Custom sorts not supported in TopK"))?, + e.options.to_arrow().map_err(|_| { + internal_datafusion_err!("Custom sorts not supported in TopK") + })?, )) }) .collect::>()?; diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 0d9c58b3bf49..ea84d4b9350f 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -1599,7 +1599,7 @@ mod tests { let window_exprs = vec![ // LAST_VALUE(a) - Arc::new(StandardWindowExpr::new( + Arc::new(StandardWindowExpr::try_new( last_value_func, &[], &LexOrdering::default(), @@ -1610,7 +1610,7 @@ mod tests { )), )) as _, // NTH_VALUE(a, -1) - Arc::new(StandardWindowExpr::new( + Arc::new(StandardWindowExpr::try_new( nth_value_func1, &[], &LexOrdering::default(), @@ -1621,7 +1621,7 @@ mod tests { )), )) as _, // NTH_VALUE(a, -2) - Arc::new(StandardWindowExpr::new( + Arc::new(StandardWindowExpr::try_new( nth_value_func2, &[], &LexOrdering::default(), diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index d1fdb1cd27f0..5c36d431d766 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -52,7 +52,7 @@ use itertools::Itertools; // Public interface: pub use bounded_window_agg_exec::BoundedWindowAggExec; -use datafusion_common::sort::SortOptions; +use datafusion_common::sort::AdvSortOptions; use datafusion_common::types::SortOrdering; pub use datafusion_physical_expr::window::{ PlainAggregateWindowExpr, StandardWindowExpr, WindowExpr, @@ -121,12 +121,14 @@ pub fn create_window_expr( aggregate, ) } - WindowFunctionDefinition::WindowUDF(fun) => Arc::new(StandardWindowExpr::new( - create_udwf_window_expr(fun, args, input_schema, name, ignore_nulls)?, - partition_by, - order_by, - window_frame, - )), + WindowFunctionDefinition::WindowUDF(fun) => { + Arc::new(StandardWindowExpr::try_new( + create_udwf_window_expr(fun, args, input_schema, name, ignore_nulls)?, + partition_by, + order_by, + window_frame, + )?) + } }) } @@ -271,7 +273,7 @@ impl StandardWindowFunctionExpr for WindowUDFExpr { let expr = Arc::new(Column::new(field.name(), idx)); PhysicalSortExpr { expr, - options: options.clone(), + options: AdvSortOptions::with_default_ordering(options), } }) } @@ -293,10 +295,7 @@ pub(crate) fn calc_requirements< .collect::>(), ); for element in orderby_sort_exprs.into_iter() { - let PhysicalSortExpr { - expr, - options, - } = element.borrow(); + let PhysicalSortExpr { expr, options } = element.borrow(); if !sort_reqs.iter().any(|e| e.expr.eq(expr)) { sort_reqs.push(PhysicalSortRequirement::new( Arc::clone(expr), @@ -435,7 +434,11 @@ pub(crate) fn window_equivalence_properties( let new_ordering = vec![LexOrdering::new(vec![PhysicalSortExpr::new( Arc::new(window_col), - SortOptions::new(SortOrdering::Default, increasing, true), + AdvSortOptions::new( + SortOrdering::Default, + increasing, + true, + ), )])]; window_eq_properties.add_new_orderings(new_ordering); } else { @@ -444,7 +447,11 @@ pub(crate) fn window_equivalence_properties( let mut existing = lex.take_exprs(); existing.push(PhysicalSortExpr::new( Arc::new(window_col.clone()), - SortOptions::new(SortOrdering::Default, increasing, true), + AdvSortOptions::new( + SortOrdering::Default, + increasing, + true, + ), )); window_eq_properties .add_new_ordering(LexOrdering::new(existing)); @@ -479,14 +486,22 @@ pub(crate) fn window_equivalence_properties( let new_ordering = LexOrdering::new(vec![PhysicalSortExpr::new( Arc::new(window_col), - SortOptions::new(SortOrdering::Default, false, false), + AdvSortOptions::new( + SortOrdering::Default, + false, + false, + ), )]); window_eq_properties.add_new_ordering(new_ordering); } else if !increasing && (!asc || no_partitioning) { let new_ordering = LexOrdering::new(vec![PhysicalSortExpr::new( Arc::new(window_col), - SortOptions::new(SortOrdering::Default, true, false), + AdvSortOptions::new( + SortOrdering::Default, + true, + false, + ), )]); window_eq_properties.add_new_ordering(new_ordering); }; @@ -635,9 +650,12 @@ fn sort_options_resolving_constant(expr: Arc) -> Vec PhysicalSortExpr { PhysicalSortExpr { expr: col(name, schema).unwrap(), - options: options, + options, } } @@ -763,10 +781,7 @@ mod tests { descending, nulls_first, }; - orderbys.push(PhysicalSortExpr { - expr, - options: options, - }); + orderbys.push(PhysicalSortExpr { expr, options }); } let mut expected: Option = None; @@ -1005,10 +1020,7 @@ mod tests { // Give default ordering, this is same with input ordering direction // In this test we do check for reversibility. let options = SortOptions::default(); - order_by_exprs.push(PhysicalSortExpr { - expr, - options: options, - }); + order_by_exprs.push(PhysicalSortExpr { expr, options }); } let res = get_window_mode( &partition_by_exprs, @@ -1174,10 +1186,7 @@ mod tests { descending: *descending, nulls_first: *nulls_first, }; - order_by_exprs.push(PhysicalSortExpr { - expr, - options: options, - }); + order_by_exprs.push(PhysicalSortExpr { expr, options }); } assert_eq!( diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 0fd554ee5af5..6331b7fb3114 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -79,7 +79,7 @@ pub fn parse_physical_sort_expr( descending: !proto.asc, nulls_first: proto.nulls_first, }; - Ok(PhysicalSortExpr { expr, options: options }) + Ok(PhysicalSortExpr { expr, options }) } else { Err(proto_error("Unexpected empty physical expression")) } diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index 01a1c5bd2a4e..edabeeb077b7 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -180,7 +180,10 @@ pub fn serialize_physical_sort_expr( sort_expr: PhysicalSortExpr, codec: &dyn PhysicalExtensionCodec, ) -> Result { - let PhysicalSortExpr { expr, options: options } = sort_expr; + let PhysicalSortExpr { + expr, + options: options, + } = sort_expr; let expr = serialize_physical_expr(&expr, codec)?; Ok(PhysicalSortExprNode { expr: Some(Box::new(expr)), diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index a2506bb318d2..96b2b7e1f008 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -283,7 +283,7 @@ fn roundtrip_udwf() -> Result<()> { let field_b = Field::new("b", DataType::Int64, false); let schema = Arc::new(Schema::new(vec![field_a, field_b])); - let udwf_expr = Arc::new(StandardWindowExpr::new( + let udwf_expr = Arc::new(StandardWindowExpr::try_new( create_udwf_window_expr( &row_number_udwf(), &[], @@ -330,7 +330,7 @@ fn roundtrip_window() -> Result<()> { "NTH_VALUE(a, 2) PARTITION BY [b] ORDER BY [a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW".to_string(), false, )?; - let udwf_expr = Arc::new(StandardWindowExpr::new( + let udwf_expr = Arc::new(StandardWindowExpr::try_new( nth_value_window, &[col("b", &schema)?], &LexOrdering::new(vec![PhysicalSortExpr { @@ -1146,7 +1146,7 @@ fn roundtrip_udwf_extension_codec() -> Result<()> { WindowFrameBound::CurrentRow, ); - let udwf_expr = Arc::new(StandardWindowExpr::new( + let udwf_expr = Arc::new(StandardWindowExpr::try_new( udwf, &[col("b", &schema)?], &LexOrdering::new(vec![PhysicalSortExpr { From 3934a677dfa6d9f6e9cba018e7d1acef0c16e56e Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Wed, 5 Mar 2025 11:30:45 +0100 Subject: [PATCH 06/14] Use extension type registry when planning --- datafusion/catalog/src/session.rs | 4 +- .../core/src/datasource/listing/table.rs | 3 +- datafusion/core/src/datasource/memory.rs | 2 +- datafusion/core/src/datasource/mod.rs | 12 +++- datafusion/core/src/datasource/stream.rs | 12 ++-- .../core/src/execution/session_state.rs | 31 ++++++++-- datafusion/core/src/physical_planner.rs | 62 +++++++++++-------- datafusion/core/tests/memory_limit/mod.rs | 5 +- datafusion/execution/src/task.rs | 4 +- datafusion/expr/src/registry.rs | 33 +++++++++- 10 files changed, 122 insertions(+), 46 deletions(-) diff --git a/datafusion/catalog/src/session.rs b/datafusion/catalog/src/session.rs index 0f8436064ed6..31ad2fc7c73f 100644 --- a/datafusion/catalog/src/session.rs +++ b/datafusion/catalog/src/session.rs @@ -17,7 +17,6 @@ use async_trait::async_trait; use datafusion_common::config::ConfigOptions; -use datafusion_common::types::LogicalTypeRef; use datafusion_common::{DFSchema, Result}; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; @@ -29,6 +28,7 @@ use parking_lot::{Mutex, RwLock}; use std::any::Any; use std::collections::HashMap; use std::sync::{Arc, Weak}; +use datafusion_expr::registry::MemoryExtensionTypeRegistry; /// Interface for accessing [`SessionState`] from the catalog. /// @@ -115,7 +115,7 @@ pub trait Session: Send + Sync { fn window_functions(&self) -> &HashMap>; /// Return reference to extension types - fn extension_types(&self) -> &HashMap; + fn extension_types(&self) -> &MemoryExtensionTypeRegistry; /// Return the runtime env fn runtime_env(&self) -> &Arc; diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index adef02c38d73..73abf6c401ea 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -59,6 +59,7 @@ use datafusion_physical_expr_common::sort_expr::LexRequirement; use futures::{future, stream, StreamExt, TryStreamExt}; use itertools::Itertools; use object_store::ObjectStore; +use datafusion_expr::registry::EmptyExtensionTypeRegistry; /// Configuration for creating a [`ListingTable`] #[derive(Debug, Clone)] @@ -809,7 +810,7 @@ impl ListingTable { /// If file_sort_order is specified, creates the appropriate physical expressions fn try_create_output_ordering(&self) -> Result> { - create_ordering(&self.table_schema, &self.options.file_sort_order) + create_ordering(&EmptyExtensionTypeRegistry::new(), &self.table_schema, &self.options.file_sort_order) } } diff --git a/datafusion/core/src/datasource/memory.rs b/datafusion/core/src/datasource/memory.rs index b8bec410070c..4897e18ce6ac 100644 --- a/datafusion/core/src/datasource/memory.rs +++ b/datafusion/core/src/datasource/memory.rs @@ -243,9 +243,9 @@ impl TableProvider for MemTable { .iter() .map(|sort_exprs| { create_physical_sort_exprs( + state, sort_exprs, &df_schema, - state.execution_props(), ) }) .collect::>>()?; diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 2b7bb14b6f6c..7d09e08c3ae3 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -48,13 +48,15 @@ pub use crate::logical_expr::TableType; pub use datafusion_execution::object_store; pub use statistics::get_statistics_with_limit; -use arrow::compute::SortOptions; use arrow::datatypes::Schema; use datafusion_common::{plan_err, Result}; +use datafusion_common::sort::AdvSortOptions; use datafusion_expr::{Expr, SortExpr}; +use datafusion_expr::registry::ExtensionTypeRegistry; use datafusion_physical_expr::{expressions, LexOrdering, PhysicalSortExpr}; fn create_ordering( + extension_types: &impl ExtensionTypeRegistry, schema: &Schema, sort_order: &[Vec], ) -> Result> { @@ -67,9 +69,15 @@ fn create_ordering( match &sort.expr { Expr::Column(col) => match expressions::col(&col.name, schema) { Ok(expr) => { + let ordering = schema.field_with_name(&col.name)? + .extension_type_name() + .and_then(|ext| extension_types.get(ext).ok()) + .map(|ext| ext.planning_information().ordering.clone()) + .unwrap_or_default(); sort_exprs.push(PhysicalSortExpr { expr, - options: SortOptions { + options: AdvSortOptions { + ordering, descending: !sort.asc, nulls_first: sort.nulls_first, }, diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/core/src/datasource/stream.rs index d5fe070be82c..80c9418f7424 100644 --- a/datafusion/core/src/datasource/stream.rs +++ b/datafusion/core/src/datasource/stream.rs @@ -323,17 +323,18 @@ impl TableProvider for StreamTable { async fn scan( &self, - _state: &dyn Session, + state: &dyn Session, projection: Option<&Vec>, _filters: &[Expr], limit: Option, ) -> Result> { + let task_ctx = TaskContext::from(state); let projected_schema = match projection { Some(p) => { let projected = self.0.source.schema().project(p)?; - create_ordering(&projected, &self.0.order)? + create_ordering(&task_ctx, &projected, &self.0.order)? } - None => create_ordering(self.0.source.schema(), &self.0.order)?, + None => create_ordering(&task_ctx, self.0.source.schema(), &self.0.order)?, }; Ok(Arc::new(StreamingTableExec::try_new( @@ -348,14 +349,15 @@ impl TableProvider for StreamTable { async fn insert_into( &self, - _state: &dyn Session, + state: &dyn Session, input: Arc, _insert_op: InsertOp, ) -> Result> { + let task_ctx = TaskContext::from(state); let ordering = match self.0.order.first() { Some(x) => { let schema = self.0.source.schema(); - let orders = create_ordering(schema, std::slice::from_ref(x))?; + let orders = create_ordering(&task_ctx, schema, std::slice::from_ref(x))?; let ordering = orders.into_iter().next().unwrap(); Some(ordering.into_iter().map(Into::into).collect()) } diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 5265d00558bc..30fffc197a9e 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -52,7 +52,10 @@ use datafusion_execution::TaskContext; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::expr_rewriter::FunctionRewrite; use datafusion_expr::planner::{ExprPlanner, TypePlanner}; -use datafusion_expr::registry::{FunctionRegistry, SerializerRegistry}; +use datafusion_expr::registry::{ + ExtensionTypeRegistry, FunctionRegistry, MemoryExtensionTypeRegistry, + SerializerRegistry, +}; use datafusion_expr::simplify::SimplifyInfo; use datafusion_expr::var_provider::{is_system_variables, VarType}; use datafusion_expr::{ @@ -150,7 +153,7 @@ pub struct SessionState { /// Window functions registered in the context window_functions: HashMap>, /// Extension types registered in the context - extension_types: HashMap, + extension_types: MemoryExtensionTypeRegistry, /// Deserializer registry for extensions. serializer_registry: Arc, /// Holds registered external FileFormat implementations @@ -250,7 +253,7 @@ impl Session for SessionState { &self.window_functions } - fn extension_types(&self) -> &HashMap { + fn extension_types(&self) -> &MemoryExtensionTypeRegistry { &self.extension_types } @@ -1407,7 +1410,7 @@ impl SessionStateBuilder { scalar_functions: HashMap::new(), aggregate_functions: HashMap::new(), window_functions: HashMap::new(), - extension_types: HashMap::new(), + extension_types: MemoryExtensionTypeRegistry::new(), serializer_registry: serializer_registry .unwrap_or(Arc::new(EmptySerializerRegistry)), file_formats: HashMap::new(), @@ -1903,6 +1906,26 @@ impl FunctionRegistry for SessionState { } } +impl ExtensionTypeRegistry for SessionState { + fn get(&self, name: &str) -> datafusion_common::Result { + self.extension_types.get(name) + } + + fn register_type( + &mut self, + logical_type: LogicalTypeRef, + ) -> datafusion_common::Result> { + self.extension_types.register_type(logical_type) + } + + fn deregister_type( + &mut self, + name: &str, + ) -> datafusion_common::Result> { + self.extension_types.deregister_type(name) + } +} + impl OptimizerConfig for SessionState { fn query_execution_start_time(&self) -> DateTime { self.execution_props.query_execution_start_time diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 699723f0e68a..03fe2bfc9c37 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -26,7 +26,7 @@ use crate::datasource::listing::ListingTableUrl; use crate::datasource::physical_plan::FileSinkConfig; use crate::datasource::{source_as_provider, DefaultTableSource}; use crate::error::{DataFusionError, Result}; -use crate::execution::context::{ExecutionProps, SessionState}; +use crate::execution::context::SessionState; use crate::logical_expr::utils::generate_sort_key; use crate::logical_expr::{ Aggregate, EmptyRelation, Join, Projection, Sort, TableScan, Unnest, Values, Window, @@ -90,7 +90,10 @@ use datafusion_physical_plan::unnest::ListUnnest; use crate::schema_equivalence::schema_satisfied_by; use async_trait::async_trait; +use datafusion_catalog::Session; use datafusion_common::sort::AdvSortOptions; +use datafusion_common::types::SortOrdering; +use datafusion_expr::registry::ExtensionTypeRegistry; use futures::{StreamExt, TryStreamExt}; use itertools::{multiunzip, Itertools}; use log::{debug, trace}; @@ -608,13 +611,7 @@ impl DefaultPhysicalPlanner { let logical_schema = node.schema(); let window_expr = window_expr .iter() - .map(|e| { - create_window_expr( - e, - logical_schema, - session_state.execution_props(), - ) - }) + .map(|e| create_window_expr(session_state, e, logical_schema)) .collect::>>()?; let can_repartition = session_state.config().target_partitions() > 1 @@ -706,10 +703,10 @@ impl DefaultPhysicalPlanner { .iter() .map(|e| { create_aggregate_expr_and_maybe_filter( + session_state, e, logical_input_schema, &physical_input_schema, - session_state.execution_props(), ) }) .collect::>>()?; @@ -819,11 +816,8 @@ impl DefaultPhysicalPlanner { }) => { let physical_input = children.one()?; let input_dfschema = input.as_ref().schema(); - let sort_expr = create_physical_sort_exprs( - expr, - input_dfschema, - session_state.execution_props(), - )?; + let sort_expr = + create_physical_sort_exprs(session_state, expr, input_dfschema)?; let new_sort = SortExec::new(sort_expr, physical_input).with_fetch(*fetch); Arc::new(new_sort) @@ -1508,10 +1502,10 @@ pub fn is_window_frame_bound_valid(window_frame: &WindowFrame) -> bool { /// Create a window expression with a name from a logical expression pub fn create_window_expr_with_name( + session_state: &SessionState, e: &Expr, name: impl Into, logical_schema: &DFSchema, - execution_props: &ExecutionProps, ) -> Result> { let name = name.into(); let physical_schema: &Schema = &logical_schema.into(); @@ -1527,12 +1521,13 @@ pub fn create_window_expr_with_name( null_treatment, }, }) => { + let execution_props = session_state.execution_props(); let physical_args = create_physical_exprs(args, logical_schema, execution_props)?; let partition_by = create_physical_exprs(partition_by, logical_schema, execution_props)?; let order_by = - create_physical_sort_exprs(order_by, logical_schema, execution_props)?; + create_physical_sort_exprs(session_state, order_by, logical_schema)?; if !is_window_frame_bound_valid(window_frame) { return plan_err!( @@ -1561,16 +1556,16 @@ pub fn create_window_expr_with_name( /// Create a window expression from a logical expression or an alias pub fn create_window_expr( + session_state: &SessionState, e: &Expr, logical_schema: &DFSchema, - execution_props: &ExecutionProps, ) -> Result> { // unpack aliased logical expressions, e.g. "sum(col) over () as total" let (name, e) = match e { Expr::Alias(Alias { expr, name, .. }) => (name.clone(), expr.as_ref()), _ => (e.schema_name().to_string(), e), }; - create_window_expr_with_name(e, name, logical_schema, execution_props) + create_window_expr_with_name(session_state, e, name, logical_schema) } type AggregateExprWithOptionalArgs = ( @@ -1583,11 +1578,11 @@ type AggregateExprWithOptionalArgs = ( /// Create an aggregate expression with a name from a logical expression pub fn create_aggregate_expr_with_name_and_maybe_filter( + session: &dyn Session, e: &Expr, name: Option, logical_input_schema: &DFSchema, physical_input_schema: &Schema, - execution_props: &ExecutionProps, ) -> Result { match e { Expr::AggregateFunction(AggregateFunction { @@ -1607,6 +1602,7 @@ pub fn create_aggregate_expr_with_name_and_maybe_filter( physical_name(e)? }; + let execution_props = session.execution_props(); let physical_args = create_physical_exprs(args, logical_input_schema, execution_props)?; let filter = match filter { @@ -1624,9 +1620,9 @@ pub fn create_aggregate_expr_with_name_and_maybe_filter( let (agg_expr, filter, order_by) = { let physical_sort_exprs = match order_by { Some(exprs) => Some(create_physical_sort_exprs( + session, exprs, logical_input_schema, - execution_props, )?), None => None, }; @@ -1655,10 +1651,10 @@ pub fn create_aggregate_expr_with_name_and_maybe_filter( /// Create an aggregate expression from a logical expression or an alias pub fn create_aggregate_expr_and_maybe_filter( + session: &dyn Session, e: &Expr, logical_input_schema: &DFSchema, physical_input_schema: &Schema, - execution_props: &ExecutionProps, ) -> Result { // unpack (nested) aliased logical expressions, e.g. "sum(col) as total" let (name, e) = match e { @@ -1668,28 +1664,42 @@ pub fn create_aggregate_expr_and_maybe_filter( }; create_aggregate_expr_with_name_and_maybe_filter( + session, e, name, logical_input_schema, physical_input_schema, - execution_props, ) } /// Create a physical sort expression from a logical expression pub fn create_physical_sort_expr( + session: &dyn Session, e: &SortExpr, input_dfschema: &DFSchema, - execution_props: &ExecutionProps, ) -> Result { + // TODO this is not a nice solution. Somewhat related to the discussion in #14247 as we would + // a field method for PhysicalExpr. + let extension_types = session.extension_types(); + let ordering = match &e.expr { + Expr::Column(name) => input_dfschema + .field_from_column(name)? + .extension_type_name() + .and_then(|ext| extension_types.get(ext).ok()) + .map(|ext| ext.planning_information().ordering.clone()) + .unwrap_or_default(), + _ => SortOrdering::Default, + }; + let SortExpr { expr, asc, nulls_first, } = e; Ok(PhysicalSortExpr { - expr: create_physical_expr(expr, input_dfschema, execution_props)?, + expr: create_physical_expr(expr, input_dfschema, session.execution_props())?, options: AdvSortOptions { + ordering, descending: !asc, nulls_first: *nulls_first, }, @@ -1698,13 +1708,13 @@ pub fn create_physical_sort_expr( /// Create vector of physical sort expression from a vector of logical expression pub fn create_physical_sort_exprs( + session: &dyn Session, exprs: &[SortExpr], input_dfschema: &DFSchema, - execution_props: &ExecutionProps, ) -> Result { exprs .iter() - .map(|expr| create_physical_sort_expr(expr, input_dfschema, execution_props)) + .map(|expr| create_physical_sort_expr(session, expr, input_dfschema)) .collect::>() } diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index 2deb8fde2da6..2b38fd4e1e9a 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -56,6 +56,8 @@ use test_utils::AccessLogGenerator; use async_trait::async_trait; use futures::StreamExt; use tokio::fs::File; +use datafusion_common::sort::AdvSortOptions; +use datafusion_common::types::SortOrdering; #[cfg(test)] #[ctor::ctor] @@ -724,7 +726,8 @@ impl Scenario { .collect(); let schema = batches[0][0].schema(); - let options = SortOptions { + let options = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; diff --git a/datafusion/execution/src/task.rs b/datafusion/execution/src/task.rs index 1f1ee5dc03d8..dac838fa0acd 100644 --- a/datafusion/execution/src/task.rs +++ b/datafusion/execution/src/task.rs @@ -85,7 +85,7 @@ impl TaskContext { scalar_functions: HashMap>, aggregate_functions: HashMap>, window_functions: HashMap>, - extension_types: HashMap, + extension_types: MemoryExtensionTypeRegistry, runtime: Arc, ) -> Self { Self { @@ -95,7 +95,7 @@ impl TaskContext { scalar_functions, aggregate_functions, window_functions, - extension_types: MemoryExtensionTypeRegistry::from(extension_types), + extension_types, runtime, } } diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index a27d39277faa..b31eaf997875 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -21,7 +21,9 @@ use crate::expr_rewriter::FunctionRewrite; use crate::planner::ExprPlanner; use crate::{AggregateUDF, ScalarUDF, UserDefinedLogicalNode, WindowUDF}; use datafusion_common::types::{LogicalTypeRef, TypeSignature}; -use datafusion_common::{internal_err, not_impl_err, plan_datafusion_err, Result}; +use datafusion_common::{ + internal_err, not_impl_err, plan_datafusion_err, plan_err, Result, +}; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::sync::Arc; @@ -224,7 +226,7 @@ pub trait ExtensionTypeRegistry { } /// An [`ExtensionTypeRegistry`] that uses in memory [`HashMap`]s. -#[derive(Default, Debug)] +#[derive(Clone, Default, Debug)] pub struct MemoryExtensionTypeRegistry { /// Holds a mapping between the name of an extension type and its logical type. extension_types: HashMap, @@ -270,3 +272,30 @@ impl From> for MemoryExtensionTypeRegistry { } } } + +/// Represents an [ExtensionTypeRegistry] with no registered extension types. +pub struct EmptyExtensionTypeRegistry; + +impl EmptyExtensionTypeRegistry { + /// Creates a new [EmptyExtensionTypeRegistry]. + pub fn new() -> Self { + Self {} + } +} + +impl ExtensionTypeRegistry for EmptyExtensionTypeRegistry { + fn get(&self, _name: &str) -> Result { + plan_err!("Extension type not found.") + } + + fn register_type( + &mut self, + _logical_type: LogicalTypeRef, + ) -> Result> { + plan_err!("Cannot register type.") + } + + fn deregister_type(&mut self, _name: &str) -> Result> { + plan_err!("Cannot deregister type.") + } +} From 383ee7c22d3853c5d372b20f32529965d8bde85f Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Wed, 5 Mar 2025 12:29:18 +0100 Subject: [PATCH 07/14] Fix compiler errors in some tests --- datafusion/core/tests/memory_limit/mod.rs | 3 +- .../enforce_distribution.rs | 70 +++++++++---------- .../physical_optimizer/enforce_sorting.rs | 3 +- .../physical_optimizer/limit_pushdown.rs | 4 +- .../limited_distinct_aggregation.rs | 5 +- .../physical_optimizer/projection_pushdown.rs | 30 ++++---- .../replace_with_order_preserving_variants.rs | 3 +- .../tests/physical_optimizer/test_utils.rs | 3 +- 8 files changed, 62 insertions(+), 59 deletions(-) diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index 2b38fd4e1e9a..ceca4661efc4 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -24,7 +24,6 @@ use std::sync::{Arc, LazyLock}; #[cfg(feature = "extended_tests")] mod memory_limit_validation; use arrow::array::{ArrayRef, DictionaryArray, Int32Array, RecordBatch, StringViewArray}; -use arrow::compute::SortOptions; use arrow::datatypes::{Int32Type, SchemaRef}; use arrow_schema::{DataType, Field, Schema}; use datafusion::assert_batches_eq; @@ -734,7 +733,7 @@ impl Scenario { let sort_information = vec![LexOrdering::new(vec![ PhysicalSortExpr { expr: col("a", &schema).unwrap(), - options, + options: options.clone(), }, PhysicalSortExpr { expr: col("b", &schema).unwrap(), diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index ac1bef6b13d2..e8d699006ea1 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -25,7 +25,6 @@ use crate::physical_optimizer::test_utils::{ }; use crate::physical_optimizer::test_utils::{parquet_exec_with_sort, trim_plan_display}; -use arrow::compute::SortOptions; use datafusion::config::ConfigOptions; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; use datafusion::datasource::listing::PartitionedFile; @@ -35,6 +34,7 @@ use datafusion::datasource::source::DataSourceExec; use datafusion_common::error::Result; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::ScalarValue; +use datafusion_common::sort::AdvSortOptions; use datafusion_expr::{JoinType, Operator}; use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal}; use datafusion_physical_expr::PhysicalExpr; @@ -1653,7 +1653,7 @@ fn merge_does_not_need_sort() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("a", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); // Scan some sorted parquet files @@ -1855,7 +1855,7 @@ fn repartition_sorted_limit() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan = limit_exec(sort_exec(sort_key, parquet_exec(), false)); @@ -1877,7 +1877,7 @@ fn repartition_sorted_limit_with_filter() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan = sort_required_exec_with_req( filter_exec(sort_exec(sort_key.clone(), parquet_exec(), false)), @@ -1956,7 +1956,7 @@ fn repartition_through_sort_preserving_merge() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan = sort_preserving_merge_exec(sort_key, parquet_exec()); @@ -1977,7 +1977,7 @@ fn repartition_ignores_sort_preserving_merge() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan = sort_preserving_merge_exec( sort_key.clone(), @@ -2009,7 +2009,7 @@ fn repartition_ignores_sort_preserving_merge_with_union() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let input = union_exec(vec![parquet_exec_with_sort(vec![sort_key.clone()]); 2]); let plan = sort_preserving_merge_exec(sort_key, input); @@ -2043,7 +2043,7 @@ fn repartition_does_not_destroy_sort() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("d", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan = sort_required_exec_with_req( filter_exec(parquet_exec_with_sort(vec![sort_key.clone()])), @@ -2078,7 +2078,7 @@ fn repartition_does_not_destroy_sort_more_complex() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let input1 = sort_required_exec_with_req( parquet_exec_with_sort(vec![sort_key.clone()]), @@ -2120,7 +2120,7 @@ fn repartition_transitively_with_projection() -> Result<()> { let proj = Arc::new(ProjectionExec::try_new(proj_exprs, parquet_exec())?); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("sum", &proj.schema()).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan = sort_preserving_merge_exec(sort_key, proj); @@ -2153,7 +2153,7 @@ fn repartition_ignores_transitively_with_projection() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let alias = vec![ ("a".to_string(), "a".to_string()), @@ -2186,7 +2186,7 @@ fn repartition_transitively_past_sort_with_projection() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let alias = vec![ ("a".to_string(), "a".to_string()), @@ -2219,7 +2219,7 @@ fn repartition_transitively_past_sort_with_filter() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("a", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan = sort_exec(sort_key, filter_exec(parquet_exec()), false); @@ -2253,7 +2253,7 @@ fn repartition_transitively_past_sort_with_projection_and_filter() -> Result<()> let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("a", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan = sort_exec( sort_key, @@ -2323,7 +2323,7 @@ fn parallelization_multiple_files() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("a", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan = filter_exec(parquet_exec_multiple_sorted(vec![sort_key.clone()])); @@ -2476,7 +2476,7 @@ fn parallelization_sorted_limit() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan_parquet = limit_exec(sort_exec(sort_key.clone(), parquet_exec(), false)); let plan_csv = limit_exec(sort_exec(sort_key, csv_exec(), false)); @@ -2508,7 +2508,7 @@ fn parallelization_limit_with_filter() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan_parquet = limit_exec(filter_exec(sort_exec( sort_key.clone(), @@ -2629,7 +2629,7 @@ fn parallelization_prior_to_sort_preserving_merge() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); // sort preserving merge already sorted input, let plan_parquet = sort_preserving_merge_exec( @@ -2657,7 +2657,7 @@ fn parallelization_sort_preserving_merge_with_union() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); // 2 sorted parquet files unioned (partitions are concatenated, sort is preserved) let input_parquet = @@ -2691,7 +2691,7 @@ fn parallelization_does_not_benefit() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); // SortRequired // Parquet(sorted) @@ -2723,7 +2723,7 @@ fn parallelization_ignores_transitively_with_projection_parquet() -> Result<()> let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); //Projection(a as a2, b as b2) @@ -2735,7 +2735,7 @@ fn parallelization_ignores_transitively_with_projection_parquet() -> Result<()> projection_exec_with_alias(parquet_exec_with_sort(vec![sort_key]), alias_pairs); let sort_key_after_projection = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c2", &proj_parquet.schema()).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan_parquet = sort_preserving_merge_exec(sort_key_after_projection, proj_parquet); @@ -2762,7 +2762,7 @@ fn parallelization_ignores_transitively_with_projection_csv() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); //Projection(a as a2, b as b2) @@ -2775,7 +2775,7 @@ fn parallelization_ignores_transitively_with_projection_csv() -> Result<()> { projection_exec_with_alias(csv_exec_with_sort(vec![sort_key]), alias_pairs); let sort_key_after_projection = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c2", &proj_csv.schema()).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan_csv = sort_preserving_merge_exec(sort_key_after_projection, proj_csv); let expected = &[ @@ -2825,7 +2825,7 @@ fn remove_unnecessary_spm_after_filter() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]); let physical_plan = sort_preserving_merge_exec(sort_key, filter_exec(input)); @@ -2850,7 +2850,7 @@ fn preserve_ordering_through_repartition() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("d", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]); let physical_plan = sort_preserving_merge_exec(sort_key, filter_exec(input)); @@ -2873,7 +2873,7 @@ fn do_not_preserve_ordering_through_repartition() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("a", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]); let physical_plan = sort_preserving_merge_exec(sort_key, filter_exec(input)); @@ -2905,7 +2905,7 @@ fn no_need_for_sort_after_filter() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]); let physical_plan = sort_preserving_merge_exec(sort_key, filter_exec(input)); @@ -2929,13 +2929,13 @@ fn do_not_preserve_ordering_through_repartition2() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let input = parquet_exec_multiple_sorted(vec![sort_key]); let sort_req = LexOrdering::new(vec![PhysicalSortExpr { expr: col("a", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let physical_plan = sort_preserving_merge_exec(sort_req, filter_exec(input)); @@ -2967,7 +2967,7 @@ fn do_not_preserve_ordering_through_repartition3() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let input = parquet_exec_multiple_sorted(vec![sort_key]); let physical_plan = filter_exec(input); @@ -2988,7 +2988,7 @@ fn do_not_put_sort_when_input_is_invalid() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("a", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let input = parquet_exec(); let physical_plan = sort_required_exec_with_req(filter_exec(input), sort_key); @@ -3025,7 +3025,7 @@ fn put_sort_when_input_is_valid() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("a", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]); let physical_plan = sort_required_exec_with_req(filter_exec(input), sort_key); @@ -3062,7 +3062,7 @@ fn do_not_add_unnecessary_hash() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let alias = vec![("a".to_string(), "a".to_string())]; let input = parquet_exec_with_sort(vec![sort_key]); @@ -3085,7 +3085,7 @@ fn do_not_add_unnecessary_hash2() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let alias = vec![("a".to_string(), "a".to_string())]; let input = parquet_exec_multiple_sorted(vec![sort_key]); diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index 3223768acb74..319612c32d24 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -62,6 +62,7 @@ use datafusion_functions_aggregate::count::count_udaf; use datafusion_functions_aggregate::min_max::{max_udaf, min_udaf}; use rstest::rstest; +use datafusion_common::sort::AdvSortOptions; /// Create a csv exec for tests fn csv_exec_ordered( @@ -3366,7 +3367,7 @@ async fn test_preserve_needed_coalesce() -> Result<()> { let schema = schema(); let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("a", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let plan: Arc = single_partitioned_aggregate(plan, vec![("a".to_string(), "a1".to_string())]); diff --git a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs index dd2c1960a658..4d5cf93fc0dd 100644 --- a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs @@ -17,10 +17,10 @@ use std::sync::Arc; -use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::config::ConfigOptions; use datafusion_common::error::Result; +use datafusion_common::sort::AdvSortOptions; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::Operator; use datafusion_physical_expr::expressions::BinaryExpr; @@ -317,7 +317,7 @@ fn pushes_global_limit_into_multiple_fetch_plans() -> Result<()> { let sort = sort_exec( vec![PhysicalSortExpr { expr: col("c1", &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }], repartition, ); diff --git a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs index f9810eab8f59..1cd43c6085e0 100644 --- a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs +++ b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs @@ -25,8 +25,9 @@ use crate::physical_optimizer::test_utils::{ }; use arrow::datatypes::DataType; -use arrow::{compute::SortOptions, util::pretty::pretty_format_batches}; +use arrow::util::pretty::pretty_format_batches; use datafusion::prelude::SessionContext; +use datafusion_common::sort::AdvSortOptions; use datafusion_common::Result; use datafusion_execution::config::SessionConfig; use datafusion_expr::Operator; @@ -238,7 +239,7 @@ async fn test_distinct_cols_different_than_group_by_cols() -> Result<()> { fn test_has_order_by() -> Result<()> { let sort_key = LexOrdering::new(vec![PhysicalSortExpr { expr: col("a", &schema()).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let source = parquet_exec_with_sort(vec![sort_key]); let schema = source.schema(); diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 836758b21318..b857e9aaf232 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -18,7 +18,6 @@ use std::any::Any; use std::sync::Arc; -use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::memory::MemorySourceConfig; @@ -58,6 +57,7 @@ use datafusion_physical_plan::union::UnionExec; use datafusion_physical_plan::{get_plan_string, ExecutionPlan}; use itertools::Itertools; +use datafusion_common::sort::AdvSortOptions; /// Mocked UDF #[derive(Debug)] @@ -510,16 +510,16 @@ fn test_streaming_table_after_projection() -> Result<()> { LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("e", 2)), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, ]), LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::new(Column::new("d", 3)), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]), ] .into_iter(), @@ -570,16 +570,16 @@ fn test_streaming_table_after_projection() -> Result<()> { LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("e", 1)), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, PhysicalSortExpr { expr: Arc::new(Column::new("a", 2)), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, ]), LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::new(Column::new("d", 0)), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]), ] ); @@ -644,7 +644,7 @@ fn test_output_req_after_projection() -> Result<()> { Some(LexRequirement::new(vec![ PhysicalSortRequirement { expr: Arc::new(Column::new("b", 1)), - options: Some(SortOptions::default()), + options: Some(AdvSortOptions::default()), }, PhysicalSortRequirement { expr: Arc::new(BinaryExpr::new( @@ -652,7 +652,7 @@ fn test_output_req_after_projection() -> Result<()> { Operator::Plus, Arc::new(Column::new("a", 0)), )), - options: Some(SortOptions::default()), + options: Some(AdvSortOptions::default()), }, ])), Distribution::HashPartitioned(vec![ @@ -690,7 +690,7 @@ fn test_output_req_after_projection() -> Result<()> { let expected_reqs = LexRequirement::new(vec![ PhysicalSortRequirement { expr: Arc::new(Column::new("b", 2)), - options: Some(SortOptions::default()), + options: Some(AdvSortOptions::default()), }, PhysicalSortRequirement { expr: Arc::new(BinaryExpr::new( @@ -698,7 +698,7 @@ fn test_output_req_after_projection() -> Result<()> { Operator::Plus, Arc::new(Column::new("new_a", 1)), )), - options: Some(SortOptions::default()), + options: Some(AdvSortOptions::default()), }, ]); assert_eq!( @@ -1249,7 +1249,7 @@ fn test_sort_after_projection() -> Result<()> { LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("b", 1)), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, PhysicalSortExpr { expr: Arc::new(BinaryExpr::new( @@ -1257,7 +1257,7 @@ fn test_sort_after_projection() -> Result<()> { Operator::Plus, Arc::new(Column::new("a", 0)), )), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, ]), csv.clone(), @@ -1299,7 +1299,7 @@ fn test_sort_preserving_after_projection() -> Result<()> { LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("b", 1)), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, PhysicalSortExpr { expr: Arc::new(BinaryExpr::new( @@ -1307,7 +1307,7 @@ fn test_sort_preserving_after_projection() -> Result<()> { Operator::Plus, Arc::new(Column::new("a", 0)), )), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, ]), csv.clone(), diff --git a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs index 58eb866c590c..b4230e3af422 100644 --- a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs +++ b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs @@ -51,6 +51,7 @@ use object_store::memory::InMemory; use object_store::ObjectStore; use rstest::rstest; use url::Url; +use datafusion_common::sort::AdvSortOptions; /// Runs the `replace_with_order_preserving_variants` sub-rule and asserts /// the plan against the original and expected plans. @@ -1141,7 +1142,7 @@ fn sort_expr_options( ) -> PhysicalSortExpr { PhysicalSortExpr { expr: col(name, schema).unwrap(), - options, + options: AdvSortOptions::with_default_ordering(options), } } diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index 0b9c3b80bb93..3e3fcc7c4429 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -30,6 +30,7 @@ use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::physical_plan::ParquetSource; use datafusion::datasource::source::DataSourceExec; use datafusion_common::config::ConfigOptions; +use datafusion_common::sort::AdvSortOptions; use datafusion_common::stats::Precision; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::utils::expr::COUNT_STAR_EXPANSION; @@ -199,7 +200,7 @@ pub fn sort_expr_options( ) -> PhysicalSortExpr { PhysicalSortExpr { expr: col(name, schema).unwrap(), - options, + options: AdvSortOptions::with_default_ordering(options), } } From c7a180bf69b4c73eed0067ea0c959a406f0fde47 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Wed, 5 Mar 2025 12:29:39 +0100 Subject: [PATCH 08/14] Formatting --- datafusion/catalog/src/session.rs | 2 +- datafusion/core/src/datasource/listing/table.rs | 8 ++++++-- datafusion/core/src/datasource/memory.rs | 6 +----- datafusion/core/src/datasource/mod.rs | 7 ++++--- datafusion/core/tests/memory_limit/mod.rs | 4 ++-- .../core/tests/physical_optimizer/enforce_distribution.rs | 2 +- .../core/tests/physical_optimizer/enforce_sorting.rs | 2 +- .../core/tests/physical_optimizer/projection_pushdown.rs | 2 +- .../replace_with_order_preserving_variants.rs | 2 +- 9 files changed, 18 insertions(+), 17 deletions(-) diff --git a/datafusion/catalog/src/session.rs b/datafusion/catalog/src/session.rs index 31ad2fc7c73f..c0c82e7f41bc 100644 --- a/datafusion/catalog/src/session.rs +++ b/datafusion/catalog/src/session.rs @@ -22,13 +22,13 @@ use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; use datafusion_expr::execution_props::ExecutionProps; +use datafusion_expr::registry::MemoryExtensionTypeRegistry; use datafusion_expr::{AggregateUDF, Expr, LogicalPlan, ScalarUDF, WindowUDF}; use datafusion_physical_plan::{ExecutionPlan, PhysicalExpr}; use parking_lot::{Mutex, RwLock}; use std::any::Any; use std::collections::HashMap; use std::sync::{Arc, Weak}; -use datafusion_expr::registry::MemoryExtensionTypeRegistry; /// Interface for accessing [`SessionState`] from the catalog. /// diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 73abf6c401ea..4bc85108eb4d 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -55,11 +55,11 @@ use datafusion_physical_expr::{ use async_trait::async_trait; use datafusion_catalog::Session; +use datafusion_expr::registry::EmptyExtensionTypeRegistry; use datafusion_physical_expr_common::sort_expr::LexRequirement; use futures::{future, stream, StreamExt, TryStreamExt}; use itertools::Itertools; use object_store::ObjectStore; -use datafusion_expr::registry::EmptyExtensionTypeRegistry; /// Configuration for creating a [`ListingTable`] #[derive(Debug, Clone)] @@ -810,7 +810,11 @@ impl ListingTable { /// If file_sort_order is specified, creates the appropriate physical expressions fn try_create_output_ordering(&self) -> Result> { - create_ordering(&EmptyExtensionTypeRegistry::new(), &self.table_schema, &self.options.file_sort_order) + create_ordering( + &EmptyExtensionTypeRegistry::new(), + &self.table_schema, + &self.options.file_sort_order, + ) } } diff --git a/datafusion/core/src/datasource/memory.rs b/datafusion/core/src/datasource/memory.rs index 4897e18ce6ac..79c053c71d0e 100644 --- a/datafusion/core/src/datasource/memory.rs +++ b/datafusion/core/src/datasource/memory.rs @@ -242,11 +242,7 @@ impl TableProvider for MemTable { let file_sort_order = sort_order .iter() .map(|sort_exprs| { - create_physical_sort_exprs( - state, - sort_exprs, - &df_schema, - ) + create_physical_sort_exprs(state, sort_exprs, &df_schema) }) .collect::>>()?; source = source.try_with_sort_information(file_sort_order)?; diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 7d09e08c3ae3..1dc18f0dfbd0 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -49,10 +49,10 @@ pub use datafusion_execution::object_store; pub use statistics::get_statistics_with_limit; use arrow::datatypes::Schema; -use datafusion_common::{plan_err, Result}; use datafusion_common::sort::AdvSortOptions; -use datafusion_expr::{Expr, SortExpr}; +use datafusion_common::{plan_err, Result}; use datafusion_expr::registry::ExtensionTypeRegistry; +use datafusion_expr::{Expr, SortExpr}; use datafusion_physical_expr::{expressions, LexOrdering, PhysicalSortExpr}; fn create_ordering( @@ -69,7 +69,8 @@ fn create_ordering( match &sort.expr { Expr::Column(col) => match expressions::col(&col.name, schema) { Ok(expr) => { - let ordering = schema.field_with_name(&col.name)? + let ordering = schema + .field_with_name(&col.name)? .extension_type_name() .and_then(|ext| extension_types.get(ext).ok()) .map(|ext| ext.planning_information().ordering.clone()) diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index ceca4661efc4..0b572951da99 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -53,10 +53,10 @@ use rand::Rng; use test_utils::AccessLogGenerator; use async_trait::async_trait; -use futures::StreamExt; -use tokio::fs::File; use datafusion_common::sort::AdvSortOptions; use datafusion_common::types::SortOrdering; +use futures::StreamExt; +use tokio::fs::File; #[cfg(test)] #[ctor::ctor] diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index e8d699006ea1..8d5f1fdbb7e8 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -32,9 +32,9 @@ use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::datasource::physical_plan::{CsvSource, FileScanConfig, ParquetSource}; use datafusion::datasource::source::DataSourceExec; use datafusion_common::error::Result; +use datafusion_common::sort::AdvSortOptions; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::ScalarValue; -use datafusion_common::sort::AdvSortOptions; use datafusion_expr::{JoinType, Operator}; use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal}; use datafusion_physical_expr::PhysicalExpr; diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index 319612c32d24..eab878fb081e 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -61,8 +61,8 @@ use datafusion_functions_aggregate::average::avg_udaf; use datafusion_functions_aggregate::count::count_udaf; use datafusion_functions_aggregate::min_max::{max_udaf, min_udaf}; -use rstest::rstest; use datafusion_common::sort::AdvSortOptions; +use rstest::rstest; /// Create a csv exec for tests fn csv_exec_ordered( diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index b857e9aaf232..65340aaf5b01 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -56,8 +56,8 @@ use datafusion_physical_plan::streaming::StreamingTableExec; use datafusion_physical_plan::union::UnionExec; use datafusion_physical_plan::{get_plan_string, ExecutionPlan}; -use itertools::Itertools; use datafusion_common::sort::AdvSortOptions; +use itertools::Itertools; /// Mocked UDF #[derive(Debug)] diff --git a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs index b4230e3af422..58154b7070e8 100644 --- a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs +++ b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs @@ -47,11 +47,11 @@ use datafusion_physical_expr::PhysicalSortExpr; use datafusion_physical_optimizer::enforce_sorting::replace_with_order_preserving_variants::{replace_with_order_preserving_variants, OrderPreservationContext}; use datafusion_common::config::ConfigOptions; +use datafusion_common::sort::AdvSortOptions; use object_store::memory::InMemory; use object_store::ObjectStore; use rstest::rstest; use url::Url; -use datafusion_common::sort::AdvSortOptions; /// Runs the `replace_with_order_preserving_variants` sub-rule and asserts /// the plan against the original and expected plans. From 70e6ae045cd6fc867439228bdc6ad65f0faca8c1 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Wed, 5 Mar 2025 14:07:59 +0100 Subject: [PATCH 09/14] Implement most of the custom ordering infrastructure --- datafusion/common/src/sort.rs | 199 +++++++++++++----- datafusion/common/src/types/field.rs | 15 ++ datafusion/common/src/types/logical.rs | 57 ++++- datafusion/core/src/datasource/mod.rs | 2 +- datafusion/core/src/execution/context/mod.rs | 22 ++ .../core/src/execution/session_state.rs | 12 +- datafusion/core/src/physical_planner.rs | 2 +- datafusion/core/tests/dataframe/mod.rs | 28 +-- datafusion/core/tests/dataframe/test_types.rs | 94 +++++++++ datafusion/execution/src/task.rs | 12 +- datafusion/expr/src/registry.rs | 18 +- datafusion/physical-plan/src/sorts/sort.rs | 63 +----- 12 files changed, 380 insertions(+), 144 deletions(-) create mode 100644 datafusion/core/tests/dataframe/test_types.rs diff --git a/datafusion/common/src/sort.rs b/datafusion/common/src/sort.rs index be056f80efdd..8ac0b32a48a3 100644 --- a/datafusion/common/src/sort.rs +++ b/datafusion/common/src/sort.rs @@ -1,9 +1,11 @@ -use crate::error::_internal_err; +use crate::error::{_exec_err, _internal_err}; use crate::types::SortOrdering; use crate::Result; use arrow::array::{ArrayRef, DynComparator, UInt32Array}; -use arrow::compute::SortOptions; -use arrow::error::ArrowError; +use arrow::compute::{partial_sort, SortColumn, SortOptions}; +use std::cmp::Ordering; +use arrow::datatypes::DataType; +use arrow::row::{RowConverter, SortField}; /// TODO #[derive(Clone, Debug, Default, Hash, PartialEq, Eq)] @@ -82,11 +84,21 @@ pub struct AdvSortColumn { } impl AdvSortColumn { - pub fn dyn_compartor(&self) -> DynComparator { - todo!() + pub fn dyn_compartor(&self) -> Result { + let ordering = self + .options + .as_ref() + .map(|opt| opt.ordering.clone()) + .unwrap_or_default(); + let options = self + .options + .as_ref() + .and_then(|opt| opt.to_arrow().ok()) + .unwrap_or_default(); + ordering.dyn_comparator(self.values.clone(), options) } - pub fn to_arrow(&self) -> Result { + pub fn to_arrow(&self) -> Result { let has_custom_sort = self .options .as_ref() @@ -94,7 +106,7 @@ impl AdvSortColumn { .unwrap_or(false); match has_custom_sort { true => _internal_err!("Cannot create arrow SortColumn with custom sort"), - false => Ok(arrow::compute::SortColumn { + false => Ok(SortColumn { values: self.values.clone(), options: self.options.as_ref().map(|o| o.to_arrow().unwrap()), }), @@ -102,55 +114,148 @@ impl AdvSortColumn { } } +/// A lexicographical comparator that wraps given array data (columns) and can lexicographically compare data +/// at given two indices. The lifetime is the same at the data wrapped. +pub struct LexicographicalComparator { + compare_items: Vec, +} + +impl LexicographicalComparator { + /// lexicographically compare values at the wrapped columns with given indices. + pub fn compare(&self, a_idx: usize, b_idx: usize) -> Ordering { + for comparator in &self.compare_items { + match comparator(a_idx, b_idx) { + Ordering::Equal => continue, + r => return r, + } + } + Ordering::Equal + } + + /// Create a new lex comparator that will wrap the given sort columns and give comparison + /// results with two indices. + pub fn new(compare_items: Vec) -> LexicographicalComparator { + LexicographicalComparator { compare_items } + } +} + /// Sort elements lexicographically from a list of `ArrayRef` into an unsigned integer /// (`UInt32Array`) of indices. pub fn lexsort_to_indices( columns: &[AdvSortColumn], - limit: Option, -) -> std::result::Result { + fetch: Option, +) -> Result { if columns.is_empty() { - return Err(ArrowError::InvalidArgumentError( - "Sort requires at least one column".to_string(), - )); + return _exec_err!("Sort requires at least one column"); } let all_columns_default_ordering = columns .iter() .map(|c| c.to_arrow()) .collect::>>(); - if let Ok(columns) = all_columns_default_ordering { - return arrow::compute::lexsort_to_indices(&columns, limit); - } - - todo!("Custom sorting not yet implemented.") - // - // if columns.len() == 1 && can_sort_to_indices(columns[0].values.data_type()) { - // // fallback to non-lexical sort - // let column = &columns[0]; - // return sort_to_indices(&column.values, column.options, limit); - // } - // - // let row_count = columns[0].values.len(); - // if columns.iter().any(|item| item.values.len() != row_count) { - // return Err(ArrowError::ComputeError( - // "lexical sort columns have different row counts".to_string(), - // )); - // }; - // - // let mut value_indices = (0..row_count).collect::>(); - // let mut len = value_indices.len(); - // - // if let Some(limit) = limit { - // len = limit.min(len); - // } - // - // let lexicographical_comparator = LexicographicalComparator::try_new(columns)?; - // // uint32 can be sorted unstably - // sort_unstable_by(&mut value_indices, len, |a, b| { - // lexicographical_comparator.compare(*a, *b) - // }); - // - // Ok(UInt32Array::from_iter_values( - // value_indices.iter().take(len).map(|i| *i as u32), - // )) + if let Ok(sort_columns) = all_columns_default_ordering { + if is_multi_column_with_lists(&sort_columns) { + // lex_sort_to_indices doesn't support List with more than one column + // https://github.com/apache/arrow-rs/issues/5454 + lexsort_to_indices_multi_columns(sort_columns, fetch)? + } else { + arrow::compute::lexsort_to_indices(&sort_columns, fetch)? + }; + } + + if columns.len() == 1 { + // fallback to non-lexical sort + let column = &columns[0]; + let options = column + .options + .as_ref() + .expect("Otherwise fallback to arrow earlier"); + return options.ordering.sort_to_indices( + &column.values, + SortOptions::new(options.descending, options.nulls_first), + fetch, + ); + } + + let row_count = columns[0].values.len(); + if columns.iter().any(|item| item.values.len() != row_count) { + return _exec_err!("lexical sort columns have different row counts"); + }; + + let mut value_indices = (0..row_count).collect::>(); + let mut len = value_indices.len(); + + if let Some(limit) = fetch { + len = limit.min(len); + } + + let compare_items = columns.iter() + .map(|c| c.dyn_compartor()) + .collect::>>()?; + + let lexicographical_comparator = LexicographicalComparator::new(compare_items); + // uint32 can be sorted unstably + sort_unstable_by(&mut value_indices, len, |a, b| { + lexicographical_comparator.compare(*a, *b) + }); + + Ok(UInt32Array::from_iter_values( + value_indices.iter().take(len).map(|i| *i as u32), + )) +} + +#[inline] +fn is_multi_column_with_lists(sort_columns: &[SortColumn]) -> bool { + sort_columns.iter().any(|c| { + matches!( + c.values.data_type(), + DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _) + ) + }) +} + +pub(crate) fn lexsort_to_indices_multi_columns( + sort_columns: Vec, + limit: Option, +) -> Result { + let (fields, columns) = sort_columns.into_iter().fold( + (vec![], vec![]), + |(mut fields, mut columns), sort_column| { + fields.push(SortField::new_with_options( + sort_column.values.data_type().clone(), + sort_column.options.unwrap_or_default(), + )); + columns.push(sort_column.values); + (fields, columns) + }, + ); + + // TODO reuse converter and rows, refer to TopK. + let converter = RowConverter::new(fields)?; + let rows = converter.convert_columns(&columns)?; + let mut sort: Vec<_> = rows.iter().enumerate().collect(); + sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); + + let mut len = rows.num_rows(); + if let Some(limit) = limit { + len = limit.min(len); + } + let indices = + UInt32Array::from_iter_values(sort.iter().take(len).map(|(i, _)| *i as u32)); + + Ok(indices) +} + + +/// we can only do this if the T is primitive +#[inline] +fn sort_unstable_by(array: &mut [T], limit: usize, cmp: F) +where + F: FnMut(&T, &T) -> Ordering, +{ + if array.len() == limit { + array.sort_unstable_by(cmp); + } else { + partial_sort(array, limit, cmp); + } } diff --git a/datafusion/common/src/types/field.rs b/datafusion/common/src/types/field.rs index 5a880ba10a41..0150d4d51ad7 100644 --- a/datafusion/common/src/types/field.rs +++ b/datafusion/common/src/types/field.rs @@ -29,6 +29,21 @@ pub struct LogicalField { pub nullable: bool, } +impl LogicalField { + /// Creates a new [LogicalField]. + pub fn new( + name: impl Into, + logical_type: LogicalTypeRef, + nullable: bool, + ) -> Self { + Self { + name: name.into(), + logical_type, + nullable, + } + } +} + impl PartialEq for LogicalField { fn eq(&self, other: &Self) -> bool { self.name == other.name diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index 85d20ccd18bf..ba261fdfb9e0 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -18,7 +18,7 @@ use super::NativeType; use crate::error::Result; use crate::ScalarValue; -use arrow::array::ArrayRef; +use arrow::array::{make_comparator, Array, ArrayRef, DynComparator, UInt32Array}; use arrow::compute::SortOptions; use arrow::datatypes::DataType; use core::fmt; @@ -169,12 +169,45 @@ pub enum SortOrdering { } impl SortOrdering { - pub fn partial_cmp(&self, lhs: &ScalarValue, rhs: &ScalarValue) -> Option { + pub fn compare_scalars( + &self, + lhs: &ScalarValue, + rhs: &ScalarValue, + ) -> Option { match self { SortOrdering::Default => lhs.partial_cmp(rhs), - SortOrdering::Custom(_) => todo!("custom order"), + SortOrdering::Custom(c) => c.compare_scalars(lhs, rhs), } } + + pub fn sort_to_indices( + &self, + array: &dyn Array, + options: SortOptions, + fetch: Option, + ) -> Result { + match self { + SortOrdering::Default => Ok(arrow::compute::sort_to_indices( + array, + Some(options), + fetch, + )?), + SortOrdering::Custom(c) => c.sort_to_indices(array, options, fetch), + } + } + + pub fn dyn_comparator( + &self, + array: ArrayRef, + options: SortOptions, + ) -> Result { + Ok(match self { + SortOrdering::Default => { + make_comparator(array.as_ref(), array.as_ref(), options)? + } + SortOrdering::Custom(c) => c.dyn_comparator(array, options)?, + }) + } } impl PartialEq for SortOrdering { @@ -209,6 +242,20 @@ pub trait CustomOrdering: Debug + Send + Sync { fn ordering_id(&self) -> &str; /// TODO - fn execute(&self, array_ref: ArrayRef, sort_options: SortOptions) - -> Result; + fn compare_scalars(&self, lhs: &ScalarValue, rhs: &ScalarValue) -> Option; + + /// TODO + fn sort_to_indices( + &self, + array: &dyn Array, + options: SortOptions, + fetch: Option, + ) -> Result; + + /// TODO + fn dyn_comparator( + &self, + array: ArrayRef, + options: SortOptions, + ) -> Result; } diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 1dc18f0dfbd0..27bd9797d019 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -72,7 +72,7 @@ fn create_ordering( let ordering = schema .field_with_name(&col.name)? .extension_type_name() - .and_then(|ext| extension_types.get(ext).ok()) + .and_then(|ext| extension_types.get_extension_type(ext).ok()) .map(|ext| ext.planning_information().ordering.clone()) .unwrap_or_default(); sort_exprs.push(PhysicalSortExpr { diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index c27d1e4fd46b..6c171a541a30 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -75,9 +75,11 @@ use chrono::{DateTime, Utc}; use datafusion_catalog::{ DynamicFileCatalog, SessionStore, TableFunction, TableFunctionImpl, UrlTableFactory, }; +use datafusion_common::types::LogicalTypeRef; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; +use datafusion_expr::registry::ExtensionTypeRegistry; use datafusion_optimizer::{AnalyzerRule, OptimizerRule}; use object_store::ObjectStore; use parking_lot::RwLock; @@ -1641,6 +1643,26 @@ impl FunctionRegistry for SessionContext { } } +impl ExtensionTypeRegistry for SessionContext { + fn get_extension_type(&self, name: &str) -> Result { + self.state.read().get_extension_type(name) + } + + fn register_extension_type( + &mut self, + logical_type: LogicalTypeRef, + ) -> Result> { + self.state.write().register_extension_type(logical_type) + } + + fn deregister_extension_type( + &mut self, + name: &str, + ) -> Result> { + self.state.write().deregister_extension_type(name) + } +} + /// Create a new task context instance from SessionContext impl From<&SessionContext> for TaskContext { fn from(session: &SessionContext) -> Self { diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 30fffc197a9e..e95b2d935b30 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -1907,22 +1907,22 @@ impl FunctionRegistry for SessionState { } impl ExtensionTypeRegistry for SessionState { - fn get(&self, name: &str) -> datafusion_common::Result { - self.extension_types.get(name) + fn get_extension_type(&self, name: &str) -> datafusion_common::Result { + self.extension_types.get_extension_type(name) } - fn register_type( + fn register_extension_type( &mut self, logical_type: LogicalTypeRef, ) -> datafusion_common::Result> { - self.extension_types.register_type(logical_type) + self.extension_types.register_extension_type(logical_type) } - fn deregister_type( + fn deregister_extension_type( &mut self, name: &str, ) -> datafusion_common::Result> { - self.extension_types.deregister_type(name) + self.extension_types.deregister_extension_type(name) } } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 03fe2bfc9c37..45c53a42b8f1 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1685,7 +1685,7 @@ pub fn create_physical_sort_expr( Expr::Column(name) => input_dfschema .field_from_column(name)? .extension_type_name() - .and_then(|ext| extension_types.get(ext).ok()) + .and_then(|ext| extension_types.get_extension_type(ext).ok()) .map(|ext| ext.planning_information().ordering.clone()) .unwrap_or_default(), _ => SortOrdering::Default, diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index b471a2e11281..85e9fcbc85af 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -18,13 +18,9 @@ // Include tests in dataframe_functions mod dataframe_functions; mod describe; +mod test_types; -use arrow::array::{ - record_batch, Array, ArrayRef, BooleanArray, DictionaryArray, FixedSizeListArray, - FixedSizeListBuilder, Float32Array, Float64Array, Int32Array, Int32Builder, - Int8Array, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray, - StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray, UnionBuilder, -}; +use arrow::array::{record_batch, Array, ArrayRef, BooleanArray, DictionaryArray, FixedSizeListArray, FixedSizeListBuilder, Float32Array, Float64Array, Int32Array, Int32Builder, Int8Array, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray, StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray, UnionBuilder}; use arrow::buffer::ScalarBuffer; use arrow::datatypes::{ DataType, Field, Float32Type, Float64Type, Int32Type, Schema, SchemaRef, UInt64Type, @@ -32,6 +28,7 @@ use arrow::datatypes::{ }; use arrow::error::ArrowError; use arrow::util::pretty::pretty_format_batches; +use arrow_schema::extension::EXTENSION_TYPE_NAME_KEY; use datafusion_expr::utils::COUNT_STAR_EXPANSION; use datafusion_functions_aggregate::count::{count_all, count_udaf}; use datafusion_functions_aggregate::expr_fn::{ @@ -48,6 +45,7 @@ use std::sync::Arc; use tempfile::TempDir; use url::Url; +use crate::dataframe::test_types::IntOrFloatType; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::datasource::MemTable; use datafusion::error::Result; @@ -71,6 +69,7 @@ use datafusion_common_runtime::SpawnedTask; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_expr::expr::{GroupingSet, Sort, WindowFunction}; +use datafusion_expr::registry::ExtensionTypeRegistry; use datafusion_expr::var_provider::{VarProvider, VarType}; use datafusion_expr::{ cast, col, create_udf, exists, in_subquery, lit, out_ref_col, placeholder, @@ -2855,11 +2854,15 @@ async fn sort_on_union_with_logical_type() -> Result<()> { ] .into_iter() .collect(); - let schema = Schema::new(vec![Field::new( - "my_union", - DataType::Union(fields, UnionMode::Dense), - false, - )]); + + let my_extension_type = Arc::new(IntOrFloatType::new()); + let union_type = DataType::Union(fields, UnionMode::Dense); + let field = + Field::new("my_union", union_type, false).with_metadata(HashMap::from([( + EXTENSION_TYPE_NAME_KEY.into(), + IntOrFloatType::name().into(), + )])); + let schema = Schema::new(vec![field]); let mut builder = UnionBuilder::new_dense(); builder.append::("A", 1)?; @@ -2868,7 +2871,8 @@ async fn sort_on_union_with_logical_type() -> Result<()> { builder.append::("B", 3.0)?; let union = builder.build()?; - let ctx = SessionContext::new(); + let mut ctx = SessionContext::new(); + ctx.register_extension_type(my_extension_type)?; ctx.register_table( "test_table", Arc::new(MemTable::try_new( diff --git a/datafusion/core/tests/dataframe/test_types.rs b/datafusion/core/tests/dataframe/test_types.rs new file mode 100644 index 000000000000..512a3fefc472 --- /dev/null +++ b/datafusion/core/tests/dataframe/test_types.rs @@ -0,0 +1,94 @@ +use arrow::array::{Array, ArrayRef, DynComparator, UInt32Array}; +use arrow_schema::{DataType, SortOptions}; +use datafusion_common::types::{ + logical_float64, logical_int32, CustomOrdering, LogicalField, LogicalType, + LogicalTypePlanningInformation, NativeType, SortOrdering, TypeSignature, +}; +use datafusion_common::ScalarValue; +use std::cmp::Ordering; +use std::sync::Arc; + +/// Represents a type that is either an integer or a float. +pub struct IntOrFloatType { + native_type: NativeType, +} + +impl IntOrFloatType { + pub fn name() -> &'static str { + "int_or_float" + } + + pub fn new() -> IntOrFloatType { + let fields = [ + ( + 0, + Arc::new(LogicalField::new("integer", logical_int32(), false)), + ), + ( + 1, + Arc::new(LogicalField::new("float", logical_float64(), false)), + ), + ] + .into_iter() + .collect(); + Self { + native_type: NativeType::Union(fields), + } + } +} + +impl LogicalType for IntOrFloatType { + fn native(&self) -> &NativeType { + &self.native_type + } + + fn signature(&self) -> TypeSignature<'_> { + TypeSignature::Extension { + name: Self::name(), + parameters: &[], + } + } + + fn default_cast_for(&self, _origin: &DataType) -> datafusion_common::Result { + unimplemented!() + } + + fn planning_information(&self) -> LogicalTypePlanningInformation { + LogicalTypePlanningInformation { + ordering: SortOrdering::Custom(Arc::new(IntOrFloatTypeOrdering {})), + } + } +} + +/// The order of the IntOrFloat is defined as follows: +/// - All integers followed by all floats +/// - Within one subtype, the integers and floats are sorted using their natural order. +#[derive(Debug)] +struct IntOrFloatTypeOrdering {} + +impl CustomOrdering for IntOrFloatTypeOrdering { + fn ordering_id(&self) -> &str { + "order_int_or_float" + } + + fn compare_scalars(&self, _lhs: &ScalarValue, _rhs: &ScalarValue) -> Option { + unimplemented!("TODO") + } + + fn sort_to_indices( + &self, + _array: &dyn Array, + _options: SortOptions, + _fetch: Option, + ) -> datafusion_common::Result { + unimplemented!("TODO") + } + + fn dyn_comparator( + &self, + _array: ArrayRef, + _options: SortOptions, + ) -> datafusion_common::Result { + unimplemented!("TODO") + } +} diff --git a/datafusion/execution/src/task.rs b/datafusion/execution/src/task.rs index dac838fa0acd..199dbe5e940c 100644 --- a/datafusion/execution/src/task.rs +++ b/datafusion/execution/src/task.rs @@ -211,19 +211,19 @@ impl FunctionRegistry for TaskContext { } impl ExtensionTypeRegistry for TaskContext { - fn get(&self, name: &str) -> Result { - self.extension_types.get(name) + fn get_extension_type(&self, name: &str) -> Result { + self.extension_types.get_extension_type(name) } - fn register_type( + fn register_extension_type( &mut self, logical_type: LogicalTypeRef, ) -> Result> { - self.extension_types.register_type(logical_type) + self.extension_types.register_extension_type(logical_type) } - fn deregister_type(&mut self, name: &str) -> Result> { - self.extension_types.deregister_type(name) + fn deregister_extension_type(&mut self, name: &str) -> Result> { + self.extension_types.deregister_extension_type(name) } } diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index b31eaf997875..dabf54c2f2a9 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -207,12 +207,12 @@ impl FunctionRegistry for MemoryFunctionRegistry { pub trait ExtensionTypeRegistry { /// Returns a reference to the logical type named `name`. - fn get(&self, name: &str) -> Result; + fn get_extension_type(&self, name: &str) -> Result; /// Registers a new [LogicalTypeRef], returning any previously registered implementation. /// /// Returns an error if the type cannot be registered, for example if the registry is read only. - fn register_type( + fn register_extension_type( &mut self, logical_type: LogicalTypeRef, ) -> Result>; @@ -222,7 +222,7 @@ pub trait ExtensionTypeRegistry { /// /// Returns an error if the type cannot be deregistered, for example if the registry is read /// only. - fn deregister_type(&mut self, name: &str) -> Result>; + fn deregister_extension_type(&mut self, name: &str) -> Result>; } /// An [`ExtensionTypeRegistry`] that uses in memory [`HashMap`]s. @@ -240,14 +240,14 @@ impl MemoryExtensionTypeRegistry { } impl ExtensionTypeRegistry for MemoryExtensionTypeRegistry { - fn get(&self, name: &str) -> Result { + fn get_extension_type(&self, name: &str) -> Result { self.extension_types .get(name) .ok_or_else(|| plan_datafusion_err!("Extension type not found.")) .cloned() } - fn register_type( + fn register_extension_type( &mut self, logical_type: LogicalTypeRef, ) -> Result> { @@ -260,7 +260,7 @@ impl ExtensionTypeRegistry for MemoryExtensionTypeRegistry { Ok(self.extension_types.insert(signature.into(), logical_type)) } - fn deregister_type(&mut self, name: &str) -> Result> { + fn deregister_extension_type(&mut self, name: &str) -> Result> { Ok(self.extension_types.remove(name)) } } @@ -284,18 +284,18 @@ impl EmptyExtensionTypeRegistry { } impl ExtensionTypeRegistry for EmptyExtensionTypeRegistry { - fn get(&self, _name: &str) -> Result { + fn get_extension_type(&self, _name: &str) -> Result { plan_err!("Extension type not found.") } - fn register_type( + fn register_extension_type( &mut self, _logical_type: LogicalTypeRef, ) -> Result> { plan_err!("Cannot register type.") } - fn deregister_type(&mut self, _name: &str) -> Result> { + fn deregister_extension_type(&mut self, _name: &str) -> Result> { plan_err!("Cannot deregister type.") } } diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 41b39226c9ba..c98861d878fb 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -44,12 +44,9 @@ use crate::{ Statistics, }; -use arrow::array::{ - Array, RecordBatch, RecordBatchOptions, StringViewArray, UInt32Array, -}; -use arrow::compute::{concat_batches, lexsort_to_indices, take_arrays, SortColumn}; -use arrow::datatypes::{DataType, SchemaRef}; -use arrow::row::{RowConverter, SortField}; +use arrow::array::{Array, RecordBatch, RecordBatchOptions, StringViewArray}; +use arrow::compute::{concat_batches, take_arrays}; +use arrow::datatypes::SchemaRef; use datafusion_common::{internal_err, Result}; use datafusion_execution::disk_manager::RefCountedTempFile; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; @@ -58,6 +55,7 @@ use datafusion_execution::TaskContext; use datafusion_physical_expr::LexOrdering; use datafusion_physical_expr_common::sort_expr::LexRequirement; +use datafusion_common::sort::lexsort_to_indices; use futures::{StreamExt, TryStreamExt}; use log::{debug, trace}; @@ -740,17 +738,10 @@ pub fn sort_batch( ) -> Result { let sort_columns = expressions .iter() - .map(|expr| expr.evaluate_to_sort_column(batch)) + .map(|expr| expr.evaluate_to_adv_sort_column(batch)) .collect::>>()?; - let indices = if is_multi_column_with_lists(&sort_columns) { - // lex_sort_to_indices doesn't support List with more than one column - // https://github.com/apache/arrow-rs/issues/5454 - lexsort_to_indices_multi_columns(sort_columns, fetch)? - } else { - lexsort_to_indices(&sort_columns, fetch)? - }; - + let indices = lexsort_to_indices(sort_columns.as_slice(), fetch)?; let mut columns = take_arrays(batch.columns(), &indices, None)?; // The columns may be larger than the unsorted columns in `batch` especially for variable length @@ -769,48 +760,6 @@ pub fn sort_batch( )?) } -#[inline] -fn is_multi_column_with_lists(sort_columns: &[SortColumn]) -> bool { - sort_columns.iter().any(|c| { - matches!( - c.values.data_type(), - DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _) - ) - }) -} - -pub(crate) fn lexsort_to_indices_multi_columns( - sort_columns: Vec, - limit: Option, -) -> Result { - let (fields, columns) = sort_columns.into_iter().fold( - (vec![], vec![]), - |(mut fields, mut columns), sort_column| { - fields.push(SortField::new_with_options( - sort_column.values.data_type().clone(), - sort_column.options.unwrap_or_default(), - )); - columns.push(sort_column.values); - (fields, columns) - }, - ); - - // TODO reuse converter and rows, refer to TopK. - let converter = RowConverter::new(fields)?; - let rows = converter.convert_columns(&columns)?; - let mut sort: Vec<_> = rows.iter().enumerate().collect(); - sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); - - let mut len = rows.num_rows(); - if let Some(limit) = limit { - len = limit.min(len); - } - let indices = - UInt32Array::from_iter_values(sort.iter().take(len).map(|(i, _)| *i as u32)); - - Ok(indices) -} - /// Sort execution plan. /// /// Support sorting datasets that are larger than the memory allotted From ac26f3086bafbac28963cf56aeb124db643dfba3 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Wed, 5 Mar 2025 15:09:37 +0100 Subject: [PATCH 10/14] Reduce API surface area for custom orderings --- datafusion/common/src/sort.rs | 14 ----- datafusion/common/src/types/logical.rs | 41 +------------ datafusion/core/tests/dataframe/mod.rs | 23 ++++---- datafusion/core/tests/dataframe/test_types.rs | 59 +++++++++++++------ 4 files changed, 52 insertions(+), 85 deletions(-) diff --git a/datafusion/common/src/sort.rs b/datafusion/common/src/sort.rs index 8ac0b32a48a3..e4ba851c5512 100644 --- a/datafusion/common/src/sort.rs +++ b/datafusion/common/src/sort.rs @@ -163,20 +163,6 @@ pub fn lexsort_to_indices( }; } - if columns.len() == 1 { - // fallback to non-lexical sort - let column = &columns[0]; - let options = column - .options - .as_ref() - .expect("Otherwise fallback to arrow earlier"); - return options.ordering.sort_to_indices( - &column.values, - SortOptions::new(options.descending, options.nulls_first), - fetch, - ); - } - let row_count = columns[0].values.len(); if columns.iter().any(|item| item.values.len() != row_count) { return _exec_err!("lexical sort columns have different row counts"); diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index ba261fdfb9e0..4d8deb9bb3f9 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -17,8 +17,7 @@ use super::NativeType; use crate::error::Result; -use crate::ScalarValue; -use arrow::array::{make_comparator, Array, ArrayRef, DynComparator, UInt32Array}; +use arrow::array::{make_comparator, ArrayRef, DynComparator}; use arrow::compute::SortOptions; use arrow::datatypes::DataType; use core::fmt; @@ -169,33 +168,6 @@ pub enum SortOrdering { } impl SortOrdering { - pub fn compare_scalars( - &self, - lhs: &ScalarValue, - rhs: &ScalarValue, - ) -> Option { - match self { - SortOrdering::Default => lhs.partial_cmp(rhs), - SortOrdering::Custom(c) => c.compare_scalars(lhs, rhs), - } - } - - pub fn sort_to_indices( - &self, - array: &dyn Array, - options: SortOptions, - fetch: Option, - ) -> Result { - match self { - SortOrdering::Default => Ok(arrow::compute::sort_to_indices( - array, - Some(options), - fetch, - )?), - SortOrdering::Custom(c) => c.sort_to_indices(array, options, fetch), - } - } - pub fn dyn_comparator( &self, array: ArrayRef, @@ -241,17 +213,6 @@ pub trait CustomOrdering: Debug + Send + Sync { /// The ordering id is used to establish equality between instances of [CustomOrdering]. fn ordering_id(&self) -> &str; - /// TODO - fn compare_scalars(&self, lhs: &ScalarValue, rhs: &ScalarValue) -> Option; - - /// TODO - fn sort_to_indices( - &self, - array: &dyn Array, - options: SortOptions, - fetch: Option, - ) -> Result; - /// TODO fn dyn_comparator( &self, diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 85e9fcbc85af..b9c82f13cb74 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -22,10 +22,7 @@ mod test_types; use arrow::array::{record_batch, Array, ArrayRef, BooleanArray, DictionaryArray, FixedSizeListArray, FixedSizeListBuilder, Float32Array, Float64Array, Int32Array, Int32Builder, Int8Array, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray, StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray, UnionBuilder}; use arrow::buffer::ScalarBuffer; -use arrow::datatypes::{ - DataType, Field, Float32Type, Float64Type, Int32Type, Schema, SchemaRef, UInt64Type, - UnionFields, UnionMode, -}; +use arrow::datatypes::{DataType, Field, Float32Type, Float64Type, Int32Type, Int64Type, Schema, SchemaRef, UInt64Type, UnionFields, UnionMode}; use arrow::error::ArrowError; use arrow::util::pretty::pretty_format_batches; use arrow_schema::extension::EXTENSION_TYPE_NAME_KEY; @@ -2849,8 +2846,8 @@ async fn sort_on_ambiguous_column() -> Result<()> { #[tokio::test] async fn sort_on_union_with_logical_type() -> Result<()> { let fields = [ - (0, Arc::new(Field::new("A", DataType::Int32, false))), - (1, Arc::new(Field::new("B", DataType::Float64, false))), + (0, Arc::new(Field::new("integer", DataType::Int64, false))), + (1, Arc::new(Field::new("float", DataType::Float64, false))), ] .into_iter() .collect(); @@ -2862,13 +2859,13 @@ async fn sort_on_union_with_logical_type() -> Result<()> { EXTENSION_TYPE_NAME_KEY.into(), IntOrFloatType::name().into(), )])); - let schema = Schema::new(vec![field]); + let schema = Arc::new(Schema::new(vec![field])); let mut builder = UnionBuilder::new_dense(); - builder.append::("A", 1)?; - builder.append::("B", 3.0)?; - builder.append::("A", 1)?; - builder.append::("B", 3.0)?; + builder.append::("integer", 1)?; + builder.append::("integer", -1)?; + builder.append::("float", 3.0)?; + builder.append::("float", 6.0)?; let union = builder.build()?; let mut ctx = SessionContext::new(); @@ -2876,9 +2873,9 @@ async fn sort_on_union_with_logical_type() -> Result<()> { ctx.register_table( "test_table", Arc::new(MemTable::try_new( - Arc::new(schema.clone()), + schema.clone(), vec![vec![RecordBatch::try_new( - Arc::new(schema), + schema.clone(), vec![Arc::new(union)], )?]], )?), diff --git a/datafusion/core/tests/dataframe/test_types.rs b/datafusion/core/tests/dataframe/test_types.rs index 512a3fefc472..dce22cf4ff38 100644 --- a/datafusion/core/tests/dataframe/test_types.rs +++ b/datafusion/core/tests/dataframe/test_types.rs @@ -1,10 +1,11 @@ -use arrow::array::{Array, ArrayRef, DynComparator, UInt32Array}; +use arrow::array::{ArrayRef, AsArray, DynComparator, UnionArray}; +use arrow::datatypes::{Float64Type, Int64Type}; use arrow_schema::{DataType, SortOptions}; +use datafusion_common::cast::as_union_array; use datafusion_common::types::{ logical_float64, logical_int32, CustomOrdering, LogicalField, LogicalType, LogicalTypePlanningInformation, NativeType, SortOrdering, TypeSignature, }; -use datafusion_common::ScalarValue; use std::cmp::Ordering; use std::sync::Arc; @@ -49,7 +50,10 @@ impl LogicalType for IntOrFloatType { } } - fn default_cast_for(&self, _origin: &DataType) -> datafusion_common::Result { + fn default_cast_for( + &self, + _origin: &DataType, + ) -> datafusion_common::Result { unimplemented!() } @@ -71,24 +75,43 @@ impl CustomOrdering for IntOrFloatTypeOrdering { "order_int_or_float" } - fn compare_scalars(&self, _lhs: &ScalarValue, _rhs: &ScalarValue) -> Option { - unimplemented!("TODO") + fn dyn_comparator( + &self, + array: ArrayRef, + options: SortOptions, + ) -> datafusion_common::Result { + // TODO check data type + + Ok(Box::new(move |lhs, rhs| { + let array = as_union_array(array.as_ref()).expect("should be union"); + let result = compare_impl(array, lhs, rhs); + match options.descending { + true => result.reverse(), + false => result, + } + })) } +} - fn sort_to_indices( - &self, - _array: &dyn Array, - _options: SortOptions, - _fetch: Option, - ) -> datafusion_common::Result { - unimplemented!("TODO") +fn compare_impl(array: &UnionArray, lhs: usize, rhs: usize) -> Ordering { + let type_lhs = array.type_ids()[lhs]; + let type_rhs = array.type_ids()[rhs]; + + if type_lhs != type_rhs { + return type_lhs.cmp(&type_rhs); } - fn dyn_comparator( - &self, - _array: ArrayRef, - _options: SortOptions, - ) -> datafusion_common::Result { - unimplemented!("TODO") + let offset_lhs = array.value_offset(lhs); + let offset_rhs = array.value_offset(rhs); + match type_lhs { + 0 => { + let array = array.child(type_lhs).as_primitive::(); + array.value(offset_lhs).cmp(&array.value(offset_rhs)) + } + 1 => { + let array = array.child(type_lhs).as_primitive::(); + array.value(offset_lhs).total_cmp(&array.value(offset_rhs)) + } + _ => unreachable!("Union only has two variants"), } } From fed37f19a9932ca8f4ffbc919e4b2848ea5cf904 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Fri, 7 Mar 2025 12:52:36 +0100 Subject: [PATCH 11/14] Fix compiler errors after merging properties.rs refactoring --- .../src/equivalence/properties.rs | 0 .../src/equivalence/properties/mod.rs | 36 +++++++++++-------- .../src/equivalence/properties/union.rs | 3 +- 3 files changed, 24 insertions(+), 15 deletions(-) delete mode 100755 datafusion/physical-expr/src/equivalence/properties.rs diff --git a/datafusion/physical-expr/src/equivalence/properties.rs b/datafusion/physical-expr/src/equivalence/properties.rs deleted file mode 100755 index e69de29bb2d1..000000000000 diff --git a/datafusion/physical-expr/src/equivalence/properties/mod.rs b/datafusion/physical-expr/src/equivalence/properties/mod.rs index 080587c0e231..32e40d9d41eb 100644 --- a/datafusion/physical-expr/src/equivalence/properties/mod.rs +++ b/datafusion/physical-expr/src/equivalence/properties/mod.rs @@ -41,7 +41,6 @@ use crate::{ PhysicalSortExpr, PhysicalSortRequirement, }; -use arrow::compute::SortOptions; use arrow::datatypes::SchemaRef; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::{plan_err, Constraint, Constraints, HashMap, Result}; @@ -49,6 +48,7 @@ use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion_physical_expr_common::utils::ExprPropertiesNode; +use datafusion_common::sort::AdvSortOptions; use indexmap::IndexSet; use itertools::Itertools; @@ -378,7 +378,7 @@ impl EquivalenceProperties { continue; } - let leading_ordering_options = ordering[0].options; + let leading_ordering_options = ordering[0].options.clone(); for equivalent_expr in &eq_class { let children = equivalent_expr.children(); @@ -398,7 +398,9 @@ impl EquivalenceProperties { break; } child_properties.push(ExprProperties { - sort_properties: SortProperties::Ordered(next.options), + sort_properties: SortProperties::Ordered( + next.options.clone(), + ), range: Interval::make_unbounded( &child.data_type(&self.schema)?, )?, @@ -415,9 +417,14 @@ impl EquivalenceProperties { if let Ok(expr_properties) = equivalent_expr.get_properties(&child_properties) { + let SortProperties::Ordered(expr_ordering_options) = + &expr_properties.sort_properties + else { + break; + }; + if expr_properties.preserves_lex_ordering - && SortProperties::Ordered(leading_ordering_options) - == expr_properties.sort_properties + && &leading_ordering_options == expr_ordering_options { // Assume existing ordering is [c ASC, a ASC, b ASC] // When equality c = f(a,b) is given, if we know that given ordering `[a ASC, b ASC]`, @@ -647,7 +654,8 @@ impl EquivalenceProperties { req.expr.eq(&existing.expr) && req .options - .is_none_or(|req_opts| req_opts == existing.options) + .as_ref() + .is_none_or(|req_opts| *req_opts == existing.options) }, ) }) @@ -741,14 +749,14 @@ impl EquivalenceProperties { .zip(rhs.inner.iter_mut()) .all(|(lhs, rhs)| { lhs.expr.eq(&rhs.expr) - && match (lhs.options, rhs.options) { + && match (&lhs.options, &rhs.options) { (Some(lhs_opt), Some(rhs_opt)) => lhs_opt == rhs_opt, (Some(options), None) => { - rhs.options = Some(options); + rhs.options = Some(options.clone()); true } (None, Some(options)) => { - lhs.options = Some(options); + lhs.options = Some(options.clone()); true } (None, None) => true, @@ -791,7 +799,7 @@ impl EquivalenceProperties { { res.push(PhysicalSortExpr { expr: Arc::clone(&r_expr), - options: sort_expr.options, + options: sort_expr.options.clone(), }); } } @@ -881,7 +889,7 @@ impl EquivalenceProperties { self.project_expr(&sort_expr.expr, mapping).map(|expr| { PhysicalSortExpr { expr, - options: sort_expr.options, + options: sort_expr.options.clone(), } }); let is_projected = target_sort_expr.is_some(); @@ -1157,7 +1165,7 @@ impl EquivalenceProperties { )), SortProperties::Singleton => { // Assign default ordering to constant expressions - let options = SortOptions::default(); + let options = AdvSortOptions::default(); Some(( PhysicalSortExpr { expr: Arc::clone(&exprs[idx]), @@ -1477,7 +1485,7 @@ fn update_properties( { node.data.sort_properties = SortProperties::Singleton; } else if let Some(options) = oeq_class.get_options(&normalized_expr) { - node.data.sort_properties = SortProperties::Ordered(options); + node.data.sort_properties = SortProperties::Ordered(options.clone()); } Ok(Transformed::yes(node)) } @@ -1554,7 +1562,7 @@ fn get_expr_properties( if let Some(column_order) = dependencies.iter().find(|&order| expr.eq(&order.expr)) { // If exact match is found, return its ordering. Ok(ExprProperties { - sort_properties: SortProperties::Ordered(column_order.options), + sort_properties: SortProperties::Ordered(column_order.options.clone()), range: Interval::make_unbounded(&expr.data_type(schema)?)?, preserves_lex_ordering: false, }) diff --git a/datafusion/physical-expr/src/equivalence/properties/union.rs b/datafusion/physical-expr/src/equivalence/properties/union.rs index 64ef9278e248..df0721fa51a5 100644 --- a/datafusion/physical-expr/src/equivalence/properties/union.rs +++ b/datafusion/physical-expr/src/equivalence/properties/union.rs @@ -313,7 +313,8 @@ fn advance_if_matches_constant( ) -> Option { let expr = iter.peek()?; let const_expr = constants.iter().find(|c| c.eq_expr(expr))?; - let found_expr = PhysicalSortExpr::new(Arc::clone(const_expr.expr()), expr.options); + let found_expr = + PhysicalSortExpr::new(Arc::clone(const_expr.expr()), expr.options.clone()); iter.next(); Some(found_expr) } From 940524ca0ac117dd8a9780891c6d37d885ab3c1c Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Fri, 7 Mar 2025 13:08:24 +0100 Subject: [PATCH 12/14] Custom sorting test passing --- datafusion/core/tests/dataframe/mod.rs | 45 +++++++++---------- datafusion/core/tests/dataframe/test_types.rs | 31 ++++++++++--- 2 files changed, 48 insertions(+), 28 deletions(-) diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 03318185f85e..ea64fa4a8513 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -20,9 +20,12 @@ mod dataframe_functions; mod describe; mod test_types; -use arrow::array::{record_batch, Array, ArrayRef, BooleanArray, DictionaryArray, FixedSizeListArray, FixedSizeListBuilder, Float32Array, Float64Array, Int32Array, Int32Builder, Int8Array, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray, StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray, UnionBuilder}; +use arrow::array::{as_union_array, record_batch, Array, ArrayRef, AsArray, BooleanArray, DictionaryArray, FixedSizeListArray, FixedSizeListBuilder, Float32Array, Float64Array, Int32Array, Int32Builder, Int8Array, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray, StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray, UnionBuilder}; use arrow::buffer::ScalarBuffer; -use arrow::datatypes::{DataType, Field, Float32Type, Float64Type, Int32Type, Int64Type, Schema, SchemaRef, UInt64Type, UnionFields, UnionMode}; +use arrow::datatypes::{ + DataType, Field, Float32Type, Float64Type, Int32Type, Int64Type, Schema, SchemaRef, + UInt64Type, UnionFields, UnionMode, +}; use arrow::error::ArrowError; use arrow::util::pretty::pretty_format_batches; use arrow_schema::extension::EXTENSION_TYPE_NAME_KEY; @@ -3079,29 +3082,22 @@ async fn sort_on_ambiguous_column() -> Result<()> { #[tokio::test] async fn sort_on_union_with_logical_type() -> Result<()> { - let fields = [ - (0, Arc::new(Field::new("integer", DataType::Int64, false))), - (1, Arc::new(Field::new("float", DataType::Float64, false))), - ] - .into_iter() - .collect(); - - let my_extension_type = Arc::new(IntOrFloatType::new()); - let union_type = DataType::Union(fields, UnionMode::Dense); - let field = - Field::new("my_union", union_type, false).with_metadata(HashMap::from([( - EXTENSION_TYPE_NAME_KEY.into(), - IntOrFloatType::name().into(), - )])); - let schema = Arc::new(Schema::new(vec![field])); - let mut builder = UnionBuilder::new_dense(); builder.append::("integer", 1)?; + builder.append::("float", 6.0)?; builder.append::("integer", -1)?; builder.append::("float", 3.0)?; - builder.append::("float", 6.0)?; let union = builder.build()?; + let my_extension_type = Arc::new(IntOrFloatType::new()); + let field = Field::new("my_union", union.data_type().clone(), false).with_metadata( + HashMap::from([( + EXTENSION_TYPE_NAME_KEY.into(), + IntOrFloatType::name().into(), + )]), + ); + let schema = Arc::new(Schema::new(vec![field])); + let mut ctx = SessionContext::new(); ctx.register_extension_type(my_extension_type)?; ctx.register_table( @@ -3115,17 +3111,20 @@ async fn sort_on_union_with_logical_type() -> Result<()> { )?), )?; - ctx.table("test_table") + let record_batch = ctx.table("test_table") .await? - .sort_by(vec![Expr::from(datafusion::common::Column::from( - "my_union", - ))])? + .sort_by(vec![col("my_union")])? .execute_stream() .await? .next() .await .unwrap()?; + let result = as_union_array(record_batch.column_by_name("my_union").unwrap()); + assert_eq!(result.type_ids(), &[0, 0, 1, 1]); + assert_eq!(result.child(0).as_primitive::().values(), &[-1, 1]); + assert_eq!(result.child(1).as_primitive::().values(), &[3.0, 6.0]); + Ok(()) } diff --git a/datafusion/core/tests/dataframe/test_types.rs b/datafusion/core/tests/dataframe/test_types.rs index dce22cf4ff38..1144c82b3136 100644 --- a/datafusion/core/tests/dataframe/test_types.rs +++ b/datafusion/core/tests/dataframe/test_types.rs @@ -1,4 +1,4 @@ -use arrow::array::{ArrayRef, AsArray, DynComparator, UnionArray}; +use arrow::array::{Array, ArrayRef, AsArray, DynComparator, UnionArray}; use arrow::datatypes::{Float64Type, Int64Type}; use arrow_schema::{DataType, SortOptions}; use datafusion_common::cast::as_union_array; @@ -84,15 +84,36 @@ impl CustomOrdering for IntOrFloatTypeOrdering { Ok(Box::new(move |lhs, rhs| { let array = as_union_array(array.as_ref()).expect("should be union"); - let result = compare_impl(array, lhs, rhs); - match options.descending { - true => result.reverse(), - false => result, + + match (array.is_null(lhs), array.is_null(rhs)) { + (true, true) => Ordering::Equal, + (true, false) => { + if options.nulls_first { + Ordering::Less + } else { + Ordering::Greater + } + } + (false, true) => { + if options.nulls_first { + Ordering::Greater + } else { + Ordering::Less + } + } + (false, false) => { + let result = compare_impl(array, lhs, rhs); + match options.descending { + true => result.reverse(), + false => result, + } + } } })) } } +/// Default comparison between two (`lhs` & `rhs`) non-null [IntOrFloat] elements. fn compare_impl(array: &UnionArray, lhs: usize, rhs: usize) -> Ordering { let type_lhs = array.type_ids()[lhs]; let type_rhs = array.type_ids()[rhs]; From 996d4fd21ccaeaf023d303ec9210116b68a598aa Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Sun, 9 Mar 2025 10:46:55 +0100 Subject: [PATCH 13/14] Fix compiler errors in tests --- datafusion/catalog-listing/src/helpers.rs | 5 + datafusion/common/src/sort.rs | 23 ++- .../core/src/datasource/listing/table.rs | 6 +- .../core/src/execution/session_state.rs | 5 +- datafusion/core/tests/dataframe/mod.rs | 21 ++- .../core/tests/fuzz_cases/aggregate_fuzz.rs | 5 +- .../tests/fuzz_cases/equivalence/ordering.rs | 10 +- .../fuzz_cases/equivalence/projection.rs | 6 +- .../fuzz_cases/equivalence/properties.rs | 2 +- .../tests/fuzz_cases/equivalence/utils.rs | 19 +-- .../core/tests/fuzz_cases/merge_fuzz.rs | 6 +- datafusion/core/tests/fuzz_cases/sort_fuzz.rs | 6 +- .../sort_preserving_repartition_fuzz.rs | 10 +- .../core/tests/fuzz_cases/window_fuzz.rs | 7 +- datafusion/datasource/src/file_scan_config.rs | 10 +- datafusion/datasource/src/memory.rs | 8 +- datafusion/execution/src/task.rs | 9 +- datafusion/expr/src/registry.rs | 13 +- datafusion/functions/src/math/log.rs | 33 +++-- datafusion/functions/src/math/monotonicity.rs | 136 ++++++++++++------ .../physical-expr/src/equivalence/mod.rs | 15 +- .../physical-expr/src/equivalence/ordering.rs | 7 +- .../src/equivalence/properties/dependency.rs | 117 +++++++-------- datafusion/physical-expr/src/utils/mod.rs | 2 +- .../physical-plan/src/aggregates/mod.rs | 37 ++--- .../src/aggregates/order/partial.rs | 3 +- .../src/joins/nested_loop_join.rs | 6 +- .../src/joins/stream_join_utils.rs | 16 +-- .../src/joins/symmetric_hash_join.rs | 61 ++++---- datafusion/physical-plan/src/joins/utils.rs | 34 ++--- .../physical-plan/src/repartition/mod.rs | 5 +- .../physical-plan/src/sorts/partial_sort.rs | 78 +++++----- datafusion/physical-plan/src/sorts/sort.rs | 29 ++-- .../src/sorts/sort_preserving_merge.rs | 25 ++-- datafusion/physical-plan/src/union.rs | 3 +- .../src/windows/bounded_window_agg_exec.rs | 14 +- datafusion/physical-plan/src/windows/mod.rs | 25 ++-- .../proto/src/physical_plan/from_proto.rs | 6 +- datafusion/proto/src/physical_plan/mod.rs | 9 +- .../proto/src/physical_plan/to_proto.rs | 2 +- 40 files changed, 510 insertions(+), 324 deletions(-) diff --git a/datafusion/catalog-listing/src/helpers.rs b/datafusion/catalog-listing/src/helpers.rs index 9ac8423042d3..fc048861b263 100644 --- a/datafusion/catalog-listing/src/helpers.rs +++ b/datafusion/catalog-listing/src/helpers.rs @@ -547,6 +547,7 @@ mod tests { use datafusion_expr::{ case, col, lit, AggregateUDF, Expr, LogicalPlan, ScalarUDF, WindowUDF, }; + use datafusion_expr::registry::MemoryExtensionTypeRegistry; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_plan::ExecutionPlan; @@ -1060,6 +1061,10 @@ mod tests { unimplemented!() } + fn extension_types(&self) -> &MemoryExtensionTypeRegistry { + unimplemented!() + } + fn runtime_env(&self) -> &Arc { unimplemented!() } diff --git a/datafusion/common/src/sort.rs b/datafusion/common/src/sort.rs index e4ba851c5512..6554b938667f 100644 --- a/datafusion/common/src/sort.rs +++ b/datafusion/common/src/sort.rs @@ -1,11 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use crate::error::{_exec_err, _internal_err}; use crate::types::SortOrdering; use crate::Result; use arrow::array::{ArrayRef, DynComparator, UInt32Array}; use arrow::compute::{partial_sort, SortColumn, SortOptions}; -use std::cmp::Ordering; use arrow::datatypes::DataType; use arrow::row::{RowConverter, SortField}; +use std::cmp::Ordering; /// TODO #[derive(Clone, Debug, Default, Hash, PartialEq, Eq)] @@ -175,7 +192,8 @@ pub fn lexsort_to_indices( len = limit.min(len); } - let compare_items = columns.iter() + let compare_items = columns + .iter() .map(|c| c.dyn_compartor()) .collect::>>()?; @@ -232,7 +250,6 @@ pub(crate) fn lexsort_to_indices_multi_columns( Ok(indices) } - /// we can only do this if the T is primitive #[inline] fn sort_unstable_by(array: &mut [T], limit: usize, cmp: F) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index cefd361c8ff6..1642ec5065ca 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -1199,7 +1199,6 @@ mod tests { test::{columns, object_store::register_test_store}, }; - use arrow::compute::SortOptions; use arrow::record_batch::RecordBatch; use datafusion_common::stats::Precision; use datafusion_common::{assert_contains, ScalarValue}; @@ -1211,6 +1210,8 @@ mod tests { use crate::test::object_store::{ensure_head_concurrency, make_test_store_and_state}; use tempfile::TempDir; use url::Url; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; #[tokio::test] async fn read_single_file() -> Result<()> { @@ -1319,7 +1320,8 @@ mod tests { Ok(vec![LexOrdering::new( vec![PhysicalSortExpr { expr: physical_col("string_col", &schema).unwrap(), - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::default(), descending: false, nulls_first: false, }, diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index f072aee3eb9d..41195c6af446 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -1864,7 +1864,10 @@ impl FunctionRegistry for SessionState { } impl ExtensionTypeRegistry for SessionState { - fn get_extension_type(&self, name: &str) -> datafusion_common::Result { + fn get_extension_type( + &self, + name: &str, + ) -> datafusion_common::Result { self.extension_types.get_extension_type(name) } diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index ea64fa4a8513..9860da03e222 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -20,7 +20,13 @@ mod dataframe_functions; mod describe; mod test_types; -use arrow::array::{as_union_array, record_batch, Array, ArrayRef, AsArray, BooleanArray, DictionaryArray, FixedSizeListArray, FixedSizeListBuilder, Float32Array, Float64Array, Int32Array, Int32Builder, Int8Array, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray, StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray, UnionBuilder}; +use arrow::array::{ + as_union_array, record_batch, Array, ArrayRef, AsArray, BooleanArray, + DictionaryArray, FixedSizeListArray, FixedSizeListBuilder, Float32Array, + Float64Array, Int32Array, Int32Builder, Int8Array, LargeListArray, ListArray, + ListBuilder, RecordBatch, StringArray, StringBuilder, StructBuilder, UInt32Array, + UInt32Builder, UnionArray, UnionBuilder, +}; use arrow::buffer::ScalarBuffer; use arrow::datatypes::{ DataType, Field, Float32Type, Float64Type, Int32Type, Int64Type, Schema, SchemaRef, @@ -3111,7 +3117,8 @@ async fn sort_on_union_with_logical_type() -> Result<()> { )?), )?; - let record_batch = ctx.table("test_table") + let record_batch = ctx + .table("test_table") .await? .sort_by(vec![col("my_union")])? .execute_stream() @@ -3122,8 +3129,14 @@ async fn sort_on_union_with_logical_type() -> Result<()> { let result = as_union_array(record_batch.column_by_name("my_union").unwrap()); assert_eq!(result.type_ids(), &[0, 0, 1, 1]); - assert_eq!(result.child(0).as_primitive::().values(), &[-1, 1]); - assert_eq!(result.child(1).as_primitive::().values(), &[3.0, 6.0]); + assert_eq!( + result.child(0).as_primitive::().values(), + &[-1, 1] + ); + assert_eq!( + result.child(1).as_primitive::().values(), + &[3.0, 6.0] + ); Ok(()) } diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index 1025a49ea1e3..8a1d188be163 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -23,7 +23,7 @@ use crate::fuzz_cases::aggregation_fuzzer::{ }; use arrow::array::{types::Int64Type, Array, ArrayRef, AsArray, Int64Array, RecordBatch}; -use arrow::compute::{concat_batches, SortOptions}; +use arrow::compute::{concat_batches}; use arrow::datatypes::{ DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, @@ -51,6 +51,7 @@ use test_utils::{add_empty_batches, StringBatchGenerator}; use rand::rngs::StdRng; use rand::{thread_rng, Rng, SeedableRng}; use tokio::task::JoinSet; +use datafusion_common::sort::AdvSortOptions; // ======================================================================== // The new aggregation fuzz tests based on [`AggregationFuzzer`] @@ -315,7 +316,7 @@ async fn run_aggregate_test(input1: Vec, group_by_columns: Vec<&str for ordering_col in ["a", "b", "c"] { sort_keys.push(PhysicalSortExpr { expr: col(ordering_col, &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }) } diff --git a/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs b/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs index cc38d1674176..8a46a4371080 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs @@ -37,7 +37,8 @@ fn test_ordering_satisfy_with_equivalence_random() -> Result<()> { const N_RANDOM_SCHEMA: usize = 5; const N_ELEMENTS: usize = 125; const N_DISTINCT: usize = 5; - const SORT_OPTIONS: SortOptions = SortOptions { + const SORT_OPTIONS: AdvSortOptions = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; @@ -63,7 +64,7 @@ fn test_ordering_satisfy_with_equivalence_random() -> Result<()> { .into_iter() .map(|expr| PhysicalSortExpr { expr: Arc::clone(expr), - options: SORT_OPTIONS, + options: SORT_OPTIONS.clone(), }) .collect::(); let expected = is_table_same_after_sort( @@ -94,7 +95,8 @@ fn test_ordering_satisfy_with_equivalence_complex_random() -> Result<()> { const N_RANDOM_SCHEMA: usize = 100; const N_ELEMENTS: usize = 125; const N_DISTINCT: usize = 5; - const SORT_OPTIONS: SortOptions = SortOptions { + const SORT_OPTIONS: AdvSortOptions = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; @@ -135,7 +137,7 @@ fn test_ordering_satisfy_with_equivalence_complex_random() -> Result<()> { .into_iter() .map(|expr| PhysicalSortExpr { expr: Arc::clone(expr), - options: SORT_OPTIONS, + options: SORT_OPTIONS.clone(), }) .collect::(); let expected = is_table_same_after_sort( diff --git a/datafusion/core/tests/fuzz_cases/equivalence/projection.rs b/datafusion/core/tests/fuzz_cases/equivalence/projection.rs index a3fa1157b38f..680f89198e96 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/projection.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/projection.rs @@ -19,7 +19,6 @@ use crate::fuzz_cases::equivalence::utils::{ apply_projection, create_random_schema, generate_table_for_eq_properties, is_table_same_after_sort, TestScalarUDF, }; -use arrow::compute::SortOptions; use datafusion_common::Result; use datafusion_expr::{Operator, ScalarUDF}; use datafusion_physical_expr::equivalence::ProjectionMapping; @@ -29,6 +28,8 @@ use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use itertools::Itertools; use std::sync::Arc; +use datafusion_common::sort::AdvSortOptions; +use datafusion_common::types::SortOrdering; #[test] fn project_orderings_random() -> Result<()> { @@ -108,7 +109,8 @@ fn ordering_satisfy_after_projection_random() -> Result<()> { const N_RANDOM_SCHEMA: usize = 20; const N_ELEMENTS: usize = 125; const N_DISTINCT: usize = 5; - const SORT_OPTIONS: SortOptions = SortOptions { + const SORT_OPTIONS: AdvSortOptions = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; diff --git a/datafusion/core/tests/fuzz_cases/equivalence/properties.rs b/datafusion/core/tests/fuzz_cases/equivalence/properties.rs index 593e1c6c2dca..60356ceb6d4d 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/properties.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/properties.rs @@ -75,7 +75,7 @@ fn test_find_longest_permutation_random() -> Result<()> { .zip(ordering.iter()) .map(|(&idx, sort_expr)| PhysicalSortExpr { expr: Arc::clone(&exprs[idx]), - options: sort_expr.options, + options: sort_expr.options.clone(), }) .collect::(); assert_eq!( diff --git a/datafusion/core/tests/fuzz_cases/equivalence/utils.rs b/datafusion/core/tests/fuzz_cases/equivalence/utils.rs index f89cfb8150f3..5ce2ec059a50 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/utils.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/utils.rs @@ -23,7 +23,7 @@ use std::cmp::Ordering; use std::sync::Arc; use arrow::array::{ArrayRef, Float32Array, Float64Array, RecordBatch, UInt32Array}; -use arrow::compute::SortOptions; +use arrow::compute::{SortColumn, SortOptions}; use arrow::compute::{lexsort_to_indices, take_record_batch}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::utils::{compare_rows, get_row_at_idx}; @@ -38,6 +38,8 @@ use datafusion_physical_expr_common::sort_expr::LexOrdering; use itertools::izip; use rand::prelude::*; +use datafusion_common::sort::AdvSortOptions; +use datafusion_common::types::SortOrdering; pub fn output_schema( mapping: &ProjectionMapping, @@ -108,7 +110,8 @@ pub fn create_random_schema(seed: u64) -> Result<(SchemaRef, EquivalenceProperti let mut rng = StdRng::seed_from_u64(seed); let mut remaining_exprs = col_exprs[0..4].to_vec(); // only a, b, c, d are sorted - let options_asc = SortOptions { + let options_asc = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; @@ -121,7 +124,7 @@ pub fn create_random_schema(seed: u64) -> Result<(SchemaRef, EquivalenceProperti .drain(0..n_sort_expr) .map(|expr| PhysicalSortExpr { expr: Arc::clone(expr), - options: options_asc, + options: options_asc.clone(), }) .collect(); @@ -267,7 +270,7 @@ pub fn is_table_same_after_sort( let values = expr_result.into_array(new_batch.num_rows())?; Ok(SortColumn { values, - options: Some(order_expr.options), + options: Some(order_expr.options.to_arrow().unwrap()), }) }) .collect::>>()?; @@ -390,7 +393,7 @@ pub fn generate_table_for_eq_properties( .map( |PhysicalSortExpr { expr, - options: options, + options, }| { let col = expr.as_any().downcast_ref::().unwrap(); let (idx, _field) = schema.column_with_name(col.name()).unwrap(); @@ -398,7 +401,7 @@ pub fn generate_table_for_eq_properties( ( SortColumn { values: arr, - options: Some(*options), + options: Some(options.to_arrow().unwrap()), }, idx, ) @@ -507,7 +510,7 @@ pub fn convert_to_sort_exprs( .iter() .map(|(expr, options)| PhysicalSortExpr { expr: Arc::clone(*expr), - options: *options, + options: AdvSortOptions::with_default_ordering(*options), }) .collect() } @@ -585,7 +588,7 @@ impl ScalarUDFImpl for TestScalarUDF { } fn output_ordering(&self, input: &[ExprProperties]) -> Result { - Ok(input[0].sort_properties) + Ok(input[0].sort_properties.clone()) } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { diff --git a/datafusion/core/tests/fuzz_cases/merge_fuzz.rs b/datafusion/core/tests/fuzz_cases/merge_fuzz.rs index 92f375525066..6592f57adfa0 100644 --- a/datafusion/core/tests/fuzz_cases/merge_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/merge_fuzz.rs @@ -21,7 +21,6 @@ use std::sync::Arc; use arrow::{ array::{ArrayRef, Int32Array}, - compute::SortOptions, record_batch::RecordBatch, }; use datafusion::datasource::memory::MemorySourceConfig; @@ -31,6 +30,8 @@ use datafusion::physical_plan::{ sorts::sort_preserving_merge::SortPreservingMergeExec, }; use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_common::sort::AdvSortOptions; +use datafusion_common::types::SortOrdering; use datafusion_physical_expr_common::sort_expr::LexOrdering; use test_utils::{batches_to_vec, partitions_to_sorted_vec, stagger_batch_with_seed}; @@ -111,7 +112,8 @@ async fn run_merge_test(input: Vec>) { let sort = LexOrdering::new(vec![PhysicalSortExpr { expr: col("x", &schema).unwrap(), - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, diff --git a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs index 0b0f0aa2f105..d086ae36c63b 100644 --- a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs @@ -21,7 +21,6 @@ use std::sync::Arc; use arrow::{ array::{as_string_array, ArrayRef, Int32Array, StringArray}, - compute::SortOptions, record_batch::RecordBatch, }; use datafusion::datasource::memory::MemorySourceConfig; @@ -36,6 +35,8 @@ use datafusion_physical_expr::expressions::col; use datafusion_physical_expr_common::sort_expr::LexOrdering; use rand::Rng; +use datafusion_common::sort::AdvSortOptions; +use datafusion_common::types::SortOrdering; use test_utils::{batches_to_vec, partitions_to_sorted_vec}; const KB: usize = 1 << 10; @@ -237,7 +238,8 @@ impl SortTest { .iter() .map(|c| PhysicalSortExpr { expr: col(c, &schema).unwrap(), - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, diff --git a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs index b18fb3008b4c..0760c5fbc249 100644 --- a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs @@ -22,7 +22,6 @@ mod sp_repartition_fuzz_tests { use arrow::array::{ArrayRef, Int64Array, RecordBatch, UInt64Array}; use arrow::compute::{concat_batches, lexsort, SortColumn}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; - use arrow_schema::SortOptions; use datafusion::physical_plan::{ collect, metrics::{BaselineMetrics, ExecutionPlanMetricsSet}, @@ -49,6 +48,8 @@ mod sp_repartition_fuzz_tests { use datafusion_physical_expr_common::sort_expr::LexOrdering; use itertools::izip; use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; // Generate a schema which consists of 6 columns (a, b, c, d, e, f) fn create_test_schema() -> Result { @@ -88,7 +89,8 @@ mod sp_repartition_fuzz_tests { let mut rng = StdRng::seed_from_u64(seed); let mut remaining_exprs = col_exprs[0..4].to_vec(); // only a, b, c, d are sorted - let options_asc = SortOptions { + let options_asc = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; @@ -101,7 +103,7 @@ mod sp_repartition_fuzz_tests { .drain(0..n_sort_expr) .map(|expr| PhysicalSortExpr { expr: expr.clone(), - options: options_asc, + options: options_asc.clone(), }) .collect(); @@ -351,7 +353,7 @@ mod sp_repartition_fuzz_tests { for ordering_col in ["a", "b", "c"] { sort_keys.push(PhysicalSortExpr { expr: col(ordering_col, &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }) } diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs index a7f9e38c9ae3..c135c8df036e 100644 --- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs @@ -18,7 +18,7 @@ use std::sync::Arc; use arrow::array::{ArrayRef, Int32Array, StringArray}; -use arrow::compute::{concat_batches, SortOptions}; +use arrow::compute::{concat_batches}; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use arrow::util::pretty::pretty_format_batches; @@ -54,6 +54,7 @@ use datafusion_physical_expr_common::sort_expr::LexOrdering; use rand::distributions::Alphanumeric; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; +use datafusion_common::sort::AdvSortOptions; use test_utils::add_empty_batches; #[tokio::test(flavor = "multi_thread", worker_threads = 16)] @@ -598,7 +599,7 @@ async fn run_window_test( for column in &orderby_columns { orderby_exprs.push(PhysicalSortExpr { expr: col(column, &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }) } if orderby_exprs.len() > 1 && !window_frame.can_accept_multi_orderby() { @@ -612,7 +613,7 @@ async fn run_window_test( for partition_by_expr in &partitionby_exprs { sort_keys.push(PhysicalSortExpr { expr: partition_by_expr.clone(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }) } for order_by_expr in &orderby_exprs { diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index bfb4ce50e4b9..b3d3549179dd 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -1082,12 +1082,11 @@ mod tests { use crate::{test_util::MockSource, tests::aggr_test_schema}; use super::*; - use arrow::{ - array::{Int32Array, RecordBatch}, - compute::SortOptions, - }; + use arrow::array::{Int32Array, RecordBatch}; + use datafusion_common::sort::AdvSortOptions; use datafusion_common::stats::Precision; + use datafusion_common::types::SortOrdering; use datafusion_common::{assert_batches_eq, DFSchema}; use datafusion_expr::{execution_props::ExecutionProps, SortExpr}; use datafusion_physical_expr::create_physical_expr; @@ -1105,7 +1104,8 @@ mod tests { } = e; Ok(PhysicalSortExpr { expr: create_physical_expr(expr, input_dfschema, execution_props)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: !asc, nulls_first: *nulls_first, }, diff --git a/datafusion/datasource/src/memory.rs b/datafusion/datasource/src/memory.rs index 64fd56971b29..2f01010f245d 100644 --- a/datafusion/datasource/src/memory.rs +++ b/datafusion/datasource/src/memory.rs @@ -728,8 +728,8 @@ mod memory_source_tests { use crate::source::DataSourceExec; use datafusion_physical_plan::ExecutionPlan; - use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common::sort::AdvSortOptions; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::PhysicalSortExpr; use datafusion_physical_expr_common::sort_expr::LexOrdering; @@ -744,16 +744,16 @@ mod memory_source_tests { let sort1 = LexOrdering::new(vec![ PhysicalSortExpr { expr: col("a", &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }, PhysicalSortExpr { expr: col("b", &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }, ]); let sort2 = LexOrdering::new(vec![PhysicalSortExpr { expr: col("c", &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let mut expected_output_order = LexOrdering::default(); expected_output_order.extend(sort1.clone()); diff --git a/datafusion/execution/src/task.rs b/datafusion/execution/src/task.rs index 199dbe5e940c..46fcb0b98bb6 100644 --- a/datafusion/execution/src/task.rs +++ b/datafusion/execution/src/task.rs @@ -222,7 +222,10 @@ impl ExtensionTypeRegistry for TaskContext { self.extension_types.register_extension_type(logical_type) } - fn deregister_extension_type(&mut self, name: &str) -> Result> { + fn deregister_extension_type( + &mut self, + name: &str, + ) -> Result> { self.extension_types.deregister_extension_type(name) } } @@ -264,7 +267,7 @@ mod tests { HashMap::default(), HashMap::default(), HashMap::default(), - HashMap::default(), + MemoryExtensionTypeRegistry::new(), runtime, ); @@ -297,7 +300,7 @@ mod tests { HashMap::default(), HashMap::default(), HashMap::default(), - HashMap::default(), + MemoryExtensionTypeRegistry::new(), runtime, ); diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index dabf54c2f2a9..ca23b59b7189 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -222,7 +222,8 @@ pub trait ExtensionTypeRegistry { /// /// Returns an error if the type cannot be deregistered, for example if the registry is read /// only. - fn deregister_extension_type(&mut self, name: &str) -> Result>; + fn deregister_extension_type(&mut self, name: &str) + -> Result>; } /// An [`ExtensionTypeRegistry`] that uses in memory [`HashMap`]s. @@ -260,7 +261,10 @@ impl ExtensionTypeRegistry for MemoryExtensionTypeRegistry { Ok(self.extension_types.insert(signature.into(), logical_type)) } - fn deregister_extension_type(&mut self, name: &str) -> Result> { + fn deregister_extension_type( + &mut self, + name: &str, + ) -> Result> { Ok(self.extension_types.remove(name)) } } @@ -295,7 +299,10 @@ impl ExtensionTypeRegistry for EmptyExtensionTypeRegistry { plan_err!("Cannot register type.") } - fn deregister_extension_type(&mut self, _name: &str) -> Result> { + fn deregister_extension_type( + &mut self, + _name: &str, + ) -> Result> { plan_err!("Cannot deregister type.") } } diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs index e2f15d547f2a..e0d0d22cb188 100644 --- a/datafusion/functions/src/math/log.rs +++ b/datafusion/functions/src/math/log.rs @@ -259,9 +259,10 @@ mod tests { use super::*; use arrow::array::{Float32Array, Float64Array, Int64Array}; - use arrow::compute::SortOptions; use datafusion_common::cast::{as_float32_array, as_float64_array}; use datafusion_common::DFSchema; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::simplify::SimplifyContext; @@ -585,13 +586,15 @@ mod tests { let orders = vec![ ExprProperties::new_unknown(), ExprProperties::new_unknown().with_order(SortProperties::Ordered( - SortOptions { + AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, )), ExprProperties::new_unknown().with_order(SortProperties::Ordered( - SortOptions { + AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }, @@ -628,38 +631,44 @@ mod tests { // base: Ascending, num: Ascending SortProperties::Unordered, // base: Ascending, num: Descending - SortProperties::Ordered(SortOptions { + SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }), // base: Ascending, num: Literal - SortProperties::Ordered(SortOptions { + SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }), // base: Descending, num: Unordered SortProperties::Unordered, // base: Descending, num: Ascending - SortProperties::Ordered(SortOptions { + SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }), // base: Descending, num: Descending SortProperties::Unordered, // base: Descending, num: Literal - SortProperties::Ordered(SortOptions { + SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }), // base: Literal, num: Unordered SortProperties::Unordered, // base: Literal, num: Ascending - SortProperties::Ordered(SortOptions { + SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }), // base: Literal, num: Descending - SortProperties::Ordered(SortOptions { + SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }), @@ -670,13 +679,15 @@ mod tests { // Test with different `nulls_first` let base_order = ExprProperties::new_unknown().with_order( - SortProperties::Ordered(SortOptions { + SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }), ); let num_order = ExprProperties::new_unknown().with_order( - SortProperties::Ordered(SortOptions { + SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), diff --git a/datafusion/functions/src/math/monotonicity.rs b/datafusion/functions/src/math/monotonicity.rs index 6301b5c03056..fc360377efdc 100644 --- a/datafusion/functions/src/math/monotonicity.rs +++ b/datafusion/functions/src/math/monotonicity.rs @@ -567,9 +567,9 @@ pub fn get_tanh_doc() -> &'static Documentation { #[cfg(test)] mod tests { - use arrow::compute::SortOptions; use datafusion_common::Result; - + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; use super::*; #[derive(Debug)] @@ -602,11 +602,13 @@ mod tests { func: acos_order, lower: -0.5, upper: 0.5, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: false, })), @@ -616,7 +618,8 @@ mod tests { func: acos_order, lower: -2.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), @@ -627,11 +630,13 @@ mod tests { func: acosh_order, lower: 2.0, upper: 100.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, })), @@ -641,7 +646,8 @@ mod tests { func: acosh_order, lower: 0.5, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: false, }), @@ -652,11 +658,13 @@ mod tests { func: asin_order, lower: -0.5, upper: 0.5, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, })), @@ -666,7 +674,8 @@ mod tests { func: asin_order, lower: -2.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), @@ -677,11 +686,13 @@ mod tests { func: asinh_order, lower: -1.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, })), @@ -691,11 +702,13 @@ mod tests { func: asinh_order, lower: -2.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, })), @@ -705,11 +718,13 @@ mod tests { func: atan_order, lower: -1.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, })), @@ -719,11 +734,13 @@ mod tests { func: atan_order, lower: -2.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, })), @@ -733,11 +750,13 @@ mod tests { func: atanh_order, lower: -0.6, upper: 0.6, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, })), @@ -747,7 +766,8 @@ mod tests { func: atanh_order, lower: -2.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), @@ -758,11 +778,13 @@ mod tests { func: cbrt_order, lower: -1.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, })), @@ -772,11 +794,13 @@ mod tests { func: cbrt_order, lower: -2.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, })), @@ -786,11 +810,13 @@ mod tests { func: ceil_order, lower: -1.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, })), @@ -800,11 +826,13 @@ mod tests { func: ceil_order, lower: -2.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, })), @@ -814,7 +842,8 @@ mod tests { func: cos_order, lower: 0.0, upper: 2.0 * std::f64::consts::PI, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), @@ -825,7 +854,8 @@ mod tests { func: cos_order, lower: -2.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), @@ -836,11 +866,13 @@ mod tests { func: cosh_order, lower: 5.0, upper: 100.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, })), @@ -850,11 +882,13 @@ mod tests { func: cosh_order, lower: -100.0, upper: -5.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: false, })), @@ -864,7 +898,8 @@ mod tests { func: cosh_order, lower: -1.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), @@ -875,11 +910,13 @@ mod tests { func: degrees_order, lower: -1.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, })), @@ -889,11 +926,13 @@ mod tests { func: exp_order, lower: -1000.0, upper: 1000.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, })), @@ -903,11 +942,13 @@ mod tests { func: floor_order, lower: -1.0, upper: 1.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, })), @@ -917,11 +958,13 @@ mod tests { func: ln_order, lower: 1.0, upper: 2.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), - expected: Ok(SortProperties::Ordered(SortOptions { + expected: Ok(SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, })), @@ -931,7 +974,8 @@ mod tests { func: ln_order, lower: -5.0, upper: -4.0, - input_sort: SortProperties::Ordered(SortOptions { + input_sort: SortProperties::Ordered(AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }), diff --git a/datafusion/physical-expr/src/equivalence/mod.rs b/datafusion/physical-expr/src/equivalence/mod.rs index e94d2bad5712..51296de2bf39 100644 --- a/datafusion/physical-expr/src/equivalence/mod.rs +++ b/datafusion/physical-expr/src/equivalence/mod.rs @@ -67,13 +67,13 @@ pub fn add_offset_to_expr( #[cfg(test)] mod tests { - use super::*; use crate::expressions::col; use crate::PhysicalSortExpr; - use arrow::compute::SortOptions; + use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; + use datafusion_common::sort::AdvSortOptions; use datafusion_common::{plan_datafusion_err, Result}; use datafusion_physical_expr_common::sort_expr::{ LexOrdering, PhysicalSortRequirement, @@ -89,7 +89,7 @@ mod tests { let name = parts.next().expect("empty sort expression"); let mut sort_expr = PhysicalSortExpr::new( col(name, schema).expect("invalid column name"), - SortOptions::default(), + AdvSortOptions::default(), ); if let Some(options) = parts.next() { @@ -206,7 +206,10 @@ mod tests { in_data .iter() .map(|(expr, options)| { - PhysicalSortRequirement::new(Arc::clone(*expr), *options) + PhysicalSortRequirement::new( + Arc::clone(*expr), + options.map(|opt| AdvSortOptions::with_default_ordering(opt)), + ) }) .collect() } @@ -219,7 +222,7 @@ mod tests { .iter() .map(|(expr, options)| PhysicalSortExpr { expr: Arc::clone(*expr), - options: *options, + options: AdvSortOptions::with_default_ordering(*options), }) .collect() } @@ -243,7 +246,7 @@ mod tests { .iter() .map(|(expr, options)| PhysicalSortExpr { expr: Arc::clone(expr), - options: *options, + options: AdvSortOptions::with_default_ordering(*options), }) .collect(), ) diff --git a/datafusion/physical-expr/src/equivalence/ordering.rs b/datafusion/physical-expr/src/equivalence/ordering.rs index da978dec6df6..23cf3a9ab597 100644 --- a/datafusion/physical-expr/src/equivalence/ordering.rs +++ b/datafusion/physical-expr/src/equivalence/ordering.rs @@ -360,6 +360,7 @@ mod tests { use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema}; use datafusion_common::Result; + use datafusion_common::sort::AdvSortOptions; use datafusion_expr::{Operator, ScalarUDF}; use datafusion_physical_expr_common::sort_expr::LexOrdering; @@ -371,16 +372,16 @@ mod tests { ])); let crude = LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let finer = LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, PhysicalSortExpr { expr: Arc::new(Column::new("b", 1)), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, ]); // finer ordering satisfies, crude ordering should return true diff --git a/datafusion/physical-expr/src/equivalence/properties/dependency.rs b/datafusion/physical-expr/src/equivalence/properties/dependency.rs index 9eba295e562e..1ee9571cff2e 100644 --- a/datafusion/physical-expr/src/equivalence/properties/dependency.rs +++ b/datafusion/physical-expr/src/equivalence/properties/dependency.rs @@ -424,7 +424,6 @@ pub fn generate_dependency_orderings( #[cfg(test)] mod tests { - use std::ops::Not; use std::sync::Arc; use super::*; @@ -439,6 +438,8 @@ mod tests { use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use datafusion_common::{Constraint, Constraints, Result}; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; use datafusion_expr::sort_properties::SortProperties; use datafusion_expr::Operator; @@ -541,7 +542,7 @@ mod tests { #[test] fn test_normalize_ordering_equivalence_classes() -> Result<()> { - let sort_options = SortOptions::default(); + let sort_options = AdvSortOptions::default(); let schema = Schema::new(vec![ Field::new("a", DataType::Int32, true), @@ -557,11 +558,11 @@ mod tests { let others = vec![ LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::clone(&col_b_expr), - options: sort_options, + options: sort_options.clone(), }]), LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::clone(&col_c_expr), - options: sort_options, + options: sort_options.clone(), }]), ]; eq_properties.add_new_orderings(others); @@ -570,11 +571,11 @@ mod tests { expected_eqs.add_new_orderings([ LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::clone(&col_b_expr), - options: sort_options, + options: sort_options.clone(), }]), LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::clone(&col_c_expr), - options: sort_options, + options: sort_options.clone(), }]), ]); @@ -587,8 +588,8 @@ mod tests { #[test] fn test_get_indices_of_matching_sort_exprs_with_order_eq() -> Result<()> { - let sort_options = SortOptions::default(); - let sort_options_not = SortOptions::default().not(); + let sort_options = AdvSortOptions::default(); + let sort_options_not = AdvSortOptions::default().with_reversed_order(); let schema = Schema::new(vec![ Field::new("a", DataType::Int32, true), @@ -601,11 +602,11 @@ mod tests { eq_properties.add_new_orderings([LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("b", 1)), - options: sort_options_not, + options: sort_options_not.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options: sort_options, + options: sort_options.clone(), }, ])]); let (result, idxs) = eq_properties.find_longest_permutation(&required_columns); @@ -615,11 +616,11 @@ mod tests { LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::clone(col_b), - options: sort_options_not + options: sort_options_not.clone() }, PhysicalSortExpr { expr: Arc::clone(col_a), - options: sort_options + options: sort_options.clone() } ]) ); @@ -636,16 +637,16 @@ mod tests { eq_properties.add_new_orderings([ LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::new(Column::new("c", 2)), - options: sort_options, + options: sort_options.clone(), }]), LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("b", 1)), - options: sort_options_not, + options: sort_options_not.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options: sort_options, + options: sort_options.clone(), }, ]), ]); @@ -656,11 +657,11 @@ mod tests { LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::clone(col_b), - options: sort_options_not + options: sort_options_not.clone() }, PhysicalSortExpr { expr: Arc::clone(col_a), - options: sort_options + options: sort_options.clone() } ]) ); @@ -680,15 +681,15 @@ mod tests { eq_properties.add_new_orderings([LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("b", 1)), - options: sort_options_not, + options: sort_options_not.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("c", 2)), - options: sort_options, + options: sort_options.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options: sort_options, + options: sort_options.clone(), }, ])]); let (_, idxs) = eq_properties.find_longest_permutation(&required_columns); @@ -711,7 +712,8 @@ mod tests { let col_b = &col("b", &schema)?; let col_c = &col("c", &schema)?; let col_d = &col("d", &schema)?; - let option_asc = SortOptions { + let option_asc = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; @@ -721,11 +723,11 @@ mod tests { eq_properties.add_new_orderings(vec![ LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::clone(col_b), - options: option_asc, + options: option_asc.clone(), }]), LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::clone(col_d), - options: option_asc, + options: option_asc.clone(), }]), ]); @@ -737,12 +739,12 @@ mod tests { Operator::Plus, Arc::clone(col_b), )) as Arc, - SortProperties::Ordered(option_asc), + SortProperties::Ordered(option_asc.clone()), ), // b - (Arc::clone(col_b), SortProperties::Ordered(option_asc)), + (Arc::clone(col_b), SortProperties::Ordered(option_asc.clone())), // a - (Arc::clone(col_a), SortProperties::Ordered(option_asc)), + (Arc::clone(col_a), SortProperties::Ordered(option_asc.clone())), // a + c ( Arc::new(BinaryExpr::new( @@ -804,11 +806,11 @@ mod tests { eq_properties.add_new_orderings([LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::clone(col_d), - options: option_asc, + options: AdvSortOptions::with_default_ordering(option_asc), }, PhysicalSortExpr { expr: Arc::clone(col_h), - options: option_desc, + options: AdvSortOptions::with_default_ordering(option_desc), }, ])]); let test_cases = vec![ @@ -1098,7 +1100,8 @@ mod tests { .map(|c| { col(c, schema.as_ref()).map(|expr| PhysicalSortExpr { expr, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, @@ -1191,7 +1194,7 @@ mod tests { .map(|&name| { col(name, &schema).map(|col| PhysicalSortExpr { expr: col, - options: SortOptions::default(), + options: AdvSortOptions::default(), }) }) .collect::>()?; @@ -1362,11 +1365,11 @@ mod tests { let sort_exprs = LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::clone(&col_a), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, PhysicalSortExpr { expr: Arc::clone(&col_b), - options: SortOptions::default(), + options: AdvSortOptions::default(), }, ]); @@ -1390,8 +1393,9 @@ mod tests { let col_b = col("b", &schema)?; let col_c = col("c", &schema)?; - let asc = SortOptions::default(); - let desc = SortOptions { + let asc = AdvSortOptions::default(); + let desc = AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }; @@ -1400,22 +1404,22 @@ mod tests { eq_properties.add_new_orderings([LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::clone(&col_a), - options: asc, + options: asc.clone(), }, PhysicalSortExpr { expr: Arc::clone(&col_b), - options: desc, + options: desc.clone(), }, PhysicalSortExpr { expr: Arc::clone(&col_c), - options: asc, + options: asc.clone(), }, ])]); // New ordering: [a ASC] let new_order = LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::clone(&col_a), - options: asc, + options: asc.clone(), }]); let result = eq_properties.with_reorder(new_order); @@ -1446,24 +1450,24 @@ mod tests { // Make a and b equivalent eq_properties.add_equal_conditions(&col_a, &col_b)?; - let asc = SortOptions::default(); + let asc = AdvSortOptions::default(); // Initial ordering: [a ASC, c ASC] eq_properties.add_new_orderings([LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::clone(&col_a), - options: asc, + options: asc.clone(), }, PhysicalSortExpr { expr: Arc::clone(&col_c), - options: asc, + options: asc.clone(), }, ])]); // New ordering: [b ASC] let new_order = LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::clone(&col_b), - options: asc, + options: asc.clone(), }]); let result = eq_properties.with_reorder(new_order); @@ -1490,8 +1494,9 @@ mod tests { let col_a = col("a", &schema)?; let col_b = col("b", &schema)?; - let asc = SortOptions::default(); - let desc = SortOptions { + let asc = AdvSortOptions::default(); + let desc = AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }; @@ -1500,18 +1505,18 @@ mod tests { eq_properties.add_new_orderings([LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::clone(&col_a), - options: asc, + options: asc.clone(), }, PhysicalSortExpr { expr: Arc::clone(&col_b), - options: desc, + options: desc.clone(), }, ])]); // New ordering: [a DESC] let new_order = LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::clone(&col_a), - options: desc, + options: desc.clone(), }]); let result = eq_properties.with_reorder(new_order.clone()); @@ -1535,7 +1540,7 @@ mod tests { let col_d = col("d", &schema)?; let col_e = col("e", &schema)?; - let asc = SortOptions::default(); + let asc = AdvSortOptions::default(); // Constants: c is constant eq_properties = eq_properties.with_constants([ConstExpr::from(&col_c)]); @@ -1548,16 +1553,16 @@ mod tests { LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::clone(&col_d), - options: asc, + options: asc.clone(), }, PhysicalSortExpr { expr: Arc::clone(&col_a), - options: asc, + options: asc.clone(), }, ]), LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::clone(&col_e), - options: asc, + options: asc.clone(), }]), ]); @@ -1565,11 +1570,11 @@ mod tests { let new_order = LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::clone(&col_b), - options: asc, + options: asc.clone(), }, PhysicalSortExpr { expr: Arc::clone(&col_c), - options: asc, + options: asc.clone(), }, ]); @@ -1697,7 +1702,7 @@ mod tests { .iter() .map(|col_name| PhysicalSortExpr { expr: col(col_name, schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }) .collect(), ); @@ -1710,7 +1715,7 @@ mod tests { cols.iter() .map(|col_name| PhysicalSortExpr { expr: col(col_name, schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }) .collect(), ) @@ -1724,7 +1729,7 @@ mod tests { cols.iter() .map(|col_name| PhysicalSortExpr { expr: col(col_name, schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }) .collect(), ) diff --git a/datafusion/physical-expr/src/utils/mod.rs b/datafusion/physical-expr/src/utils/mod.rs index 7e4c7f0e10ba..a527c7fc6f96 100644 --- a/datafusion/physical-expr/src/utils/mod.rs +++ b/datafusion/physical-expr/src/utils/mod.rs @@ -308,7 +308,7 @@ pub(crate) mod tests { } fn output_ordering(&self, input: &[ExprProperties]) -> Result { - Ok(input[0].sort_properties) + Ok(input[0].sort_properties.clone()) } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 7d4837d04774..d6ecb079f4f8 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -1380,7 +1380,7 @@ mod tests { DictionaryArray, Float32Array, Float64Array, Int32Array, StructArray, UInt32Array, UInt64Array, }; - use arrow::compute::{concat_batches, SortOptions}; + use arrow::compute::concat_batches; use arrow::datatypes::{DataType, Int32Type}; use datafusion_common::{ assert_batches_eq, assert_batches_sorted_eq, internal_err, DataFusionError, @@ -1401,6 +1401,8 @@ mod tests { use datafusion_physical_expr::Partitioning; use datafusion_physical_expr::PhysicalSortExpr; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; use futures::{FutureExt, Stream}; // Generate a schema which consists of 5 columns (a, b, c, d, e) @@ -2152,7 +2154,7 @@ mod tests { // FIRST_VALUE(b ORDER BY b ) fn test_first_value_agg_expr( schema: &Schema, - sort_options: SortOptions, + sort_options: AdvSortOptions, ) -> Result> { let ordering_req = [PhysicalSortExpr { expr: col("b", schema)?, @@ -2171,7 +2173,7 @@ mod tests { // LAST_VALUE(b ORDER BY b ) fn test_last_value_agg_expr( schema: &Schema, - sort_options: SortOptions, + sort_options: AdvSortOptions, ) -> Result> { let ordering_req = [PhysicalSortExpr { expr: col("b", schema)?, @@ -2224,7 +2226,8 @@ mod tests { let groups = PhysicalGroupBy::new_single(vec![(col("a", &schema)?, "a".to_string())]); - let sort_options = SortOptions { + let sort_options = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; @@ -2301,7 +2304,8 @@ mod tests { // Assume column a and b are aliases // Assume also that a ASC and c DESC describe the same global ordering for the table. (Since they are ordering equivalent). - let options1 = SortOptions { + let options1 = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; @@ -2317,30 +2321,30 @@ mod tests { None, Some(vec![PhysicalSortExpr { expr: Arc::clone(col_a), - options: options1, + options: options1.clone(), }]), Some(vec![ PhysicalSortExpr { expr: Arc::clone(col_a), - options: options1, + options: options1.clone(), }, PhysicalSortExpr { expr: Arc::clone(col_b), - options: options1, + options: options1.clone(), }, PhysicalSortExpr { expr: Arc::clone(col_c), - options: options1, + options: options1.clone(), }, ]), Some(vec![ PhysicalSortExpr { expr: Arc::clone(col_a), - options: options1, + options: options1.clone(), }, PhysicalSortExpr { expr: Arc::clone(col_b), - options: options1, + options: options1.clone(), }, ]), ]; @@ -2348,11 +2352,11 @@ mod tests { let common_requirement = LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::clone(col_a), - options: options1, + options: options1.clone(), }, PhysicalSortExpr { expr: Arc::clone(col_c), - options: options1, + options: options1.clone(), }, ]); let mut aggr_exprs = order_by_exprs @@ -2388,15 +2392,16 @@ mod tests { ])); let col_a = col("a", &schema)?; - let option_desc = SortOptions { + let option_desc = AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }; let groups = PhysicalGroupBy::new_single(vec![(col_a, "a".to_string())]); let aggregates: Vec> = vec![ - test_first_value_agg_expr(&schema, option_desc)?, - test_last_value_agg_expr(&schema, option_desc)?, + test_first_value_agg_expr(&schema, option_desc.clone())?, + test_last_value_agg_expr(&schema, option_desc.clone())?, ]; let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1)); let aggregate_exec = Arc::new(AggregateExec::try_new( diff --git a/datafusion/physical-plan/src/aggregates/order/partial.rs b/datafusion/physical-plan/src/aggregates/order/partial.rs index aff69277a4ce..4c7caf26c3cf 100644 --- a/datafusion/physical-plan/src/aggregates/order/partial.rs +++ b/datafusion/physical-plan/src/aggregates/order/partial.rs @@ -278,6 +278,7 @@ impl GroupOrderingPartial { mod tests { use arrow::array::Int32Array; use arrow_schema::{DataType, Field}; + use datafusion_common::sort::AdvSortOptions; use datafusion_physical_expr::{expressions::col, PhysicalSortExpr}; use super::*; @@ -294,7 +295,7 @@ mod tests { let ordering = LexOrdering::new(vec![PhysicalSortExpr::new( col("a", &schema)?, - SortOptions::default(), + AdvSortOptions::default(), )]); let mut group_ordering = diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index f680de6738e5..d2562982b3a7 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -1040,7 +1040,6 @@ pub(crate) mod tests { }; use arrow::array::Int32Array; - use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field}; use datafusion_common::{assert_batches_sorted_eq, assert_contains, ScalarValue}; use datafusion_execution::runtime_env::RuntimeEnvBuilder; @@ -1050,6 +1049,8 @@ pub(crate) mod tests { use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use rstest::rstest; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; fn build_table( a: (&str, &Vec), @@ -1082,7 +1083,8 @@ pub(crate) mod tests { let index = schema.index_of(name).unwrap(); let sort_expr = PhysicalSortExpr { expr: Arc::new(Column::new(name, index)), - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }, diff --git a/datafusion/physical-plan/src/joins/stream_join_utils.rs b/datafusion/physical-plan/src/joins/stream_join_utils.rs index 677601a12845..34e6d69b5a3e 100644 --- a/datafusion/physical-plan/src/joins/stream_join_utils.rs +++ b/datafusion/physical-plan/src/joins/stream_join_utils.rs @@ -799,8 +799,8 @@ pub mod tests { use super::*; use crate::{joins::test_utils::complicated_filter, joins::utils::ColumnIndex}; - use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field}; + use datafusion_common::sort::AdvSortOptions; use datafusion_expr::Operator; use datafusion_physical_expr::expressions::{binary, cast, col}; @@ -811,7 +811,7 @@ pub mod tests { // Sorting information for the left side: let left_child_sort_expr = PhysicalSortExpr { expr: col("left_1", &left_child_schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }; let right_child_schema = Schema::new(vec![ @@ -826,7 +826,7 @@ pub mod tests { col("right_2", &right_child_schema)?, &right_child_schema, )?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }; let intermediate_schema = Schema::new(vec![ @@ -1002,7 +1002,7 @@ pub mod tests { &left_schema, &PhysicalSortExpr { expr: col("la1", left_schema.as_ref())?, - options: SortOptions::default(), + options: AdvSortOptions::default(), } )? .is_some()); @@ -1012,7 +1012,7 @@ pub mod tests { &left_schema, &PhysicalSortExpr { expr: col("lt1", left_schema.as_ref())?, - options: SortOptions::default(), + options: AdvSortOptions::default(), } )? .is_none()); @@ -1022,7 +1022,7 @@ pub mod tests { &right_schema, &PhysicalSortExpr { expr: col("ra1", right_schema.as_ref())?, - options: SortOptions::default(), + options: AdvSortOptions::default(), } )? .is_some()); @@ -1032,7 +1032,7 @@ pub mod tests { &right_schema, &PhysicalSortExpr { expr: col("rb1", right_schema.as_ref())?, - options: SortOptions::default(), + options: AdvSortOptions::default(), } )? .is_none()); @@ -1078,7 +1078,7 @@ pub mod tests { col("b", &schema)?, &schema, )?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }; let res = convert_sort_expr_with_filter_schema( diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index 84b0fad503c8..83fab5342fe9 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -1728,7 +1728,6 @@ mod tests { partitioned_sym_join_with_filter, split_record_batches, }; - use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; use datafusion_common::ScalarValue; use datafusion_execution::config::SessionConfig; @@ -1737,6 +1736,8 @@ mod tests { use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use rstest::*; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; const TABLE_SIZE: i32 = 30; @@ -1841,11 +1842,11 @@ mod tests { col("la2", left_schema)?, left_schema, )?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let right_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("ra1", right_schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let (left, right) = create_memory_table( left_partition, @@ -1916,11 +1917,11 @@ mod tests { let left_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("la1", left_schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let right_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("ra1", right_schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let (left, right) = create_memory_table( left_partition, @@ -2061,14 +2062,16 @@ mod tests { let right_schema = &right_partition[0].schema(); let left_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("la1_des", left_schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }, }]); let right_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("ra1_des", right_schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }, @@ -2120,14 +2123,16 @@ mod tests { let right_schema = &right_partition[0].schema(); let left_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("l_asc_null_first", left_schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, }]); let right_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("r_asc_null_first", right_schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, @@ -2179,14 +2184,16 @@ mod tests { let right_schema = &right_partition[0].schema(); let left_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("l_asc_null_last", left_schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }, }]); let right_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("r_asc_null_last", right_schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }, @@ -2240,14 +2247,16 @@ mod tests { let right_schema = &right_partition[0].schema(); let left_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("l_desc_null_first", left_schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }, }]); let right_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("r_desc_null_first", right_schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }, @@ -2302,12 +2311,12 @@ mod tests { let right_schema = &right_partition[0].schema(); let left_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("la1", left_schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let right_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("ra1", right_schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let (left, right) = create_memory_table( left_partition, @@ -2361,17 +2370,17 @@ mod tests { let left_sorted = vec![ LexOrdering::new(vec![PhysicalSortExpr { expr: col("la1", left_schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]), LexOrdering::new(vec![PhysicalSortExpr { expr: col("la2", left_schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]), ]; let right_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("ra1", right_schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let (left, right) = create_memory_table( @@ -2442,14 +2451,16 @@ mod tests { let on = vec![(col("lc1", left_schema)?, col("rc1", right_schema)?)]; let left_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("lt1", left_schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, }]); let right_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("rt1", right_schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, @@ -2525,14 +2536,16 @@ mod tests { let on = vec![(col("lc1", left_schema)?, col("rc1", right_schema)?)]; let left_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("li1", left_schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, }]); let right_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("ri1", right_schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, @@ -2601,11 +2614,11 @@ mod tests { let right_schema = &right_partition[0].schema(); let left_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("l_float", left_schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let right_sorted = LexOrdering::new(vec![PhysicalSortExpr { expr: col("r_float", right_schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let (left, right) = create_memory_table( left_partition, diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 380d2d832d9d..16631b0e9bcc 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -1827,13 +1827,13 @@ mod tests { use std::pin::Pin; use arrow::array::Int32Array; - use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Fields}; use arrow::error::{ArrowError, Result as ArrowResult}; use datafusion_common::stats::Precision::{Absent, Exact, Inexact}; use datafusion_common::{arrow_datafusion_err, arrow_err, ScalarValue}; use rstest::rstest; + use datafusion_common::sort::AdvSortOptions; fn check( left: &[Column], @@ -2642,29 +2642,29 @@ mod tests { #[test] fn test_calculate_join_output_ordering() -> Result<()> { - let options = SortOptions::default(); + let options = AdvSortOptions::default(); let left_ordering = LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options, + options: options.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("c", 2)), - options, + options: options.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("d", 3)), - options, + options: options.clone(), }, ]); let right_ordering = LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("z", 2)), - options, + options: options.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("y", 1)), - options, + options: options.clone(), }, ]); let join_type = JoinType::Inner; @@ -2680,45 +2680,45 @@ mod tests { Some(LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options, + options: options.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("c", 2)), - options, + options: options.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("d", 3)), - options, + options: options.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("z", 7)), - options, + options: options.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("y", 6)), - options, + options: options.clone(), }, ])), Some(LexOrdering::new(vec![ PhysicalSortExpr { expr: Arc::new(Column::new("z", 7)), - options, + options: options.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("y", 6)), - options, + options: options.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("a", 0)), - options, + options: options.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("c", 2)), - options, + options: options.clone(), }, PhysicalSortExpr { expr: Arc::new(Column::new("d", 3)), - options, + options: options.clone(), }, ])), ]; diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index e9a360c2ece3..c196fd6466a1 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -1599,9 +1599,8 @@ mod tests { #[cfg(test)] mod test { - use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema}; - + use datafusion_common::sort::AdvSortOptions; use super::*; use crate::test::TestMemoryExec; use crate::union::UnionExec; @@ -1701,7 +1700,7 @@ mod test { } fn sort_exprs(schema: &Schema) -> LexOrdering { - let options = SortOptions::default(); + let options = AdvSortOptions::default(); LexOrdering::new(vec![PhysicalSortExpr { expr: col("c0", schema).unwrap(), options, diff --git a/datafusion/physical-plan/src/sorts/partial_sort.rs b/datafusion/physical-plan/src/sorts/partial_sort.rs index bd0e6268de52..e097f52408a5 100644 --- a/datafusion/physical-plan/src/sorts/partial_sort.rs +++ b/datafusion/physical-plan/src/sorts/partial_sort.rs @@ -460,13 +460,13 @@ mod tests { use std::collections::HashMap; use arrow::array::*; - use arrow::compute::SortOptions; use arrow::datatypes::*; use futures::FutureExt; use itertools::Itertools; use datafusion_common::assert_batches_eq; - + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; use crate::collect; use crate::expressions::col; use crate::expressions::PhysicalSortExpr; @@ -491,7 +491,8 @@ mod tests { Field::new("b", DataType::Int32, false), Field::new("c", DataType::Int32, false), ]); - let option_asc = SortOptions { + let option_asc = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; @@ -500,15 +501,15 @@ mod tests { LexOrdering::new(vec![ PhysicalSortExpr { expr: col("a", &schema)?, - options: option_asc, + options: option_asc.clone(), }, PhysicalSortExpr { expr: col("b", &schema)?, - options: option_asc, + options: option_asc.clone(), }, PhysicalSortExpr { expr: col("c", &schema)?, - options: option_asc, + options: option_asc.clone(), }, ]), Arc::clone(&source), @@ -553,7 +554,8 @@ mod tests { Field::new("b", DataType::Int32, false), Field::new("c", DataType::Int32, false), ]); - let option_asc = SortOptions { + let option_asc = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; @@ -564,15 +566,15 @@ mod tests { LexOrdering::new(vec![ PhysicalSortExpr { expr: col("a", &schema)?, - options: option_asc, + options: option_asc.clone(), }, PhysicalSortExpr { expr: col("b", &schema)?, - options: option_asc, + options: option_asc.clone(), }, PhysicalSortExpr { expr: col("c", &schema)?, - options: option_asc, + options: option_asc.clone(), }, ]), Arc::clone(&source), @@ -625,7 +627,8 @@ mod tests { Field::new("b", DataType::Int32, false), Field::new("c", DataType::Int32, false), ]); - let option_asc = SortOptions { + let option_asc = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; @@ -636,15 +639,15 @@ mod tests { LexOrdering::new(vec![ PhysicalSortExpr { expr: col("a", &schema)?, - options: option_asc, + options: option_asc.clone(), }, PhysicalSortExpr { expr: col("b", &schema)?, - options: option_asc, + options: option_asc.clone(), }, PhysicalSortExpr { expr: col("c", &schema)?, - options: option_asc, + options: option_asc.clone(), }, ]), Arc::clone(source), @@ -712,11 +715,13 @@ mod tests { async fn test_partitioned_input_partial_sort() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); let mem_exec = prepare_partitioned_input(); - let option_asc = SortOptions { + let option_asc = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; - let option_desc = SortOptions { + let option_desc = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; @@ -725,15 +730,15 @@ mod tests { LexOrdering::new(vec![ PhysicalSortExpr { expr: col("a", &schema)?, - options: option_asc, + options: option_asc.clone(), }, PhysicalSortExpr { expr: col("b", &schema)?, - options: option_desc, + options: option_desc.clone(), }, PhysicalSortExpr { expr: col("c", &schema)?, - options: option_asc, + options: option_asc.clone(), }, ]), Arc::clone(&mem_exec), @@ -768,11 +773,13 @@ mod tests { let task_ctx = Arc::new(TaskContext::default()); let mem_exec = prepare_partitioned_input(); let schema = mem_exec.schema(); - let option_asc = SortOptions { + let option_asc = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; - let option_desc = SortOptions { + let option_desc = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; @@ -786,15 +793,15 @@ mod tests { LexOrdering::new(vec![ PhysicalSortExpr { expr: col("a", &schema)?, - options: option_asc, + options: option_asc.clone(), }, PhysicalSortExpr { expr: col("b", &schema)?, - options: option_desc, + options: option_desc.clone(), }, PhysicalSortExpr { expr: col("c", &schema)?, - options: option_asc, + options: option_asc.clone(), }, ]), Arc::clone(&mem_exec), @@ -832,7 +839,8 @@ mod tests { let task_ctx = Arc::new(TaskContext::default()); let mem_exec = prepare_partitioned_input(); let schema = mem_exec.schema(); - let option_asc = SortOptions { + let option_asc = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }; @@ -841,11 +849,11 @@ mod tests { LexOrdering::new(vec![ PhysicalSortExpr { expr: col("a", &schema)?, - options: option_asc, + options: option_asc.clone(), }, PhysicalSortExpr { expr: col("c", &schema)?, - options: option_asc, + options: option_asc.clone(), }, ]), Arc::clone(&mem_exec), @@ -890,7 +898,7 @@ mod tests { let partial_sort_exec = Arc::new(PartialSortExec::new( LexOrdering::new(vec![PhysicalSortExpr { expr: col("field_name", &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]), input, 1, @@ -930,11 +938,13 @@ mod tests { Field::new("b", DataType::Float64, true), Field::new("c", DataType::Float64, true), ])); - let option_asc = SortOptions { + let option_asc = AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }; - let option_desc = SortOptions { + let option_desc = AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }; @@ -980,15 +990,15 @@ mod tests { LexOrdering::new(vec![ PhysicalSortExpr { expr: col("a", &schema)?, - options: option_asc, + options: option_asc.clone(), }, PhysicalSortExpr { expr: col("b", &schema)?, - options: option_asc, + options: option_asc.clone(), }, PhysicalSortExpr { expr: col("c", &schema)?, - options: option_desc, + options: option_desc.clone(), }, ]), TestMemoryExec::try_new_exec(&[vec![batch]], schema, None)?, @@ -1056,7 +1066,7 @@ mod tests { let sort_exec = Arc::new(PartialSortExec::new( LexOrdering::new(vec![PhysicalSortExpr { expr: col("a", &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]), blocking_exec, 1, diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index ea3d9c2eeb66..ed5c9bdeecb8 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -1152,7 +1152,6 @@ mod tests { use crate::test::TestMemoryExec; use arrow::array::*; - use arrow::compute::SortOptions; use arrow::datatypes::*; use datafusion_common::cast::as_primitive_array; use datafusion_common::{assert_batches_eq, Result, ScalarValue}; @@ -1163,6 +1162,8 @@ mod tests { use datafusion_physical_expr::EquivalenceProperties; use futures::{FutureExt, Stream}; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; #[derive(Debug, Clone)] pub struct SortedUnboundedExec { @@ -1293,7 +1294,7 @@ mod tests { let sort_exec = Arc::new(SortExec::new( LexOrdering::new(vec![PhysicalSortExpr { expr: col("i", &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]), Arc::new(CoalescePartitionsExec::new(csv)), )); @@ -1339,7 +1340,7 @@ mod tests { let sort_exec = Arc::new(SortExec::new( LexOrdering::new(vec![PhysicalSortExpr { expr: col("i", &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]), Arc::new(CoalescePartitionsExec::new(input)), )); @@ -1407,7 +1408,7 @@ mod tests { let sort_exec = Arc::new(SortExec::new( LexOrdering::new(vec![PhysicalSortExpr { expr: col("i", &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]), Arc::new(CoalescePartitionsExec::new(input)), )); @@ -1500,7 +1501,7 @@ mod tests { SortExec::new( LexOrdering::new(vec![PhysicalSortExpr { expr: col("i", &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]), Arc::new(CoalescePartitionsExec::new(csv)), ) @@ -1549,7 +1550,7 @@ mod tests { let sort_exec = Arc::new(SortExec::new( LexOrdering::new(vec![PhysicalSortExpr { expr: col("field_name", &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]), input, )); @@ -1601,14 +1602,16 @@ mod tests { LexOrdering::new(vec![ PhysicalSortExpr { expr: col("a", &schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, }, PhysicalSortExpr { expr: col("b", &schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: false, }, @@ -1687,14 +1690,16 @@ mod tests { LexOrdering::new(vec![ PhysicalSortExpr { expr: col("a", &schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }, }, PhysicalSortExpr { expr: col("b", &schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }, @@ -1765,7 +1770,7 @@ mod tests { let sort_exec = Arc::new(SortExec::new( LexOrdering::new(vec![PhysicalSortExpr { expr: col("a", &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]), blocking_exec, )); @@ -1796,7 +1801,7 @@ mod tests { let expressions = LexOrdering::new(vec![PhysicalSortExpr { expr: Arc::new(Literal::new(ScalarValue::Int64(Some(1)))), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let result = sort_batch(&batch, expressions.as_ref(), None).unwrap(); diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index f05ee274d52f..a541a3ec0a7b 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -401,7 +401,6 @@ mod tests { ArrayRef, Int32Array, Int64Array, RecordBatch, StringArray, TimestampNanosecondArray, }; - use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::{assert_batches_eq, assert_contains, DataFusionError}; use datafusion_common_runtime::SpawnedTask; @@ -415,6 +414,8 @@ mod tests { use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr; use futures::{FutureExt, Stream, StreamExt}; use tokio::time::timeout; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; // The number in the function is highly related to the memory limit we are testing // any change of the constant should be aware of @@ -789,7 +790,8 @@ mod tests { let sort = LexOrdering::new(vec![PhysicalSortExpr { expr: col("i", &schema).unwrap(), - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: true, nulls_first: true, }, @@ -963,14 +965,16 @@ mod tests { let sort = LexOrdering::new(vec![ PhysicalSortExpr { expr: col("b", &schema).unwrap(), - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, }, PhysicalSortExpr { expr: col("c", &schema).unwrap(), - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }, @@ -1014,7 +1018,8 @@ mod tests { let sort = LexOrdering::new(vec![PhysicalSortExpr { expr: col("b", &schema).unwrap(), - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, @@ -1049,7 +1054,8 @@ mod tests { let sort = LexOrdering::new(vec![PhysicalSortExpr { expr: col("b", &schema).unwrap(), - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, @@ -1082,7 +1088,7 @@ mod tests { let schema = make_partition(11).schema(); let sort = LexOrdering::new(vec![PhysicalSortExpr { expr: col("i", &schema).unwrap(), - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let batches = @@ -1221,7 +1227,7 @@ mod tests { let sort_preserving_merge_exec = Arc::new(SortPreservingMergeExec::new( LexOrdering::new(vec![PhysicalSortExpr { expr: col("a", &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]), blocking_exec, )); @@ -1269,7 +1275,8 @@ mod tests { let sort = LexOrdering::new(vec![PhysicalSortExpr { expr: col("value", &schema).unwrap(), - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: true, }, diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index 791370917523..b5739b45c0c5 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -654,6 +654,7 @@ mod tests { use arrow::compute::SortOptions; use arrow::datatypes::DataType; use datafusion_common::ScalarValue; + use datafusion_common::sort::AdvSortOptions; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; @@ -680,7 +681,7 @@ mod tests { .iter() .map(|(expr, options)| PhysicalSortExpr { expr: Arc::clone(*expr), - options: *options, + options: AdvSortOptions::with_default_ordering(*options), }) .collect::() } diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index ce80d4ab0a04..9d0c07f26d67 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -1213,7 +1213,6 @@ mod tests { builder::{Int64Builder, UInt64Builder}, RecordBatch, }; - use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::{ assert_batches_eq, exec_datafusion_err, Result, ScalarValue, @@ -1236,6 +1235,8 @@ mod tests { use futures::{pin_mut, ready, FutureExt, Stream, StreamExt}; use itertools::Itertools; use tokio::time::timeout; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; #[derive(Debug, Clone)] struct TestStreamPartition { @@ -1334,7 +1335,7 @@ mod tests { let partitionby_exprs = vec![col(hash, &schema)?]; let orderby_exprs = LexOrdering::new(vec![PhysicalSortExpr { expr: col(order_by, &schema)?, - options: SortOptions::default(), + options: AdvSortOptions::default(), }]); let window_frame = WindowFrame::new_bounds( WindowFrameUnits::Range, @@ -1451,7 +1452,8 @@ mod tests { fn schema_orders(schema: &SchemaRef) -> Result> { let orderings = vec![LexOrdering::new(vec![PhysicalSortExpr { expr: col("sn", schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: false, nulls_first: false, }, @@ -1612,7 +1614,7 @@ mod tests { WindowFrameBound::Preceding(ScalarValue::UInt64(None)), WindowFrameBound::CurrentRow, )), - )) as _, + )?) as _, // NTH_VALUE(a, -1) Arc::new(StandardWindowExpr::try_new( nth_value_func1, @@ -1623,7 +1625,7 @@ mod tests { WindowFrameBound::Preceding(ScalarValue::UInt64(None)), WindowFrameBound::CurrentRow, )), - )) as _, + )?) as _, // NTH_VALUE(a, -2) Arc::new(StandardWindowExpr::try_new( nth_value_func2, @@ -1634,7 +1636,7 @@ mod tests { WindowFrameBound::Preceding(ScalarValue::UInt64(None)), WindowFrameBound::CurrentRow, )), - )) as _, + )?) as _, ]; let physical_plan = BoundedWindowAggExec::try_new( window_exprs, diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index 5c36d431d766..83f7726f97a6 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -717,7 +717,7 @@ mod tests { ) -> PhysicalSortExpr { PhysicalSortExpr { expr: col(name, schema).unwrap(), - options, + options: AdvSortOptions::with_default_ordering(options), } } @@ -777,7 +777,8 @@ mod tests { let mut orderbys = vec![]; for (col_name, descending, nulls_first) in ob_params { let expr = col(col_name, &schema)?; - let options = SortOptions { + let options = AdvSortOptions { + ordering: SortOrdering::Default, descending, nulls_first, }; @@ -786,7 +787,8 @@ mod tests { let mut expected: Option = None; for (col_name, reqs) in expected_params { - let options = reqs.map(|(descending, nulls_first)| SortOptions { + let options = reqs.map(|(descending, nulls_first)| AdvSortOptions { + ordering: SortOrdering::Default, descending, nulls_first, }); @@ -856,14 +858,16 @@ mod tests { { let physical_ordering = PhysicalSortExpr { expr: col("nullable_col", &schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: physical_desc, nulls_first: physical_nulls_first, }, }; let required_ordering = PhysicalSortExpr { expr: col("nullable_col", &schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: req_desc, nulls_first: req_nulls_first, }, @@ -896,14 +900,16 @@ mod tests { { let physical_ordering = PhysicalSortExpr { expr: col("non_nullable_col", &schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: physical_desc, nulls_first: physical_nulls_first, }, }; let required_ordering = PhysicalSortExpr { expr: col("non_nullable_col", &schema)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: req_desc, nulls_first: req_nulls_first, }, @@ -1019,7 +1025,7 @@ mod tests { let expr = col(col_name, &test_schema)?; // Give default ordering, this is same with input ordering direction // In this test we do check for reversibility. - let options = SortOptions::default(); + let options = AdvSortOptions::default(); order_by_exprs.push(PhysicalSortExpr { expr, options }); } let res = get_window_mode( @@ -1182,7 +1188,8 @@ mod tests { let mut order_by_exprs = LexOrdering::default(); for (col_name, descending, nulls_first) in order_by_params { let expr = col(col_name, &test_schema)?; - let options = SortOptions { + let options = AdvSortOptions { + ordering: SortOrdering::Default, descending: *descending, nulls_first: *nulls_first, }; diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 6331b7fb3114..4cdd9c8a77ac 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -19,7 +19,6 @@ use std::sync::Arc; -use arrow::compute::SortOptions; use chrono::{TimeZone, Utc}; use datafusion_expr::dml::InsertOp; use object_store::path::Path; @@ -43,6 +42,8 @@ use datafusion::physical_plan::expressions::{ use datafusion::physical_plan::windows::{create_window_expr, schema_add_window_field}; use datafusion::physical_plan::{Partitioning, PhysicalExpr, WindowExpr}; use datafusion_common::{not_impl_err, DataFusionError, Result}; +use datafusion_common::sort::AdvSortOptions; +use datafusion_common::types::SortOrdering; use datafusion_proto_common::common::proto_error; use crate::convert_required; @@ -75,7 +76,8 @@ pub fn parse_physical_sort_expr( ) -> Result { if let Some(expr) = &proto.expr { let expr = parse_physical_expr(expr.as_ref(), registry, input_schema, codec)?; - let options = SortOptions { + let options = AdvSortOptions { + ordering: SortOrdering::Default, // TODO this should come from a registry descending: !proto.asc, nulls_first: proto.nulls_first, }; diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 60972ac54ba7..f00852f6e06b 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -22,7 +22,6 @@ use datafusion::physical_expr::aggregate::AggregateExprBuilder; use prost::bytes::BufMut; use prost::Message; -use datafusion::arrow::compute::SortOptions; use datafusion::arrow::datatypes::SchemaRef; use datafusion::datasource::file_format::csv::CsvSink; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; @@ -68,6 +67,8 @@ use datafusion::physical_plan::{ }; use datafusion_common::config::TableParquetOptions; use datafusion_common::{internal_err, not_impl_err, DataFusionError, Result}; +use datafusion_common::sort::AdvSortOptions; +use datafusion_common::types::SortOrdering; use datafusion_expr::{AggregateUDF, ScalarUDF, WindowUDF}; use crate::common::{byte_to_string, str_to_byte}; @@ -893,7 +894,8 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { .as_ref(); Ok(PhysicalSortExpr { expr: parse_physical_expr(expr, registry, input.schema().as_ref(), extension_codec)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: !sort_expr.asc, nulls_first: sort_expr.nulls_first, }, @@ -940,7 +942,8 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { .as_ref(); Ok(PhysicalSortExpr { expr: parse_physical_expr(expr, registry, input.schema().as_ref(), extension_codec)?, - options: SortOptions { + options: AdvSortOptions { + ordering: SortOrdering::Default, descending: !sort_expr.asc, nulls_first: sort_expr.nulls_first, }, diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index edabeeb077b7..880fd560973a 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -182,7 +182,7 @@ pub fn serialize_physical_sort_expr( ) -> Result { let PhysicalSortExpr { expr, - options: options, + options, } = sort_expr; let expr = serialize_physical_expr(&expr, codec)?; Ok(PhysicalSortExprNode { From 2159c96db54bdcff747f0a06016125875e1d52f6 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Sun, 9 Mar 2025 11:18:51 +0100 Subject: [PATCH 14/14] Minor additions to docs and formatting --- datafusion/catalog-listing/src/helpers.rs | 2 +- datafusion/common/src/sort.rs | 21 ++++++++--- datafusion/common/src/types/logical.rs | 2 +- .../core/src/datasource/listing/table.rs | 4 +-- datafusion/core/tests/dataframe/mod.rs | 6 ++-- datafusion/core/tests/dataframe/test_types.rs | 32 +++++++++++------ .../core/tests/fuzz_cases/aggregate_fuzz.rs | 4 +-- .../fuzz_cases/equivalence/projection.rs | 4 +-- .../tests/fuzz_cases/equivalence/utils.rs | 35 ++++++++----------- datafusion/core/tests/fuzz_cases/sort_fuzz.rs | 2 +- .../sort_preserving_repartition_fuzz.rs | 4 +-- .../core/tests/fuzz_cases/window_fuzz.rs | 4 +-- datafusion/functions/src/math/log.rs | 2 +- datafusion/functions/src/math/monotonicity.rs | 4 +-- .../physical-expr/src/equivalence/ordering.rs | 2 +- .../src/equivalence/properties/dependency.rs | 12 +++++-- .../src/joins/nested_loop_join.rs | 2 +- .../src/joins/symmetric_hash_join.rs | 2 +- datafusion/physical-plan/src/joins/utils.rs | 2 +- .../physical-plan/src/repartition/mod.rs | 4 +-- .../physical-plan/src/sorts/partial_sort.rs | 6 ++-- datafusion/physical-plan/src/sorts/sort.rs | 2 +- .../src/sorts/sort_preserving_merge.rs | 4 +-- datafusion/physical-plan/src/union.rs | 2 +- .../src/windows/bounded_window_agg_exec.rs | 4 +-- .../proto/src/physical_plan/from_proto.rs | 2 +- datafusion/proto/src/physical_plan/mod.rs | 2 +- .../proto/src/physical_plan/to_proto.rs | 5 +-- 28 files changed, 99 insertions(+), 78 deletions(-) diff --git a/datafusion/catalog-listing/src/helpers.rs b/datafusion/catalog-listing/src/helpers.rs index fc048861b263..a6837f252820 100644 --- a/datafusion/catalog-listing/src/helpers.rs +++ b/datafusion/catalog-listing/src/helpers.rs @@ -544,10 +544,10 @@ mod tests { // use futures::StreamExt; use super::*; + use datafusion_expr::registry::MemoryExtensionTypeRegistry; use datafusion_expr::{ case, col, lit, AggregateUDF, Expr, LogicalPlan, ScalarUDF, WindowUDF, }; - use datafusion_expr::registry::MemoryExtensionTypeRegistry; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_plan::ExecutionPlan; diff --git a/datafusion/common/src/sort.rs b/datafusion/common/src/sort.rs index 6554b938667f..d9c7ce6a5e9c 100644 --- a/datafusion/common/src/sort.rs +++ b/datafusion/common/src/sort.rs @@ -24,7 +24,8 @@ use arrow::datatypes::DataType; use arrow::row::{RowConverter, SortField}; use std::cmp::Ordering; -/// TODO +/// An advanced version of arrow's [SortOptions] that allows for the following features: +/// - Custom sort ordering #[derive(Clone, Debug, Default, Hash, PartialEq, Eq)] pub struct AdvSortOptions { /// Specifies the ordering that is used for sorting. This enables implementing user-defined @@ -93,7 +94,7 @@ impl AdvSortOptions { } } -/// TODO +/// An advanced version of arrow's [SortColumn] that uses an [AdvSortOptions]. #[derive(Clone, Debug)] pub struct AdvSortColumn { pub values: ArrayRef, @@ -101,6 +102,8 @@ pub struct AdvSortColumn { } impl AdvSortColumn { + /// Creates a [DynComparator] from this sort column. The comparison implementation is decided + /// by [SortOrdering::dyn_comparator]. pub fn dyn_compartor(&self) -> Result { let ordering = self .options @@ -115,6 +118,12 @@ impl AdvSortColumn { ordering.dyn_comparator(self.values.clone(), options) } + /// Tries to convert this sort column into an arrow-native [SortColumn]. + /// + /// # Errors + /// + /// This method returns an error if a custom ordering is specified. This is because this + /// ordering cannot be encoded in an arrow native version. pub fn to_arrow(&self) -> Result { let has_custom_sort = self .options @@ -131,9 +140,13 @@ impl AdvSortColumn { } } -/// A lexicographical comparator that wraps given array data (columns) and can lexicographically compare data -/// at given two indices. The lifetime is the same at the data wrapped. +/// A lexicographical comparator that wraps given array data (columns) and can lexicographically +/// compare data at given two indices. The lifetime is the same at the data wrapped. +/// +/// We require a separate version of this arrow data structure as we must construct it directly +/// from a `Vec`. pub struct LexicographicalComparator { + /// Comparators for the lexicographical ordering. compare_items: Vec, } diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index 4d8deb9bb3f9..cbd37c547bab 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -213,7 +213,7 @@ pub trait CustomOrdering: Debug + Send + Sync { /// The ordering id is used to establish equality between instances of [CustomOrdering]. fn ordering_id(&self) -> &str; - /// TODO + /// Returns a [DynComparator] over the given `array` adhering to the given `options`. fn dyn_comparator( &self, array: ArrayRef, diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 1642ec5065ca..24b9c45fd51a 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -1208,10 +1208,10 @@ mod tests { use datafusion_physical_plan::ExecutionPlanProperties; use crate::test::object_store::{ensure_head_concurrency, make_test_store_and_state}; - use tempfile::TempDir; - use url::Url; use datafusion_common::sort::AdvSortOptions; use datafusion_common::types::SortOrdering; + use tempfile::TempDir; + use url::Url; #[tokio::test] async fn read_single_file() -> Result<()> { diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 9860da03e222..68c59bffdf92 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -3089,7 +3089,7 @@ async fn sort_on_ambiguous_column() -> Result<()> { #[tokio::test] async fn sort_on_union_with_logical_type() -> Result<()> { let mut builder = UnionBuilder::new_dense(); - builder.append::("integer", 1)?; + builder.append::("integer", 10)?; builder.append::("float", 6.0)?; builder.append::("integer", -1)?; builder.append::("float", 3.0)?; @@ -3128,10 +3128,10 @@ async fn sort_on_union_with_logical_type() -> Result<()> { .unwrap()?; let result = as_union_array(record_batch.column_by_name("my_union").unwrap()); - assert_eq!(result.type_ids(), &[0, 0, 1, 1]); + assert_eq!(result.type_ids(), &[0, 1, 1, 0]); assert_eq!( result.child(0).as_primitive::().values(), - &[-1, 1] + &[-1, 10] ); assert_eq!( result.child(1).as_primitive::().values(), diff --git a/datafusion/core/tests/dataframe/test_types.rs b/datafusion/core/tests/dataframe/test_types.rs index 1144c82b3136..269031f0aca2 100644 --- a/datafusion/core/tests/dataframe/test_types.rs +++ b/datafusion/core/tests/dataframe/test_types.rs @@ -64,9 +64,8 @@ impl LogicalType for IntOrFloatType { } } -/// The order of the IntOrFloat is defined as follows: -/// - All integers followed by all floats -/// - Within one subtype, the integers and floats are sorted using their natural order. +/// The order of the IntOrFloat is computed by converting both values to an `f64` and comparing +/// the resulting value. #[derive(Debug)] struct IntOrFloatTypeOrdering {} @@ -118,21 +117,32 @@ fn compare_impl(array: &UnionArray, lhs: usize, rhs: usize) -> Ordering { let type_lhs = array.type_ids()[lhs]; let type_rhs = array.type_ids()[rhs]; - if type_lhs != type_rhs { - return type_lhs.cmp(&type_rhs); - } - let offset_lhs = array.value_offset(lhs); let offset_rhs = array.value_offset(rhs); - match type_lhs { + + let lhs = match type_lhs { 0 => { let array = array.child(type_lhs).as_primitive::(); - array.value(offset_lhs).cmp(&array.value(offset_rhs)) + array.value(offset_lhs) as f64 } 1 => { let array = array.child(type_lhs).as_primitive::(); - array.value(offset_lhs).total_cmp(&array.value(offset_rhs)) + array.value(offset_lhs) } _ => unreachable!("Union only has two variants"), - } + }; + + let rhs = match type_rhs { + 0 => { + let array = array.child(type_rhs).as_primitive::(); + array.value(offset_rhs) as f64 + } + 1 => { + let array = array.child(type_rhs).as_primitive::(); + array.value(offset_rhs) + } + _ => unreachable!("Union only has two variants"), + }; + + lhs.total_cmp(&rhs) } diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index 8a1d188be163..1d0953338953 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -23,7 +23,7 @@ use crate::fuzz_cases::aggregation_fuzzer::{ }; use arrow::array::{types::Int64Type, Array, ArrayRef, AsArray, Int64Array, RecordBatch}; -use arrow::compute::{concat_batches}; +use arrow::compute::concat_batches; use arrow::datatypes::{ DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, @@ -48,10 +48,10 @@ use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::InputOrderMode; use test_utils::{add_empty_batches, StringBatchGenerator}; +use datafusion_common::sort::AdvSortOptions; use rand::rngs::StdRng; use rand::{thread_rng, Rng, SeedableRng}; use tokio::task::JoinSet; -use datafusion_common::sort::AdvSortOptions; // ======================================================================== // The new aggregation fuzz tests based on [`AggregationFuzzer`] diff --git a/datafusion/core/tests/fuzz_cases/equivalence/projection.rs b/datafusion/core/tests/fuzz_cases/equivalence/projection.rs index 680f89198e96..a8bd8301bf1c 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/projection.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/projection.rs @@ -19,6 +19,8 @@ use crate::fuzz_cases::equivalence::utils::{ apply_projection, create_random_schema, generate_table_for_eq_properties, is_table_same_after_sort, TestScalarUDF, }; +use datafusion_common::sort::AdvSortOptions; +use datafusion_common::types::SortOrdering; use datafusion_common::Result; use datafusion_expr::{Operator, ScalarUDF}; use datafusion_physical_expr::equivalence::ProjectionMapping; @@ -28,8 +30,6 @@ use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use itertools::Itertools; use std::sync::Arc; -use datafusion_common::sort::AdvSortOptions; -use datafusion_common::types::SortOrdering; #[test] fn project_orderings_random() -> Result<()> { diff --git a/datafusion/core/tests/fuzz_cases/equivalence/utils.rs b/datafusion/core/tests/fuzz_cases/equivalence/utils.rs index 5ce2ec059a50..fd6788b26be7 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/utils.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/utils.rs @@ -23,8 +23,8 @@ use std::cmp::Ordering; use std::sync::Arc; use arrow::array::{ArrayRef, Float32Array, Float64Array, RecordBatch, UInt32Array}; -use arrow::compute::{SortColumn, SortOptions}; use arrow::compute::{lexsort_to_indices, take_record_batch}; +use arrow::compute::{SortColumn, SortOptions}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::utils::{compare_rows, get_row_at_idx}; use datafusion_common::{exec_err, plan_datafusion_err, DataFusionError, Result}; @@ -36,10 +36,10 @@ use datafusion_physical_expr::equivalence::{EquivalenceClass, ProjectionMapping} use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use itertools::izip; -use rand::prelude::*; use datafusion_common::sort::AdvSortOptions; use datafusion_common::types::SortOrdering; +use itertools::izip; +use rand::prelude::*; pub fn output_schema( mapping: &ProjectionMapping, @@ -390,23 +390,18 @@ pub fn generate_table_for_eq_properties( for ordering in eq_properties.oeq_class().iter() { let (sort_columns, indices): (Vec<_>, Vec<_>) = ordering .iter() - .map( - |PhysicalSortExpr { - expr, - options, - }| { - let col = expr.as_any().downcast_ref::().unwrap(); - let (idx, _field) = schema.column_with_name(col.name()).unwrap(); - let arr = generate_random_array(n_elem, n_distinct); - ( - SortColumn { - values: arr, - options: Some(options.to_arrow().unwrap()), - }, - idx, - ) - }, - ) + .map(|PhysicalSortExpr { expr, options }| { + let col = expr.as_any().downcast_ref::().unwrap(); + let (idx, _field) = schema.column_with_name(col.name()).unwrap(); + let arr = generate_random_array(n_elem, n_distinct); + ( + SortColumn { + values: arr, + options: Some(options.to_arrow().unwrap()), + }, + idx, + ) + }) .unzip(); let sort_arrs = arrow::compute::lexsort(&sort_columns, None)?; diff --git a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs index d086ae36c63b..98c415b98cf2 100644 --- a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs @@ -34,9 +34,9 @@ use datafusion_execution::memory_pool::GreedyMemoryPool; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use rand::Rng; use datafusion_common::sort::AdvSortOptions; use datafusion_common::types::SortOrdering; +use rand::Rng; use test_utils::{batches_to_vec, partitions_to_sorted_vec}; const KB: usize = 1 << 10; diff --git a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs index 0760c5fbc249..201586f2e155 100644 --- a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs @@ -45,11 +45,11 @@ mod sp_repartition_fuzz_tests { use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::source::DataSourceExec; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; use datafusion_physical_expr_common::sort_expr::LexOrdering; use itertools::izip; use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; - use datafusion_common::sort::AdvSortOptions; - use datafusion_common::types::SortOrdering; // Generate a schema which consists of 6 columns (a, b, c, d, e, f) fn create_test_schema() -> Result { diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs index c135c8df036e..b5b6c9446c80 100644 --- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs @@ -18,7 +18,7 @@ use std::sync::Arc; use arrow::array::{ArrayRef, Int32Array, StringArray}; -use arrow::compute::{concat_batches}; +use arrow::compute::concat_batches; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use arrow::util::pretty::pretty_format_batches; @@ -51,10 +51,10 @@ use datafusion_physical_expr::expressions::{cast, col, lit}; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; +use datafusion_common::sort::AdvSortOptions; use rand::distributions::Alphanumeric; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; -use datafusion_common::sort::AdvSortOptions; use test_utils::add_empty_batches; #[tokio::test(flavor = "multi_thread", worker_threads = 16)] diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs index e0d0d22cb188..7a7187b3dc2f 100644 --- a/datafusion/functions/src/math/log.rs +++ b/datafusion/functions/src/math/log.rs @@ -260,9 +260,9 @@ mod tests { use arrow::array::{Float32Array, Float64Array, Int64Array}; use datafusion_common::cast::{as_float32_array, as_float64_array}; - use datafusion_common::DFSchema; use datafusion_common::sort::AdvSortOptions; use datafusion_common::types::SortOrdering; + use datafusion_common::DFSchema; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::simplify::SimplifyContext; diff --git a/datafusion/functions/src/math/monotonicity.rs b/datafusion/functions/src/math/monotonicity.rs index fc360377efdc..ecbd56fe7a3a 100644 --- a/datafusion/functions/src/math/monotonicity.rs +++ b/datafusion/functions/src/math/monotonicity.rs @@ -567,10 +567,10 @@ pub fn get_tanh_doc() -> &'static Documentation { #[cfg(test)] mod tests { - use datafusion_common::Result; + use super::*; use datafusion_common::sort::AdvSortOptions; use datafusion_common::types::SortOrdering; - use super::*; + use datafusion_common::Result; #[derive(Debug)] struct MonotonicityTestCase { diff --git a/datafusion/physical-expr/src/equivalence/ordering.rs b/datafusion/physical-expr/src/equivalence/ordering.rs index 23cf3a9ab597..837abfd5ae6c 100644 --- a/datafusion/physical-expr/src/equivalence/ordering.rs +++ b/datafusion/physical-expr/src/equivalence/ordering.rs @@ -359,8 +359,8 @@ mod tests { use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_common::Result; use datafusion_common::sort::AdvSortOptions; + use datafusion_common::Result; use datafusion_expr::{Operator, ScalarUDF}; use datafusion_physical_expr_common::sort_expr::LexOrdering; diff --git a/datafusion/physical-expr/src/equivalence/properties/dependency.rs b/datafusion/physical-expr/src/equivalence/properties/dependency.rs index 1ee9571cff2e..5a018c62a2a0 100644 --- a/datafusion/physical-expr/src/equivalence/properties/dependency.rs +++ b/datafusion/physical-expr/src/equivalence/properties/dependency.rs @@ -437,9 +437,9 @@ mod tests { use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; - use datafusion_common::{Constraint, Constraints, Result}; use datafusion_common::sort::AdvSortOptions; use datafusion_common::types::SortOrdering; + use datafusion_common::{Constraint, Constraints, Result}; use datafusion_expr::sort_properties::SortProperties; use datafusion_expr::Operator; @@ -742,9 +742,15 @@ mod tests { SortProperties::Ordered(option_asc.clone()), ), // b - (Arc::clone(col_b), SortProperties::Ordered(option_asc.clone())), + ( + Arc::clone(col_b), + SortProperties::Ordered(option_asc.clone()), + ), // a - (Arc::clone(col_a), SortProperties::Ordered(option_asc.clone())), + ( + Arc::clone(col_a), + SortProperties::Ordered(option_asc.clone()), + ), // a + c ( Arc::new(BinaryExpr::new( diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index d2562982b3a7..124d007db4ed 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -1048,9 +1048,9 @@ pub(crate) mod tests { use datafusion_physical_expr::{Partitioning, PhysicalExpr}; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; - use rstest::rstest; use datafusion_common::sort::AdvSortOptions; use datafusion_common::types::SortOrdering; + use rstest::rstest; fn build_table( a: (&str, &Vec), diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index 83fab5342fe9..2c3afaca4047 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -1735,9 +1735,9 @@ mod tests { use datafusion_physical_expr::expressions::{binary, col, lit, Column}; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; - use rstest::*; use datafusion_common::sort::AdvSortOptions; use datafusion_common::types::SortOrdering; + use rstest::*; const TABLE_SIZE: i32 = 30; diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 16631b0e9bcc..f39d0e682d16 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -1832,8 +1832,8 @@ mod tests { use datafusion_common::stats::Precision::{Absent, Exact, Inexact}; use datafusion_common::{arrow_datafusion_err, arrow_err, ScalarValue}; - use rstest::rstest; use datafusion_common::sort::AdvSortOptions; + use rstest::rstest; fn check( left: &[Column], diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index c196fd6466a1..17c6882a8eda 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -1599,11 +1599,11 @@ mod tests { #[cfg(test)] mod test { - use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_common::sort::AdvSortOptions; use super::*; use crate::test::TestMemoryExec; use crate::union::UnionExec; + use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common::sort::AdvSortOptions; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; diff --git a/datafusion/physical-plan/src/sorts/partial_sort.rs b/datafusion/physical-plan/src/sorts/partial_sort.rs index e097f52408a5..371a93c3802c 100644 --- a/datafusion/physical-plan/src/sorts/partial_sort.rs +++ b/datafusion/physical-plan/src/sorts/partial_sort.rs @@ -464,9 +464,6 @@ mod tests { use futures::FutureExt; use itertools::Itertools; - use datafusion_common::assert_batches_eq; - use datafusion_common::sort::AdvSortOptions; - use datafusion_common::types::SortOrdering; use crate::collect; use crate::expressions::col; use crate::expressions::PhysicalSortExpr; @@ -475,6 +472,9 @@ mod tests { use crate::test::assert_is_pending; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; use crate::test::TestMemoryExec; + use datafusion_common::assert_batches_eq; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; use super::*; diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index ed5c9bdeecb8..decc4ecf4741 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -1161,9 +1161,9 @@ mod tests { use datafusion_physical_expr::expressions::{Column, Literal}; use datafusion_physical_expr::EquivalenceProperties; - use futures::{FutureExt, Stream}; use datafusion_common::sort::AdvSortOptions; use datafusion_common::types::SortOrdering; + use futures::{FutureExt, Stream}; #[derive(Debug, Clone)] pub struct SortedUnboundedExec { diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index a541a3ec0a7b..ac39a2c4fd37 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -411,11 +411,11 @@ mod tests { use datafusion_physical_expr::EquivalenceProperties; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr; use futures::{FutureExt, Stream, StreamExt}; use tokio::time::timeout; - use datafusion_common::sort::AdvSortOptions; - use datafusion_common::types::SortOrdering; // The number in the function is highly related to the memory limit we are testing // any change of the constant should be aware of diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index b5739b45c0c5..9741b724adf8 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -653,8 +653,8 @@ mod tests { use arrow::compute::SortOptions; use arrow::datatypes::DataType; - use datafusion_common::ScalarValue; use datafusion_common::sort::AdvSortOptions; + use datafusion_common::ScalarValue; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 9d0c07f26d67..f7ffa0b4d0e2 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -1231,12 +1231,12 @@ mod tests { use datafusion_physical_expr::window::StandardWindowExpr; use datafusion_physical_expr::{LexOrdering, PhysicalExpr}; + use datafusion_common::sort::AdvSortOptions; + use datafusion_common::types::SortOrdering; use futures::future::Shared; use futures::{pin_mut, ready, FutureExt, Stream, StreamExt}; use itertools::Itertools; use tokio::time::timeout; - use datafusion_common::sort::AdvSortOptions; - use datafusion_common::types::SortOrdering; #[derive(Debug, Clone)] struct TestStreamPartition { diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 4cdd9c8a77ac..3f5c24673119 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -41,9 +41,9 @@ use datafusion::physical_plan::expressions::{ }; use datafusion::physical_plan::windows::{create_window_expr, schema_add_window_field}; use datafusion::physical_plan::{Partitioning, PhysicalExpr, WindowExpr}; -use datafusion_common::{not_impl_err, DataFusionError, Result}; use datafusion_common::sort::AdvSortOptions; use datafusion_common::types::SortOrdering; +use datafusion_common::{not_impl_err, DataFusionError, Result}; use datafusion_proto_common::common::proto_error; use crate::convert_required; diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index f00852f6e06b..453c2686c163 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -66,9 +66,9 @@ use datafusion::physical_plan::{ ExecutionPlan, InputOrderMode, PhysicalExpr, WindowExpr, }; use datafusion_common::config::TableParquetOptions; -use datafusion_common::{internal_err, not_impl_err, DataFusionError, Result}; use datafusion_common::sort::AdvSortOptions; use datafusion_common::types::SortOrdering; +use datafusion_common::{internal_err, not_impl_err, DataFusionError, Result}; use datafusion_expr::{AggregateUDF, ScalarUDF, WindowUDF}; use crate::common::{byte_to_string, str_to_byte}; diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index 880fd560973a..3f67842fe625 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -180,10 +180,7 @@ pub fn serialize_physical_sort_expr( sort_expr: PhysicalSortExpr, codec: &dyn PhysicalExtensionCodec, ) -> Result { - let PhysicalSortExpr { - expr, - options, - } = sort_expr; + let PhysicalSortExpr { expr, options } = sort_expr; let expr = serialize_physical_expr(&expr, codec)?; Ok(PhysicalSortExprNode { expr: Some(Box::new(expr)),