apache · Rachelint · Apr 9, 2025 · Apr 9, 2025 · Apr 9, 2025 · Apr 10, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/datafusion-examples/examples/advanced_udaf.rs b/datafusion-examples/examples/advanced_udaf.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use arrow::datatypes::{Field, Schema};
-use datafusion::physical_expr::NullState;
+use datafusion::physical_expr::FlatNullState;
 use datafusion::{arrow::datatypes::DataType, logical_expr::Volatility};
 use std::{any::Any, sync::Arc};
 
@@ -217,7 +217,7 @@ struct GeometricMeanGroupsAccumulator {
     prods: Vec<f64>,
 
     /// Track nulls in the input / filters
-    null_state: NullState,
+    null_state: FlatNullState,
 }
 
 impl GeometricMeanGroupsAccumulator {
@@ -227,7 +227,7 @@ impl GeometricMeanGroupsAccumulator {
             return_data_type: DataType::Float64,
             counts: vec![],
             prods: vec![],
-            null_state: NullState::new(),
+            null_state: FlatNullState::new(),
         }
     }
 }
@@ -248,13 +248,17 @@ impl GroupsAccumulator for GeometricMeanGroupsAccumulator {
         // increment counts, update sums
         self.counts.resize(total_num_groups, 0);
         self.prods.resize(total_num_groups, 1.0);
-        // Use the `NullState` structure to generate specialized code for null / non null input elements
+        // Use the `FlatNullState` structure to generate specialized code for null / non null input elements.
+        // `block_id` is ignored in `value_fn` because this accumulator does
+        // not support blocked groups yet.
+        // For more details, see `GroupsAccumulator::supports_blocked_groups`.
         self.null_state.accumulate(
             group_indices,
             values,
             opt_filter,
             total_num_groups,
-            |group_index, new_value| {
+            |_, group_index, new_value| {
+                let group_index = group_index as usize;
                 let prod = &mut self.prods[group_index];
                 *prod = prod.mul_wrapping(new_value);
 
@@ -279,13 +283,16 @@ impl GroupsAccumulator for GeometricMeanGroupsAccumulator {
         let partial_counts = values[1].as_primitive::<UInt32Type>();
         // update counts with partial counts
         self.counts.resize(total_num_groups, 0);
+        // `block_id` is ignored in `value_fn` because this accumulator does
+        // not support blocked groups yet.
+        // For more details, see `GroupsAccumulator::supports_blocked_groups`.
         self.null_state.accumulate(
             group_indices,
             partial_counts,
             opt_filter,
             total_num_groups,
-            |group_index, partial_count| {
-                self.counts[group_index] += partial_count;
+            |_, group_index, partial_count| {
+                self.counts[group_index as usize] += partial_count;
             },
         );
 
@@ -296,8 +303,8 @@ impl GroupsAccumulator for GeometricMeanGroupsAccumulator {
             partial_prods,
             opt_filter,
             total_num_groups,
-            |group_index, new_value: <Float64Type as ArrowPrimitiveType>::Native| {
-                let prod = &mut self.prods[group_index];
+            |_, group_index, new_value: <Float64Type as ArrowPrimitiveType>::Native| {
+                let prod = &mut self.prods[group_index as usize];
                 *prod = prod.mul_wrapping(new_value);
             },
         );

diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
@@ -411,6 +411,17 @@ config_namespace! {
         /// written, it may be necessary to increase this size to avoid errors from
         /// the remote end point.
         pub objectstore_writer_buffer_size: usize, default = 10 * 1024 * 1024
+
+        /// Should DataFusion use a blocked approach to manage grouping state.
+        /// By default, the blocked approach is used, which first
+        /// allocates capacity based on a predefined block size.
+        /// When a block reaches its limit, a new block is allocated (with
+        /// the same predefined block-size-based capacity) instead of expanding
+        /// the current one and copying the data.
+        /// If `false`, a single allocation approach is used, where
+        /// values are managed within a single large memory block.
+        /// As this block grows, it often triggers numerous copies, resulting in poor performance.
+        pub enable_aggregation_blocked_groups: bool, default = true
     }
 }
 

diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
@@ -233,6 +233,46 @@ async fn test_median() {
         .await;
 }
 
+// Tests the `blocked groups optimization`.
+// For details of this optimization, see:
+// https://github.com/apache/datafusion/issues/7065
+#[tokio::test(flavor = "multi_thread")]
+async fn test_blocked_groups_optimization() {
+    let data_gen_config = baseline_config();
+
+    // Components that currently support blocked groups:
+    //
+    // `GroupsAccumulator`:
+    //    - PrimitiveGroupsAccumulator
+    //
+    // `GroupValues`:
+    //   - GroupValuesPrimitive
+    //
+
+    // Test `Numeric aggregation` + `Single group by`
+    let aggr_functions = ["sum", "min", "max"];
+    let aggr_arguments = data_gen_config.numeric_columns();
+    let groups_by_columns = data_gen_config.numeric_columns();
+
+    let mut query_builder = QueryBuilder::new()
+        .with_table_name("fuzz_table")
+        .with_aggregate_arguments(aggr_arguments)
+        .set_group_by_columns(groups_by_columns)
+        .with_min_group_by_columns(1)
+        .with_max_group_by_columns(1)
+        .with_no_grouping(false);
+
+    for func in aggr_functions {
+        query_builder = query_builder.with_aggregate_function(func);
+    }
+
+    AggregationFuzzerBuilder::from(data_gen_config)
+        .add_query_builder(query_builder)
+        .build()
+        .run()
+        .await;
+}
+
 /// Return a standard set of columns for testing data generation
 ///
 /// Includes numeric and string types

diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs
@@ -103,6 +103,7 @@ impl SessionContextGenerator {
             target_partitions,
             skip_partial_params,
             sort_hint: false,
+            enable_aggregation_blocked_groups: false,
             table_name: self.table_name.clone(),
             table_provider: Arc::new(provider),
         };
@@ -146,11 +147,14 @@ impl SessionContextGenerator {
                 (provider, false)
             };
 
+        let enable_aggregation_blocked_groups = rng.gen_bool(0.5);
+
         let builder = GeneratedSessionContextBuilder {
             batch_size,
             target_partitions,
             sort_hint,
             skip_partial_params,
+            enable_aggregation_blocked_groups,
             table_name: self.table_name.clone(),
             table_provider: Arc::new(provider),
         };
@@ -174,6 +178,7 @@ struct GeneratedSessionContextBuilder {
     target_partitions: usize,
     sort_hint: bool,
     skip_partial_params: SkipPartialParams,
+    enable_aggregation_blocked_groups: bool,
     table_name: String,
     table_provider: Arc<dyn TableProvider>,
 }
@@ -198,6 +203,10 @@ impl GeneratedSessionContextBuilder {
             "datafusion.execution.skip_partial_aggregation_probe_ratio_threshold",
             &ScalarValue::Float64(Some(self.skip_partial_params.ratio_threshold)),
         );
+        session_config = session_config.set(
+            "datafusion.execution.enable_aggregation_blocked_groups",
+            &ScalarValue::Boolean(Some(self.enable_aggregation_blocked_groups)),
+        );
 
         let ctx = SessionContext::new_with_config(session_config);
         ctx.register_table(self.table_name, self.table_provider)?;
@@ -207,6 +216,7 @@ impl GeneratedSessionContextBuilder {
             target_partitions: self.target_partitions,
             sort_hint: self.sort_hint,
             skip_partial_params: self.skip_partial_params,
+            enable_aggregation_blocked_groups: self.enable_aggregation_blocked_groups,
         };
 
         Ok(SessionContextWithParams { ctx, params })
@@ -221,6 +231,7 @@ pub struct SessionContextParams {
     target_partitions: usize,
     sort_hint: bool,
     skip_partial_params: SkipPartialParams,
+    enable_aggregation_blocked_groups: bool,
 }
 
 /// Partial skipping parameters

diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs
@@ -247,7 +247,6 @@ impl QueryBuilder {
 
     fn generate_query(&self) -> String {
         let group_by = self.random_group_by();
-        dbg!(&group_by);
         let mut query = String::from("SELECT ");
         query.push_str(&group_by.join(", "));
         if !group_by.is_empty() {

diff --git a/datafusion/expr-common/src/groups_accumulator.rs b/datafusion/expr-common/src/groups_accumulator.rs
@@ -18,19 +18,23 @@
 //! Vectorized [`GroupsAccumulator`]
 
 use arrow::array::{ArrayRef, BooleanArray};
-use datafusion_common::{not_impl_err, Result};
+use datafusion_common::{not_impl_err, DataFusionError, Result};
 
 /// Describes how many rows should be emitted during grouping.
 #[derive(Debug, Clone, Copy)]
 pub enum EmitTo {
-    /// Emit all groups
+    /// Emit all groups; this clears all existing group indexes
     All,
     /// Emit only the first `n` groups and shift all existing group
     /// indexes down by `n`.
     ///
     /// For example, if `n=10`, group_index `0, 1, ... 9` are emitted
     /// and group indexes `10, 11, 12, ...` become `0, 1, 2, ...`.
     First(usize),
+    /// Emit the next block of the block-managed groups
+    ///
+    /// Similar to `EmitTo::All`, this also clears all existing group indexes
+    NextBlock,
 }
 
 impl EmitTo {
@@ -39,6 +43,9 @@ impl EmitTo {
     /// remaining values in `v`.
     ///
     /// This avoids copying if Self::All
+    ///
+    /// NOTICE: only `Self::All` and `Self::First` are supported; panics if called with `Self::NextBlock`
+    ///
     pub fn take_needed<T>(&self, v: &mut Vec<T>) -> Vec<T> {
         match self {
             Self::All => {
@@ -52,6 +59,7 @@ impl EmitTo {
                 std::mem::swap(v, &mut t);
                 t
             }
+            Self::NextBlock => unreachable!("don't support take block in take_needed"),
         }
     }
 }
@@ -250,4 +258,49 @@ pub trait GroupsAccumulator: Send {
     /// This function is called once per batch, so it should be `O(n)` to
     /// compute, not `O(num_groups)`
     fn size(&self) -> usize;
+
+    /// Returns `true` if this accumulator supports blocked groups.
+    ///
+    /// Blocked groups (also called the blocked management approach) is an optimization
+    /// that reduces the cost of managing intermediate aggregation states.
+    ///
+    /// Here is a brief introduction to the two state management approaches:
+    ///   - Blocked approach: states are stored and managed in multiple `Vec`s,
+    ///     which we call `Block`s. Organizing them this way avoids resizing a `Vec`;
+    ///     a new `Vec` is allocated instead, reducing cost and improving performance.
+    ///     When locating data in `Block`s, we first use `block_id` to locate the
+    ///     needed `Block`, and then use `block_offset` to locate the needed
+    ///     data within that `Block`.
+    ///
+    ///   - Single approach: all states are stored and managed in a single large `Block`.
+    ///     So when locating data, `block_id` will always be 0, and we only need `block_offset`
+    ///     to locate data in the single `Block`.
+    ///
+    /// For more details, see:
+    /// <https://github.com/apache/datafusion/issues/7065>
+    ///
+    fn supports_blocked_groups(&self) -> bool {
+        false
+    }
+
+    /// Alter the block size in the accumulator
+    ///
+    /// If the target block size is `None`, a single big block
+    /// (think of it as a `Vec`) is used to manage the state.
+    ///
+    /// If the target block size is `Some(blk_size)`, it will try to
+    /// set the block size to `blk_size`; the attempt only succeeds
+    /// if the accumulator supports blocked mode.
+    ///
+    /// NOTICE: After altering the block size, all data in existing accumulators will be cleared.
+    ///
+    fn alter_block_size(&mut self, block_size: Option<usize>) -> Result<()> {
+        if block_size.is_some() {
+            return Err(DataFusionError::NotImplemented(
+                "this accumulator doesn't support blocked mode yet".to_string(),
+            ));
+        }
+
+        Ok(())
+    }
 }
diff --git a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs
@@ -19,7 +19,9 @@
 //! Adapter that makes [`GroupsAccumulator`] out of [`Accumulator`]
 
 pub mod accumulate;
+pub mod blocks;
 pub mod bool_op;
+pub mod group_index_operations;
 pub mod nulls;
 pub mod prim_op;