diff --git a/datafusion/functions-aggregate/COMMENTS.md b/datafusion/functions-aggregate/COMMENTS.md new file mode 100644 index 000000000000..23a996faf007 --- /dev/null +++ b/datafusion/functions-aggregate/COMMENTS.md @@ -0,0 +1,77 @@ + + +# Why Is List Item Always Nullable? + +## Motivation + +There were independent proposals to make the `nullable` setting of list +items in accumulator state be configurable. This meant adding additional +fields which captured the `nullable` setting from schema in planning for +the first argument to the aggregation function, and the returned value. + +These fields were to be added to `StateFieldArgs`. But then we found out +that aggregate computation does not depend on it, and it can be avoided. + +This document exists to make that reasoning explicit. + +## Background + +The list data type is used in the accumulator state for a few aggregate +functions like: + +- `sum` +- `count` +- `array_agg` +- `bit_and`, `bit_or` and `bit_xor` +- `nth_value` + +In all of the above cases the data type of the list item is equivalent +to either the first argument of the aggregate function or the returned +value. + +For example, in `array_agg` the data type of item is equivalent to the +first argument and the definition looks like this: + +```rust +// `args` : `StateFieldArgs` +// `input_type` : data type of the first argument +let mut fields = vec![Field::new_list( + format_state_name(self.name(), "nth_value"), + Field::new("item", args.input_type.clone(), true /* nullable of list item */ ), + false, // nullable of list itself +)]; +``` + +For all the aggregates listed above, the list item is always defined as +nullable. + +## Computing Intermediate State + +By setting `nullable` (of list item) to be always `true` like this we +ensure that the aggregate computation works even when nulls are +present. The advantage of doing it this way is that it eliminates the +need for additional code and special treatment of nulls in the +accumulator state. + +## Nullable Of List Itself + +The `nullable` of list itself depends on the aggregate. In the case of +`array_agg` the list is nullable(`true`), meanwhile for `sum` the list +is not nullable(`false`). diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index 96b39ae4121e..c25d592428bb 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -89,6 +89,7 @@ impl AggregateUDFImpl for ArrayAgg { if args.is_distinct { return Ok(vec![Field::new_list( format_state_name(args.name, "distinct_array_agg"), + // See COMMENTS.md to understand why nullable is set to true Field::new("item", args.input_type.clone(), true), true, )]); @@ -96,6 +97,7 @@ impl AggregateUDFImpl for ArrayAgg { let mut fields = vec![Field::new_list( format_state_name(args.name, "array_agg"), + // See COMMENTS.md to understand why nullable is set to true Field::new("item", args.input_type.clone(), true), true, )]; diff --git a/datafusion/functions-aggregate/src/bit_and_or_xor.rs b/datafusion/functions-aggregate/src/bit_and_or_xor.rs index 6c2d6cb5285c..f6dd0bc20a83 100644 --- a/datafusion/functions-aggregate/src/bit_and_or_xor.rs +++ b/datafusion/functions-aggregate/src/bit_and_or_xor.rs @@ -203,6 +203,7 @@ impl AggregateUDFImpl for BitwiseOperation { args.name, format!("{} distinct", self.name()).as_str(), ), + // See COMMENTS.md to understand why nullable is set to true Field::new("item", args.return_type.clone(), true), false, )]) diff --git a/datafusion/functions-aggregate/src/count.rs b/datafusion/functions-aggregate/src/count.rs index 0ead22e90a16..56850d0e02a1 100644 --- a/datafusion/functions-aggregate/src/count.rs +++ b/datafusion/functions-aggregate/src/count.rs @@ -125,6 +125,7 @@ impl AggregateUDFImpl for Count { if args.is_distinct { Ok(vec![Field::new_list( format_state_name(args.name, "count distinct"), + // See COMMENTS.md to understand why nullable is set to true Field::new("item", args.input_type.clone(), true), false, )]) diff --git a/datafusion/functions-aggregate/src/nth_value.rs b/datafusion/functions-aggregate/src/nth_value.rs index 9bbd68c9bdf6..0c1f1f292822 100644 --- a/datafusion/functions-aggregate/src/nth_value.rs +++ b/datafusion/functions-aggregate/src/nth_value.rs @@ -134,10 +134,7 @@ impl AggregateUDFImpl for NthValueAgg { fn state_fields(&self, args: StateFieldsArgs) -> Result> { let mut fields = vec![Field::new_list( format_state_name(self.name(), "nth_value"), - // TODO: The nullability of the list element should be configurable. - // The hard-coded `true` should be changed once the field for - // nullability is added to `StateFieldArgs` struct. - // See: https://github.com/apache/datafusion/pull/11063 + // See COMMENTS.md to understand why nullable is set to true Field::new("item", args.input_type.clone(), true), false, )]; diff --git a/datafusion/functions-aggregate/src/sum.rs b/datafusion/functions-aggregate/src/sum.rs index a9f31dc05be9..08e3908a5829 100644 --- a/datafusion/functions-aggregate/src/sum.rs +++ b/datafusion/functions-aggregate/src/sum.rs @@ -174,6 +174,7 @@ impl AggregateUDFImpl for Sum { if args.is_distinct { Ok(vec![Field::new_list( format_state_name(args.name, "sum distinct"), + // See COMMENTS.md to understand why nullable is set to true Field::new("item", args.return_type.clone(), true), false, )])