Add array_distance function (#12211)

austin362667 · web-flow · commit bd506980bd04 · 2024-08-29T12:49:04.000+08:00
* Add `distance` aggregation function

Signed-off-by: Austin Liu &lt;austin362667@gmail.com&gt;

Add `distance` aggregation function

Signed-off-by: Austin Liu &lt;austin362667@gmail.com&gt;

* Add sql logic test for `distance`

Signed-off-by: Austin Liu &lt;austin362667@gmail.com&gt;

* Simplify diff calculation

Signed-off-by: Austin Liu &lt;austin362667@gmail.com&gt;

* Add `array_distance`/`list_distance` as list function in functions-nested

Signed-off-by: Austin Liu &lt;austin362667@gmail.com&gt;

* Remove aggregate function `distance`

Signed-off-by: Austin Liu &lt;austin362667@gmail.com&gt;

* format

Signed-off-by: Austin Liu &lt;austin362667@gmail.com&gt;

* clean up error handling

Signed-off-by: Austin Liu &lt;austin362667@gmail.com&gt;

* Add `array_distance` in scalar array functions docs

Signed-off-by: Austin Liu &lt;austin362667@gmail.com&gt;

* Update bulletin

Signed-off-by: Austin Liu &lt;austin362667@gmail.com&gt;

* Prettify example

Signed-off-by: Austin Liu &lt;austin362667@gmail.com&gt;

---------

Signed-off-by: Austin Liu &lt;austin362667@gmail.com&gt;
diff --git a/datafusion/functions-nested/src/distance.rs b/datafusion/functions-nested/src/distance.rs
@@ -0,0 +1,215 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [ScalarUDFImpl] definitions for array_distance function.
+
+use crate::utils::{downcast_arg, make_scalar_function};
+use arrow_array::{
+    Array, ArrayRef, Float64Array, LargeListArray, ListArray, OffsetSizeTrait,
+};
+use arrow_schema::DataType;
+use arrow_schema::DataType::{FixedSizeList, Float64, LargeList, List};
+use core::any::type_name;
+use datafusion_common::cast::{
+    as_float32_array, as_float64_array, as_generic_list_array, as_int32_array,
+    as_int64_array,
+};
+use datafusion_common::DataFusionError;
+use datafusion_common::{exec_err, Result};
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+use std::any::Any;
+use std::sync::Arc;
+
+make_udf_expr_and_func!(
+    ArrayDistance,
+    array_distance,
+    array,
+    "returns the Euclidean distance between two numeric arrays.",
+    array_distance_udf
+);
+
+#[derive(Debug)]
+pub(super) struct ArrayDistance {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl ArrayDistance {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::variadic_any(Volatility::Immutable),
+            aliases: vec!["list_distance".to_string()],
+        }
+    }
+}
+
+impl ScalarUDFImpl for ArrayDistance {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "array_distance"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        match arg_types[0] {
+            List(_) | LargeList(_) | FixedSizeList(_, _) => Ok(Float64),
+            _ => exec_err!("The array_distance function can only accept List/LargeList/FixedSizeList."),
+        }
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        make_scalar_function(array_distance_inner)(args)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+pub fn array_distance_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+    if args.len() != 2 {
+        return exec_err!("array_distance expects exactly two arguments");
+    }
+
+    match (&args[0].data_type(), &args[1].data_type()) {
+        (List(_), List(_)) => general_array_distance::<i32>(args),
+        (LargeList(_), LargeList(_)) => general_array_distance::<i64>(args),
+        (array_type1, array_type2) => {
+            exec_err!("array_distance does not support types '{array_type1:?}' and '{array_type2:?}'")
+        }
+    }
+}
+
+fn general_array_distance<O: OffsetSizeTrait>(arrays: &[ArrayRef]) -> Result<ArrayRef> {
+    let list_array1 = as_generic_list_array::<O>(&arrays[0])?;
+    let list_array2 = as_generic_list_array::<O>(&arrays[1])?;
+
+    let result = list_array1
+        .iter()
+        .zip(list_array2.iter())
+        .map(|(arr1, arr2)| compute_array_distance(arr1, arr2))
+        .collect::<Result<Float64Array>>()?;
+
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+/// Computes the Euclidean distance between two arrays
+fn compute_array_distance(
+    arr1: Option<ArrayRef>,
+    arr2: Option<ArrayRef>,
+) -> Result<Option<f64>> {
+    let value1 = match arr1 {
+        Some(arr) => arr,
+        None => return Ok(None),
+    };
+    let value2 = match arr2 {
+        Some(arr) => arr,
+        None => return Ok(None),
+    };
+
+    let mut value1 = value1;
+    let mut value2 = value2;
+
+    loop {
+        match value1.data_type() {
+            List(_) => {
+                if downcast_arg!(value1, ListArray).null_count() > 0 {
+                    return Ok(None);
+                }
+                value1 = downcast_arg!(value1, ListArray).value(0);
+            }
+            LargeList(_) => {
+                if downcast_arg!(value1, LargeListArray).null_count() > 0 {
+                    return Ok(None);
+                }
+                value1 = downcast_arg!(value1, LargeListArray).value(0);
+            }
+            _ => break,
+        }
+
+        match value2.data_type() {
+            List(_) => {
+                if downcast_arg!(value2, ListArray).null_count() > 0 {
+                    return Ok(None);
+                }
+                value2 = downcast_arg!(value2, ListArray).value(0);
+            }
+            LargeList(_) => {
+                if downcast_arg!(value2, LargeListArray).null_count() > 0 {
+                    return Ok(None);
+                }
+                value2 = downcast_arg!(value2, LargeListArray).value(0);
+            }
+            _ => break,
+        }
+    }
+
+    // Check for NULL values inside the arrays
+    if value1.null_count() != 0 || value2.null_count() != 0 {
+        return Ok(None);
+    }
+
+    let values1 = convert_to_f64_array(&value1)?;
+    let values2 = convert_to_f64_array(&value2)?;
+
+    if values1.len() != values2.len() {
+        return exec_err!("Both arrays must have the same length");
+    }
+
+    let sum_squares: f64 = values1
+        .iter()
+        .zip(values2.iter())
+        .map(|(v1, v2)| {
+            let diff = v1.unwrap_or(0.0) - v2.unwrap_or(0.0);
+            diff * diff
+        })
+        .sum();
+
+    Ok(Some(sum_squares.sqrt()))
+}
+
+/// Converts an array of any numeric type to a Float64Array.
+fn convert_to_f64_array(array: &ArrayRef) -> Result<Float64Array> {
+    match array.data_type() {
+        DataType::Float64 => Ok(as_float64_array(array)?.clone()),
+        DataType::Float32 => {
+            let array = as_float32_array(array)?;
+            let converted: Float64Array =
+                array.iter().map(|v| v.map(|v| v as f64)).collect();
+            Ok(converted)
+        }
+        DataType::Int64 => {
+            let array = as_int64_array(array)?;
+            let converted: Float64Array =
+                array.iter().map(|v| v.map(|v| v as f64)).collect();
+            Ok(converted)
+        }
+        DataType::Int32 => {
+            let array = as_int32_array(array)?;
+            let converted: Float64Array =
+                array.iter().map(|v| v.map(|v| v as f64)).collect();
+            Ok(converted)
+        }
+        _ => exec_err!("Unsupported array type for conversion to Float64Array"),
+    }
+}
diff --git a/datafusion/functions-nested/src/lib.rs b/datafusion/functions-nested/src/lib.rs
@@ -34,6 +34,7 @@ pub mod array_has;
 pub mod cardinality;
 pub mod concat;
 pub mod dimension;
+pub mod distance;
 pub mod empty;
 pub mod except;
 pub mod expr_ext;
@@ -73,6 +74,7 @@ pub mod expr_fn {
     pub use super::concat::array_prepend;
     pub use super::dimension::array_dims;
     pub use super::dimension::array_ndims;
+    pub use super::distance::array_distance;
     pub use super::empty::array_empty;
     pub use super::except::array_except;
     pub use super::extract::array_element;
@@ -128,6 +130,7 @@ pub fn all_default_nested_functions() -> Vec<Arc<ScalarUDF>> {
         array_has::array_has_any_udf(),
         empty::array_empty_udf(),
         length::array_length_udf(),
+        distance::array_distance_udf(),
         flatten::flatten_udf(),
         sort::array_sort_udf(),
         repeat::array_repeat_udf(),
diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt
@@ -4715,6 +4715,60 @@ NULL 10
 NULL 10
 NULL 10
 
+query RRR
+select array_distance([2], [3]), list_distance([1], [2]), list_distance([1], [-2]);
+----
+1 1 3
+
+query error
+select list_distance([1], [1, 2]);
+
+query R
+select array_distance([[1, 1]], [1, 2]);
+----
+1
+
+query R
+select array_distance([[1, 1]], [[1, 2]]);
+----
+1
+
+query R
+select array_distance([[1, 1]], [[1, 2]]);
+----
+1
+
+query RR
+select array_distance([1, 1, 0, 0], [2, 2, 1, 1]), list_distance([1, 2, 3], [1, 2, 3]);
+----
+2 0
+
+query RR
+select array_distance([1.0, 1, 0, 0], [2, 2.0, 1, 1]), list_distance([1, 2.0, 3], [1, 2, 3]);
+----
+2 0
+
+query R
+select list_distance([1, 1, NULL, 0], [2, 2, NULL, NULL]);
+----
+NULL
+
+query R
+select list_distance([NULL, NULL], [NULL, NULL]);
+----
+NULL
+
+query R
+select list_distance([1.0, 2.0, 3.0], [1.0, 2.0, 3.5]) AS distance;
+----
+0.5
+
+query R
+select list_distance([1, 2, 3], [1, 2, 3]) AS distance;
+----
+0
+
+
 ## array_dims (aliases: `list_dims`)
 
 # array dims error
diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md
@@ -2093,6 +2093,7 @@ to_unixtime(expression[, ..., format_n])
 - [array_concat](#array_concat)
 - [array_contains](#array_contains)
 - [array_dims](#array_dims)
+- [array_distance](#array_distance)
 - [array_distinct](#array_distinct)
 - [array_has](#array_has)
 - [array_has_all](#array_has_all)
@@ -2135,6 +2136,7 @@ to_unixtime(expression[, ..., format_n])
 - [list_cat](#list_cat)
 - [list_concat](#list_concat)
 - [list_dims](#list_dims)
+- [list_distance](#list_distance)
 - [list_distinct](#list_distinct)
 - [list_element](#list_element)
 - [list_except](#list_except)
@@ -2388,6 +2390,36 @@ array_dims(array)
 
 - list_dims
 
+### `array_distance`
+
+Returns the Euclidean distance between two input arrays of equal length.
+
+```
+array_distance(array1, array2)
+```
+
+#### Arguments
+
+- **array1**: Array expression.
+  Can be a constant, column, or function, and any combination of array operators.
+- **array2**: Array expression.
+  Can be a constant, column, or function, and any combination of array operators.
+
+#### Example
+
+```
+> select array_distance([1, 2], [1, 4]);
++------------------------------------+
+| array_distance(List([1,2], [1,4])) |
++------------------------------------+
+| 2.0                                |
++------------------------------------+
+```
+
+#### Aliases
+
+- list_distance
+
 ### `array_distinct`
 
 Returns distinct values from the array after removing duplicates.
@@ -3224,6 +3256,10 @@ _Alias of [array_concat](#array_concat)._
 
 _Alias of [array_dims](#array_dims)._
 
+### `list_distance`
+
+_Alias of [array_distance](#array_distance)._
+
 ### `list_distinct`
 
 _Alias of [array_dims](#array_distinct)._