Skip to content

Commit badcdb6

Browse files
committed
enhance doc and fix test
1 parent 8cb0b57 commit badcdb6

File tree

4 files changed

+10
-11
lines changed

4 files changed

+10
-11
lines changed

datafusion/expr/src/async_udf.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ use std::sync::Arc;
3333
/// Note this is less efficient than the ScalarUDFImpl, but it can be used
3434
/// to register remote functions in the context.
3535
///
36-
/// The name is chosen to mirror ScalarUDFImpl
36+
/// The name is chosen to mirror ScalarUDFImpl
3737
#[async_trait]
3838
pub trait AsyncScalarUDFImpl: Debug + Send + Sync {
3939
/// the function cast as any
@@ -49,6 +49,9 @@ pub trait AsyncScalarUDFImpl: Debug + Send + Sync {
4949
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType>;
5050

5151
/// The ideal batch size for this function.
52+
///
53+
/// This is used to determine how much data should be evaluated at once.
54+
/// If None, the whole batch will be evaluated at once.
5255
fn ideal_batch_size(&self) -> Option<usize> {
5356
None
5457
}
@@ -65,10 +68,6 @@ pub trait AsyncScalarUDFImpl: Debug + Send + Sync {
6568
///
6669
/// Note this is not meant to be used directly, but is meant to be an implementation detail
6770
/// for AsyncUDFImpl.
68-
///
69-
/// This is used to register remote functions in the context. The function
70-
/// should not be invoked by DataFusion. It's only used to generate the logical
71-
/// plan and unparsed them to SQL.
7271
#[derive(Debug)]
7372
pub struct AsyncScalarUDF {
7473
inner: Arc<dyn AsyncScalarUDFImpl>,

datafusion/physical-expr/src/async_scalar_function.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ use std::fmt::Display;
2929
use std::hash::{Hash, Hasher};
3030
use std::sync::Arc;
3131

32-
/// Wrapper for a Async function that can be used in a DataFusion query
32+
/// Wrapper around a scalar function that can be evaluated asynchronously
3333
#[derive(Debug, Clone, Eq)]
3434
pub struct AsyncFuncExpr {
3535
/// The name of the output column this function will generate
@@ -206,7 +206,7 @@ impl PhysicalExpr for AsyncFuncExpr {
206206
}
207207

208208
fn evaluate(&self, _batch: &RecordBatch) -> Result<ColumnarValue> {
209-
// TODO: implement this
209+
// TODO: implement this for scalar value input
210210
not_impl_err!("AsyncFuncExpr.evaluate")
211211
}
212212

datafusion/physical-plan/src/async_func.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,16 @@ use log::trace;
3535
use std::any::Any;
3636
use std::sync::Arc;
3737

38-
/// This structure evaluates a set of async expressions on a record
38+
/// This structure evaluates a set of async expressions on a record
3939
/// batch producing a new record batch
4040
///
41-
/// This is similar to a ProjectionExec except that the functions can be async
42-
///
4341
/// The schema of the output of the AsyncFuncExec is:
4442
/// Input columns followed by one column for each async expression
4543
#[derive(Debug)]
4644
pub struct AsyncFuncExec {
4745
/// The async expressions to evaluate
4846
async_exprs: Vec<Arc<AsyncFuncExpr>>,
4947
input: Arc<dyn ExecutionPlan>,
50-
/// Cache holding plan properties like equivalences, output partitioning etc.
5148
cache: PlanProperties,
5249
metrics: ExecutionPlanMetricsSet,
5350
}

datafusion/sqllogictest/test_files/explain.slt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@ physical_plan after EnforceSorting SAME TEXT AS ABOVE
241241
physical_plan after OptimizeAggregateOrder SAME TEXT AS ABOVE
242242
physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
243243
physical_plan after coalesce_batches SAME TEXT AS ABOVE
244+
physical_plan after coalesce_async_exec_input SAME TEXT AS ABOVE
244245
physical_plan after OutputRequirements DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true
245246
physical_plan after LimitAggregation SAME TEXT AS ABOVE
246247
physical_plan after LimitPushdown SAME TEXT AS ABOVE
@@ -315,6 +316,7 @@ physical_plan after EnforceSorting SAME TEXT AS ABOVE
315316
physical_plan after OptimizeAggregateOrder SAME TEXT AS ABOVE
316317
physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
317318
physical_plan after coalesce_batches SAME TEXT AS ABOVE
319+
physical_plan after coalesce_async_exec_input SAME TEXT AS ABOVE
318320
physical_plan after OutputRequirements
319321
01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
320322
02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
@@ -355,6 +357,7 @@ physical_plan after EnforceSorting SAME TEXT AS ABOVE
355357
physical_plan after OptimizeAggregateOrder SAME TEXT AS ABOVE
356358
physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
357359
physical_plan after coalesce_batches SAME TEXT AS ABOVE
360+
physical_plan after coalesce_async_exec_input SAME TEXT AS ABOVE
358361
physical_plan after OutputRequirements
359362
01)GlobalLimitExec: skip=0, fetch=10
360363
02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet

0 commit comments

Comments
 (0)