apache
diff --git a/benchmarks/expected-plans/q2.txt b/benchmarks/expected-plans/q2.txt
Lines changed: 4 additions & 4 deletions
diff --git a/datafusion/core/tests/dataframe.rs b/datafusion/core/tests/dataframe.rs
Lines changed: 3 additions & 3 deletions
diff --git a/datafusion/core/tests/sql/subqueries.rs b/datafusion/core/tests/sql/subqueries.rs
Lines changed: 226 additions & 2 deletions
@@ -3,7 +3,7 @@
 +---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | logical_plan  | Sort: supplier.s_acctbal DESC NULLS FIRST, nation.n_name ASC NULLS LAST, supplier.s_name ASC NULLS LAST, part.p_partkey ASC NULLS LAST                                                                                                                                                                    |
 |               |   Projection: supplier.s_acctbal, supplier.s_name, nation.n_name, part.p_partkey, part.p_mfgr, supplier.s_address, supplier.s_phone, supplier.s_comment                                                                                                                                                   |
-|               |     Inner Join: part.p_partkey = __scalar_sq_1.ps_partkey, partsupp.ps_supplycost = __scalar_sq_1.__value                                                                                                                                                                                                 |
+|               |     Inner Join: partsupp.ps_supplycost = __scalar_sq_1.__value, part.p_partkey = __scalar_sq_1.ps_partkey                                                                                                                                                                                                 |
 |               |       Projection: part.p_partkey, part.p_mfgr, partsupp.ps_supplycost, supplier.s_name, supplier.s_address, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, nation.n_name                                                                                                                       |
 |               |         Inner Join: nation.n_regionkey = region.r_regionkey                                                                                                                                                                                                                                               |
 |               |           Projection: part.p_partkey, part.p_mfgr, partsupp.ps_supplycost, supplier.s_name, supplier.s_address, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, nation.n_name, nation.n_regionkey                                                                                               |
@@ -40,9 +40,9 @@
 |               |   SortExec: expr=[s_acctbal@0 DESC,n_name@2 ASC NULLS LAST,s_name@1 ASC NULLS LAST,p_partkey@3 ASC NULLS LAST]                                                                                                                                                                                            |
 |               |     ProjectionExec: expr=[s_acctbal@6 as s_acctbal, s_name@3 as s_name, n_name@8 as n_name, p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr, s_address@4 as s_address, s_phone@5 as s_phone, s_comment@7 as s_comment]                                                                                       |
 |               |       CoalesceBatchesExec: target_batch_size=8192                                                                                                                                                                                                                                                         |
-|               |         HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "p_partkey", index: 0 }, Column { name: "ps_partkey", index: 0 }), (Column { name: "ps_supplycost", index: 2 }, Column { name: "__value", index: 1 })]                                                                       |
+|               |         HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_supplycost", index: 2 }, Column { name: "__value", index: 1 }), (Column { name: "p_partkey", index: 0 }, Column { name: "ps_partkey", index: 0 })]                                                                       |
 |               |           CoalesceBatchesExec: target_batch_size=8192                                                                                                                                                                                                                                                     |
-|               |             RepartitionExec: partitioning=Hash([Column { name: "p_partkey", index: 0 }, Column { name: "ps_supplycost", index: 2 }], 2), input_partitions=2                                                                                                                                               |
+|               |             RepartitionExec: partitioning=Hash([Column { name: "ps_supplycost", index: 2 }, Column { name: "p_partkey", index: 0 }], 2), input_partitions=2                                                                                                                                               |
 |               |               ProjectionExec: expr=[p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr, ps_supplycost@2 as ps_supplycost, s_name@3 as s_name, s_address@4 as s_address, s_phone@5 as s_phone, s_acctbal@6 as s_acctbal, s_comment@7 as s_comment, n_name@8 as n_name]                                           |
 |               |                 CoalesceBatchesExec: target_batch_size=8192                                                                                                                                                                                                                                               |
 |               |                   HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "n_regionkey", index: 9 }, Column { name: "r_regionkey", index: 0 })]                                                                                                                                              |
@@ -85,7 +85,7 @@
 |               |                               FilterExec: r_name@1 = EUROPE                                                                                                                                                                                                                                               |
 |               |                                 MemoryExec: partitions=0, partition_sizes=[]                                                                                                                                                                                                                              |
 |               |           CoalesceBatchesExec: target_batch_size=8192                                                                                                                                                                                                                                                     |
-|               |             RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }, Column { name: "__value", index: 1 }], 2), input_partitions=2                                                                                                                                                    |
+|               |             RepartitionExec: partitioning=Hash([Column { name: "__value", index: 1 }, Column { name: "ps_partkey", index: 0 }], 2), input_partitions=2                                                                                                                                                    |
 |               |               ProjectionExec: expr=[ps_partkey@0 as ps_partkey, MIN(partsupp.ps_supplycost)@1 as __value]                                                                                                                                                                                                 |
 |               |                 AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[MIN(partsupp.ps_supplycost)]                                                                                                                                                                                |
 |               |                   CoalesceBatchesExec: target_batch_size=8192                                                                                                                                                                                                                                             |
 
@@ -39,8 +39,8 @@ use datafusion_expr::expr::{GroupingSet, Sort};
 use datafusion_expr::utils::COUNT_STAR_EXPANSION;
 use datafusion_expr::Expr::Wildcard;
 use datafusion_expr::{
-    avg, col, count, exists, expr, in_subquery, lit, max, scalar_subquery, sum,
-    AggregateFunction, Expr, ExprSchemable, WindowFrame, WindowFrameBound,
+    avg, col, count, exists, expr, in_subquery, lit, max, out_ref_col, scalar_subquery,
+    sum, AggregateFunction, Expr, ExprSchemable, WindowFrame, WindowFrameBound,
     WindowFrameUnits, WindowFunction,
 };
 
@@ -241,7 +241,7 @@ async fn test_count_wildcard_on_where_scalar_subquery() -> Result<()> {
             scalar_subquery(Arc::new(
                 ctx.table("t2")
                     .await?
-                    .filter(col("t1.a").eq(col("t2.a")))?
+                    .filter(out_ref_col(DataType::UInt32, "t1.a").eq(col("t2.a")))?
                     .aggregate(vec![], vec![count(lit(COUNT_STAR_EXPANSION))])?
                     .select(vec![count(lit(COUNT_STAR_EXPANSION))])?
                     .into_unoptimized_plan(),
 
@@ -185,7 +185,7 @@ async fn invalid_scalar_subquery() -> Result<()> {
     let dataframe = ctx.sql(sql).await.expect(&msg);
     let err = dataframe.into_optimized_plan().err().unwrap();
     assert_eq!(
-        r#"Context("check_analyzed_plan", Plan("Scalar subquery should only return one column"))"#,
+        r#"Context("check_analyzed_plan", Plan("Scalar subquery should only return one column, but found 2: t2.t2_id, t2.t2_name"))"#,
         &format!("{err:?}")
     );
 
@@ -203,7 +203,231 @@ async fn subquery_not_allowed() -> Result<()> {
     let err = dataframe.into_optimized_plan().err().unwrap();
 
     assert_eq!(
-        r#"Context("check_analyzed_plan", Plan("In/Exist subquery can not be used in Sort plan nodes"))"#,
+        r#"Context("check_analyzed_plan", Plan("In/Exist subquery can only be used in Projection, Filter, Window functions, Aggregate and Join plan nodes"))"#,
+        &format!("{err:?}")
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn non_aggregated_correlated_scalar_subquery() -> Result<()> {
+    let ctx = create_join_context("t1_id", "t2_id", true)?;
+
+    let sql = "SELECT t1_id, (SELECT t2_int FROM t2 WHERE t2.t2_int = t1.t1_int) as t2_int from t1";
+    let msg = format!("Creating logical plan for '{sql}'");
+    let dataframe = ctx.sql(sql).await.expect(&msg);
+    let err = dataframe.into_optimized_plan().err().unwrap();
+
+    assert_eq!(
+        r#"Context("check_analyzed_plan", Plan("Correlated scalar subquery must be aggregated to return at most one row"))"#,
+        &format!("{err:?}")
+    );
+
+    let sql = "SELECT t1_id, (SELECT t2_int FROM t2 WHERE t2.t2_int = t1_int group by t2_int) as t2_int from t1";
+    let msg = format!("Creating logical plan for '{sql}'");
+    let dataframe = ctx.sql(sql).await.expect(&msg);
+    let err = dataframe.into_optimized_plan().err().unwrap();
+
+    assert_eq!(
+        r#"Context("check_analyzed_plan", Plan("Correlated scalar subquery must be aggregated to return at most one row"))"#,
+        &format!("{err:?}")
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn non_aggregated_correlated_scalar_subquery_with_limit() -> Result<()> {
+    let ctx = create_join_context("t1_id", "t2_id", true)?;
+
+    let sql = "SELECT t1_id, (SELECT t2_int FROM t2 WHERE t2.t2_int = t1.t1_int limit 2) as t2_int from t1";
+    let msg = format!("Creating logical plan for '{sql}'");
+    let dataframe = ctx.sql(sql).await.expect(&msg);
+    let err = dataframe.into_optimized_plan().err().unwrap();
+
+    assert_eq!(
+        r#"Context("check_analyzed_plan", Plan("Correlated scalar subquery must be aggregated to return at most one row"))"#,
+        &format!("{err:?}")
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn non_aggregated_correlated_scalar_subquery_with_single_row() -> Result<()> {
+    let ctx = create_join_context("t1_id", "t2_id", true)?;
+
+    let sql = "SELECT t1_id, (SELECT t2_int FROM t2 WHERE t2.t2_int = t1.t1_int limit 1) as t2_int from t1";
+    let msg = format!("Creating logical plan for '{sql}'");
+    let dataframe = ctx.sql(sql).await.expect(&msg);
+    let plan = dataframe.into_optimized_plan()?;
+
+    let expected = vec![
+        "Projection: t1.t1_id, (<subquery>) AS t2_int [t1_id:UInt32;N, t2_int:UInt32;N]",
+        "  Subquery: [t2_int:UInt32;N]",
+        "    Limit: skip=0, fetch=1 [t2_int:UInt32;N]",
+        "      Projection: t2.t2_int [t2_int:UInt32;N]",
+        "        Filter: t2.t2_int = outer_ref(t1.t1_int) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
+        "          TableScan: t2 [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
+        "  TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]",
+    ];
+    let formatted = plan.display_indent_schema().to_string();
+    let actual: Vec<&str> = formatted.trim().lines().collect();
+    assert_eq!(
+        expected, actual,
+        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    );
+
+    let sql = "SELECT t1_id from t1 where t1_int = (SELECT t2_int FROM t2 WHERE t2.t2_int = t1.t1_int limit 1)";
+    let msg = format!("Creating logical plan for '{sql}'");
+    let dataframe = ctx.sql(sql).await.expect(&msg);
+    let plan = dataframe.into_optimized_plan()?;
+
+    let expected = vec![
+        "Projection: t1.t1_id [t1_id:UInt32;N]",
+        "  Filter: t1.t1_int = (<subquery>) [t1_id:UInt32;N, t1_int:UInt32;N]",
+        "    Subquery: [t2_int:UInt32;N]",
+        "      Limit: skip=0, fetch=1 [t2_int:UInt32;N]",
+        "        Projection: t2.t2_int [t2_int:UInt32;N]",
+        "          Filter: t2.t2_int = outer_ref(t1.t1_int) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
+        "            TableScan: t2 [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
+        "    TableScan: t1 projection=[t1_id, t1_int] [t1_id:UInt32;N, t1_int:UInt32;N]",
+    ];
+    let formatted = plan.display_indent_schema().to_string();
+    let actual: Vec<&str> = formatted.trim().lines().collect();
+    assert_eq!(
+        expected, actual,
+        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    );
+
+    let sql = "SELECT t1_id, (SELECT a FROM (select 1 as a) WHERE a = t1.t1_int) as t2_int from t1";
+    let msg = format!("Creating logical plan for '{sql}'");
+    let dataframe = ctx.sql(sql).await.expect(&msg);
+    let plan = dataframe.into_optimized_plan()?;
+
+    let expected = vec![
+        "Projection: t1.t1_id, (<subquery>) AS t2_int [t1_id:UInt32;N, t2_int:Int64]",
+        "  Subquery: [a:Int64]",
+        "    Projection: a [a:Int64]",
+        "      Filter: a = CAST(outer_ref(t1.t1_int) AS Int64) [a:Int64]",
+        "        Projection: Int64(1) AS a [a:Int64]",
+        "          EmptyRelation []",
+        "  TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]",
+    ];
+    let formatted = plan.display_indent_schema().to_string();
+    let actual: Vec<&str> = formatted.trim().lines().collect();
+    assert_eq!(
+        expected, actual,
+        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn non_equal_correlated_scalar_subquery() -> Result<()> {
+    let ctx = create_join_context("t1_id", "t2_id", true)?;
+
+    let sql = "SELECT t1_id, (SELECT sum(t2_int) FROM t2 WHERE t2.t2_id < t1.t1_id) as t2_sum from t1";
+    let msg = format!("Creating logical plan for '{sql}'");
+    let dataframe = ctx.sql(sql).await.expect(&msg);
+    let err = dataframe.into_optimized_plan().err().unwrap();
+
+    assert_eq!(
+        r#"Context("check_analyzed_plan", Plan("Correlated column is not allowed in predicate: t2.t2_id < outer_ref(t1.t1_id)"))"#,
+        &format!("{err:?}")
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn aggregated_correlated_scalar_subquery() -> Result<()> {
+    let ctx = create_join_context("t1_id", "t2_id", true)?;
+
+    let sql = "SELECT t1_id, (SELECT sum(t2_int) FROM t2 WHERE t2.t2_id = t1.t1_id) as t2_sum from t1";
+    let msg = format!("Creating logical plan for '{sql}'");
+    let dataframe = ctx.sql(sql).await.expect(&msg);
+    let plan = dataframe.into_optimized_plan()?;
+
+    let expected = vec![
+        "Projection: t1.t1_id, (<subquery>) AS t2_sum [t1_id:UInt32;N, t2_sum:UInt64;N]",
+        "  Subquery: [SUM(t2.t2_int):UInt64;N]",
+        "    Projection: SUM(t2.t2_int) [SUM(t2.t2_int):UInt64;N]",
+        "      Aggregate: groupBy=[[]], aggr=[[SUM(t2.t2_int)]] [SUM(t2.t2_int):UInt64;N]",
+        "        Filter: t2.t2_id = outer_ref(t1.t1_id) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
+        "          TableScan: t2 [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
+        "  TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]",
+    ];
+    let formatted = plan.display_indent_schema().to_string();
+    let actual: Vec<&str> = formatted.trim().lines().collect();
+    assert_eq!(
+        expected, actual,
+        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn aggregated_correlated_scalar_subquery_with_extra_group_by_columns() -> Result<()>
+{
+    let ctx = create_join_context("t1_id", "t2_id", true)?;
+
+    let sql = "SELECT t1_id, (SELECT sum(t2_int) FROM t2 WHERE t2.t2_id = t1.t1_id group by t2_name) as t2_sum from t1";
+    let msg = format!("Creating logical plan for '{sql}'");
+    let dataframe = ctx.sql(sql).await.expect(&msg);
+    let err = dataframe.into_optimized_plan().err().unwrap();
+
+    assert_eq!(
+        r#"Context("check_analyzed_plan", Plan("A GROUP BY clause in a scalar correlated subquery cannot contain non-correlated columns"))"#,
+        &format!("{err:?}")
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn aggregated_correlated_scalar_subquery_with_extra_group_by_constant() -> Result<()>
+{
+    let ctx = create_join_context("t1_id", "t2_id", true)?;
+
+    let sql = "SELECT t1_id, (SELECT sum(t2_int) FROM t2 WHERE t2.t2_id = t1.t1_id group by t2_id, 'a') as t2_sum from t1";
+    let msg = format!("Creating logical plan for '{sql}'");
+    let dataframe = ctx.sql(sql).await.expect(&msg);
+    let plan = dataframe.into_optimized_plan()?;
+
+    let expected = vec![
+        "Projection: t1.t1_id, (<subquery>) AS t2_sum [t1_id:UInt32;N, t2_sum:UInt64;N]",
+        "  Subquery: [SUM(t2.t2_int):UInt64;N]",
+        "    Projection: SUM(t2.t2_int) [SUM(t2.t2_int):UInt64;N]",
+        "      Aggregate: groupBy=[[t2.t2_id, Utf8(\"a\")]], aggr=[[SUM(t2.t2_int)]] [t2_id:UInt32;N, Utf8(\"a\"):Utf8, SUM(t2.t2_int):UInt64;N]",
+        "        Filter: t2.t2_id = outer_ref(t1.t1_id) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
+        "          TableScan: t2 [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
+        "  TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]",
+    ];
+    let formatted = plan.display_indent_schema().to_string();
+    let actual: Vec<&str> = formatted.trim().lines().collect();
+    assert_eq!(
+        expected, actual,
+        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn group_by_correlated_scalar_subquery() -> Result<()> {
+    let ctx = create_join_context("t1_id", "t2_id", true)?;
+    let sql = "SELECT sum(t1_int) from t1 GROUP BY (SELECT sum(t2_int) FROM t2 WHERE t2.t2_id = t1.t1_id)";
+
+    let msg = format!("Creating logical plan for '{sql}'");
+    let dataframe = ctx.sql(sql).await.expect(&msg);
+    let err = dataframe.into_optimized_plan().err().unwrap();
+
+    assert_eq!(
+        r#"Context("check_analyzed_plan", Plan("Correlated scalar subquery in the GROUP BY clause must also be in the aggregate expressions"))"#,
         &format!("{err:?}")
     );