-
Notifications
You must be signed in to change notification settings - Fork 1.5k
feat: add optimizer config param to avoid grouping partitions prefer_existing_union
#10259
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1192,7 +1192,11 @@ fn ensure_distribution( | |
.collect::<Result<Vec<_>>>()?; | ||
|
||
let children_plans = children.iter().map(|c| c.plan.clone()).collect::<Vec<_>>(); | ||
plan = if plan.as_any().is::<UnionExec>() && can_interleave(children_plans.iter()) { | ||
|
||
plan = if plan.as_any().is::<UnionExec>() | ||
&& !config.optimizer.prefer_existing_union | ||
&& can_interleave(children_plans.iter()) | ||
{ | ||
// Add a special case for [`UnionExec`] since we want to "bubble up" | ||
// hash-partitioned data. So instead of | ||
// | ||
|
@@ -1721,23 +1725,33 @@ pub(crate) mod tests { | |
/// * `TARGET_PARTITIONS` (optional) - number of partitions to repartition to | ||
/// * `REPARTITION_FILE_SCANS` (optional) - if true, will repartition file scans | ||
/// * `REPARTITION_FILE_MIN_SIZE` (optional) - minimum file size to repartition | ||
NGA-TRAN marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// * `PREFER_EXISTING_UNION` (optional) - if true, will not attempt to convert Union to Interleave | ||
macro_rules! assert_optimized { | ||
($EXPECTED_LINES: expr, $PLAN: expr, $FIRST_ENFORCE_DIST: expr) => { | ||
assert_optimized!($EXPECTED_LINES, $PLAN, $FIRST_ENFORCE_DIST, false, 10, false, 1024); | ||
assert_optimized!($EXPECTED_LINES, $PLAN, $FIRST_ENFORCE_DIST, false, 10, false, 1024, false); | ||
}; | ||
|
||
($EXPECTED_LINES: expr, $PLAN: expr, $FIRST_ENFORCE_DIST: expr, $PREFER_EXISTING_SORT: expr) => { | ||
assert_optimized!($EXPECTED_LINES, $PLAN, $FIRST_ENFORCE_DIST, $PREFER_EXISTING_SORT, 10, false, 1024); | ||
assert_optimized!($EXPECTED_LINES, $PLAN, $FIRST_ENFORCE_DIST, $PREFER_EXISTING_SORT, 10, false, 1024, false); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these macros are getting a bit hairy -- maybe we can clean them up (convert to using functions rather than macros) in a subsequent PR |
||
}; | ||
|
||
($EXPECTED_LINES: expr, $PLAN: expr, $FIRST_ENFORCE_DIST: expr, $PREFER_EXISTING_SORT: expr, $PREFER_EXISTING_UNION: expr) => { | ||
assert_optimized!($EXPECTED_LINES, $PLAN, $FIRST_ENFORCE_DIST, $PREFER_EXISTING_SORT, 10, false, 1024, $PREFER_EXISTING_UNION); | ||
}; | ||
|
||
($EXPECTED_LINES: expr, $PLAN: expr, $FIRST_ENFORCE_DIST: expr, $PREFER_EXISTING_SORT: expr, $TARGET_PARTITIONS: expr, $REPARTITION_FILE_SCANS: expr, $REPARTITION_FILE_MIN_SIZE: expr) => { | ||
assert_optimized!($EXPECTED_LINES, $PLAN, $FIRST_ENFORCE_DIST, $PREFER_EXISTING_SORT, $TARGET_PARTITIONS, $REPARTITION_FILE_SCANS, $REPARTITION_FILE_MIN_SIZE, false); | ||
}; | ||
|
||
($EXPECTED_LINES: expr, $PLAN: expr, $FIRST_ENFORCE_DIST: expr, $PREFER_EXISTING_SORT: expr, $TARGET_PARTITIONS: expr, $REPARTITION_FILE_SCANS: expr, $REPARTITION_FILE_MIN_SIZE: expr, $PREFER_EXISTING_UNION: expr) => { | ||
let expected_lines: Vec<&str> = $EXPECTED_LINES.iter().map(|s| *s).collect(); | ||
|
||
let mut config = ConfigOptions::new(); | ||
config.execution.target_partitions = $TARGET_PARTITIONS; | ||
config.optimizer.repartition_file_scans = $REPARTITION_FILE_SCANS; | ||
config.optimizer.repartition_file_min_size = $REPARTITION_FILE_MIN_SIZE; | ||
config.optimizer.prefer_existing_sort = $PREFER_EXISTING_SORT; | ||
config.optimizer.prefer_existing_union = $PREFER_EXISTING_UNION; | ||
|
||
// NOTE: These tests verify the joint `EnforceDistribution` + `EnforceSorting` cascade | ||
// because they were written prior to the separation of `BasicEnforcement` into | ||
|
@@ -3097,7 +3111,67 @@ pub(crate) mod tests { | |
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", | ||
]; | ||
assert_optimized!(expected, plan.clone(), true); | ||
assert_optimized!(expected, plan, false); | ||
assert_optimized!(expected, plan.clone(), false); | ||
|
||
Ok(()) | ||
} | ||
|
||
#[test] | ||
fn union_not_to_interleave() -> Result<()> { | ||
// group by (a as a1) | ||
let left = aggregate_exec_with_alias( | ||
parquet_exec(), | ||
vec![("a".to_string(), "a1".to_string())], | ||
); | ||
// group by (a as a2) | ||
let right = aggregate_exec_with_alias( | ||
parquet_exec(), | ||
vec![("a".to_string(), "a1".to_string())], | ||
); | ||
|
||
// Union | ||
let plan = Arc::new(UnionExec::new(vec![left, right])); | ||
|
||
// final agg | ||
let plan = | ||
aggregate_exec_with_alias(plan, vec![("a1".to_string(), "a2".to_string())]); | ||
|
||
// Only two RepartitionExecs added, no final RepartitionExec required | ||
let expected = &[ | ||
"AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]", | ||
"RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=20", | ||
"AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[]", | ||
"UnionExec", | ||
"AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", | ||
"RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", | ||
"AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", | ||
"RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", | ||
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", | ||
"AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", | ||
"RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", | ||
"AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", | ||
"RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", | ||
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", | ||
]; | ||
// no sort in the plan but since we need it as a parameter, make it default false | ||
let prefer_existing_sort = false; | ||
let first_enforce_distribution = true; | ||
let prefer_existing_union = true; | ||
|
||
assert_optimized!( | ||
expected, | ||
plan.clone(), | ||
first_enforce_distribution, | ||
prefer_existing_sort, | ||
prefer_existing_union | ||
); | ||
assert_optimized!( | ||
expected, | ||
plan, | ||
!first_enforce_distribution, | ||
prefer_existing_sort, | ||
prefer_existing_union | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Observing the test |
||
); | ||
|
||
Ok(()) | ||
} | ||
|
@@ -3651,7 +3725,8 @@ pub(crate) mod tests { | |
true, | ||
target_partitions, | ||
true, | ||
repartition_size | ||
repartition_size, | ||
false | ||
); | ||
|
||
let expected = [ | ||
|
@@ -3668,7 +3743,8 @@ pub(crate) mod tests { | |
true, | ||
target_partitions, | ||
true, | ||
repartition_size | ||
repartition_size, | ||
false | ||
); | ||
|
||
Ok(()) | ||
|
@@ -3731,7 +3807,7 @@ pub(crate) mod tests { | |
)), | ||
vec![("a".to_string(), "a".to_string())], | ||
); | ||
assert_optimized!(expected, plan, true, false, 2, true, 10); | ||
assert_optimized!(expected, plan, true, false, 2, true, 10, false); | ||
} | ||
Ok(()) | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given that one of the motivations (influxdata#4) for this new flag is to preserve the sorting of the Union - would re-using the existing flag
prefer_existing_sort
make sense here?One argument against that would be if you wanted
prefer_existing_sort: false
andprefer_existing_union: true
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an excellent point @phillipleblanc . I just double checked and we actually have
prefer_existing_sort
set to true in IOx already (code ref, not public)What do you think about using the existing flag @NGA-TRAN ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@phillipleblanc and @alamb
@mustafasrepo 's comment #10259 (comment) suggest to use the current approach. Do I need to do anything more here?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nope, I think this approach looks good! Thanks!