-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Fix duplicate unqualified Field name (schema error) on join queries #15438
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ddbf54e
8d6ac43
36645ef
a9d9949
c3c6abb
63cd54d
2f925ce
926fcb7
97ef820
301e232
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1468,19 +1468,37 @@ impl ValuesFields { | |
} | ||
} | ||
|
||
// `name_map` tracks a mapping between a field name and the number of appearances of that field. | ||
// | ||
// Some field names might already come to this function with the count (number of times it appeared) | ||
// as a suffix, e.g. id:1, so there's still a chance of name collisions. For example, | ||
// if these three fields are passed to this function: "col:1", "col" and "col", the function | ||
// would rename them to col:1, col, col:1, causing a later error when building the DFSchema. | ||
// That's why we need the `seen` set, so the field names are always unique. | ||
// | ||
pub fn change_redundant_column(fields: &Fields) -> Vec<Field> { | ||
let mut name_map = HashMap::new(); | ||
let mut seen: HashSet<String> = HashSet::new(); | ||
LiaCastaneda marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
fields | ||
.into_iter() | ||
.map(|field| { | ||
let counter = name_map.entry(field.name().to_string()).or_insert(0); | ||
*counter += 1; | ||
if *counter > 1 { | ||
let new_name = format!("{}:{}", field.name(), *counter - 1); | ||
Field::new(new_name, field.data_type().clone(), field.is_nullable()) | ||
} else { | ||
field.as_ref().clone() | ||
Comment on lines
1479
to
-1482
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems to be the root cause of the issue: when doing joins, there is a function requalify_sides_if_needed to handle aliasing the columns so the resulting schema of a join :
The first JOIN to be converted to a logical plan:
Moreover we can observe that if we do just two levels of joins we would get no error:
|
||
let base_name = field.name(); | ||
let count = name_map.entry(base_name.clone()).or_insert(0); | ||
let mut new_name = base_name.clone(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I played around with trying to avoid this clone. Since this function is only called when creating subqueries, I think it is fine: https://github.com/search?q=repo%3Aapache%2Fdatafusion%20change_redundant_column&type=code |
||
|
||
// Loop until we find a name that hasn't been used | ||
while seen.contains(&new_name) { | ||
*count += 1; | ||
new_name = format!("{}:{}", base_name, count); | ||
} | ||
|
||
seen.insert(new_name.clone()); | ||
|
||
let mut modified_field = | ||
Field::new(&new_name, field.data_type().clone(), field.is_nullable()); | ||
modified_field.set_metadata(field.metadata().clone()); | ||
modified_field | ||
}) | ||
.collect() | ||
} | ||
|
@@ -2730,10 +2748,13 @@ mod tests { | |
let t1_field_1 = Field::new("a", DataType::Int32, false); | ||
let t2_field_1 = Field::new("a", DataType::Int32, false); | ||
let t2_field_3 = Field::new("a", DataType::Int32, false); | ||
let t2_field_4 = Field::new("a:1", DataType::Int32, false); | ||
let t1_field_2 = Field::new("b", DataType::Int32, false); | ||
let t2_field_2 = Field::new("b", DataType::Int32, false); | ||
|
||
let field_vec = vec![t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3]; | ||
let field_vec = vec![ | ||
t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3, t2_field_4, | ||
]; | ||
let remove_redundant = change_redundant_column(&Fields::from(field_vec)); | ||
|
||
assert_eq!( | ||
|
@@ -2744,6 +2765,7 @@ mod tests { | |
Field::new("b", DataType::Int32, false), | ||
Field::new("b:1", DataType::Int32, false), | ||
Field::new("a:2", DataType::Int32, false), | ||
Field::new("a:1:1", DataType::Int32, false), | ||
] | ||
); | ||
Ok(()) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,7 +22,7 @@ use crate::PhysicalExpr; | |
|
||
use arrow::datatypes::SchemaRef; | ||
use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; | ||
use datafusion_common::{internal_err, Result}; | ||
use datafusion_common::Result; | ||
|
||
/// Stores the mapping between source expressions and target expressions for a | ||
/// projection. | ||
|
@@ -66,9 +66,9 @@ impl ProjectionMapping { | |
let idx = col.index(); | ||
let matching_input_field = input_schema.field(idx); | ||
if col.name() != matching_input_field.name() { | ||
return internal_err!("Input field name {} does not match with the projection expression {}", | ||
matching_input_field.name(),col.name()) | ||
} | ||
let fixed_col = Column::new(col.name(), idx); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI @berkaysynnada and @akurmustafa There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sorry for being late :( https://github.com/apache/datafusion/pull/15438/files#discussion_r2025001167 |
||
return Ok(Transformed::yes(Arc::new(fixed_col))); | ||
} | ||
let matching_input_column = | ||
Column::new(matching_input_field.name(), idx); | ||
Ok(Transformed::yes(Arc::new(matching_input_column))) | ||
|
Uh oh!
There was an error while loading. Please reload this page.