feat: add hint for missing fields

Lordworms · Lordworms · commit 527fbd14162f · 2025-02-05T20:34:04.000-08:00
diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs
@@ -1069,7 +1069,7 @@ mod tests {
             Column names are case sensitive. \
             You can use double quotes to refer to the \"\"t1.c0\"\" column \
             or set the datafusion.sql_parser.enable_ident_normalization configuration. \
-            Valid fields are t1.c0, t1.c1.";
+            Did you mean 't1.c0'?.";
         assert_eq!(err.strip_backtrace(), expected);
         Ok(())
     }
diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs
@@ -26,6 +26,7 @@ use std::io;
 use std::result;
 use std::sync::Arc;
 
+use crate::utils::datafusion_strsim::normalized_levenshtein;
 use crate::utils::quote_identifier;
 use crate::{Column, DFSchema, Diagnostic, TableReference};
 #[cfg(feature = "avro")]
@@ -176,6 +177,11 @@ impl Display for SchemaError {
                     .iter()
                     .map(|column| column.flat_name().to_lowercase())
                     .collect::<Vec<String>>();
+
+                let valid_fields_names = valid_fields
+                    .iter()
+                    .map(|column| column.flat_name())
+                    .collect::<Vec<String>>();
                 if lower_valid_fields.contains(&field.flat_name().to_lowercase()) {
                     write!(
                         f,
@@ -184,7 +190,15 @@ impl Display for SchemaError {
                         field.quoted_flat_name()
                     )?;
                 }
-                if !valid_fields.is_empty() {
+                let field_name = field.name();
+                if let Some(matched) = valid_fields_names
+                    .iter()
+                    .filter(|str| normalized_levenshtein(str, field_name) > 0.5)
+                    .collect::<Vec<&String>>()
+                    .first()
+                {
+                    write!(f, ". Did you mean '{matched}'?")?;
+                } else if !valid_fields.is_empty() {
                     write!(
                         f,
                         ". Valid fields are {}",
diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs
@@ -736,6 +736,27 @@ pub mod datafusion_strsim {
     pub fn levenshtein(a: &str, b: &str) -> usize {
         generic_levenshtein(&StringWrapper(a), &StringWrapper(b))
     }
+
+    /// Calculates the normalized Levenshtein similarity between two strings.
+    /// The score is `1.0 - distance / max(char counts)`, a value between 0.0 and 1.0,
+    /// where 1.0 means the strings are identical and 0.0 means no similarity.
+    ///
+    /// ```
+    /// use datafusion_common::utils::datafusion_strsim::normalized_levenshtein;
+    ///
+    /// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
+    ///
+    /// assert!(normalized_levenshtein("", "second").abs() < 0.00001);
+    ///
+    /// assert!((normalized_levenshtein("kitten", "sitten") - 0.833).abs() < 0.001);
+    /// ```
+    pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
+        if a.is_empty() && b.is_empty() {
+            return 1.0; // both empty: identical, and the max length below would be 0
+        }
+        1.0 - (levenshtein(a, b) as f64)
+            / (a.chars().count().max(b.chars().count()) as f64)
+    }
 }
 
 /// Merges collections `first` and `second`, removes duplicates and sorts the
diff --git a/datafusion/sqllogictest/test_files/errors.slt b/datafusion/sqllogictest/test_files/errors.slt
@@ -161,3 +161,13 @@ create table records (timestamp timestamp, value float) as values (
     '2021-01-01 00:00:00', 1.0,
     '2021-01-01 00:00:00', 2.0
 );
+
+
+statement ok
+create table a(timestamp int, birthday int);
+
+query error DataFusion error: Schema error: No field named timetamp\. Did you mean 'a\.timestamp'\?\.
+select timetamp from a;
+
+query error DataFusion error: Schema error: No field named dadsada\. Valid fields are a\.timestamp, a\.birthday\.
+select dadsada from a;
diff --git a/datafusion/sqllogictest/test_files/identifiers.slt b/datafusion/sqllogictest/test_files/identifiers.slt
@@ -90,16 +90,16 @@ drop table  case_insensitive_test
 statement ok
 CREATE TABLE test("Column1" string) AS VALUES ('content1');
 
-statement error DataFusion error: Schema error: No field named column1. Valid fields are test\."Column1"\.
+statement error DataFusion error: Schema error: No field named column1\. Valid fields are test\."Column1"\.
 SELECT COLumn1 from test
 
-statement error DataFusion error: Schema error: No field named column1. Valid fields are test\."Column1"\.
+statement error DataFusion error: Schema error: No field named column1\. Valid fields are test\."Column1"\.
 SELECT Column1 from test
 
-statement error DataFusion error: Schema error: No field named column1. Valid fields are test\."Column1"\.
+statement error DataFusion error: Schema error: No field named column1\. Valid fields are test\."Column1"\.
 SELECT column1 from test
 
-statement error DataFusion error: Schema error: No field named column1. Valid fields are test\."Column1"\.
+statement error DataFusion error: Schema error: No field named column1\. Valid fields are test\."Column1"\.
 SELECT "column1" from test
 
 statement ok

Original file line number	Diff line number	Diff line change
`@@ -1069,7 +1069,7 @@ mod tests {`
`1069`	`1069`	`Column names are case sensitive. \`
`1070`	`1070`	`You can use double quotes to refer to the \"\"t1.c0\"\" column \`
`1071`	`1071`	`or set the datafusion.sql_parser.enable_ident_normalization configuration. \`
`1072`		`- Valid fields are t1.c0, t1.c1.";`
	`1072`	`+ Did you mean 't1.c0'?.";`
`1073`	`1073`	`assert_eq!(err.strip_backtrace(), expected);`
`1074`	`1074`	`Ok(())`
`1075`	`1075`	`}`