Skip to content

Commit 49f95af

Browse files
jatin510alamb
andauthored
Implemented simplify for the starts_with function to convert it into a LIKE expression. (#14119)
* Implemented `simplify` for the `starts_with` function to convert it into a LIKE expression, enabling predicate pruning optimization. * fix: escape special characters in starts_with to LIKE conversion * updated simply function to handle utf8, largeutf8 and utf8view data type. and updated the coresponding test * Add some more tests * Add pruning test --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent bd0b56f commit 49f95af

File tree

3 files changed

+120
-4
lines changed

3 files changed

+120
-4
lines changed

datafusion/functions/src/string/starts_with.rs

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,11 @@ use std::sync::Arc;
2020

2121
use arrow::array::ArrayRef;
2222
use arrow::datatypes::DataType;
23+
use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
2324

2425
use crate::utils::make_scalar_function;
25-
use datafusion_common::{internal_err, Result};
26-
use datafusion_expr::{ColumnarValue, Documentation};
26+
use datafusion_common::{internal_err, Result, ScalarValue};
27+
use datafusion_expr::{ColumnarValue, Documentation, Expr, Like};
2728
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
2829
use datafusion_macros::user_doc;
2930

@@ -98,6 +99,48 @@ impl ScalarUDFImpl for StartsWithFunc {
9899
}
99100
}
100101

102+
fn simplify(
103+
&self,
104+
args: Vec<Expr>,
105+
_info: &dyn SimplifyInfo,
106+
) -> Result<ExprSimplifyResult> {
107+
if let Expr::Literal(scalar_value) = &args[1] {
108+
// Convert starts_with(col, 'prefix') to col LIKE 'prefix%' with proper escaping
109+
// Example: starts_with(col, 'ja%') -> col LIKE 'ja\%%'
110+
// 1. 'ja%' (input pattern)
111+
// 2. 'ja\%' (escape special char '%')
112+
// 3. 'ja\%%' (add suffix for starts_with)
113+
let like_expr = match scalar_value {
114+
ScalarValue::Utf8(Some(pattern)) => {
115+
let escaped_pattern = pattern.replace("%", "\\%");
116+
let like_pattern = format!("{}%", escaped_pattern);
117+
Expr::Literal(ScalarValue::Utf8(Some(like_pattern)))
118+
}
119+
ScalarValue::LargeUtf8(Some(pattern)) => {
120+
let escaped_pattern = pattern.replace("%", "\\%");
121+
let like_pattern = format!("{}%", escaped_pattern);
122+
Expr::Literal(ScalarValue::LargeUtf8(Some(like_pattern)))
123+
}
124+
ScalarValue::Utf8View(Some(pattern)) => {
125+
let escaped_pattern = pattern.replace("%", "\\%");
126+
let like_pattern = format!("{}%", escaped_pattern);
127+
Expr::Literal(ScalarValue::Utf8View(Some(like_pattern)))
128+
}
129+
_ => return Ok(ExprSimplifyResult::Original(args)),
130+
};
131+
132+
return Ok(ExprSimplifyResult::Simplified(Expr::Like(Like {
133+
negated: false,
134+
expr: Box::new(args[0].clone()),
135+
pattern: Box::new(like_expr),
136+
escape_char: None,
137+
case_insensitive: false,
138+
})));
139+
}
140+
141+
Ok(ExprSimplifyResult::Original(args))
142+
}
143+
101144
fn documentation(&self) -> Option<&Documentation> {
102145
self.doc()
103146
}

datafusion/sqllogictest/test_files/parquet.slt

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,3 +598,34 @@ drop table cpu;
598598

599599
statement ok
600600
drop table cpu_parquet;
601+
602+
# Test for parquet predicate pruning with `starts_with` function
603+
query I
604+
copy (values ('foo'), ('bar'), ('baz')) TO 'test_files/scratch/parquet/foo.parquet'
605+
----
606+
3
607+
608+
statement ok
609+
create external table foo
610+
stored as parquet
611+
location 'test_files/scratch/parquet/foo.parquet';
612+
613+
614+
# Expect that the pruning predicate contain a comparison on the min/max value of `column1):
615+
# column1_min@0 <= g AND f <= column1_max@1`
616+
# (the starts_with function is not supported in the parquet predicate pruning but DataFusion rewrites
617+
# it to a like which is then handled by the PruningPredicate)
618+
query TT
619+
explain select * from foo where starts_with(column1, 'f');
620+
----
621+
logical_plan
622+
01)Filter: foo.column1 LIKE Utf8View("f%")
623+
02)--TableScan: foo projection=[column1], partial_filters=[foo.column1 LIKE Utf8View("f%")]
624+
physical_plan
625+
01)CoalesceBatchesExec: target_batch_size=8192
626+
02)--FilterExec: column1@0 LIKE f%
627+
03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
628+
04)------ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/foo.parquet]]}, projection=[column1], predicate=column1@0 LIKE f%, pruning_predicate=column1_null_count@2 != column1_row_count@3 AND column1_min@0 <= g AND f <= column1_max@1, required_guarantees=[]
629+
630+
statement ok
631+
drop table foo

datafusion/sqllogictest/test_files/string/string_view.slt

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,51 @@ EXPLAIN SELECT
344344
FROM test;
345345
----
346346
logical_plan
347-
01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4
347+
01)Projection: test.column1_utf8view LIKE Utf8View("äöüß%") AS c1, CASE test.column1_utf8view IS NOT NULL WHEN Boolean(true) THEN Boolean(true) END AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4
348348
02)--TableScan: test projection=[column1_utf8view]
349349

350+
## Test STARTS_WITH is rewitten to LIKE when the pattern is a constant
351+
query TT
352+
EXPLAIN SELECT
353+
STARTS_WITH(column1_utf8, 'foo%') as c1,
354+
STARTS_WITH(column1_large_utf8, 'foo%') as c2,
355+
STARTS_WITH(column1_utf8view, 'foo%') as c3,
356+
STARTS_WITH(column1_utf8, 'f_o') as c4,
357+
STARTS_WITH(column1_large_utf8, 'f_o') as c5,
358+
STARTS_WITH(column1_utf8view, 'f_o') as c6
359+
FROM test;
360+
----
361+
logical_plan
362+
01)Projection: test.column1_utf8 LIKE Utf8("foo\%%") AS c1, test.column1_large_utf8 LIKE LargeUtf8("foo\%%") AS c2, test.column1_utf8view LIKE Utf8View("foo\%%") AS c3, test.column1_utf8 LIKE Utf8("f_o%") AS c4, test.column1_large_utf8 LIKE LargeUtf8("f_o%") AS c5, test.column1_utf8view LIKE Utf8View("f_o%") AS c6
363+
02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column1_utf8view]
364+
365+
## Test STARTS_WITH works with column arguments
366+
query TT
367+
EXPLAIN SELECT
368+
STARTS_WITH(column1_utf8, substr(column1_utf8, 1, 2)) as c1,
369+
STARTS_WITH(column1_large_utf8, substr(column1_large_utf8, 1, 2)) as c2,
370+
STARTS_WITH(column1_utf8view, substr(column1_utf8view, 1, 2)) as c3
371+
FROM test;
372+
----
373+
logical_plan
374+
01)Projection: starts_with(test.column1_utf8, substr(test.column1_utf8, Int64(1), Int64(2))) AS c1, starts_with(test.column1_large_utf8, substr(test.column1_large_utf8, Int64(1), Int64(2))) AS c2, starts_with(test.column1_utf8view, substr(test.column1_utf8view, Int64(1), Int64(2))) AS c3
375+
02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column1_utf8view]
376+
377+
query BBB
378+
SELECT
379+
STARTS_WITH(column1_utf8, substr(column1_utf8, 1, 2)) as c1,
380+
STARTS_WITH(column1_large_utf8, substr(column1_large_utf8, 1, 2)) as c2,
381+
STARTS_WITH(column1_utf8view, substr(column1_utf8view, 1, 2)) as c3
382+
FROM test;
383+
----
384+
true true true
385+
true true true
386+
true true true
387+
true true true
388+
NULL NULL NULL
389+
390+
391+
# Ensure that INIT cap works with utf8view
350392
query TT
351393
EXPLAIN SELECT
352394
INITCAP(column1_utf8view) as c
@@ -887,7 +929,7 @@ EXPLAIN SELECT
887929
FROM test;
888930
----
889931
logical_plan
890-
01)Projection: starts_with(test.column1_utf8view, Utf8View("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2
932+
01)Projection: test.column1_utf8view LIKE Utf8View("foo%") AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2
891933
02)--TableScan: test projection=[column1_utf8view, column2_utf8view]
892934

893935
## Ensure no casts for TRANSLATE

0 commit comments

Comments
 (0)