Skip to content

Commit 5eec82b

Browse files
committed
feat: Add nilike comparison kernel
1 parent f541e13 commit 5eec82b

File tree

2 files changed

+162
-0
lines changed

2 files changed

+162
-0
lines changed

arrow/benches/comparison_kernels.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,11 @@ fn bench_ilike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
124124
.unwrap();
125125
}
126126

127+
fn bench_nilike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
128+
nilike_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b))
129+
.unwrap();
130+
}
131+
127132
fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) {
128133
regexp_is_match_utf8_scalar(
129134
criterion::black_box(arr_a),
@@ -254,6 +259,26 @@ fn add_benchmark(c: &mut Criterion) {
254259
b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%xx_xX%xXX"))
255260
});
256261

262+
// Scalar NOT ILIKE benchmarks. Each label names the shape of the pattern it
// runs: a pattern of the form `prefix%` is a starts-with match and `%suffix`
// is an ends-with match.
c.bench_function("nilike_utf8 scalar equals", |b| {
    b.iter(|| bench_nilike_utf8_scalar(&arr_string, "xxXX"))
});

c.bench_function("nilike_utf8 scalar contains", |b| {
    b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%xxXX%"))
});

c.bench_function("nilike_utf8 scalar starts with", |b| {
    b.iter(|| bench_nilike_utf8_scalar(&arr_string, "xXXx%"))
});

c.bench_function("nilike_utf8 scalar ends with", |b| {
    b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%XXXx"))
});

c.bench_function("nilike_utf8 scalar complex", |b| {
    b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%xx_xX%xXX"))
});
281+
257282
// Anchored-prefix regex benchmark; the id previously read "egexp_matches_utf8"
// (missing leading "r").
c.bench_function("regexp_matches_utf8 scalar starts with", |b| {
    b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "^xx"))
});

arrow/src/compute/kernels/comparison.rs

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,89 @@ pub fn ilike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
548548
Ok(BooleanArray::from(data))
549549
}
550550

551+
/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
552+
/// [`LargeStringArray`].
553+
///
554+
/// See the documentation on [`like_utf8`] for more details.
555+
pub fn nilike_utf8<OffsetSize: StringOffsetSizeTrait>(
556+
left: &GenericStringArray<OffsetSize>,
557+
right: &GenericStringArray<OffsetSize>,
558+
) -> Result<BooleanArray> {
559+
regex_like(left, right, true, |re_pattern| {
560+
Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
561+
ArrowError::ComputeError(format!(
562+
"Unable to build regex from ILIKE pattern: {}",
563+
e
564+
))
565+
})
566+
})
567+
}
568+
569+
/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
570+
/// [`LargeStringArray`] and a scalar.
571+
///
572+
/// See the documentation on [`like_utf8`] for more details.
573+
pub fn nilike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
574+
left: &GenericStringArray<OffsetSize>,
575+
right: &str,
576+
) -> Result<BooleanArray> {
577+
let null_bit_buffer = left.data().null_buffer().cloned();
578+
let mut result = BooleanBufferBuilder::new(left.len());
579+
580+
if !right.contains(is_like_pattern) {
581+
// fast path, can use equals
582+
for i in 0..left.len() {
583+
result.append(left.value(i) != right);
584+
}
585+
} else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern)
586+
{
587+
// fast path, can use ends_with
588+
for i in 0..left.len() {
589+
result.append(
590+
!left
591+
.value(i)
592+
.to_uppercase()
593+
.starts_with(&right[..right.len() - 1].to_uppercase()),
594+
);
595+
}
596+
} else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
597+
// fast path, can use starts_with
598+
for i in 0..left.len() {
599+
result.append(
600+
!left
601+
.value(i)
602+
.to_uppercase()
603+
.ends_with(&right[1..].to_uppercase()),
604+
);
605+
}
606+
} else {
607+
let re_pattern = escape(right).replace('%', ".*").replace('_', ".");
608+
let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
609+
ArrowError::ComputeError(format!(
610+
"Unable to build regex from ILIKE pattern: {}",
611+
e
612+
))
613+
})?;
614+
for i in 0..left.len() {
615+
let haystack = left.value(i);
616+
result.append(!re.is_match(haystack));
617+
}
618+
}
619+
620+
let data = unsafe {
621+
ArrayData::new_unchecked(
622+
DataType::Boolean,
623+
left.len(),
624+
None,
625+
null_bit_buffer,
626+
0,
627+
vec![result.finish()],
628+
vec![],
629+
)
630+
};
631+
Ok(BooleanArray::from(data))
632+
}
633+
551634
/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`].
552635
/// If `regex_array` element has an empty value, the corresponding result value is always true.
553636
///
@@ -3983,6 +4066,60 @@ mod tests {
39834066
vec![false, true, false, false]
39844067
);
39854068

4069+
// Array-vs-array NOT ILIKE. Presumably the macro pairs each string in the
// first vector with the pattern at the same position in the second and
// compares the `nilike_utf8` output with the third vector (confirm against
// the `test_utf8!` definition). `true` is expected where the pattern does not
// match case-insensitively, e.g. "arrow" vs "foo" or "arROw" vs "arrow_".
test_utf8!(
4070+
test_utf8_array_nilike,
4071+
vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
4072+
vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
4073+
nilike_utf8,
4074+
vec![false, false, false, true, true, false, true]
4075+
);
4076+
// The pattern `%(%)%` contains the regex metacharacters `(` and `)`. The
// scalar kernel passes the pattern through `escape` before building its
// regex, so they must match literally: only the inputs containing a
// parenthesised part match, and NOT ILIKE is `false` for those.
test_utf8_scalar!(
4077+
nilike_utf8_scalar_escape_testing,
4078+
vec!["varchar(255)", "int(255)", "varchar", "int"],
4079+
"%(%)%",
4080+
nilike_utf8_scalar,
4081+
vec![false, false, true, true]
4082+
);
4083+
// `%AR%` has a wildcard on both sides, so neither single-`%` fast path of
// `nilike_utf8_scalar` applies and the regex branch is used. The upper-case
// pattern against lower-case inputs checks case-insensitivity: "arrow" and
// "parquet" contain "ar", so NOT ILIKE is `false` for them.
test_utf8_scalar!(
4084+
test_utf8_array_nilike_scalar,
4085+
vec!["arrow", "parquet", "datafusion", "flight"],
4086+
"%AR%",
4087+
nilike_utf8_scalar,
4088+
vec![false, false, true, true]
4089+
);
4090+
4091+
// `aRRow%` has only a trailing `%`, which selects the prefix fast path of
// `nilike_utf8_scalar` (upper-cased `starts_with`). "parrow" and "ARR" do not
// start with "arrow" in any casing, so NOT ILIKE is `true` for them.
test_utf8_scalar!(
4092+
test_utf8_array_nilike_scalar_start,
4093+
vec!["arrow", "parrow", "arrows", "ARR"],
4094+
"aRRow%",
4095+
nilike_utf8_scalar,
4096+
vec![false, true, false, true]
4097+
);
4098+
4099+
// `%arrow` has only a leading `%`, which selects the suffix fast path of
// `nilike_utf8_scalar` (upper-cased `ends_with`). "ARRowS" and "arr" do not
// end with "arrow" in any casing, so NOT ILIKE is `true` for them.
test_utf8_scalar!(
4099+
test_utf8_array_nilike_scalar_end,
4100+
vec!["ArroW", "parrow", "ARRowS", "arr"],
4101+
"%arrow",
4102+
nilike_utf8_scalar,
4103+
vec![false, false, true, true]
4104+
);
4106+
4107+
// `arrow` contains no wildcard, so the first (equality) fast path of
// `nilike_utf8_scalar` is taken; only the exact value "arrow" matches.
// NOTE(review): every input here is lower-case, so this test does not
// exercise case-insensitive matching on the no-wildcard path.
test_utf8_scalar!(
4108+
test_utf8_array_nilike_scalar_equals,
4109+
vec!["arrow", "parrow", "arrows", "arr"],
4110+
"arrow",
4111+
nilike_utf8_scalar,
4112+
vec![false, true, true, true]
4113+
);
4114+
4115+
// `arrow_` uses the single-character wildcard and no `%`, so it goes through
// the regex branch of `nilike_utf8_scalar` (`_` becomes `.`). Only "arrows"
// is "arrow" followed by exactly one character, so it is the only `false`.
test_utf8_scalar!(
4116+
test_utf8_array_nilike_scalar_one,
4117+
vec!["arrow", "arrows", "parrow", "arr"],
4118+
"arrow_",
4119+
nilike_utf8_scalar,
4120+
vec![true, false, true, true]
4121+
);
4122+
39864123
test_utf8!(
39874124
test_utf8_array_neq,
39884125
vec!["arrow", "arrow", "arrow", "arrow"],

0 commit comments

Comments
 (0)