Skip to content

Commit d49edca

Browse files
committed
bench: add scalar regex match benchmarks
1 parent a600f4d commit d49edca

File tree

2 files changed

+108
-0
lines changed

2 files changed

+108
-0
lines changed

datafusion/physical-expr/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,7 @@ name = "case_when"
8787
[[bench]]
8888
harness = false
8989
name = "is_null"
90+
91+
[[bench]]
92+
harness = false
93+
name = "scalar_regex_match"
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
use std::sync::Arc;
2+
3+
use arrow_array::{RecordBatch, StringArray};
4+
use arrow_schema::{DataType, Field, Schema};
5+
use criterion::{black_box, criterion_group, criterion_main, Criterion};
6+
use datafusion_expr_common::operator::Operator;
7+
use datafusion_physical_expr::expressions::{binary, col, lit, scalar_regex_match};
8+
use hashbrown::HashMap;
9+
use rand::distributions::{Alphanumeric, DistString};
10+
11+
/// make a record batch with one column and n rows
12+
/// this record batch is single string column is used for
13+
/// scalar regex match benchmarks
14+
fn make_record_batch(rows: usize, string_length: usize, schema: Schema) -> RecordBatch {
15+
let mut rng = rand::thread_rng();
16+
let mut array = Vec::with_capacity(rows);
17+
for _ in 0..rows {
18+
let data_line = Alphanumeric.sample_string(&mut rng, string_length);
19+
array.push(Some(data_line));
20+
}
21+
let array = StringArray::from(array);
22+
RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap()
23+
}
24+
25+
fn scalar_regex_match_benchmark(c: &mut Criterion) {
26+
// make common schema
27+
let column = "string";
28+
let schema = Schema::new(vec![Field::new(column, DataType::Utf8, true)]);
29+
30+
// meke test record batch
31+
let test_batch = [
32+
(10, make_record_batch(10, 100, schema.clone())),
33+
(100, make_record_batch(100, 100, schema.clone())),
34+
(1000, make_record_batch(1000, 100, schema.clone())),
35+
(2000, make_record_batch(2000, 100, schema.clone())),
36+
]
37+
.iter()
38+
.map(|(k, v)| (*k, v.clone()))
39+
.collect::<HashMap<_, _>>();
40+
41+
// string column
42+
let string_col = col(column, &schema).unwrap();
43+
44+
// some pattern literal
45+
let pattern_lit = [
46+
("email".to_string(), lit(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")),
47+
("url".to_string(), lit(r"^(https?|ftp)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]$")),
48+
("ip".to_string(), lit(r"^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$")),
49+
("phone".to_string(), lit(r"^(\+\d{1,2}\s?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$")),
50+
("zip_code".to_string(), lit(r"^\d{5}(?:[-\s]\d{4})?$")),
51+
].iter()
52+
.map(|(k, v)| (k.clone(), v.clone()))
53+
.collect::<HashMap<_, _>>();
54+
55+
for (name, regexp_lit) in pattern_lit.iter() {
56+
for (rows, batch) in test_batch.iter() {
57+
for iter in [10, 20, 50, 100] {
58+
// scalar regex match benchmarks
59+
let bench_name = format!(
60+
"scalar_regex_match_pattern_{}_rows_{}_iter_{}",
61+
name, rows, iter
62+
);
63+
c.bench_function(bench_name.as_str(), |b| {
64+
let expr = scalar_regex_match(
65+
false,
66+
false,
67+
string_col.clone(),
68+
regexp_lit.clone(),
69+
&schema,
70+
)
71+
.unwrap();
72+
b.iter(|| {
73+
for _ in 0..iter {
74+
expr.evaluate(black_box(batch)).unwrap();
75+
}
76+
});
77+
});
78+
79+
// binary regex match benchmarks
80+
let bench_name = format!(
81+
"binary_regex_match_pattern_{}_rows_{}_iter_{}",
82+
name, rows, iter
83+
);
84+
c.bench_function(bench_name.as_str(), |b| {
85+
let expr = binary(
86+
string_col.clone(),
87+
Operator::RegexMatch,
88+
regexp_lit.clone(),
89+
&schema,
90+
)
91+
.unwrap();
92+
b.iter(|| {
93+
for _ in 0..iter {
94+
expr.evaluate(black_box(batch)).unwrap();
95+
}
96+
});
97+
});
98+
}
99+
}
100+
}
101+
}
102+
103+
// Register `scalar_regex_match_benchmark` in the `benches` group and hand that
// group to Criterion's entry-point macro (the bench target sets `harness = false`).
criterion_group!(benches, scalar_regex_match_benchmark);
104+
criterion_main!(benches);

0 commit comments

Comments
 (0)