Skip to content

Commit e9fc6c7

Browse files
committed
bench: add scalar regex match benchmarks
1 parent a600f4d commit e9fc6c7

File tree

2 files changed

+125
-0
lines changed

2 files changed

+125
-0
lines changed

datafusion/physical-expr/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,7 @@ name = "case_when"
8787
[[bench]]
8888
harness = false
8989
name = "is_null"
90+
91+
[[bench]]
92+
harness = false
93+
name = "scalar_regex_match"
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use std::sync::Arc;
19+
20+
use arrow_array::{RecordBatch, StringArray};
21+
use arrow_schema::{DataType, Field, Schema};
22+
use criterion::{black_box, criterion_group, criterion_main, Criterion};
23+
use datafusion_expr_common::operator::Operator;
24+
use datafusion_physical_expr::expressions::{binary, col, lit, scalar_regex_match};
25+
use hashbrown::HashMap;
26+
use rand::distributions::{Alphanumeric, DistString};
27+
28+
/// make a record batch with one column and n rows
29+
/// this record batch is single string column is used for
30+
/// scalar regex match benchmarks
31+
fn make_record_batch(rows: usize, string_length: usize, schema: Schema) -> RecordBatch {
32+
let mut rng = rand::thread_rng();
33+
let mut array = Vec::with_capacity(rows);
34+
for _ in 0..rows {
35+
let data_line = Alphanumeric.sample_string(&mut rng, string_length);
36+
array.push(Some(data_line));
37+
}
38+
let array = StringArray::from(array);
39+
RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap()
40+
}
41+
42+
fn scalar_regex_match_benchmark(c: &mut Criterion) {
43+
// make common schema
44+
let column = "string";
45+
let schema = Schema::new(vec![Field::new(column, DataType::Utf8, true)]);
46+
47+
// meke test record batch
48+
let test_batch = [
49+
(10, make_record_batch(10, 100, schema.clone())),
50+
(100, make_record_batch(100, 100, schema.clone())),
51+
(1000, make_record_batch(1000, 100, schema.clone())),
52+
(2000, make_record_batch(2000, 100, schema.clone())),
53+
]
54+
.iter()
55+
.map(|(k, v)| (*k, v.clone()))
56+
.collect::<HashMap<_, _>>();
57+
58+
// string column
59+
let string_col = col(column, &schema).unwrap();
60+
61+
// some pattern literal
62+
let pattern_lit = [
63+
("email".to_string(), lit(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")),
64+
("url".to_string(), lit(r"^(https?|ftp)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]$")),
65+
("ip".to_string(), lit(r"^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$")),
66+
("phone".to_string(), lit(r"^(\+\d{1,2}\s?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$")),
67+
("zip_code".to_string(), lit(r"^\d{5}(?:[-\s]\d{4})?$")),
68+
].iter()
69+
.map(|(k, v)| (k.clone(), v.clone()))
70+
.collect::<HashMap<_, _>>();
71+
72+
for (name, regexp_lit) in pattern_lit.iter() {
73+
for (rows, batch) in test_batch.iter() {
74+
for iter in [10, 20, 50, 100] {
75+
// scalar regex match benchmarks
76+
let bench_name = format!(
77+
"scalar_regex_match_pattern_{}_rows_{}_iter_{}",
78+
name, rows, iter
79+
);
80+
c.bench_function(bench_name.as_str(), |b| {
81+
let expr = scalar_regex_match(
82+
false,
83+
false,
84+
string_col.clone(),
85+
regexp_lit.clone(),
86+
&schema,
87+
)
88+
.unwrap();
89+
b.iter(|| {
90+
for _ in 0..iter {
91+
expr.evaluate(black_box(batch)).unwrap();
92+
}
93+
});
94+
});
95+
96+
// binary regex match benchmarks
97+
let bench_name = format!(
98+
"binary_regex_match_pattern_{}_rows_{}_iter_{}",
99+
name, rows, iter
100+
);
101+
c.bench_function(bench_name.as_str(), |b| {
102+
let expr = binary(
103+
string_col.clone(),
104+
Operator::RegexMatch,
105+
regexp_lit.clone(),
106+
&schema,
107+
)
108+
.unwrap();
109+
b.iter(|| {
110+
for _ in 0..iter {
111+
expr.evaluate(black_box(batch)).unwrap();
112+
}
113+
});
114+
});
115+
}
116+
}
117+
}
118+
}
119+
120+
criterion_group!(benches, scalar_regex_match_benchmark);
121+
criterion_main!(benches);

0 commit comments

Comments
 (0)