|
| 1 | +use std::sync::Arc; |
| 2 | + |
| 3 | +use arrow_array::{RecordBatch, StringArray}; |
| 4 | +use arrow_schema::{DataType, Field, Schema}; |
| 5 | +use criterion::{black_box, criterion_group, criterion_main, Criterion}; |
| 6 | +use datafusion_expr_common::operator::Operator; |
| 7 | +use datafusion_physical_expr::expressions::{binary, col, lit, scalar_regex_match}; |
| 8 | +use hashbrown::HashMap; |
| 9 | +use rand::distributions::{Alphanumeric, DistString}; |
| 10 | + |
| 11 | +/// make a record batch with one column and n rows |
| 12 | +/// this record batch is single string column is used for |
| 13 | +/// scalar regex match benchmarks |
| 14 | +fn make_record_batch(rows: usize, string_length: usize, schema: Schema) -> RecordBatch { |
| 15 | + let mut rng = rand::thread_rng(); |
| 16 | + let mut array = Vec::with_capacity(rows); |
| 17 | + for _ in 0..rows { |
| 18 | + let data_line = Alphanumeric.sample_string(&mut rng, string_length); |
| 19 | + array.push(Some(data_line)); |
| 20 | + } |
| 21 | + let array = StringArray::from(array); |
| 22 | + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap() |
| 23 | +} |
| 24 | + |
| 25 | +fn scalar_regex_match_benchmark(c: &mut Criterion) { |
| 26 | + // make common schema |
| 27 | + let column = "string"; |
| 28 | + let schema = Schema::new(vec![Field::new(column, DataType::Utf8, true)]); |
| 29 | + |
| 30 | + // meke test record batch |
| 31 | + let test_batch = [ |
| 32 | + (10, make_record_batch(10, 100, schema.clone())), |
| 33 | + (100, make_record_batch(100, 100, schema.clone())), |
| 34 | + (1000, make_record_batch(1000, 100, schema.clone())), |
| 35 | + (2000, make_record_batch(2000, 100, schema.clone())), |
| 36 | + ] |
| 37 | + .iter() |
| 38 | + .map(|(k, v)| (*k, v.clone())) |
| 39 | + .collect::<HashMap<_, _>>(); |
| 40 | + |
| 41 | + // string column |
| 42 | + let string_col = col(column, &schema).unwrap(); |
| 43 | + |
| 44 | + // some pattern literal |
| 45 | + let pattern_lit = [ |
| 46 | + ("email".to_string(), lit(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")), |
| 47 | + ("url".to_string(), lit(r"^(https?|ftp)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]$")), |
| 48 | + ("ip".to_string(), lit(r"^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$")), |
| 49 | + ("phone".to_string(), lit(r"^(\+\d{1,2}\s?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$")), |
| 50 | + ("zip_code".to_string(), lit(r"^\d{5}(?:[-\s]\d{4})?$")), |
| 51 | + ].iter() |
| 52 | + .map(|(k, v)| (k.clone(), v.clone())) |
| 53 | + .collect::<HashMap<_, _>>(); |
| 54 | + |
| 55 | + for (name, regexp_lit) in pattern_lit.iter() { |
| 56 | + for (rows, batch) in test_batch.iter() { |
| 57 | + for iter in [10, 20, 50, 100] { |
| 58 | + // scalar regex match benchmarks |
| 59 | + let bench_name = format!( |
| 60 | + "scalar_regex_match_pattern_{}_rows_{}_iter_{}", |
| 61 | + name, rows, iter |
| 62 | + ); |
| 63 | + c.bench_function(bench_name.as_str(), |b| { |
| 64 | + let expr = scalar_regex_match( |
| 65 | + false, |
| 66 | + false, |
| 67 | + string_col.clone(), |
| 68 | + regexp_lit.clone(), |
| 69 | + &schema, |
| 70 | + ) |
| 71 | + .unwrap(); |
| 72 | + b.iter(|| { |
| 73 | + for _ in 0..iter { |
| 74 | + expr.evaluate(black_box(batch)).unwrap(); |
| 75 | + } |
| 76 | + }); |
| 77 | + }); |
| 78 | + |
| 79 | + // binary regex match benchmarks |
| 80 | + let bench_name = format!( |
| 81 | + "binary_regex_match_pattern_{}_rows_{}_iter_{}", |
| 82 | + name, rows, iter |
| 83 | + ); |
| 84 | + c.bench_function(bench_name.as_str(), |b| { |
| 85 | + let expr = binary( |
| 86 | + string_col.clone(), |
| 87 | + Operator::RegexMatch, |
| 88 | + regexp_lit.clone(), |
| 89 | + &schema, |
| 90 | + ) |
| 91 | + .unwrap(); |
| 92 | + b.iter(|| { |
| 93 | + for _ in 0..iter { |
| 94 | + expr.evaluate(black_box(batch)).unwrap(); |
| 95 | + } |
| 96 | + }); |
| 97 | + }); |
| 98 | + } |
| 99 | + } |
| 100 | + } |
| 101 | +} |
| 102 | + |
| 103 | +criterion_group!(benches, scalar_regex_match_benchmark); |
| 104 | +criterion_main!(benches); |
0 commit comments