Skip to content

Commit 7968aad

Browse files
authored
Merge pull request #288 from tiehuis/master
Add RegexSet functionality to C API
2 parents 3cb936c + 50af527 commit 7968aad

File tree

6 files changed

+423
-4
lines changed

6 files changed

+423
-4
lines changed

regex-capi/README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,5 @@ There are a few things missing from the C API that are present in the Rust API.
9999
There's no particular (known) reason why they don't, they just haven't been
100100
implemented yet.
101101

102-
* RegexSet, which permits matching multiple regular expressions simultaneously
103-
in a single linear time search.
104102
* Splitting a string by a regex.
105103
* Replacing regex matches in a string with some other text.

regex-capi/ctest/test.c

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,191 @@ bool test_compile_error_size_limit() {
331331
return passed;
332332
}
333333

334+
bool test_regex_set_matches() {
335+
336+
#define PAT_COUNT 6
337+
338+
bool passed = true;
339+
const char *patterns[] = {
340+
"foo", "barfoo", "\\w+", "\\d+", "foobar", "bar"
341+
};
342+
const size_t patterns_lengths[] = {
343+
3, 6, 3, 3, 6, 3
344+
};
345+
346+
rure_error *err = rure_error_new();
347+
rure_set *re = rure_compile_set((const uint8_t **) patterns,
348+
patterns_lengths,
349+
PAT_COUNT,
350+
0,
351+
NULL,
352+
err);
353+
if (re == NULL) {
354+
passed = false;
355+
goto done2;
356+
}
357+
358+
if (rure_set_len(re) != PAT_COUNT) {
359+
passed = false;
360+
goto done1;
361+
}
362+
363+
if (!rure_set_is_match(re, (const uint8_t *) "foobar", 6, 0)) {
364+
passed = false;
365+
goto done1;
366+
}
367+
368+
if (rure_set_is_match(re, (const uint8_t *) "", 0, 0)) {
369+
passed = false;
370+
goto done1;
371+
}
372+
373+
bool matches[PAT_COUNT];
374+
if (!rure_set_matches(re, (const uint8_t *) "foobar", 6, 0, matches)) {
375+
passed = false;
376+
goto done1;
377+
}
378+
379+
const bool match_target[] = {
380+
true, false, true, false, true, true
381+
};
382+
383+
int i;
384+
for (i = 0; i < PAT_COUNT; ++i) {
385+
if (matches[i] != match_target[i]) {
386+
passed = false;
387+
goto done1;
388+
}
389+
}
390+
391+
done1:
392+
rure_set_free(re);
393+
done2:
394+
rure_error_free(err);
395+
return passed;
396+
397+
#undef PAT_COUNT
398+
}
399+
400+
bool test_regex_set_match_start() {
401+
402+
#define PAT_COUNT 3
403+
404+
bool passed = true;
405+
const char *patterns[] = {
406+
"foo", "bar", "fooo"
407+
};
408+
const size_t patterns_lengths[] = {
409+
3, 3, 4
410+
};
411+
412+
rure_error *err = rure_error_new();
413+
rure_set *re = rure_compile_set((const uint8_t **) patterns,
414+
patterns_lengths,
415+
PAT_COUNT,
416+
0,
417+
NULL,
418+
err);
419+
if (re == NULL) {
420+
passed = false;
421+
goto done2;
422+
}
423+
424+
if (rure_set_len(re) != PAT_COUNT) {
425+
passed = false;
426+
goto done1;
427+
}
428+
429+
if (rure_set_is_match(re, (const uint8_t *) "foobiasdr", 7, 2)) {
430+
passed = false;
431+
goto done1;
432+
}
433+
434+
{
435+
bool matches[PAT_COUNT];
436+
if (!rure_set_matches(re, (const uint8_t *) "fooobar", 8, 0, matches)) {
437+
passed = false;
438+
goto done1;
439+
}
440+
441+
const bool match_target[] = {
442+
true, true, true
443+
};
444+
445+
int i;
446+
for (i = 0; i < PAT_COUNT; ++i) {
447+
if (matches[i] != match_target[i]) {
448+
passed = false;
449+
goto done1;
450+
}
451+
}
452+
}
453+
454+
{
455+
bool matches[PAT_COUNT];
456+
if (!rure_set_matches(re, (const uint8_t *) "fooobar", 7, 1, matches)) {
457+
passed = false;
458+
goto done1;
459+
}
460+
461+
const bool match_target[] = {
462+
false, true, false
463+
};
464+
465+
int i;
466+
for (i = 0; i < PAT_COUNT; ++i) {
467+
if (matches[i] != match_target[i]) {
468+
passed = false;
469+
goto done1;
470+
}
471+
}
472+
}
473+
474+
done1:
475+
rure_set_free(re);
476+
done2:
477+
rure_error_free(err);
478+
return passed;
479+
480+
#undef PAT_COUNT
481+
}
482+
483+
bool test_regex_set_options() {
484+
485+
bool passed = true;
486+
rure_options *opts = rure_options_new();
487+
rure_options_size_limit(opts, 0);
488+
rure_error *err = rure_error_new();
489+
490+
const char *patterns[] = { "\\w{100}" };
491+
const size_t patterns_lengths[] = { 8 };
492+
493+
rure_set *re = rure_compile_set((const uint8_t **) patterns, patterns_lengths,
494+
1, 0, opts, err);
495+
if (re != NULL) {
496+
if (DEBUG) {
497+
fprintf(stderr,
498+
"[test_compile_error_size_limit] "
499+
"expected NULL regex pointer, but got non-NULL pointer\n");
500+
}
501+
passed = false;
502+
rure_set_free(re);
503+
}
504+
const char *msg = rure_error_message(err);
505+
if (NULL == strstr(msg, "exceeds size")) {
506+
if (DEBUG) {
507+
fprintf(stderr,
508+
"[test_compile_error] "
509+
"expected an 'exceeds size' error message, but "
510+
"got this instead: '%s'\n", msg);
511+
}
512+
passed = false;
513+
}
514+
rure_options_free(opts);
515+
rure_error_free(err);
516+
return passed;
517+
}
518+
334519
void run_test(bool (test)(), const char *name, bool *passed) {
335520
if (!test()) {
336521
*passed = false;
@@ -353,6 +538,9 @@ int main() {
353538
run_test(test_compile_error, "test_compile_error", &passed);
354539
run_test(test_compile_error_size_limit, "test_compile_error_size_limit",
355540
&passed);
541+
run_test(test_regex_set_matches, "test_regex_set_match", &passed);
542+
run_test(test_regex_set_options, "test_regex_set_options", &passed);
543+
run_test(test_regex_set_match_start, "test_regex_set_match_start", &passed);
356544

357545
if (!passed) {
358546
exit(1);

regex-capi/include/rure.h

Lines changed: 92 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@ extern "C" {
2828
*/
2929
typedef struct rure rure;
3030

31+
/*
32+
* rure_set is the type of a set of compiled regular expressions.
33+
*
34+
* A rure_set can be safely used from multiple threads simultaneously.
35+
*/
36+
typedef struct rure_set rure_set;
37+
3138
/*
3239
* rure_options is the set of non-flag configuration options for compiling
3340
* a regular expression. Currently, only two options are available: setting
@@ -165,7 +172,7 @@ rure *rure_compile(const uint8_t *pattern, size_t length,
165172
/*
166173
* rure_free frees the given compiled regular expression.
167174
*
168-
* This must be called at most once.
175+
* This must be called at most once for any rure.
169176
*/
170177
void rure_free(rure *re);
171178

@@ -446,6 +453,90 @@ void rure_options_size_limit(rure_options *options, size_t limit);
446453
*/
447454
void rure_options_dfa_size_limit(rure_options *options, size_t limit);
448455

456+
/*
457+
* rure_compile_set compiles the given list of patterns into a single regular
458+
* expression which can be matched in a linear-scan. Each pattern in patterns
459+
* must be valid UTF-8 and the length of each pattern in patterns corresponds
460+
* to a byte length in patterns_lengths.
461+
*
462+
* The number of patterns to compile is specified by patterns_count. patterns
463+
* must contain at least this many entries.
464+
*
465+
* flags is a bitfield. Valid values are constants declared with prefix
466+
* RURE_FLAG_.
467+
*
468+
* options contains non-flag configuration settings. If it's NULL, default
469+
* settings are used. options may be freed immediately after a call to
470+
* rure_compile_set.
471+
*
472+
* error is set if there was a problem compiling any of the patterns.
473+
*
474+
* The compiled expression set returned may be used from multiple threads.
475+
*/
476+
rure_set *rure_compile_set(const uint8_t **patterns,
477+
const size_t *patterns_lengths,
478+
size_t patterns_count,
479+
uint32_t flags,
480+
rure_options *options,
481+
rure_error *error);
482+
483+
/*
484+
* rure_set_free frees the given compiled regular expression set.
485+
*
486+
* This must be called at most once for any rure_set.
487+
*/
488+
void rure_set_free(rure_set *re);
489+
490+
/*
491+
* rure_set_is_match returns true if and only if any regexes within the set
492+
* match anywhere in the haystack. Once a match has been located, the
493+
* matching engine will quit immediately.
494+
*
495+
* haystack may contain arbitrary bytes, but ASCII compatible text is more
496+
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
497+
* length should be the number of bytes in haystack.
498+
*
499+
* start is the position at which to start searching. Note that setting the
500+
* start position is distinct from incrementing the pointer, since the regex
501+
* engine may look at bytes before the start position to determine match
502+
* information. For example, if the start position is greater than 0, then the
503+
* \A ("begin text") anchor can never match.
504+
*/
505+
bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length,
506+
size_t start);
507+
508+
/*
509+
* rure_set_matches compares each regex in the set against the haystack and
510+
* modifies matches with the match result of each pattern. Match results are
511+
* ordered in the same way as the rure_set was compiled. For example,
512+
* index 0 of matches corresponds to the first pattern passed to
513+
* `rure_compile_set`.
514+
*
515+
* haystack may contain arbitrary bytes, but ASCII compatible text is more
516+
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
517+
* length should be the number of bytes in haystack.
518+
*
519+
* start is the position at which to start searching. Note that setting the
520+
* start position is distinct from incrementing the pointer, since the regex
521+
* engine may look at bytes before the start position to determine match
522+
* information. For example, if the start position is greater than 0, then the
523+
* \A ("begin text") anchor can never match.
524+
*
525+
* matches must point to an array of bool whose length is greater than or equal to the number of patterns the
526+
* rure_set was compiled with.
527+
*
528+
* Only use this function if you specifically need to know which regexes
529+
* matched within the set. To determine if any of the regexes matched without
530+
* caring which, use rure_set_is_match.
531+
*/
532+
bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length,
533+
size_t start, bool *matches);
534+
535+
/*
536+
* rure_set_len returns the number of patterns the given rure_set was compiled with.
537+
*/
538+
size_t rure_set_len(rure_set *re);
539+
449540
/*
450541
* rure_error_new allocates space for an error.
451542
*

0 commit comments

Comments
 (0)