8000 Add support for RegexSet · davidblewett/rure-python@3499127 · GitHub
[go: up one dir, main page]

Skip to content

Commit 3499127

Browse files
committed
Add support for RegexSet
1 parent 36b2caa commit 3499127

File tree

5 files changed

+225
-4
lines changed

5 files changed

+225
-4
lines changed

README.rst

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,6 @@ There are a few things missing from this package that are present in the Rust AP
6868
There's no particular (known) reason why they don't, they just haven't been
6969
implemented yet.
7070

71-
* RegexSet, which permits matching multiple regular expressions simultaneously
72-
in a single linear time search.
7371
* Splitting a string by a regex.
7472
* Replacing regex matches in a string with some other text.
7573

include/rure.h

Lines changed: 92 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@ extern "C" {
2828
*/
2929
typedef struct rure rure;
3030

31+
/*
32+
* rure_set is the type of a set of compiled regular expressions.
33+
*
34+
* A rure can be safely used from multiple threads simultaneously.
35+
*/
36+
typedef struct rure_set rure_set;
37+
3138
/*
3239
* rure_options is the set of non-flag configuration options for compiling
3340
* a regular expression. Currently, only two options are available: setting
@@ -165,7 +172,7 @@ rure *rure_compile(const uint8_t *pattern, size_t length,
165172
/*
166173
* rure_free frees the given compiled regular expression.
167174
*
168-
* This must be called at most once.
175+
* This must be called at most once for any rure.
169176
*/
170177
void rure_free(rure *re);
171178

@@ -446,6 +453,90 @@ void rure_options_size_limit(rure_options *options, size_t limit);
446453
*/
447454
void rure_options_dfa_size_limit(rure_options *options, size_t limit);
448455

456+
/*
457+
* rure_compile_set compiles the given list of patterns into a single regular
458+
* expression which can be matched in a linear-scan. Each pattern in patterns
459+
* must be valid UTF-8 and the length of each pattern in patterns corresponds
460+
* to a byte length in patterns_lengths.
461+
*
462+
* The number of patterns to compile is specified by patterns_count. patterns
463+
* must contain at least this many entries.
464+
*
465+
* flags is a bitfield. Valid values are constants declared with prefix
466+
* RURE_FLAG_.
467+
*
468+
* options contains non-flag configuration settings. If it's NULL, default
469+
* settings are used. options may be freed immediately after a call to
470+
* rure_compile.
471+
*
472+
* error is set if there was a problem compiling the pattern.
473+
*
474+
* The compiled expression set returned may be used from multiple threads.
475+
*/
476+
rure_set *rure_compile_set(const uint8_t **patterns,
477+
const size_t *patterns_lengths,
478+
size_t patterns_count,
479+
uint32_t flags,
480+
rure_options *options,
481+
rure_error *error);
482+
483+
/*
484+
* rure_set_free frees the given compiled regular expression set.
485+
*
486+
* This must be called at most once for any rure_set.
487+
*/
488+
void rure_set_free(rure_set *re);
489+
490+
/*
491+
* rure_is_match returns true if and only if any regexes within the set
492+
* match anywhere in the haystack. Once a match has been located, the
493+
* matching engine will quit immediately.
494+
*
495+
* haystack may contain arbitrary bytes, but ASCII compatible text is more
496+
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
497+
* length should be the number of bytes in haystack.
498+
*
499+
* start is the position at which to start searching. Note that setting the
500+
* start position is distinct from incrementing the pointer, since the regex
501+
* engine may look at bytes before the start position to determine match
502+
* information. For example, if the start position is greater than 0, then the
503+
* \A ("begin text") anchor can never match.
504+
*/
505+
bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length,
506+
size_t start);
507+
508+
/*
509+
* rure_set_matches compares each regex in the set against the haystack and
510+
* modifies matches with the match result of each pattern. Match results are
511+
* ordered in the same way as the rure_set was compiled. For example,
512+
* index 0 of matches corresponds to the first pattern passed to
513+
* `rure_compile_set`.
514+
*
515+
* haystack may contain arbitrary bytes, but ASCII compatible text is more
516+
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
517+
* length should be the number of bytes in haystack.
518+
*
519+
* start is the position at which to start searching. Note that setting the
520+
* start position is distinct from incrementing the pointer, since the regex
521+
* engine may look at bytes before the start position to determine match
522+
* information. For example, if the start position is greater than 0, then the
523+
* \A ("begin text") anchor can never match.
524+
*
525+
* matches must be greater than or equal to the number of patterns the
526+
* rure_set was compiled with.
527+
*
528+
* Only use this function if you specifically need to know which regexes
529+
* matched within the set. To determine if any of the regexes matched without
530+
* caring which, use rure_set_is_match.
531+
*/
532+
bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length,
533+
size_t start, bool *matches);
534+
535+
/*
536+
* rure_set_len returns the number of patterns rure_set was compiled with.
537+
*/
538+
size_t rure_set_len(rure_set *re);
539+
449540
/*
450541
* rure_error_new allocates space for an error.
451542
*

rure/lib.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,3 +250,88 @@ def shortest_match(self, haystack, start=0):
250250
end = ffi.new('size_t *')
251251
if _lib.rure_shortest_match(self._ptr, haystack, hlen, start, end):
252252
return end[0]
253+
254+
255+
class RureSet(object):
256+
""" Match multiple (possibly overlapping) regular expressions in a single
257+
scan.
258+
259+
A regex set corresponds to the union of two or more regular expressions.
260+
That is, a regex set will match text where at least one of its constituent
261+
regular expressions matches. A regex set as its formulated here provides a
262+
touch more power: it will also report which regular expressions in the set
263+
match. Indeed, this is the key difference between regex sets and a single
264+
Regex with many alternates, since only one alternate can match at a time.
265+
"""
266+
def __init__(self, res, flags=DEFAULT_FLAGS, **options):
267+
268+
""" Compiles a regular expression. Once compiled, it can be used
269+
repeatedly to search, split or replace text in a string.
270+
271+
:param res: List of Bytestring expressions to compile
272+
:param flags: Bitmask of flags
273+
:param kwargs: Config options to pass (size_limit, dfa_size_limit)
274+
"""
275+
276+
if not all(isinstance(re, bytes) for re in res):
277+
raise TypeError("'rure.lib.RureSet' must be instantiated with a "
278+
"list of bytestrings as first argument.")
279+
280+
self._err = ffi.gc(_lib.rure_error_new(), _lib.rure_error_free)
281+
self._opts = ffi.gc(_lib.rure_options_new(), _lib.rure_options_free)
282+
self.options = options
283+
if 'size_limit' in options:
284+
_lib.rure_options_size_limit(self._opts, options['size_limit'])
285+
if 'dfa_size_limit' in options:
286+
_lib.rure_options_dfa_size_limit(self._opts,
287+
options['dfa_size_limit'])
288+
289+
patterns = []
290+
patterns_lengths = []
291+
for re in res:
292+
patterns.append(ffi.new("uint8_t []", re))
293+
patterns_lengths.append(len(re))
294+
295+
s = checked_call(
296+
_lib.rure_compile_set,
297+
self._err,
298+
ffi.new("uint8_t *[]", patterns),
299+
ffi.new("size_t []", patterns_lengths),
300+
len(patterns),
301+
flags,
302+
self._opts)
303+
self._ptr = ffi.gc(s, _lib.rure_set_free)
304+
305+
def __len__(self):
306+
return _lib.rure_set_len(self._ptr)
307+
308+
@accepts_bytes
309+
def is_match(self, haystack, start=0):
310+
"""
311+
Returns true if and only if one of the regexs matches the string
312+
given.
313+
314+
It is recommended to use this method if all you need to do is test
315+
a match, since the underlying matching engine may be able to do less
316+
work.
317+
"""
318+
return bool(_lib.rure_set_is_match(
319+
self._ptr,
320+
haystack,
321+
len(haystack),
322+
start
323+
))
324+
325+
@accepts_bytes
326+
def matches(self, haystack, start=0):
327+
"""
328+
Returns a list of booleans indicating whether the regex at each index
329+
was matched in the string given
330+
"""
331+
matches = ffi.new("bool[]", len(self))
332+
_lib.rure_set_matches(self._ptr,
333+
haystack,
334+
len(haystack),
335+
start,
336+
matches)
337+
return [bool(match) for match in matches]

rure/tests/test_rureset.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
from __future__ import print_function
4+
import unittest
5+
6+
from rure.exceptions import CompiledTooBigError, RegexSyntaxError
7+
from rure.lib import CASEI, RureSet
8+
9+
class TestRureSet(unittest.TestCase):
10+
def test_is_match(self):
11+
haystack = b"snowman: \xE2\x98\x83"
12+
res = RureSet([b"\\p{So}"])
13+
self.assertIsNotNone(res.is_match(haystack))
14+
15+
def test_matches(self):
16+
haystack = b"foobar"
17+
res = RureSet([b"baz", b"bar", b"foo"])
18+
self.assertEqual(res.matches(haystack), [False, True, True])
19+
20+
def test_set_len(self):
21+
res = RureSet([b"baz", b"bar", b"foo"])
22+
self.assertEqual(len(res), 3)
23+
24+
def test_flags(self):
25+
"""Test whether we can set the flags correctly.
26+
27+
In this case, we disable all flags, which includes disabling
< F28A /code>28+
Unicode mode. When we disable Unicode mode, we can match
29+
arbitrary possibly invalid UTF-8 bytes, such as \xFF.
30+
(When Unicode mode is enabled, \xFF won't match .)
31+
"""
32+
pattern = [b"."]
33+
haystack = b"\xFF"
34+
res = RureSet(pattern, flags=CASEI)
35+
self.assertTrue(res.is_match(haystack))
36+
37+
def test_compile_error(self):
38+
try:
39+
RureSet([b"("])
40+
except RegexSyntaxError as err:
41+
self.assertIn("Unclosed parenthesis", err.message)
42+
43+
def test_compile_error_size_limit(self):
44+
try:
45+
RureSet([b"\\w{100}"], size_limit=0)
46+
except CompiledTooBigError as err:
47+
self.assertIn("exceeds size", err.message)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
setup(
1717
name='rure',
18-
version='0.1.1',
18+
version='0.1.2',
1919
author='David Blewett',
2020
author_email='david@dawninglight.net',
2121
description=('Python bindings for the Rust `regex` create. '

0 commit comments

Comments
 (0)
0