Skip to content

Commit 1927236

Browse files
committed
Add regex matching for &[u8].
This commit enables support for compiling regular expressions that can match on arbitrary byte slices. In particular, we add a new sub-module called `bytes` that duplicates the API of the top-level module, except `&str` for subjects is replaced by `&[u8]`. Additionally, Unicode support in the regular expression is disabled by default but can be selectively re-enabled with the `u` flag. (Unicode support cannot be selectively disabled in the standard top-level API.) Most of the interesting changes occurred in the `regex-syntax` crate, where the AST now explicitly distinguishes between "ASCII compatible" expressions and Unicode aware expressions. This PR makes a few other changes out of convenience: 1. The DFA now knows how to "give up" if it's flushing its cache too often. When the DFA gives up, either backtracking or the NFA algorithm takes over, which provides better performance. 2. Benchmarks were added for Oniguruma. 3. The benchmarks in general were overhauled to be defined in one place by using conditional compilation. 4. The tests have been completely reorganized to make it easier to split up the tests depending on which regex engine we're using. For example, we occasionally need to be able to write tests specifically for `regex::Regex` or specifically for `regex::bytes::Regex`. 5. Fixes a bug where NUL bytes weren't represented correctly in the byte class optimization for the DFA. Closes #85.
1 parent 82bd6a8 commit 1927236

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+5402
-1914
lines changed

.travis.yml

+5-3
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@ script:
1414
- cargo doc --verbose --manifest-path=regex-syntax/Cargo.toml
1515
- if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
1616
travis_wait cargo test --verbose --features pattern;
17-
travis_wait cargo bench --verbose --bench dynamic;
18-
travis_wait cargo bench --manifest-path=regex-pcre-benchmark/Cargo.toml --verbose
17+
travis_wait ./run-bench rust;
18+
travis_wait ./run-bench rust-bytes;
19+
travis_wait ./run-bench rust-plugin;
20+
travis_wait ./run-bench pcre;
21+
travis_wait ./run-bench onig;
1922
travis_wait cargo test --verbose --manifest-path=regex_macros/Cargo.toml;
20-
travis_wait cargo bench --manifest-path=regex_macros/Cargo.toml --verbose --bench native bench::;
2123
fi
2224
after_success: |
2325
[ $TRAVIS_BRANCH = master ] &&

Cargo.toml

+33-36
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,11 @@ regex-syntax = { path = "regex-syntax", version = "0.2.5" }
2323
utf8-ranges = "0.1"
2424

2525
[dev-dependencies]
26-
# To prevent the benchmarking harness from running setup code more than once.
27-
# Why? Because it takes too long.
26+
# For examples.
2827
lazy_static = "0.1"
29-
# For generating random text to test/benchmark with.
28+
# For property based tests.
29+
quickcheck = "0.2"
30+
# For generating random test data.
3031
rand = "0.3"
3132

3233
[features]
@@ -41,57 +42,53 @@ bench = false
4142
# Generally these test specific pieces of the regex implementation.
4243
[[test]]
4344
path = "src/lib.rs"
44-
name = "regex"
45+
name = "regex-inline"
4546

4647
# Run the test suite on the default behavior of Regex::new.
4748
# This includes a mish mash of NFAs and DFAs, which are chosen automatically
4849
# based on the regex. We test both of the NFA implementations by forcing their
4950
# usage with the test definitions below. (We can't test the DFA implementations
5051
# in the same way since they can't be used for every regex tested.)
5152
[[test]]
52-
path = "tests/test_dynamic.rs"
53-
name = "dynamic"
53+
path = "tests/test_default.rs"
54+
name = "default"
55+
test = false
5456

55-
# Run the test suite on the NFA algorithm over Unicode codepoints.
57+
# The same as the default tests, but run on bytes::Regex.
5658
[[test]]
57-
path = "tests/test_dynamic_nfa.rs"
58-
name = "dynamic-nfa"
59+
path = "tests/test_default_bytes.rs"
60+
name = "default-bytes"
5961

60-
# Run the test suite on the NFA algorithm over bytes.
62+
# Run the test suite on the NFA algorithm over Unicode codepoints.
6163
[[test]]
62-
path = "tests/test_dynamic_nfa_bytes.rs"
63-
name = "dynamic-nfa-bytes"
64+
path = "tests/test_nfa.rs"
65+
name = "nfa"
6466

65-
# Run the test suite on the backtracking engine over Unicode codepoints.
67+
# Run the test suite on the NFA algorithm over bytes that match UTF-8 only.
6668
[[test]]
67-
path = "tests/test_dynamic_backtrack.rs"
68-
name = "dynamic-backtrack"
69+
path = "tests/test_nfa_utf8bytes.rs"
70+
name = "nfa-utf8bytes"
6971

70-
# Run the test suite on the backtracking engine over bytes.
72+
# Run the test suite on the NFA algorithm over arbitrary bytes.
7173
[[test]]
72-
path = "tests/test_dynamic_backtrack_bytes.rs"
73-
name = "dynamic-backtrack-bytes"
74+
path = "tests/test_nfa_bytes.rs"
75+
name = "nfa-bytes"
7476

75-
# Run the benchmarks on the default behavior of Regex::new.
76-
#
77-
# N.B. These benchmarks were originally taken from Russ Cox.
78-
[[bench]]
79-
name = "dynamic"
80-
path = "benches/bench_dynamic.rs"
81-
test = false
82-
bench = true
77+
# Run the test suite on the backtracking engine over Unicode codepoints.
78+
[[test]]
79+
path = "tests/test_backtrack.rs"
80+
name = "backtrack"
8381

84-
# Run the benchmarks on the NFA algorithm. We avoid chasing other permutations.
85-
#
86-
# N.B. These can take a *loong* time to run.
87-
[[bench]]
88-
name = "dynamic-nfa"
89-
path = "benches/bench_dynamic_nfa.rs"
90-
test = false
91-
bench = true
82+
# Run the test suite on the backtracking engine over bytes that match UTF-8
83+
# only.
84+
[[test]]
85+
path = "tests/test_backtrack_utf8bytes.rs"
86+
name = "backtrack-utf8bytes"
9287

93-
[profile.bench]
94-
debug = true
88+
# Run the test suite on the backtracking engine over arbitrary bytes.
89+
[[test]]
90+
path = "tests/test_backtrack_bytes.rs"
91+
name = "backtrack-bytes"
9592

9693
[profile.test]
9794
debug = true

benches/Cargo.toml

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
[package]
2+
publish = false
3+
name = "regex-benchmark"
4+
version = "0.1.0"
5+
authors = ["The Rust Project Developers"]
6+
license = "MIT/Apache-2.0"
7+
repository = "https://github.com/rust-lang/regex"
8+
documentation = "http://doc.rust-lang.org/regex/regex_syntax/index.html"
9+
homepage = "https://github.com/rust-lang/regex"
10+
description = "Regex benchmarks for Rust's and other engines."
11+
12+
[dependencies]
13+
enum-set = "0.0.6"
14+
lazy_static = "0.1"
15+
onig = { version = "0.4", optional = true }
16+
pcre = { version = "0.2", optional = true }
17+
rand = "0.3"
18+
regex = { version = "0.1", path = ".." }
19+
regex_macros = { version = "0.1", path = "../regex_macros", optional = true }
20+
regex-syntax = { version = "0.2", path = "../regex-syntax" }
21+
22+
# Use features to conditionally compile benchmarked regexes, since not every
23+
# regex works on every engine. :-(
24+
[features]
25+
re-pcre = ["pcre"]
26+
re-onig = ["onig"]
27+
re-rust = []
28+
re-rust-bytes = []
29+
re-rust-plugin = ["regex_macros"]
30+
31+
# Run the benchmarks on the default behavior of Regex::new.
32+
[[bench]]
33+
name = "rust"
34+
path = "src/bench_rust.rs"
35+
test = false
36+
bench = true
37+
38+
# Run the benchmarks on the default behavior of bytes::Regex::new.
39+
[[bench]]
40+
name = "rust-bytes"
41+
path = "src/bench_rust_bytes.rs"
42+
test = false
43+
bench = true
44+
45+
# Run the benchmarks on the default behavior of the `regex!` compiler plugin.
46+
[[bench]]
47+
name = "rust-plugin"
48+
path = "src/bench_rust_plugin.rs"
49+
test = false
50+
bench = true
51+
52+
# Run the benchmarks on PCRE.
53+
[[bench]]
54+
name = "pcre"
55+
path = "src/bench_pcre.rs"
56+
test = false
57+
bench = true
58+
59+
# Run the benchmarks on Oniguruma.
60+
[[bench]]
61+
name = "onig"
62+
path = "src/bench_onig.rs"
63+
test = false
64+
bench = true
65+
66+
[profile.bench]
67+
debug = true
68+
69+
[profile.test]
70+
debug = true

0 commit comments

Comments
 (0)