Skip to content

Commit 32b86e9

Browse files
committed
Major refactoring and performance improvements.
Overview of changes:

* Instruction set has been redesigned to be smaller, mostly by collapsing empty-width matches into one instruction type. In addition to moving instruction-matching out of the matching engine, this makes matching engine code much simpler.
* Rewrote input handling to use an inline representation of `Option<char>` and clearer position handling with the `Input` trait.
* Added a new bounded backtracking matching engine that is invoked for small regexes/inputs. It's about twice as fast as the full NFA matching engine.
* Implemented caching for both the NFA and backtracking engines. This avoids costly allocations on subsequent uses of the regex.
* Overhauled prefix handling at both discovery and matching. Namely, sets of prefix literals can now be extracted from regexes. Depending on what the prefixes look like, an Aho-Corasick DFA is built from them. (This adds a dependency on the `aho-corasick` crate.)
* When appropriate, use `memchr` to jump around in the input when there is a single common byte prefix. (This adds a dependency on the `memchr` crate.)
* Bring the `regex!` macro up to date. Unfortunately, it still implements the full NFA matching engine and doesn't yet have access to the new prefix DFA handling. Thus, its performance has gotten *worse* than the dynamic implementation in most cases. The docs have been updated to reflect this change.

Surprisingly, all of this required exactly one new application of `unsafe`, which is isolated in the `memchr` crate. (Aho-Corasick has no `unsafe` either!)

There should be *no* breaking changes in this commit. The only public facing change is the addition of a method to the `Replacer` trait, but it comes with a default implementation so that existing implementors won't break. (Its purpose is to serve as a hint as to whether or not replacement strings need to be expanded. This is crucial to speeding up simple replacements.)

Closes #21.
1 parent 258c261 commit 32b86e9

25 files changed

+2614
-1232
lines changed

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ sudo: false
77
script:
88
- cargo build --verbose
99
- cargo test --verbose
10+
- ./run-shootout-test
1011
- |
1112
[ $TRAVIS_RUST_VERSION != nightly ] || (
1213
cargo test --verbose --features pattern &&

Cargo.toml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,15 @@ An implementation of regular expressions for Rust.
1313

1414
[[test]]
1515
path = "regex_macros/tests/test_dynamic.rs"
16-
name = "all"
16+
name = "dynamic"
17+
18+
[[test]]
19+
path = "regex_macros/tests/test_dynamic_nfa.rs"
20+
name = "dynamic_nfa"
21+
22+
[[test]]
23+
path = "regex_macros/tests/test_dynamic_backtrack.rs"
24+
name = "dynamic_backtrack"
1725

1826
[[bench]]
1927
name = "all"
@@ -22,6 +30,8 @@ test = false
2230
bench = true
2331

2432
[dependencies]
33+
aho-corasick = "0.1"
34+
memchr = "0.1"
2535
regex-syntax = { path = "regex-syntax", version = "0.1" }
2636

2737
[dev-dependencies]
File renamed without changes.

examples/regexdna-output.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
agggtaaa|tttaccct 0
2+
[cgt]gggtaaa|tttaccc[acg] 3
3+
a[act]ggtaaa|tttacc[agt]t 9
4+
ag[act]gtaaa|tttac[agt]ct 8
5+
agg[act]taaa|ttta[agt]cct 10
6+
aggg[acg]aaa|ttt[cgt]ccct 3
7+
agggt[cgt]aa|tt[acg]accct 4
8+
agggta[cgt]a|t[acg]taccct 3
9+
agggtaa[cgt]|[acg]ttaccct 5
10+
11+
101745
12+
100000
13+
133640

examples/shootout-regex-dna.rs

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
// The Computer Language Benchmarks Game
2+
// http://benchmarksgame.alioth.debian.org/
3+
//
4+
// contributed by the Rust Project Developers
5+
// contributed by TeXitoi
6+
// contributed by BurntSushi
7+
8+
extern crate regex;
9+
10+
use std::io::{self, Read};
11+
use std::sync::Arc;
12+
use std::thread;
13+
14+
macro_rules! regex { ($re:expr) => { ::regex::Regex::new($re).unwrap() } }
15+
16+
fn main() {
17+
let mut seq = String::with_capacity(10 * (1 << 20));
18+
io::stdin().read_to_string(&mut seq).unwrap();
19+
let ilen = seq.len();
20+
21+
seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "");
22+
let clen = seq.len();
23+
let seq_arc = Arc::new(seq.clone());
24+
25+
let variants = vec![
26+
regex!("agggtaaa|tttaccct"),
27+
regex!("[cgt]gggtaaa|tttaccc[acg]"),
28+
regex!("a[act]ggtaaa|tttacc[agt]t"),
29+
regex!("ag[act]gtaaa|tttac[agt]ct"),
30+
regex!("agg[act]taaa|ttta[agt]cct"),
31+
regex!("aggg[acg]aaa|ttt[cgt]ccct"),
32+
regex!("agggt[cgt]aa|tt[acg]accct"),
33+
regex!("agggta[cgt]a|t[acg]taccct"),
34+
regex!("agggtaa[cgt]|[acg]ttaccct"),
35+
];
36+
let mut counts = vec![];
37+
for variant in variants {
38+
let seq = seq_arc.clone();
39+
let restr = variant.to_string();
40+
let future = thread::spawn(move || variant.find_iter(&seq).count());
41+
counts.push((restr, future));
42+
}
43+
44+
let substs = vec![
45+
(regex!("B"), "(c|g|t)"),
46+
(regex!("D"), "(a|g|t)"),
47+
(regex!("H"), "(a|c|t)"),
48+
(regex!("K"), "(g|t)"),
49+
(regex!("M"), "(a|c)"),
50+
(regex!("N"), "(a|c|g|t)"),
51+
(regex!("R"), "(a|g)"),
52+
(regex!("S"), "(c|g)"),
53+
(regex!("V"), "(a|c|g)"),
54+
(regex!("W"), "(a|t)"),
55+
(regex!("Y"), "(c|t)"),
56+
];
57+
let mut seq = seq;
58+
for (re, replacement) in substs.into_iter() {
59+
seq = re.replace_all(&seq, replacement);
60+
}
61+
let rlen = seq.len();
62+
63+
for (variant, count) in counts {
64+
println!("{} {}", variant, count.join().unwrap());
65+
}
66+
println!("\n{}\n{}\n{}", ilen, clen, rlen);
67+
}

regex_macros/Cargo.toml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,6 @@ plugin = true
1717
path = "tests/test_native.rs"
1818
name = "all"
1919

20-
[[test]]
21-
path = "benches/shootout-regex-dna.rs"
22-
name = "shootout_regex_dna"
23-
2420
[[bench]]
2521
name = "all"
2622
path = "benches/bench_native.rs"

regex_macros/benches/bench.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,13 @@ fn match_class_in_range(b: &mut Bencher) {
5959
bench_assert_match(b, re, &text);
6060
}
6161

62+
#[bench]
63+
fn match_class_unicode(b: &mut Bencher) {
64+
let re = regex!(r"\pL");
65+
let text = format!("{}a", repeat("☃5☃5").take(20).collect::<String>());
66+
bench_assert_match(b, re, &text);
67+
}
68+
6269
#[bench]
6370
fn replace_all(b: &mut Bencher) {
6471
let re = regex!("[cjrw]");
@@ -171,15 +178,19 @@ fn gen_text(n: usize) -> String {
171178
throughput!(easy0_32, easy0(), 32);
172179
throughput!(easy0_1K, easy0(), 1<<10);
173180
throughput!(easy0_32K, easy0(), 32<<10);
181+
throughput!(easy0_1MB, easy0(), 1<<20);
174182

175183
throughput!(easy1_32, easy1(), 32);
176184
throughput!(easy1_1K, easy1(), 1<<10);
177185
throughput!(easy1_32K, easy1(), 32<<10);
186+
throughput!(easy1_1MB, easy1(), 1<<20);
178187

179188
throughput!(medium_32, medium(), 32);
180189
throughput!(medium_1K, medium(), 1<<10);
181190
throughput!(medium_32K,medium(), 32<<10);
191+
throughput!(medium_1MB, medium(), 1<<20);
182192

183193
throughput!(hard_32, hard(), 32);
184194
throughput!(hard_1K, hard(), 1<<10);
185195
throughput!(hard_32K,hard(), 32<<10);
196+
throughput!(hard_1MB, hard(), 1<<20);

0 commit comments

Comments
 (0)