Skip to content

Commit 6f94255

Browse files
committed
impl: initial import of regex-automata
This effectively copies my regex-automata work into this crate and does a bunch of rejiggering to make it work. In particular, we wire up its new test harness to the public regex crate API. In this commit, that means the regex crate API is being simultaneously tested using both the old and new test suites. This does *not* get rid of the old regex crate implementation. That will happen in a subsequent commit. This is just a staging commit to prepare for that.
1 parent 4e29fce commit 6f94255

File tree

201 files changed

+84124
-195
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

201 files changed

+84124
-195
lines changed

.github/workflows/ci.yml

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -149,27 +149,36 @@ jobs:
149149
if: matrix.build == 'stable'
150150
run: |
151151
# 'stable' is Linux only, so we have bash.
152-
cd regex-syntax
153-
./test
152+
./regex-syntax/test
153+
154+
- name: Build regex-automata docs
155+
if: matrix.build != 'pinned'
156+
run: |
157+
${{ env.CARGO }} doc --verbose --manifest-path regex-automata/Cargo.toml $TARGET
158+
159+
- name: Run subset of regex-automata tests
160+
if: matrix.build != 'pinned' && matrix.build != 'stable'
161+
run: |
162+
${{ env.CARGO }} test --verbose --manifest-path regex-automata/Cargo.toml $TARGET
163+
164+
- name: Run full regex-automata test suite
165+
if: matrix.build == 'stable'
166+
run: |
167+
# 'stable' is Linux only, so we have bash.
168+
./regex-automata/test
154169
155170
- name: Run regex-capi tests
156171
if: matrix.build == 'stable'
157172
run: |
158173
# 'stable' is Linux only, so we have bash.
159-
cd regex-capi
160-
./test
174+
./regex-capi/test
161175
162176
- if: matrix.build == 'nightly'
163177
name: Run benchmarks as tests
164178
run: |
165179
cd bench
166180
./run rust --no-run --verbose
167181
168-
- if: matrix.build == 'nightly'
169-
name: Run tests with pattern feature
170-
run: |
171-
cargo test --test default --no-default-features --features 'std pattern unicode-perl'
172-
173182
rustfmt:
174183
name: rustfmt
175184
runs-on: ubuntu-latest

Cargo.toml

Lines changed: 134 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22
name = "regex"
33
version = "1.8.1" #:version
4-
authors = ["The Rust Project Developers"]
4+
authors = ["The Rust Project Developers", "Andrew Gallant <[email protected]>"]
55
license = "MIT OR Apache-2.0"
66
readme = "README.md"
77
repository = "https://github.com/rust-lang/regex"
@@ -19,7 +19,12 @@ rust-version = "1.60.0"
1919

2020
[workspace]
2121
members = [
22-
"bench", "regex-capi", "regex-syntax",
22+
"bench",
23+
"regex-automata",
24+
"regex-capi",
25+
"regex-cli",
26+
"regex-syntax",
27+
"regex-test",
2328
]
2429

2530
[lib]
@@ -42,27 +47,53 @@ default = ["std", "perf", "unicode", "regex-syntax/default"]
4247
# to compile without std, and instead just rely on 'core' and 'alloc' (for
4348
# example). Currently, this isn't supported, and removing the 'std' feature
4449
# will prevent regex from compiling.
45-
std = []
50+
std = [
51+
"aho-corasick?/std",
52+
"memchr?/std",
53+
"regex-automata/std",
54+
"regex-syntax/std",
55+
]
4656
# The 'use_std' feature is DEPRECATED. It will be removed in regex 2. Until
4757
# then, it is an alias for the 'std' feature.
4858
use_std = ["std"]
4959

5060

5161
# PERFORMANCE FEATURES
5262

53-
# Enables all performance features.
54-
perf = ["perf-cache", "perf-dfa", "perf-inline", "perf-literal"]
63+
# Enables all default performance features. Note that this specifically does
64+
# not include perf-dfa-full, because it leads to higher compile times and
65+
# bigger binaries, and the runtime performance improvement is not obviously
66+
# worth it.
67+
perf = [
68+
"perf-cache",
69+
"perf-dfa",
70+
"perf-onepass",
71+
"perf-backtrack",
72+
"perf-inline",
73+
"perf-literal",
74+
]
5575
# Enables fast caching. (If disabled, caching is still used, but is slower.)
5676
# Currently, this feature has no effect. It used to remove the thread_local
5777
# dependency and use a slower internal cache, but now the default cache has
5878
# been improved and thread_local is no longer a dependency at all.
5979
perf-cache = []
6080
# Enables use of a lazy DFA when possible.
61-
perf-dfa = []
81+
perf-dfa = ["regex-automata/hybrid"]
82+
# Enables use of a fully compiled DFA when possible.
83+
perf-dfa-full = ["regex-automata/dfa-build", "regex-automata/dfa-search"]
84+
# Enables use of the one-pass regex matcher, which speeds up capture searches
85+
# even beyond the backtracker.
86+
perf-onepass = ["regex-automata/dfa-onepass"]
87+
# Enables use of a bounded backtracker, which speeds up capture searches.
88+
perf-backtrack = ["regex-automata/nfa-backtrack"]
6289
# Enables aggressive use of inlining.
63-
perf-inline = []
90+
perf-inline = ["regex-automata/perf-inline"]
6491
# Enables literal optimizations.
65-
perf-literal = ["aho-corasick", "memchr"]
92+
perf-literal = [
93+
"dep:aho-corasick",
94+
"dep:memchr",
95+
"regex-automata/perf-literal",
96+
]
6697

6798

6899
# UNICODE DATA FEATURES
@@ -76,22 +107,45 @@ unicode = [
76107
"unicode-perl",
77108
"unicode-script",
78109
"unicode-segment",
110+
"regex-automata/unicode",
79111
"regex-syntax/unicode",
80112
]
81113
# Enables use of the `Age` property, e.g., `\p{Age:3.0}`.
82-
unicode-age = ["regex-syntax/unicode-age"]
114+
unicode-age = [
115+
"regex-automata/unicode-age",
116+
"regex-syntax/unicode-age",
117+
]
83118
# Enables use of a smattering of boolean properties, e.g., `\p{Emoji}`.
84-
unicode-bool = ["regex-syntax/unicode-bool"]
119+
unicode-bool = [
120+
"regex-automata/unicode-bool",
121+
"regex-syntax/unicode-bool",
122+
]
85123
# Enables Unicode-aware case insensitive matching, e.g., `(?i)β`.
86-
unicode-case = ["regex-syntax/unicode-case"]
124+
unicode-case = [
125+
"regex-automata/unicode-case",
126+
"regex-syntax/unicode-case",
127+
]
87128
# Enables Unicode general categories, e.g., `\p{Letter}` or `\pL`.
88-
unicode-gencat = ["regex-syntax/unicode-gencat"]
129+
unicode-gencat = [
130+
"regex-automata/unicode-gencat",
131+
"regex-syntax/unicode-gencat",
132+
]
89133
# Enables Unicode-aware Perl classes corresponding to `\w`, `\s` and `\d`.
90-
unicode-perl = ["regex-syntax/unicode-perl"]
134+
unicode-perl = [
135+
"regex-automata/unicode-perl",
136+
"regex-automata/unicode-word-boundary",
137+
"regex-syntax/unicode-perl",
138+
]
91139
# Enables Unicode scripts and script extensions, e.g., `\p{Greek}`.
92-
unicode-script = ["regex-syntax/unicode-script"]
140+
unicode-script = [
141+
"regex-automata/unicode-script",
142+
"regex-syntax/unicode-script",
143+
]
93144
# Enables Unicode segmentation properties, e.g., `\p{gcb=Extend}`.
94-
unicode-segment = ["regex-syntax/unicode-segment"]
145+
unicode-segment = [
146+
"regex-automata/unicode-segment",
147+
"regex-syntax/unicode-segment",
148+
]
95149

96150

97151
# UNSTABLE FEATURES (requires Rust nightly)
@@ -121,6 +175,13 @@ path = "regex-syntax"
121175
version = "0.7.1"
122176
default-features = false
123177

178+
# For the actual regex engines.
179+
[dependencies.regex-automata]
180+
path = "regex-automata"
181+
version = "0.3.0"
182+
default-features = false
183+
features = ["alloc", "syntax", "meta", "nfa-pikevm"]
184+
124185
[dev-dependencies]
125186
# For examples.
126187
lazy_static = "1"
@@ -129,10 +190,39 @@ quickcheck = { version = "1.0.3", default-features = false }
129190
# For generating random test data.
130191
rand = { version = "0.8.3", default-features = false, features = ["getrandom", "small_rng"] }
131192
# To check README's example
132-
# TODO: Re-enable this once the MSRV is 1.43 or greater.
133-
# See: https://github.com/rust-lang/regex/issues/684
134-
# See: https://github.com/rust-lang/regex/issues/685
135-
# doc-comment = "0.3"
193+
doc-comment = "0.3"
194+
# For easy error handling in integration tests.
195+
anyhow = "1.0.69"
196+
# A library for testing regex engines.
197+
regex-test = { path = "regex-test", version = "0.1.0" }
198+
199+
[dev-dependencies.env_logger]
200+
# Note that this is currently using an older version because of the dependency
201+
# tree explosion that happened in 0.10.
202+
version = "0.9.3"
203+
default-features = false
204+
features = ["atty", "humantime", "termcolor"]
205+
206+
# This test suite reads a whole boatload of tests from the top-level testdata
207+
# directory, and then runs them against the regex crate API.
208+
#
209+
# regex-automata has its own version of them, and runs them against each
210+
# internal regex engine individually.
211+
#
212+
# This means that if you're seeing a failure in this test suite, you should
213+
# try running regex-automata's tests:
214+
#
215+
# cargo test --manifest-path regex-automata/Cargo.toml --test integration
216+
#
217+
# That *might* give you a more targeted test failure. i.e., "only the
218+
# PikeVM fails this test." Which gives you a narrower place to search. If
219+
# regex-automata's test suite passes, then the bug might be in the integration
220+
# of the regex crate and regex-automata. But generally speaking, a failure
221+
# in this test suite *should* mean there is a corresponding failure in
222+
# regex-automata's test suite.
223+
[[test]]
224+
path = "newtests/tests.rs"
225+
name = "integration"
136226

137227
# Run the test suite on the default behavior of Regex::new.
138228
# This includes a mish mash of NFAs and DFAs, which are chosen automatically
@@ -185,11 +275,36 @@ name = "backtrack-bytes"
185275
path = "tests/test_crates_regex.rs"
186276
name = "crates-regex"
187277

278+
[package.metadata.docs.rs]
279+
# We want to document all features.
280+
all-features = true
281+
# Since this crate's feature setup is pretty complicated, it is worth opting
282+
# into a nightly unstable option to show the features that need to be enabled
283+
# for public API items. To do that, we set 'docsrs', and when that's enabled,
284+
# we enable the 'doc_auto_cfg' feature.
285+
#
286+
# To test this locally, run:
287+
#
288+
# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features
289+
rustdoc-args = ["--cfg", "docsrs"]
290+
188291
[profile.release]
189292
debug = true
190293

191294
[profile.bench]
192295
debug = true
193296

297+
[profile.dev]
298+
# Running tests takes too long in debug mode, so we forcefully always build
299+
# with optimizations. Unfortunate, but, ¯\_(ツ)_/¯.
300+
#
301+
# It's counter-intuitive that this needs to be set on dev *and* test, but
302+
# it's because the tests that take a long time to run are run as integration
303+
# tests in a separate crate. The test.opt-level setting won't apply there, so
304+
# we need to set the opt-level across the entire build.
305+
opt-level = 3
306+
debug = true
307+
194308
[profile.test]
309+
opt-level = 3
195310
debug = true

newtests/bytes.rs

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
use {
2+
anyhow::Result,
3+
regex::bytes::{Regex, RegexBuilder},
4+
regex_test::{
5+
CompiledRegex, Match, RegexTest, Span, TestResult, TestRunner,
6+
},
7+
};
8+
9+
/// Runs the shared regex test suite against the default configuration of
/// `regex::bytes::Regex`.
///
/// Every test that is expected to compile is expanded into three variants
/// named `is_match`, `find` and `captures`. `super::BLACKLIST` is handed to
/// the runner's blacklist, and each remaining test is compiled via
/// `compiler` and then executed.
#[test]
fn default() -> Result<()> {
    let mut runner = TestRunner::new()?;
    runner
        .expand(&["is_match", "find", "captures"], |test| test.compiles())
        .blacklist_iter(super::BLACKLIST)
        .test_iter(crate::suite()?.iter(), compiler)
        .assert();
    Ok(())
}
20+
21+
fn run_test(re: &Regex, test: &RegexTest) -> TestResult {
22+
match test.additional_name() {
23+
"is_match" => TestResult::matched(re.is_match(test.haystack())),
24+
"find" => TestResult::matches(
25+
re.find_iter(test.haystack())
26+
.take(test.match_limit().unwrap_or(std::usize::MAX))
27+
.map(|m| Match {
28+
id: 0,
29+
span: Span { start: m.start(), end: m.end() },
30+
}),
31+
),
32+
"captures" => {
33+
let it = re
34+
.captures_iter(test.haystack())
35+
.take(test.match_limit().unwrap_or(std::usize::MAX))
36+
.map(|caps| testify_captures(&caps));
37+
TestResult::captures(it)
38+
}
39+
name => TestResult::fail(&format!("unrecognized test name: {}", name)),
40+
}
41+
}
42+
43+
/// Converts the given regex test to a closure that searches with a
44+
/// `bytes::Regex`. If the test configuration is unsupported, then a
45+
/// `CompiledRegex` that skips the test is returned.
46+
fn compiler(
47+
test: &RegexTest,
48+
_patterns: &[String],
49+
) -> anyhow::Result<CompiledRegex> {
50+
let skip = Ok(CompiledRegex::skip());
51+
52+
// We're only testing bytes::Regex here, which supports one pattern only.
53+
let pattern = match test.regexes().len() {
54+
1 => &test.regexes()[0],
55+
_ => return skip,
56+
};
57+
// We only test is_match, find_iter and captures_iter. All of those are
58+
// leftmost searches.
59+
if !matches!(test.search_kind(), regex_test::SearchKind::Leftmost) {
60+
return skip;
61+
}
62+
// The top-level single-pattern regex API always uses leftmost-first.
63+
if !matches!(test.match_kind(), regex_test::MatchKind::LeftmostFirst) {
64+
return skip;
65+
}
66+
// The top-level regex API always runs unanchored searches.
67+
if test.anchored() {
68+
return skip;
69+
}
70+
// We don't support tests with explicit search bounds. We could probably
71+
// support this by using the 'find_at' (and such) APIs.
72+
let bounds = test.bounds();
73+
if !(bounds.start == 0 && bounds.end == test.haystack().len()) {
74+
return skip;
75+
}
76+
// The bytes::Regex API specifically does not support enabling UTF-8 mode.
77+
// It could I suppose, but currently it does not. That is, it permits
78+
// matches to have offsets that split codepoints.
79+
if test.utf8() {
80+
return skip;
81+
}
82+
let re = RegexBuilder::new(pattern)
83+
.case_insensitive(test.case_insensitive())
84+
.unicode(test.unicode())
85+
.build()?;
86+
Ok(CompiledRegex::compiled(move |test| run_test(&re, test)))
87+
}
88+
89+
/// Convert `Captures` into the test suite's capture values.
90+
fn testify_captures(
91+
caps: &regex::bytes::Captures<'_>,
92+
) -> regex_test::Captures {
93+
let spans = caps.iter().map(|group| {
94+
group.map(|m| regex_test::Span { start: m.start(), end: m.end() })
95+
});
96+
// This unwrap is OK because we assume our 'caps' represents a match, and
97+
// a match always gives a non-zero number of groups with the first group
98+
// being non-None.
99+
regex_test::Captures::new(0, spans).unwrap()
100+
}

0 commit comments

Comments
 (0)