diff --git a/.travis.yml b/.travis.yml index 9eed600032..7c9a99a1c1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,6 @@ language: rust rust: - - 1.3.0 + - 1.12.0 - stable - beta - nightly diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b5fc7a206..5c7ecfcfe4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,101 @@ +0.2.0 +===== +This is a new major release of the regex crate, and is an implementation of the +[regex 1.0 RFC](https://github.com/rust-lang/rfcs/blob/master/text/1620-regex-1.0.md). +We are releasing a `0.2` first, and if there are no major problems, we will +release a `1.0` shortly. For `0.2`, the minimum *supported* Rust version is +1.12. + +There are a number of **breaking changes** in `0.2`. They are split into two +types. The first type corresponds to breaking changes in regular expression +syntax. The second type corresponds to breaking changes in the API. + +Breaking changes for regex syntax: + +* POSIX character classes now require double bracketing. Previously, the regex + `[:upper:]` would parse as the `upper` POSIX character class. Now it parses + as the character class containing the characters `:upper:`. The fix to this + change is to use `[[:upper:]]` instead. Note that variants like + `[[:upper:][:blank:]]` continue to work. +* The character `[` must always be escaped inside a character class. +* The characters `&`, `-` and `~` must be escaped if any one of them is + repeated consecutively. For example, `[&]`, `[\&]`, `[\&\&]`, `[&-&]` are all + equivalent while `[&&]` is illegal. (The motivation for this and the prior + change is to provide a backwards compatible path for adding character class + set notation.) +* A `bytes::Regex` now has Unicode mode enabled by default (like the main + `Regex` type). This means regexes compiled with `bytes::Regex::new` that + don't have the Unicode flag set should add `(?-u)` to recover the original + behavior. + +Breaking changes for the regex API: + +* `find` and `find_iter` now **return `Match` values instead of + `(usize, usize)`.** `Match` values have `start` and `end` methods, which + return the match offsets. `Match` values also have an `as_str` method, + which returns the text of the match itself. +* The `Captures` type now only provides a single iterator over all capturing + matches, which should replace uses of `iter` and `iter_pos`. Uses of + `iter_named` should use the `capture_names` method on `Regex`. +* The `at` method on the `Captures` type has been renamed to `get`, and it + now returns a `Match`. Similarly, the `name` method on `Captures` now returns + a `Match`. +* The `replace` methods now return `Cow` values. The `Cow::Borrowed` variant + is returned when no replacements are made. +* The `Replacer` trait has been completely overhauled. This should only + impact clients that implement this trait explicitly. Standard uses of + the `replace` methods should continue to work unchanged. If you implement + the `Replacer` trait, please consult the new documentation. +* The `quote` free function has been renamed to `escape`. +* The `Regex::with_size_limit` method has been removed. It is replaced by + `RegexBuilder::size_limit`. +* The `RegexBuilder` type has switched from owned `self` method receivers to + `&mut self` method receivers. Most uses will continue to work unchanged, but + some code may require naming an intermediate variable to hold the builder. +* The free `is_match` function has been removed. It is replaced by compiling + a `Regex` and calling its `is_match` method.
+* The `PartialEq` and `Eq` impls on `Regex` have been dropped. If you relied + on these impls, the fix is to define a wrapper type around `Regex`, impl + `Deref` on it and provide the necessary impls. +* The `is_empty` method on `Captures` has been removed. This always returns + `false`, so its use is superfluous. +* The `Syntax` variant of the `Error` type now contains a string instead of + a `regex_syntax::Error`. If you were examining syntax errors more closely, + you'll need to explicitly use the `regex_syntax` crate to re-parse the regex. +* The `InvalidSet` variant of the `Error` type has been removed since it is + no longer used. +* Most of the iterator types have been renamed to match conventions. If you + were using these iterator types explicitly, please consult the documentation + for their new names. For example, `RegexSplits` has been renamed to `Split`. + (A short migration sketch covering the new search and replacement APIs is + shown below.) + +A number of bugs have been fixed: + +* [BUG #151](https://github.com/rust-lang/regex/issues/151): + The `Replacer` trait has been changed to permit the caller to control + allocation. +* [BUG #165](https://github.com/rust-lang/regex/issues/165): + Remove the free `is_match` function. +* [BUG #166](https://github.com/rust-lang/regex/issues/166): + Expose more knobs (available in `0.1`) and remove `with_size_limit`. +* [BUG #168](https://github.com/rust-lang/regex/issues/168): + Iterators produced by `Captures` now have the correct lifetime parameters. +* [BUG #175](https://github.com/rust-lang/regex/issues/175): + Fix a corner case in the parsing of POSIX character classes. +* [BUG #178](https://github.com/rust-lang/regex/issues/178): + Drop the `PartialEq` and `Eq` impls on `Regex`. +* [BUG #179](https://github.com/rust-lang/regex/issues/179): + Remove `is_empty` from `Captures` since it always returns false. +* [BUG #276](https://github.com/rust-lang/regex/issues/276): + The position of a named capture can now be retrieved from a `Captures`. +* [BUG #296](https://github.com/rust-lang/regex/issues/296): + Remove winapi/kernel32-sys dependency on UNIX. +* [BUG #307](https://github.com/rust-lang/regex/issues/307): + Fix error on emscripten. + + 0.1.80 ====== -* [PR #292](https://github.com/rust-lang-nursery/regex/pull/292): +* [PR #292](https://github.com/rust-lang/regex/pull/292): Fixes bug #291, which was introduced by PR #290. 0.1.79 @@ -9,13 +104,13 @@ 0.1.78 ====== -* [PR #290](https://github.com/rust-lang-nursery/regex/pull/290): +* [PR #290](https://github.com/rust-lang/regex/pull/290): Fixes bug #289, which caused some regexes with a certain combination of literals to match incorrectly. 0.1.77 ====== -* [PR #281](https://github.com/rust-lang-nursery/regex/pull/281): +* [PR #281](https://github.com/rust-lang/regex/pull/281): Fixes bug #280 by disabling all literal optimizations when a pattern is partially anchored. @@ -25,9 +120,9 @@ 0.1.75 ====== -* [PR #275](https://github.com/rust-lang-nursery/regex/pull/275): +* [PR #275](https://github.com/rust-lang/regex/pull/275): Improves match verification performance in the Teddy SIMD searcher. -* [PR #278](https://github.com/rust-lang-nursery/regex/pull/278): +* [PR #278](https://github.com/rust-lang/regex/pull/278): Replaces slow substring loop in the Teddy SIMD searcher with Aho-Corasick.
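For users upgrading from `0.1`, the following sketch shows how the new search
and replacement APIs fit together. It is illustrative only: the pattern, the
capture names `y`/`m`/`d` and the expected values are taken from the crate's
own README and documentation examples rather than being required by the API.

```rust
extern crate regex;

use regex::Regex;

fn main() {
    let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap();
    let text = "2012-03-14, 2013-01-01 and 2014-07-05";

    // `find` now returns an `Option<Match>` carrying byte offsets and the
    // matched text, instead of `Option<(usize, usize)>`.
    let m = re.find(text).unwrap();
    assert_eq!((m.start(), m.end(), m.as_str()), (0, 10, "2012-03-14"));

    // `Captures::at` is now `get` and returns a `Match`; named and numbered
    // groups can also be read by indexing into the `Captures` value.
    let caps = re.captures(text).unwrap();
    assert_eq!("2012", caps.get(1).unwrap().as_str());
    assert_eq!("2012", &caps["y"]);

    // The `replace` methods now return `Cow<str>`; call `into_owned()` when
    // an owned `String` is required.
    let replaced = re.replace_all(text, "$m/$d/$y").into_owned();
    assert_eq!("03/14/2012, 01/01/2013 and 07/05/2014", replaced);
}
```

The equivalent accessors exist on the `bytes::Regex` side (`as_bytes` in place
of `as_str`), as shown in the updated README example for null-terminated
strings.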
* Implemented DoubleEndedIterator on regex set match iterators. @@ -36,7 +131,7 @@ * Release regex-syntax 0.3.5 with a minor bug fix. * Fix bug #272. * Fix bug #277. -* [PR #270](https://github.com/rust-lang-nursery/regex/pull/270): +* [PR #270](https://github.com/rust-lang/regex/pull/270): Fixes bugs #264, #268 and an unreported where the DFA cache size could be drastically under estimated in some cases (leading to high unexpected memory usage). @@ -48,55 +143,55 @@ 0.1.72 ====== -* [PR #262](https://github.com/rust-lang-nursery/regex/pull/262): +* [PR #262](https://github.com/rust-lang/regex/pull/262): Fixes a number of small bugs caught by fuzz testing (AFL). 0.1.71 ====== -* [PR #236](https://github.com/rust-lang-nursery/regex/pull/236): +* [PR #236](https://github.com/rust-lang/regex/pull/236): Fix a bug in how suffix literals were extracted, which could lead to invalid match behavior in some cases. 0.1.70 ====== -* [PR #231](https://github.com/rust-lang-nursery/regex/pull/231): +* [PR #231](https://github.com/rust-lang/regex/pull/231): Add SIMD accelerated multiple pattern search. -* [PR #228](https://github.com/rust-lang-nursery/regex/pull/228): +* [PR #228](https://github.com/rust-lang/regex/pull/228): Reintroduce the reverse suffix literal optimization. -* [PR #226](https://github.com/rust-lang-nursery/regex/pull/226): +* [PR #226](https://github.com/rust-lang/regex/pull/226): Implements NFA state compression in the lazy DFA. -* [PR #223](https://github.com/rust-lang-nursery/regex/pull/223): +* [PR #223](https://github.com/rust-lang/regex/pull/223): A fully anchored RegexSet can now short-circuit. 0.1.69 ====== -* [PR #216](https://github.com/rust-lang-nursery/regex/pull/216): +* [PR #216](https://github.com/rust-lang/regex/pull/216): Tweak the threshold for running backtracking. -* [PR #217](https://github.com/rust-lang-nursery/regex/pull/217): +* [PR #217](https://github.com/rust-lang/regex/pull/217): Add upper limit (from the DFA) to capture search (for the NFA). -* [PR #218](https://github.com/rust-lang-nursery/regex/pull/218): +* [PR #218](https://github.com/rust-lang/regex/pull/218): Add rure, a C API. 0.1.68 ====== -* [PR #210](https://github.com/rust-lang-nursery/regex/pull/210): +* [PR #210](https://github.com/rust-lang/regex/pull/210): Fixed a performance bug in `bytes::Regex::replace` where `extend` was used instead of `extend_from_slice`. -* [PR #211](https://github.com/rust-lang-nursery/regex/pull/211): +* [PR #211](https://github.com/rust-lang/regex/pull/211): Fixed a bug in the handling of word boundaries in the DFA. -* [PR #213](https://github.com/rust-lang-nursery/regex/pull/213): +* [PR #213](https://github.com/rust-lang/regex/pull/213): Added RE2 and Tcl to the benchmark harness. Also added a CLI utility from running regexes using any of the following regex engines: PCRE1, PCRE2, Oniguruma, RE2, Tcl and of course Rust's own regexes. 0.1.67 ====== -* [PR #201](https://github.com/rust-lang-nursery/regex/pull/201): +* [PR #201](https://github.com/rust-lang/regex/pull/201): Fix undefined behavior in the `regex!` compiler plugin macro.
-* [PR #205](https://github.com/rust-lang-nursery/regex/pull/205): +* [PR #205](https://github.com/rust-lang/regex/pull/205): More improvements to DFA performance. Competitive with RE2. See PR for benchmarks. -* [PR #209](https://github.com/rust-lang-nursery/regex/pull/209): +* [PR #209](https://github.com/rust-lang/regex/pull/209): Release 0.1.66 was semver incompatible since it required a newer version of Rust than previous releases. This PR fixes that. (And `0.1.66` was yanked.) @@ -110,11 +205,11 @@ complexity. It was replaced with a more limited optimization where, given any regex of the form `re$`, it will be matched in reverse from the end of the haystack. -* [PR #202](https://github.com/rust-lang-nursery/regex/pull/202): +* [PR #202](https://github.com/rust-lang/regex/pull/202): The inner loop of the DFA was heavily optimized to improve cache locality and reduce the overall number of instructions run on each iteration. This represents the first use of `unsafe` in `regex` (to elide bounds checks). -* [PR #200](https://github.com/rust-lang-nursery/regex/pull/200): +* [PR #200](https://github.com/rust-lang/regex/pull/200): Use of the `mempool` crate (which used thread local storage) was replaced with a faster version of a similar API in @Amanieu's `thread_local` crate. It should reduce contention when using a regex from multiple threads @@ -124,5 +219,5 @@ (Includes a comparison with PCRE1's JIT and Oniguruma.) * A bug where word boundaries weren't being matched correctly in the DFA was fixed. This only affected use of `bytes::Regex`. -* [#160](https://github.com/rust-lang-nursery/regex/issues/160): +* [#160](https://github.com/rust-lang/regex/issues/160): `Captures` now has a `Debug` impl. diff --git a/Cargo.toml b/Cargo.toml index ab1680cdf8..e115931aef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "0.1.80" #:version +version = "0.2.0" #:version authors = ["The Rust Project Developers"] license = "MIT/Apache-2.0" readme = "README.md" @@ -16,23 +16,23 @@ finite automata and guarantees linear time matching on all inputs. # For very fast prefix literal matching. aho-corasick = "0.5.3" # For skipping along search text quickly when a leading byte is known. -memchr = "0.1.9" +memchr = "1" # For managing regex caches quickly across multiple threads. -thread_local = "0.2.4" +thread_local = "0.3.2" # For parsing regular expressions. -regex-syntax = { path = "regex-syntax", version = "0.3.8" } +regex-syntax = { path = "regex-syntax", version = "0.4.0" } # For accelerating text search. simd = { version = "0.1.0", optional = true } # For compiling UTF-8 decoding into automata. -utf8-ranges = "0.1.3" +utf8-ranges = "1" [dev-dependencies] # For examples. -lazy_static = "0.1" +lazy_static = "0.2.2" # For property based tests. -quickcheck = "0.2" +quickcheck = "0.4.1" # For generating random test data. -rand = "0.3" +rand = "0.3.15" [features] # Enable to use the unstable pattern traits defined in std. diff --git a/README.md b/README.md index 62c2b63328..38d8ddd1e2 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,15 @@ regex ===== - -A Rust library for parsing, compiling, and executing regular expressions. -This particular implementation of regular expressions guarantees execution -in linear time with respect to the size of the regular expression and -search text by using finite automata. 
In particular, it makes use of both -NFAs and DFAs when matching. Much of the syntax and implementation is inspired +A Rust library for parsing, compiling, and executing regular expressions. Its +syntax is similar to Perl-style regular expressions, but lacks a few features +like look around and backreferences. In exchange, all searches execute in +linear time with respect to the size of the regular expression and search text. +Much of the syntax and implementation is inspired by [RE2](https://github.com/google/re2). -[![Build Status](https://travis-ci.org/rust-lang-nursery/regex.svg?branch=master)](https://travis-ci.org/rust-lang-nursery/regex) -[![Build status](https://ci.appveyor.com/api/projects/status/22g48bo866qr4u77?svg=true)](https://ci.appveyor.com/project/alexcrichton/regex) -[![Coverage Status](https://coveralls.io/repos/github/rust-lang-nursery/regex/badge.svg?branch=master)](https://coveralls.io/github/rust-lang-nursery/regex?branch=master) +[![Build Status](https://travis-ci.org/rust-lang/regex.svg?branch=master)](https://travis-ci.org/rust-lang/regex) +[![Build status](https://ci.appveyor.com/api/projects/status/github/rust-lang/regex?svg=true)](https://ci.appveyor.com/project/rust-lang-libs/regex) +[![Coverage Status](https://coveralls.io/repos/github/rust-lang/regex/badge.svg?branch=master)](https://coveralls.io/github/rust-lang/regex?branch=master) [![](http://meritbadge.herokuapp.com/regex)](https://crates.io/crates/regex) ### Documentation @@ -29,7 +28,7 @@ Add this to your `Cargo.toml`: ```toml [dependencies] -regex = "0.1" +regex = "0.2" ``` and this to your crate root: @@ -56,9 +55,9 @@ fn main() { ").unwrap(); let caps = re.captures("2010-03-14").unwrap(); - assert_eq!("2010", caps.name("year").unwrap()); - assert_eq!("03", caps.name("month").unwrap()); - assert_eq!("14", caps.name("day").unwrap()); + assert_eq!("2010", caps["year"]); + assert_eq!("03", caps["month"]); + assert_eq!("14", caps["day"]); } ``` @@ -82,9 +81,9 @@ fn main() { // because the only way for the regex to match is if all of the // capture groups match. This is not true in general though! println!("year: {}, month: {}, day: {}", - caps.at(1).unwrap(), - caps.at(2).unwrap(), - caps.at(3).unwrap()); + caps.get(1).unwrap().as_str(), + caps.get(2).unwrap().as_str(), + caps.get(3).unwrap().as_str()); } } ``` @@ -137,8 +136,8 @@ means the main API can't be used for searching arbitrary bytes. To match on arbitrary bytes, use the `regex::bytes::Regex` API. The API is identical to the main API, except that it takes an `&[u8]` to search on instead of an `&str`. By default, `.` will match any *byte* using -`regex::bytes::Regex`, while `.` will match any encoded Unicode *codepoint* -using the main API. +`regex::bytes::Regex`, while `.` will match any *UTF-8 encoded Unicode scalar +value* using the main API. This example shows how to find all null-terminated strings in a slice of bytes: @@ -152,7 +151,7 @@ let text = b"foo\x00bar\x00baz\x00"; // The unwrap is OK here since a match requires the `cstr` capture to match. 
let cstrs: Vec<&[u8]> = re.captures_iter(text) - .map(|c| c.name("cstr").unwrap()) + .map(|c| c.name("cstr").unwrap().as_bytes()) .collect(); assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs); ``` @@ -211,9 +210,9 @@ fn main() { let re = regex!(r"(\d{4})-(\d{2})-(\d{2})"); let caps = re.captures("2010-03-14").unwrap(); - assert_eq!("2010", caps.at(1).unwrap()); - assert_eq!("03", caps.at(2).unwrap()); - assert_eq!("14", caps.at(3).unwrap()); + assert_eq!("2010", caps[1]); + assert_eq!("03", caps[2]); + assert_eq!("14", caps[3]); } ``` diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 05654e072c..0bd686762a 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -17,9 +17,9 @@ libc = "0.2" onig = { version = "0.4", optional = true } libpcre-sys = { version = "0.2", optional = true } memmap = "0.2" -regex = { version = "0.1", path = "..", features = ["simd-accel"] } -regex_macros = { version = "0.1", path = "../regex_macros", optional = true } -regex-syntax = { version = "0.3", path = "../regex-syntax" } +regex = { version = "0.2.0", path = "..", features = ["simd-accel"] } +regex_macros = { version = "0.2.0", path = "../regex_macros", optional = true } +regex-syntax = { version = "0.4.0", path = "../regex-syntax" } rustc-serialize = "0.3" [build-dependencies] diff --git a/bench/src/bench.rs b/bench/src/bench.rs index 9c8a924746..a45079edc0 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -59,22 +59,11 @@ pub use ffi::tcl::Regex; // Due to macro scoping rules, this definition only applies for the modules // defined below. Effectively, it allows us to use the same tests for both // native and dynamic regexes. -#[cfg(not(feature = "re-rust-bytes"))] #[cfg(not(feature = "re-rust-plugin"))] macro_rules! regex { ($re:expr) => { ::Regex::new(&$re.to_owned()).unwrap() } } -#[cfg(feature = "re-rust-bytes")] -macro_rules! regex { - ($re:expr) => {{ - // Always enable the Unicode flag for byte based regexes. - // Really, this should have been enabled by default. *sigh* - use regex::bytes::RegexBuilder; - RegexBuilder::new(&$re.to_owned()).unicode(true).compile().unwrap() - }} -} - // Usage: text!(haystack) // // Builds a ::Text from an owned string. 
diff --git a/examples/shootout-regex-dna-bytes.rs b/examples/shootout-regex-dna-bytes.rs index 3b120260c0..ec57157c8e 100644 --- a/examples/shootout-regex-dna-bytes.rs +++ b/examples/shootout-regex-dna-bytes.rs @@ -18,7 +18,7 @@ fn main() { io::stdin().read_to_end(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, &b""[..]); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, &b""[..]).into_owned(); let clen = seq.len(); let seq_arc = Arc::new(seq.clone()); @@ -56,7 +56,7 @@ fn main() { ]; let mut seq = seq; for (re, replacement) in substs.into_iter() { - seq = re.replace_all(&seq, replacement); + seq = re.replace_all(&seq, replacement).into_owned(); } for (variant, count) in counts { diff --git a/examples/shootout-regex-dna-cheat.rs b/examples/shootout-regex-dna-cheat.rs index 57583218ba..a421d20853 100644 --- a/examples/shootout-regex-dna-cheat.rs +++ b/examples/shootout-regex-dna-cheat.rs @@ -23,7 +23,7 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); let clen = seq.len(); let seq_arc = Arc::new(seq.clone()); @@ -78,10 +78,10 @@ fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String { let re = regex!(&alternates.join("|")); let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (s, e) in re.find_iter(text) { - new.push_str(&text[last_match..s]); - new.push_str(replacements[text.as_bytes()[s] as usize]); - last_match = e; + for m in re.find_iter(text) { + new.push_str(&text[last_match..m.start()]); + new.push_str(replacements[text.as_bytes()[m.start()] as usize]); + last_match = m.end(); } new.push_str(&text[last_match..]); new diff --git a/examples/shootout-regex-dna-replace.rs b/examples/shootout-regex-dna-replace.rs index a3319ad29d..857d8bfcd7 100644 --- a/examples/shootout-regex-dna-replace.rs +++ b/examples/shootout-regex-dna-replace.rs @@ -14,6 +14,6 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); println!("original: {}, replaced: {}", ilen, seq.len()); } diff --git a/examples/shootout-regex-dna-single-cheat.rs b/examples/shootout-regex-dna-single-cheat.rs index fbf464202f..64d210499d 100644 --- a/examples/shootout-regex-dna-single-cheat.rs +++ b/examples/shootout-regex-dna-single-cheat.rs @@ -16,7 +16,7 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); let clen = seq.len(); let variants = vec![ @@ -63,10 +63,10 @@ fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String { let re = regex!(&alternates.join("|")); let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (s, e) in re.find_iter(text) { - new.push_str(&text[last_match..s]); - new.push_str(replacements[text.as_bytes()[s] as usize]); - last_match = e; + for m in re.find_iter(text) { + new.push_str(&text[last_match..m.start()]); + new.push_str(replacements[text.as_bytes()[m.start()] as usize]); + last_match = m.end(); } new.push_str(&text[last_match..]); new diff --git a/examples/shootout-regex-dna-single.rs b/examples/shootout-regex-dna-single.rs index 58eada712f..a84bc63c12 100644 --- a/examples/shootout-regex-dna-single.rs +++ b/examples/shootout-regex-dna-single.rs 
@@ -16,7 +16,7 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); let clen = seq.len(); let variants = vec![ @@ -49,7 +49,7 @@ fn main() { ]; let mut seq = seq; for (re, replacement) in substs.into_iter() { - seq = re.replace_all(&seq, replacement); + seq = re.replace_all(&seq, replacement).into_owned(); } println!("\n{}\n{}\n{}", ilen, clen, seq.len()); } diff --git a/examples/shootout-regex-dna.rs b/examples/shootout-regex-dna.rs index d66b4fdf06..ec0060d7f4 100644 --- a/examples/shootout-regex-dna.rs +++ b/examples/shootout-regex-dna.rs @@ -18,7 +18,7 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); let clen = seq.len(); let seq_arc = Arc::new(seq.clone()); @@ -56,7 +56,7 @@ fn main() { ]; let mut seq = seq; for (re, replacement) in substs.into_iter() { - seq = re.replace_all(&seq, replacement); + seq = re.replace_all(&seq, replacement).into_owned(); } for (variant, count) in counts { diff --git a/regex-capi/Cargo.toml b/regex-capi/Cargo.toml index 8b5c8ae1bf..6dd59b6f65 100644 --- a/regex-capi/Cargo.toml +++ b/regex-capi/Cargo.toml @@ -1,11 +1,11 @@ [package] name = "rure" -version = "0.1.1" #:version +version = "0.2.0" #:version authors = ["The Rust Project Developers"] license = "MIT/Apache-2.0" readme = "README.md" repository = "https://github.com/rust-lang/regex" -documentation = "https://github.com/rust-lang-nursery/regex/tree/master/regex-capi" +documentation = "https://github.com/rust-lang/regex/tree/master/regex-capi" homepage = "https://github.com/rust-lang/regex" description = """ A C API for Rust's regular expression library. @@ -17,4 +17,4 @@ crate-type = ["staticlib", "cdylib"] [dependencies] libc = "0.2" -regex = { version = "0.1.77", path = ".." } +regex = { version = "0.2.0", path = ".." } diff --git a/regex-capi/README.md b/regex-capi/README.md index 85d5b0199c..c934895fbe 100644 --- a/regex-capi/README.md +++ b/regex-capi/README.md @@ -24,7 +24,7 @@ Assuming you have (and a C compiler), then this should work to run the `iter` example: ``` -$ git clone git://github.com/rust-lang-nursery/regex +$ git clone git://github.com/rust-lang/regex $ cd regex/regex-capi/examples $ ./compile $ LD_LIBRARY_PATH=../target/release ./iter @@ -42,7 +42,7 @@ All memory usage is bounded and all searching takes linear time with respect to the input string. 
For more details, see the PERFORMANCE guide: -https://github.com/rust-lang-nursery/regex/blob/master/PERFORMANCE.md +https://github.com/rust-lang/regex/blob/master/PERFORMANCE.md Text encoding diff --git a/regex-capi/ctest/test.c b/regex-capi/ctest/test.c index 40560620f0..3a06319820 100644 --- a/regex-capi/ctest/test.c +++ b/regex-capi/ctest/test.c @@ -200,8 +200,8 @@ bool test_iter_capture_name(char *expect, char *given) { if (strcmp(expect, given)) { if (DEBUG) { fprintf(stderr, - "[test_iter_capture_name] expected first capture name '%s' " - "got '%s'\n", + "[test_iter_capture_name] expected first capture " + "name '%s' got '%s'\n", expect, given); } passed = false; @@ -213,14 +213,16 @@ bool test_iter_capture_names() { bool passed = true; char *name; - rure *re = rure_compile_must("(?P\\d{4})-(?P\\d{2})-(?P\\d{2})"); + rure *re = rure_compile_must( + "(?P\\d{4})-(?P\\d{2})-(?P\\d{2})"); rure_iter_capture_names *it = rure_iter_capture_names_new(re); bool result = rure_iter_capture_names_next(it, &name); if (!result) { if (DEBUG) { fprintf(stderr, - "[test_iter_capture_names] expected a second name, but got none\n"); + "[test_iter_capture_names] expected a second name, " + "but got none\n"); } passed = false; goto done; @@ -426,14 +428,14 @@ bool test_regex_set_match_start() { goto done1; } - if (rure_set_is_match(re, (const uint8_t *) "foobiasdr", 7, 2)) { + if (rure_set_is_match(re, (const uint8_t *)"foobiasdr", 7, 2)) { passed = false; goto done1; } { bool matches[PAT_COUNT]; - if (!rure_set_matches(re, (const uint8_t *) "fooobar", 8, 0, matches)) { + if (!rure_set_matches(re, (const uint8_t *)"fooobar", 8, 0, matches)) { passed = false; goto done1; } @@ -453,7 +455,7 @@ bool test_regex_set_match_start() { { bool matches[PAT_COUNT]; - if (!rure_set_matches(re, (const uint8_t *) "fooobar", 7, 1, matches)) { + if (!rure_set_matches(re, (const uint8_t *)"fooobar", 7, 1, matches)) { passed = false; goto done1; } @@ -490,8 +492,8 @@ bool test_regex_set_options() { const char *patterns[] = { "\\w{100}" }; const size_t patterns_lengths[] = { 8 }; - rure_set *re = rure_compile_set((const uint8_t **) patterns, patterns_lengths, - 1, 0, opts, err); + rure_set *re = rure_compile_set( + (const uint8_t **) patterns, patterns_lengths, 1, 0, opts, err); if (re != NULL) { if (DEBUG) { fprintf(stderr, @@ -540,7 +542,8 @@ int main() { &passed); run_test(test_regex_set_matches, "test_regex_set_match", &passed); run_test(test_regex_set_options, "test_regex_set_options", &passed); - run_test(test_regex_set_match_start, "test_regex_set_match_start", &passed); + run_test(test_regex_set_match_start, "test_regex_set_match_start", + &passed); if (!passed) { exit(1); diff --git a/regex-capi/src/rure.rs b/regex-capi/src/rure.rs index 3350a37c6e..0bbd214e3f 100644 --- a/regex-capi/src/rure.rs +++ b/regex-capi/src/rure.rs @@ -1,17 +1,22 @@ -use ::error::{Error, ErrorKind}; +use std::collections::HashMap; +use std::ops::Deref; +use std::ffi::{CStr, CString}; +use std::ptr; +use std::str; +use std::slice; -use ::regex::bytes; -use ::regex::internal::{Exec, ExecBuilder, RegexOptions}; -use ::regex::internal::RegularExpression; -use ::libc::{c_char, size_t}; +use libc::{c_char, size_t}; +use regex::bytes; -use ::std::collections::HashMap; -use ::std::ops::Deref; -use ::std::ffi::{CStr, CString}; -use ::std::ptr; -use ::std::str; -use ::std::slice; +use error::{Error, ErrorKind}; +const RURE_FLAG_CASEI: u32 = 1 << 0; +const RURE_FLAG_MULTI: u32 = 1 << 1; +const RURE_FLAG_DOTNL: u32 = 1 << 2; 
+const RURE_FLAG_SWAP_GREED: u32 = 1 << 3; +const RURE_FLAG_SPACE: u32 = 1 << 4; +const RURE_FLAG_UNICODE: u32 = 1 << 5; +const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE; pub struct Regex { re: bytes::Regex, @@ -27,26 +32,16 @@ pub struct Options { // arbitrary position with a crate just yet. To circumvent this, we use // the `Exec` structure directly. pub struct RegexSet { - re: Exec, - pattern_count: usize + re: bytes::RegexSet, } -const RURE_FLAG_CASEI: u32 = 1 << 0; -const RURE_FLAG_MULTI: u32 = 1 << 1; -const RURE_FLAG_DOTNL: u32 = 1 << 2; -const RURE_FLAG_SWAP_GREED: u32 = 1 << 3; -const RURE_FLAG_SPACE: u32 = 1 << 4; -const RURE_FLAG_UNICODE: u32 = 1 << 5; -const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE; - - #[repr(C)] pub struct rure_match { pub start: size_t, pub end: size_t, } -pub struct Captures(Vec>); +pub struct Captures(bytes::Locations); pub struct Iter { re: *const Regex, @@ -65,8 +60,8 @@ impl Deref for Regex { } impl Deref for RegexSet { - type Target = Exec; - fn deref(&self) -> &Exec { &self.re } + type Target = bytes::RegexSet; + fn deref(&self) -> &bytes::RegexSet { &self.re } } impl Default for Options { @@ -118,16 +113,16 @@ ffi_fn! { let mut builder = bytes::RegexBuilder::new(pat); if !options.is_null() { let options = unsafe { &*options }; - builder = builder.size_limit(options.size_limit); - builder = builder.dfa_size_limit(options.dfa_size_limit); + builder.size_limit(options.size_limit); + builder.dfa_size_limit(options.dfa_size_limit); } - builder = builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); - builder = builder.multi_line(flags & RURE_FLAG_MULTI > 0); - builder = builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0); - builder = builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0); - builder = builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0); - builder = builder.unicode(flags & RURE_FLAG_UNICODE > 0); - match builder.compile() { + builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); + builder.multi_line(flags & RURE_FLAG_MULTI > 0); + builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0); + builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0); + builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0); + builder.unicode(flags & RURE_FLAG_UNICODE > 0); + match builder.build() { Ok(re) => { let mut capture_names = HashMap::new(); for (i, name) in re.capture_names().enumerate() { @@ -182,10 +177,10 @@ ffi_fn! { ) -> bool { let re = unsafe { &*re }; let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.find_at(haystack, start).map(|(s, e)| unsafe { + re.find_at(haystack, start).map(|m| unsafe { if !match_info.is_null() { - (*match_info).start = s; - (*match_info).end = e; + (*match_info).start = m.start(); + (*match_info).end = m.end(); } }).is_some() } @@ -339,7 +334,7 @@ ffi_fn! { } let (s, e) = match re.find_at(text, it.last_end) { None => return false, - Some((s, e)) => (s, e), + Some(m) => (m.start(), m.end()), }; if s == e { // This is an empty match. To ensure we make progress, start @@ -381,7 +376,7 @@ ffi_fn! { } let (s, e) = match re.read_captures_at(slots, text, it.last_end) { None => return false, - Some((s, e)) => (s, e), + Some(m) => (m.start(), m.end()), }; if s == e { // This is an empty match. To ensure we make progress, start @@ -404,7 +399,7 @@ ffi_fn! { ffi_fn! 
{ fn rure_captures_new(re: *const Regex) -> *mut Captures { let re = unsafe { &*re }; - let captures = Captures(vec![None; 2 * re.captures_len()]); + let captures = Captures(re.locations()); Box::into_raw(Box::new(captures)) } } @@ -421,9 +416,9 @@ ffi_fn! { i: size_t, match_info: *mut rure_match, ) -> bool { - let captures = unsafe { &(*captures).0 }; - match (captures[i * 2], captures[i * 2 + 1]) { - (Some(start), Some(end)) => { + let locs = unsafe { &(*captures).0 }; + match locs.pos(i) { + Some((start, end)) => { if !match_info.is_null() { unsafe { (*match_info).start = start; @@ -501,37 +496,21 @@ ffi_fn! { }); } - // Start with a default set and override values if present. - let mut opts = RegexOptions::default(); - let pat_count = pats.len(); - opts.pats = pats.into_iter().map(|s| s.to_owned()).collect(); - + let mut builder = bytes::RegexSetBuilder::new(pats); if !options.is_null() { let options = unsafe { &*options }; - opts.size_limit = options.size_limit; - opts.dfa_size_limit = options.dfa_size_limit; + builder.size_limit(options.size_limit); + builder.dfa_size_limit(options.dfa_size_limit); } - - opts.case_insensitive = flags & RURE_FLAG_CASEI > 0; - opts.multi_line = flags & RURE_FLAG_MULTI > 0; - opts.dot_matches_new_line = flags & RURE_FLAG_DOTNL > 0; - opts.swap_greed = flags & RURE_FLAG_SWAP_GREED > 0; - opts.ignore_whitespace = flags & RURE_FLAG_SPACE > 0; - opts.unicode = flags & RURE_FLAG_UNICODE > 0; - - // `Exec` does not expose a `new` function with appropriate arguments - // so we construct directly. - let builder = ExecBuilder::new_options(opts) - .bytes(true) - .only_utf8(false); - + builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); + builder.multi_line(flags & RURE_FLAG_MULTI > 0); + builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0); + builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0); + builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0); + builder.unicode(flags & RURE_FLAG_UNICODE > 0); match builder.build() { - Ok(ex) => { - let re = RegexSet { - re: ex, - pattern_count: pat_count - }; - Box::into_raw(Box::new(re)) + Ok(re) => { + Box::into_raw(Box::new(RegexSet { re: re })) } Err(err) => { unsafe { @@ -560,7 +539,7 @@ ffi_fn! { ) -> bool { let re = unsafe { &*re }; let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.searcher().is_match_at(haystack, start) + re.is_match_at(haystack, start) } } @@ -574,21 +553,20 @@ ffi_fn! { ) -> bool { let re = unsafe { &*re }; let mut matches = unsafe { - slice::from_raw_parts_mut(matches, re.pattern_count) + slice::from_raw_parts_mut(matches, re.len()) }; let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - // many_matches_at isn't guaranteed to set non-matches to false + // read_matches_at isn't guaranteed to set non-matches to false for item in matches.iter_mut() { *item = false; } - - re.searcher().many_matches_at(&mut matches, haystack, start) + re.read_matches_at(&mut matches, haystack, start) } } ffi_fn! { fn rure_set_len(re: *const RegexSet) -> size_t { - unsafe { (*re).pattern_count } + unsafe { (*re).len() } } } diff --git a/regex-debug/Cargo.toml b/regex-debug/Cargo.toml index 6645ab60d4..cebad7ca4b 100644 --- a/regex-debug/Cargo.toml +++ b/regex-debug/Cargo.toml @@ -11,8 +11,8 @@ description = "A tool useful for debugging regular expressions." [dependencies] docopt = "0.6" -regex = { version = "0.1", path = ".." } -regex-syntax = { version = "0.3", path = "../regex-syntax" } +regex = { version = "0.2", path = ".." 
} +regex-syntax = { version = "0.4.0", path = "../regex-syntax" } rustc-serialize = "0.3" [profile.release] diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index 801a60acd6..c386b48a75 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.3.9" #:version +version = "0.4.0" #:version authors = ["The Rust Project Developers"] license = "MIT/Apache-2.0" repository = "https://github.com/rust-lang/regex" @@ -9,5 +9,5 @@ homepage = "https://github.com/rust-lang/regex" description = "A regular expression parser." [dev-dependencies] -quickcheck = "0.2" -rand = "0.3" +quickcheck = "0.4.1" +rand = "0.3.15" diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 47da25f7d6..d10b2a50ec 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -1401,6 +1401,17 @@ pub enum ErrorKind { /// A character class was constructed such that it is empty. /// e.g., `[^\d\D]`. EmptyClass, + /// Indicates that unsupported notation was used in a character class. + /// + /// The char in this error corresponds to the illegal character. + /// + /// The intent of this error is to carve a path to support set notation + /// as described in UTS#18 RL1.3. We do this by rejecting regexes that + /// would use the notation. + /// + /// The work around for end users is to escape the character included in + /// this error message. + UnsupportedClassChar(char), /// Hints that destructuring should not be exhaustive. /// /// This enum may grow additional variants, so this makes sure clients @@ -1464,6 +1475,7 @@ impl ErrorKind { UnicodeNotAllowed => "Unicode features not allowed", InvalidUtf8 => "matching arbitrary bytes is not allowed", EmptyClass => "empty character class", + UnsupportedClassChar(_) => "unsupported class notation", __Nonexhaustive => unreachable!(), } } @@ -1576,6 +1588,9 @@ repetition operator."), write!(f, "Matching arbitrary bytes is not allowed."), EmptyClass => write!(f, "Empty character classes are not allowed."), + UnsupportedClassChar(c) => + write!(f, "Use of unescaped '{}' in character class is \ + not allowed.", c), __Nonexhaustive => unreachable!(), } } @@ -1620,7 +1635,7 @@ fn binary_search(xs: &[T], mut pred: F) -> usize /// /// The string returned may be safely used as a literal in a regular /// expression. 
-pub fn quote(text: &str) -> String { +pub fn escape(text: &str) -> String { let mut quoted = String::with_capacity(text.len()); for c in text.chars() { if parser::is_punct(c) { diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index a9370c24c9..8cf10b88eb 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -112,10 +112,7 @@ impl Parser { '*' => try!(self.parse_simple_repeat(Repeater::ZeroOrMore)), '+' => try!(self.parse_simple_repeat(Repeater::OneOrMore)), '{' => try!(self.parse_counted_repeat()), - '[' => match self.maybe_parse_ascii() { - None => try!(self.parse_class()), - Some(cls) => Build::Expr(Expr::Class(cls)), - }, + '[' => try!(self.parse_class()), '^' => { if self.flags.multi { self.parse_one(Expr::StartLine) @@ -551,8 +548,8 @@ impl Parser { '[' => match self.maybe_parse_ascii() { Some(class2) => class.ranges.extend(class2), None => { - self.bump(); - try!(self.parse_class_range(&mut class, '[')) + return Err(self.err( + ErrorKind::UnsupportedClassChar('['))); } }, '\\' => match try!(self.parse_escape()) { @@ -585,6 +582,16 @@ impl Parser { let _ = try!(self.codepoint_to_one_byte(start)); } self.bump(); + match start { + '&'|'~'|'-' => { + // Only report an error if we see && or ~~ or --. + if self.peek_is(start) { + return Err(self.err( + ErrorKind::UnsupportedClassChar(start))); + } + } + _ => {} + } try!(self.parse_class_range(&mut class, start)); } } @@ -602,7 +609,7 @@ impl Parser { // If `class` was only non-empty due to multibyte characters, the // corresponding byte class will now be empty. // - // See https://github.com/rust-lang-nursery/regex/issues/303 + // See https://github.com/rust-lang/regex/issues/303 if byte_class.is_empty() { // e.g., (?-u)[^\x00-\xFF] return Err(self.err(ErrorKind::EmptyClass)); @@ -657,8 +664,11 @@ impl Parser { // Because `parse_escape` can never return `LeftParen`. _ => unreachable!(), }, - _ => { - let c = self.bump(); + c => { + self.bump(); + if c == '-' { + return Err(self.err(ErrorKind::UnsupportedClassChar('-'))); + } if !self.flags.unicode { let _ = try!(self.codepoint_to_one_byte(c)); } @@ -1215,7 +1225,7 @@ fn is_valid_capture_char(c: char) -> bool { pub fn is_punct(c: char) -> bool { match c { '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | - '[' | ']' | '{' | '}' | '^' | '$' | '#' => true, + '[' | ']' | '{' | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true, _ => false, } } @@ -2194,9 +2204,9 @@ mod tests { #[test] fn class_brackets() { - assert_eq!(p("[]]"), Expr::Class(class(&[(']', ']')]))); - assert_eq!(p("[][]"), Expr::Class(class(&[('[', '['), (']', ']')]))); - assert_eq!(p("[[]]"), Expr::Concat(vec![ + assert_eq!(p(r"[]]"), Expr::Class(class(&[(']', ']')]))); + assert_eq!(p(r"[]\[]"), Expr::Class(class(&[('[', '['), (']', ']')]))); + assert_eq!(p(r"[\[]]"), Expr::Concat(vec![ Expr::Class(class(&[('[', '[')])), lit(']'), ])); @@ -2211,6 +2221,31 @@ mod tests { ])); } + #[test] + fn class_special_escaped_set_chars() { + // These tests ensure that some special characters require escaping + // for use in character classes. The intention is to use these + // characters to implement sets as described in UTC#18 RL1.3. Once + // that's done, these tests should be removed and replaced with others. 
+ assert_eq!(p(r"[\[]"), Expr::Class(class(&[('[', '[')]))); + assert_eq!(p(r"[&]"), Expr::Class(class(&[('&', '&')]))); + assert_eq!(p(r"[\&]"), Expr::Class(class(&[('&', '&')]))); + assert_eq!(p(r"[\&\&]"), Expr::Class(class(&[('&', '&')]))); + assert_eq!(p(r"[\x00-&]"), Expr::Class(class(&[('\u{0}', '&')]))); + assert_eq!(p(r"[&-\xFF]"), Expr::Class(class(&[('&', '\u{FF}')]))); + + assert_eq!(p(r"[~]"), Expr::Class(class(&[('~', '~')]))); + assert_eq!(p(r"[\~]"), Expr::Class(class(&[('~', '~')]))); + assert_eq!(p(r"[\~\~]"), Expr::Class(class(&[('~', '~')]))); + assert_eq!(p(r"[\x00-~]"), Expr::Class(class(&[('\u{0}', '~')]))); + assert_eq!(p(r"[~-\xFF]"), Expr::Class(class(&[('~', '\u{FF}')]))); + + assert_eq!(p(r"[+-\-]"), Expr::Class(class(&[('+', '-')]))); + assert_eq!(p(r"[a-a\--\xFF]"), Expr::Class(class(&[ + ('-', '\u{FF}'), + ]))); + } + #[test] fn class_overlapping() { assert_eq!(p("[a-fd-h]"), Expr::Class(class(&[('a', 'h')]))); @@ -2224,10 +2259,11 @@ mod tests { #[test] fn ascii_classes() { - assert_eq!(p("[:upper:]"), Expr::Class(class(UPPER))); + assert_eq!(p("[:blank:]"), Expr::Class(class(&[ + (':', ':'), ('a', 'b'), ('k', 'l'), ('n', 'n'), + ]))); assert_eq!(p("[[:upper:]]"), Expr::Class(class(UPPER))); - assert_eq!(pb("(?-u)[:upper:]"), Expr::Class(class(UPPER))); assert_eq!(pb("(?-u)[[:upper:]]"), Expr::ClassBytes(class(UPPER).to_byte_class())); } @@ -2270,12 +2306,9 @@ mod tests { #[test] fn ascii_classes_case_fold() { - assert_eq!(p("(?i)[:upper:]"), Expr::Class(class(UPPER).case_fold())); assert_eq!(p("(?i)[[:upper:]]"), Expr::Class(class(UPPER).case_fold())); - assert_eq!(pb("(?i-u)[:upper:]"), - Expr::Class(class(UPPER).case_fold())); assert_eq!(pb("(?i-u)[[:upper:]]"), Expr::ClassBytes(class(UPPER).to_byte_class().case_fold())); } @@ -2764,6 +2797,19 @@ mod tests { test_err!(r"(?-u)[^\x00-\xFF]", 17, ErrorKind::EmptyClass, flags); } + #[test] + fn error_class_unsupported_char() { + // These tests ensure that some unescaped special characters are + // rejected in character classes. The intention is to use these + // characters to implement sets as described in UTC#18 RL1.3. Once + // that's done, these tests should be removed and replaced with others. + test_err!("[[]", 1, ErrorKind::UnsupportedClassChar('[')); + test_err!("[&&]", 2, ErrorKind::UnsupportedClassChar('&')); + test_err!("[~~]", 2, ErrorKind::UnsupportedClassChar('~')); + test_err!("[+--]", 4, ErrorKind::UnsupportedClassChar('-')); + test_err!(r"[a-a--\xFF]", 5, ErrorKind::UnsupportedClassChar('-')); + } + #[test] fn error_duplicate_capture_name() { test_err!("(?P.)(?P.)", 14, diff --git a/regex_macros/Cargo.toml b/regex_macros/Cargo.toml index c587e9ab16..7fd4ecd7a5 100644 --- a/regex_macros/Cargo.toml +++ b/regex_macros/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex_macros" -version = "0.1.38" +version = "0.2.0" authors = ["The Rust Project Developers"] license = "MIT/Apache-2.0" repository = "https://github.com/rust-lang/regex" @@ -19,16 +19,16 @@ plugin = true [dependencies.regex] path = ".." -version = "0.1.63" +version = "0.2.0" features = ["pattern"] [dependencies.regex-syntax] path = "../regex-syntax" -version = "0.3.1" +version = "0.4.0" [dev-dependencies] # For generating random test data. 
-rand = "0.3" +rand = "0.3.15" [[test]] path = "../tests/test_plugin.rs" diff --git a/src/backtrack.rs b/src/backtrack.rs index 3c06254c6b..49cb22e20f 100644 --- a/src/backtrack.rs +++ b/src/backtrack.rs @@ -44,7 +44,7 @@ pub fn should_exec(num_insts: usize, text_len: usize) -> bool { // ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32)) // // The actual limit picked is pretty much a heuristic. - // See: https://github.com/rust-lang-nursery/regex/issues/215 + // See: https://github.com/rust-lang/regex/issues/215 let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4; size <= MAX_SIZE_BYTES } diff --git a/src/error.rs b/src/error.rs index e014a37aba..c95d67acdd 100644 --- a/src/error.rs +++ b/src/error.rs @@ -16,15 +16,10 @@ use syntax; #[derive(Debug)] pub enum Error { /// A syntax error. - Syntax(syntax::Error), + Syntax(String), /// The compiled program exceeded the set size limit. /// The argument is the size limit imposed. CompiledTooBig(usize), - /// **DEPRECATED:** Will be removed on next major version bump. - /// - /// This error is no longer used. (A `RegexSet` can now contain zero or - /// more regular expressions.) - InvalidSet, /// Hints that destructuring should not be exhaustive. /// /// This enum may grow additional variants, so this makes sure clients @@ -37,20 +32,14 @@ pub enum Error { impl ::std::error::Error for Error { fn description(&self) -> &str { match *self { - Error::Syntax(ref err) => err.description(), + Error::Syntax(ref err) => err, Error::CompiledTooBig(_) => "compiled program too big", - Error::InvalidSet => { - "sets must contain 2 or more regular expressions" - } Error::__Nonexhaustive => unreachable!(), } } fn cause(&self) -> Option<&::std::error::Error> { - match *self { - Error::Syntax(ref err) => Some(err), - _ => None, - } + None } } @@ -62,9 +51,6 @@ impl fmt::Display for Error { write!(f, "Compiled regex exceeds size limit of {} bytes.", limit) } - Error::InvalidSet => { - write!(f, "Sets must contain 2 or more regular expressions.") - } Error::__Nonexhaustive => unreachable!(), } } @@ -72,6 +58,6 @@ impl fmt::Display for Error { impl From for Error { fn from(err: syntax::Error) -> Error { - Error::Syntax(err) + Error::Syntax(err.to_string()) } } diff --git a/src/exec.rs b/src/exec.rs index 62b0f0e2a7..d5d3bf3f7a 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -27,7 +27,7 @@ use prog::Program; use re_builder::RegexOptions; use re_bytes; use re_set; -use re_trait::{RegularExpression, Slot}; +use re_trait::{RegularExpression, Slot, Locations, as_slots}; use re_unicode; use utf8::next_utf8; @@ -343,11 +343,11 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> { #[inline(always)] // reduces constant overhead fn read_captures_at( &self, - slots: &mut [Slot], + locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)> { - self.0.read_captures_at(slots, text.as_bytes(), start) + self.0.read_captures_at(locs, text.as_bytes(), start) } } @@ -512,10 +512,11 @@ impl<'c> RegularExpression for ExecNoSync<'c> { /// locations of the overall match. 
fn read_captures_at( &self, - slots: &mut [Slot], + locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)> { + let slots = as_slots(locs); for slot in slots.iter_mut() { *slot = None; } diff --git a/src/expand.rs b/src/expand.rs index 9bea703881..55873f88bb 100644 --- a/src/expand.rs +++ b/src/expand.rs @@ -2,9 +2,56 @@ use std::str; use memchr::memchr; -use bytes::Captures; +use re_bytes; +use re_unicode; -pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) { +pub fn expand_str( + caps: &re_unicode::Captures, + mut replacement: &str, + dst: &mut String, +) { + while !replacement.is_empty() { + match memchr(b'$', replacement.as_bytes()) { + None => break, + Some(i) => { + dst.push_str(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { + dst.push_str("$"); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push_str("$"); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => { + dst.push_str( + caps.get(i).map(|m| m.as_str()).unwrap_or("")); + } + Ref::Named(name) => { + dst.push_str( + caps.name(name).map(|m| m.as_str()).unwrap_or("")); + } + } + } + dst.push_str(replacement); +} + +pub fn expand_bytes( + caps: &re_bytes::Captures, + mut replacement: &[u8], + dst: &mut Vec<u8>, +) { while !replacement.is_empty() { match memchr(b'$', replacement) { None => break, @@ -27,65 +74,142 @@ pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) { continue; } }; - replacement = cap_ref.rest; + replacement = &replacement[cap_ref.end..]; match cap_ref.cap { - Ref::Number(i) => dst.extend(caps.at(i).unwrap_or(b"")), - Ref::Named(name) => dst.extend(caps.name(name).unwrap_or(b"")), + Ref::Number(i) => { + dst.extend( + caps.get(i).map(|m| m.as_bytes()).unwrap_or(b"")); + } + Ref::Named(name) => { + dst.extend( + caps.name(name).map(|m| m.as_bytes()).unwrap_or(b"")); + } } } dst.extend(replacement); } +/// CaptureRef represents a reference to a capture group inside some text. The +/// reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text immediately proceeding the +/// capture reference. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] struct CaptureRef<'a> { - rest: &'a [u8], cap: Ref<'a>, + end: usize, } +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] enum Ref<'a> { Named(&'a str), Number(usize), } -fn find_cap_ref(mut replacement: &[u8]) -> Option<CaptureRef> { +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From<usize> for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned.
+fn find_cap_ref>( + replacement: &T, +) -> Option { + let mut i = 0; + let rep: &[u8] = replacement.as_ref(); + if rep.len() <= 1 || rep[0] != b'$' { return None; } let mut brace = false; - replacement = &replacement[1..]; - if replacement[0] == b'{' { + i += 1; + if rep[i] == b'{' { brace = true; - replacement = &replacement[1..]; + i += 1; } - let mut cap_end = 0; - while replacement.get(cap_end).map_or(false, is_valid_cap_letter) { + let mut cap_end = i; + while rep.get(cap_end).map_or(false, is_valid_cap_letter) { cap_end += 1; } - if cap_end == 0 { + if cap_end == i { return None; } // We just verified that the range 0..cap_end is valid ASCII, so it must // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 // check with either unsafe or by parsing the number straight from &[u8]. - let cap = str::from_utf8(&replacement[..cap_end]) + let cap = str::from_utf8(&rep[i..cap_end]) .ok().expect("valid UTF-8 capture name"); if brace { - if !replacement.get(cap_end).map_or(false, |&b| b == b'}') { + if !rep.get(cap_end).map_or(false, |&b| b == b'}') { return None; } cap_end += 1; } Some(CaptureRef { - rest: &replacement[cap_end..], cap: match cap.parse::() { Ok(i) => Ref::Number(i as usize), Err(_) => Ref::Named(cap), }, + end: cap_end, }) } +/// Returns true if and only if the given byte is allowed in a capture name. fn is_valid_cap_letter(b: &u8) -> bool { match *b { b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' | b'_' => true, _ => false, } } + +#[cfg(test)] +mod tests { + use super::{CaptureRef, find_cap_ref}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text)); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text)); + } + }; + } + + macro_rules! c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); +} diff --git a/src/lib.rs b/src/lib.rs index 7e4b24cb03..f52f1f4957 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,451 +8,469 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -//! This crate provides a native implementation of regular expressions that is -//! heavily based on RE2 both in syntax and in implementation. Notably, -//! backreferences and arbitrary lookahead/lookbehind assertions are not -//! provided. In return, regular expression searching provided by this package -//! has excellent worst-case performance. The specific syntax supported is -//! documented further down. -//! -//! This crate's documentation provides some simple examples, describes Unicode -//! support and exhaustively lists the supported syntax. For more specific -//! details on the API, please see the documentation for the -//! [`Regex`](struct.Regex.html) type. -//! -//! # Usage -//! -//! This crate is [on crates.io](https://crates.io/crates/regex) and can be -//! used by adding `regex` to your dependencies in your project's `Cargo.toml`. -//! -//! 
```toml -//! [dependencies] -//! regex = "0.1" -//! ``` -//! -//! and this to your crate root: -//! -//! ```rust -//! extern crate regex; -//! ``` -//! -//! # Example: find a date -//! -//! General use of regular expressions in this package involves compiling an -//! expression and then using it to search, split or replace text. For example, -//! to confirm that some text resembles a date: -//! -//! ```rust -//! use regex::Regex; -//! let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(); -//! assert!(re.is_match("2014-01-01")); -//! ``` -//! -//! Notice the use of the `^` and `$` anchors. In this crate, every expression -//! is executed with an implicit `.*?` at the beginning and end, which allows -//! it to match anywhere in the text. Anchors can be used to ensure that the -//! full text matches an expression. -//! -//! This example also demonstrates the utility of -//! [raw strings](https://doc.rust-lang.org/stable/reference.html#raw-string-literals) -//! in Rust, which -//! are just like regular strings except they are prefixed with an `r` and do -//! not process any escape sequences. For example, `"\\d"` is the same -//! expression as `r"\d"`. -//! -//! # Example: Avoid compiling the same regex in a loop -//! -//! It is an anti-pattern to compile the same regular expression in a loop -//! since compilation is typically expensive. (It takes anywhere from a few -//! microseconds to a few **milliseconds** depending on the size of the -//! regex.) Not only is compilation itself expensive, but this also prevents -//! optimizations that reuse allocations internally to the matching engines. -//! -//! In Rust, it can sometimes be a pain to pass regular expressions around if -//! they're used from inside a helper function. Instead, we recommend using the -//! [`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that -//! regular expressions are compiled exactly once. -//! -//! For example: -//! -//! ```rust -//! #[macro_use] extern crate lazy_static; -//! extern crate regex; -//! -//! use regex::Regex; -//! -//! fn some_helper_function(text: &str) -> bool { -//! lazy_static! { -//! static ref RE: Regex = Regex::new("...").unwrap(); -//! } -//! RE.is_match(text) -//! } -//! -//! fn main() {} -//! ``` -//! -//! Specifically, in this example, the regex will be compiled when it is used for -//! the first time. On subsequent uses, it will reuse the previous compilation. -//! -//! # Example: iterating over capture groups -//! -//! This crate provides convenient iterators for matching an expression -//! repeatedly against a search string to find successive non-overlapping -//! matches. For example, to find all dates in a string and be able to access -//! them by their component pieces: -//! -//! ```rust -//! # extern crate regex; use regex::Regex; -//! # fn main() { -//! let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); -//! let text = "2012-03-14, 2013-01-01 and 2014-07-05"; -//! for cap in re.captures_iter(text) { -//! println!("Month: {} Day: {} Year: {}", -//! cap.at(2).unwrap_or(""), cap.at(3).unwrap_or(""), -//! cap.at(1).unwrap_or("")); -//! } -//! // Output: -//! // Month: 03 Day: 14 Year: 2012 -//! // Month: 01 Day: 01 Year: 2013 -//! // Month: 07 Day: 05 Year: 2014 -//! # } -//! ``` -//! -//! Notice that the year is in the capture group indexed at `1`. This is -//! because the *entire match* is stored in the capture group at index `0`. -//! -//! # Example: replacement with named capture groups -//! -//! 
Building on the previous example, perhaps we'd like to rearrange the date -//! formats. This can be done with text replacement. But to make the code -//! clearer, we can *name* our capture groups and use those names as variables -//! in our replacement text: -//! -//! ```rust -//! # extern crate regex; use regex::Regex; -//! # fn main() { -//! let re = Regex::new(r"(?P\d{4})-(?P\d{2})-(?P\d{2})").unwrap(); -//! let before = "2012-03-14, 2013-01-01 and 2014-07-05"; -//! let after = re.replace_all(before, "$m/$d/$y"); -//! assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); -//! # } -//! ``` -//! -//! The `replace` methods are actually polymorphic in the replacement, which -//! provides more flexibility than is seen here. (See the documentation for -//! `Regex::replace` for more details.) -//! -//! Note that if your regex gets complicated, you can use the `x` flag to -//! enable insigificant whitespace mode, which also lets you write comments: -//! -//! ```rust -//! # extern crate regex; use regex::Regex; -//! # fn main() { -//! let re = Regex::new(r"(?x) -//! (?P\d{4}) # the year -//! - -//! (?P\d{2}) # the month -//! - -//! (?P\d{2}) # the day -//! ").unwrap(); -//! let before = "2012-03-14, 2013-01-01 and 2014-07-05"; -//! let after = re.replace_all(before, "$m/$d/$y"); -//! assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); -//! # } -//! ``` -//! -//! # Example: match multiple regular expressions simultaneously -//! -//! This demonstrates how to use a `RegexSet` to match multiple (possibly -//! overlapping) regular expressions in a single scan of the search text: -//! -//! ```rust -//! use regex::RegexSet; -//! -//! let set = RegexSet::new(&[ -//! r"\w+", -//! r"\d+", -//! r"\pL+", -//! r"foo", -//! r"bar", -//! r"barfoo", -//! r"foobar", -//! ]).unwrap(); -//! -//! // Iterate over and collect all of the matches. -//! let matches: Vec<_> = set.matches("foobar").into_iter().collect(); -//! assert_eq!(matches, vec![0, 2, 3, 4, 6]); -//! -//! // You can also test whether a particular regex matched: -//! let matches = set.matches("foobar"); -//! assert!(!matches.matched(5)); -//! assert!(matches.matched(6)); -//! ``` -//! -//! # Pay for what you use -//! -//! With respect to searching text with a regular expression, there are three -//! questions that can be asked: -//! -//! 1. Does the text match this expression? -//! 2. If so, where does it match? -//! 3. Where are the submatches? -//! -//! Generally speaking, this crate could provide a function to answer only #3, -//! which would subsume #1 and #2 automatically. However, it can be -//! significantly more expensive to compute the location of submatches, so it's -//! best not to do it if you don't need to. -//! -//! Therefore, only use what you need. For example, don't use `find` if you -//! only need to test if an expression matches a string. (Use `is_match` -//! instead.) -//! -//! # Unicode -//! -//! This implementation executes regular expressions **only** on valid UTF-8 -//! while exposing match locations as byte indices into the search string. -//! -//! Only simple case folding is supported. Namely, when matching -//! case-insensitively, the characters are first mapped using the [simple case -//! folding](ftp://ftp.unicode.org/Public/UNIDATA/CaseFolding.txt) mapping -//! before matching. -//! -//! Regular expressions themselves are **only** interpreted as a sequence of -//! Unicode scalar values. This means you can use Unicode characters directly -//! in your expression: -//! -//! ```rust -//! 
# extern crate regex; use regex::Regex; -//! # fn main() { -//! let re = Regex::new(r"(?i)Δ+").unwrap(); -//! assert_eq!(re.find("ΔδΔ"), Some((0, 6))); -//! # } -//! ``` -//! -//! Finally, Unicode general categories and scripts are available as character -//! classes. For example, you can match a sequence of numerals, Greek or -//! Cherokee letters: -//! -//! ```rust -//! # extern crate regex; use regex::Regex; -//! # fn main() { -//! let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap(); -//! assert_eq!(re.find("abcΔᎠβⅠᏴγδⅡxyz"), Some((3, 23))); -//! # } -//! ``` -//! -//! # Opt out of Unicode support -//! -//! The `bytes` sub-module provides a `Regex` type that can be used to match -//! on `&[u8]`. By default, text is interpreted as ASCII compatible text with -//! all Unicode support disabled (e.g., `.` matches any byte instead of any -//! Unicode codepoint). Unicode support can be selectively enabled with the -//! `u` flag. See the `bytes` module documentation for more details. -//! -//! Unicode support can also be selectively *disabled* with the main `Regex` -//! type that matches on `&str`. For example, `(?-u:\b)` will match an ASCII -//! word boundary. Note though that invalid UTF-8 is not allowed to be matched -//! even when the `u` flag is disabled. For example, `(?-u:.)` will return an -//! error, since `.` matches *any byte* when Unicode support is disabled. -//! -//! # Syntax -//! -//! The syntax supported in this crate is almost in an exact correspondence -//! with the syntax supported by RE2. It is documented below. -//! -//! Note that the regular expression parser and abstract syntax are exposed in -//! a separate crate, [`regex-syntax`](../regex_syntax/index.html). -//! -//! ## Matching one character -//! -//!
-//! .           any character except new line (includes new line with s flag)
-//! [xyz]       A character class matching either x, y or z.
-//! [^xyz]      A character class matching any character except x, y and z.
-//! [a-z]       A character class matching any character in range a-z.
-//! \d          digit (\p{Nd})
-//! \D          not digit
-//! [:alpha:]   ASCII character class ([A-Za-z])
-//! [:^alpha:]  Negated ASCII character class ([^A-Za-z])
-//! \pN         One-letter name Unicode character class
-//! \p{Greek}   Unicode character class (general category or script)
-//! \PN         Negated one-letter name Unicode character class
-//! \P{Greek}   negated Unicode character class (general category or script)
-//! 
-//! -//! Any named character class may appear inside a bracketed `[...]` character -//! class. For example, `[\p{Greek}\pN]` matches any Greek or numeral -//! character. -//! -//! ## Composites -//! -//!
-//! xy    concatenation (x followed by y)
-//! x|y   alternation (x or y, prefer x)
-//! 
-//! -//! ## Repetitions -//! -//!
-//! x*        zero or more of x (greedy)
-//! x+        one or more of x (greedy)
-//! x?        zero or one of x (greedy)
-//! x*?       zero or more of x (ungreedy/lazy)
-//! x+?       one or more of x (ungreedy/lazy)
-//! x??       zero or one of x (ungreedy/lazy)
-//! x{n,m}    at least n x and at most m x (greedy)
-//! x{n,}     at least n x (greedy)
-//! x{n}      exactly n x
-//! x{n,m}?   at least n x and at most m x (ungreedy/lazy)
-//! x{n,}?    at least n x (ungreedy/lazy)
-//! x{n}?     exactly n x
-//! 
-//! -//! ## Empty matches -//! -//!
-//! ^     the beginning of text (or start-of-line with multi-line mode)
-//! $     the end of text (or end-of-line with multi-line mode)
-//! \A    only the beginning of text (even with multi-line mode enabled)
-//! \z    only the end of text (even with multi-line mode enabled)
-//! \b    a Unicode word boundary (\w on one side and \W, \A, or \z on other)
-//! \B    not a Unicode word boundary
-//! 
-//! -//! ## Grouping and flags -//! -//!
-//! (exp)          numbered capture group (indexed by opening parenthesis)
-//! (?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
-//! (?:exp)        non-capturing group
-//! (?flags)       set flags within current group
-//! (?flags:exp)   set flags for exp (non-capturing)
-//! 
-//! -//! Flags are each a single character. For example, `(?x)` sets the flag `x` -//! and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at -//! the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets -//! the `x` flag and clears the `y` flag. -//! -//! All flags are by default disabled unless stated otherwise. They are: -//! -//!
-//! i     case-insensitive
-//! m     multi-line mode: ^ and $ match begin/end of line
-//! s     allow . to match \n
-//! U     swap the meaning of x* and x*?
-//! u     Unicode support (enabled by default)
-//! x     ignore whitespace and allow line comments (starting with `#`)
-//! 
-//! -//! Here's an example that matches case-insensitively for only part of the -//! expression: -//! -//! ```rust -//! # extern crate regex; use regex::Regex; -//! # fn main() { -//! let re = Regex::new(r"(?i)a+(?-i)b+").unwrap(); -//! let cap = re.captures("AaAaAbbBBBb").unwrap(); -//! assert_eq!(cap.at(0), Some("AaAaAbb")); -//! # } -//! ``` -//! -//! Notice that the `a+` matches either `a` or `A`, but the `b+` only matches -//! `b`. -//! -//! Here is an example that uses an ASCII word boundary instead of a Unicode -//! word boundary: -//! -//! ```rust -//! # extern crate regex; use regex::Regex; -//! # fn main() { -//! let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap(); -//! let cap = re.captures("$$abc$$").unwrap(); -//! assert_eq!(cap.at(0), Some("abc")); -//! # } -//! ``` -//! -//! ## Escape sequences -//! -//!
-//! \*         literal *, works for any punctuation character: \.+*?()|[]{}^$
-//! \a         bell (\x07)
-//! \f         form feed (\x0C)
-//! \t         horizontal tab
-//! \n         new line
-//! \r         carriage return
-//! \v         vertical tab (\x0B)
-//! \123       octal character code (up to three digits)
-//! \x7F       hex character code (exactly two digits)
-//! \x{10FFFF} any hex character code corresponding to a Unicode code point
-//! 
-//! -//! ## Perl character classes (Unicode friendly) -//! -//! These classes are based on the definitions provided in -//! [UTS#18](http://www.unicode.org/reports/tr18/#Compatibility_Properties): -//! -//!
-//! \d     digit (\p{Nd})
-//! \D     not digit
-//! \s     whitespace (\p{White_Space})
-//! \S     not whitespace
-//! \w     word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
-//! \W     not word character
-//! 
-//! -//! ## ASCII character classes -//! -//!
-//! [:alnum:]    alphanumeric ([0-9A-Za-z])
-//! [:alpha:]    alphabetic ([A-Za-z])
-//! [:ascii:]    ASCII ([\x00-\x7F])
-//! [:blank:]    blank ([\t ])
-//! [:cntrl:]    control ([\x00-\x1F\x7F])
-//! [:digit:]    digits ([0-9])
-//! [:graph:]    graphical ([!-~])
-//! [:lower:]    lower case ([a-z])
-//! [:print:]    printable ([ -~])
-//! [:punct:]    punctuation ([!-/:-@[-`{-~])
-//! [:space:]    whitespace ([\t\n\v\f\r ])
-//! [:upper:]    upper case ([A-Z])
-//! [:word:]     word characters ([0-9A-Za-z_])
-//! [:xdigit:]   hex digit ([0-9A-Fa-f])
-//! 
-//! -//! # Untrusted input -//! -//! This crate can handle both untrusted regular expressions and untrusted -//! search text. -//! -//! Untrusted regular expressions are handled by capping the size of a compiled -//! regular expression. (See `Regex::with_size_limit`.) Without this, it would -//! be trivial for an attacker to exhaust your system's memory with expressions -//! like `a{100}{100}{100}`. -//! -//! Untrusted search text is allowed because the matching engine(s) in this -//! crate have time complexity `O(mn)` (with `m ~ regex` and `n ~ search -//! text`), which means there's no way to cause exponential blow-up like with -//! some other regular expression engines. (We pay for this by disallowing -//! features like arbitrary look-ahead and backreferences.) -//! -//! When a DFA is used, pathological cases with exponential state blow up are -//! avoided by constructing the DFA lazily or in an "online" manner. Therefore, -//! at most one new state can be created for each byte of input. This satisfies -//! our time complexity guarantees, but can lead to unbounded memory growth -//! proportional to the size of the input. As a stopgap, the DFA is only -//! allowed to store a fixed number of states. (When the limit is reached, its -//! states are wiped and continues on, possibly duplicating previous work. If -//! the limit is reached too frequently, it gives up and hands control off to -//! another matching engine with fixed memory requirements.) +/*! +This crate provides a library for parsing, compiling, and executing regular +expressions. Its syntax is similar to Perl-style regular expressions, but lacks +a few features like look around and backreferences. In exchange, all searches +execute in linear time with respect to the size of the regular expression and +search text. + +This crate's documentation provides some simple examples, describes +[Unicode support](#unicode) and exhaustively lists the +[supported syntax](#syntax). + +For more specific details on the API for regular expressions, please see the +documentation for the [`Regex`](struct.Regex.html) type. + +# Usage + +This crate is [on crates.io](https://crates.io/crates/regex) and can be +used by adding `regex` to your dependencies in your project's `Cargo.toml`. + +```toml +[dependencies] +regex = "0.2" +``` + +and this to your crate root: + +```rust +extern crate regex; +``` + +# Example: find a date + +General use of regular expressions in this package involves compiling an +expression and then using it to search, split or replace text. For example, +to confirm that some text resembles a date: + +```rust +use regex::Regex; +let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(); +assert!(re.is_match("2014-01-01")); +``` + +Notice the use of the `^` and `$` anchors. In this crate, every expression +is executed with an implicit `.*?` at the beginning and end, which allows +it to match anywhere in the text. Anchors can be used to ensure that the +full text matches an expression. + +This example also demonstrates the utility of +[raw strings](https://doc.rust-lang.org/stable/reference.html#raw-string-literals) +in Rust, which +are just like regular strings except they are prefixed with an `r` and do +not process any escape sequences. For example, `"\\d"` is the same +expression as `r"\d"`. + +# Example: Avoid compiling the same regex in a loop + +It is an anti-pattern to compile the same regular expression in a loop +since compilation is typically expensive. 
(It takes anywhere from a few +microseconds to a few **milliseconds** depending on the size of the +regex.) Not only is compilation itself expensive, but this also prevents +optimizations that reuse allocations internally to the matching engines. + +In Rust, it can sometimes be a pain to pass regular expressions around if +they're used from inside a helper function. Instead, we recommend using the +[`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that +regular expressions are compiled exactly once. + +For example: + +```rust +#[macro_use] extern crate lazy_static; +extern crate regex; + +use regex::Regex; + +fn some_helper_function(text: &str) -> bool { + lazy_static! { + static ref RE: Regex = Regex::new("...").unwrap(); + } + RE.is_match(text) +} + +fn main() {} +``` + +Specifically, in this example, the regex will be compiled when it is used for +the first time. On subsequent uses, it will reuse the previous compilation. + +# Example: iterating over capture groups + +This crate provides convenient iterators for matching an expression +repeatedly against a search string to find successive non-overlapping +matches. For example, to find all dates in a string and be able to access +them by their component pieces: + +```rust +# extern crate regex; use regex::Regex; +# fn main() { +let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); +let text = "2012-03-14, 2013-01-01 and 2014-07-05"; +for cap in re.captures_iter(text) { + println!("Month: {} Day: {} Year: {}", &cap[2], &cap[3], &cap[1]); +} +// Output: +// Month: 03 Day: 14 Year: 2012 +// Month: 01 Day: 01 Year: 2013 +// Month: 07 Day: 05 Year: 2014 +# } +``` + +Notice that the year is in the capture group indexed at `1`. This is +because the *entire match* is stored in the capture group at index `0`. + +# Example: replacement with named capture groups + +Building on the previous example, perhaps we'd like to rearrange the date +formats. This can be done with text replacement. But to make the code +clearer, we can *name* our capture groups and use those names as variables +in our replacement text: + +```rust +# extern crate regex; use regex::Regex; +# fn main() { +let re = Regex::new(r"(?P\d{4})-(?P\d{2})-(?P\d{2})").unwrap(); +let before = "2012-03-14, 2013-01-01 and 2014-07-05"; +let after = re.replace_all(before, "$m/$d/$y"); +assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); +# } +``` + +The `replace` methods are actually polymorphic in the replacement, which +provides more flexibility than is seen here. (See the documentation for +`Regex::replace` for more details.) + +Note that if your regex gets complicated, you can use the `x` flag to +enable insigificant whitespace mode, which also lets you write comments: + +```rust +# extern crate regex; use regex::Regex; +# fn main() { +let re = Regex::new(r"(?x) + (?P\d{4}) # the year + - + (?P\d{2}) # the month + - + (?P\d{2}) # the day +").unwrap(); +let before = "2012-03-14, 2013-01-01 and 2014-07-05"; +let after = re.replace_all(before, "$m/$d/$y"); +assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); +# } +``` + +# Example: match multiple regular expressions simultaneously + +This demonstrates how to use a `RegexSet` to match multiple (possibly +overlapping) regular expressions in a single scan of the search text: + +```rust +use regex::RegexSet; + +let set = RegexSet::new(&[ + r"\w+", + r"\d+", + r"\pL+", + r"foo", + r"bar", + r"barfoo", + r"foobar", +]).unwrap(); + +// Iterate over and collect all of the matches. 
+let matches: Vec<_> = set.matches("foobar").into_iter().collect(); +assert_eq!(matches, vec![0, 2, 3, 4, 6]); + +// You can also test whether a particular regex matched: +let matches = set.matches("foobar"); +assert!(!matches.matched(5)); +assert!(matches.matched(6)); +``` + +# Pay for what you use + +With respect to searching text with a regular expression, there are three +questions that can be asked: + +1. Does the text match this expression? +2. If so, where does it match? +3. Where did the capturing groups match? + +Generally speaking, this crate could provide a function to answer only #3, +which would subsume #1 and #2 automatically. However, it can be significantly +more expensive to compute the location of capturing group matches, so it's best +not to do it if you don't need to. + +Therefore, only use what you need. For example, don't use `find` if you +only need to test if an expression matches a string. (Use `is_match` +instead.) + +# Unicode + +This implementation executes regular expressions **only** on valid UTF-8 +while exposing match locations as byte indices into the search string. + +Only simple case folding is supported. Namely, when matching +case-insensitively, the characters are first mapped using the [simple case +folding](ftp://ftp.unicode.org/Public/UNIDATA/CaseFolding.txt) mapping +before matching. + +Regular expressions themselves are **only** interpreted as a sequence of +Unicode scalar values. This means you can use Unicode characters directly +in your expression: + +```rust +# extern crate regex; use regex::Regex; +# fn main() { +let re = Regex::new(r"(?i)Δ+").unwrap(); +let mat = re.find("ΔδΔ").unwrap(); +assert_eq!((mat.start(), mat.end()), (0, 6)); +# } +``` + +Most features of the regular expressions in this crate are Unicode aware. Here +are some examples: + +* `.` will match any valid UTF-8 encoded Unicode scalar value except for `\n`. + (To also match `\n`, enable the `s` flag, e.g., `(?s:.)`.) +* `\w`, `\d` and `\s` are Unicode aware. For example, `\s` will match all forms + of whitespace categorized by Unicode. +* `\b` matches a Unicode word boundary. +* Negated character classes like `[^a]` match all Unicode scalar values except + for `a`. +* `^` and `$` are **not** Unicode aware in multi-line mode. Namely, they only + recognize `\n` and not any of the other forms of line terminators defined + by Unicode. + +Finally, Unicode general categories and scripts are available as character +classes. For example, you can match a sequence of numerals, Greek or +Cherokee letters: + +```rust +# extern crate regex; use regex::Regex; +# fn main() { +let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap(); +let mat = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap(); +assert_eq!((mat.start(), mat.end()), (3, 23)); +# } +``` + +# Opt out of Unicode support + +The `bytes` sub-module provides a `Regex` type that can be used to match +on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with +the main `Regex` type. However, this behavior can be disabled by turning +off the `u` flag, even if doing so could result in matching invalid UTF-8. +For example, when the `u` flag is disabled, `.` will match any byte instead +of any Unicode scalar value. + +Disabling the `u` flag is also possible with the standard `&str`-based `Regex` +type, but it is only allowed where the UTF-8 invariant is maintained. 
For +example, `(?-u:\w)` is an ASCII-only `\w` character class and is legal in an +`&str`-based `Regex`, but `(?-u:\xFF)` will attempt to match the raw byte +`\xFF`, which is invalid UTF-8 and therefore is illegal in `&str`-based +regexes. + +# Syntax + +The syntax supported in this crate is documented below. + +Note that the regular expression parser and abstract syntax are exposed in +a separate crate, [`regex-syntax`](../regex_syntax/index.html). + +## Matching one character + +
+.             any character except new line (includes new line with s flag)
+[xyz]         A character class matching either x, y or z.
+[^xyz]        A character class matching any character except x, y and z.
+[a-z]         A character class matching any character in range a-z.
+\d            digit (\p{Nd})
+\D            not digit
+[[:alpha:]]   ASCII character class ([A-Za-z])
+[[:^alpha:]]  Negated ASCII character class ([^A-Za-z])
+\pN           One-letter name Unicode character class
+\p{Greek}     Unicode character class (general category or script)
+\PN           Negated one-letter name Unicode character class
+\P{Greek}     Negated Unicode character class (general category or script)
+
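+
+As a small illustration, a range class like `[a-z]` can be combined with a
+repetition and searched with `find`, which reports byte offsets:
+
+```rust
+# extern crate regex; use regex::Regex;
+# fn main() {
+let re = Regex::new(r"[a-z]+").unwrap();
+let mat = re.find("abc123").unwrap();
+assert_eq!((mat.start(), mat.end()), (0, 3));
+# }
+```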
+ +Any named character class may appear inside a bracketed `[...]` character +class. For example, `[\p{Greek}[:digit:]]` matches any Greek or ASCII +digit. + +## Composites + +
+xy    concatenation (x followed by y)
+x|y   alternation (x or y, prefer x)
+
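+
+Alternation is leftmost-first ("prefer x"): the first branch that can match
+is used, even if a later branch could match more text. A small illustration:
+
+```rust
+# extern crate regex; use regex::Regex;
+# fn main() {
+let re = Regex::new(r"foo|foobar").unwrap();
+// The `foo` branch wins, so only the first three bytes match.
+let mat = re.find("foobar").unwrap();
+assert_eq!((mat.start(), mat.end()), (0, 3));
+# }
+```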
+ +## Repetitions + +
+x*        zero or more of x (greedy)
+x+        one or more of x (greedy)
+x?        zero or one of x (greedy)
+x*?       zero or more of x (ungreedy/lazy)
+x+?       one or more of x (ungreedy/lazy)
+x??       zero or one of x (ungreedy/lazy)
+x{n,m}    at least n x and at most m x (greedy)
+x{n,}     at least n x (greedy)
+x{n}      exactly n x
+x{n,m}?   at least n x and at most m x (ungreedy/lazy)
+x{n,}?    at least n x (ungreedy/lazy)
+x{n}?     exactly n x
+
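+
+Greedy repetitions match as much as possible, while the lazy forms match as
+little as possible. A minimal sketch of the difference:
+
+```rust
+# extern crate regex; use regex::Regex;
+# fn main() {
+let greedy = Regex::new(r"a+").unwrap().find("aaa").unwrap();
+assert_eq!((greedy.start(), greedy.end()), (0, 3));
+let lazy = Regex::new(r"a+?").unwrap().find("aaa").unwrap();
+assert_eq!((lazy.start(), lazy.end()), (0, 1));
+# }
+```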
+ +## Empty matches + +
+^     the beginning of text (or start-of-line with multi-line mode)
+$     the end of text (or end-of-line with multi-line mode)
+\A    only the beginning of text (even with multi-line mode enabled)
+\z    only the end of text (even with multi-line mode enabled)
+\b    a Unicode word boundary (\w on one side and \W, \A, or \z on the other)
+\B    not a Unicode word boundary
+
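+
+For example, `^` and `$` respect multi-line mode, while `\A` and `\z` always
+anchor to the boundaries of the whole text:
+
+```rust
+# extern crate regex; use regex::Regex;
+# fn main() {
+let text = "foo\nbar\nbaz";
+assert!(!Regex::new(r"^bar$").unwrap().is_match(text));
+assert!(Regex::new(r"(?m)^bar$").unwrap().is_match(text));
+assert!(!Regex::new(r"(?m)\Abar\z").unwrap().is_match(text));
+# }
+```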
+ +## Grouping and flags + +
+(exp)          numbered capture group (indexed by opening parenthesis)
+(?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
+(?:exp)        non-capturing group
+(?flags)       set flags within current group
+(?flags:exp)   set flags for exp (non-capturing)
+
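+
+A brief sketch of capture groups: `(?:...)` does not capture, so the named
+group below is also capture group `1`:
+
+```rust
+# extern crate regex; use regex::Regex;
+# fn main() {
+let re = Regex::new(r"(?:Mr|Ms)\. (?P<last>\w+)").unwrap();
+let caps = re.captures("Mr. Smith").unwrap();
+// `name` returns a `Match`, so the offsets of the group are available.
+let m = caps.name("last").unwrap();
+assert_eq!((m.start(), m.end()), (4, 9));
+# }
+```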
+ +Flags are each a single character. For example, `(?x)` sets the flag `x` +and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at +the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets +the `x` flag and clears the `y` flag. + +All flags are by default disabled unless stated otherwise. They are: + +
+i     case-insensitive
+m     multi-line mode: ^ and $ match begin/end of line
+s     allow . to match \n
+U     swap the meaning of x* and x*?
+u     Unicode support (enabled by default)
+x     ignore whitespace and allow line comments (starting with `#`)
+
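+
+For instance, the `U` flag swaps greedy and lazy repetition, so `a+` behaves
+like `a+?`:
+
+```rust
+# extern crate regex; use regex::Regex;
+# fn main() {
+let mat = Regex::new(r"(?U)a+").unwrap().find("aaa").unwrap();
+assert_eq!((mat.start(), mat.end()), (0, 1));
+# }
+```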
+ +Here's an example that matches case-insensitively for only part of the +expression: + +```rust +# extern crate regex; use regex::Regex; +# fn main() { +let re = Regex::new(r"(?i)a+(?-i)b+").unwrap(); +let cap = re.captures("AaAaAbbBBBb").unwrap(); +assert_eq!(&cap[0], "AaAaAbb"); +# } +``` + +Notice that the `a+` matches either `a` or `A`, but the `b+` only matches +`b`. + +Here is an example that uses an ASCII word boundary instead of a Unicode +word boundary: + +```rust +# extern crate regex; use regex::Regex; +# fn main() { +let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap(); +let cap = re.captures("$$abc$$").unwrap(); +assert_eq!(&cap[0], "abc"); +# } +``` + +## Escape sequences + +
+\*         literal *, works for any punctuation character: \.+*?()|[]{}^$
+\a         bell (\x07)
+\f         form feed (\x0C)
+\t         horizontal tab
+\n         new line
+\r         carriage return
+\v         vertical tab (\x0B)
+\123       octal character code (up to three digits)
+\x7F       hex character code (exactly two digits)
+\x{10FFFF} any hex character code corresponding to a Unicode code point
+
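+
+For instance, punctuation can be matched literally by escaping it, and hex
+escapes denote particular characters (`\x41` is `A`):
+
+```rust
+# extern crate regex; use regex::Regex;
+# fn main() {
+let re = Regex::new(r"\$\x41+").unwrap();
+assert!(re.is_match("$AAA"));
+# }
+```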
+ +## Perl character classes (Unicode friendly) + +These classes are based on the definitions provided in +[UTS#18](http://www.unicode.org/reports/tr18/#Compatibility_Properties): + +
+\d     digit (\p{Nd})
+\D     not digit
+\s     whitespace (\p{White_Space})
+\S     not whitespace
+\w     word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
+\W     not word character
+
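+
+These classes are Unicode aware, so `\w` matches word characters well outside
+of ASCII:
+
+```rust
+# extern crate regex; use regex::Regex;
+# fn main() {
+let mat = Regex::new(r"\w+").unwrap().find("δελτα!").unwrap();
+// Each Greek letter is two bytes when encoded as UTF-8.
+assert_eq!((mat.start(), mat.end()), (0, 10));
+# }
+```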
+ +## ASCII character classes + +
+[[:alnum:]]    alphanumeric ([0-9A-Za-z])
+[[:alpha:]]    alphabetic ([A-Za-z])
+[[:ascii:]]    ASCII ([\x00-\x7F])
+[[:blank:]]    blank ([\t ])
+[[:cntrl:]]    control ([\x00-\x1F\x7F])
+[[:digit:]]    digits ([0-9])
+[[:graph:]]    graphical ([!-~])
+[[:lower:]]    lower case ([a-z])
+[[:print:]]    printable ([ -~])
+[[:punct:]]    punctuation ([!-/:-@[-`{-~])
+[[:space:]]    whitespace ([\t\n\v\f\r ])
+[[:upper:]]    upper case ([A-Z])
+[[:word:]]     word characters ([0-9A-Za-z_])
+[[:xdigit:]]   hex digit ([0-9A-Fa-f])
+
+ +# Untrusted input + +This crate can handle both untrusted regular expressions and untrusted +search text. + +Untrusted regular expressions are handled by capping the size of a compiled +regular expression. +(See [`RegexBuilder::size_limit`](struct.RegexBuilder.html#method.size_limit).) +Without this, it would be trivial for an attacker to exhaust your system's +memory with expressions like `a{100}{100}{100}`. + +Untrusted search text is allowed because the matching engine(s) in this +crate have time complexity `O(mn)` (with `m ~ regex` and `n ~ search +text`), which means there's no way to cause exponential blow-up like with +some other regular expression engines. (We pay for this by disallowing +features like arbitrary look-ahead and backreferences.) + +When a DFA is used, pathological cases with exponential state blow up are +avoided by constructing the DFA lazily or in an "online" manner. Therefore, +at most one new state can be created for each byte of input. This satisfies +our time complexity guarantees, but can lead to unbounded memory growth +proportional to the size of the input. As a stopgap, the DFA is only +allowed to store a fixed number of states. When the limit is reached, its +states are wiped and continues on, possibly duplicating previous work. If +the limit is reached too frequently, it gives up and hands control off to +another matching engine with fixed memory requirements. +(The DFA size limit can also be tweaked. See +[`RegexBuilder::dfa_size_limit`](struct.RegexBuilder.html#method.dfa_size_limit).) +*/ #![deny(missing_docs)] #![cfg_attr(test, deny(warnings))] #![cfg_attr(feature = "pattern", feature(pattern))] #![cfg_attr(feature = "simd-accel", feature(cfg_target_feature))] -#![doc(html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", - html_favicon_url = "https://www.rust-lang.org/favicon.ico", - html_root_url = "https://doc.rust-lang.org/regex/")] extern crate aho_corasick; extern crate memchr; @@ -464,12 +482,14 @@ extern crate utf8_ranges; pub use error::Error; pub use re_builder::unicode::*; +pub use re_builder::set_unicode::*; pub use re_set::unicode::*; +pub use re_trait::Locations; pub use re_unicode::{ - Regex, Captures, SubCaptures, SubCapturesPos, SubCapturesNamed, - CaptureNames, FindCaptures, FindMatches, - Replacer, NoExpand, RegexSplits, RegexSplitsN, - quote, is_match, + Regex, Match, Captures, + CaptureNames, Matches, CaptureMatches, SubCaptureMatches, + Replacer, NoExpand, Split, SplitN, + escape, }; /** @@ -480,11 +500,8 @@ top-level of this crate. There are two important differences: 1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec` is used where `String` would have been used. -2. Regular expressions are compiled with Unicode support *disabled* by -default. This means that while Unicode regular expressions can only match valid -UTF-8, regular expressions in this module can match arbitrary bytes. Unicode -support can be selectively enabled via the `u` flag in regular expressions -provided by this sub-module. +2. Unicode support can be disabled even when disabling it would result in +matching invalid UTF-8 bytes. # Example: match null terminated string @@ -492,14 +509,14 @@ This shows how to find all null-terminated strings in a slice of bytes: ```rust # use regex::bytes::Regex; -let re = Regex::new(r"(?P[^\x00]+)\x00").unwrap(); +let re = Regex::new(r"(?-u)(?P[^\x00]+)\x00").unwrap(); let text = b"foo\x00bar\x00baz\x00"; // Extract all of the strings without the null terminator from each match. 
// The unwrap is OK here since a match requires the `cstr` capture to match. let cstrs: Vec<&[u8]> = re.captures_iter(text) - .map(|c| c.name("cstr").unwrap()) + .map(|c| c.name("cstr").unwrap().as_bytes()) .collect(); assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs); ``` @@ -512,17 +529,20 @@ string (e.g., to extract a title from a Matroska file): ```rust # use std::str; # use regex::bytes::Regex; -let re = Regex::new(r"\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))").unwrap(); +let re = Regex::new( + r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))" +).unwrap(); let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65"; let caps = re.captures(text).unwrap(); // Notice that despite the `.*` at the end, it will only match valid UTF-8 // because Unicode mode was enabled with the `u` flag. Without the `u` flag, // the `.*` would match the rest of the bytes. -assert_eq!((7, 10), caps.pos(1).unwrap()); +let mat = caps.get(1).unwrap(); +assert_eq!((7, 10), (mat.start(), mat.end())); // If there was a match, Unicode mode guarantees that `title` is valid UTF-8. -let title = str::from_utf8(caps.at(1).unwrap()).unwrap(); +let title = str::from_utf8(&caps[1]).unwrap(); assert_eq!("☃", title); ``` @@ -536,11 +556,11 @@ The supported syntax is pretty much the same as the syntax for Unicode regular expressions with a few changes that make sense for matching arbitrary bytes: -1. The `u` flag is *disabled* by default, but can be selectively enabled. (The -opposite is true for the main `Regex` type.) Disabling the `u` flag is said to -invoke "ASCII compatible" mode. -2. In ASCII compatible mode, neither Unicode codepoints nor Unicode character -classes are allowed. +1. The `u` flag can be disabled even when disabling it might cause the regex to +match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in +"ASCII compatible" mode. +2. In ASCII compatible mode, neither Unicode scalar values nor Unicode +character classes are allowed. 3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps to `[[:digit:]]` and `\s` maps to `[[:space:]]`. @@ -550,8 +570,8 @@ determine whether a byte is a word byte or not. Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation. -6. `.` matches any *byte* except for `\n` instead of any codepoint. When the -`s` flag is enabled, `.` matches any byte. +6. `.` matches any *byte* except for `\n` instead of any Unicode scalar value. +When the `s` flag is enabled, `.` matches any byte. # Performance @@ -560,8 +580,10 @@ performance on `&str`. 
*/ pub mod bytes { pub use re_builder::bytes::*; - pub use re_set::bytes::*; + pub use re_builder::set_bytes::*; pub use re_bytes::*; + pub use re_set::bytes::*; + pub use re_trait::Locations; } mod backtrack; @@ -603,6 +625,4 @@ pub mod internal { pub use prog::{Program, Inst, EmptyLook, InstRanges}; pub use re_plugin::Plugin; pub use re_unicode::_Regex; - pub use re_trait::RegularExpression; - pub use re_builder::RegexOptions; } diff --git a/src/pattern.rs b/src/pattern.rs index 3de377ad07..37183c24e3 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -1,17 +1,14 @@ -#[cfg(feature = "pattern")] use std::str::pattern::{Pattern, Searcher, SearchStep}; -use re_unicode::{Regex, FindMatches}; +use re_unicode::{Regex, Matches}; -#[cfg(feature = "pattern")] pub struct RegexSearcher<'r, 't> { haystack: &'t str, - it: FindMatches<'r, 't>, + it: Matches<'r, 't>, last_step_end: usize, next_match: Option<(usize, usize)>, } -#[cfg(feature = "pattern")] impl<'r, 't> Pattern<'t> for &'r Regex { type Searcher = RegexSearcher<'r, 't>; @@ -25,7 +22,6 @@ impl<'r, 't> Pattern<'t> for &'r Regex { } } -#[cfg(feature = "pattern")] unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> { #[inline] fn haystack(&self) -> &'t str { @@ -49,7 +45,8 @@ unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> { SearchStep::Done } } - Some((s, e)) => { + Some(m) => { + let (s, e) = (m.start(), m.end()); if s == self.last_step_end { self.last_step_end = e; SearchStep::Match(s, e) diff --git a/src/re_builder.rs b/src/re_builder.rs index a5ea341125..3849c892d6 100644 --- a/src/re_builder.rs +++ b/src/re_builder.rs @@ -40,7 +40,7 @@ impl Default for RegexOptions { } macro_rules! define_builder { - ($name:ident, $regex_mod:ident, $unicode:expr, $only_utf8:expr) => { + ($name:ident, $regex_mod:ident, $only_utf8:expr) => { pub mod $name { use error::Error; use exec::ExecBuilder; @@ -63,7 +63,6 @@ impl RegexBuilder { pub fn new(pattern: &str) -> RegexBuilder { let mut builder = RegexBuilder(RegexOptions::default()); builder.0.pats.push(pattern.to_owned()); - builder.0.unicode = $unicode; builder } @@ -72,21 +71,21 @@ impl RegexBuilder { /// Note that calling `as_str` on the resulting `Regex` will produce the /// pattern given to `new` verbatim. Notably, it will not incorporate any /// of the flags set on this builder. - pub fn compile(self) -> Result { - ExecBuilder::new_options(self.0) + pub fn build(&self) -> Result { + ExecBuilder::new_options(self.0.clone()) .only_utf8($only_utf8) .build() .map(Regex::from) } /// Set the value for the case insensitive (`i`) flag. - pub fn case_insensitive(mut self, yes: bool) -> RegexBuilder { + pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { self.0.case_insensitive = yes; self } /// Set the value for the multi-line matching (`m`) flag. - pub fn multi_line(mut self, yes: bool) -> RegexBuilder { + pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { self.0.multi_line = yes; self } @@ -96,21 +95,21 @@ impl RegexBuilder { /// it is not set (the default). /// /// N.B. "matches anything" means "any byte" for `regex::bytes::Regex` - /// expressions and means "any Unicode codepoint" for `regex::Regex` + /// expressions and means "any Unicode scalar value" for `regex::Regex` /// expressions. - pub fn dot_matches_new_line(mut self, yes: bool) -> RegexBuilder { + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder { self.0.dot_matches_new_line = yes; self } /// Set the value for the greedy swap (`U`) flag. 
- pub fn swap_greed(mut self, yes: bool) -> RegexBuilder { + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { self.0.swap_greed = yes; self } /// Set the value for the ignore whitespace (`x`) flag. - pub fn ignore_whitespace(mut self, yes: bool) -> RegexBuilder { + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { self.0.ignore_whitespace = yes; self } @@ -118,7 +117,7 @@ impl RegexBuilder { /// Set the value for the Unicode (`u`) flag. /// /// For byte based regular expressions, this is disabled by default. - pub fn unicode(mut self, yes: bool) -> RegexBuilder { + pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { self.0.unicode = yes; self } @@ -128,7 +127,7 @@ impl RegexBuilder { /// This roughly corresponds to the number of bytes occupied by a single /// compiled program. If the program exceeds this number, then a /// compilation error is returned. - pub fn size_limit(mut self, limit: usize) -> RegexBuilder { + pub fn size_limit(&mut self, limit: usize) -> &mut RegexBuilder { self.0.size_limit = limit; self } @@ -142,7 +141,7 @@ impl RegexBuilder { /// limit. In particular, if a regex is used from multiple threads /// simulanteously, then each thread may use up to the number of bytes /// specified here. - pub fn dfa_size_limit(mut self, limit: usize) -> RegexBuilder { + pub fn dfa_size_limit(&mut self, limit: usize) -> &mut RegexBuilder { self.0.dfa_size_limit = limit; self } @@ -151,5 +150,118 @@ impl RegexBuilder { } } -define_builder!(bytes, re_bytes, false, false); -define_builder!(unicode, re_unicode, true, true); +define_builder!(bytes, re_bytes, false); +define_builder!(unicode, re_unicode, true); + +macro_rules! define_set_builder { + ($name:ident, $regex_mod:ident, $only_utf8:expr) => { + pub mod $name { + use error::Error; + use exec::ExecBuilder; + use super::RegexOptions; + + use re_set::$regex_mod::RegexSet; + +/// A configurable builder for a set of regular expressions. +/// +/// A builder can be used to configure how the regexes are built, for example, +/// by setting the default flags (which can be overridden in the expression +/// itself) or setting various limits. +pub struct RegexSetBuilder(RegexOptions); + +impl RegexSetBuilder { + /// Create a new regular expression builder with the given pattern. + /// + /// If the pattern is invalid, then an error will be returned when + /// `compile` is called. + pub fn new(patterns: I) -> RegexSetBuilder + where S: AsRef, I: IntoIterator { + let mut builder = RegexSetBuilder(RegexOptions::default()); + for pat in patterns { + builder.0.pats.push(pat.as_ref().to_owned()); + } + builder + } + + /// Consume the builder and compile the regular expressions into a set. + pub fn build(&self) -> Result { + ExecBuilder::new_options(self.0.clone()) + .only_utf8($only_utf8) + .build() + .map(RegexSet::from) + } + + /// Set the value for the case insensitive (`i`) flag. + pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.0.case_insensitive = yes; + self + } + + /// Set the value for the multi-line matching (`m`) flag. + pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.0.multi_line = yes; + self + } + + /// Set the value for the any character (`s`) flag, where in `.` matches + /// anything when `s` is set and matches anything except for new line when + /// it is not set (the default). + /// + /// N.B. 
"matches anything" means "any byte" for `regex::bytes::RegexSet` + /// expressions and means "any Unicode scalar value" for `regex::RegexSet` + /// expressions. + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.0.dot_matches_new_line = yes; + self + } + + /// Set the value for the greedy swap (`U`) flag. + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.0.swap_greed = yes; + self + } + + /// Set the value for the ignore whitespace (`x`) flag. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.0.ignore_whitespace = yes; + self + } + + /// Set the value for the Unicode (`u`) flag. + /// + /// For byte based regular expressions, this is disabled by default. + pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.0.unicode = yes; + self + } + + /// Set the approximate size limit of the compiled regular expression. + /// + /// This roughly corresponds to the number of bytes occupied by a single + /// compiled program. If the program exceeds this number, then a + /// compilation error is returned. + pub fn size_limit(&mut self, limit: usize) -> &mut RegexSetBuilder { + self.0.size_limit = limit; + self + } + + /// Set the approximate size of the cache used by the DFA. + /// + /// This roughly corresponds to the number of bytes that the DFA will + /// use while searching. + /// + /// Note that this is a *per thread* limit. There is no way to set a global + /// limit. In particular, if a regex is used from multiple threads + /// simulanteously, then each thread may use up to the number of bytes + /// specified here. + pub fn dfa_size_limit(&mut self, limit: usize) -> &mut RegexSetBuilder { + self.0.dfa_size_limit = limit; + self + } +} + } + } +} + +define_set_builder!(set_bytes, bytes, false); +define_set_builder!(set_unicode, unicode, true); diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 97ac5b923a..ffc659cab6 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -10,7 +10,6 @@ use std::borrow::Cow; use std::collections::HashMap; -use std::collections::hash_map; use std::fmt; use std::ops::Index; use std::str::FromStr; @@ -19,10 +18,50 @@ use std::sync::Arc; use memchr::memchr; use exec::{Exec, ExecNoSync}; -use expand::expand; +use expand::expand_bytes; use error::Error; use re_builder::bytes::RegexBuilder; -use re_trait::{self, RegularExpression, Slot}; +use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter}; + +/// Match represents a single match of a regex in a haystack. +/// +/// The lifetime parameter `'t` refers to the lifetime of the matched text. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct Match<'t> { + text: &'t [u8], + start: usize, + end: usize, +} + +impl<'t> Match<'t> { + /// Returns the starting byte offset of the match in the haystack. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the haystack. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns the matched text. + #[inline] + pub fn as_bytes(&self) -> &'t [u8] { + &self.text[self.start..self.end] + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> { + Match { + text: haystack, + start: start, + end: end, + } + } +} /// A compiled regular expression for matching arbitrary bytes. /// @@ -71,22 +110,14 @@ impl FromStr for Regex { } } +/// Core regular expression methods. 
impl Regex { /// Compiles a regular expression. Once compiled, it can be used repeatedly /// to search, split or replace text in a string. /// /// If an invalid expression is given, then an error is returned. pub fn new(re: &str) -> Result { - Regex::with_size_limit(10 * (1 << 20), re) - } - - /// Compiles a regular expression with the given size limit. - /// - /// The size limit is applied to the size of the *compiled* data structure. - /// If the data structure exceeds the size given, then an error is - /// returned. - pub fn with_size_limit(size: usize, re: &str) -> Result { - RegexBuilder::new(re).size_limit(size).compile() + RegexBuilder::new(re).build() } /// Returns true if and only if the regex matches the string given. @@ -111,17 +142,6 @@ impl Regex { self.is_match_at(text, 0) } - /// Returns the same as is_match, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn is_match_at(&self, text: &[u8], start: usize) -> bool { - self.shortest_match_at(text, start).is_some() - } - /// Returns the start and end byte range of the leftmost-first match in /// `text`. If no match exists, then `None` is returned. /// @@ -138,29 +158,14 @@ impl Regex { /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let text = b"I categorically deny having triskaidekaphobia."; - /// let pos = Regex::new(r"\b\w{13}\b").unwrap().find(text); - /// assert_eq!(pos, Some((2, 15))); + /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); + /// assert_eq!((mat.start(), mat.end()), (2, 15)); /// # } /// ``` - pub fn find(&self, text: &[u8]) -> Option<(usize, usize)> { + pub fn find<'t>(&self, text: &'t [u8]) -> Option> { self.find_at(text, 0) } - /// Returns the same as find, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn find_at( - &self, - text: &[u8], - start: usize, - ) -> Option<(usize, usize)> { - self.0.searcher().find_at(text, start) - } - /// Returns an iterator for each successive non-overlapping match in /// `text`, returning the start and end byte indices with respect to /// `text`. @@ -174,27 +179,22 @@ impl Regex { /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let text = b"Retroactively relinquishing remunerations is reprehensible."; - /// for pos in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { - /// println!("{:?}", pos); + /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { + /// println!("{:?}", mat); /// } - /// // Output: - /// // (0, 13) - /// // (14, 27) - /// // (28, 41) - /// // (45, 58) /// # } /// ``` - pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> FindMatches<'r, 't> { - FindMatches(self.0.searcher().find_iter(text)) + pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> Matches<'r, 't> { + Matches(self.0.searcher().find_iter(text)) } /// Returns the capture groups corresponding to the leftmost-first /// match in `text`. Capture group `0` always corresponds to the entire /// match. If no match is found, then `None` is returned. /// - /// You should only use `captures` if you need access to submatches. 
- /// Otherwise, `find` is faster for discovering the location of the overall - /// match. + /// You should only use `captures` if you need access to the location of + /// capturing group matches. Otherwise, `find` is faster for discovering + /// the location of the overall match. /// /// # Examples /// @@ -209,9 +209,9 @@ impl Regex { /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.at(1), Some(&b"Citizen Kane"[..])); - /// assert_eq!(caps.at(2), Some(&b"1941"[..])); - /// assert_eq!(caps.at(0), Some(&b"'Citizen Kane' (1941)"[..])); + /// assert_eq!(&caps[1], &b"Citizen Kane"[..]); + /// assert_eq!(&caps[2], &b"1941"[..]); + /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); /// // You can also access the groups by index using the Index notation. /// // Note that this will panic on an invalid index. /// assert_eq!(&caps[1], b"Citizen Kane"); @@ -232,9 +232,9 @@ impl Regex { /// .unwrap(); /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.name("title"), Some(&b"Citizen Kane"[..])); - /// assert_eq!(caps.name("year"), Some(&b"1941"[..])); - /// assert_eq!(caps.at(0), Some(&b"'Citizen Kane' (1941)"[..])); + /// assert_eq!(&caps["title"], &b"Citizen Kane"[..]); + /// assert_eq!(&caps["year"], &b"1941"[..]); + /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); /// // You can also access the groups by name using the Index notation. /// // Note that this will panic on an invalid group name. /// assert_eq!(&caps["title"], b"Citizen Kane"); @@ -252,33 +252,17 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `at(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t [u8]) -> Option> { - let mut slots = vec![None; 2 * self.captures_len()]; - self.read_captures_at(&mut slots, text, 0).map(|_| Captures { + let mut locs = self.locations(); + self.read_captures_at(&mut locs, text, 0).map(|_| Captures { text: text, - slots: slots, + locs: locs, named_groups: self.0.capture_name_idx().clone(), }) } - /// Returns the same as captures, but starts the search at the given - /// offset and populates the capture locations given. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn read_captures_at( - &self, - slots: &mut [Slot], - text: &[u8], - start: usize, - ) -> Option<(usize, usize)> { - self.0.searcher().read_captures_at(slots, text, start) - } - /// Returns an iterator over all the non-overlapping capture groups matched /// in `text`. This is operationally the same as `find_iter`, except it - /// yields information about submatches. + /// yields information about capturing group matches. 
/// /// # Example /// @@ -305,8 +289,8 @@ impl Regex { pub fn captures_iter<'r, 't>( &'r self, text: &'t [u8], - ) -> FindCaptures<'r, 't> { - FindCaptures(self.0.searcher().captures_iter(text)) + ) -> CaptureMatches<'r, 't> { + CaptureMatches(self.0.searcher().captures_iter(text)) } /// Returns an iterator of substrings of `text` delimited by a match of the @@ -329,8 +313,8 @@ impl Regex { /// ]); /// # } /// ``` - pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Splits<'r, 't> { - Splits { + pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> { + Split { finder: self.find_iter(text), last: 0, } @@ -360,8 +344,8 @@ impl Regex { &'r self, text: &'t [u8], limit: usize, - ) -> SplitsN<'r, 't> { - SplitsN { + ) -> SplitN<'r, 't> { + SplitN { splits: self.split(text), n: limit, } @@ -375,6 +359,25 @@ impl Regex { /// If no match is found, then a copy of the byte string is returned /// unchanged. /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text is replaced with the + /// corresponding capture group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// /// # Examples /// /// Note that this function is polymorphic with respect to the replacement. @@ -384,14 +387,14 @@ impl Regex { /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let re = Regex::new("[^01]+").unwrap(); - /// assert_eq!(re.replace(b"1078910", &b""[..]), b"1010"); + /// assert_eq!(re.replace(b"1078910", &b""[..]), &b"1010"[..]); /// # } /// ``` /// /// But anything satisfying the `Replacer` trait will work. For example, a /// closure of type `|&Captures| -> Vec` provides direct access to the - /// captures corresponding to a match. This allows one to access submatches - /// easily: + /// captures corresponding to a match. This allows one to access capturing + /// group matches easily: /// /// ```rust /// # extern crate regex; use regex::bytes::Regex; @@ -403,7 +406,7 @@ impl Regex { /// replacement.extend(&caps[1]); /// replacement /// }); - /// assert_eq!(result, b"Bruce Springsteen"); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); /// # } /// ``` /// @@ -417,7 +420,7 @@ impl Regex { /// # fn main() { /// let re = Regex::new(r"(?P[^,\s]+),\s+(?P\S+)").unwrap(); /// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]); - /// assert_eq!(result, b"Bruce Springsteen"); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); /// # } /// ``` /// @@ -432,8 +435,8 @@ impl Regex { /// precise control over the name, use braces, e.g., `${1}a`. /// /// Finally, sometimes you just want to replace a literal string with no - /// submatch expansion. This can be done by wrapping a byte string with - /// `NoExpand`: + /// regard for capturing group expansion. 
This can be done by wrapping a + /// byte string with `NoExpand`: /// /// ```rust /// # extern crate regex; use regex::bytes::Regex; @@ -442,10 +445,14 @@ impl Regex { /// /// let re = Regex::new(r"(?P[^,\s]+),\s+(\S+)").unwrap(); /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last")); - /// assert_eq!(result, b"$2 $last"); + /// assert_eq!(result, &b"$2 $last"[..]); /// # } /// ``` - pub fn replace(&self, text: &[u8], rep: R) -> Vec { + pub fn replace<'t, R: Replacer>( + &self, + text: &'t [u8], + rep: R, + ) -> Cow<'t, [u8]> { self.replacen(text, 1, rep) } @@ -454,8 +461,12 @@ impl Regex { /// `0`. /// /// See the documentation for `replace` for details on how to access - /// submatches in the replacement text. - pub fn replace_all(&self, text: &[u8], rep: R) -> Vec { + /// capturing group matches in the replacement text. + pub fn replace_all<'t, R: Replacer>( + &self, + text: &'t [u8], + rep: R, + ) -> Cow<'t, [u8]> { self.replacen(text, 0, rep) } @@ -464,46 +475,56 @@ impl Regex { /// are replaced. /// /// See the documentation for `replace` for details on how to access - /// submatches in the replacement text. - pub fn replacen( + /// capturing group matches in the replacement text. + pub fn replacen<'t, R: Replacer>( &self, - text: &[u8], + text: &'t [u8], limit: usize, mut rep: R, - ) -> Vec { + ) -> Cow<'t, [u8]> { if let Some(rep) = rep.no_expansion() { let mut new = Vec::with_capacity(text.len()); let mut last_match = 0; - for (i, (s, e)) in self.find_iter(text).enumerate() { + for (i, m) in self.find_iter(text).enumerate() { if limit > 0 && i >= limit { break } - extend_from_slice(&mut new, &text[last_match..s]); - extend_from_slice(&mut new, &*rep); - last_match = e; + new.extend_from_slice(&text[last_match..m.start()]); + new.extend_from_slice(&rep); + last_match = m.end(); + } + if new.is_empty() { + return Cow::Borrowed(text); } - extend_from_slice(&mut new, &text[last_match..]); - return new; + new.extend_from_slice(&text[last_match..]); + return Cow::Owned(new); } // The slower path, which we use if the replacement needs access to // capture groups. + let mut it = self.captures_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } let mut new = Vec::with_capacity(text.len()); let mut last_match = 0; - for (i, cap) in self.captures_iter(text).enumerate() { + for (i, cap) in it { if limit > 0 && i >= limit { break } // unwrap on 0 is OK because captures only reports matches - let (s, e) = cap.pos(0).unwrap(); - extend_from_slice(&mut new, &text[last_match..s]); + let m = cap.get(0).unwrap(); + new.extend_from_slice(&text[last_match..m.start()]); rep.replace_append(&cap, &mut new); - last_match = e; + last_match = m.end(); } - extend_from_slice(&mut new, &text[last_match..]); - new + new.extend_from_slice(&text[last_match..]); + Cow::Owned(new) } +} +/// Advanced or "lower level" search methods. +impl Regex { /// Returns the end location of a match in the text given. /// /// This method may have the same performance characteristics as @@ -544,6 +565,53 @@ impl Regex { self.0.searcher().shortest_match_at(text, start) } + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. 
+ #[doc(hidden)] + pub fn is_match_at(&self, text: &[u8], start: usize) -> bool { + self.shortest_match_at(text, start).is_some() + } + + /// Returns the same as find, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn find_at<'t>( + &self, + text: &'t [u8], + start: usize, + ) -> Option> { + self.0.searcher().find_at(text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// Returns the same as captures, but starts the search at the given + /// offset and populates the capture locations given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn read_captures_at<'t>( + &self, + locs: &mut Locations, + text: &'t [u8], + start: usize, + ) -> Option> { + self.0.searcher().read_captures_at(locs, text, start) + .map(|(s, e)| Match::new(text, s, e)) + } +} + +/// Auxiliary methods. +impl Regex { /// Returns the original string of this regex. pub fn as_str(&self) -> &str { &self.0.regex_strings()[0] @@ -558,6 +626,13 @@ impl Regex { pub fn captures_len(&self) -> usize { self.0.capture_names().len() } + + /// Returns an empty set of locations that can be reused in multiple calls + /// to `read_captures`. + #[doc(hidden)] + pub fn locations(&self) -> Locations { + self.0.searcher().locations() + } } /// An iterator over all non-overlapping matches for a particular string. @@ -568,13 +643,14 @@ impl Regex { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched byte string. -pub struct FindMatches<'r, 't>(re_trait::FindMatches<'t, ExecNoSync<'r>>); +pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSync<'r>>); -impl<'r, 't> Iterator for FindMatches<'r, 't> { - type Item = (usize, usize); +impl<'r, 't> Iterator for Matches<'r, 't> { + type Item = Match<'t>; - fn next(&mut self) -> Option<(usize, usize)> { - self.0.next() + fn next(&mut self) -> Option> { + let text = self.0.text(); + self.0.next().map(|(s, e)| Match::new(text, s, e)) } } @@ -585,15 +661,15 @@ impl<'r, 't> Iterator for FindMatches<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched byte string. -pub struct FindCaptures<'r, 't>(re_trait::FindCaptures<'t, ExecNoSync<'r>>); +pub struct CaptureMatches<'r, 't>(re_trait::CaptureMatches<'t, ExecNoSync<'r>>); -impl<'r, 't> Iterator for FindCaptures<'r, 't> { +impl<'r, 't> Iterator for CaptureMatches<'r, 't> { type Item = Captures<'t>; fn next(&mut self) -> Option> { - self.0.next().map(|slots| Captures { + self.0.next().map(|locs| Captures { text: self.0.text(), - slots: slots, + locs: locs, named_groups: self.0.regex().capture_name_idx().clone(), }) } @@ -603,12 +679,12 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the byte string being split. 
-pub struct Splits<'r, 't> { - finder: FindMatches<'r, 't>, +pub struct Split<'r, 't> { + finder: Matches<'r, 't>, last: usize, } -impl<'r, 't> Iterator for Splits<'r, 't> { +impl<'r, 't> Iterator for Split<'r, 't> { type Item = &'t [u8]; fn next(&mut self) -> Option<&'t [u8]> { @@ -623,9 +699,9 @@ impl<'r, 't> Iterator for Splits<'r, 't> { Some(s) } } - Some((s, e)) => { - let matched = &text[self.last..s]; - self.last = e; + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.end(); Some(matched) } } @@ -638,12 +714,12 @@ impl<'r, 't> Iterator for Splits<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the byte string being split. -pub struct SplitsN<'r, 't> { - splits: Splits<'r, 't>, +pub struct SplitN<'r, 't> { + splits: Split<'r, 't>, n: usize, } -impl<'r, 't> Iterator for SplitsN<'r, 't> { +impl<'r, 't> Iterator for SplitN<'r, 't> { type Item = &'t [u8]; fn next(&mut self) -> Option<&'t [u8]> { @@ -694,60 +770,33 @@ impl<'r> Iterator for CaptureNames<'r> { /// `'t` is the lifetime of the matched text. pub struct Captures<'t> { text: &'t [u8], - slots: Vec>, + locs: Locations, named_groups: Arc>, } impl<'t> Captures<'t> { - /// Returns the start and end positions of the Nth capture group. Returns - /// `None` if `i` is not a valid capture group or if the capture group did - /// not match anything. The positions returned are *always* byte indices - /// with respect to the original byte string matched. - pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - let (s, e) = (i * 2, i * 2 + 1); - match (self.slots.get(s), self.slots.get(e)) { - (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), - _ => None, - } - } - - /// Returns the matched string for the capture group `i`. If `i` isn't - /// a valid capture group or didn't match anything, then `None` is - /// returned. - pub fn at(&self, i: usize) -> Option<&'t [u8]> { - match self.pos(i) { - None => None, - Some((s, e)) => Some(&self.text[s..e]) - } - } - - /// Returns the matched string for the capture group named `name`. If - /// `name` isn't a valid capture group or didn't match anything, then - /// `None` is returned. - pub fn name(&self, name: &str) -> Option<&'t [u8]> { - self.named_groups.get(name).and_then(|&i| self.at(i)) + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. + pub fn get(&self, i: usize) -> Option> { + self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) } - /// Creates an iterator of all the capture groups in order of appearance - /// in the regular expression. - pub fn iter<'a>(&'a self) -> SubCaptures<'a, 't> { - SubCaptures { idx: 0, caps: self } + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. + pub fn name(&self, name: &str) -> Option> { + self.named_groups.get(name).and_then(|&i| self.get(i)) } - /// Creates an iterator of all the capture group positions in order of - /// appearance in the regular expression. Positions are byte indices - /// in terms of the original string matched. - pub fn iter_pos(&'t self) -> SubCapturesPos<'t> { - SubCapturesPos { idx: 0, slots: &self.slots } - } - - /// Creates an iterator of all named groups as an tuple with the group - /// name and the value. The iterator returns these values in arbitrary - /// order. 
- pub fn iter_named<'a>(&'a self) -> SubCapturesNamed<'a, 't> { - SubCapturesNamed { + /// An iterator that yields all capturing matches in the order in which + /// they appear in the regex. If a particular capture group didn't + /// participate in the match, then `None` is yielded for that capture. + /// + /// The first match always corresponds to the overall match of the regex. + pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> { + SubCaptureMatches { caps: self, - names: self.named_groups.iter() + it: self.locs.iter(), } } @@ -768,19 +817,16 @@ impl<'t> Captures<'t> { /// /// To write a literal `$` use `$$`. pub fn expand(&self, replacement: &[u8], dst: &mut Vec) { - expand(self, replacement, dst) + expand_bytes(self, replacement, dst) } /// Returns the number of captured groups. + /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. #[inline] pub fn len(&self) -> usize { - self.slots.len() / 2 - } - - /// Returns true if and only if there are no captured groups. - #[inline] - pub fn is_empty(&self) -> bool { - self.len() == 0 + self.locs.len() } } @@ -814,7 +860,7 @@ impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { let slot_to_name: HashMap<&usize, &String> = self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); let mut map = f.debug_map(); - for (slot, m) in self.0.iter_pos().enumerate() { + for (slot, m) in self.0.locs.iter().enumerate() { let m = m.map(|(s, e)| escape_bytes(&self.0.text[s..e])); if let Some(ref name) = slot_to_name.get(&slot) { map.entry(&name, &m); @@ -841,7 +887,8 @@ impl<'t> Index for Captures<'t> { type Output = [u8]; fn index(&self, i: usize) -> &[u8] { - self.at(i).unwrap_or_else(|| panic!("no group at index '{}'", i)) + self.get(i).map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) } } @@ -861,75 +908,31 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { type Output = [u8]; fn index<'a>(&'a self, name: &'i str) -> &'a [u8] { - self.name(name).unwrap_or_else(|| panic!("no group named '{}'", name)) - } -} - -/// An iterator over capture groups for a particular match of a regular -/// expression. -/// -/// `'c` is the lifetime of the captures and `'t` is the lifetime of the -/// matched text. -pub struct SubCaptures<'c, 't: 'c> { - idx: usize, - caps: &'c Captures<'t>, -} - -impl<'c, 't> Iterator for SubCaptures<'c, 't> { - type Item = Option<&'t [u8]>; - - fn next(&mut self) -> Option> { - if self.idx < self.caps.len() { - self.idx += 1; - Some(self.caps.at(self.idx - 1)) - } else { - None - } + self.name(name).map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) } } -/// An iterator over capture group positions for a particular match of a -/// regular expression. -/// -/// Positions are byte indices in terms of the original byte string matched. +/// An iterator that yields all capturing matches in the order in which they +/// appear in the regex. /// -/// `'c` is the lifetime of the captures. -pub struct SubCapturesPos<'c> { - idx: usize, - slots: &'c [Option] -} - -impl<'c> Iterator for SubCapturesPos<'c> { - type Item = Option<(usize, usize)>; - - fn next(&mut self) -> Option> { - if self.idx >= self.slots.len() { - return None - } - let r = match (self.slots[self.idx], self.slots[self.idx + 1]) { - (Some(s), Some(e)) => Some((s, e)), - _ => None, - }; - self.idx += 2; - Some(r) - } -} - -/// An Iterator over named capture groups as a tuple with the group name and -/// the value. 
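To illustrate the new `Captures` accessors above for the `bytes` API, a sketch assuming the `get`/`name`/`len` methods and `Index` impls shown in this hunk (the date pattern is invented):

```rust
extern crate regex;

use regex::bytes::Regex;

fn main() {
    let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})").unwrap();
    let caps = re.captures(b"date: 2016-10").unwrap();

    // `at(i)` is gone; `get(i)` and `name(..)` now return `Match` values.
    assert_eq!(caps.get(0).unwrap().as_bytes(), &b"2016-10"[..]);
    assert_eq!(caps.name("y").unwrap().as_bytes(), &b"2016"[..]);

    // Indexing still yields `&[u8]` directly and panics on a missing group.
    assert_eq!(&caps["m"], &b"10"[..]);

    // `len` counts the implicit group 0, so it is always at least 1.
    assert_eq!(caps.len(), 3);
}
```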
+/// If a particular capture group didn't participate in the match, then `None` +/// is yielded for that capture. The first match always corresponds to the +/// overall match of the regex. /// -/// `'c` is the lifetime of the captures and `'t` is the lifetime of the -/// matched text. -pub struct SubCapturesNamed<'c, 't: 'c> { +/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and +/// the lifetime `'t` corresponds to the originally matched text. +pub struct SubCaptureMatches<'c, 't: 'c> { caps: &'c Captures<'t>, - names: hash_map::Iter<'c, String, usize>, + it: SubCapturesPosIter<'c>, } -impl<'c, 't> Iterator for SubCapturesNamed<'c, 't> { - type Item = (&'c str, Option<&'t [u8]>); +impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { + type Item = Option>; - fn next(&mut self) -> Option<(&'c str, Option<&'t [u8]>)> { - self.names.next().map(|(name, &pos)| (&**name, self.caps.at(pos))) + fn next(&mut self) -> Option>> { + self.it.next() + .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e))) } } @@ -946,7 +949,7 @@ pub trait Replacer { /// have a match at capture group `0`. /// /// For example, a no-op replacement would be - /// `dst.extend(caps.at(0).unwrap())`. + /// `dst.extend(&caps[0])`. fn replace_append(&mut self, caps: &Captures, dst: &mut Vec); /// Return a fixed unchanging replacement byte string. @@ -976,7 +979,7 @@ impl<'a> Replacer for &'a [u8] { impl Replacer for F where F: FnMut(&Captures) -> Vec { fn replace_append(&mut self, caps: &Captures, dst: &mut Vec) { - extend_from_slice(dst, &(*self)(caps)); + dst.extend_from_slice(&(*self)(caps)); } } @@ -992,26 +995,10 @@ pub struct NoExpand<'r>(pub &'r [u8]); impl<'a> Replacer for NoExpand<'a> { fn replace_append(&mut self, _: &Captures, dst: &mut Vec) { - extend_from_slice(dst, self.0); + dst.extend_from_slice(self.0); } fn no_expansion<'r>(&'r mut self) -> Option> { Some(Cow::Borrowed(self.0)) } } - -/// This hopefully has the same performance characteristics as -/// Vec::extend_from_slice (which was introduced in Rust 1.6), but works on -/// Rust 1.3. -/// -/// N.B. Remove this once we do a semver bump. At that point, we'll bump -/// required Rust version to at least 1.6. -fn extend_from_slice(dst: &mut Vec, src: &[u8]) { - dst.reserve(src.len()); - let dst_len = dst.len(); - unsafe { dst.set_len(dst_len + src.len()); } - let mut dst = &mut dst[dst_len..dst_len + src.len()]; - for i in 0..src.len() { - dst[i] = src[i]; - } -} diff --git a/src/re_plugin.rs b/src/re_plugin.rs index d453ef7e7e..afd828921b 100644 --- a/src/re_plugin.rs +++ b/src/re_plugin.rs @@ -8,7 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use re_trait::{RegularExpression, Slot}; +use re_trait::{RegularExpression, Slot, Locations, as_slots}; /// Plugin is the compiler plugin's data structure. 
It declare some static /// data (like capture groups and the original regex string), but defines its @@ -67,15 +67,20 @@ impl RegularExpression for Plugin { fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { let mut slots = [None, None]; - self.read_captures_at(&mut slots, text, start) + (self.prog)(&mut slots, text, start); + match (slots[0], slots[1]) { + (Some(s), Some(e)) => Some((s, e)), + _ => None, + } } fn read_captures_at<'t>( &self, - slots: &mut [Slot], + locs: &mut Locations, text: &'t str, start: usize, ) -> Option<(usize, usize)> { + let slots = as_slots(locs); for slot in slots.iter_mut() { *slot = None; } diff --git a/src/re_set.rs b/src/re_set.rs index c9ad443637..89a7a80828 100644 --- a/src/re_set.rs +++ b/src/re_set.rs @@ -9,7 +9,7 @@ // except according to those terms. macro_rules! define_set { - ($name:ident, $exec_build:expr, $text_ty:ty, $as_bytes:expr, + ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr, $(#[$doc_regexset_example:meta])* ) => { pub mod $name { use std::fmt; @@ -18,7 +18,8 @@ macro_rules! define_set { use std::vec; use error::Error; - use exec::{Exec, ExecBuilder}; + use exec::Exec; + use re_builder::$builder_mod::RegexSetBuilder; use re_trait::RegularExpression; /// Match multiple (possibly overlapping) regular expressions in a single scan. @@ -102,8 +103,7 @@ impl RegexSet { /// ``` pub fn new(exprs: I) -> Result where S: AsRef, I: IntoIterator { - let exec = try!($exec_build(exprs)); - Ok(RegexSet(exec)) + RegexSetBuilder::new(exprs).build() } /// Returns true if and only if one of the regexes in this set matches @@ -131,7 +131,18 @@ impl RegexSet { /// assert!(!set.is_match("☃")); /// ``` pub fn is_match(&self, text: $text_ty) -> bool { - self.0.searcher().is_match_at($as_bytes(text), 0) + self.is_match_at(text, 0) + } + + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool { + self.0.searcher().is_match_at($as_bytes(text), start) } /// Returns the set of regular expressions that match in the given text. @@ -172,14 +183,35 @@ impl RegexSet { /// ``` pub fn matches(&self, text: $text_ty) -> SetMatches { let mut matches = vec![false; self.0.regex_strings().len()]; - let any = self.0.searcher().many_matches_at( - &mut matches, $as_bytes(text), 0); + let any = self.read_matches_at(&mut matches, text, 0); SetMatches { matched_any: any, matches: matches, } } + /// Returns the same as matches, but starts the search at the given + /// offset and stores the matches into the slice given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// `matches` must have a length that is at least the number of regexes + /// in this set. + /// + /// This method returns true if and only if at least one member of + /// `matches` is true after executing the set against `text`. + #[doc(hidden)] + pub fn read_matches_at( + &self, + matches: &mut [bool], + text: $text_ty, + start: usize, + ) -> bool { + self.0.searcher().many_matches_at(matches, $as_bytes(text), start) + } + /// Returns the total number of regular expressions in this set. 
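A usage sketch for the `RegexSet` changes above (builder-backed construction plus offset-aware searching): `new`, `is_match`, `matches` and `len` appear in this hunk, while the `iter` method on `SetMatches` is assumed from the wider 0.2 API and is not shown here.

```rust
extern crate regex;

use regex::RegexSet;

fn main() {
    let set = RegexSet::new(&[r"\bfoo\b", r"\d+"]).unwrap();

    assert!(set.is_match("foo 123"));
    assert_eq!(set.len(), 2);

    // Indices of the patterns that matched, in order of appearance.
    let matched: Vec<usize> = set.matches("foo 123").iter().collect();
    assert_eq!(matched, vec![0, 1]);
}
```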
pub fn len(&self) -> usize { self.0.regex_strings().len() @@ -322,7 +354,7 @@ impl fmt::Debug for RegexSet { define_set! { unicode, - |exprs| ExecBuilder::new_many(exprs).build(), + set_unicode, &str, as_bytes_str, /// ```rust @@ -351,7 +383,7 @@ define_set! { define_set! { bytes, - |exprs| ExecBuilder::new_many(exprs).only_utf8(false).build(), + set_bytes, &[u8], as_bytes_bytes, /// ```rust diff --git a/src/re_trait.rs b/src/re_trait.rs index 1841efb6a8..9f3407c98b 100644 --- a/src/re_trait.rs +++ b/src/re_trait.rs @@ -13,6 +13,77 @@ /// of the capture). pub type Slot = Option; +/// Locations represents the offsets of each capturing group in a regex for +/// a single match. +/// +/// Unlike `Captures`, a `Locations` value only stores offsets. +#[doc(hidden)] +pub struct Locations(Vec); + +impl Locations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + let (s, e) = (i * 2, i * 2 + 1); + match (self.0.get(s), self.0.get(e)) { + (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), + _ => None, + } + } + + /// Creates an iterator of all the capture group positions in order of + /// appearance in the regular expression. Positions are byte indices + /// in terms of the original string matched. + pub fn iter(&self) -> SubCapturesPosIter { + SubCapturesPosIter { idx: 0, locs: &self } + } + + /// Returns the total number of capturing groups. + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + pub fn len(&self) -> usize { + self.0.len() / 2 + } +} + +/// This is a hack to make Locations -> &mut [Slot] be available internally +/// without exposing it in the public API. +pub fn as_slots(locs: &mut Locations) -> &mut [Slot] { + &mut locs.0 +} + +/// An iterator over capture group positions for a particular match of a +/// regular expression. +/// +/// Positions are byte indices in terms of the original string matched. +/// +/// `'c` is the lifetime of the captures. +pub struct SubCapturesPosIter<'c> { + idx: usize, + locs: &'c Locations, +} + +impl<'c> Iterator for SubCapturesPosIter<'c> { + type Item = Option<(usize, usize)>; + + fn next(&mut self) -> Option> { + if self.idx >= self.locs.len() { + return None; + } + let x = match self.locs.pos(self.idx) { + None => Some(None), + Some((s, e)) => { + Some(Some((s, e))) + } + }; + self.idx += 1; + x + } +} + /// RegularExpression describes types that can implement regex searching. /// /// This trait is my attempt at reducing code duplication and to standardize @@ -33,6 +104,11 @@ pub trait RegularExpression: Sized { /// always two times the number of capture groups (two slots per group). fn slots_len(&self) -> usize; + /// Allocates fresh space for all capturing groups in this regex. + fn locations(&self) -> Locations { + Locations(vec![None; self.slots_len()]) + } + /// Returns the position of the next character after `i`. /// /// For example, a haystack with type `&[u8]` probably returns `i+1`, @@ -65,7 +141,7 @@ pub trait RegularExpression: Sized { /// fills in any matching capture slot locations. 
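The new `Locations` type above is a thin wrapper over the old slot vector: two `Option<usize>` slots per group, start then end. This standalone sketch (plain `std` code, not part of the crate) mirrors the `pos` and `len` logic to make the layout concrete.

```rust
/// Mirror of `Locations::pos`: slot `2*i` holds the start of group `i`,
/// slot `2*i + 1` holds its end; `None` means the group didn't participate.
fn pos(slots: &[Option<usize>], i: usize) -> Option<(usize, usize)> {
    match (slots.get(i * 2), slots.get(i * 2 + 1)) {
        (Some(&Some(s)), Some(&Some(e))) => Some((s, e)),
        _ => None,
    }
}

fn main() {
    // Group 0 spans bytes 0..5; group 1 did not participate in the match.
    let slots = vec![Some(0), Some(5), None, None];
    assert_eq!(pos(&slots, 0), Some((0, 5)));
    assert_eq!(pos(&slots, 1), None);
    assert_eq!(slots.len() / 2, 2); // like `Locations::len`: groups, not slots
}
```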
fn read_captures_at( &self, - slots: &mut [Slot], + locs: &mut Locations, text: &Self::Text, start: usize, ) -> Option<(usize, usize)>; @@ -75,8 +151,8 @@ pub trait RegularExpression: Sized { fn find_iter<'t>( self, text: &'t Self::Text, - ) -> FindMatches<'t, Self> { - FindMatches { + ) -> Matches<'t, Self> { + Matches { re: self, text: text, last_end: 0, @@ -89,20 +165,20 @@ pub trait RegularExpression: Sized { fn captures_iter<'t>( self, text: &'t Self::Text, - ) -> FindCaptures<'t, Self> { - FindCaptures(self.find_iter(text)) + ) -> CaptureMatches<'t, Self> { + CaptureMatches(self.find_iter(text)) } } /// An iterator over all non-overlapping successive leftmost-first matches. -pub struct FindMatches<'t, R> where R: RegularExpression, R::Text: 't { +pub struct Matches<'t, R> where R: RegularExpression, R::Text: 't { re: R, text: &'t R::Text, last_end: usize, last_match: Option, } -impl<'t, R> FindMatches<'t, R> where R: RegularExpression, R::Text: 't { +impl<'t, R> Matches<'t, R> where R: RegularExpression, R::Text: 't { /// Return the text being searched. pub fn text(&self) -> &'t R::Text { self.text @@ -114,7 +190,7 @@ impl<'t, R> FindMatches<'t, R> where R: RegularExpression, R::Text: 't { } } -impl<'t, R> Iterator for FindMatches<'t, R> +impl<'t, R> Iterator for Matches<'t, R> where R: RegularExpression, R::Text: 't + AsRef<[u8]> { type Item = (usize, usize); @@ -146,10 +222,10 @@ impl<'t, R> Iterator for FindMatches<'t, R> /// An iterator over all non-overlapping successive leftmost-first matches with /// captures. -pub struct FindCaptures<'t, R>(FindMatches<'t, R>) +pub struct CaptureMatches<'t, R>(Matches<'t, R>) where R: RegularExpression, R::Text: 't; -impl<'t, R> FindCaptures<'t, R> where R: RegularExpression, R::Text: 't { +impl<'t, R> CaptureMatches<'t, R> where R: RegularExpression, R::Text: 't { /// Return the text being searched. pub fn text(&self) -> &'t R::Text { self.0.text() @@ -161,17 +237,17 @@ impl<'t, R> FindCaptures<'t, R> where R: RegularExpression, R::Text: 't { } } -impl<'t, R> Iterator for FindCaptures<'t, R> +impl<'t, R> Iterator for CaptureMatches<'t, R> where R: RegularExpression, R::Text: 't + AsRef<[u8]> { - type Item = Vec; + type Item = Locations; - fn next(&mut self) -> Option> { + fn next(&mut self) -> Option { if self.0.last_end > self.0.text.as_ref().len() { return None } - let mut slots = vec![None; self.0.re.slots_len()]; + let mut locs = self.0.re.locations(); let (s, e) = match self.0.re.read_captures_at( - &mut slots, + &mut locs, self.0.text, self.0.last_end, ) { @@ -187,6 +263,6 @@ impl<'t, R> Iterator for FindCaptures<'t, R> self.0.last_end = e; } self.0.last_match = Some(e); - Some(slots) + Some(locs) } } diff --git a/src/re_unicode.rs b/src/re_unicode.rs index ed3c6b5bde..4cc55235ed 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -15,31 +15,62 @@ use std::ops::Index; use std::str::FromStr; use std::sync::Arc; +use memchr::memchr; use syntax; use error::Error; use exec::{Exec, ExecNoSyncStr}; +use expand::expand_str; use re_builder::unicode::RegexBuilder; use re_plugin::Plugin; -use re_trait::{self, RegularExpression, Slot}; +use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter}; /// Escapes all regular expression meta characters in `text`. /// /// The string returned may be safely used as a literal in a regular /// expression. 
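Since `quote` is renamed to `escape` above, here is a brief sketch of the intended use: the escaped text can be spliced into a larger pattern and matches the original string literally (the input string is arbitrary).

```rust
extern crate regex;

use regex::Regex;

fn main() {
    let literal = "1.5 * (2+3)?";

    // Every meta character is escaped, so this compiles and matches the
    // literal text rather than being interpreted as regex syntax.
    let re = Regex::new(&regex::escape(literal)).unwrap();
    assert!(re.is_match("x = 1.5 * (2+3)?"));
    assert!(!re.is_match("x = 105 + 2233"));
}
```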
-pub fn quote(text: &str) -> String { - syntax::quote(text) +pub fn escape(text: &str) -> String { + syntax::escape(text) } -/// Tests if the given regular expression matches somewhere in the text given. +/// Match represents a single match of a regex in a haystack. /// -/// If there was a problem compiling the regular expression, an error is -/// returned. -/// -/// To find submatches, split or replace text, you'll need to compile an -/// expression first. -pub fn is_match(regex: &str, text: &str) -> Result { - Regex::new(regex).map(|r| r.is_match(text)) +/// The lifetime parameter `'t` refers to the lifetime of the matched text. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct Match<'t> { + text: &'t str, + start: usize, + end: usize, +} + +impl<'t> Match<'t> { + /// Returns the starting byte offset of the match in the haystack. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the haystack. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns the matched text. + #[inline] + pub fn as_str(&self) -> &'t str { + &self.text[self.start..self.end] + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> { + Match { + text: haystack, + start: start, + end: end, + } + } } /// A compiled regular expression for matching Unicode strings. @@ -70,18 +101,19 @@ pub fn is_match(regex: &str, text: &str) -> Result { /// ```rust /// # use regex::Regex; /// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap(); -/// assert_eq!(re.find("phone: 111-222-3333"), Some((7, 19))); +/// let mat = re.find("phone: 111-222-3333").unwrap(); +/// assert_eq!((mat.start(), mat.end()), (7, 19)); /// ``` /// -/// # Using the `std::str::StrExt` methods with `Regex` +/// # Using the `std::str::pattern` methods with `Regex` /// -/// > **Note**: This section requires that this crate is currently compiled with -/// > the `pattern` Cargo feature enabled. +/// > **Note**: This section requires that this crate is compiled with the +/// > `pattern` Cargo feature enabled, which **requires nightly Rust**. /// /// Since `Regex` implements `Pattern`, you can use regexes with methods -/// defined on `std::str::StrExt`. For example, `is_match`, `find`, `find_iter` -/// and `split` can be replaced with `StrExt::contains`, `StrExt::find`, -/// `StrExt::match_indices` and `StrExt::split`. +/// defined on `&str`. For example, `is_match`, `find`, `find_iter` +/// and `split` can be replaced with `str::contains`, `str::find`, +/// `str::match_indices` and `str::split`. /// /// Here are some examples: /// @@ -105,7 +137,7 @@ pub enum _Regex { // The representation of `Regex` is exported to support the `regex!` // syntax extension. Do not rely on it. // - // See the comments for the `program` module in `lib.rs` for a more + // See the comments for the `internal` module in `lib.rs` for a more // detailed explanation for what `regex!` requires. #[doc(hidden)] Dynamic(Exec), @@ -134,18 +166,6 @@ impl From for Regex { } } -/// Equality comparison is based on the original string. It is possible that -/// different regular expressions have the same matching behavior, but are -/// still compared unequal. For example, `\d+` and `\d\d*` match the same set -/// of strings, but are not considered equal. 
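A short sketch of the new `Match` type defined above and the corresponding change to `find`, which now returns `Match` instead of `(usize, usize)` (the phone-number pattern is illustrative only):

```rust
extern crate regex;

use regex::Regex;

fn main() {
    let re = Regex::new(r"[0-9]{3}-[0-9]{4}").unwrap();
    let text = "call 555-1234 today";

    // `find` yields byte offsets plus the matched text itself.
    let m = re.find(text).unwrap();
    assert_eq!(m.start(), 5);
    assert_eq!(m.end(), 13);
    assert_eq!(m.as_str(), "555-1234");

    // The free `is_match` function is gone; compile a `Regex` and use its
    // `is_match` method instead.
    assert!(re.is_match(text));
}
```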
-impl PartialEq for Regex { - fn eq(&self, other: &Regex) -> bool { - self.as_str() == other.as_str() - } -} - -impl Eq for Regex {} - impl FromStr for Regex { type Err = Error; @@ -155,22 +175,14 @@ impl FromStr for Regex { } } +/// Core regular expression methods. impl Regex { /// Compiles a regular expression. Once compiled, it can be used repeatedly /// to search, split or replace text in a string. /// /// If an invalid expression is given, then an error is returned. pub fn new(re: &str) -> Result { - Regex::with_size_limit(10 * (1 << 20), re) - } - - /// Compiles a regular expression with the given size limit. - /// - /// The size limit is applied to the size of the *compiled* data structure. - /// If the data structure exceeds the size given, then an error is - /// returned. - pub fn with_size_limit(size: usize, re: &str) -> Result { - RegexBuilder::new(re).size_limit(size).compile() + RegexBuilder::new(re).build() } /// Returns true if and only if the regex matches the string given. @@ -195,17 +207,6 @@ impl Regex { self.is_match_at(text, 0) } - /// Returns the same as is_match, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn is_match_at(&self, text: &str, start: usize) -> bool { - self.shortest_match_at(text, start).is_some() - } - /// Returns the start and end byte range of the leftmost-first match in /// `text`. If no match exists, then `None` is returned. /// @@ -222,30 +223,15 @@ impl Regex { /// # extern crate regex; use regex::Regex; /// # fn main() { /// let text = "I categorically deny having triskaidekaphobia."; - /// let pos = Regex::new(r"\b\w{13}\b").unwrap().find(text); - /// assert_eq!(pos, Some((2, 15))); + /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); + /// assert_eq!(mat.start(), 2); + /// assert_eq!(mat.end(), 15); /// # } /// ``` - pub fn find(&self, text: &str) -> Option<(usize, usize)> { + pub fn find<'t>(&self, text: &'t str) -> Option> { self.find_at(text, 0) } - /// Returns the same as find, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().find_at(text, start) - } - _Regex::Plugin(ref plug) => plug.find_at(text, start), - } - } - /// Returns an iterator for each successive non-overlapping match in /// `text`, returning the start and end byte indices with respect to /// `text`. 
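With `Regex::with_size_limit` removed above, the size limit moves to the builder. This sketch assumes the 0.2 `RegexBuilder::size_limit` method (not shown in this hunk) alongside the `build` method that `Regex::new` now delegates to.

```rust
extern crate regex;

use regex::RegexBuilder;

fn main() {
    // An absurdly small limit makes compilation fail, mirroring what
    // `with_size_limit` used to control.
    let too_big = RegexBuilder::new(r"\w{100}")
        .size_limit(10)
        .build();
    assert!(too_big.is_err());

    // The default limit is plenty for ordinary patterns.
    let re = RegexBuilder::new(r"\w+").build().unwrap();
    assert!(re.is_match("hello"));
}
```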
@@ -259,25 +245,20 @@ impl Regex { /// # extern crate regex; use regex::Regex; /// # fn main() { /// let text = "Retroactively relinquishing remunerations is reprehensible."; - /// for pos in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { - /// println!("{:?}", pos); + /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { + /// println!("{:?}", mat); /// } - /// // Output: - /// // (0, 13) - /// // (14, 27) - /// // (28, 41) - /// // (45, 58) /// # } /// ``` - pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { + pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> { match self.0 { _Regex::Dynamic(ref exec) => { let it = exec.searcher_str().find_iter(text); - FindMatches(FindMatchesInner::Dynamic(it)) + Matches(MatchesInner::Dynamic(it)) } _Regex::Plugin(ref plug) => { let it = plug.find_iter(text); - FindMatches(FindMatchesInner::Plugin(it)) + Matches(MatchesInner::Plugin(it)) } } } @@ -286,9 +267,9 @@ impl Regex { /// match in `text`. Capture group `0` always corresponds to the entire /// match. If no match is found, then `None` is returned. /// - /// You should only use `captures` if you need access to submatches. - /// Otherwise, `find` is faster for discovering the location of the overall - /// match. + /// You should only use `captures` if you need access to the location of + /// capturing group matches. Otherwise, `find` is faster for discovering + /// the location of the overall match. /// /// # Examples /// @@ -303,9 +284,9 @@ impl Regex { /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.at(1), Some("Citizen Kane")); - /// assert_eq!(caps.at(2), Some("1941")); - /// assert_eq!(caps.at(0), Some("'Citizen Kane' (1941)")); + /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane"); + /// assert_eq!(caps.get(2).unwrap().as_str(), "1941"); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); /// // You can also access the groups by index using the Index notation. /// // Note that this will panic on an invalid index. /// assert_eq!(&caps[1], "Citizen Kane"); @@ -326,9 +307,9 @@ impl Regex { /// .unwrap(); /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.name("title"), Some("Citizen Kane")); - /// assert_eq!(caps.name("year"), Some("1941")); - /// assert_eq!(caps.at(0), Some("'Citizen Kane' (1941)")); + /// assert_eq!(&caps["title"], "Citizen Kane"); + /// assert_eq!(&caps["year"], "1941"); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); /// // You can also access the groups by name using the Index notation. /// // Note that this will panic on an invalid group name. /// assert_eq!(&caps["title"], "Citizen Kane"); @@ -346,40 +327,17 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `at(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t str) -> Option> { - let mut slots = vec![None; 2 * self.captures_len()]; - self.read_captures_at(&mut slots, text, 0).map(|_| Captures { + let mut locs = self.locations(); + self.read_captures_at(&mut locs, text, 0).map(|_| Captures { text: text, - slots: slots, + locs: locs, named_groups: NamedGroups::from_regex(self) }) } - /// Returns the same as captures, but starts the search at the given - /// offset and populates the capture locations given. 
- /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn read_captures_at( - &self, - slots: &mut [Slot], - text: &str, - start: usize, - ) -> Option<(usize, usize)> { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().read_captures_at(slots, text, start) - } - _Regex::Plugin(ref plug) => { - plug.read_captures_at(slots, text, start) - } - } - } - /// Returns an iterator over all the non-overlapping capture groups matched /// in `text`. This is operationally the same as `find_iter`, except it - /// yields information about submatches. + /// yields information about capturing group matches. /// /// # Example /// @@ -394,7 +352,7 @@ impl Regex { /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; /// for caps in re.captures_iter(text) { /// println!("Movie: {:?}, Released: {:?}", - /// caps.name("title"), caps.name("year")); + /// &caps["title"], &caps["year"]); /// } /// // Output: /// // Movie: Citizen Kane, Released: 1941 @@ -405,15 +363,15 @@ impl Regex { pub fn captures_iter<'r, 't>( &'r self, text: &'t str, - ) -> FindCaptures<'r, 't> { + ) -> CaptureMatches<'r, 't> { match self.0 { _Regex::Dynamic(ref exec) => { let it = exec.searcher_str().captures_iter(text); - FindCaptures(FindCapturesInner::Dynamic(it)) + CaptureMatches(CaptureMatchesInner::Dynamic(it)) } _Regex::Plugin(ref plug) => { let it = plug.captures_iter(text); - FindCaptures(FindCapturesInner::Plugin(it)) + CaptureMatches(CaptureMatchesInner::Plugin(it)) } } } @@ -436,8 +394,8 @@ impl Regex { /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); /// # } /// ``` - pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> { - RegexSplits { + pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> { + Split { finder: self.find_iter(text), last: 0, } @@ -464,8 +422,8 @@ impl Regex { /// # } /// ``` pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize) - -> RegexSplitsN<'r, 't> { - RegexSplitsN { + -> SplitN<'r, 't> { + SplitN { splits: self.split(text), n: limit, } @@ -478,6 +436,25 @@ impl Regex { /// /// If no match is found, then a copy of the string is returned unchanged. /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text is replaced with the + /// corresponding capture group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// /// # Examples /// /// Note that this function is polymorphic with respect to the replacement. @@ -494,14 +471,14 @@ impl Regex { /// But anything satisfying the `Replacer` trait will work. For example, /// a closure of type `|&Captures| -> String` provides direct access to the /// captures corresponding to a match. 
This allows one to access - /// submatches easily: + /// capturing group matches easily: /// /// ```rust /// # extern crate regex; use regex::Regex; /// # use regex::Captures; fn main() { /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| { - /// format!("{} {}", caps.at(2).unwrap_or(""), caps.at(1).unwrap_or("")) + /// format!("{} {}", &caps[2], &caps[1]) /// }); /// assert_eq!(result, "Bruce Springsteen"); /// # } @@ -525,8 +502,8 @@ impl Regex { /// would produce the same result. To write a literal `$` use `$$`. /// /// Finally, sometimes you just want to replace a literal string with no - /// submatch expansion. This can be done by wrapping a string with - /// `NoExpand`: + /// regard for capturing group expansion. This can be done by wrapping a + /// byte string with `NoExpand`: /// /// ```rust /// # extern crate regex; use regex::Regex; @@ -538,7 +515,11 @@ impl Regex { /// assert_eq!(result, "$2 $last"); /// # } /// ``` - pub fn replace(&self, text: &str, rep: R) -> String { + pub fn replace<'t, R: Replacer>( + &self, + text: &'t str, + rep: R, + ) -> Cow<'t, str> { self.replacen(text, 1, rep) } @@ -547,8 +528,12 @@ impl Regex { /// `0`. /// /// See the documentation for `replace` for details on how to access - /// submatches in the replacement string. - pub fn replace_all(&self, text: &str, rep: R) -> String { + /// capturing group matches in the replacement string. + pub fn replace_all<'t, R: Replacer>( + &self, + text: &'t str, + rep: R, + ) -> Cow<'t, str> { self.replacen(text, 0, rep) } @@ -557,14 +542,13 @@ impl Regex { /// are replaced. /// /// See the documentation for `replace` for details on how to access - /// submatches in the replacement string. - pub fn replacen( + /// capturing group matches in the replacement string. + pub fn replacen<'t, R: Replacer>( &self, - text: &str, + text: &'t str, limit: usize, mut rep: R, - ) -> String { - + ) -> Cow<'t, str> { // If we know that the replacement doesn't have any capture expansions, // then we can fast path. The fast path can make a tremendous // difference: @@ -574,39 +558,49 @@ impl Regex { // 2) We don't need to look up all of the capture groups and do // replacements inside the replacement string. We just push it // at each match and be done with it. - if let Some(rep) = rep.no_expand() { + if let Some(rep) = rep.no_expansion() { let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (i, (s, e)) in self.find_iter(text).enumerate() { + for (i, m) in self.find_iter(text).enumerate() { if limit > 0 && i >= limit { break } - new.push_str(&text[last_match..s]); + new.push_str(&text[last_match..m.start()]); new.push_str(&rep); - last_match = e; + last_match = m.end(); + } + if new.is_empty() { + return Cow::Borrowed(text); } new.push_str(&text[last_match..]); - return new; + return Cow::Owned(new); } // The slower path, which we use if the replacement needs access to // capture groups. 
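Putting the pieces above together, a sketch of the string-oriented `replace_all`: `$name` expansion for `&str` replacements, a closure replacer for full control, and the new `Cow` return that avoids allocating when nothing matches (the names and haystacks are made up).

```rust
extern crate regex;

use std::borrow::Cow;
use regex::{Captures, Regex};

fn main() {
    let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();

    // `$first`/`$last` are expanded from the named capture groups.
    let swapped = re.replace_all("Springsteen, Bruce", "$first $last");
    assert_eq!(swapped, "Bruce Springsteen");

    // A closure gets direct access to the `Captures` for each match.
    let shouted = re.replace_all("Springsteen, Bruce", |caps: &Captures| {
        format!("{} {}", &caps["first"], caps["last"].to_uppercase())
    });
    assert_eq!(shouted, "Bruce SPRINGSTEEN");

    // With no match the input is returned untouched as `Cow::Borrowed`.
    match re.replace_all("no match here", "$first $last") {
        Cow::Borrowed(s) => assert_eq!(s, "no match here"),
        Cow::Owned(_) => unreachable!("nothing was replaced"),
    }
}
```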
+ let mut it = self.captures_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (i, cap) in self.captures_iter(text).enumerate() { + for (i, cap) in it { if limit > 0 && i >= limit { break } // unwrap on 0 is OK because captures only reports matches - let (s, e) = cap.pos(0).unwrap(); - new.push_str(&text[last_match..s]); - new.push_str(&rep.reg_replace(&cap)); - last_match = e; + let m = cap.get(0).unwrap(); + new.push_str(&text[last_match..m.start()]); + rep.replace_append(&cap, &mut new); + last_match = m.end(); } new.push_str(&text[last_match..]); - new + Cow::Owned(new) } +} +/// Advanced or "lower level" search methods. +impl Regex { /// Returns the end location of a match in the text given. /// /// This method may have the same performance characteristics as @@ -652,6 +646,69 @@ impl Regex { } } + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn is_match_at(&self, text: &str, start: usize) -> bool { + self.shortest_match_at(text, start).is_some() + } + + /// Returns the same as find, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn find_at<'t>( + &self, + text: &'t str, + start: usize, + ) -> Option> { + match self.0 { + _Regex::Dynamic(ref exec) => { + exec.searcher_str().find_at(text, start).map(|(s, e)| { + Match::new(text, s, e) + }) + } + _Regex::Plugin(ref plug) => { + plug.find_at(text, start).map(|(s, e)| Match::new(text, s, e)) + } + } + } + + /// Returns the same as captures, but starts the search at the given + /// offset and populates the capture locations given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn read_captures_at<'t>( + &self, + locs: &mut Locations, + text: &'t str, + start: usize, + ) -> Option> { + match self.0 { + _Regex::Dynamic(ref exec) => { + exec.searcher_str().read_captures_at(locs, text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + _Regex::Plugin(ref plug) => { + plug.read_captures_at(locs, text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + } + } +} + +/// Auxiliary methods. +impl Regex { /// Returns the original string of this regex. pub fn as_str(&self) -> &str { match self.0 { @@ -677,6 +734,18 @@ impl Regex { _Regex::Dynamic(ref d) => d.capture_names().len() } } + + /// Returns an empty set of locations that can be reused in multiple calls + /// to `read_captures`. + #[doc(hidden)] + pub fn locations(&self) -> Locations { + match self.0 { + _Regex::Dynamic(ref exec) => { + exec.searcher_str().locations() + } + _Regex::Plugin(ref plug) => plug.locations(), + } + } } /// An iterator over the names of all possible captures. 
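To round out the auxiliary methods above, a sketch of inspecting a regex's groups; `captures_len` appears in this hunk, while `capture_names` (yielding `Option<&str>` per group) is assumed from the surrounding API rather than shown here.

```rust
extern crate regex;

use regex::Regex;

fn main() {
    let re = Regex::new(r"(?P<y>\d{4})-(\d{2})-(?P<d>\d{2})").unwrap();

    // Group 0 and unnamed groups yield `None`; named groups yield their name.
    let names: Vec<Option<&str>> = re.capture_names().collect();
    assert_eq!(names, vec![None, Some("y"), None, Some("d")]);

    // Every group is counted, including the implicit group 0.
    assert_eq!(re.captures_len(), 4);
}
```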
@@ -688,9 +757,7 @@ impl Regex { pub struct CaptureNames<'r>(_CaptureNames<'r>); enum _CaptureNames<'r> { - #[doc(hidden)] Plugin(::std::slice::Iter<'r, Option<&'static str>>), - #[doc(hidden)] Dynamic(::std::slice::Iter<'r, Option>) } @@ -714,68 +781,16 @@ impl<'r> Iterator for CaptureNames<'r> { } } -/// NoExpand indicates literal string replacement. -/// -/// It can be used with `replace` and `replace_all` to do a literal -/// string replacement without expanding `$name` to their corresponding -/// capture groups. -/// -/// `'t` is the lifetime of the literal text. -pub struct NoExpand<'t>(pub &'t str); - -/// Replacer describes types that can be used to replace matches in a string. -pub trait Replacer { - /// Returns a possibly owned string that is used to replace the match - /// corresponding to the `caps` capture group. - /// - /// The `'a` lifetime refers to the lifetime of a borrowed string when - /// a new owned string isn't needed (e.g., for `NoExpand`). - fn reg_replace(&mut self, caps: &Captures) -> Cow; - - /// Returns a possibly owned string that never needs expansion. - fn no_expand(&mut self) -> Option> { None } -} - -impl<'t> Replacer for NoExpand<'t> { - fn reg_replace(&mut self, _: &Captures) -> Cow { - self.0.into() - } - - fn no_expand(&mut self) -> Option> { - Some(self.0.into()) - } -} - -impl<'t> Replacer for &'t str { - fn reg_replace<'a>(&'a mut self, caps: &Captures) -> Cow<'a, str> { - caps.expand(*self).into() - } - - fn no_expand(&mut self) -> Option> { - // if there is a $ there may be an expansion - match self.find('$') { - Some(_) => None, - None => Some((*self).into()), - } - } -} - -impl Replacer for F where F: FnMut(&Captures) -> String { - fn reg_replace<'a>(&'a mut self, caps: &Captures) -> Cow<'a, str> { - (*self)(caps).into() - } -} - /// Yields all substrings delimited by a regular expression match. /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the string being split. -pub struct RegexSplits<'r, 't> { - finder: FindMatches<'r, 't>, +pub struct Split<'r, 't> { + finder: Matches<'r, 't>, last: usize, } -impl<'r, 't> Iterator for RegexSplits<'r, 't> { +impl<'r, 't> Iterator for Split<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { @@ -790,9 +805,9 @@ impl<'r, 't> Iterator for RegexSplits<'r, 't> { Some(s) } } - Some((s, e)) => { - let matched = &text[self.last..s]; - self.last = e; + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.end(); Some(matched) } } @@ -805,12 +820,12 @@ impl<'r, 't> Iterator for RegexSplits<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the string being split. -pub struct RegexSplitsN<'r, 't> { - splits: RegexSplits<'r, 't>, +pub struct SplitN<'r, 't> { + splits: Split<'r, 't>, n: usize, } -impl<'r, 't> Iterator for RegexSplitsN<'r, 't> { +impl<'r, 't> Iterator for SplitN<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { @@ -893,109 +908,63 @@ impl<'n> Iterator for NamedGroupsIter<'n> { /// `'t` is the lifetime of the matched text. pub struct Captures<'t> { text: &'t str, - slots: Vec>, + locs: Locations, named_groups: NamedGroups, } impl<'t> Captures<'t> { - /// Returns the start and end positions of the Nth capture group. Returns - /// `None` if `i` is not a valid capture group or if the capture group did - /// not match anything. The positions returned are *always* byte indices - /// with respect to the original string matched. 
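The split iterators above were only renamed (`RegexSplits` to `Split`, `RegexSplitsN` to `SplitN`), so usage is unchanged; a brief sketch with an invented delimiter pattern:

```rust
extern crate regex;

use regex::Regex;

fn main() {
    let re = Regex::new(r"[\s,]+").unwrap();

    let fields: Vec<&str> = re.split("a, b,  c d").collect();
    assert_eq!(fields, vec!["a", "b", "c", "d"]);

    // `splitn` caps the number of pieces; the final piece is the unsplit rest.
    let first_two: Vec<&str> = re.splitn("a, b,  c d", 2).collect();
    assert_eq!(first_two, vec!["a", "b,  c d"]);
}
```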
- pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - let (s, e) = (i * 2, i * 2 + 1); - match (self.slots.get(s), self.slots.get(e)) { - (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), - _ => None, - } - } - - /// Returns the matched string for the capture group `i`. If `i` isn't - /// a valid capture group or didn't match anything, then `None` is - /// returned. - pub fn at(&self, i: usize) -> Option<&'t str> { - match self.pos(i) { - None => None, - Some((s, e)) => Some(&self.text[s..e]) - } + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. + pub fn get(&self, i: usize) -> Option> { + self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) } - /// Returns the matched string for the capture group named `name`. If - /// `name` isn't a valid capture group or didn't match anything, then - /// `None` is returned. - pub fn name(&self, name: &str) -> Option<&'t str> { - self.named_groups.pos(name).and_then(|i| self.at(i)) + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. + pub fn name(&self, name: &str) -> Option> { + self.named_groups.pos(name).and_then(|i| self.get(i)) } - /// Creates an iterator of all the capture groups in order of appearance - /// in the regular expression. - pub fn iter(&'t self) -> SubCaptures<'t> { - SubCaptures { idx: 0, caps: self, } - } - - /// Creates an iterator of all the capture group positions in order of - /// appearance in the regular expression. Positions are byte indices - /// in terms of the original string matched. - pub fn iter_pos(&'t self) -> SubCapturesPos<'t> { - SubCapturesPos { idx: 0, slots: &self.slots } - } - - /// Creates an iterator of all named groups as an tuple with the group - /// name and the value. The iterator returns these values in arbitrary - /// order. - pub fn iter_named(&'t self) -> SubCapturesNamed<'t> { - SubCapturesNamed { + /// An iterator that yields all capturing matches in the order in which + /// they appear in the regex. If a particular capture group didn't + /// participate in the match, then `None` is yielded for that capture. + /// + /// The first match always corresponds to the overall match of the regex. + pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> { + SubCaptureMatches { caps: self, - names: self.named_groups.iter() + it: self.locs.iter(), } } /// Expands all instances of `$name` in `text` to the corresponding capture - /// group `name`. + /// group `name`, and writes them to the `dst` buffer given. /// /// `name` may be an integer corresponding to the index of the /// capture group (counted by order of opening parenthesis where `0` is the /// entire match) or it can be a name (consisting of letters, digits or /// underscores) corresponding to a named capture group. /// - /// If `name` isn't a valid capture group (whether the name doesn't exist or - /// isn't a valid index), then it is replaced with the empty string. + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. /// /// To write a literal `$` use `$$`. 
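A sketch of the reworked `Captures::expand` above, which now appends into a caller-supplied buffer instead of returning a fresh `String` (the pattern and replacement text are illustrative):

```rust
extern crate regex;

use regex::Regex;

fn main() {
    let re = Regex::new(r"(?P<first>\w+)\s+(?P<last>\w+)").unwrap();
    let caps = re.captures("Bruce Springsteen").unwrap();

    // `$last` uses the longest possible name; `${first}` uses braces for
    // precise control, exactly as documented above.
    let mut dst = String::new();
    caps.expand("$last, ${first}!", &mut dst);
    assert_eq!(dst, "Springsteen, Bruce!");
}
```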
- pub fn expand(&self, text: &str) -> String { - const REPLACE_EXPAND: &'static str = r"(?x) - (?P^|\b|[^$]) # Ignore `$$name`. - \$ - (?P # Match the actual capture name. Can be... - [0-9]+ # A sequence of digits (for indexed captures), or... - | - [_a-zA-Z][_0-9a-zA-Z]* # A name for named captures. - ) - "; - // How evil can you get? - let re = Regex::new(REPLACE_EXPAND).unwrap(); - let text = re.replace_all(text, |refs: &Captures| -> String { - let before = refs.name("before").unwrap_or(""); - let name = refs.name("name").unwrap_or(""); - format!("{}{}", before, match name.parse::() { - Err(_) => self.name(name).unwrap_or("").to_owned(), - Ok(i) => self.at(i).unwrap_or("").to_owned(), - }) - }); - let re = Regex::new(r"\$\$").unwrap(); - re.replace_all(&text, NoExpand("$")) + pub fn expand(&self, replacement: &str, dst: &mut String) { + expand_str(self, replacement, dst) } /// Returns the number of captured groups. + /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. #[inline] pub fn len(&self) -> usize { - self.slots.len() / 2 - } - - /// Returns true if and only if there are no captured groups. - #[inline] - pub fn is_empty(&self) -> bool { - self.len() == 0 + self.locs.len() } } @@ -1014,7 +983,7 @@ impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { let slot_to_name: HashMap = self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); let mut map = f.debug_map(); - for (slot, m) in self.0.iter_pos().enumerate() { + for (slot, m) in self.0.locs.iter().enumerate() { let m = m.map(|(s, e)| &self.0.text[s..e]); if let Some(ref name) = slot_to_name.get(&slot) { map.entry(&name, &m); @@ -1041,7 +1010,8 @@ impl<'t> Index for Captures<'t> { type Output = str; fn index(&self, i: usize) -> &str { - self.at(i).unwrap_or_else(|| panic!("no group at index '{}'", i)) + self.get(i).map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) } } @@ -1061,74 +1031,31 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { type Output = str; fn index<'a>(&'a self, name: &'i str) -> &'a str { - self.name(name).unwrap_or_else(|| panic!("no group named '{}'", name)) - } -} - -/// An iterator over capture groups for a particular match of a regular -/// expression. -/// -/// `'c` is the lifetime of the captures. -pub struct SubCaptures<'c> { - idx: usize, - caps: &'c Captures<'c>, -} - -impl<'c> Iterator for SubCaptures<'c> { - type Item = Option<&'c str>; - - fn next(&mut self) -> Option> { - if self.idx < self.caps.len() { - self.idx += 1; - Some(self.caps.at(self.idx - 1)) - } else { - None - } + self.name(name).map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) } } -/// An iterator over capture group positions for a particular match of a -/// regular expression. +/// An iterator that yields all capturing matches in the order in which they +/// appear in the regex. /// -/// Positions are byte indices in terms of the original string matched. +/// If a particular capture group didn't participate in the match, then `None` +/// is yielded for that capture. The first match always corresponds to the +/// overall match of the regex. /// -/// `'c` is the lifetime of the captures. -pub struct SubCapturesPos<'c> { - idx: usize, - slots: &'c [Option] +/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and +/// the lifetime `'t` corresponds to the originally matched text. 
+pub struct SubCaptureMatches<'c, 't: 'c> { + caps: &'c Captures<'t>, + it: SubCapturesPosIter<'c>, } -impl<'c> Iterator for SubCapturesPos<'c> { - type Item = Option<(usize, usize)>; +impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { + type Item = Option>; - fn next(&mut self) -> Option> { - if self.idx >= self.slots.len() { - return None - } - let r = match (self.slots[self.idx], self.slots[self.idx + 1]) { - (Some(s), Some(e)) => Some((s, e)), - (None, None) => None, - _ => unreachable!() - }; - self.idx += 2; - Some(r) - } -} - -/// An Iterator over named capture groups as a tuple with the group -/// name and the value. -/// -/// `'c` is the lifetime of the captures. -pub struct SubCapturesNamed<'c> { - caps: &'c Captures<'c>, - names: NamedGroupsIter<'c>, -} - -impl<'c> Iterator for SubCapturesNamed<'c> { - type Item = (&'c str, Option<&'c str>); - - fn next(&mut self) -> Option<(&'c str, Option<&'c str>)> { - self.names.next().map(|(name, pos)| (name, self.caps.at(pos))) + fn next(&mut self) -> Option>> { + self.it.next() + .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e))) } } @@ -1139,30 +1066,30 @@ impl<'c> Iterator for SubCapturesNamed<'c> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string. -pub struct FindCaptures<'r, 't>(FindCapturesInner<'r, 't>); +pub struct CaptureMatches<'r, 't>(CaptureMatchesInner<'r, 't>); -enum FindCapturesInner<'r, 't> { - Dynamic(re_trait::FindCaptures<'t, ExecNoSyncStr<'r>>), - Plugin(re_trait::FindCaptures<'t, Plugin>), +enum CaptureMatchesInner<'r, 't> { + Dynamic(re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>), + Plugin(re_trait::CaptureMatches<'t, Plugin>), } -impl<'r, 't> Iterator for FindCaptures<'r, 't> { +impl<'r, 't> Iterator for CaptureMatches<'r, 't> { type Item = Captures<'t>; fn next(&mut self) -> Option> { match self.0 { - FindCapturesInner::Dynamic(ref mut it) => { + CaptureMatchesInner::Dynamic(ref mut it) => { let named = it.regex().capture_name_idx().clone(); - it.next().map(|slots| Captures { + it.next().map(|locs| Captures { text: it.text(), - slots: slots, + locs: locs, named_groups: NamedGroups::Dynamic(named), }) } - FindCapturesInner::Plugin(ref mut it) => { - it.next().map(|slots| Captures { + CaptureMatchesInner::Plugin(ref mut it) => { + it.next().map(|locs| Captures { text: it.text(), - slots: slots, + locs: locs, named_groups: NamedGroups::Plugin(it.regex().groups), }) } @@ -1172,35 +1099,105 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { /// An iterator over all non-overlapping matches for a particular string. /// -/// The iterator yields a tuple of integers corresponding to the start and end -/// of the match. The indices are byte offsets. The iterator stops when no more +/// The iterator yields a `Match` value. The iterator stops when no more /// matches can be found. /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string. 
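A sketch of the single remaining capture iterator, `SubCaptureMatches`, obtained from `Captures::iter` above: it yields one `Option<Match>` per group, starting with the overall match, and `None` for groups that did not participate.

```rust
extern crate regex;

use regex::Regex;

fn main() {
    let re = Regex::new(r"(\w+)(?:\s+(\w+))?").unwrap();
    let caps = re.captures("hello").unwrap();

    let texts: Vec<Option<&str>> = caps
        .iter()
        .map(|m| m.map(|m| m.as_str()))
        .collect();

    // Group 0, group 1, then the optional group 2 which didn't match.
    assert_eq!(texts, vec![Some("hello"), Some("hello"), None]);
}
```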
-pub struct FindMatches<'r, 't>(FindMatchesInner<'r, 't>); +pub struct Matches<'r, 't>(MatchesInner<'r, 't>); -enum FindMatchesInner<'r, 't> { - Dynamic(re_trait::FindMatches<'t, ExecNoSyncStr<'r>>), - Plugin(re_trait::FindMatches<'t, Plugin>), +enum MatchesInner<'r, 't> { + Dynamic(re_trait::Matches<'t, ExecNoSyncStr<'r>>), + Plugin(re_trait::Matches<'t, Plugin>), } -impl<'r, 't> FindMatches<'r, 't> { +impl<'r, 't> Matches<'r, 't> { fn text(&self) -> &'t str { match self.0 { - FindMatchesInner::Dynamic(ref it) => it.text(), - FindMatchesInner::Plugin(ref it) => it.text(), + MatchesInner::Dynamic(ref it) => it.text(), + MatchesInner::Plugin(ref it) => it.text(), } } } -impl<'r, 't> Iterator for FindMatches<'r, 't> { - type Item = (usize, usize); +impl<'r, 't> Iterator for Matches<'r, 't> { + type Item = Match<'t>; - fn next(&mut self) -> Option<(usize, usize)> { + fn next(&mut self) -> Option> { + let text = self.text(); match self.0 { - FindMatchesInner::Dynamic(ref mut it) => it.next(), - FindMatchesInner::Plugin(ref mut it) => it.next(), + MatchesInner::Dynamic(ref mut it) => { + it.next().map(|(s, e)| Match::new(text, s, e)) + } + MatchesInner::Plugin(ref mut it) => { + it.next().map(|(s, e)| Match::new(text, s, e)) + } + } + } +} + +/// Replacer describes types that can be used to replace matches in a string. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&str` and +/// `FnMut(&Captures) -> String`, which covers most use cases. +pub trait Replacer { + /// Appends text to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to + /// have a match at capture group `0`. + /// + /// For example, a no-op replacement would be + /// `dst.extend(caps.get(0).unwrap().as_str())`. + fn replace_append(&mut self, caps: &Captures, dst: &mut String); + + /// Return a fixed unchanging replacement string. + /// + /// When doing replacements, if access to `Captures` is not needed (e.g., + /// the replacement byte string does not need `$` expansion), then it can + /// be beneficial to avoid finding sub-captures. + /// + /// In general, this is called once for every call to `replacen`. + fn no_expansion<'r>(&'r mut self) -> Option> { + None + } +} + +impl<'a> Replacer for &'a str { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { + caps.expand(*self, dst); + } + + fn no_expansion<'r>(&'r mut self) -> Option> { + match memchr(b'$', self.as_bytes()) { + Some(_) => None, + None => Some(Cow::Borrowed(*self)), } } } + +impl Replacer for F where F: FnMut(&Captures) -> String { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { + dst.push_str(&(*self)(caps)); + } +} + +/// NoExpand indicates literal string replacement. +/// +/// It can be used with `replace` and `replace_all` to do a literal string +/// replacement without expanding `$name` to their corresponding capture +/// groups. This can be both convenient (to avoid escaping `$`, for example) +/// and performant (since capture groups don't need to be found). +/// +/// `'t` is the lifetime of the literal text. 
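For code that implements the overhauled `Replacer` trait above directly, a minimal sketch using the new `replace_append`/`no_expansion` methods; the `Redact` type is invented for this example and is not part of the crate.

```rust
extern crate regex;

use std::borrow::Cow;
use regex::{Captures, Regex, Replacer};

struct Redact;

impl Replacer for Redact {
    fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
        // Append one asterisk per byte of the overall match.
        for _ in 0..caps.get(0).unwrap().as_str().len() {
            dst.push('*');
        }
    }

    fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> {
        // The replacement depends on each match, so there is no fixed string
        // and the fast path cannot be used.
        None
    }
}

fn main() {
    let re = Regex::new(r"\d{3}-\d{4}").unwrap();
    let out = re.replace_all("call 555-1234 or 555-9876", Redact);
    assert_eq!(out, "call ******** or ********");
}
```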
+pub struct NoExpand<'r>(pub &'r str); + +impl<'a> Replacer for NoExpand<'a> { + fn replace_append(&mut self, _: &Captures, dst: &mut String) { + dst.push_str(self.0); + } + + fn no_expansion<'r>(&'r mut self) -> Option> { + Some(Cow::Borrowed(self.0)) + } +} diff --git a/tests/api.rs b/tests/api.rs index 0be032949a..e17afa9332 100644 --- a/tests/api.rs +++ b/tests/api.rs @@ -40,7 +40,7 @@ fn quoted_bracket_set() { #[test] fn first_range_starts_with_left_bracket() { - let re = regex!(r"([[-z])"); + let re = regex!(r"([\[-z])"); assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]")); } @@ -60,7 +60,8 @@ fn empty_match_find_iter() { fn empty_match_captures_iter() { let re = regex!(r".*?"); let ms: Vec<_> = re.captures_iter(text!("abc")) - .map(|c| c.pos(0).unwrap()) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) .collect(); assert_eq!(ms, vec![(0, 0), (1, 1), (2, 2), (3, 3)]); } @@ -127,96 +128,35 @@ fn capture_misc() { assert_eq!(5, cap.len()); - assert_eq!(Some((0, 3)), cap.pos(0)); - assert_eq!(None, cap.pos(2)); - assert_eq!(Some((2, 3)), cap.pos(4)); + assert_eq!((0, 3), { let m = cap.get(0).unwrap(); (m.start(), m.end()) }); + assert_eq!(None, cap.get(2)); + assert_eq!((2, 3), { let m = cap.get(4).unwrap(); (m.start(), m.end()) }); - assert_eq!(Some(t!("abc")), cap.at(0)); - assert_eq!(None, cap.at(2)); - assert_eq!(Some(t!("c")), cap.at(4)); + assert_eq!(t!("abc"), match_text!(cap.get(0).unwrap())); + assert_eq!(None, cap.get(2)); + assert_eq!(t!("c"), match_text!(cap.get(4).unwrap())); assert_eq!(None, cap.name("a")); - assert_eq!(Some(t!("c")), cap.name("b")); + assert_eq!(t!("c"), match_text!(cap.name("b").unwrap())); } #[test] -fn capture_iter() { - let re = regex!(r"(.)(?P
 
 #[test]
-fn capture_iter() {
-    let re = regex!(r"(.)(?P<a>.)(.)(?P<b>.)");
-    let cap = re.captures(t!("abcd")).unwrap();
-    assert_eq!(5, cap.len());
-
-    let expected = vec![
-        t!("abcd"), t!("a"), t!("b"), t!("c"), t!("d"),
-    ].into_iter().map(Some).collect::<Vec<_>>();
-    let got = cap.iter().collect::<Vec<_>>();
-    assert_eq!(expected, got);
-}
-
-#[test]
-fn capture_iter_missing() {
-    let re = regex!(r"(.)(?P<a>a)?(.)(?P<b>.)");
-    let cap = re.captures(t!("abc")).unwrap();
-    assert_eq!(5, cap.len());
-
-    let expected = vec![
-        Some(t!("abc")), Some(t!("a")), None, Some(t!("b")), Some(t!("c")),
-    ];
-    let got = cap.iter().collect::<Vec<_>>();
-    assert_eq!(expected, got);
-}
-
-#[test]
-fn capture_iter_pos() {
-    let re = regex!(r"(.)(?P<a>.)(.)(?P<b>.)");
-    let cap = re.captures(t!("abcd")).unwrap();
-
-    let expected = vec![
-        (0, 4), (0, 1), (1, 2), (2, 3), (3, 4),
-    ].into_iter().map(Some).collect::<Vec<_>>();
-    let got = cap.iter_pos().collect::<Vec<_>>();
-    assert_eq!(expected, got);
-}
-
-#[test]
-fn capture_iter_pos_missing() {
-    let re = regex!(r"(.)(?P<a>a)?(.)(?P<b>.)");
-    let cap = re.captures(t!("abc")).unwrap();
-
-    let expected = vec![
-        Some((0, 3)), Some((0, 1)), None, Some((1, 2)), Some((2, 3)),
-    ];
-    let got = cap.iter_pos().collect::<Vec<_>>();
-    assert_eq!(expected, got);
-}
+fn sub_capture_matches() {
+    let re = regex!(r"([a-z])(([a-z])|([0-9]))");
+    let cap = re.captures(t!("a5")).unwrap();
+    let subs: Vec<_> = cap.iter().collect();
 
-#[test]
-fn capture_iter_named() {
-    let re = regex!(r"(.)(?P<a>.)(.)(?P<b>.)");
-    let cap = re.captures(t!("abcd")).unwrap();
-
-    let expected1 = vec![
-        ("a", Some(t!("b"))), ("b", Some(t!("d"))),
-    ];
-    let expected2 = vec![
-        ("b", Some(t!("d"))), ("a", Some(t!("b"))),
-    ];
-    let got = cap.iter_named().collect::<Vec<_>>();
-    assert!(got == expected1 || got == expected2);
-}
-
-#[test]
-fn capture_iter_named_missing() {
-    let re = regex!(r"(.)(?P<a>.)?(.)(?P<b>.)");
-    let cap = re.captures(t!("abc")).unwrap();
+    assert_eq!(5, subs.len());
+    assert!(subs[0].is_some());
+    assert!(subs[1].is_some());
+    assert!(subs[2].is_some());
+    assert!(subs[3].is_none());
+    assert!(subs[4].is_some());
 
-    let expected1 = vec![
-        ("a", None), ("b", Some(t!("c"))),
-    ];
-    let expected2 = vec![
-        ("b", Some(t!("c"))), ("a", None),
-    ];
-    let got = cap.iter_named().collect::<Vec<_>>();
-    assert!(got == expected1 || got == expected2);
+    assert_eq!(t!("a5"), match_text!(subs[0].unwrap()));
+    assert_eq!(t!("a"), match_text!(subs[1].unwrap()));
+    assert_eq!(t!("5"), match_text!(subs[2].unwrap()));
+    assert_eq!(t!("5"), match_text!(subs[4].unwrap()));
 }
 
 expand!(expand1, r"(?P<foo>\w+)", "abc", "$foo", "abc");
diff --git a/tests/api_str.rs b/tests/api_str.rs
index 266b6455b2..5bdca8426a 100644
--- a/tests/api_str.rs
+++ b/tests/api_str.rs
@@ -5,7 +5,7 @@ fn empty_match_unicode_find_iter() {
     // Tests that we still yield byte ranges at valid UTF-8 sequence boundaries
     // even when we're susceptible to empty width matches.
-    let re = regex!(u!(r".*?"));
+    let re = regex!(r".*?");
     assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)],
                findall!(re, "Ⅰ1Ⅱ2"));
 }
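The new `sub_capture_matches` test relies on `Captures::iter` yielding one `Option<Match>` per group; a standalone sketch of the same idea, reusing that test's pattern and input:

```rust
extern crate regex;

use regex::Regex;

fn main() {
    let re = Regex::new(r"([a-z])(([a-z])|([0-9]))").unwrap();
    let caps = re.captures("a5").unwrap();

    // One iterator over all groups; non-participating groups are `None`.
    let texts: Vec<Option<&str>> = caps.iter()
        .map(|m| m.map(|m| m.as_str()))
        .collect();
    assert_eq!(vec![Some("a5"), Some("a"), Some("5"), None, Some("5")], texts);
}
```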
@@ -13,15 +13,10 @@ fn empty_match_unicode_find_iter() {
 #[test]
 fn empty_match_unicode_captures_iter() {
     // Same as empty_match_unicode_find_iter, but tests capture iteration.
-    let re = regex!(u!(r".*?"));
+    let re = regex!(r".*?");
     let ms: Vec<_> = re.captures_iter(text!("Ⅰ1Ⅱ2"))
-        .map(|c| c.pos(0).unwrap())
+        .map(|c| c.get(0).unwrap())
+        .map(|m| (m.start(), m.end()))
        .collect();
     assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], ms);
 }
-
-#[test]
-fn eq() {
-    use regex::Regex;
-    assert_eq!(regex!(r"[a-z]+"), Regex::new("[a-z]+").unwrap());
-}
diff --git a/tests/bytes.rs b/tests/bytes.rs
index c950688fae..9d472aa518 100644
--- a/tests/bytes.rs
+++ b/tests/bytes.rs
@@ -5,57 +5,58 @@ struct R<'a>(&'a [u8]);
 
 impl<'a> R<'a> {
     fn as_bytes(&self) -> &'a [u8] { &self.0 }
 }
 
-mat!(word_boundary, r" \b", " δ", None);
-mat!(word_boundary_unicode, r"(?u) \b", " δ", Some((0, 1)));
-mat!(word_not_boundary, r" \B", " δ", Some((0, 1)));
-mat!(word_not_boundary_unicode, r"(?u) \B", " δ", None);
-
-mat!(perl_w_ascii, r"\w+", "aδ", Some((0, 1)));
-mat!(perl_w_unicode, r"(?u)\w+", "aδ", Some((0, 3)));
-mat!(perl_d_ascii, r"\d+", "1२३9", Some((0, 1)));
-mat!(perl_d_unicode, r"(?u)\d+", "1२३9", Some((0, 8)));
-mat!(perl_s_ascii, r"\s+", " \u{1680}", Some((0, 1)));
-mat!(perl_s_unicode, r"(?u)\s+", " \u{1680}", Some((0, 4)));
+mat!(word_boundary, r"(?-u) \b", " δ", None);
+mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1)));
+mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1)));
+mat!(word_not_boundary_unicode, r" \B", " δ", None);
+
+mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1)));
+mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3)));
+mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1)));
+mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8)));
+mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1)));
+mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4)));
 
 // The first `(.+)` matches two Unicode codepoints, but can't match the 5th
 // byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
 // matches.
-mat!(mixed1, r"(?u)(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"),
+mat!(mixed1, r"(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"),
      Some((0, 5)), Some((0, 4)), Some((4, 5)));
 
-mat!(case_ascii_one, r"(?i)a", "A", Some((0, 1)));
-mat!(case_ascii_class, r"(?i)[a-z]+", "AaAaA", Some((0, 5)));
-mat!(case_unicode, r"(?iu)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
-mat!(case_not_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
+mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1)));
+mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5)));
+mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
+mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
 
-mat!(negate_unicode, r"(?u)[^a]", "δ", Some((0, 2)));
-mat!(negate_not_unicode, r"[^a]", "δ", Some((0, 1)));
+mat!(negate_unicode, r"[^a]", "δ", Some((0, 2)));
+mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1)));
 
 // This doesn't match in a normal Unicode regex because the implicit preceding
 // `.*?` is Unicode aware.
-mat!(dotstar_prefix_not_unicode, r"a", R(b"\xFFa"), Some((1, 2)));
+mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2)));
+mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2)));
 
 // Have fun with null bytes.
-mat!(null_bytes, r"(?P<cstr>[^\x00]+)\x00",
+mat!(null_bytes, r"(?-u)(?P<cstr>[^\x00]+)\x00",
      R(b"foo\x00"), Some((0, 4)), Some((0, 3)));
 
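A standalone sketch of the flag change these `bytes.rs` tests encode: `bytes::Regex` is Unicode-aware by default now, and `(?-u)` restores the old byte-oriented behavior (the `\w+` case and input are taken from the tests above):

```rust
extern crate regex;

use regex::bytes::Regex;

fn main() {
    let hay = "aδ".as_bytes();

    // Unicode is on by default, so `\w+` also covers the two bytes of `δ`.
    let m = Regex::new(r"\w+").unwrap().find(hay).unwrap();
    assert_eq!((0, 3), (m.start(), m.end()));

    // `(?-u)` recovers the old ASCII-only matching of `bytes::Regex::new`.
    let m = Regex::new(r"(?-u)\w+").unwrap().find(hay).unwrap();
    assert_eq!((0, 1), (m.start(), m.end()));
}
```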
 // Test that lookahead operators work properly in the face of invalid UTF-8.
-// See: https://github.com/rust-lang-nursery/regex/issues/277
+// See: https://github.com/rust-lang/regex/issues/277
 matiter!(invalidutf8_anchor1,
-         r"\xcc?^",
+         r"(?-u)\xcc?^",
          R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
          (0, 0));
 matiter!(invalidutf8_anchor2,
-         r"^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] #####\x80\S7|$",
+         r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$",
          R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
          (22, 22));
 matiter!(invalidutf8_anchor3,
-         r"^|ddp\xff\xffdddddlQd@\x80",
+         r"(?-u)^|ddp\xff\xffdddddlQd@\x80",
          R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
          (0, 0));
 
-// See https://github.com/rust-lang-nursery/regex/issues/303
+// See https://github.com/rust-lang/regex/issues/303
 #[test]
 fn negated_full_byte_range() {
-    assert!(::regex::bytes::Regex::new(r#"[^\x00-\xff]"#).is_err());
+    assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err());
 }
diff --git a/tests/crazy.rs b/tests/crazy.rs
index bed66277e5..ade839ade1 100644
--- a/tests/crazy.rs
+++ b/tests/crazy.rs
@@ -1,4 +1,4 @@
-mat!(ascii_literal, u!(r"a"), "a", Some((0, 1)));
+mat!(ascii_literal, r"a", "a", Some((0, 1)));
 
 // Some crazy expressions from regular-expressions.info.
 mat!(match_ranges,
diff --git a/tests/macros.rs b/tests/macros.rs
index 34627cf260..ba9cd9b4a5 100644
--- a/tests/macros.rs
+++ b/tests/macros.rs
@@ -2,7 +2,8 @@
 
 macro_rules! findall {
     ($re:expr, $text:expr) => {{
-        $re.find_iter(text!($text)).collect::<Vec<_>>()
+        $re.find_iter(text!($text))
+           .map(|m| (m.start(), m.end())).collect::<Vec<_>>()
     }}
 }
 
@@ -30,7 +31,10 @@ macro_rules! mat(
             Some(c) => {
                 assert!(r.is_match(text));
                 assert!(r.shortest_match(text).is_some());
-                c.iter_pos().collect()
+                r.capture_names()
+                 .enumerate()
+                 .map(|(i, _)| c.get(i).map(|m| (m.start(), m.end())))
+                 .collect()
             }
             None => vec![None],
         };
@@ -56,14 +60,18 @@ macro_rules! matiter(
         let text = text!($text);
         let expected: Vec<(usize, usize)> = vec![];
        let r = regex!($re);
-        let got: Vec<_> = r.find_iter(text).collect();
+        let got: Vec<_> =
+            r.find_iter(text).map(|m| (m.start(), m.end())).collect();
        if expected != got {
            panic!("For RE '{}' against '{:?}', \
                    expected '{:?}' but got '{:?}'",
                   $re, text, expected, got);
        }
        let captures_got: Vec<_> =
-            r.captures_iter(text).map(|c| c.pos(0).unwrap()).collect();
+            r.captures_iter(text)
+             .map(|c| c.get(0).unwrap())
+             .map(|m| (m.start(), m.end()))
+             .collect();
        if captures_got != got {
            panic!("For RE '{}' against '{:?}', \
                    got '{:?}' using find_iter but got '{:?}' \
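The tests above search haystacks that are not valid UTF-8; a small sketch of the same capability in isolation, reusing the null-byte pattern from this file (group name as reconstructed in the test):

```rust
extern crate regex;

use regex::bytes::Regex;

fn main() {
    // With `(?-u)`, `[^\x00]` ranges over arbitrary bytes, so a `bytes::Regex`
    // can search haystacks that need not be valid UTF-8.
    let re = Regex::new(r"(?-u)(?P<cstr>[^\x00]+)\x00").unwrap();
    let hay: &[u8] = b"foo\x00";

    let caps = re.captures(hay).unwrap();
    let name = caps.name("cstr").unwrap();
    assert_eq!((0, 3), (name.start(), name.end()));
}
```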
@@ -78,14 +86,18 @@ macro_rules! matiter(
         let text = text!($text);
         let expected: Vec<_> = vec![$($loc)+];
        let r = regex!($re);
-        let got: Vec<_> = r.find_iter(text).collect();
+        let got: Vec<_> =
+            r.find_iter(text).map(|m| (m.start(), m.end())).collect();
        if expected != got {
            panic!("For RE '{}' against '{:?}', \
                    expected '{:?}' but got '{:?}'",
                   $re, text, expected, got);
        }
        let captures_got: Vec<_> =
-            r.captures_iter(text).map(|c| c.pos(0).unwrap()).collect();
+            r.captures_iter(text)
+             .map(|c| c.get(0).unwrap())
+             .map(|m| (m.start(), m.end()))
+             .collect();
        if captures_got != got {
            panic!("For RE '{}' against '{:?}', \
                    got '{:?}' using find_iter but got '{:?}' \
diff --git a/tests/macros_bytes.rs b/tests/macros_bytes.rs
index a68fada744..4a382c78dd 100644
--- a/tests/macros_bytes.rs
+++ b/tests/macros_bytes.rs
@@ -1,11 +1,12 @@
 // Macros for use in writing tests generic over &str/&[u8].
 
 macro_rules! text { ($text:expr) => { $text.as_bytes() } }
 macro_rules! t { ($re:expr) => { text!($re) } }
+macro_rules! match_text { ($text:expr) => { $text.as_bytes() } }
 macro_rules! bytes { ($text:expr) => { $text } }
 macro_rules! b { ($text:expr) => { bytes!($text) } }
-macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } }
+// macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } }
 
 macro_rules! no_expand {
     ($text:expr) => {{
@@ -25,9 +26,6 @@ macro_rules! show {
     }}
 }
 
-// N.B. The expansion API for &str and &[u8] APIs differs slightly for now,
-// but they should be unified in 1.0. Then we can move this macro back into
-// tests/api.rs where it is used. ---AG
 macro_rules! expand {
     ($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => {
         #[test]
diff --git a/tests/macros_str.rs b/tests/macros_str.rs
index 7ea29335de..e5b0e219da 100644
--- a/tests/macros_str.rs
+++ b/tests/macros_str.rs
@@ -1,11 +1,12 @@
 // Macros for use in writing tests generic over &str/&[u8].
 
 macro_rules! text { ($text:expr) => { $text } }
 macro_rules! t { ($text:expr) => { text!($text) } }
+macro_rules! match_text { ($text:expr) => { $text.as_str() } }
 macro_rules! bytes { ($text:expr) => { $text.as_bytes() } }
 macro_rules! b { ($text:expr) => { bytes!($text) } }
-macro_rules! u { ($re:expr) => { $re } }
+// macro_rules! u { ($re:expr) => { $re } }
 
 macro_rules! no_expand {
     ($text:expr) => {{
@@ -26,8 +27,14 @@ macro_rules! expand {
             let re = regex!($re);
             let cap = re.captures(t!($text)).unwrap();
-            let got = cap.expand(t!($expand));
+            let mut got = String::new();
+            cap.expand(t!($expand), &mut got);
             assert_eq!(show!(t!($expected)), show!(&*got));
         }
     }
 }
+
+#[cfg(feature = "pattern")]
+macro_rules! searcher_expr { ($e:expr) => ($e) }
+#[cfg(not(feature = "pattern"))]
+macro_rules! searcher_expr { ($e:expr) => ({}) }
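The `expand!` macro change above reflects the new `Captures::expand` signature, which appends into a caller-supplied buffer instead of returning a new string; a sketch with an invented date pattern:

```rust
extern crate regex;

use regex::Regex;

fn main() {
    // Hypothetical pattern and input, purely for illustration.
    let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap();
    let caps = re.captures("2016-10-15").unwrap();

    // `expand` pushes onto `dst`, so one buffer can be reused across
    // many replacements.
    let mut dst = String::new();
    caps.expand("$m/$d/$y", &mut dst);
    assert_eq!("10/15/2016", dst);
}
```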
diff --git a/tests/misc.rs b/tests/misc.rs
index 293cddb322..dfe28c9707 100644
--- a/tests/misc.rs
+++ b/tests/misc.rs
@@ -8,14 +8,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-use regex::Regex;
-
 mat!(prefix_literal_match, r"^abc", r"abc", Some((0, 3)));
 mat!(prefix_literal_nomatch, r"^abc", r"zabc", None);
 mat!(one_literal_edge, r"abc", r"xxxxxab", None);
 matiter!(terminates, r"a$", r"a", (0, 1));
-
-#[test]
-fn eq() {
-    assert_eq!(regex!(r"[a-z]+"), Regex::new("[a-z]+").unwrap());
-}
diff --git a/tests/regression.rs b/tests/regression.rs
index 05717ea1bc..3d42df8608 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -23,27 +23,27 @@ mat!(regression_negated_char_class_1, r"(?i)[^x]", "x", None);
 mat!(regression_negated_char_class_2, r"(?i)[^x]", "X", None);
 
 // See: https://github.com/rust-lang/regex/issues/101
-mat!(regression_ascii_word_underscore, r"[:word:]", "_", Some((0, 1)));
+mat!(regression_ascii_word_underscore, r"[[:word:]]", "_", Some((0, 1)));
 
-// See: https://github.com/rust-lang-nursery/regex/issues/129
+// See: https://github.com/rust-lang/regex/issues/129
 #[test]
 fn regression_captures_rep() {
     let re = regex!(r"([a-f]){2}(?P<foo>[x-z])");
     let caps = re.captures(text!("abx")).unwrap();
-    assert_eq!(caps.name("foo").unwrap(), text!("x"));
+    assert_eq!(match_text!(caps.name("foo").unwrap()), text!("x"));
 }
 
-// See: https://github.com/rust-lang-nursery/regex/issues/153
+// See: https://github.com/rust-lang/regex/issues/153
 mat!(regression_alt_in_alt1, r"ab?|$", "az", Some((0, 1)));
 mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3)));
 
-// See: https://github.com/rust-lang-nursery/regex/issues/169
+// See: https://github.com/rust-lang/regex/issues/169
 mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3)));
 
 // See: https://github.com/rust-lang/regex/issues/76
-mat!(uni_case_lower_nocase_flag, u!(r"(?i)\p{Ll}+"), "ΛΘΓΔα", Some((0, 10)));
+mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10)));
 
-// See: https://github.com/rust-lang-nursery/regex/issues/191
+// See: https://github.com/rust-lang/regex/issues/191
 mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", Some((0, 3)));
 
 // burntsushi was bad and didn't create an issue for this bug.
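Reusing the pattern and input from `regression_captures_rep` above, the shape of the new `name` accessor in plain code:

```rust
extern crate regex;

use regex::Regex;

fn main() {
    let re = Regex::new(r"([a-f]){2}(?P<foo>[x-z])").unwrap();
    let caps = re.captures("abx").unwrap();

    // `name` returns an `Option<Match>`; the text is reached via `as_str`.
    assert_eq!("x", caps.name("foo").unwrap().as_str());
}
```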
@@ -51,34 +51,34 @@ mat!(anchored_prefix1, r"^a\S", "a ", None);
 mat!(anchored_prefix2, r"^a\S", "foo boo a ", None);
 mat!(anchored_prefix3, r"^-[a-z]", "r-f", None);
 
-// See: https://github.com/rust-lang-nursery/regex/issues/204
+// See: https://github.com/rust-lang/regex/issues/204
 split!(split_on_word_boundary, r"\b", r"Should this (work?)",
        &[t!(""), t!("Should"), t!(" "), t!("this"), t!(" ("), t!("work"),
         t!("?)")]);
 matiter!(word_boundary_dfa, r"\b", "a b c",
         (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
 
-// See: https://github.com/rust-lang-nursery/regex/issues/268
-matiter!(partial_anchor, u!(r"^a|b"), "ba", (0, 1));
+// See: https://github.com/rust-lang/regex/issues/268
+matiter!(partial_anchor, r"^a|b", "ba", (0, 1));
 
-// See: https://github.com/rust-lang-nursery/regex/issues/264
-mat!(ascii_boundary_no_capture, u!(r"(?-u)\B"), "\u{28f3e}", Some((0, 0)));
-mat!(ascii_boundary_capture, u!(r"(?-u)(\B)"), "\u{28f3e}", Some((0, 0)));
+// See: https://github.com/rust-lang/regex/issues/264
+mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0)));
+mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0)));
 
-// See: https://github.com/rust-lang-nursery/regex/issues/280
-ismatch!(partial_anchor_alternate_begin, u!(r"^a|z"), "yyyyya", false);
-ismatch!(partial_anchor_alternate_end, u!(r"a$|z"), "ayyyyy", false);
+// See: https://github.com/rust-lang/regex/issues/280
+ismatch!(partial_anchor_alternate_begin, r"^a|z", "yyyyya", false);
+ismatch!(partial_anchor_alternate_end, r"a$|z", "ayyyyy", false);
 
-// See: https://github.com/rust-lang-nursery/regex/issues/289
-mat!(lits_unambiguous1, u!(r"(ABC|CDA|BC)X"), "CDAX", Some((0, 4)));
+// See: https://github.com/rust-lang/regex/issues/289
+mat!(lits_unambiguous1, r"(ABC|CDA|BC)X", "CDAX", Some((0, 4)));
 
-// See: https://github.com/rust-lang-nursery/regex/issues/291
-mat!(lits_unambiguous2, u!(r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$"),
+// See: https://github.com/rust-lang/regex/issues/291
+mat!(lits_unambiguous2, r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$",
     "CIMG2341",
     Some((0, 8)), Some((0, 4)), None, Some((0, 4)), Some((4, 8)));
 
-// See: https://github.com/rust-lang-nursery/regex/issues/271
-mat!(end_not_wb, u!(r"$(?-u:\B)"), "\u{5c124}\u{b576c}", Some((8, 8)));
-mat!(endl_or_wb, u!(r"(?m:$)|(?-u:\b)"), "\u{6084e}", Some((4, 4)));
-mat!(zero_or_end, u!(r"(?i-u:\x00)|$"), "\u{e682f}", Some((4, 4)));
-mat!(y_or_endl, u!(r"(?i-u:y)|(?m:$)"), "\u{b4331}", Some((4, 4)));
-mat!(wb_start_x, u!(r"(?u:\b)^(?-u:X)"), "X", Some((0, 1)));
+// See: https://github.com/rust-lang/regex/issues/271
+mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8)));
+mat!(endl_or_wb, r"(?m:$)|(?-u:\b)", "\u{6084e}", Some((4, 4)));
+mat!(zero_or_end, r"(?i-u:\x00)|$", "\u{e682f}", Some((4, 4)));
+mat!(y_or_endl, r"(?i-u:y)|(?m:$)", "\u{b4331}", Some((4, 4)));
+mat!(wb_start_x, r"(?u:\b)^(?-u:X)", "X", Some((0, 1)));
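A standalone sketch of the partial-anchor behavior pinned down by the tests above: `^` applies only to its own alternative, nothing more.

```rust
extern crate regex;

use regex::Regex;

fn main() {
    // `^a|b` anchors only the `a` branch, so the unanchored `b` still matches.
    let re = Regex::new(r"^a|b").unwrap();
    let m = re.find("ba").unwrap();
    assert_eq!((0, 1), (m.start(), m.end()));

    // When neither branch can match, the partially anchored regex finds nothing.
    let re = Regex::new(r"^a|z").unwrap();
    assert!(!re.is_match("yyyyya"));
}
```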
diff --git a/tests/set.rs b/tests/set.rs
index c329f2764e..52b1b0dead 100644
--- a/tests/set.rs
+++ b/tests/set.rs
@@ -20,7 +20,7 @@ nomatset!(nset1, &["a", "a"], "b");
 nomatset!(nset2, &["^foo", "bar$"], "bar foo");
 nomatset!(nset3, { let xs: &[&str] = &[]; xs }, "a");
 
-// See: https://github.com/rust-lang-nursery/regex/issues/187
+// See: https://github.com/rust-lang/regex/issues/187
 #[test]
 fn regression_subsequent_matches() {
     let set = regex_set!(&["ab", "b"]);
diff --git a/tests/test_backtrack.rs b/tests/test_backtrack.rs
index 1d8588a1a8..5516c840e7 100644
--- a/tests/test_backtrack.rs
+++ b/tests/test_backtrack.rs
@@ -8,6 +8,8 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+#![cfg_attr(feature = "pattern", feature(pattern))]
+
 extern crate rand;
 extern crate regex;
 
@@ -54,6 +56,7 @@ mod multiline;
 mod noparse;
 mod regression;
 mod replace;
+mod searcher;
 mod set;
 mod suffix_reverse;
 mod unicode;
diff --git a/tests/test_backtrack_bytes.rs b/tests/test_backtrack_bytes.rs
index 57074f1870..4ea60e7d0f 100644
--- a/tests/test_backtrack_bytes.rs
+++ b/tests/test_backtrack_bytes.rs
@@ -16,7 +16,6 @@ macro_rules! regex_new {
         use regex::internal::ExecBuilder;
         ExecBuilder::new($re)
             .bounded_backtracking()
-            .unicode(false)
             .only_utf8(false)
             .build()
             .map(|e| e.into_byte_regex())
@@ -34,7 +33,6 @@ macro_rules! regex_set_new {
         use regex::internal::ExecBuilder;
         ExecBuilder::new_many($re)
             .bounded_backtracking()
-            .unicode(false)
             .only_utf8(false)
             .build()
             .map(|e| e.into_byte_regex_set())
diff --git a/tests/test_backtrack_utf8bytes.rs b/tests/test_backtrack_utf8bytes.rs
index 279210fca4..a170d19324 100644
--- a/tests/test_backtrack_utf8bytes.rs
+++ b/tests/test_backtrack_utf8bytes.rs
@@ -8,6 +8,8 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+#![cfg_attr(feature = "pattern", feature(pattern))]
+
 extern crate rand;
 extern crate regex;
 
@@ -55,6 +57,7 @@ mod multiline;
 mod noparse;
 mod regression;
 mod replace;
+mod searcher;
 mod set;
 mod suffix_reverse;
 mod unicode;
diff --git a/tests/test_default.rs b/tests/test_default.rs
index e873cb0640..e6cf92fa2e 100644
--- a/tests/test_default.rs
+++ b/tests/test_default.rs
@@ -46,11 +46,6 @@ macro_rules! regex_set {
     }
 }
 
-#[cfg(feature = "pattern")]
-macro_rules! searcher_expr { ($e:expr) => ($e) }
-#[cfg(not(feature = "pattern"))]
-macro_rules! searcher_expr { ($e:expr) => ({}) }
-
 // Must come before other module definitions.
 include!("macros_str.rs");
 include!("macros.rs");
diff --git a/tests/test_nfa.rs b/tests/test_nfa.rs
index c8f08a8fc8..8a831c47d3 100644
--- a/tests/test_nfa.rs
+++ b/tests/test_nfa.rs
@@ -8,6 +8,8 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+#![cfg_attr(feature = "pattern", feature(pattern))]
+
 extern crate rand;
 extern crate regex;
 
@@ -50,6 +52,7 @@ mod multiline;
 mod noparse;
 mod regression;
 mod replace;
+mod searcher;
 mod set;
 mod suffix_reverse;
 mod unicode;
diff --git a/tests/test_nfa_bytes.rs b/tests/test_nfa_bytes.rs
index 83eea01a2d..f376cefe1f 100644
--- a/tests/test_nfa_bytes.rs
+++ b/tests/test_nfa_bytes.rs
@@ -1,4 +1,3 @@
-
 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
@@ -17,7 +16,6 @@ macro_rules! regex_new {
         use regex::internal::ExecBuilder;
         ExecBuilder::new($re)
             .nfa()
-            .unicode(false)
             .only_utf8(false)
             .build()
             .map(|e| e.into_byte_regex())
@@ -35,7 +33,6 @@ macro_rules! regex_set_new {
         use regex::internal::ExecBuilder;
         ExecBuilder::new_many($re)
             .nfa()
-            .unicode(false)
             .only_utf8(false)
             .build()
             .map(|e| e.into_byte_regex_set())
diff --git a/tests/test_nfa_utf8bytes.rs b/tests/test_nfa_utf8bytes.rs
index 621a5098a4..5d13685aab 100644
--- a/tests/test_nfa_utf8bytes.rs
+++ b/tests/test_nfa_utf8bytes.rs
@@ -8,6 +8,8 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+#![cfg_attr(feature = "pattern", feature(pattern))]
+
 extern crate rand;
 extern crate regex;
 
@@ -51,6 +53,7 @@ mod multiline;
 mod noparse;
 mod regression;
 mod replace;
+mod searcher;
 mod set;
 mod suffix_reverse;
 mod unicode;
diff --git a/tests/unicode.rs b/tests/unicode.rs
index 5357a18c96..48e9a95aaf 100644
--- a/tests/unicode.rs
+++ b/tests/unicode.rs
@@ -1,31 +1,31 @@
-mat!(uni_literal, u!(r"☃"), "☃", Some((0, 3)));
-mat!(uni_literal_plus, u!(r"☃+"), "☃", Some((0, 3)));
-mat!(uni_literal_casei_plus, u!(r"(?i)☃+"), "☃", Some((0, 3)));
-mat!(uni_class_plus, u!(r"[☃Ⅰ]+"), "☃", Some((0, 3)));
-mat!(uni_one, u!(r"\pN"), "Ⅰ", Some((0, 3)));
-mat!(uni_mixed, u!(r"\pN+"), "Ⅰ1Ⅱ2", Some((0, 8)));
-mat!(uni_not, u!(r"\PN+"), "abⅠ", Some((0, 2)));
-mat!(uni_not_class, u!(r"[\PN]+"), "abⅠ", Some((0, 2)));
-mat!(uni_not_class_neg, u!(r"[^\PN]+"), "abⅠ", Some((2, 5)));
-mat!(uni_case, u!(r"(?i)Δ"), "δ", Some((0, 2)));
-mat!(uni_case_upper, u!(r"\p{Lu}+"), "ΛΘΓΔα", Some((0, 8)));
-mat!(uni_case_upper_nocase_flag, u!(r"(?i)\p{Lu}+"), "ΛΘΓΔα", Some((0, 10)));
-mat!(uni_case_upper_nocase, u!(r"\p{L}+"), "ΛΘΓΔα", Some((0, 10)));
-mat!(uni_case_lower, u!(r"\p{Ll}+"), "ΛΘΓΔα", Some((8, 10)));
+mat!(uni_literal, r"☃", "☃", Some((0, 3)));
+mat!(uni_literal_plus, r"☃+", "☃", Some((0, 3)));
+mat!(uni_literal_casei_plus, r"(?i)☃+", "☃", Some((0, 3)));
+mat!(uni_class_plus, r"[☃Ⅰ]+", "☃", Some((0, 3)));
+mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3)));
+mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8)));
+mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2)));
+mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2)));
+mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5)));
+mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2)));
+mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8)));
+mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)));
+mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)));
+mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)));
 
 // Test the Unicode friendliness of Perl character classes.
-mat!(uni_perl_w, u!(r"\w+"), "dδd", Some((0, 4)));
-mat!(uni_perl_w_not, u!(r"\w+"), "⥡", None);
-mat!(uni_perl_w_neg, u!(r"\W+"), "⥡", Some((0, 3)));
-mat!(uni_perl_d, u!(r"\d+"), "1२३9", Some((0, 8)));
-mat!(uni_perl_d_not, u!(r"\d+"), "Ⅱ", None);
-mat!(uni_perl_d_neg, u!(r"\D+"), "Ⅱ", Some((0, 3)));
-mat!(uni_perl_s, u!(r"\s+"), " ", Some((0, 3)));
-mat!(uni_perl_s_not, u!(r"\s+"), "☃", None);
-mat!(uni_perl_s_neg, u!(r"\S+"), "☃", Some((0, 3)));
+mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)));
+mat!(uni_perl_w_not, r"\w+", "⥡", None);
+mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3)));
+mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)));
+mat!(uni_perl_d_not, r"\d+", "Ⅱ", None);
+mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3)));
+mat!(uni_perl_s, r"\s+", " ", Some((0, 3)));
+mat!(uni_perl_s_not, r"\s+", "☃", None);
+mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3)));
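A standalone sketch of the default-Unicode behavior these `unicode.rs` tests pin down, reusing two of the inputs above (no `u!`/`(?u)` decoration is needed any more):

```rust
extern crate regex;

use regex::Regex;

fn main() {
    // `\w` is Unicode-aware by default, so the 2-byte `δ` is included.
    let m = Regex::new(r"\w+").unwrap().find("dδd").unwrap();
    assert_eq!((0, 4), (m.start(), m.end()));

    // Likewise, `\d` matches Devanagari digits out of the box.
    let m = Regex::new(r"\d+").unwrap().find("1२३9").unwrap();
    assert_eq!((0, 8), (m.start(), m.end()));
}
```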
 // And do the same for word boundaries.
-mat!(uni_boundary_none, u!(r"\d\b"), "6δ", None);
-mat!(uni_boundary_ogham, u!(r"\d\b"), "6 ", Some((0, 1)));
-mat!(uni_not_boundary_none, u!(r"\d\B"), "6δ", Some((0, 1)));
-mat!(uni_not_boundary_ogham, u!(r"\d\B"), "6 ", None);
+mat!(uni_boundary_none, r"\d\b", "6δ", None);
+mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1)));
+mat!(uni_not_boundary_none, r"\d\B", "6δ", Some((0, 1)));
+mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None);
diff --git a/tests/word_boundary_ascii.rs b/tests/word_boundary_ascii.rs
index 9beb7c0cb1..5a3cf1166c 100644
--- a/tests/word_boundary_ascii.rs
+++ b/tests/word_boundary_ascii.rs
@@ -1,9 +1,9 @@
 // ASCII word boundaries are completely oblivious to Unicode characters.
 // For Unicode word boundaries, the tests are precisely inverted.
-matiter!(ascii1, r"\bx\b", "áxβ", (2, 3));
-matiter!(ascii2, r"\Bx\B", "áxβ");
-matiter!(ascii3, r"\B", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
+matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
+matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
+matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
 
-// We can still get Unicode mode in byte regexes.
-matiter!(unicode1, r"(?u:\b)x(?u:\b)", "áxβ");
-matiter!(unicode2, r"(?u:\B)x(?u:\B)", "áxβ", (2, 3));
+// We still get Unicode word boundaries by default in byte regexes.
+matiter!(unicode1, r"\bx\b", "áxβ");
+matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));
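Finally, a sketch of the ASCII versus Unicode word-boundary distinction from the last file, expressed against `bytes::Regex` (the harness these tests run under); pattern and haystack are taken from the tests above:

```rust
extern crate regex;

use regex::bytes::Regex;

fn main() {
    let hay = "áxβ".as_bytes();

    // With Unicode word boundaries (the default), `á` and `β` are word
    // characters, so `x` is not surrounded by boundaries.
    assert!(!Regex::new(r"\bx\b").unwrap().is_match(hay));

    // ASCII boundaries treat the surrounding non-ASCII letters as non-word.
    let m = Regex::new(r"(?-u:\b)x(?-u:\b)").unwrap().find(hay).unwrap();
    assert_eq!((2, 3), (m.start(), m.end()));
}
```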