diff --git a/Cargo.toml b/Cargo.toml index 5054f4c8c3..bd947addf8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,12 +44,6 @@ simd-accel = ["simd"] # There are no benchmarks in the library code itself bench = false -# Runs unit tests defined inside the regex package. -# Generally these tests specific pieces of the regex implementation. -[[test]] -path = "src/lib.rs" -name = "regex-inline" - # Run the test suite on the default behavior of Regex::new. # This includes a mish mash of NFAs and DFAs, which are chosen automatically # based on the regex. We test both of the NFA implementations by forcing their diff --git a/bench/src/bench.rs b/bench/src/bench.rs index 9c8a924746..db9e3a5f61 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -71,7 +71,7 @@ macro_rules! regex { // Always enable the Unicode flag for byte based regexes. // Really, this should have been enabled by default. *sigh* use regex::bytes::RegexBuilder; - RegexBuilder::new(&$re.to_owned()).unicode(true).compile().unwrap() + RegexBuilder::new(&$re.to_owned()).unicode(true).build().unwrap() }} } diff --git a/examples/shootout-regex-dna-bytes.rs b/examples/shootout-regex-dna-bytes.rs index 3b120260c0..ec57157c8e 100644 --- a/examples/shootout-regex-dna-bytes.rs +++ b/examples/shootout-regex-dna-bytes.rs @@ -18,7 +18,7 @@ fn main() { io::stdin().read_to_end(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, &b""[..]); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, &b""[..]).into_owned(); let clen = seq.len(); let seq_arc = Arc::new(seq.clone()); @@ -56,7 +56,7 @@ fn main() { ]; let mut seq = seq; for (re, replacement) in substs.into_iter() { - seq = re.replace_all(&seq, replacement); + seq = re.replace_all(&seq, replacement).into_owned(); } for (variant, count) in counts { diff --git a/examples/shootout-regex-dna-cheat.rs b/examples/shootout-regex-dna-cheat.rs index 57583218ba..a421d20853 100644 --- a/examples/shootout-regex-dna-cheat.rs +++ 
b/examples/shootout-regex-dna-cheat.rs @@ -23,7 +23,7 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); let clen = seq.len(); let seq_arc = Arc::new(seq.clone()); @@ -78,10 +78,10 @@ fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String { let re = regex!(&alternates.join("|")); let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (s, e) in re.find_iter(text) { - new.push_str(&text[last_match..s]); - new.push_str(replacements[text.as_bytes()[s] as usize]); - last_match = e; + for m in re.find_iter(text) { + new.push_str(&text[last_match..m.start()]); + new.push_str(replacements[text.as_bytes()[m.start()] as usize]); + last_match = m.end(); } new.push_str(&text[last_match..]); new diff --git a/examples/shootout-regex-dna-replace.rs b/examples/shootout-regex-dna-replace.rs index a3319ad29d..857d8bfcd7 100644 --- a/examples/shootout-regex-dna-replace.rs +++ b/examples/shootout-regex-dna-replace.rs @@ -14,6 +14,6 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); println!("original: {}, replaced: {}", ilen, seq.len()); } diff --git a/examples/shootout-regex-dna-single-cheat.rs b/examples/shootout-regex-dna-single-cheat.rs index fbf464202f..64d210499d 100644 --- a/examples/shootout-regex-dna-single-cheat.rs +++ b/examples/shootout-regex-dna-single-cheat.rs @@ -16,7 +16,7 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); let clen = seq.len(); let variants = vec![ @@ -63,10 +63,10 @@ fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String { let re = 
regex!(&alternates.join("|")); let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (s, e) in re.find_iter(text) { - new.push_str(&text[last_match..s]); - new.push_str(replacements[text.as_bytes()[s] as usize]); - last_match = e; + for m in re.find_iter(text) { + new.push_str(&text[last_match..m.start()]); + new.push_str(replacements[text.as_bytes()[m.start()] as usize]); + last_match = m.end(); } new.push_str(&text[last_match..]); new diff --git a/examples/shootout-regex-dna-single.rs b/examples/shootout-regex-dna-single.rs index 58eada712f..a84bc63c12 100644 --- a/examples/shootout-regex-dna-single.rs +++ b/examples/shootout-regex-dna-single.rs @@ -16,7 +16,7 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); let clen = seq.len(); let variants = vec![ @@ -49,7 +49,7 @@ fn main() { ]; let mut seq = seq; for (re, replacement) in substs.into_iter() { - seq = re.replace_all(&seq, replacement); + seq = re.replace_all(&seq, replacement).into_owned(); } println!("\n{}\n{}\n{}", ilen, clen, seq.len()); } diff --git a/examples/shootout-regex-dna.rs b/examples/shootout-regex-dna.rs index d66b4fdf06..ec0060d7f4 100644 --- a/examples/shootout-regex-dna.rs +++ b/examples/shootout-regex-dna.rs @@ -18,7 +18,7 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); let clen = seq.len(); let seq_arc = Arc::new(seq.clone()); @@ -56,7 +56,7 @@ fn main() { ]; let mut seq = seq; for (re, replacement) in substs.into_iter() { - seq = re.replace_all(&seq, replacement); + seq = re.replace_all(&seq, replacement).into_owned(); } for (variant, count) in counts { diff --git a/regex-capi/src/rure.rs b/regex-capi/src/rure.rs index 4e2b65924e..832bab6c0d 100644 --- 
a/regex-capi/src/rure.rs +++ b/regex-capi/src/rure.rs @@ -36,7 +36,7 @@ pub struct rure_match { pub end: size_t, } -pub struct Captures(Vec<Option<usize>>); +pub struct Captures(bytes::Locations); pub struct Iter { re: *const Regex, @@ -98,16 +98,16 @@ ffi_fn! { let mut builder = bytes::RegexBuilder::new(pat); if !options.is_null() { let options = unsafe { &*options }; - builder = builder.size_limit(options.size_limit); - builder = builder.dfa_size_limit(options.dfa_size_limit); + builder.size_limit(options.size_limit); + builder.dfa_size_limit(options.dfa_size_limit); } - builder = builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); - builder = builder.multi_line(flags & RURE_FLAG_MULTI > 0); - builder = builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0); - builder = builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0); - builder = builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0); - builder = builder.unicode(flags & RURE_FLAG_UNICODE > 0); - match builder.compile() { + builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); + builder.multi_line(flags & RURE_FLAG_MULTI > 0); + builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0); + builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0); + builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0); + builder.unicode(flags & RURE_FLAG_UNICODE > 0); + match builder.build() { Ok(re) => { let mut capture_names = HashMap::new(); for (i, name) in re.capture_names().enumerate() { @@ -162,10 +162,10 @@ ffi_fn! { ) -> bool { let re = unsafe { &*re }; let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.find_at(haystack, start).map(|(s, e)| unsafe { + re.find_at(haystack, start).map(|m| unsafe { if !match_info.is_null() { - (*match_info).start = s; - (*match_info).end = e; + (*match_info).start = m.start(); + (*match_info).end = m.end(); } }).is_some() } @@ -258,7 +258,7 @@ ffi_fn!
{ } let (s, e) = match re.find_at(text, it.last_end) { None => return false, - Some((s, e)) => (s, e), + Some(m) => (m.start(), m.end()), }; if s == e { // This is an empty match. To ensure we make progress, start @@ -300,7 +300,7 @@ ffi_fn! { } let (s, e) = match re.read_captures_at(slots, text, it.last_end) { None => return false, - Some((s, e)) => (s, e), + Some(m) => (m.start(), m.end()), }; if s == e { // This is an empty match. To ensure we make progress, start @@ -323,7 +323,7 @@ ffi_fn! { ffi_fn! { fn rure_captures_new(re: *const Regex) -> *mut Captures { let re = unsafe { &*re }; - let captures = Captures(vec![None; 2 * re.captures_len()]); + let captures = Captures(re.locations()); Box::into_raw(Box::new(captures)) } } @@ -340,9 +340,9 @@ ffi_fn! { i: size_t, match_info: *mut rure_match, ) -> bool { - let captures = unsafe { &(*captures).0 }; - match (captures[i * 2], captures[i * 2 + 1]) { - (Some(start), Some(end)) => { + let locs = unsafe { &(*captures).0 }; + match locs.pos(i) { + Some((start, end)) => { if !match_info.is_null() { unsafe { (*match_info).start = start; diff --git a/src/error.rs b/src/error.rs index e014a37aba..c95d67acdd 100644 --- a/src/error.rs +++ b/src/error.rs @@ -16,15 +16,10 @@ use syntax; #[derive(Debug)] pub enum Error { /// A syntax error. - Syntax(syntax::Error), + Syntax(String), /// The compiled program exceeded the set size limit. /// The argument is the size limit imposed. CompiledTooBig(usize), - /// **DEPRECATED:** Will be removed on next major version bump. - /// - /// This error is no longer used. (A `RegexSet` can now contain zero or - /// more regular expressions.) - InvalidSet, /// Hints that destructuring should not be exhaustive. 
/// /// This enum may grow additional variants, so this makes sure clients @@ -37,20 +32,14 @@ pub enum Error { impl ::std::error::Error for Error { fn description(&self) -> &str { match *self { - Error::Syntax(ref err) => err.description(), + Error::Syntax(ref err) => err, Error::CompiledTooBig(_) => "compiled program too big", - Error::InvalidSet => { - "sets must contain 2 or more regular expressions" - } Error::__Nonexhaustive => unreachable!(), } } fn cause(&self) -> Option<&::std::error::Error> { - match *self { - Error::Syntax(ref err) => Some(err), - _ => None, - } + None } } @@ -62,9 +51,6 @@ impl fmt::Display for Error { write!(f, "Compiled regex exceeds size limit of {} bytes.", limit) } - Error::InvalidSet => { - write!(f, "Sets must contain 2 or more regular expressions.") - } Error::__Nonexhaustive => unreachable!(), } } @@ -72,6 +58,6 @@ impl fmt::Display for Error { impl From<syntax::Error> for Error { fn from(err: syntax::Error) -> Error { - Error::Syntax(err) + Error::Syntax(err.to_string()) } } diff --git a/src/exec.rs b/src/exec.rs index 5d0541a13c..18df740140 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -27,7 +27,7 @@ use prog::Program; use re_builder::RegexOptions; use re_bytes; use re_set; -use re_trait::{RegularExpression, Slot}; +use re_trait::{RegularExpression, Slot, Locations, as_slots}; use re_unicode; use utf8::next_utf8; @@ -332,11 +332,11 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> { #[inline(always)] // reduces constant overhead fn read_captures_at( &self, - slots: &mut [Slot], + locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)> { - self.0.read_captures_at(slots, text.as_bytes(), start) + self.0.read_captures_at(locs, text.as_bytes(), start) } } @@ -501,10 +501,11 @@ impl<'c> RegularExpression for ExecNoSync<'c> { /// locations of the overall match.
fn read_captures_at( &self, - slots: &mut [Slot], + locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)> { + let slots = as_slots(locs); for slot in slots.iter_mut() { *slot = None; } diff --git a/src/expand.rs b/src/expand.rs index 9bea703881..55873f88bb 100644 --- a/src/expand.rs +++ b/src/expand.rs @@ -2,9 +2,56 @@ use std::str; use memchr::memchr; -use bytes::Captures; +use re_bytes; +use re_unicode; -pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) { +pub fn expand_str( + caps: &re_unicode::Captures, + mut replacement: &str, + dst: &mut String, +) { + while !replacement.is_empty() { + match memchr(b'$', replacement.as_bytes()) { + None => break, + Some(i) => { + dst.push_str(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { + dst.push_str("$"); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push_str("$"); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => { + dst.push_str( + caps.get(i).map(|m| m.as_str()).unwrap_or("")); + } + Ref::Named(name) => { + dst.push_str( + caps.name(name).map(|m| m.as_str()).unwrap_or("")); + } + } + } + dst.push_str(replacement); +} + +pub fn expand_bytes( + caps: &re_bytes::Captures, + mut replacement: &[u8], + dst: &mut Vec<u8>, +) { while !replacement.is_empty() { match memchr(b'$', replacement) { None => break, @@ -27,65 +74,142 @@ pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) { continue; } }; - replacement = cap_ref.rest; + replacement = &replacement[cap_ref.end..]; match cap_ref.cap { - Ref::Number(i) => dst.extend(caps.at(i).unwrap_or(b"")), - Ref::Named(name) => dst.extend(caps.name(name).unwrap_or(b"")), + Ref::Number(i) => { + dst.extend( +
caps.get(i).map(|m| m.as_bytes()).unwrap_or(b"")); + } + Ref::Named(name) => { + dst.extend( + caps.name(name).map(|m| m.as_bytes()).unwrap_or(b"")); + } } } dst.extend(replacement); } +/// CaptureRef represents a reference to a capture group inside some text. The +/// reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text immediately following the +/// capture reference. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] struct CaptureRef<'a> { - rest: &'a [u8], cap: Ref<'a>, + end: usize, } +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] enum Ref<'a> { Named(&'a str), Number(usize), } -fn find_cap_ref(mut replacement: &[u8]) -> Option<CaptureRef> { - if replacement.len() <= 1 || replacement[0] != b'$' { +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From<usize> for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned. +fn find_cap_ref<T: ?Sized + AsRef<[u8]>>( + replacement: &T, +) -> Option<CaptureRef> { + let mut i = 0; + let rep: &[u8] = replacement.as_ref(); + if rep.len() <= 1 || rep[0] != b'$' { return None; } let mut brace = false; - replacement = &replacement[1..]; - if replacement[0] == b'{' { + i += 1; + if rep[i] == b'{' { brace = true; - replacement = &replacement[1..]; + i += 1; } - let mut cap_end = 0; - while replacement.get(cap_end).map_or(false, is_valid_cap_letter) { + let mut cap_end = i; + while rep.get(cap_end).map_or(false, is_valid_cap_letter) { cap_end += 1; } - if cap_end == 0 { + if cap_end == i { return None; } // We just verified that the range 0..cap_end is valid ASCII, so it must // therefore be valid UTF-8.
If we really cared, we could avoid this UTF-8 // check with either unsafe or by parsing the number straight from &[u8]. - let cap = str::from_utf8(&replacement[..cap_end]) + let cap = str::from_utf8(&rep[i..cap_end]) .ok().expect("valid UTF-8 capture name"); if brace { - if !replacement.get(cap_end).map_or(false, |&b| b == b'}') { + if !rep.get(cap_end).map_or(false, |&b| b == b'}') { return None; } cap_end += 1; } Some(CaptureRef { - rest: &replacement[cap_end..], cap: match cap.parse::<u32>() { Ok(i) => Ref::Number(i as usize), Err(_) => Ref::Named(cap), }, + end: cap_end, }) } +/// Returns true if and only if the given byte is allowed in a capture name. fn is_valid_cap_letter(b: &u8) -> bool { match *b { b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' | b'_' => true, _ => false, } } + +#[cfg(test)] +mod tests { + use super::{CaptureRef, find_cap_ref}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text)); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text)); + } + }; + } + + macro_rules! c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); +} diff --git a/src/lib.rs b/src/lib.rs index 95b70e0247..d2d9b18526 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -107,9 +107,7 @@ //! let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); //!
let text = "2012-03-14, 2013-01-01 and 2014-07-05"; //! for cap in re.captures_iter(text) { -//! println!("Month: {} Day: {} Year: {}", -//! cap.at(2).unwrap_or(""), cap.at(3).unwrap_or(""), -//! cap.at(1).unwrap_or("")); +//! println!("Month: {} Day: {} Year: {}", &cap[2], &cap[3], &cap[1]); //! } //! // Output: //! // Month: 03 Day: 14 Year: 2012 @@ -225,7 +223,8 @@ //! # extern crate regex; use regex::Regex; //! # fn main() { //! let re = Regex::new(r"(?i)Δ+").unwrap(); -//! assert_eq!(re.find("ΔδΔ"), Some((0, 6))); +//! let mat = re.find("ΔδΔ").unwrap(); +//! assert_eq!((mat.start(), mat.end()), (0, 6)); //! # } //! ``` //! @@ -237,23 +236,19 @@ //! # extern crate regex; use regex::Regex; //! # fn main() { //! let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap(); -//! assert_eq!(re.find("abcΔᎠβⅠᏴγδⅡxyz"), Some((3, 23))); +//! let mat = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap(); +//! assert_eq!((mat.start(), mat.end()), (3, 23)); //! # } //! ``` //! //! # Opt out of Unicode support //! //! The `bytes` sub-module provides a `Regex` type that can be used to match -//! on `&[u8]`. By default, text is interpreted as ASCII compatible text with -//! all Unicode support disabled (e.g., `.` matches any byte instead of any -//! Unicode codepoint). Unicode support can be selectively enabled with the -//! `u` flag. See the `bytes` module documentation for more details. -//! -//! Unicode support can also be selectively *disabled* with the main `Regex` -//! type that matches on `&str`. For example, `(?-u:\b)` will match an ASCII -//! word boundary. Note though that invalid UTF-8 is not allowed to be matched -//! even when the `u` flag is disabled. For example, `(?-u:.)` will return an -//! error, since `.` matches *any byte* when Unicode support is disabled. +//! on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with +//! the main `Regex` type. However, this behavior can be disabled by turning +//! 
off the `u` flag, even if doing so could result in matching invalid UTF-8. +//! For example, when the `u` flag is disabled, `.` will match any byte instead +//! of any Unicode codepoint. //! //! # Syntax //! @@ -353,7 +348,7 @@ //! # fn main() { //! let re = Regex::new(r"(?i)a+(?-i)b+").unwrap(); //! let cap = re.captures("AaAaAbbBBBb").unwrap(); -//! assert_eq!(cap.at(0), Some("AaAaAbb")); +//! assert_eq!(&cap[0], "AaAaAbb"); //! # } //! ``` //! @@ -368,7 +363,7 @@ //! # fn main() { //! let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap(); //! let cap = re.captures("$$abc$$").unwrap(); -//! assert_eq!(cap.at(0), Some("abc")); +//! assert_eq!(&cap[0], "abc"); //! # } //! ``` //! @@ -465,11 +460,12 @@ extern crate utf8_ranges; pub use error::Error; pub use re_builder::unicode::*; pub use re_set::unicode::*; +pub use re_trait::Locations; pub use re_unicode::{ - Regex, Captures, SubCaptures, SubCapturesPos, SubCapturesNamed, - CaptureNames, FindCaptures, FindMatches, - Replacer, NoExpand, RegexSplits, RegexSplitsN, - quote, is_match, + Regex, Match, Captures, + CaptureNames, Matches, CaptureMatches, + Replacer, NoExpand, Split, SplitN, + quote, }; /** @@ -480,11 +476,8 @@ top-level of this crate. There are two important differences: 1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>` is used where `String` would have been used. -2. Regular expressions are compiled with Unicode support *disabled* by -default. This means that while Unicode regular expressions can only match valid -UTF-8, regular expressions in this module can match arbitrary bytes. Unicode -support can be selectively enabled via the `u` flag in regular expressions -provided by this sub-module. +2. Unicode support can be disabled even when disabling it would result in +matching invalid UTF-8 bytes.
# Example: match null terminated string @@ -492,14 +485,14 @@ This shows how to find all null-terminated strings in a slice of bytes: ```rust # use regex::bytes::Regex; -let re = Regex::new(r"(?P<cstr>[^\x00]+)\x00").unwrap(); +let re = Regex::new(r"(?-u)(?P<cstr>[^\x00]+)\x00").unwrap(); let text = b"foo\x00bar\x00baz\x00"; // Extract all of the strings without the null terminator from each match. // The unwrap is OK here since a match requires the `cstr` capture to match. let cstrs: Vec<&[u8]> = re.captures_iter(text) - .map(|c| c.name("cstr").unwrap()) + .map(|c| c.name("cstr").unwrap().as_bytes()) .collect(); assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs); ``` @@ -512,17 +505,20 @@ string (e.g., to extract a title from a Matroska file): ```rust # use std::str; # use regex::bytes::Regex; -let re = Regex::new(r"\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))").unwrap(); +let re = Regex::new( + r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))" +).unwrap(); let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65"; let caps = re.captures(text).unwrap(); // Notice that despite the `.*` at the end, it will only match valid UTF-8 // because Unicode mode was enabled with the `u` flag. Without the `u` flag, // the `.*` would match the rest of the bytes. -assert_eq!((7, 10), caps.pos(1).unwrap()); +let mat = caps.get(1).unwrap(); +assert_eq!((7, 10), (mat.start(), mat.end())); // If there was a match, Unicode mode guarantees that `title` is valid UTF-8. -let title = str::from_utf8(caps.at(1).unwrap()).unwrap(); +let title = str::from_utf8(&caps[1]).unwrap(); assert_eq!("☃", title); ``` @@ -536,9 +532,9 @@ The supported syntax is pretty much the same as the syntax for Unicode regular expressions with a few changes that make sense for matching arbitrary bytes: -1. The `u` flag is *disabled* by default, but can be selectively enabled. (The -opposite is true for the main `Regex` type.)
Disabling the `u` flag is said to -invoke "ASCII compatible" mode. +1. The `u` flag can be disabled even when disabling it might cause the regex to +match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in +"ASCII compatible" mode. 2. In ASCII compatible mode, neither Unicode codepoints nor Unicode character classes are allowed. 3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) @@ -560,8 +556,9 @@ performance on `&str`. */ pub mod bytes { pub use re_builder::bytes::*; - pub use re_set::bytes::*; pub use re_bytes::*; + pub use re_set::bytes::*; + pub use re_trait::Locations; } mod backtrack; diff --git a/src/pattern.rs b/src/pattern.rs index 3de377ad07..37183c24e3 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -1,17 +1,14 @@ -#[cfg(feature = "pattern")] use std::str::pattern::{Pattern, Searcher, SearchStep}; -use re_unicode::{Regex, FindMatches}; +use re_unicode::{Regex, Matches}; -#[cfg(feature = "pattern")] pub struct RegexSearcher<'r, 't> { haystack: &'t str, - it: FindMatches<'r, 't>, + it: Matches<'r, 't>, last_step_end: usize, next_match: Option<(usize, usize)>, } -#[cfg(feature = "pattern")] impl<'r, 't> Pattern<'t> for &'r Regex { type Searcher = RegexSearcher<'r, 't>; @@ -25,7 +22,6 @@ impl<'r, 't> Pattern<'t> for &'r Regex { } } -#[cfg(feature = "pattern")] unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> { #[inline] fn haystack(&self) -> &'t str { @@ -49,7 +45,8 @@ unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> { SearchStep::Done } } - Some((s, e)) => { + Some(m) => { + let (s, e) = (m.start(), m.end()); if s == self.last_step_end { self.last_step_end = e; SearchStep::Match(s, e) diff --git a/src/re_builder.rs b/src/re_builder.rs index ca030b3ef5..e770dcb7bf 100644 --- a/src/re_builder.rs +++ b/src/re_builder.rs @@ -39,7 +39,7 @@ impl Default for RegexOptions { } macro_rules! 
define_builder { - ($name:ident, $regex_mod:ident, $unicode:expr, $only_utf8:expr) => { + ($name:ident, $regex_mod:ident, $only_utf8:expr) => { pub mod $name { use error::Error; use exec::ExecBuilder; @@ -62,7 +62,6 @@ impl RegexBuilder { pub fn new(pattern: &str) -> RegexBuilder { let mut builder = RegexBuilder(RegexOptions::default()); builder.0.pats.push(pattern.to_owned()); - builder.0.unicode = $unicode; builder } @@ -71,21 +70,21 @@ impl RegexBuilder { /// Note that calling `as_str` on the resulting `Regex` will produce the /// pattern given to `new` verbatim. Notably, it will not incorporate any /// of the flags set on this builder. - pub fn compile(self) -> Result<Regex, Error> { - ExecBuilder::new_options(self.0) + pub fn build(&self) -> Result<Regex, Error> { + ExecBuilder::new_options(self.0.clone()) .only_utf8($only_utf8) .build() .map(Regex::from) } /// Set the value for the case insensitive (`i`) flag. - pub fn case_insensitive(mut self, yes: bool) -> RegexBuilder { + pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { self.0.case_insensitive = yes; self } /// Set the value for the multi-line matching (`m`) flag. - pub fn multi_line(mut self, yes: bool) -> RegexBuilder { + pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { self.0.multi_line = yes; self } @@ -97,19 +96,19 @@ impl RegexBuilder { /// N.B. "matches anything" means "any byte" for `regex::bytes::Regex` /// expressions and means "any Unicode codepoint" for `regex::Regex` /// expressions. - pub fn dot_matches_new_line(mut self, yes: bool) -> RegexBuilder { + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder { self.0.dot_matches_new_line = yes; self } /// Set the value for the greedy swap (`U`) flag. - pub fn swap_greed(mut self, yes: bool) -> RegexBuilder { + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { self.0.swap_greed = yes; self } /// Set the value for the ignore whitespace (`x`) flag.
- pub fn ignore_whitespace(mut self, yes: bool) -> RegexBuilder { + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { self.0.ignore_whitespace = yes; self } @@ -117,7 +116,7 @@ impl RegexBuilder { /// Set the value for the Unicode (`u`) flag. /// /// For byte based regular expressions, this is disabled by default. - pub fn unicode(mut self, yes: bool) -> RegexBuilder { + pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { self.0.unicode = yes; self } @@ -127,7 +126,7 @@ impl RegexBuilder { /// This roughly corresponds to the number of bytes occupied by a single /// compiled program. If the program exceeds this number, then a /// compilation error is returned. - pub fn size_limit(mut self, limit: usize) -> RegexBuilder { + pub fn size_limit(&mut self, limit: usize) -> &mut RegexBuilder { self.0.size_limit = limit; self } @@ -141,7 +140,7 @@ impl RegexBuilder { /// limit. In particular, if a regex is used from multiple threads /// simulanteously, then each thread may use up to the number of bytes /// specified here. 
- pub fn dfa_size_limit(mut self, limit: usize) -> RegexBuilder { + pub fn dfa_size_limit(&mut self, limit: usize) -> &mut RegexBuilder { self.0.dfa_size_limit = limit; self } @@ -150,5 +149,5 @@ impl RegexBuilder { } } -define_builder!(bytes, re_bytes, false, false); -define_builder!(unicode, re_unicode, true, true); +define_builder!(bytes, re_bytes, false); +define_builder!(unicode, re_unicode, true); diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 97ac5b923a..a625fe2aa9 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -10,7 +10,6 @@ use std::borrow::Cow; use std::collections::HashMap; -use std::collections::hash_map; use std::fmt; use std::ops::Index; use std::str::FromStr; @@ -19,10 +18,50 @@ use std::sync::Arc; use memchr::memchr; use exec::{Exec, ExecNoSync}; -use expand::expand; +use expand::expand_bytes; use error::Error; use re_builder::bytes::RegexBuilder; -use re_trait::{self, RegularExpression, Slot}; +use re_trait::{self, RegularExpression, Locations}; + +/// Match represents a single match of a regex in a haystack. +/// +/// The lifetime parameter `'t` refers to the lifetime of the matched text. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct Match<'t> { + text: &'t [u8], + start: usize, + end: usize, +} + +impl<'t> Match<'t> { + /// Returns the starting byte offset of the match in the haystack. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the haystack. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns the matched text. + #[inline] + pub fn as_bytes(&self) -> &'t [u8] { + self.text + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> { + Match { + text: &haystack[start..end], + start: start, + end: end, + } + } +} /// A compiled regular expression for matching arbitrary bytes. 
/// @@ -71,22 +110,14 @@ impl FromStr for Regex { } } +/// Core regular expression methods. impl Regex { /// Compiles a regular expression. Once compiled, it can be used repeatedly /// to search, split or replace text in a string. /// /// If an invalid expression is given, then an error is returned. pub fn new(re: &str) -> Result<Regex, Error> { - Regex::with_size_limit(10 * (1 << 20), re) - } - - /// Compiles a regular expression with the given size limit. - /// - /// The size limit is applied to the size of the *compiled* data structure. - /// If the data structure exceeds the size given, then an error is - /// returned. - pub fn with_size_limit(size: usize, re: &str) -> Result<Regex, Error> { - RegexBuilder::new(re).size_limit(size).compile() + RegexBuilder::new(re).build() } /// Returns true if and only if the regex matches the string given. @@ -111,17 +142,6 @@ impl Regex { self.is_match_at(text, 0) } - /// Returns the same as is_match, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn is_match_at(&self, text: &[u8], start: usize) -> bool { - self.shortest_match_at(text, start).is_some() - } - /// Returns the start and end byte range of the leftmost-first match in /// `text`. If no match exists, then `None` is returned.
/// @@ -138,29 +158,14 @@ impl Regex { /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let text = b"I categorically deny having triskaidekaphobia."; - /// let pos = Regex::new(r"\b\w{13}\b").unwrap().find(text); - /// assert_eq!(pos, Some((2, 15))); + /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); + /// assert_eq!((mat.start(), mat.end()), (2, 15)); /// # } /// ``` - pub fn find(&self, text: &[u8]) -> Option<(usize, usize)> { + pub fn find<'t>(&self, text: &'t [u8]) -> Option<Match<'t>> { self.find_at(text, 0) } - /// Returns the same as find, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn find_at( - &self, - text: &[u8], - start: usize, - ) -> Option<(usize, usize)> { - self.0.searcher().find_at(text, start) - } - /// Returns an iterator for each successive non-overlapping match in /// `text`, returning the start and end byte indices with respect to /// `text`.
@@ -174,18 +179,13 @@ impl Regex { /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let text = b"Retroactively relinquishing remunerations is reprehensible."; - /// for pos in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { - /// println!("{:?}", pos); + /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { + /// println!("{:?}", mat); /// } - /// // Output: - /// // (0, 13) - /// // (14, 27) - /// // (28, 41) - /// // (45, 58) /// # } /// ``` - pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> FindMatches<'r, 't> { - FindMatches(self.0.searcher().find_iter(text)) + pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> Matches<'r, 't> { + Matches(self.0.searcher().find_iter(text)) } /// Returns the capture groups corresponding to the leftmost-first @@ -209,9 +209,9 @@ impl Regex { /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.at(1), Some(&b"Citizen Kane"[..])); - /// assert_eq!(caps.at(2), Some(&b"1941"[..])); - /// assert_eq!(caps.at(0), Some(&b"'Citizen Kane' (1941)"[..])); + /// assert_eq!(&caps[1], &b"Citizen Kane"[..]); + /// assert_eq!(&caps[2], &b"1941"[..]); + /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); /// // You can also access the groups by index using the Index notation. /// // Note that this will panic on an invalid index. 
/// assert_eq!(&caps[1], b"Citizen Kane"); @@ -232,9 +232,9 @@ impl Regex { /// .unwrap(); /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.name("title"), Some(&b"Citizen Kane"[..])); - /// assert_eq!(caps.name("year"), Some(&b"1941"[..])); - /// assert_eq!(caps.at(0), Some(&b"'Citizen Kane' (1941)"[..])); + /// assert_eq!(&caps["title"], &b"Citizen Kane"[..]); + /// assert_eq!(&caps["year"], &b"1941"[..]); + /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); /// // You can also access the groups by name using the Index notation. /// // Note that this will panic on an invalid group name. /// assert_eq!(&caps["title"], b"Citizen Kane"); @@ -252,30 +252,14 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `at(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t [u8]) -> Option> { - let mut slots = vec![None; 2 * self.captures_len()]; - self.read_captures_at(&mut slots, text, 0).map(|_| Captures { + let mut locs = self.locations(); + self.read_captures_at(&mut locs, text, 0).map(|_| Captures { text: text, - slots: slots, + locs: locs, named_groups: self.0.capture_name_idx().clone(), }) } - /// Returns the same as captures, but starts the search at the given - /// offset and populates the capture locations given. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn read_captures_at( - &self, - slots: &mut [Slot], - text: &[u8], - start: usize, - ) -> Option<(usize, usize)> { - self.0.searcher().read_captures_at(slots, text, start) - } - /// Returns an iterator over all the non-overlapping capture groups matched /// in `text`. This is operationally the same as `find_iter`, except it /// yields information about submatches. 
@@ -305,8 +289,8 @@ impl Regex { pub fn captures_iter<'r, 't>( &'r self, text: &'t [u8], - ) -> FindCaptures<'r, 't> { - FindCaptures(self.0.searcher().captures_iter(text)) + ) -> CaptureMatches<'r, 't> { + CaptureMatches(self.0.searcher().captures_iter(text)) } /// Returns an iterator of substrings of `text` delimited by a match of the @@ -329,8 +313,8 @@ impl Regex { /// ]); /// # } /// ``` - pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Splits<'r, 't> { - Splits { + pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> { + Split { finder: self.find_iter(text), last: 0, } @@ -360,8 +344,8 @@ impl Regex { &'r self, text: &'t [u8], limit: usize, - ) -> SplitsN<'r, 't> { - SplitsN { + ) -> SplitN<'r, 't> { + SplitN { splits: self.split(text), n: limit, } @@ -375,6 +359,25 @@ impl Regex { /// If no match is found, then a copy of the byte string is returned /// unchanged. /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text are replaced with the + /// corresponding capture group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// /// # Examples /// /// Note that this function is polymorphic with respect to the replacement.
@@ -384,7 +387,7 @@ impl Regex { /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let re = Regex::new("[^01]+").unwrap(); - /// assert_eq!(re.replace(b"1078910", &b""[..]), b"1010"); + /// assert_eq!(re.replace(b"1078910", &b""[..]), &b"1010"[..]); /// # } /// ``` /// @@ -403,7 +406,7 @@ impl Regex { /// replacement.extend(&caps[1]); /// replacement /// }); - /// assert_eq!(result, b"Bruce Springsteen"); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); /// # } /// ``` /// @@ -417,7 +420,7 @@ impl Regex { /// # fn main() { /// let re = Regex::new(r"(?P[^,\s]+),\s+(?P\S+)").unwrap(); /// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]); - /// assert_eq!(result, b"Bruce Springsteen"); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); /// # } /// ``` /// @@ -442,10 +445,14 @@ impl Regex { /// /// let re = Regex::new(r"(?P[^,\s]+),\s+(\S+)").unwrap(); /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last")); - /// assert_eq!(result, b"$2 $last"); + /// assert_eq!(result, &b"$2 $last"[..]); /// # } /// ``` - pub fn replace(&self, text: &[u8], rep: R) -> Vec { + pub fn replace<'t, R: Replacer>( + &self, + text: &'t [u8], + rep: R, + ) -> Cow<'t, [u8]> { self.replacen(text, 1, rep) } @@ -455,7 +462,11 @@ impl Regex { /// /// See the documentation for `replace` for details on how to access /// submatches in the replacement text. - pub fn replace_all(&self, text: &[u8], rep: R) -> Vec { + pub fn replace_all<'t, R: Replacer>( + &self, + text: &'t [u8], + rep: R, + ) -> Cow<'t, [u8]> { self.replacen(text, 0, rep) } @@ -465,45 +476,56 @@ impl Regex { /// /// See the documentation for `replace` for details on how to access /// submatches in the replacement text. 
- pub fn replacen( + pub fn replacen<'t, R: Replacer>( &self, - text: &[u8], + text: &'t [u8], limit: usize, mut rep: R, - ) -> Vec { + ) -> Cow<'t, [u8]> { if let Some(rep) = rep.no_expansion() { + let mut it = self.find_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } let mut new = Vec::with_capacity(text.len()); let mut last_match = 0; - for (i, (s, e)) in self.find_iter(text).enumerate() { + for (i, m) in it { if limit > 0 && i >= limit { break } - extend_from_slice(&mut new, &text[last_match..s]); + extend_from_slice(&mut new, &text[last_match..m.start()]); extend_from_slice(&mut new, &*rep); - last_match = e; + last_match = m.end(); } extend_from_slice(&mut new, &text[last_match..]); - return new; + return Cow::Owned(new); } // The slower path, which we use if the replacement needs access to // capture groups. + let mut it = self.captures_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } let mut new = Vec::with_capacity(text.len()); let mut last_match = 0; - for (i, cap) in self.captures_iter(text).enumerate() { + for (i, cap) in it { if limit > 0 && i >= limit { break } // unwrap on 0 is OK because captures only reports matches - let (s, e) = cap.pos(0).unwrap(); - extend_from_slice(&mut new, &text[last_match..s]); + let m = cap.get(0).unwrap(); + extend_from_slice(&mut new, &text[last_match..m.start()]); rep.replace_append(&cap, &mut new); - last_match = e; + last_match = m.end(); } extend_from_slice(&mut new, &text[last_match..]); - new + Cow::Owned(new) } +} +/// Advanced or "lower level" search methods. +impl Regex { /// Returns the end location of a match in the text given. /// /// This method may have the same performance characteristics as @@ -544,6 +566,53 @@ impl Regex { self.0.searcher().shortest_match_at(text, start) } + /// Returns the same as is_match, but starts the search at the given + /// offset. 
+ /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn is_match_at(&self, text: &[u8], start: usize) -> bool { + self.shortest_match_at(text, start).is_some() + } + + /// Returns the same as find, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn find_at<'t>( + &self, + text: &'t [u8], + start: usize, + ) -> Option> { + self.0.searcher().find_at(text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// Returns the same as captures, but starts the search at the given + /// offset and populates the capture locations given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn read_captures_at<'t>( + &self, + locs: &mut Locations, + text: &'t [u8], + start: usize, + ) -> Option> { + self.0.searcher().read_captures_at(locs, text, start) + .map(|(s, e)| Match::new(text, s, e)) + } +} + +/// Auxiliary methods. +impl Regex { /// Returns the original string of this regex. pub fn as_str(&self) -> &str { &self.0.regex_strings()[0] @@ -558,6 +627,13 @@ impl Regex { pub fn captures_len(&self) -> usize { self.0.capture_names().len() } + + /// Returns an empty set of locations that can be reused in multiple calls + /// to `read_captures_at`. + #[doc(hidden)] + pub fn locations(&self) -> Locations { + self.0.searcher().locations() + } } /// An iterator over all non-overlapping matches for a particular string.
@@ -568,13 +644,14 @@ impl Regex { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched byte string. -pub struct FindMatches<'r, 't>(re_trait::FindMatches<'t, ExecNoSync<'r>>); +pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSync<'r>>); -impl<'r, 't> Iterator for FindMatches<'r, 't> { - type Item = (usize, usize); +impl<'r, 't> Iterator for Matches<'r, 't> { + type Item = Match<'t>; - fn next(&mut self) -> Option<(usize, usize)> { - self.0.next() + fn next(&mut self) -> Option> { + let text = self.0.text(); + self.0.next().map(|(s, e)| Match::new(text, s, e)) } } @@ -585,15 +662,15 @@ impl<'r, 't> Iterator for FindMatches<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched byte string. -pub struct FindCaptures<'r, 't>(re_trait::FindCaptures<'t, ExecNoSync<'r>>); +pub struct CaptureMatches<'r, 't>(re_trait::CaptureMatches<'t, ExecNoSync<'r>>); -impl<'r, 't> Iterator for FindCaptures<'r, 't> { +impl<'r, 't> Iterator for CaptureMatches<'r, 't> { type Item = Captures<'t>; fn next(&mut self) -> Option> { - self.0.next().map(|slots| Captures { + self.0.next().map(|locs| Captures { text: self.0.text(), - slots: slots, + locs: locs, named_groups: self.0.regex().capture_name_idx().clone(), }) } @@ -603,12 +680,12 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the byte string being split. 
-pub struct Splits<'r, 't> { - finder: FindMatches<'r, 't>, +pub struct Split<'r, 't> { + finder: Matches<'r, 't>, last: usize, } -impl<'r, 't> Iterator for Splits<'r, 't> { +impl<'r, 't> Iterator for Split<'r, 't> { type Item = &'t [u8]; fn next(&mut self) -> Option<&'t [u8]> { @@ -623,9 +700,9 @@ impl<'r, 't> Iterator for Splits<'r, 't> { Some(s) } } - Some((s, e)) => { - let matched = &text[self.last..s]; - self.last = e; + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.end(); Some(matched) } } @@ -638,12 +715,12 @@ impl<'r, 't> Iterator for Splits<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the byte string being split. -pub struct SplitsN<'r, 't> { - splits: Splits<'r, 't>, +pub struct SplitN<'r, 't> { + splits: Split<'r, 't>, n: usize, } -impl<'r, 't> Iterator for SplitsN<'r, 't> { +impl<'r, 't> Iterator for SplitN<'r, 't> { type Item = &'t [u8]; fn next(&mut self) -> Option<&'t [u8]> { @@ -694,61 +771,22 @@ impl<'r> Iterator for CaptureNames<'r> { /// `'t` is the lifetime of the matched text. pub struct Captures<'t> { text: &'t [u8], - slots: Vec>, + locs: Locations, named_groups: Arc>, } impl<'t> Captures<'t> { - /// Returns the start and end positions of the Nth capture group. Returns - /// `None` if `i` is not a valid capture group or if the capture group did - /// not match anything. The positions returned are *always* byte indices - /// with respect to the original byte string matched. - pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - let (s, e) = (i * 2, i * 2 + 1); - match (self.slots.get(s), self.slots.get(e)) { - (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), - _ => None, - } - } - - /// Returns the matched string for the capture group `i`. If `i` isn't - /// a valid capture group or didn't match anything, then `None` is - /// returned. 
- pub fn at(&self, i: usize) -> Option<&'t [u8]> { - match self.pos(i) { - None => None, - Some((s, e)) => Some(&self.text[s..e]) - } - } - - /// Returns the matched string for the capture group named `name`. If - /// `name` isn't a valid capture group or didn't match anything, then - /// `None` is returned. - pub fn name(&self, name: &str) -> Option<&'t [u8]> { - self.named_groups.get(name).and_then(|&i| self.at(i)) + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. + pub fn get(&self, i: usize) -> Option> { + self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) } - /// Creates an iterator of all the capture groups in order of appearance - /// in the regular expression. - pub fn iter<'a>(&'a self) -> SubCaptures<'a, 't> { - SubCaptures { idx: 0, caps: self } - } - - /// Creates an iterator of all the capture group positions in order of - /// appearance in the regular expression. Positions are byte indices - /// in terms of the original string matched. - pub fn iter_pos(&'t self) -> SubCapturesPos<'t> { - SubCapturesPos { idx: 0, slots: &self.slots } - } - - /// Creates an iterator of all named groups as an tuple with the group - /// name and the value. The iterator returns these values in arbitrary - /// order. - pub fn iter_named<'a>(&'a self) -> SubCapturesNamed<'a, 't> { - SubCapturesNamed { - caps: self, - names: self.named_groups.iter() - } + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. + pub fn name(&self, name: &str) -> Option> { + self.named_groups.get(name).and_then(|&i| self.get(i)) } /// Expands all instances of `$name` in `text` to the corresponding capture @@ -768,19 +806,16 @@ impl<'t> Captures<'t> { /// /// To write a literal `$` use `$$`. 
pub fn expand(&self, replacement: &[u8], dst: &mut Vec) { - expand(self, replacement, dst) + expand_bytes(self, replacement, dst) } /// Returns the number of captured groups. + /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. #[inline] pub fn len(&self) -> usize { - self.slots.len() / 2 - } - - /// Returns true if and only if there are no captured groups. - #[inline] - pub fn is_empty(&self) -> bool { - self.len() == 0 + self.locs.len() } } @@ -814,7 +849,7 @@ impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { let slot_to_name: HashMap<&usize, &String> = self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); let mut map = f.debug_map(); - for (slot, m) in self.0.iter_pos().enumerate() { + for (slot, m) in self.0.locs.iter().enumerate() { let m = m.map(|(s, e)| escape_bytes(&self.0.text[s..e])); if let Some(ref name) = slot_to_name.get(&slot) { map.entry(&name, &m); @@ -841,7 +876,8 @@ impl<'t> Index for Captures<'t> { type Output = [u8]; fn index(&self, i: usize) -> &[u8] { - self.at(i).unwrap_or_else(|| panic!("no group at index '{}'", i)) + self.get(i).map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) } } @@ -861,75 +897,8 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { type Output = [u8]; fn index<'a>(&'a self, name: &'i str) -> &'a [u8] { - self.name(name).unwrap_or_else(|| panic!("no group named '{}'", name)) - } -} - -/// An iterator over capture groups for a particular match of a regular -/// expression. -/// -/// `'c` is the lifetime of the captures and `'t` is the lifetime of the -/// matched text. 
-pub struct SubCaptures<'c, 't: 'c> { - idx: usize, - caps: &'c Captures<'t>, -} - -impl<'c, 't> Iterator for SubCaptures<'c, 't> { - type Item = Option<&'t [u8]>; - - fn next(&mut self) -> Option> { - if self.idx < self.caps.len() { - self.idx += 1; - Some(self.caps.at(self.idx - 1)) - } else { - None - } - } -} - -/// An iterator over capture group positions for a particular match of a -/// regular expression. -/// -/// Positions are byte indices in terms of the original byte string matched. -/// -/// `'c` is the lifetime of the captures. -pub struct SubCapturesPos<'c> { - idx: usize, - slots: &'c [Option] -} - -impl<'c> Iterator for SubCapturesPos<'c> { - type Item = Option<(usize, usize)>; - - fn next(&mut self) -> Option> { - if self.idx >= self.slots.len() { - return None - } - let r = match (self.slots[self.idx], self.slots[self.idx + 1]) { - (Some(s), Some(e)) => Some((s, e)), - _ => None, - }; - self.idx += 2; - Some(r) - } -} - -/// An Iterator over named capture groups as a tuple with the group name and -/// the value. -/// -/// `'c` is the lifetime of the captures and `'t` is the lifetime of the -/// matched text. -pub struct SubCapturesNamed<'c, 't: 'c> { - caps: &'c Captures<'t>, - names: hash_map::Iter<'c, String, usize>, -} - -impl<'c, 't> Iterator for SubCapturesNamed<'c, 't> { - type Item = (&'c str, Option<&'t [u8]>); - - fn next(&mut self) -> Option<(&'c str, Option<&'t [u8]>)> { - self.names.next().map(|(name, &pos)| (&**name, self.caps.at(pos))) + self.name(name).map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) } } @@ -946,7 +915,7 @@ pub trait Replacer { /// have a match at capture group `0`. /// /// For example, a no-op replacement would be - /// `dst.extend(caps.at(0).unwrap())`. + /// `dst.extend(&caps[0])`. fn replace_append(&mut self, caps: &Captures, dst: &mut Vec); /// Return a fixed unchanging replacement byte string. 
diff --git a/src/re_plugin.rs b/src/re_plugin.rs index d453ef7e7e..afd828921b 100644 --- a/src/re_plugin.rs +++ b/src/re_plugin.rs @@ -8,7 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use re_trait::{RegularExpression, Slot}; +use re_trait::{RegularExpression, Slot, Locations, as_slots}; /// Plugin is the compiler plugin's data structure. It declare some static /// data (like capture groups and the original regex string), but defines its @@ -67,15 +67,20 @@ impl RegularExpression for Plugin { fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { let mut slots = [None, None]; - self.read_captures_at(&mut slots, text, start) + (self.prog)(&mut slots, text, start); + match (slots[0], slots[1]) { + (Some(s), Some(e)) => Some((s, e)), + _ => None, + } } fn read_captures_at<'t>( &self, - slots: &mut [Slot], + locs: &mut Locations, text: &'t str, start: usize, ) -> Option<(usize, usize)> { + let slots = as_slots(locs); for slot in slots.iter_mut() { *slot = None; } diff --git a/src/re_trait.rs b/src/re_trait.rs index 1841efb6a8..9f3407c98b 100644 --- a/src/re_trait.rs +++ b/src/re_trait.rs @@ -13,6 +13,77 @@ /// of the capture). pub type Slot = Option; +/// Locations represents the offsets of each capturing group in a regex for +/// a single match. +/// +/// Unlike `Captures`, a `Locations` value only stores offsets. +#[doc(hidden)] +pub struct Locations(Vec); + +impl Locations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. 
+ pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + let (s, e) = (i * 2, i * 2 + 1); + match (self.0.get(s), self.0.get(e)) { + (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), + _ => None, + } + } + + /// Creates an iterator of all the capture group positions in order of + /// appearance in the regular expression. Positions are byte indices + /// in terms of the original string matched. + pub fn iter(&self) -> SubCapturesPosIter { + SubCapturesPosIter { idx: 0, locs: &self } + } + + /// Returns the total number of capturing groups. + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + pub fn len(&self) -> usize { + self.0.len() / 2 + } +} + +/// This is a hack to make Locations -> &mut [Slot] be available internally +/// without exposing it in the public API. +pub fn as_slots(locs: &mut Locations) -> &mut [Slot] { + &mut locs.0 +} + +/// An iterator over capture group positions for a particular match of a +/// regular expression. +/// +/// Positions are byte indices in terms of the original string matched. +/// +/// `'c` is the lifetime of the captures. +pub struct SubCapturesPosIter<'c> { + idx: usize, + locs: &'c Locations, +} + +impl<'c> Iterator for SubCapturesPosIter<'c> { + type Item = Option<(usize, usize)>; + + fn next(&mut self) -> Option> { + if self.idx >= self.locs.len() { + return None; + } + let x = match self.locs.pos(self.idx) { + None => Some(None), + Some((s, e)) => { + Some(Some((s, e))) + } + }; + self.idx += 1; + x + } +} + /// RegularExpression describes types that can implement regex searching. /// /// This trait is my attempt at reducing code duplication and to standardize @@ -33,6 +104,11 @@ pub trait RegularExpression: Sized { /// always two times the number of capture groups (two slots per group). fn slots_len(&self) -> usize; + /// Allocates fresh space for all capturing groups in this regex. 
+ fn locations(&self) -> Locations { + Locations(vec![None; self.slots_len()]) + } + /// Returns the position of the next character after `i`. /// /// For example, a haystack with type `&[u8]` probably returns `i+1`, @@ -65,7 +141,7 @@ pub trait RegularExpression: Sized { /// fills in any matching capture slot locations. fn read_captures_at( &self, - slots: &mut [Slot], + locs: &mut Locations, text: &Self::Text, start: usize, ) -> Option<(usize, usize)>; @@ -75,8 +151,8 @@ pub trait RegularExpression: Sized { fn find_iter<'t>( self, text: &'t Self::Text, - ) -> FindMatches<'t, Self> { - FindMatches { + ) -> Matches<'t, Self> { + Matches { re: self, text: text, last_end: 0, @@ -89,20 +165,20 @@ pub trait RegularExpression: Sized { fn captures_iter<'t>( self, text: &'t Self::Text, - ) -> FindCaptures<'t, Self> { - FindCaptures(self.find_iter(text)) + ) -> CaptureMatches<'t, Self> { + CaptureMatches(self.find_iter(text)) } } /// An iterator over all non-overlapping successive leftmost-first matches. -pub struct FindMatches<'t, R> where R: RegularExpression, R::Text: 't { +pub struct Matches<'t, R> where R: RegularExpression, R::Text: 't { re: R, text: &'t R::Text, last_end: usize, last_match: Option, } -impl<'t, R> FindMatches<'t, R> where R: RegularExpression, R::Text: 't { +impl<'t, R> Matches<'t, R> where R: RegularExpression, R::Text: 't { /// Return the text being searched. pub fn text(&self) -> &'t R::Text { self.text @@ -114,7 +190,7 @@ impl<'t, R> FindMatches<'t, R> where R: RegularExpression, R::Text: 't { } } -impl<'t, R> Iterator for FindMatches<'t, R> +impl<'t, R> Iterator for Matches<'t, R> where R: RegularExpression, R::Text: 't + AsRef<[u8]> { type Item = (usize, usize); @@ -146,10 +222,10 @@ impl<'t, R> Iterator for FindMatches<'t, R> /// An iterator over all non-overlapping successive leftmost-first matches with /// captures. 
-pub struct FindCaptures<'t, R>(FindMatches<'t, R>) +pub struct CaptureMatches<'t, R>(Matches<'t, R>) where R: RegularExpression, R::Text: 't; -impl<'t, R> FindCaptures<'t, R> where R: RegularExpression, R::Text: 't { +impl<'t, R> CaptureMatches<'t, R> where R: RegularExpression, R::Text: 't { /// Return the text being searched. pub fn text(&self) -> &'t R::Text { self.0.text() @@ -161,17 +237,17 @@ impl<'t, R> FindCaptures<'t, R> where R: RegularExpression, R::Text: 't { } } -impl<'t, R> Iterator for FindCaptures<'t, R> +impl<'t, R> Iterator for CaptureMatches<'t, R> where R: RegularExpression, R::Text: 't + AsRef<[u8]> { - type Item = Vec; + type Item = Locations; - fn next(&mut self) -> Option> { + fn next(&mut self) -> Option { if self.0.last_end > self.0.text.as_ref().len() { return None } - let mut slots = vec![None; self.0.re.slots_len()]; + let mut locs = self.0.re.locations(); let (s, e) = match self.0.re.read_captures_at( - &mut slots, + &mut locs, self.0.text, self.0.last_end, ) { @@ -187,6 +263,6 @@ impl<'t, R> Iterator for FindCaptures<'t, R> self.0.last_end = e; } self.0.last_match = Some(e); - Some(slots) + Some(locs) } } diff --git a/src/re_unicode.rs b/src/re_unicode.rs index ed3c6b5bde..a8c5983f8a 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -15,13 +15,15 @@ use std::ops::Index; use std::str::FromStr; use std::sync::Arc; +use memchr::memchr; use syntax; use error::Error; use exec::{Exec, ExecNoSyncStr}; +use expand::expand_str; use re_builder::unicode::RegexBuilder; use re_plugin::Plugin; -use re_trait::{self, RegularExpression, Slot}; +use re_trait::{self, RegularExpression, Locations}; /// Escapes all regular expression meta characters in `text`. /// @@ -31,15 +33,44 @@ pub fn quote(text: &str) -> String { syntax::quote(text) } -/// Tests if the given regular expression matches somewhere in the text given. +/// Match represents a single match of a regex in a haystack. 
/// -/// If there was a problem compiling the regular expression, an error is -/// returned. -/// -/// To find submatches, split or replace text, you'll need to compile an -/// expression first. -pub fn is_match(regex: &str, text: &str) -> Result { - Regex::new(regex).map(|r| r.is_match(text)) +/// The lifetime parameter `'t` refers to the lifetime of the matched text. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct Match<'t> { + text: &'t str, + start: usize, + end: usize, +} + +impl<'t> Match<'t> { + /// Returns the starting byte offset of the match in the haystack. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the haystack. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns the matched text. + #[inline] + pub fn as_str(&self) -> &'t str { + self.text + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> { + Match { + text: &haystack[start..end], + start: start, + end: end, + } + } } /// A compiled regular expression for matching Unicode strings. @@ -70,13 +101,14 @@ pub fn is_match(regex: &str, text: &str) -> Result { /// ```rust /// # use regex::Regex; /// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap(); -/// assert_eq!(re.find("phone: 111-222-3333"), Some((7, 19))); +/// let mat = re.find("phone: 111-222-3333").unwrap(); +/// assert_eq!((mat.start(), mat.end()), (7, 19)); /// ``` /// /// # Using the `std::str::StrExt` methods with `Regex` /// -/// > **Note**: This section requires that this crate is currently compiled with -/// > the `pattern` Cargo feature enabled. +/// > **Note**: This section requires that this crate is currently compiled +/// > with the `pattern` Cargo feature enabled. /// /// Since `Regex` implements `Pattern`, you can use regexes with methods /// defined on `std::str::StrExt`. 
For example, `is_match`, `find`, `find_iter` @@ -134,18 +166,6 @@ impl From for Regex { } } -/// Equality comparison is based on the original string. It is possible that -/// different regular expressions have the same matching behavior, but are -/// still compared unequal. For example, `\d+` and `\d\d*` match the same set -/// of strings, but are not considered equal. -impl PartialEq for Regex { - fn eq(&self, other: &Regex) -> bool { - self.as_str() == other.as_str() - } -} - -impl Eq for Regex {} - impl FromStr for Regex { type Err = Error; @@ -155,22 +175,14 @@ impl FromStr for Regex { } } +/// Core regular expression methods. impl Regex { /// Compiles a regular expression. Once compiled, it can be used repeatedly /// to search, split or replace text in a string. /// /// If an invalid expression is given, then an error is returned. pub fn new(re: &str) -> Result { - Regex::with_size_limit(10 * (1 << 20), re) - } - - /// Compiles a regular expression with the given size limit. - /// - /// The size limit is applied to the size of the *compiled* data structure. - /// If the data structure exceeds the size given, then an error is - /// returned. - pub fn with_size_limit(size: usize, re: &str) -> Result { - RegexBuilder::new(re).size_limit(size).compile() + RegexBuilder::new(re).build() } /// Returns true if and only if the regex matches the string given. @@ -195,17 +207,6 @@ impl Regex { self.is_match_at(text, 0) } - /// Returns the same as is_match, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn is_match_at(&self, text: &str, start: usize) -> bool { - self.shortest_match_at(text, start).is_some() - } - /// Returns the start and end byte range of the leftmost-first match in /// `text`. If no match exists, then `None` is returned. 
/// @@ -222,30 +223,15 @@ impl Regex { /// # extern crate regex; use regex::Regex; /// # fn main() { /// let text = "I categorically deny having triskaidekaphobia."; - /// let pos = Regex::new(r"\b\w{13}\b").unwrap().find(text); - /// assert_eq!(pos, Some((2, 15))); + /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); + /// assert_eq!(mat.start(), 2); + /// assert_eq!(mat.end(), 15); /// # } /// ``` - pub fn find(&self, text: &str) -> Option<(usize, usize)> { + pub fn find<'t>(&self, text: &'t str) -> Option> { self.find_at(text, 0) } - /// Returns the same as find, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().find_at(text, start) - } - _Regex::Plugin(ref plug) => plug.find_at(text, start), - } - } - /// Returns an iterator for each successive non-overlapping match in /// `text`, returning the start and end byte indices with respect to /// `text`. 
@@ -259,25 +245,20 @@ impl Regex { /// # extern crate regex; use regex::Regex; /// # fn main() { /// let text = "Retroactively relinquishing remunerations is reprehensible."; - /// for pos in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { - /// println!("{:?}", pos); + /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { + /// println!("{:?}", mat); /// } - /// // Output: - /// // (0, 13) - /// // (14, 27) - /// // (28, 41) - /// // (45, 58) /// # } /// ``` - pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { + pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> { match self.0 { _Regex::Dynamic(ref exec) => { let it = exec.searcher_str().find_iter(text); - FindMatches(FindMatchesInner::Dynamic(it)) + Matches(MatchesInner::Dynamic(it)) } _Regex::Plugin(ref plug) => { let it = plug.find_iter(text); - FindMatches(FindMatchesInner::Plugin(it)) + Matches(MatchesInner::Plugin(it)) } } } @@ -303,9 +284,9 @@ impl Regex { /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.at(1), Some("Citizen Kane")); - /// assert_eq!(caps.at(2), Some("1941")); - /// assert_eq!(caps.at(0), Some("'Citizen Kane' (1941)")); + /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane"); + /// assert_eq!(caps.get(2).unwrap().as_str(), "1941"); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); /// // You can also access the groups by index using the Index notation. /// // Note that this will panic on an invalid index. 
/// assert_eq!(&caps[1], "Citizen Kane"); @@ -326,9 +307,9 @@ impl Regex { /// .unwrap(); /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.name("title"), Some("Citizen Kane")); - /// assert_eq!(caps.name("year"), Some("1941")); - /// assert_eq!(caps.at(0), Some("'Citizen Kane' (1941)")); + /// assert_eq!(&caps["title"], "Citizen Kane"); + /// assert_eq!(&caps["year"], "1941"); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); /// // You can also access the groups by name using the Index notation. /// // Note that this will panic on an invalid group name. /// assert_eq!(&caps["title"], "Citizen Kane"); @@ -346,37 +327,14 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `at(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t str) -> Option> { - let mut slots = vec![None; 2 * self.captures_len()]; - self.read_captures_at(&mut slots, text, 0).map(|_| Captures { + let mut locs = self.locations(); + self.read_captures_at(&mut locs, text, 0).map(|_| Captures { text: text, - slots: slots, + locs: locs, named_groups: NamedGroups::from_regex(self) }) } - /// Returns the same as captures, but starts the search at the given - /// offset and populates the capture locations given. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn read_captures_at( - &self, - slots: &mut [Slot], - text: &str, - start: usize, - ) -> Option<(usize, usize)> { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().read_captures_at(slots, text, start) - } - _Regex::Plugin(ref plug) => { - plug.read_captures_at(slots, text, start) - } - } - } - /// Returns an iterator over all the non-overlapping capture groups matched /// in `text`. 
This is operationally the same as `find_iter`, except it /// yields information about submatches. @@ -394,7 +352,7 @@ impl Regex { /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; /// for caps in re.captures_iter(text) { /// println!("Movie: {:?}, Released: {:?}", - /// caps.name("title"), caps.name("year")); + /// &caps["title"], &caps["year"]); /// } /// // Output: /// // Movie: Citizen Kane, Released: 1941 @@ -405,15 +363,15 @@ impl Regex { pub fn captures_iter<'r, 't>( &'r self, text: &'t str, - ) -> FindCaptures<'r, 't> { + ) -> CaptureMatches<'r, 't> { match self.0 { _Regex::Dynamic(ref exec) => { let it = exec.searcher_str().captures_iter(text); - FindCaptures(FindCapturesInner::Dynamic(it)) + CaptureMatches(CaptureMatchesInner::Dynamic(it)) } _Regex::Plugin(ref plug) => { let it = plug.captures_iter(text); - FindCaptures(FindCapturesInner::Plugin(it)) + CaptureMatches(CaptureMatchesInner::Plugin(it)) } } } @@ -436,8 +394,8 @@ impl Regex { /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); /// # } /// ``` - pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> { - RegexSplits { + pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> { + Split { finder: self.find_iter(text), last: 0, } @@ -464,8 +422,8 @@ impl Regex { /// # } /// ``` pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize) - -> RegexSplitsN<'r, 't> { - RegexSplitsN { + -> SplitN<'r, 't> { + SplitN { splits: self.split(text), n: limit, } @@ -478,6 +436,25 @@ impl Regex { /// /// If no match is found, then a copy of the string is returned unchanged. /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text are replaced with the + /// corresponding capture group `name`.
+ /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// /// # Examples /// /// Note that this function is polymorphic with respect to the replacement. @@ -501,7 +478,7 @@ impl Regex { /// # use regex::Captures; fn main() { /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| { - /// format!("{} {}", caps.at(2).unwrap_or(""), caps.at(1).unwrap_or("")) + /// format!("{} {}", &caps[2], &caps[1]) /// }); /// assert_eq!(result, "Bruce Springsteen"); /// # } @@ -538,7 +515,11 @@ impl Regex { /// assert_eq!(result, "$2 $last"); /// # } /// ``` - pub fn replace(&self, text: &str, rep: R) -> String { + pub fn replace<'t, R: Replacer>( + &self, + text: &'t str, + rep: R, + ) -> Cow<'t, str> { self.replacen(text, 1, rep) } @@ -548,7 +529,11 @@ impl Regex { /// /// See the documentation for `replace` for details on how to access /// submatches in the replacement string. - pub fn replace_all(&self, text: &str, rep: R) -> String { + pub fn replace_all<'t, R: Replacer>( + &self, + text: &'t str, + rep: R, + ) -> Cow<'t, str> { self.replacen(text, 0, rep) } @@ -558,13 +543,12 @@ impl Regex { /// /// See the documentation for `replace` for details on how to access /// submatches in the replacement string. 
- pub fn replacen( + pub fn replacen<'t, R: Replacer>( &self, - text: &str, + text: &'t str, limit: usize, mut rep: R, - ) -> String { - + ) -> Cow<'t, str> { // If we know that the replacement doesn't have any capture expansions, // then we can fast path. The fast path can make a tremendous // difference: @@ -574,39 +558,50 @@ impl Regex { // 2) We don't need to look up all of the capture groups and do // replacements inside the replacement string. We just push it // at each match and be done with it. - if let Some(rep) = rep.no_expand() { + if let Some(rep) = rep.no_expansion() { + let mut it = self.find_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (i, (s, e)) in self.find_iter(text).enumerate() { + for (i, m) in it { if limit > 0 && i >= limit { break } - new.push_str(&text[last_match..s]); + new.push_str(&text[last_match..m.start()]); new.push_str(&rep); - last_match = e; + last_match = m.end(); } new.push_str(&text[last_match..]); - return new; + return Cow::Owned(new); } // The slower path, which we use if the replacement needs access to // capture groups. + let mut it = self.captures_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (i, cap) in self.captures_iter(text).enumerate() { + for (i, cap) in it { if limit > 0 && i >= limit { break } // unwrap on 0 is OK because captures only reports matches - let (s, e) = cap.pos(0).unwrap(); - new.push_str(&text[last_match..s]); - new.push_str(&rep.reg_replace(&cap)); - last_match = e; + let m = cap.get(0).unwrap(); + new.push_str(&text[last_match..m.start()]); + rep.replace_append(&cap, &mut new); + last_match = m.end(); } new.push_str(&text[last_match..]); - new + Cow::Owned(new) } +} +/// Advanced or "lower level" search methods. 
+impl Regex { /// Returns the end location of a match in the text given. /// /// This method may have the same performance characteristics as @@ -652,6 +647,69 @@ impl Regex { } } + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn is_match_at(&self, text: &str, start: usize) -> bool { + self.shortest_match_at(text, start).is_some() + } + + /// Returns the same as find, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn find_at<'t>( + &self, + text: &'t str, + start: usize, + ) -> Option> { + match self.0 { + _Regex::Dynamic(ref exec) => { + exec.searcher_str().find_at(text, start).map(|(s, e)| { + Match::new(text, s, e) + }) + } + _Regex::Plugin(ref plug) => { + plug.find_at(text, start).map(|(s, e)| Match::new(text, s, e)) + } + } + } + + /// Returns the same as captures, but starts the search at the given + /// offset and populates the capture locations given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn read_captures_at<'t>( + &self, + locs: &mut Locations, + text: &'t str, + start: usize, + ) -> Option> { + match self.0 { + _Regex::Dynamic(ref exec) => { + exec.searcher_str().read_captures_at(locs, text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + _Regex::Plugin(ref plug) => { + plug.read_captures_at(locs, text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + } + } +} + +/// Auxiliary methods. 
+impl Regex { /// Returns the original string of this regex. pub fn as_str(&self) -> &str { match self.0 { @@ -677,6 +735,18 @@ impl Regex { _Regex::Dynamic(ref d) => d.capture_names().len() } } + + /// Returns an empty set of locations that can be reused in multiple calls + /// to `read_captures`. + #[doc(hidden)] + pub fn locations(&self) -> Locations { + match self.0 { + _Regex::Dynamic(ref exec) => { + exec.searcher_str().locations() + } + _Regex::Plugin(ref plug) => plug.locations(), + } + } } /// An iterator over the names of all possible captures. @@ -688,9 +758,7 @@ impl Regex { pub struct CaptureNames<'r>(_CaptureNames<'r>); enum _CaptureNames<'r> { - #[doc(hidden)] Plugin(::std::slice::Iter<'r, Option<&'static str>>), - #[doc(hidden)] Dynamic(::std::slice::Iter<'r, Option>) } @@ -714,68 +782,16 @@ impl<'r> Iterator for CaptureNames<'r> { } } -/// NoExpand indicates literal string replacement. -/// -/// It can be used with `replace` and `replace_all` to do a literal -/// string replacement without expanding `$name` to their corresponding -/// capture groups. -/// -/// `'t` is the lifetime of the literal text. -pub struct NoExpand<'t>(pub &'t str); - -/// Replacer describes types that can be used to replace matches in a string. -pub trait Replacer { - /// Returns a possibly owned string that is used to replace the match - /// corresponding to the `caps` capture group. - /// - /// The `'a` lifetime refers to the lifetime of a borrowed string when - /// a new owned string isn't needed (e.g., for `NoExpand`). - fn reg_replace(&mut self, caps: &Captures) -> Cow; - - /// Returns a possibly owned string that never needs expansion. 
- fn no_expand(&mut self) -> Option> { None } -} - -impl<'t> Replacer for NoExpand<'t> { - fn reg_replace(&mut self, _: &Captures) -> Cow { - self.0.into() - } - - fn no_expand(&mut self) -> Option> { - Some(self.0.into()) - } -} - -impl<'t> Replacer for &'t str { - fn reg_replace<'a>(&'a mut self, caps: &Captures) -> Cow<'a, str> { - caps.expand(*self).into() - } - - fn no_expand(&mut self) -> Option> { - // if there is a $ there may be an expansion - match self.find('$') { - Some(_) => None, - None => Some((*self).into()), - } - } -} - -impl Replacer for F where F: FnMut(&Captures) -> String { - fn reg_replace<'a>(&'a mut self, caps: &Captures) -> Cow<'a, str> { - (*self)(caps).into() - } -} - /// Yields all substrings delimited by a regular expression match. /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the string being split. -pub struct RegexSplits<'r, 't> { - finder: FindMatches<'r, 't>, +pub struct Split<'r, 't> { + finder: Matches<'r, 't>, last: usize, } -impl<'r, 't> Iterator for RegexSplits<'r, 't> { +impl<'r, 't> Iterator for Split<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { @@ -790,9 +806,9 @@ impl<'r, 't> Iterator for RegexSplits<'r, 't> { Some(s) } } - Some((s, e)) => { - let matched = &text[self.last..s]; - self.last = e; + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.end(); Some(matched) } } @@ -805,12 +821,12 @@ impl<'r, 't> Iterator for RegexSplits<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the string being split. 
-pub struct RegexSplitsN<'r, 't> { - splits: RegexSplits<'r, 't>, +pub struct SplitN<'r, 't> { + splits: Split<'r, 't>, n: usize, } -impl<'r, 't> Iterator for RegexSplitsN<'r, 't> { +impl<'r, 't> Iterator for SplitN<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { @@ -893,109 +909,51 @@ impl<'n> Iterator for NamedGroupsIter<'n> { /// `'t` is the lifetime of the matched text. pub struct Captures<'t> { text: &'t str, - slots: Vec>, + locs: Locations, named_groups: NamedGroups, } impl<'t> Captures<'t> { - /// Returns the start and end positions of the Nth capture group. Returns - /// `None` if `i` is not a valid capture group or if the capture group did - /// not match anything. The positions returned are *always* byte indices - /// with respect to the original string matched. - pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - let (s, e) = (i * 2, i * 2 + 1); - match (self.slots.get(s), self.slots.get(e)) { - (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), - _ => None, - } - } - - /// Returns the matched string for the capture group `i`. If `i` isn't - /// a valid capture group or didn't match anything, then `None` is - /// returned. - pub fn at(&self, i: usize) -> Option<&'t str> { - match self.pos(i) { - None => None, - Some((s, e)) => Some(&self.text[s..e]) - } - } - - /// Returns the matched string for the capture group named `name`. If - /// `name` isn't a valid capture group or didn't match anything, then - /// `None` is returned. - pub fn name(&self, name: &str) -> Option<&'t str> { - self.named_groups.pos(name).and_then(|i| self.at(i)) + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. 
+ pub fn get(&self, i: usize) -> Option> { + self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) } - /// Creates an iterator of all the capture groups in order of appearance - /// in the regular expression. - pub fn iter(&'t self) -> SubCaptures<'t> { - SubCaptures { idx: 0, caps: self, } - } - - /// Creates an iterator of all the capture group positions in order of - /// appearance in the regular expression. Positions are byte indices - /// in terms of the original string matched. - pub fn iter_pos(&'t self) -> SubCapturesPos<'t> { - SubCapturesPos { idx: 0, slots: &self.slots } - } - - /// Creates an iterator of all named groups as an tuple with the group - /// name and the value. The iterator returns these values in arbitrary - /// order. - pub fn iter_named(&'t self) -> SubCapturesNamed<'t> { - SubCapturesNamed { - caps: self, - names: self.named_groups.iter() - } + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. + pub fn name(&self, name: &str) -> Option> { + self.named_groups.pos(name).and_then(|i| self.get(i)) } /// Expands all instances of `$name` in `text` to the corresponding capture - /// group `name`. + /// group `name`, and writes them to the `dst` buffer given. /// /// `name` may be an integer corresponding to the index of the /// capture group (counted by order of opening parenthesis where `0` is the /// entire match) or it can be a name (consisting of letters, digits or /// underscores) corresponding to a named capture group. /// - /// If `name` isn't a valid capture group (whether the name doesn't exist or - /// isn't a valid index), then it is replaced with the empty string. + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. 
e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. /// /// To write a literal `$` use `$$`. - pub fn expand(&self, text: &str) -> String { - const REPLACE_EXPAND: &'static str = r"(?x) - (?P^|\b|[^$]) # Ignore `$$name`. - \$ - (?P # Match the actual capture name. Can be... - [0-9]+ # A sequence of digits (for indexed captures), or... - | - [_a-zA-Z][_0-9a-zA-Z]* # A name for named captures. - ) - "; - // How evil can you get? - let re = Regex::new(REPLACE_EXPAND).unwrap(); - let text = re.replace_all(text, |refs: &Captures| -> String { - let before = refs.name("before").unwrap_or(""); - let name = refs.name("name").unwrap_or(""); - format!("{}{}", before, match name.parse::() { - Err(_) => self.name(name).unwrap_or("").to_owned(), - Ok(i) => self.at(i).unwrap_or("").to_owned(), - }) - }); - let re = Regex::new(r"\$\$").unwrap(); - re.replace_all(&text, NoExpand("$")) + pub fn expand(&self, replacement: &str, dst: &mut String) { + expand_str(self, replacement, dst) } /// Returns the number of captured groups. + /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. #[inline] pub fn len(&self) -> usize { - self.slots.len() / 2 - } - - /// Returns true if and only if there are no captured groups. 
- #[inline] - pub fn is_empty(&self) -> bool { - self.len() == 0 + self.locs.len() } } @@ -1014,7 +972,7 @@ impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { let slot_to_name: HashMap = self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); let mut map = f.debug_map(); - for (slot, m) in self.0.iter_pos().enumerate() { + for (slot, m) in self.0.locs.iter().enumerate() { let m = m.map(|(s, e)| &self.0.text[s..e]); if let Some(ref name) = slot_to_name.get(&slot) { map.entry(&name, &m); @@ -1041,7 +999,8 @@ impl<'t> Index for Captures<'t> { type Output = str; fn index(&self, i: usize) -> &str { - self.at(i).unwrap_or_else(|| panic!("no group at index '{}'", i)) + self.get(i).map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) } } @@ -1061,74 +1020,8 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { type Output = str; fn index<'a>(&'a self, name: &'i str) -> &'a str { - self.name(name).unwrap_or_else(|| panic!("no group named '{}'", name)) - } -} - -/// An iterator over capture groups for a particular match of a regular -/// expression. -/// -/// `'c` is the lifetime of the captures. -pub struct SubCaptures<'c> { - idx: usize, - caps: &'c Captures<'c>, -} - -impl<'c> Iterator for SubCaptures<'c> { - type Item = Option<&'c str>; - - fn next(&mut self) -> Option> { - if self.idx < self.caps.len() { - self.idx += 1; - Some(self.caps.at(self.idx - 1)) - } else { - None - } - } -} - -/// An iterator over capture group positions for a particular match of a -/// regular expression. -/// -/// Positions are byte indices in terms of the original string matched. -/// -/// `'c` is the lifetime of the captures. 
-pub struct SubCapturesPos<'c> { - idx: usize, - slots: &'c [Option] -} - -impl<'c> Iterator for SubCapturesPos<'c> { - type Item = Option<(usize, usize)>; - - fn next(&mut self) -> Option> { - if self.idx >= self.slots.len() { - return None - } - let r = match (self.slots[self.idx], self.slots[self.idx + 1]) { - (Some(s), Some(e)) => Some((s, e)), - (None, None) => None, - _ => unreachable!() - }; - self.idx += 2; - Some(r) - } -} - -/// An Iterator over named capture groups as a tuple with the group -/// name and the value. -/// -/// `'c` is the lifetime of the captures. -pub struct SubCapturesNamed<'c> { - caps: &'c Captures<'c>, - names: NamedGroupsIter<'c>, -} - -impl<'c> Iterator for SubCapturesNamed<'c> { - type Item = (&'c str, Option<&'c str>); - - fn next(&mut self) -> Option<(&'c str, Option<&'c str>)> { - self.names.next().map(|(name, pos)| (name, self.caps.at(pos))) + self.name(name).map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) } } @@ -1139,30 +1032,30 @@ impl<'c> Iterator for SubCapturesNamed<'c> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string. 
-pub struct FindCaptures<'r, 't>(FindCapturesInner<'r, 't>); +pub struct CaptureMatches<'r, 't>(CaptureMatchesInner<'r, 't>); -enum FindCapturesInner<'r, 't> { - Dynamic(re_trait::FindCaptures<'t, ExecNoSyncStr<'r>>), - Plugin(re_trait::FindCaptures<'t, Plugin>), +enum CaptureMatchesInner<'r, 't> { + Dynamic(re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>), + Plugin(re_trait::CaptureMatches<'t, Plugin>), } -impl<'r, 't> Iterator for FindCaptures<'r, 't> { +impl<'r, 't> Iterator for CaptureMatches<'r, 't> { type Item = Captures<'t>; fn next(&mut self) -> Option> { match self.0 { - FindCapturesInner::Dynamic(ref mut it) => { + CaptureMatchesInner::Dynamic(ref mut it) => { let named = it.regex().capture_name_idx().clone(); - it.next().map(|slots| Captures { + it.next().map(|locs| Captures { text: it.text(), - slots: slots, + locs: locs, named_groups: NamedGroups::Dynamic(named), }) } - FindCapturesInner::Plugin(ref mut it) => { - it.next().map(|slots| Captures { + CaptureMatchesInner::Plugin(ref mut it) => { + it.next().map(|locs| Captures { text: it.text(), - slots: slots, + locs: locs, named_groups: NamedGroups::Plugin(it.regex().groups), }) } @@ -1172,35 +1065,105 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { /// An iterator over all non-overlapping matches for a particular string. /// -/// The iterator yields a tuple of integers corresponding to the start and end -/// of the match. The indices are byte offsets. The iterator stops when no more +/// The iterator yields a `Match` value. The iterator stops when no more /// matches can be found. /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string. 
-pub struct FindMatches<'r, 't>(FindMatchesInner<'r, 't>); +pub struct Matches<'r, 't>(MatchesInner<'r, 't>); -enum FindMatchesInner<'r, 't> { - Dynamic(re_trait::FindMatches<'t, ExecNoSyncStr<'r>>), - Plugin(re_trait::FindMatches<'t, Plugin>), +enum MatchesInner<'r, 't> { + Dynamic(re_trait::Matches<'t, ExecNoSyncStr<'r>>), + Plugin(re_trait::Matches<'t, Plugin>), } -impl<'r, 't> FindMatches<'r, 't> { +impl<'r, 't> Matches<'r, 't> { fn text(&self) -> &'t str { match self.0 { - FindMatchesInner::Dynamic(ref it) => it.text(), - FindMatchesInner::Plugin(ref it) => it.text(), + MatchesInner::Dynamic(ref it) => it.text(), + MatchesInner::Plugin(ref it) => it.text(), } } } -impl<'r, 't> Iterator for FindMatches<'r, 't> { - type Item = (usize, usize); +impl<'r, 't> Iterator for Matches<'r, 't> { + type Item = Match<'t>; - fn next(&mut self) -> Option<(usize, usize)> { + fn next(&mut self) -> Option> { + let text = self.text(); match self.0 { - FindMatchesInner::Dynamic(ref mut it) => it.next(), - FindMatchesInner::Plugin(ref mut it) => it.next(), + MatchesInner::Dynamic(ref mut it) => { + it.next().map(|(s, e)| Match::new(text, s, e)) + } + MatchesInner::Plugin(ref mut it) => { + it.next().map(|(s, e)| Match::new(text, s, e)) + } } } } + +/// Replacer describes types that can be used to replace matches in a string. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&str` and +/// `FnMut(&Captures) -> String`, which covers most use cases. +pub trait Replacer { + /// Appends text to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to + /// have a match at capture group `0`. + /// + /// For example, a no-op replacement would be + /// `dst.push_str(caps.get(0).unwrap().as_str())`. + fn replace_append(&mut self, caps: &Captures, dst: &mut String); + + /// Return a fixed unchanging replacement string.
+ /// + /// When doing replacements, if access to `Captures` is not needed (e.g., + /// the replacement string does not need `$` expansion), then it can + /// be beneficial to avoid finding sub-captures. + /// + /// In general, this is called once for every call to `replacen`. + fn no_expansion<'r>(&'r mut self) -> Option> { + None + } +} + +impl<'a> Replacer for &'a str { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { + caps.expand(*self, dst); + } + + fn no_expansion<'r>(&'r mut self) -> Option> { + match memchr(b'$', self.as_bytes()) { + Some(_) => None, + None => Some(Cow::Borrowed(*self)), + } + } +} + +impl Replacer for F where F: FnMut(&Captures) -> String { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { + dst.push_str(&(*self)(caps)); + } +} + +/// NoExpand indicates literal string replacement. +/// +/// It can be used with `replace` and `replace_all` to do a literal string +/// replacement without expanding `$name` to their corresponding capture +/// groups. This can be both convenient (to avoid escaping `$`, for example) +/// and performant (since capture groups don't need to be found). +/// +/// `'r` is the lifetime of the literal text.
+pub struct NoExpand<'r>(pub &'r str); + +impl<'a> Replacer for NoExpand<'a> { + fn replace_append(&mut self, _: &Captures, dst: &mut String) { + dst.push_str(self.0); + } + + fn no_expansion<'r>(&'r mut self) -> Option> { + Some(Cow::Borrowed(self.0)) + } +} diff --git a/tests/api.rs b/tests/api.rs index 0be032949a..7221995b0e 100644 --- a/tests/api.rs +++ b/tests/api.rs @@ -60,7 +60,8 @@ fn empty_match_find_iter() { fn empty_match_captures_iter() { let re = regex!(r".*?"); let ms: Vec<_> = re.captures_iter(text!("abc")) - .map(|c| c.pos(0).unwrap()) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) .collect(); assert_eq!(ms, vec![(0, 0), (1, 1), (2, 2), (3, 3)]); } @@ -127,96 +128,16 @@ fn capture_misc() { assert_eq!(5, cap.len()); - assert_eq!(Some((0, 3)), cap.pos(0)); - assert_eq!(None, cap.pos(2)); - assert_eq!(Some((2, 3)), cap.pos(4)); + assert_eq!((0, 3), { let m = cap.get(0).unwrap(); (m.start(), m.end()) }); + assert_eq!(None, cap.get(2)); + assert_eq!((2, 3), { let m = cap.get(4).unwrap(); (m.start(), m.end()) }); - assert_eq!(Some(t!("abc")), cap.at(0)); - assert_eq!(None, cap.at(2)); - assert_eq!(Some(t!("c")), cap.at(4)); + assert_eq!(t!("abc"), match_text!(cap.get(0).unwrap())); + assert_eq!(None, cap.get(2)); + assert_eq!(t!("c"), match_text!(cap.get(4).unwrap())); assert_eq!(None, cap.name("a")); - assert_eq!(Some(t!("c")), cap.name("b")); -} - -#[test] -fn capture_iter() { - let re = regex!(r"(.)(?P.)(.)(?P.)"); - let cap = re.captures(t!("abcd")).unwrap(); - assert_eq!(5, cap.len()); - - let expected = vec![ - t!("abcd"), t!("a"), t!("b"), t!("c"), t!("d"), - ].into_iter().map(Some).collect::>(); - let got = cap.iter().collect::>(); - assert_eq!(expected, got); -} - -#[test] -fn capture_iter_missing() { - let re = regex!(r"(.)(?Pa)?(.)(?P.)"); - let cap = re.captures(t!("abc")).unwrap(); - assert_eq!(5, cap.len()); - - let expected = vec![ - Some(t!("abc")), Some(t!("a")), None, Some(t!("b")), Some(t!("c")), - ]; - let got = 
cap.iter().collect::>(); - assert_eq!(expected, got); -} - -#[test] -fn capture_iter_pos() { - let re = regex!(r"(.)(?P.)(.)(?P.)"); - let cap = re.captures(t!("abcd")).unwrap(); - - let expected = vec![ - (0, 4), (0, 1), (1, 2), (2, 3), (3, 4), - ].into_iter().map(Some).collect::>(); - let got = cap.iter_pos().collect::>(); - assert_eq!(expected, got); -} - -#[test] -fn capture_iter_pos_missing() { - let re = regex!(r"(.)(?Pa)?(.)(?P.)"); - let cap = re.captures(t!("abc")).unwrap(); - - let expected = vec![ - Some((0, 3)), Some((0, 1)), None, Some((1, 2)), Some((2, 3)), - ]; - let got = cap.iter_pos().collect::>(); - assert_eq!(expected, got); -} - -#[test] -fn capture_iter_named() { - let re = regex!(r"(.)(?P.)(.)(?P.)"); - let cap = re.captures(t!("abcd")).unwrap(); - - let expected1 = vec![ - ("a", Some(t!("b"))), ("b", Some(t!("d"))), - ]; - let expected2 = vec![ - ("b", Some(t!("d"))), ("a", Some(t!("b"))), - ]; - let got = cap.iter_named().collect::>(); - assert!(got == expected1 || got == expected2); -} - -#[test] -fn capture_iter_named_missing() { - let re = regex!(r"(.)(?P.)?(.)(?P.)"); - let cap = re.captures(t!("abc")).unwrap(); - - let expected1 = vec![ - ("a", None), ("b", Some(t!("c"))), - ]; - let expected2 = vec![ - ("b", Some(t!("c"))), ("a", None), - ]; - let got = cap.iter_named().collect::>(); - assert!(got == expected1 || got == expected2); + assert_eq!(t!("c"), match_text!(cap.name("b").unwrap())); } expand!(expand1, r"(?P\w+)", "abc", "$foo", "abc"); diff --git a/tests/api_str.rs b/tests/api_str.rs index 266b6455b2..5bdca8426a 100644 --- a/tests/api_str.rs +++ b/tests/api_str.rs @@ -5,7 +5,7 @@ fn empty_match_unicode_find_iter() { // Tests that we still yield byte ranges at valid UTF-8 sequence boundaries // even when we're susceptible to empty width matches. 
- let re = regex!(u!(r".*?")); + let re = regex!(r".*?"); assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], findall!(re, "Ⅰ1Ⅱ2")); } @@ -13,15 +13,10 @@ fn empty_match_unicode_find_iter() { #[test] fn empty_match_unicode_captures_iter() { // Same as empty_match_unicode_find_iter, but tests capture iteration. - let re = regex!(u!(r".*?")); + let re = regex!(r".*?"); let ms: Vec<_> = re.captures_iter(text!("Ⅰ1Ⅱ2")) - .map(|c| c.pos(0).unwrap()) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) .collect(); assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], ms); } - -#[test] -fn eq() { - use regex::Regex; - assert_eq!(regex!(r"[a-z]+"), Regex::new("[a-z]+").unwrap()); -} diff --git a/tests/bytes.rs b/tests/bytes.rs index a290630d8d..e7748e91c9 100644 --- a/tests/bytes.rs +++ b/tests/bytes.rs @@ -5,36 +5,37 @@ struct R<'a>(&'a [u8]); impl<'a> R<'a> { fn as_bytes(&self) -> &'a [u8] { &self.0 } } -mat!(word_boundary, r" \b", " δ", None); -mat!(word_boundary_unicode, r"(?u) \b", " δ", Some((0, 1))); -mat!(word_not_boundary, r" \B", " δ", Some((0, 1))); -mat!(word_not_boundary_unicode, r"(?u) \B", " δ", None); - -mat!(perl_w_ascii, r"\w+", "aδ", Some((0, 1))); -mat!(perl_w_unicode, r"(?u)\w+", "aδ", Some((0, 3))); -mat!(perl_d_ascii, r"\d+", "1२३9", Some((0, 1))); -mat!(perl_d_unicode, r"(?u)\d+", "1२३9", Some((0, 8))); -mat!(perl_s_ascii, r"\s+", " \u{1680}", Some((0, 1))); -mat!(perl_s_unicode, r"(?u)\s+", " \u{1680}", Some((0, 4))); +mat!(word_boundary, r"(?-u) \b", " δ", None); +mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1))); +mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1))); +mat!(word_not_boundary_unicode, r" \B", " δ", None); + +mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1))); +mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3))); +mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1))); +mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8))); +mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1))); +mat!(perl_s_unicode, 
r"\s+", " \u{1680}", Some((0, 4))); // The first `(.+)` matches two Unicode codepoints, but can't match the 5th // byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and // matches. -mat!(mixed1, r"(?u)(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"), +mat!(mixed1, r"(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"), Some((0, 5)), Some((0, 4)), Some((4, 5))); -mat!(case_ascii_one, r"(?i)a", "A", Some((0, 1))); -mat!(case_ascii_class, r"(?i)[a-z]+", "AaAaA", Some((0, 5))); -mat!(case_unicode, r"(?iu)[a-z]+", "aA\u{212A}aA", Some((0, 7))); -mat!(case_not_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 2))); +mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1))); +mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5))); +mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7))); +mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2))); -mat!(negate_unicode, r"(?u)[^a]", "δ", Some((0, 2))); -mat!(negate_not_unicode, r"[^a]", "δ", Some((0, 1))); +mat!(negate_unicode, r"[^a]", "δ", Some((0, 2))); +mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1))); // This doesn't match in a normal Unicode regex because the implicit preceding // `.*?` is Unicode aware. -mat!(dotstar_prefix_not_unicode, r"a", R(b"\xFFa"), Some((1, 2))); +mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2))); +mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2))); // Have fun with null bytes. -mat!(null_bytes, r"(?P[^\x00]+)\x00", +mat!(null_bytes, r"(?-u)(?P[^\x00]+)\x00", R(b"foo\x00"), Some((0, 4)), Some((0, 3))); diff --git a/tests/crazy.rs b/tests/crazy.rs index bed66277e5..ade839ade1 100644 --- a/tests/crazy.rs +++ b/tests/crazy.rs @@ -1,4 +1,4 @@ -mat!(ascii_literal, u!(r"a"), "a", Some((0, 1))); +mat!(ascii_literal, r"a", "a", Some((0, 1))); // Some crazy expressions from regular-expressions.info. 
mat!(match_ranges, diff --git a/tests/macros.rs b/tests/macros.rs index f9e8912630..5badc89b53 100644 --- a/tests/macros.rs +++ b/tests/macros.rs @@ -2,7 +2,8 @@ macro_rules! findall { ($re:expr, $text:expr) => {{ - $re.find_iter(text!($text)).collect::>() + $re.find_iter(text!($text)) + .map(|m| (m.start(), m.end())).collect::>() }} } @@ -19,7 +20,10 @@ macro_rules! mat( Some(c) => { assert!(r.is_match(text)); assert!(r.shortest_match(text).is_some()); - c.iter_pos().collect() + r.capture_names() + .enumerate() + .map(|(i, _)| c.get(i).map(|m| (m.start(), m.end()))) + .collect() } None => vec![None], }; @@ -45,14 +49,18 @@ macro_rules! matiter( let text = text!($text); let expected: Vec<(usize, usize)> = vec![]; let r = regex!($re); - let got: Vec<_> = r.find_iter(text).collect(); + let got: Vec<_> = + r.find_iter(text).map(|m| (m.start(), m.end())).collect(); if expected != got { panic!("For RE '{}' against '{:?}', \ expected '{:?}' but got '{:?}'", $re, text, expected, got); } let captures_got: Vec<_> = - r.captures_iter(text).map(|c| c.pos(0).unwrap()).collect(); + r.captures_iter(text) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) + .collect(); if captures_got != got { panic!("For RE '{}' against '{:?}', \ got '{:?}' using find_iter but got '{:?}' \ @@ -67,14 +75,18 @@ macro_rules! 
matiter( let text = text!($text); let expected: Vec<_> = vec![$($loc)+]; let r = regex!($re); - let got: Vec<_> = r.find_iter(text).collect(); + let got: Vec<_> = + r.find_iter(text).map(|m| (m.start(), m.end())).collect(); if expected != got { panic!("For RE '{}' against '{:?}', \ expected '{:?}' but got '{:?}'", $re, text, expected, got); } let captures_got: Vec<_> = - r.captures_iter(text).map(|c| c.pos(0).unwrap()).collect(); + r.captures_iter(text) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) + .collect(); if captures_got != got { panic!("For RE '{}' against '{:?}', \ got '{:?}' using find_iter but got '{:?}' \ diff --git a/tests/macros_bytes.rs b/tests/macros_bytes.rs index a68fada744..4a382c78dd 100644 --- a/tests/macros_bytes.rs +++ b/tests/macros_bytes.rs @@ -1,11 +1,12 @@ // Macros for use in writing tests generic over &str/&[u8]. macro_rules! text { ($text:expr) => { $text.as_bytes() } } macro_rules! t { ($re:expr) => { text!($re) } } +macro_rules! match_text { ($text:expr) => { $text.as_bytes() } } macro_rules! bytes { ($text:expr) => { $text } } macro_rules! b { ($text:expr) => { bytes!($text) } } -macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } } +// macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } } macro_rules! no_expand { ($text:expr) => {{ @@ -25,9 +26,6 @@ macro_rules! show { }} } -// N.B. The expansion API for &str and &[u8] APIs differs slightly for now, -// but they should be unified in 1.0. Then we can move this macro back into -// tests/api.rs where it is used. ---AG macro_rules! expand { ($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => { #[test] diff --git a/tests/macros_str.rs b/tests/macros_str.rs index 7ea29335de..e5b0e219da 100644 --- a/tests/macros_str.rs +++ b/tests/macros_str.rs @@ -1,11 +1,12 @@ // Macros for use in writing tests generic over &str/&[u8]. macro_rules! text { ($text:expr) => { $text } } macro_rules! t { ($text:expr) => { text!($text) } } +macro_rules! 
match_text { ($text:expr) => { $text.as_str() } } macro_rules! bytes { ($text:expr) => { $text.as_bytes() } } macro_rules! b { ($text:expr) => { bytes!($text) } } -macro_rules! u { ($re:expr) => { $re } } +// macro_rules! u { ($re:expr) => { $re } } macro_rules! no_expand { ($text:expr) => {{ @@ -26,8 +27,14 @@ macro_rules! expand { let re = regex!($re); let cap = re.captures(t!($text)).unwrap(); - let got = cap.expand(t!($expand)); + let mut got = String::new(); + cap.expand(t!($expand), &mut got); assert_eq!(show!(t!($expected)), show!(&*got)); } } } + +#[cfg(feature = "pattern")] +macro_rules! searcher_expr { ($e:expr) => ($e) } +#[cfg(not(feature = "pattern"))] +macro_rules! searcher_expr { ($e:expr) => ({}) } diff --git a/tests/misc.rs b/tests/misc.rs index 293cddb322..dfe28c9707 100644 --- a/tests/misc.rs +++ b/tests/misc.rs @@ -8,14 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use regex::Regex; - mat!(prefix_literal_match, r"^abc", r"abc", Some((0, 3))); mat!(prefix_literal_nomatch, r"^abc", r"zabc", None); mat!(one_literal_edge, r"abc", r"xxxxxab", None); matiter!(terminates, r"a$", r"a", (0, 1)); - -#[test] -fn eq() { - assert_eq!(regex!(r"[a-z]+"), Regex::new("[a-z]+").unwrap()); -} diff --git a/tests/regression.rs b/tests/regression.rs index 3b7a1fe917..108cdb9565 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -30,7 +30,7 @@ mat!(regression_ascii_word_underscore, r"[:word:]", "_", Some((0, 1))); fn regression_captures_rep() { let re = regex!(r"([a-f]){2}(?P[x-z])"); let caps = re.captures(text!("abx")).unwrap(); - assert_eq!(caps.name("foo").unwrap(), text!("x")); + assert_eq!(match_text!(caps.name("foo").unwrap()), text!("x")); } // See: https://github.com/rust-lang-nursery/regex/issues/153 @@ -41,7 +41,7 @@ mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3))); mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3))); // See: 
https://github.com/rust-lang/regex/issues/76 -mat!(uni_case_lower_nocase_flag, u!(r"(?i)\p{Ll}+"), "ΛΘΓΔα", Some((0, 10))); +mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10))); // See: https://github.com/rust-lang-nursery/regex/issues/191 mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", Some((0, 3))); @@ -59,8 +59,8 @@ matiter!(word_boundary_dfa, r"\b", "a b c", (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); // See: https://github.com/rust-lang-nursery/regex/issues/268 -matiter!(partial_anchor, u!(r"^a|b"), "ba", (0, 1)); +matiter!(partial_anchor, r"^a|b", "ba", (0, 1)); // See: https://github.com/rust-lang-nursery/regex/issues/264 -mat!(ascii_boundary_no_capture, u!(r"(?-u)\B"), "\u{28f3e}", Some((0, 0))); -mat!(ascii_boundary_capture, u!(r"(?-u)(\B)"), "\u{28f3e}", Some((0, 0))); +mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0))); +mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0))); diff --git a/tests/test_backtrack.rs b/tests/test_backtrack.rs index b6726167ff..5516c840e7 100644 --- a/tests/test_backtrack.rs +++ b/tests/test_backtrack.rs @@ -56,6 +56,7 @@ mod multiline; mod noparse; mod regression; mod replace; +mod searcher; mod set; mod suffix_reverse; mod unicode; diff --git a/tests/test_backtrack_bytes.rs b/tests/test_backtrack_bytes.rs index 57074f1870..4ea60e7d0f 100644 --- a/tests/test_backtrack_bytes.rs +++ b/tests/test_backtrack_bytes.rs @@ -16,7 +16,6 @@ macro_rules! regex_new { use regex::internal::ExecBuilder; ExecBuilder::new($re) .bounded_backtracking() - .unicode(false) .only_utf8(false) .build() .map(|e| e.into_byte_regex()) @@ -34,7 +33,6 @@ macro_rules! 
regex_set_new { use regex::internal::ExecBuilder; ExecBuilder::new_many($re) .bounded_backtracking() - .unicode(false) .only_utf8(false) .build() .map(|e| e.into_byte_regex_set()) diff --git a/tests/test_backtrack_utf8bytes.rs b/tests/test_backtrack_utf8bytes.rs index dd0ebbd7f5..a170d19324 100644 --- a/tests/test_backtrack_utf8bytes.rs +++ b/tests/test_backtrack_utf8bytes.rs @@ -57,6 +57,7 @@ mod multiline; mod noparse; mod regression; mod replace; +mod searcher; mod set; mod suffix_reverse; mod unicode; diff --git a/tests/test_default.rs b/tests/test_default.rs index e873cb0640..e6cf92fa2e 100644 --- a/tests/test_default.rs +++ b/tests/test_default.rs @@ -46,11 +46,6 @@ macro_rules! regex_set { } } -#[cfg(feature = "pattern")] -macro_rules! searcher_expr { ($e:expr) => ($e) } -#[cfg(not(feature = "pattern"))] -macro_rules! searcher_expr { ($e:expr) => ({}) } - // Must come before other module definitions. include!("macros_str.rs"); include!("macros.rs"); diff --git a/tests/test_nfa.rs b/tests/test_nfa.rs index 12cb1606f3..8a831c47d3 100644 --- a/tests/test_nfa.rs +++ b/tests/test_nfa.rs @@ -52,6 +52,7 @@ mod multiline; mod noparse; mod regression; mod replace; +mod searcher; mod set; mod suffix_reverse; mod unicode; diff --git a/tests/test_nfa_bytes.rs b/tests/test_nfa_bytes.rs index 83eea01a2d..f376cefe1f 100644 --- a/tests/test_nfa_bytes.rs +++ b/tests/test_nfa_bytes.rs @@ -1,4 +1,3 @@ - // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. @@ -17,7 +16,6 @@ macro_rules! regex_new { use regex::internal::ExecBuilder; ExecBuilder::new($re) .nfa() - .unicode(false) .only_utf8(false) .build() .map(|e| e.into_byte_regex()) @@ -35,7 +33,6 @@ macro_rules! 
regex_set_new { use regex::internal::ExecBuilder; ExecBuilder::new_many($re) .nfa() - .unicode(false) .only_utf8(false) .build() .map(|e| e.into_byte_regex_set()) diff --git a/tests/test_nfa_utf8bytes.rs b/tests/test_nfa_utf8bytes.rs index e6dd1907e6..5d13685aab 100644 --- a/tests/test_nfa_utf8bytes.rs +++ b/tests/test_nfa_utf8bytes.rs @@ -53,6 +53,7 @@ mod multiline; mod noparse; mod regression; mod replace; +mod searcher; mod set; mod suffix_reverse; mod unicode; diff --git a/tests/unicode.rs b/tests/unicode.rs index 5357a18c96..48e9a95aaf 100644 --- a/tests/unicode.rs +++ b/tests/unicode.rs @@ -1,31 +1,31 @@ -mat!(uni_literal, u!(r"☃"), "☃", Some((0, 3))); -mat!(uni_literal_plus, u!(r"☃+"), "☃", Some((0, 3))); -mat!(uni_literal_casei_plus, u!(r"(?i)☃+"), "☃", Some((0, 3))); -mat!(uni_class_plus, u!(r"[☃Ⅰ]+"), "☃", Some((0, 3))); -mat!(uni_one, u!(r"\pN"), "Ⅰ", Some((0, 3))); -mat!(uni_mixed, u!(r"\pN+"), "Ⅰ1Ⅱ2", Some((0, 8))); -mat!(uni_not, u!(r"\PN+"), "abⅠ", Some((0, 2))); -mat!(uni_not_class, u!(r"[\PN]+"), "abⅠ", Some((0, 2))); -mat!(uni_not_class_neg, u!(r"[^\PN]+"), "abⅠ", Some((2, 5))); -mat!(uni_case, u!(r"(?i)Δ"), "δ", Some((0, 2))); -mat!(uni_case_upper, u!(r"\p{Lu}+"), "ΛΘΓΔα", Some((0, 8))); -mat!(uni_case_upper_nocase_flag, u!(r"(?i)\p{Lu}+"), "ΛΘΓΔα", Some((0, 10))); -mat!(uni_case_upper_nocase, u!(r"\p{L}+"), "ΛΘΓΔα", Some((0, 10))); -mat!(uni_case_lower, u!(r"\p{Ll}+"), "ΛΘΓΔα", Some((8, 10))); +mat!(uni_literal, r"☃", "☃", Some((0, 3))); +mat!(uni_literal_plus, r"☃+", "☃", Some((0, 3))); +mat!(uni_literal_casei_plus, r"(?i)☃+", "☃", Some((0, 3))); +mat!(uni_class_plus, r"[☃Ⅰ]+", "☃", Some((0, 3))); +mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3))); +mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8))); +mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2))); +mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2))); +mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5))); +mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2))); +mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 
8))); +mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10))); +mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10))); +mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10))); // Test the Unicode friendliness of Perl character classes. -mat!(uni_perl_w, u!(r"\w+"), "dδd", Some((0, 4))); -mat!(uni_perl_w_not, u!(r"\w+"), "⥡", None); -mat!(uni_perl_w_neg, u!(r"\W+"), "⥡", Some((0, 3))); -mat!(uni_perl_d, u!(r"\d+"), "1२३9", Some((0, 8))); -mat!(uni_perl_d_not, u!(r"\d+"), "Ⅱ", None); -mat!(uni_perl_d_neg, u!(r"\D+"), "Ⅱ", Some((0, 3))); -mat!(uni_perl_s, u!(r"\s+"), " ", Some((0, 3))); -mat!(uni_perl_s_not, u!(r"\s+"), "☃", None); -mat!(uni_perl_s_neg, u!(r"\S+"), "☃", Some((0, 3))); +mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4))); +mat!(uni_perl_w_not, r"\w+", "⥡", None); +mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3))); +mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8))); +mat!(uni_perl_d_not, r"\d+", "Ⅱ", None); +mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3))); +mat!(uni_perl_s, r"\s+", " ", Some((0, 3))); +mat!(uni_perl_s_not, r"\s+", "☃", None); +mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3))); // And do the same for word boundaries. -mat!(uni_boundary_none, u!(r"\d\b"), "6δ", None); -mat!(uni_boundary_ogham, u!(r"\d\b"), "6 ", Some((0, 1))); -mat!(uni_not_boundary_none, u!(r"\d\B"), "6δ", Some((0, 1))); -mat!(uni_not_boundary_ogham, u!(r"\d\B"), "6 ", None); +mat!(uni_boundary_none, r"\d\b", "6δ", None); +mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1))); +mat!(uni_not_boundary_none, r"\d\B", "6δ", Some((0, 1))); +mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None); diff --git a/tests/word_boundary_ascii.rs b/tests/word_boundary_ascii.rs index 9beb7c0cb1..5a3cf1166c 100644 --- a/tests/word_boundary_ascii.rs +++ b/tests/word_boundary_ascii.rs @@ -1,9 +1,9 @@ // ASCII word boundaries are completely oblivious to Unicode characters. // For Unicode word boundaries, the tests are precisely inverted. 
-matiter!(ascii1, r"\bx\b", "áxβ", (2, 3)); -matiter!(ascii2, r"\Bx\B", "áxβ"); -matiter!(ascii3, r"\B", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5)); +matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3)); +matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ"); +matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5)); -// We can still get Unicode mode in byte regexes. -matiter!(unicode1, r"(?u:\b)x(?u:\b)", "áxβ"); -matiter!(unicode2, r"(?u:\B)x(?u:\B)", "áxβ", (2, 3)); +// We still get Unicode word boundaries by default in byte regexes. +matiter!(unicode1, r"\bx\b", "áxβ"); +matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));