Skip to content

Commit ebd26e9

Browse files
committed
Update Replacer trait for Unicode regexes.
This uses the new Replacer trait essentially as defined in the `bytes` sub-module and described in #151. Fixes #151
1 parent d44a9f9 commit ebd26e9

File tree

5 files changed

+248
-100
lines changed

5 files changed

+248
-100
lines changed

src/expand.rs

Lines changed: 127 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,50 @@ use std::str;
22

33
use memchr::memchr;
44

5-
use bytes::Captures;
5+
use re_bytes;
6+
use re_unicode;
67

7-
pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) {
8+
pub fn expand_str(
9+
caps: &re_unicode::Captures,
10+
mut replacement: &str,
11+
dst: &mut String,
12+
) {
13+
while !replacement.is_empty() {
14+
match memchr(b'$', replacement.as_bytes()) {
15+
None => break,
16+
Some(i) => {
17+
dst.push_str(&replacement[..i]);
18+
replacement = &replacement[i..];
19+
}
20+
}
21+
if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
22+
dst.push_str("$");
23+
replacement = &replacement[2..];
24+
continue;
25+
}
26+
debug_assert!(!replacement.is_empty());
27+
let cap_ref = match find_cap_ref(replacement) {
28+
Some(cap_ref) => cap_ref,
29+
None => {
30+
dst.push_str("$");
31+
replacement = &replacement[1..];
32+
continue;
33+
}
34+
};
35+
replacement = &replacement[cap_ref.end..];
36+
match cap_ref.cap {
37+
Ref::Number(i) => dst.push_str(caps.at(i).unwrap_or("")),
38+
Ref::Named(name) => dst.push_str(caps.name(name).unwrap_or("")),
39+
}
40+
}
41+
dst.push_str(replacement);
42+
}
43+
44+
pub fn expand_bytes(
45+
caps: &re_bytes::Captures,
46+
mut replacement: &[u8],
47+
dst: &mut Vec<u8>,
48+
) {
849
while !replacement.is_empty() {
950
match memchr(b'$', replacement) {
1051
None => break,
@@ -27,7 +68,7 @@ pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) {
2768
continue;
2869
}
2970
};
30-
replacement = cap_ref.rest;
71+
replacement = &replacement[cap_ref.end..];
3172
match cap_ref.cap {
3273
Ref::Number(i) => dst.extend(caps.at(i).unwrap_or(b"")),
3374
Ref::Named(name) => dst.extend(caps.name(name).unwrap_or(b"")),
@@ -36,56 +77,127 @@ pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) {
3677
dst.extend(replacement);
3778
}
3879

80+
/// CaptureRef represents a reference to a capture group inside some text. The
81+
/// reference is either a capture group name or a number.
82+
///
83+
/// It is also tagged with the position in the text immediately following the
84+
/// capture reference.
85+
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
3986
struct CaptureRef<'a> {
40-
rest: &'a [u8],
4187
cap: Ref<'a>,
88+
end: usize,
4289
}
4390

91+
/// A reference to a capture group in some text.
92+
///
93+
/// e.g., `$2`, `$foo`, `${foo}`.
94+
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
4495
enum Ref<'a> {
4596
Named(&'a str),
4697
Number(usize),
4798
}
4899

49-
fn find_cap_ref(mut replacement: &[u8]) -> Option<CaptureRef> {
50-
if replacement.len() <= 1 || replacement[0] != b'$' {
100+
impl<'a> From<&'a str> for Ref<'a> {
101+
fn from(x: &'a str) -> Ref<'a> {
102+
Ref::Named(x)
103+
}
104+
}
105+
106+
impl From<usize> for Ref<'static> {
107+
fn from(x: usize) -> Ref<'static> {
108+
Ref::Number(x)
109+
}
110+
}
111+
112+
/// Parses a possible reference to a capture group name in the given text,
113+
/// starting at the beginning of `replacement`.
114+
///
115+
/// If no such valid reference could be found, None is returned.
116+
fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
117+
replacement: &T,
118+
) -> Option<CaptureRef> {
119+
let mut i = 0;
120+
let rep: &[u8] = replacement.as_ref();
121+
if rep.len() <= 1 || rep[0] != b'$' {
51122
return None;
52123
}
53124
let mut brace = false;
54-
replacement = &replacement[1..];
55-
if replacement[0] == b'{' {
125+
i += 1;
126+
if rep[i] == b'{' {
56127
brace = true;
57-
replacement = &replacement[1..];
128+
i += 1;
58129
}
59-
let mut cap_end = 0;
60-
while replacement.get(cap_end).map_or(false, is_valid_cap_letter) {
130+
let mut cap_end = i;
131+
while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
61132
cap_end += 1;
62133
}
63-
if cap_end == 0 {
134+
if cap_end == i {
64135
return None;
65136
}
66137
// We just verified that the range i..cap_end is valid ASCII, so it must
67138
// therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
68139
// check with either unsafe or by parsing the number straight from &[u8].
69-
let cap = str::from_utf8(&replacement[..cap_end])
140+
let cap = str::from_utf8(&rep[i..cap_end])
70141
.ok().expect("valid UTF-8 capture name");
71142
if brace {
72-
if !replacement.get(cap_end).map_or(false, |&b| b == b'}') {
143+
if !rep.get(cap_end).map_or(false, |&b| b == b'}') {
73144
return None;
74145
}
75146
cap_end += 1;
76147
}
77148
Some(CaptureRef {
78-
rest: &replacement[cap_end..],
79149
cap: match cap.parse::<u32>() {
80150
Ok(i) => Ref::Number(i as usize),
81151
Err(_) => Ref::Named(cap),
82152
},
153+
end: cap_end,
83154
})
84155
}
85156

157+
/// Returns true if and only if the given byte is allowed in a capture name.
86158
fn is_valid_cap_letter(b: &u8) -> bool {
87159
match *b {
88160
b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' | b'_' => true,
89161
_ => false,
90162
}
91163
}
164+
165+
#[cfg(test)]
166+
mod tests {
167+
use super::{CaptureRef, find_cap_ref};
168+
169+
macro_rules! find {
170+
($name:ident, $text:expr) => {
171+
#[test]
172+
fn $name() {
173+
assert_eq!(None, find_cap_ref($text));
174+
}
175+
};
176+
($name:ident, $text:expr, $capref:expr) => {
177+
#[test]
178+
fn $name() {
179+
assert_eq!(Some($capref), find_cap_ref($text));
180+
}
181+
};
182+
}
183+
184+
macro_rules! c {
185+
($name_or_number:expr, $pos:expr) => {
186+
CaptureRef { cap: $name_or_number.into(), end: $pos }
187+
};
188+
}
189+
190+
find!(find_cap_ref1, "$foo", c!("foo", 4));
191+
find!(find_cap_ref2, "${foo}", c!("foo", 6));
192+
find!(find_cap_ref3, "$0", c!(0, 2));
193+
find!(find_cap_ref4, "$5", c!(5, 2));
194+
find!(find_cap_ref5, "$10", c!(10, 3));
195+
find!(find_cap_ref6, "$42a", c!("42a", 4));
196+
find!(find_cap_ref7, "${42}a", c!(42, 5));
197+
find!(find_cap_ref8, "${42");
198+
find!(find_cap_ref9, "${42 ");
199+
find!(find_cap_ref10, " $0 ");
200+
find!(find_cap_ref11, "$");
201+
find!(find_cap_ref12, " ");
202+
find!(find_cap_ref13, "");
203+
}

src/re_bytes.rs

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use std::sync::Arc;
1919
use memchr::memchr;
2020

2121
use exec::{Exec, ExecNoSync};
22-
use expand::expand;
22+
use expand::expand_bytes;
2323
use error::Error;
2424
use re_builder::bytes::RegexBuilder;
2525
use re_trait::{self, RegularExpression, Slot};
@@ -375,6 +375,25 @@ impl Regex {
375375
/// If no match is found, then a copy of the byte string is returned
376376
/// unchanged.
377377
///
378+
/// # Replacement string syntax
379+
///
380+
/// All instances of `$name` in the replacement text are replaced with the
381+
/// corresponding capture group `name`.
382+
///
383+
/// `name` may be an integer corresponding to the index of the
384+
/// capture group (counted by order of opening parenthesis where `0` is the
385+
/// entire match) or it can be a name (consisting of letters, digits or
386+
/// underscores) corresponding to a named capture group.
387+
///
388+
/// If `name` isn't a valid capture group (whether the name doesn't exist
389+
/// or isn't a valid index), then it is replaced with the empty string.
390+
///
391+
/// The longest possible name is used. e.g., `$1a` looks up the capture
392+
/// group named `1a` and not the capture group at index `1`. To exert more
393+
/// precise control over the name, use braces, e.g., `${1}a`.
394+
///
395+
/// To write a literal `$` use `$$`.
396+
///
378397
/// # Examples
379398
///
380399
/// Note that this function is polymorphic with respect to the replacement.
@@ -768,7 +787,7 @@ impl<'t> Captures<'t> {
768787
///
769788
/// To write a literal `$` use `$$`.
770789
pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
771-
expand(self, replacement, dst)
790+
expand_bytes(self, replacement, dst)
772791
}
773792

774793
/// Returns the number of captured groups.

0 commit comments

Comments
 (0)