Skip to content

Commit e583a02

Browse files
committed
Switch bytes::Regex to using Unicode mode by default.
1 parent 05e4a02 commit e583a02

12 files changed

+73
-83
lines changed

src/lib.rs

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -244,16 +244,11 @@
244244
//! # Opt out of Unicode support
245245
//!
246246
//! The `bytes` sub-module provides a `Regex` type that can be used to match
247-
//! on `&[u8]`. By default, text is interpreted as ASCII compatible text with
248-
//! all Unicode support disabled (e.g., `.` matches any byte instead of any
249-
//! Unicode codepoint). Unicode support can be selectively enabled with the
250-
//! `u` flag. See the `bytes` module documentation for more details.
251-
//!
252-
//! Unicode support can also be selectively *disabled* with the main `Regex`
253-
//! type that matches on `&str`. For example, `(?-u:\b)` will match an ASCII
254-
//! word boundary. Note though that invalid UTF-8 is not allowed to be matched
255-
//! even when the `u` flag is disabled. For example, `(?-u:.)` will return an
256-
//! error, since `.` matches *any byte* when Unicode support is disabled.
247+
//! on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with
248+
//! the main `Regex` type. However, this behavior can be disabled by turning
249+
//! off the `u` flag, even if doing so could result in matching invalid UTF-8.
250+
//! For example, when the `u` flag is disabled, `.` will match any byte instead
251+
//! of any Unicode codepoint.
257252
//!
258253
//! # Syntax
259254
//!
@@ -480,19 +475,16 @@ top-level of this crate. There are two important differences:
480475
481476
1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>`
482477
is used where `String` would have been used.
483-
2. Regular expressions are compiled with Unicode support *disabled* by
484-
default. This means that while Unicode regular expressions can only match valid
485-
UTF-8, regular expressions in this module can match arbitrary bytes. Unicode
486-
support can be selectively enabled via the `u` flag in regular expressions
487-
provided by this sub-module.
478+
2. Unicode support can be disabled even when disabling it would result in
479+
matching invalid UTF-8 bytes.
488480
489481
# Example: match null terminated string
490482
491483
This shows how to find all null-terminated strings in a slice of bytes:
492484
493485
```rust
494486
# use regex::bytes::Regex;
495-
let re = Regex::new(r"(?P<cstr>[^\x00]+)\x00").unwrap();
487+
let re = Regex::new(r"(?-u)(?P<cstr>[^\x00]+)\x00").unwrap();
496488
let text = b"foo\x00bar\x00baz\x00";
497489
498490
// Extract all of the strings without the null terminator from each match.
@@ -512,7 +504,9 @@ string (e.g., to extract a title from a Matroska file):
512504
```rust
513505
# use std::str;
514506
# use regex::bytes::Regex;
515-
let re = Regex::new(r"\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))").unwrap();
507+
let re = Regex::new(
508+
r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))"
509+
).unwrap();
516510
let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65";
517511
let caps = re.captures(text).unwrap();
518512
@@ -536,9 +530,9 @@ The supported syntax is pretty much the same as the syntax for Unicode
536530
regular expressions with a few changes that make sense for matching arbitrary
537531
bytes:
538532
539-
1. The `u` flag is *disabled* by default, but can be selectively enabled. (The
540-
opposite is true for the main `Regex` type.) Disabling the `u` flag is said to
541-
invoke "ASCII compatible" mode.
533+
1. The `u` flag can be disabled even when disabling it might cause the regex to
534+
match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in
535+
"ASCII compatible" mode.
542536
2. In ASCII compatible mode, neither Unicode codepoints nor Unicode character
543537
classes are allowed.
544538
3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`)

src/re_builder.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ impl Default for RegexOptions {
3939
}
4040

4141
macro_rules! define_builder {
42-
($name:ident, $regex_mod:ident, $unicode:expr, $only_utf8:expr) => {
42+
($name:ident, $regex_mod:ident, $only_utf8:expr) => {
4343
pub mod $name {
4444
use error::Error;
4545
use exec::ExecBuilder;
@@ -62,7 +62,6 @@ impl RegexBuilder {
6262
pub fn new(pattern: &str) -> RegexBuilder {
6363
let mut builder = RegexBuilder(RegexOptions::default());
6464
builder.0.pats.push(pattern.to_owned());
65-
builder.0.unicode = $unicode;
6665
builder
6766
}
6867

@@ -146,5 +145,5 @@ impl RegexBuilder {
146145
}
147146
}
148147

149-
define_builder!(bytes, re_bytes, false, false);
150-
define_builder!(unicode, re_unicode, true, true);
148+
define_builder!(bytes, re_bytes, false);
149+
define_builder!(unicode, re_unicode, true);

tests/api_str.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55
fn empty_match_unicode_find_iter() {
66
// Tests that we still yield byte ranges at valid UTF-8 sequence boundaries
77
// even when we're susceptible to empty-width matches.
8-
let re = regex!(u!(r".*?"));
8+
let re = regex!(r".*?");
99
assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)],
1010
findall!(re, "Ⅰ1Ⅱ2"));
1111
}
1212

1313
#[test]
1414
fn empty_match_unicode_captures_iter() {
1515
// Same as empty_match_unicode_find_iter, but tests capture iteration.
16-
let re = regex!(u!(r".*?"));
16+
let re = regex!(r".*?");
1717
let ms: Vec<_> = re.captures_iter(text!("Ⅰ1Ⅱ2"))
1818
.map(|c| c.pos(0).unwrap())
1919
.collect();

tests/bytes.rs

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,36 +5,37 @@
55
struct R<'a>(&'a [u8]);
66
impl<'a> R<'a> { fn as_bytes(&self) -> &'a [u8] { &self.0 } }
77

8-
mat!(word_boundary, r" \b", " δ", None);
9-
mat!(word_boundary_unicode, r"(?u) \b", " δ", Some((0, 1)));
10-
mat!(word_not_boundary, r" \B", " δ", Some((0, 1)));
11-
mat!(word_not_boundary_unicode, r"(?u) \B", " δ", None);
12-
13-
mat!(perl_w_ascii, r"\w+", "aδ", Some((0, 1)));
14-
mat!(perl_w_unicode, r"(?u)\w+", "aδ", Some((0, 3)));
15-
mat!(perl_d_ascii, r"\d+", "1२३9", Some((0, 1)));
16-
mat!(perl_d_unicode, r"(?u)\d+", "1२३9", Some((0, 8)));
17-
mat!(perl_s_ascii, r"\s+", " \u{1680}", Some((0, 1)));
18-
mat!(perl_s_unicode, r"(?u)\s+", " \u{1680}", Some((0, 4)));
8+
mat!(word_boundary, r"(?-u) \b", " δ", None);
9+
mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1)));
10+
mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1)));
11+
mat!(word_not_boundary_unicode, r" \B", " δ", None);
12+
13+
mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1)));
14+
mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3)));
15+
mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1)));
16+
mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8)));
17+
mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1)));
18+
mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4)));
1919

2020
// The first `(.+)` matches two Unicode codepoints, but can't match the 5th
2121
// byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
2222
// matches.
23-
mat!(mixed1, r"(?u)(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"),
23+
mat!(mixed1, r"(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"),
2424
Some((0, 5)), Some((0, 4)), Some((4, 5)));
2525

26-
mat!(case_ascii_one, r"(?i)a", "A", Some((0, 1)));
27-
mat!(case_ascii_class, r"(?i)[a-z]+", "AaAaA", Some((0, 5)));
28-
mat!(case_unicode, r"(?iu)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
29-
mat!(case_not_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
26+
mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1)));
27+
mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5)));
28+
mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
29+
mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
3030

31-
mat!(negate_unicode, r"(?u)[^a]", "δ", Some((0, 2)));
32-
mat!(negate_not_unicode, r"[^a]", "δ", Some((0, 1)));
31+
mat!(negate_unicode, r"[^a]", "δ", Some((0, 2)));
32+
mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1)));
3333

3434
// This doesn't match in a normal Unicode regex because the implicit preceding
3535
// `.*?` is Unicode aware.
36-
mat!(dotstar_prefix_not_unicode, r"a", R(b"\xFFa"), Some((1, 2)));
36+
mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2)));
37+
mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2)));
3738

3839
// Have fun with null bytes.
39-
mat!(null_bytes, r"(?P<cstr>[^\x00]+)\x00",
40+
mat!(null_bytes, r"(?-u)(?P<cstr>[^\x00]+)\x00",
4041
R(b"foo\x00"), Some((0, 4)), Some((0, 3)));

tests/crazy.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
mat!(ascii_literal, u!(r"a"), "a", Some((0, 1)));
1+
mat!(ascii_literal, r"a", "a", Some((0, 1)));
22

33
// Some crazy expressions from regular-expressions.info.
44
mat!(match_ranges,

tests/macros_bytes.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ macro_rules! t { ($re:expr) => { text!($re) } }
55
macro_rules! bytes { ($text:expr) => { $text } }
66
macro_rules! b { ($text:expr) => { bytes!($text) } }
77

8-
macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } }
8+
// macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } }
99

1010
macro_rules! no_expand {
1111
($text:expr) => {{

tests/macros_str.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ macro_rules! t { ($text:expr) => { text!($text) } }
55
macro_rules! bytes { ($text:expr) => { $text.as_bytes() } }
66
macro_rules! b { ($text:expr) => { bytes!($text) } }
77

8-
macro_rules! u { ($re:expr) => { $re } }
8+
// macro_rules! u { ($re:expr) => { $re } }
99

1010
macro_rules! no_expand {
1111
($text:expr) => {{

tests/regression.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3)));
4141
mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3)));
4242

4343
// See: https://github.com/rust-lang/regex/issues/76
44-
mat!(uni_case_lower_nocase_flag, u!(r"(?i)\p{Ll}+"), "ΛΘΓΔα", Some((0, 10)));
44+
mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10)));
4545

4646
// See: https://github.com/rust-lang-nursery/regex/issues/191
4747
mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", Some((0, 3)));

tests/test_backtrack_bytes.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ macro_rules! regex_new {
1616
use regex::internal::ExecBuilder;
1717
ExecBuilder::new($re)
1818
.bounded_backtracking()
19-
.unicode(false)
2019
.only_utf8(false)
2120
.build()
2221
.map(|e| e.into_byte_regex())
@@ -34,7 +33,6 @@ macro_rules! regex_set_new {
3433
use regex::internal::ExecBuilder;
3534
ExecBuilder::new_many($re)
3635
.bounded_backtracking()
37-
.unicode(false)
3836
.only_utf8(false)
3937
.build()
4038
.map(|e| e.into_byte_regex_set())

tests/test_nfa_bytes.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ macro_rules! regex_new {
1717
use regex::internal::ExecBuilder;
1818
ExecBuilder::new($re)
1919
.nfa()
20-
.unicode(false)
2120
.only_utf8(false)
2221
.build()
2322
.map(|e| e.into_byte_regex())
@@ -35,7 +34,6 @@ macro_rules! regex_set_new {
3534
use regex::internal::ExecBuilder;
3635
ExecBuilder::new_many($re)
3736
.nfa()
38-
.unicode(false)
3937
.only_utf8(false)
4038
.build()
4139
.map(|e| e.into_byte_regex_set())

tests/unicode.rs

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,28 @@
1-
mat!(uni_literal, u!(r"☃"), "☃", Some((0, 3)));
2-
mat!(uni_one, u!(r"\pN"), "Ⅰ", Some((0, 3)));
3-
mat!(uni_mixed, u!(r"\pN+"), "Ⅰ1Ⅱ2", Some((0, 8)));
4-
mat!(uni_not, u!(r"\PN+"), "abⅠ", Some((0, 2)));
5-
mat!(uni_not_class, u!(r"[\PN]+"), "abⅠ", Some((0, 2)));
6-
mat!(uni_not_class_neg, u!(r"[^\PN]+"), "abⅠ", Some((2, 5)));
7-
mat!(uni_case, u!(r"(?i)Δ"), "δ", Some((0, 2)));
8-
mat!(uni_case_upper, u!(r"\p{Lu}+"), "ΛΘΓΔα", Some((0, 8)));
9-
mat!(uni_case_upper_nocase_flag, u!(r"(?i)\p{Lu}+"), "ΛΘΓΔα", Some((0, 10)));
10-
mat!(uni_case_upper_nocase, u!(r"\p{L}+"), "ΛΘΓΔα", Some((0, 10)));
11-
mat!(uni_case_lower, u!(r"\p{Ll}+"), "ΛΘΓΔα", Some((8, 10)));
1+
mat!(uni_literal, r"☃", "☃", Some((0, 3)));
2+
mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3)));
3+
mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8)));
4+
mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2)));
5+
mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2)));
6+
mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5)));
7+
mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2)));
8+
mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8)));
9+
mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)));
10+
mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)));
11+
mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)));
1212

1313
// Test the Unicode friendliness of Perl character classes.
14-
mat!(uni_perl_w, u!(r"\w+"), "dδd", Some((0, 4)));
15-
mat!(uni_perl_w_not, u!(r"\w+"), "⥡", None);
16-
mat!(uni_perl_w_neg, u!(r"\W+"), "⥡", Some((0, 3)));
17-
mat!(uni_perl_d, u!(r"\d+"), "1२३9", Some((0, 8)));
18-
mat!(uni_perl_d_not, u!(r"\d+"), "Ⅱ", None);
19-
mat!(uni_perl_d_neg, u!(r"\D+"), "Ⅱ", Some((0, 3)));
20-
mat!(uni_perl_s, u!(r"\s+"), " ", Some((0, 3)));
21-
mat!(uni_perl_s_not, u!(r"\s+"), "☃", None);
22-
mat!(uni_perl_s_neg, u!(r"\S+"), "☃", Some((0, 3)));
14+
mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)));
15+
mat!(uni_perl_w_not, r"\w+", "⥡", None);
16+
mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3)));
17+
mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)));
18+
mat!(uni_perl_d_not, r"\d+", "Ⅱ", None);
19+
mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3)));
20+
mat!(uni_perl_s, r"\s+", " ", Some((0, 3)));
21+
mat!(uni_perl_s_not, r"\s+", "☃", None);
22+
mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3)));
2323

2424
// And do the same for word boundaries.
25-
mat!(uni_boundary_none, u!(r"\d\b"), "6δ", None);
26-
mat!(uni_boundary_ogham, u!(r"\d\b"), "6 ", Some((0, 1)));
27-
mat!(uni_not_boundary_none, u!(r"\d\B"), "6δ", Some((0, 1)));
28-
mat!(uni_not_boundary_ogham, u!(r"\d\B"), "6 ", None);
25+
mat!(uni_boundary_none, r"\d\b", "6δ", None);
26+
mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1)));
27+
mat!(uni_not_boundary_none, r"\d\B", "6δ", Some((0, 1)));
28+
mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None);

tests/word_boundary_ascii.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
// ASCII word boundaries are completely oblivious to Unicode characters.
22
// For Unicode word boundaries, the tests are precisely inverted.
3-
matiter!(ascii1, r"\bx\b", "áxβ", (2, 3));
4-
matiter!(ascii2, r"\Bx\B", "áxβ");
3+
matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
4+
matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
55

6-
// We can still get Unicode mode in byte regexes.
7-
matiter!(unicode1, r"(?u:\b)x(?u:\b)", "áxβ");
8-
matiter!(unicode2, r"(?u:\B)x(?u:\B)", "áxβ", (2, 3));
6+
// We still get Unicode word boundaries by default in byte regexes.
7+
matiter!(unicode1, r"\bx\b", "áxβ");
8+
matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));

0 commit comments

Comments
 (0)