Skip to content

Commit e2f0850

Browse files
committed
Auto merge of #308 - BurntSushi:fix-271, r=BurntSushi
Compute word boundary flags in the start state. At some point, I think I had convinced myself that we didn't need to compute word boundary flags for the initial state, but it turns out that we do. Fixes #271.
2 parents 5233b14 + 8d764ea commit e2f0850

File tree

2 files changed

+29
-3
lines changed

2 files changed

+29
-3
lines changed

src/dfa.rs

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1378,7 +1378,9 @@ impl<'a> Fsm<'a> {
13781378
((empty_flags.end as u8) << 1) |
13791379
((empty_flags.start_line as u8) << 2) |
13801380
((empty_flags.end_line as u8) << 3) |
1381-
((state_flags.is_word() as u8) << 4))
1381+
((empty_flags.word_boundary as u8) << 4) |
1382+
((empty_flags.not_word_boundary as u8) << 5) |
1383+
((state_flags.is_word() as u8) << 6))
13821384
as usize
13831385
};
13841386
match self.cache.start_states[flagi] {
@@ -1412,9 +1414,17 @@ impl<'a> Fsm<'a> {
14121414
empty_flags.end = text.len() == 0;
14131415
empty_flags.start_line = at == 0 || text[at - 1] == b'\n';
14141416
empty_flags.end_line = text.len() == 0;
1415-
if at > 0 && Byte::byte(text[at - 1]).is_ascii_word() {
1417+
1418+
let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
1419+
let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word();
1420+
if is_word_last {
14161421
state_flags.set_word();
14171422
}
1423+
if is_word == is_word_last {
1424+
empty_flags.not_word_boundary = true;
1425+
} else {
1426+
empty_flags.word_boundary = true;
1427+
}
14181428
(empty_flags, state_flags)
14191429
}
14201430

@@ -1433,9 +1443,18 @@ impl<'a> Fsm<'a> {
14331443
empty_flags.end = text.len() == 0;
14341444
empty_flags.start_line = at == text.len() || text[at] == b'\n';
14351445
empty_flags.end_line = text.len() == 0;
1436-
if at < text.len() && Byte::byte(text[at]).is_ascii_word() {
1446+
1447+
let is_word_last =
1448+
at < text.len() && Byte::byte(text[at]).is_ascii_word();
1449+
let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
1450+
if is_word_last {
14371451
state_flags.set_word();
14381452
}
1453+
if is_word == is_word_last {
1454+
empty_flags.not_word_boundary = true;
1455+
} else {
1456+
empty_flags.word_boundary = true;
1457+
}
14391458
(empty_flags, state_flags)
14401459
}
14411460

tests/regression.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,10 @@ mat!(lits_unambiguous1, u!(r"(ABC|CDA|BC)X"), "CDAX", Some((0, 4)));
7575
// See: https://github.com/rust-lang-nursery/regex/issues/291
7676
mat!(lits_unambiguous2, u!(r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$"),
7777
"CIMG2341", Some((0, 8)), Some((0, 4)), None, Some((0, 4)), Some((4, 8)));
78+
79+
// See: https://github.com/rust-lang-nursery/regex/issues/271
80+
mat!(end_not_wb, u!(r"$(?-u:\B)"), "\u{5c124}\u{b576c}", Some((8, 8)));
81+
mat!(endl_or_wb, u!(r"(?m:$)|(?-u:\b)"), "\u{6084e}", Some((4, 4)));
82+
mat!(zero_or_end, u!(r"(?i-u:\x00)|$"), "\u{e682f}", Some((4, 4)));
83+
mat!(y_or_endl, u!(r"(?i-u:y)|(?m:$)"), "\u{b4331}", Some((4, 4)));
84+
mat!(wb_start_x, u!(r"(?u:\b)^(?-u:X)"), "X", Some((0, 1)));

0 commit comments

Comments
 (0)