Skip to content

Commit 9cf4a42

Browse files
committed
automata: fix bug where reverse NFA lacked an unanchored prefix
Previously, when compiling a Thompson NFA, we were omitting an unanchored prefix when the HIR contained a `^` in its prefix. We did this because unanchored prefix in that case would never match because of the requirement imposed by `^`. The problem with that is it's incorrect when compiling a reverse automaton. For example, in the case of building a reverse NFA for `^Qu`, we should sitll include an unanchored prefix because the `^` in that case has no conflict with it. It would be like if we omitted an unanchored prefix for `Qu$` in a forward NFA, which is obviously wrong. The fix here is pretty simple: in the reverse case, check for `$` in the suffix of the HIR rather than a `^` in the prefix. Fixes #1169
1 parent 10fe722 commit 9cf4a42

File tree

1 file changed

+85
-4
lines changed

1 file changed

+85
-4
lines changed

regex-automata/src/nfa/thompson/compiler.rs

+85-4
Original file line numberDiff line numberDiff line change
@@ -961,10 +961,12 @@ impl Compiler {
961961
// for all matches. When an unanchored prefix is not added, then the
962962
// NFA's anchored and unanchored start states are equivalent.
963963
let all_anchored = exprs.iter().all(|e| {
964-
e.borrow()
965-
.properties()
966-
.look_set_prefix()
967-
.contains(hir::Look::Start)
964+
let props = e.borrow().properties();
965+
if self.config.get_reverse() {
966+
props.look_set_suffix().contains(hir::Look::End)
967+
} else {
968+
props.look_set_prefix().contains(hir::Look::Start)
969+
}
968970
});
969971
let anchored = !self.config.get_unanchored_prefix() || all_anchored;
970972
let unanchored_prefix = if anchored {
@@ -1928,6 +1930,11 @@ mod tests {
19281930
State::Sparse(SparseTransitions { transitions })
19291931
}
19301932

1933+
fn s_look(look: Look, next: usize) -> State {
1934+
let next = sid(next);
1935+
State::Look { look, next }
1936+
}
1937+
19311938
fn s_bin_union(alt1: usize, alt2: usize) -> State {
19321939
State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
19331940
}
@@ -1978,6 +1985,80 @@ mod tests {
19781985
);
19791986
}
19801987

1988+
#[test]
1989+
fn compile_no_unanchored_prefix_with_start_anchor() {
1990+
let nfa = NFA::compiler()
1991+
.configure(NFA::config().which_captures(WhichCaptures::None))
1992+
.build(r"^a")
1993+
.unwrap();
1994+
assert_eq!(
1995+
nfa.states(),
1996+
&[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)]
1997+
);
1998+
}
1999+
2000+
#[test]
2001+
fn compile_yes_unanchored_prefix_with_end_anchor() {
2002+
let nfa = NFA::compiler()
2003+
.configure(NFA::config().which_captures(WhichCaptures::None))
2004+
.build(r"a$")
2005+
.unwrap();
2006+
assert_eq!(
2007+
nfa.states(),
2008+
&[
2009+
s_bin_union(2, 1),
2010+
s_range(0, 255, 0),
2011+
s_byte(b'a', 3),
2012+
s_look(Look::End, 4),
2013+
s_match(0),
2014+
]
2015+
);
2016+
}
2017+
2018+
#[test]
2019+
fn compile_yes_reverse_unanchored_prefix_with_start_anchor() {
2020+
let nfa = NFA::compiler()
2021+
.configure(
2022+
NFA::config()
2023+
.reverse(true)
2024+
.which_captures(WhichCaptures::None),
2025+
)
2026+
.build(r"^a")
2027+
.unwrap();
2028+
assert_eq!(
2029+
nfa.states(),
2030+
&[
2031+
s_bin_union(2, 1),
2032+
s_range(0, 255, 0),
2033+
s_byte(b'a', 3),
2034+
// Anchors get flipped in a reverse automaton.
2035+
s_look(Look::End, 4),
2036+
s_match(0),
2037+
],
2038+
);
2039+
}
2040+
2041+
#[test]
2042+
fn compile_no_reverse_unanchored_prefix_with_end_anchor() {
2043+
let nfa = NFA::compiler()
2044+
.configure(
2045+
NFA::config()
2046+
.reverse(true)
2047+
.which_captures(WhichCaptures::None),
2048+
)
2049+
.build(r"a$")
2050+
.unwrap();
2051+
assert_eq!(
2052+
nfa.states(),
2053+
&[
2054+
// Anchors get flipped in a reverse automaton.
2055+
s_look(Look::Start, 1),
2056+
s_byte(b'a', 2),
2057+
s_match(0),
2058+
],
2059+
);
2060+
}
2061+
19812062
#[test]
19822063
fn compile_empty() {
19832064
assert_eq!(build("").states(), &[s_match(0),]);

0 commit comments

Comments
 (0)