Skip to content

Commit 01e8812

Browse files
committed
StrSearcher: Additional comments and small code moves
Break out a separate static method to create the "byteset".
1 parent 2b82c07 commit 01e8812

File tree

1 file changed

+27
-19
lines changed

1 file changed

+27
-19
lines changed

src/libcore/str/pattern.rs

+27-19
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,8 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
641641
}
642642
StrSearcherImpl::TwoWay(ref mut searcher) => {
643643
let is_long = searcher.memory == usize::MAX;
644+
// write out `true` and `false` cases to encourage the compiler
645+
// to specialize the two cases separately.
644646
if is_long {
645647
searcher.next::<MatchOnly>(self.haystack.as_bytes(),
646648
self.needle.as_bytes(),
@@ -653,8 +655,8 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
653655
}
654656
}
655657
}
656-
657658
}
659+
658660
unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
659661
#[inline]
660662
fn next_back(&mut self) -> SearchStep {
@@ -709,6 +711,7 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
709711
}
710712
StrSearcherImpl::TwoWay(ref mut searcher) => {
711713
let is_long = searcher.memory == usize::MAX;
714+
// write out `true` and `false`, like `next_match`
712715
if is_long {
713716
searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
714717
self.needle.as_bytes(),
@@ -723,8 +726,7 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
723726
}
724727
}
725728

726-
/// The internal state of an iterator that searches for matches of a substring
727-
/// within a larger string using two-way search
729+
/// The internal state of the two-way substring search algorithm.
728730
#[derive(Clone, Debug)]
729731
struct TwoWaySearcher {
730732
// constants
@@ -741,7 +743,9 @@ struct TwoWaySearcher {
741743
// variables
742744
position: usize,
743745
end: usize,
746+
/// index into needle before which we have already matched
744747
memory: usize,
748+
/// index into needle after which we have already matched
745749
memory_back: usize,
746750
}
747751

@@ -841,9 +845,6 @@ impl TwoWaySearcher {
841845
// is large.
842846
if &needle[..crit_pos] == &needle[period.. period + crit_pos] {
843847
// short period case -- the period is exact
844-
let byteset = needle[..period].iter()
845-
.fold(0, |a, &b| (1 << (b & 0x3f)) | a);
846-
847848
// compute a separate critical factorization for the reversed needle
848849
// x = u' v' where |v'| < period(x).
849850
//
@@ -860,26 +861,26 @@ impl TwoWaySearcher {
860861
crit_pos: crit_pos,
861862
crit_pos_back: crit_pos_back,
862863
period: period,
863-
byteset: byteset,
864+
byteset: Self::byteset_create(&needle[..period]),
864865

865866
position: 0,
866867
end: end,
867868
memory: 0,
868-
// memory_back after which we have already matched
869869
memory_back: needle.len(),
870870
}
871871
} else {
872872
// long period case -- we have an approximation to the actual period,
873873
// and don't use memorization.
874-
875-
let byteset = needle.iter()
876-
.fold(0, |a, &b| (1 << (b & 0x3f)) | a);
874+
//
875+
// Approximate the period by lower bound max(|u|, |v|) + 1.
876+
// The critical factorization is efficient to use for both forward and
877+
// reverse search.
877878

878879
TwoWaySearcher {
879880
crit_pos: crit_pos,
880881
crit_pos_back: crit_pos,
881882
period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
882-
byteset: byteset,
883+
byteset: Self::byteset_create(needle),
883884

884885
position: 0,
885886
end: end,
@@ -889,6 +890,11 @@ impl TwoWaySearcher {
889890
}
890891
}
891892

893+
#[inline]
894+
fn byteset_create(bytes: &[u8]) -> u64 {
895+
bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a)
896+
}
897+
892898
#[inline(always)]
893899
fn byteset_contains(&self, byte: u8) -> bool {
894900
(self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0
@@ -976,9 +982,9 @@ impl TwoWaySearcher {
976982
// and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
977983
// is a critical factorization, so is (reverse(v), reverse(u)).
978984
//
979-
// For the short period case, using memorization, we rely on |u| < period(x).
980-
// For this case we have computed a critical factorization x = u' v'
981-
// where |v'| < period(x) instead (field `crit_pos_back`).
985+
// For the reverse case we have computed a critical factorization x = u' v'
986+
// (field `crit_pos_back`). We need |u| < period(x) for the forward case and
987+
// thus |v'| < period(x) for the reverse.
982988
//
983989
// To search in reverse through the haystack, we search forward through
984990
// a reversed haystack with a reversed needle, matching first u' and then v'.
@@ -1018,7 +1024,7 @@ impl TwoWaySearcher {
10181024

10191025
// See if the left part of the needle matches
10201026
let crit = if long_period { self.crit_pos_back }
1021-
else { cmp::min(self.crit_pos_back, self.memory_back) };
1027+
else { cmp::min(self.crit_pos_back, self.memory_back) };
10221028
for i in (0..crit).rev() {
10231029
if needle[i] != haystack[self.end - needle.len() + i] {
10241030
self.end -= self.crit_pos_back - i;
@@ -1031,7 +1037,7 @@ impl TwoWaySearcher {
10311037

10321038
// See if the right part of the needle matches
10331039
let needle_end = if long_period { needle.len() }
1034-
else { self.memory_back };
1040+
else { self.memory_back };
10351041
for i in self.crit_pos_back..needle_end {
10361042
if needle[i] != haystack[self.end - needle.len() + i] {
10371043
self.end -= self.period;
@@ -1070,7 +1076,8 @@ impl TwoWaySearcher {
10701076
fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) {
10711077
let mut left = 0; // Corresponds to i in the paper
10721078
let mut right = 1; // Corresponds to j in the paper
1073-
let mut offset = 0; // Corresponds to k in the paper
1079+
let mut offset = 0; // Corresponds to k in the paper, but starting at 0
1080+
// to match 0-based indexing.
10741081
let mut period = 1; // Corresponds to p in the paper
10751082

10761083
while let Some(&a) = arr.get(right + offset) {
@@ -1117,7 +1124,8 @@ impl TwoWaySearcher {
11171124
{
11181125
let mut left = 0; // Corresponds to i in the paper
11191126
let mut right = 1; // Corresponds to j in the paper
1120-
let mut offset = 0; // Corresponds to k in the paper
1127+
let mut offset = 0; // Corresponds to k in the paper, but starting at 0
1128+
// to match 0-based indexing.
11211129
let mut period = 1; // Corresponds to p in the paper
11221130
let n = arr.len();
11231131

0 commit comments

Comments
 (0)