//! The string Pattern API.
//!
//! The Pattern API provides a generic mechanism for using different pattern
//! types when searching through a string.
//!
//! For more details, see the traits [`Pattern`], [`Searcher`],
//! [`ReverseSearcher`], and [`DoubleEndedSearcher`].
//!
//! Although this API is unstable, it is exposed via stable APIs on the
//! [`str`] type.
//!
//! # Examples
//!
//! [`Pattern`] is [implemented][pattern-impls] in the stable API for
//! [`&str`], [`char`], slices of [`char`], and functions and closures
//! implementing `FnMut(char) -> bool`.
//!
//! ```
//! let s = "Can you find a needle in a haystack?";
//!
//! // &str pattern
//! assert_eq!(s.find("you"), Some(4));
//! // char pattern
//! assert_eq!(s.find('n'), Some(2));
//! // slice of chars pattern
//! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u'][..]), Some(1));
//! // closure pattern
//! assert_eq!(s.find(|c: char| c.is_ascii_punctuation()), Some(35));
//! ```
//!
//! [`&str`]: ../../../std/primitive.str.html
//! [`char`]: ../../../std/primitive.char.html
//! [`str`]: ../../../std/primitive.str.html
//! [`DoubleEndedSearcher`]: trait.DoubleEndedSearcher.html
//! [`Pattern`]: trait.Pattern.html
//! [`ReverseSearcher`]: trait.ReverseSearcher.html
//! [`Searcher`]: trait.Searcher.html
//! [pattern-impls]: trait.Pattern.html#implementors

#![unstable(
    feature = "pattern",
    reason = "API not fully fleshed out and ready to be stabilized",
    issue = "27721"
)]
use crate::cmp;
use crate::fmt;
use crate::slice::memchr;
use crate::usize;

// Pattern

/// A string pattern.
///
/// A `Pattern<'a>` expresses that the implementing type
/// can be used as a string pattern for searching in a `&'a str`.
///
/// For example, both `'a'` and `"aa"` are patterns that
/// would match at index `1` in the string `"baaaab"`.
///
/// The trait itself acts as a builder for an associated
/// `Searcher` type, which does the actual work of finding
J
Joseph Crail 已提交
63
/// occurrences of the pattern in a string.
64
pub trait Pattern<'a>: Sized {
M
Marvin Löbel 已提交
65
    /// Associated searcher for this pattern
66
    type Searcher: Searcher<'a>;
M
Marvin Löbel 已提交
67

68
    /// Constructs the associated searcher from
M
Marvin Löbel 已提交
69
    /// `self` and the `haystack` to search in.
70
    fn into_searcher(self, haystack: &'a str) -> Self::Searcher;
71

72
    /// Checks whether the pattern matches anywhere in the haystack
73 74
    #[inline]
    fn is_contained_in(self, haystack: &'a str) -> bool {
75
        self.into_searcher(haystack).next_match().is_some()
76 77
    }

78
    /// Checks whether the pattern matches at the front of the haystack
79
    #[inline]
M
Marvin Löbel 已提交
80
    fn is_prefix_of(self, haystack: &'a str) -> bool {
81
        matches!(self.into_searcher(haystack).next(), SearchStep::Match(0, _))
82 83
    }

84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
    /// Removes the pattern from the front of haystack, if it matches.
    #[inline]
    fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
        if let SearchStep::Match(start, len) = self.into_searcher(haystack).next() {
            debug_assert_eq!(
                start, 0,
                "The first search step from Searcher \
                 must include the first character"
            );
            // SAFETY: `Searcher` is known to return valid indices.
            unsafe { Some(haystack.get_unchecked(len..)) }
        } else {
            None
        }
    }

100
    /// Checks whether the pattern matches at the back of the haystack
101
    #[inline]
M
Marvin Löbel 已提交
102
    fn is_suffix_of(self, haystack: &'a str) -> bool
M
Mark Rousskov 已提交
103 104
    where
        Self::Searcher: ReverseSearcher<'a>,
M
Marvin Löbel 已提交
105
    {
106
        matches!(self.into_searcher(haystack).next_back(), SearchStep::Match(_, j) if haystack.len() == j)
107
    }
108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127

    /// Removes the pattern from the back of haystack, if it matches.
    #[inline]
    fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str>
    where
        Self::Searcher: ReverseSearcher<'a>,
    {
        if let SearchStep::Match(start, end) = self.into_searcher(haystack).next_back() {
            debug_assert_eq!(
                end,
                haystack.len(),
                "The first search step from ReverseSearcher \
                 must include the last character"
            );
            // SAFETY: `Searcher` is known to return valid indices.
            unsafe { Some(haystack.get_unchecked(..start)) }
        } else {
            None
        }
    }
128 129
}

// Searcher

/// Result of calling `Searcher::next()` or `ReverseSearcher::next_back()`.
///
/// The two `usize` fields of `Match` and `Reject` are the start (`a`) and
/// end (`b`) byte indices of the range `haystack[a..b]`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum SearchStep {
    /// Expresses that a match of the pattern has been found at
    /// `haystack[a..b]`.
    Match(usize, usize),
    /// Expresses that `haystack[a..b]` has been rejected as a possible match
    /// of the pattern.
    ///
    /// Note that there might be more than one `Reject` between two `Match`es,
    /// there is no requirement for them to be combined into one.
    Reject(usize, usize),
    /// Expresses that every byte of the haystack has been visited, ending
    /// the iteration.
    Done,
}

/// A searcher for a string pattern.
///
/// This trait provides methods for searching for non-overlapping
/// matches of a pattern starting from the front (left) of a string.
///
/// It will be implemented by associated `Searcher`
/// types of the `Pattern` trait.
///
/// The trait is marked unsafe because the indices returned by the
/// `next()` methods are required to lie on valid utf8 boundaries in
/// the haystack. This enables consumers of this trait to
/// slice the haystack without additional runtime checks.
161
pub unsafe trait Searcher<'a> {
B
Bruce Mitchener 已提交
162
    /// Getter for the underlying string to be searched in
M
Marvin Löbel 已提交
163 164
    ///
    /// Will always return the same `&str`
165
    fn haystack(&self) -> &'a str;
M
Marvin Löbel 已提交
166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184

    /// Performs the next search step starting from the front.
    ///
    /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern.
    /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the
    ///   pattern, even partially.
    /// - Returns `Done` if every byte of the haystack has been visited
    ///
    /// The stream of `Match` and `Reject` values up to a `Done`
    /// will contain index ranges that are adjacent, non-overlapping,
    /// covering the whole haystack, and laying on utf8 boundaries.
    ///
    /// A `Match` result needs to contain the whole matched pattern,
    /// however `Reject` results may be split up into arbitrary
    /// many adjacent fragments. Both ranges may have zero length.
    ///
    /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"`
    /// might produce the stream
    /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]`
185
    fn next(&mut self) -> SearchStep;
M
Marvin Löbel 已提交
186

A
Alexander Regueiro 已提交
187
    /// Finds the next `Match` result. See `next()`
188 189 190 191 192
    ///
    /// Unlike next(), there is no guarantee that the returned ranges
    /// of this and next_reject will overlap. This will return (start_match, end_match),
    /// where start_match is the index of where the match begins, and end_match is
    /// the index after the end of the match.
193 194 195 196 197 198 199 200 201 202
    #[inline]
    fn next_match(&mut self) -> Option<(usize, usize)> {
        loop {
            match self.next() {
                SearchStep::Match(a, b) => return Some((a, b)),
                SearchStep::Done => return None,
                _ => continue,
            }
        }
    }
M
Marvin Löbel 已提交
203

A
Alexander Regueiro 已提交
204
    /// Finds the next `Reject` result. See `next()` and `next_match()`
205 206 207
    ///
    /// Unlike next(), there is no guarantee that the returned ranges
    /// of this and next_match will overlap.
208
    #[inline]
M
Marvin Löbel 已提交
209
    fn next_reject(&mut self) -> Option<(usize, usize)> {
210 211 212 213 214 215 216 217
        loop {
            match self.next() {
                SearchStep::Reject(a, b) => return Some((a, b)),
                SearchStep::Done => return None,
                _ => continue,
            }
        }
    }
218 219
}

M
Marvin Löbel 已提交
220 221 222 223 224 225 226 227 228 229 230 231 232 233
/// A reverse searcher for a string pattern.
///
/// This trait provides methods for searching for non-overlapping
/// matches of a pattern starting from the back (right) of a string.
///
/// It will be implemented by associated `Searcher`
/// types of the `Pattern` trait if the pattern supports searching
/// for it from the back.
///
/// The index ranges returned by this trait are not required
/// to exactly match those of the forward search in reverse.
///
/// For the reason why this trait is marked unsafe, see them
/// parent trait `Searcher`.
234
pub unsafe trait ReverseSearcher<'a>: Searcher<'a> {
M
Marvin Löbel 已提交
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
    /// Performs the next search step starting from the back.
    ///
    /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern.
    /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the
    ///   pattern, even partially.
    /// - Returns `Done` if every byte of the haystack has been visited
    ///
    /// The stream of `Match` and `Reject` values up to a `Done`
    /// will contain index ranges that are adjacent, non-overlapping,
    /// covering the whole haystack, and laying on utf8 boundaries.
    ///
    /// A `Match` result needs to contain the whole matched pattern,
    /// however `Reject` results may be split up into arbitrary
    /// many adjacent fragments. Both ranges may have zero length.
    ///
    /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"`
    /// might produce the stream
    /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]`
253
    fn next_back(&mut self) -> SearchStep;
M
Marvin Löbel 已提交
254

A
Alexander Regueiro 已提交
255
    /// Finds the next `Match` result. See `next_back()`
256
    #[inline]
M
Mark Rousskov 已提交
257
    fn next_match_back(&mut self) -> Option<(usize, usize)> {
258 259 260 261 262 263 264 265
        loop {
            match self.next_back() {
                SearchStep::Match(a, b) => return Some((a, b)),
                SearchStep::Done => return None,
                _ => continue,
            }
        }
    }
M
Marvin Löbel 已提交
266

A
Alexander Regueiro 已提交
267
    /// Finds the next `Reject` result. See `next_back()`
268
    #[inline]
M
Mark Rousskov 已提交
269
    fn next_reject_back(&mut self) -> Option<(usize, usize)> {
270 271 272 273 274 275 276 277
        loop {
            match self.next_back() {
                SearchStep::Reject(a, b) => return Some((a, b)),
                SearchStep::Done => return None,
                _ => continue,
            }
        }
    }
278 279
}

M
Marvin Löbel 已提交
280 281 282 283 284 285 286 287 288 289 290 291
/// A marker trait to express that a `ReverseSearcher`
/// can be used for a `DoubleEndedIterator` implementation.
///
/// For this, the impl of `Searcher` and `ReverseSearcher` need
/// to follow these conditions:
///
/// - All results of `next()` need to be identical
///   to the results of `next_back()` in reverse order.
/// - `next()` and `next_back()` need to behave as
///   the two ends of a range of values, that is they
///   can not "walk past each other".
///
S
Steve Klabnik 已提交
292
/// # Examples
M
Marvin Löbel 已提交
293 294 295 296 297 298 299 300
///
/// `char::Searcher` is a `DoubleEndedSearcher` because searching for a
/// `char` only requires looking at one at a time, which behaves the same
/// from both ends.
///
/// `(&str)::Searcher` is not a `DoubleEndedSearcher` because
/// the pattern `"aa"` in the haystack `"aaa"` matches as either
/// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched.
301
pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {}
302

303 304 305 306 307 308
/////////////////////////////////////////////////////////////////////////////
// Impl for char
/////////////////////////////////////////////////////////////////////////////

/// Associated type for `<char as Pattern<'a>>::Searcher`.
#[derive(Clone, Debug)]
pub struct CharSearcher<'a> {
    haystack: &'a str,
    // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack`
    // This invariant can be broken *within* next_match and next_match_back, however
    // they must exit with fingers on valid code point boundaries.
    /// `finger` is the current byte index of the forward search.
    /// Imagine that it exists before the byte at its index, i.e.
    /// `haystack[finger]` is the first byte of the slice we must inspect during
    /// forward searching
    finger: usize,
    /// `finger_back` is the current byte index of the reverse search.
    /// Imagine that it exists after the byte at its index, i.e.
    /// `haystack[finger_back - 1]` is the last byte of the slice we must inspect during
    /// forward searching (and thus the first byte to be inspected when calling next_back())
    finger_back: usize,
    /// The character being searched for
    needle: char,

    // safety invariant: `utf8_size` must be less than 5
    /// The number of bytes `needle` takes up when encoded in utf8
    utf8_size: usize,
    /// A utf8 encoded copy of the `needle`
    utf8_encoded: [u8; 4],
}

unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
    #[inline]
    fn haystack(&self) -> &'a str {
337
        self.haystack
338 339 340
    }
    #[inline]
    fn next(&mut self) -> SearchStep {
341
        let old_finger = self.finger;
342 343 344 345 346 347 348 349
        // SAFETY: 1-4 guarantee safety of `get_unchecked`
        // 1. `self.finger` and `self.finger_back` are kept on unicode boundaries
        //    (this is invariant)
        // 2. `self.finger >= 0` since it starts at 0 and only increases
        // 3. `self.finger < self.finger_back` because otherwise the char `iter`
        //    would return `SearchStep::Done`
        // 4. `self.finger` comes before the end of the haystack because `self.finger_back`
        //    starts at the end and only decreases
350
        let slice = unsafe { self.haystack.get_unchecked(old_finger..self.finger_back) };
351 352 353 354
        let mut iter = slice.chars();
        let old_len = iter.iter.len();
        if let Some(ch) = iter.next() {
            // add byte offset of current character
355
            // without re-encoding as utf-8
356
            self.finger += old_len - iter.iter.len();
357 358 359 360 361 362 363 364
            if ch == self.needle {
                SearchStep::Match(old_finger, self.finger)
            } else {
                SearchStep::Reject(old_finger, self.finger)
            }
        } else {
            SearchStep::Done
        }
365 366 367
    }
    #[inline]
    fn next_match(&mut self) -> Option<(usize, usize)> {
368 369
        loop {
            // get the haystack after the last character found
A
Andre Bogus 已提交
370
            let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?;
371
            // the last byte of the utf8 encoded needle
372
            // SAFETY: we have an invariant that `utf8_size < 5`
373 374 375 376 377 378 379 380 381 382 383 384 385
            let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
            if let Some(index) = memchr::memchr(last_byte, bytes) {
                // The new finger is the index of the byte we found,
                // plus one, since we memchr'd for the last byte of the character.
                //
                // Note that this doesn't always give us a finger on a UTF8 boundary.
                // If we *didn't* find our character
                // we may have indexed to the non-last byte of a 3-byte or 4-byte character.
                // We can't just skip to the next valid starting byte because a character like
                // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find
                // the second byte when searching for the third.
                //
                // However, this is totally okay. While we have the invariant that
S
Steve Klabnik 已提交
386
                // self.finger is on a UTF8 boundary, this invariant is not relied upon
387 388 389 390 391 392
                // within this method (it is relied upon in CharSearcher::next()).
                //
                // We only exit this method when we reach the end of the string, or if we
                // find something. When we find something the `finger` will be set
                // to a UTF8 boundary.
                self.finger += index + 1;
393 394 395 396 397 398
                if self.finger >= self.utf8_size {
                    let found_char = self.finger - self.utf8_size;
                    if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
                        if slice == &self.utf8_encoded[0..self.utf8_size] {
                            return Some((found_char, self.finger));
                        }
399
                    }
400
                }
401 402
            } else {
                // found nothing, exit
403
                self.finger = self.finger_back;
404
                return None;
405 406
            }
        }
407
    }
408 409

    // let next_reject use the default implementation from the Searcher trait
410 411 412 413 414
}

unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
    #[inline]
    fn next_back(&mut self) -> SearchStep {
415
        let old_finger = self.finger_back;
416
        // SAFETY: see the comment for next() above
417
        let slice = unsafe { self.haystack.get_unchecked(self.finger..old_finger) };
418 419 420 421
        let mut iter = slice.chars();
        let old_len = iter.iter.len();
        if let Some(ch) = iter.next_back() {
            // subtract byte offset of current character
422
            // without re-encoding as utf-8
423 424 425 426 427 428 429 430 431
            self.finger_back -= old_len - iter.iter.len();
            if ch == self.needle {
                SearchStep::Match(self.finger_back, old_finger)
            } else {
                SearchStep::Reject(self.finger_back, old_finger)
            }
        } else {
            SearchStep::Done
        }
432 433 434
    }
    #[inline]
    fn next_match_back(&mut self) -> Option<(usize, usize)> {
435 436 437
        let haystack = self.haystack.as_bytes();
        loop {
            // get the haystack up to but not including the last character searched
438
            let bytes = haystack.get(self.finger..self.finger_back)?;
439
            // the last byte of the utf8 encoded needle
440
            // SAFETY: we have an invariant that `utf8_size < 5`
441 442
            let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
            if let Some(index) = memchr::memrchr(last_byte, bytes) {
443 444 445
                // we searched a slice that was offset by self.finger,
                // add self.finger to recoup the original index
                let index = self.finger + index;
446 447 448 449 450 451
                // memrchr will return the index of the byte we wish to
                // find. In case of an ASCII character, this is indeed
                // were we wish our new finger to be ("after" the found
                // char in the paradigm of reverse iteration). For
                // multibyte chars we need to skip down by the number of more
                // bytes they have than ASCII
452 453 454 455 456
                let shift = self.utf8_size - 1;
                if index >= shift {
                    let found_char = index - shift;
                    if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
                        if slice == &self.utf8_encoded[0..self.utf8_size] {
457
                            // move finger to before the character found (i.e., at its start index)
458 459 460
                            self.finger_back = found_char;
                            return Some((self.finger_back, self.finger_back + self.utf8_size));
                        }
461
                    }
462
                }
463 464 465 466 467 468 469 470 471 472 473 474 475
                // We can't use finger_back = index - size + 1 here. If we found the last char
                // of a different-sized character (or the middle byte of a different character)
                // we need to bump the finger_back down to `index`. This similarly makes
                // `finger_back` have the potential to no longer be on a boundary,
                // but this is OK since we only exit this function on a boundary
                // or when the haystack has been searched completely.
                //
                // Unlike next_match this does not
                // have the problem of repeated bytes in utf-8 because
                // we're searching for the last byte, and we can only have
                // found the last byte when searching in reverse.
                self.finger_back = index;
            } else {
476
                self.finger_back = self.finger;
477 478
                // found nothing, exit
                return None;
479 480
            }
        }
481
    }
482 483

    // let next_reject_back use the default implementation from the Searcher trait
484 485 486 487
}

impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {}

I
Ivan Tham 已提交
488
/// Searches for chars that are equal to a given `char`.
I
Ivan Tham 已提交
489 490 491 492 493 494
///
/// # Examples
///
/// ```
/// assert_eq!("Hello world".find('o'), Some(4));
/// ```
495 496 497 498 499
impl<'a> Pattern<'a> for char {
    type Searcher = CharSearcher<'a>;

    #[inline]
    fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
500
        let mut utf8_encoded = [0; 4];
S
Shotaro Yamada 已提交
501
        let utf8_size = self.encode_utf8(&mut utf8_encoded).len();
502 503 504
        CharSearcher {
            haystack,
            finger: 0,
505
            finger_back: haystack.len(),
506
            needle: self,
507
            utf8_size,
M
Mark Rousskov 已提交
508
            utf8_encoded,
509
        }
510 511 512 513 514 515 516 517 518 519 520 521 522 523
    }

    #[inline]
    fn is_contained_in(self, haystack: &'a str) -> bool {
        if (self as u32) < 128 {
            haystack.as_bytes().contains(&(self as u8))
        } else {
            let mut buffer = [0u8; 4];
            self.encode_utf8(&mut buffer).is_contained_in(haystack)
        }
    }

    #[inline]
    fn is_prefix_of(self, haystack: &'a str) -> bool {
524
        self.encode_utf8(&mut [0u8; 4]).is_prefix_of(haystack)
525 526
    }

527 528 529 530 531
    #[inline]
    fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
        self.encode_utf8(&mut [0u8; 4]).strip_prefix_of(haystack)
    }

532
    #[inline]
M
Mark Rousskov 已提交
533 534 535
    fn is_suffix_of(self, haystack: &'a str) -> bool
    where
        Self::Searcher: ReverseSearcher<'a>,
536
    {
537
        self.encode_utf8(&mut [0u8; 4]).is_suffix_of(haystack)
538
    }
539 540 541 542 543 544 545 546

    #[inline]
    fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str>
    where
        Self::Searcher: ReverseSearcher<'a>,
    {
        self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack)
    }
547 548
}

/////////////////////////////////////////////////////////////////////////////
// Impl for a MultiCharEq wrapper
/////////////////////////////////////////////////////////////////////////////

/// Internal abstraction over "does this `char` match?" predicates.
///
/// `matches` takes `&mut self` so that stateful `FnMut` closures can be used.
#[doc(hidden)]
trait MultiCharEq {
    /// Returns `true` if `c` is accepted by this matcher.
    fn matches(&mut self, c: char) -> bool;
}

// Any `FnMut(char) -> bool` is a matcher: call it with the char.
impl<F> MultiCharEq for F
where
    F: FnMut(char) -> bool,
{
    #[inline]
    fn matches(&mut self, c: char) -> bool {
        (*self)(c)
    }
}

// A slice of chars matches `c` if any element equals `c`.
impl MultiCharEq for &[char] {
    #[inline]
    fn matches(&mut self, c: char) -> bool {
        self.iter().any(|&m| m == c)
    }
}

struct MultiCharEqPattern<C: MultiCharEq>(C);
576

577
#[derive(Clone, Debug)]
578
struct MultiCharEqSearcher<'a, C: MultiCharEq> {
579 580 581 582
    char_eq: C,
    haystack: &'a str,
    char_indices: super::CharIndices<'a>,
}
583

584 585
impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern<C> {
    type Searcher = MultiCharEqSearcher<'a, C>;
586 587

    #[inline]
588
    fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> {
M
Mark Rousskov 已提交
589
        MultiCharEqSearcher { haystack, char_eq: self.0, char_indices: haystack.char_indices() }
590 591 592
    }
}

593
unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> {
594 595
    #[inline]
    fn haystack(&self) -> &'a str {
596
        self.haystack
597 598 599
    }

    #[inline]
600 601 602 603
    fn next(&mut self) -> SearchStep {
        let s = &mut self.char_indices;
        // Compare lengths of the internal byte slice iterator
        // to find length of current char
604
        let pre_len = s.iter.iter.len();
605
        if let Some((i, c)) = s.next() {
606
            let len = s.iter.iter.len();
607 608 609 610 611
            let char_len = pre_len - len;
            if self.char_eq.matches(c) {
                return SearchStep::Match(i, i + char_len);
            } else {
                return SearchStep::Reject(i, i + char_len);
612 613
            }
        }
614
        SearchStep::Done
615 616 617
    }
}

618
unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> {
619
    #[inline]
620 621 622 623
    fn next_back(&mut self) -> SearchStep {
        let s = &mut self.char_indices;
        // Compare lengths of the internal byte slice iterator
        // to find length of current char
624
        let pre_len = s.iter.iter.len();
625
        if let Some((i, c)) = s.next_back() {
626
            let len = s.iter.iter.len();
627 628 629 630 631
            let char_len = pre_len - len;
            if self.char_eq.matches(c) {
                return SearchStep::Match(i, i + char_len);
            } else {
                return SearchStep::Reject(i, i + char_len);
632 633
            }
        }
634
        SearchStep::Done
635 636 637
    }
}

638
impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {}
639

640 641 642 643
/////////////////////////////////////////////////////////////////////////////

// Generates the body of a `Pattern<'a>` impl that delegates to another pattern.
//
// - `$t`: the searcher type exposed as `Self::Searcher`.
// - `$pmap`: maps `self` to the inner pattern that does the work.
// - `$smap`: maps the inner pattern's searcher to `$t`.
//
// The expansion refers to the lifetime `'a`, so it must be used inside an
// `impl<'a, ..> Pattern<'a> for ..` block.
macro_rules! pattern_methods {
    ($t:ty, $pmap:expr, $smap:expr) => {
        type Searcher = $t;

        #[inline]
        fn into_searcher(self, haystack: &'a str) -> $t {
            ($smap)(($pmap)(self).into_searcher(haystack))
        }

        #[inline]
        fn is_contained_in(self, haystack: &'a str) -> bool {
            ($pmap)(self).is_contained_in(haystack)
        }

        #[inline]
        fn is_prefix_of(self, haystack: &'a str) -> bool {
            ($pmap)(self).is_prefix_of(haystack)
        }

        #[inline]
        fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
            ($pmap)(self).strip_prefix_of(haystack)
        }

        #[inline]
        fn is_suffix_of(self, haystack: &'a str) -> bool
        where
            $t: ReverseSearcher<'a>,
        {
            ($pmap)(self).is_suffix_of(haystack)
        }

        #[inline]
        fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str>
        where
            $t: ReverseSearcher<'a>,
        {
            ($pmap)(self).strip_suffix_of(haystack)
        }
    };
}

// Generates the methods of a `Searcher<'a>` (`forward`) or
// `ReverseSearcher<'a>` (`reverse`) impl for a newtype wrapper by forwarding
// every call to the wrapped searcher in `self.0`.
//
// The `forward` arm refers to the lifetime `'a`, so it must be used inside an
// impl block that declares it.
macro_rules! searcher_methods {
    (forward) => {
        #[inline]
        fn haystack(&self) -> &'a str {
            self.0.haystack()
        }
        #[inline]
        fn next(&mut self) -> SearchStep {
            self.0.next()
        }
        #[inline]
        fn next_match(&mut self) -> Option<(usize, usize)> {
            self.0.next_match()
        }
        #[inline]
        fn next_reject(&mut self) -> Option<(usize, usize)> {
            self.0.next_reject()
        }
    };
    (reverse) => {
        #[inline]
        fn next_back(&mut self) -> SearchStep {
            self.0.next_back()
        }
        #[inline]
        fn next_match_back(&mut self) -> Option<(usize, usize)> {
            self.0.next_match_back()
        }
        #[inline]
        fn next_reject_back(&mut self) -> Option<(usize, usize)> {
            self.0.next_reject_back()
        }
    };
}

/////////////////////////////////////////////////////////////////////////////
// Impl for &[char]
/////////////////////////////////////////////////////////////////////////////

// Todo: Change / Remove due to ambiguity in meaning.

/// Associated type for `<&[char] as Pattern<'a>>::Searcher`.
726
#[derive(Clone, Debug)]
727
pub struct CharSliceSearcher<'a, 'b>(<MultiCharEqPattern<&'b [char]> as Pattern<'a>>::Searcher);
728 729

unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> {
730
    searcher_methods!(forward);
731
}
732

733
unsafe impl<'a, 'b> ReverseSearcher<'a> for CharSliceSearcher<'a, 'b> {
734
    searcher_methods!(reverse);
735 736
}

737
impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {}
738

E
Eric Huss 已提交
739
/// Searches for chars that are equal to any of the chars in the slice.
I
Ivan Tham 已提交
740 741 742 743 744 745 746
///
/// # Examples
///
/// ```
/// assert_eq!("Hello world".find(&['l', 'l'] as &[_]), Some(2));
/// assert_eq!("Hello world".find(&['l', 'l'][..]), Some(2));
/// ```
747
impl<'a, 'b> Pattern<'a> for &'b [char] {
748
    pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher);
749 750
}

751 752 753 754 755 756
/////////////////////////////////////////////////////////////////////////////
// Impl for F: FnMut(char) -> bool
/////////////////////////////////////////////////////////////////////////////

/// Associated type for `<F as Pattern<'a>>::Searcher`.
#[derive(Clone)]
757
pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<'a>>::Searcher)
M
Mark Rousskov 已提交
758 759
where
    F: FnMut(char) -> bool;
760

761
impl<F> fmt::Debug for CharPredicateSearcher<'_, F>
M
Mark Rousskov 已提交
762 763
where
    F: FnMut(char) -> bool,
764
{
M
Mazdak Farrokhzad 已提交
765
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
766 767 768 769 770 771
        f.debug_struct("CharPredicateSearcher")
            .field("haystack", &self.0.haystack)
            .field("char_indices", &self.0.char_indices)
            .finish()
    }
}
772
unsafe impl<'a, F> Searcher<'a> for CharPredicateSearcher<'a, F>
M
Mark Rousskov 已提交
773 774
where
    F: FnMut(char) -> bool,
775
{
776
    searcher_methods!(forward);
777
}
778 779

unsafe impl<'a, F> ReverseSearcher<'a> for CharPredicateSearcher<'a, F>
M
Mark Rousskov 已提交
780 781
where
    F: FnMut(char) -> bool,
782
{
783
    searcher_methods!(reverse);
784 785
}

M
Mark Rousskov 已提交
786
impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool {}
787

I
Ivan Tham 已提交
788
/// Searches for chars that match the given predicate.
I
Ivan Tham 已提交
789 790 791 792 793 794 795
///
/// # Examples
///
/// ```
/// assert_eq!("Hello world".find(char::is_uppercase), Some(0));
/// assert_eq!("Hello world".find(|c| "aeiou".contains(c)), Some(1));
/// ```
M
Mark Rousskov 已提交
796 797 798 799
impl<'a, F> Pattern<'a> for F
where
    F: FnMut(char) -> bool,
{
800
    pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher);
801
}
802 803 804 805 806 807

/////////////////////////////////////////////////////////////////////////////
// Impl for &&str
/////////////////////////////////////////////////////////////////////////////

/// Delegates to the `&str` impl.
808
impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str {
809 810
    pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s);
}
811

/////////////////////////////////////////////////////////////////////////////
// Impl for &str
/////////////////////////////////////////////////////////////////////////////

/// Non-allocating substring search.
///
/// Will handle the pattern `""` as returning empty matches at each character
/// boundary.
I
Ivan Tham 已提交
820 821 822 823 824 825
///
/// # Examples
///
/// ```
/// assert_eq!("Hello world".find("world"), Some(6));
/// ```
826 827 828 829 830 831 832 833
impl<'a, 'b> Pattern<'a> for &'b str {
    type Searcher = StrSearcher<'a, 'b>;

    #[inline]
    fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> {
        StrSearcher::new(haystack, self)
    }

I
Ivan Tham 已提交
834
    /// Checks whether the pattern matches at the front of the haystack.
835 836
    #[inline]
    fn is_prefix_of(self, haystack: &'a str) -> bool {
837
        haystack.as_bytes().starts_with(self.as_bytes())
838 839
    }

840 841 842 843 844 845 846 847 848 849 850
    /// Removes the pattern from the front of haystack, if it matches.
    #[inline]
    fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
        if self.is_prefix_of(haystack) {
            // SAFETY: prefix was just verified to exist.
            unsafe { Some(haystack.get_unchecked(self.as_bytes().len()..)) }
        } else {
            None
        }
    }

I
Ivan Tham 已提交
851
    /// Checks whether the pattern matches at the back of the haystack.
852 853
    #[inline]
    fn is_suffix_of(self, haystack: &'a str) -> bool {
854
        haystack.as_bytes().ends_with(self.as_bytes())
855
    }
856 857 858 859 860 861 862 863 864 865 866 867

    /// Removes the pattern from the back of haystack, if it matches.
    #[inline]
    fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> {
        if self.is_suffix_of(haystack) {
            let i = haystack.len() - self.as_bytes().len();
            // SAFETY: suffix was just verified to exist.
            unsafe { Some(haystack.get_unchecked(..i)) }
        } else {
            None
        }
    }
868 869
}

/////////////////////////////////////////////////////////////////////////////
// Two Way substring searcher
/////////////////////////////////////////////////////////////////////////////

#[derive(Clone, Debug)]
/// Associated type for `<&str as Pattern<'a>>::Searcher`.
pub struct StrSearcher<'a, 'b> {
    // The string being searched.
    haystack: &'a str,
    // The substring being searched for.
    needle: &'b str,

    // Search state: the empty-needle state machine when `needle` is empty,
    // otherwise the Two-Way searcher (see `StrSearcher::new`).
    searcher: StrSearcherImpl,
}

#[derive(Clone, Debug)]
enum StrSearcherImpl {
    Empty(EmptyNeedle),
886
    TwoWay(TwoWaySearcher),
887 888 889 890 891 892 893 894 895 896 897 898 899 900
}

// State for searching with an empty needle: the searcher alternates between
// reporting an empty match at the cursor and rejecting the next char.
#[derive(Clone, Debug)]
struct EmptyNeedle {
    // Forward cursor (byte index into the haystack), advanced by `next()`.
    position: usize,
    // Backward cursor (byte index into the haystack), moved down by `next_back()`.
    end: usize,
    // Whether the next forward step reports a match; toggled on every step.
    is_match_fw: bool,
    // Whether the next backward step reports a match; toggled on every step.
    is_match_bw: bool,
}

impl<'a, 'b> StrSearcher<'a, 'b> {
    fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> {
        if needle.is_empty() {
            StrSearcher {
901 902
                haystack,
                needle,
903 904 905 906 907 908 909 910 911
                searcher: StrSearcherImpl::Empty(EmptyNeedle {
                    position: 0,
                    end: haystack.len(),
                    is_match_fw: true,
                    is_match_bw: true,
                }),
            }
        } else {
            StrSearcher {
912 913
                haystack,
                needle,
M
Mark Rousskov 已提交
914 915 916 917
                searcher: StrSearcherImpl::TwoWay(TwoWaySearcher::new(
                    needle.as_bytes(),
                    haystack.len(),
                )),
918 919 920 921 922 923
            }
        }
    }
}

unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
924 925 926 927
    #[inline]
    fn haystack(&self) -> &'a str {
        self.haystack
    }
928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945

    #[inline]
    fn next(&mut self) -> SearchStep {
        match self.searcher {
            StrSearcherImpl::Empty(ref mut searcher) => {
                // empty needle rejects every char and matches every empty string between them
                let is_match = searcher.is_match_fw;
                searcher.is_match_fw = !searcher.is_match_fw;
                let pos = searcher.position;
                match self.haystack[pos..].chars().next() {
                    _ if is_match => SearchStep::Match(pos, pos),
                    None => SearchStep::Done,
                    Some(ch) => {
                        searcher.position += ch.len_utf8();
                        SearchStep::Reject(pos, searcher.position)
                    }
                }
            }
946
            StrSearcherImpl::TwoWay(ref mut searcher) => {
947 948 949
                // TwoWaySearcher produces valid *Match* indices that split at char boundaries
                // as long as it does correct matching and that haystack and needle are
                // valid UTF-8
950 951 952 953
                // *Rejects* from the algorithm can fall on any indices, but we will walk them
                // manually to the next character boundary, so that they are utf-8 safe.
                if searcher.position == self.haystack.len() {
                    return SearchStep::Done;
954 955
                }
                let is_long = searcher.memory == usize::MAX;
M
Mark Rousskov 已提交
956 957 958 959 960
                match searcher.next::<RejectAndMatch>(
                    self.haystack.as_bytes(),
                    self.needle.as_bytes(),
                    is_long,
                ) {
961 962 963 964
                    SearchStep::Reject(a, mut b) => {
                        // skip to next char boundary
                        while !self.haystack.is_char_boundary(b) {
                            b += 1;
965
                        }
966 967
                        searcher.position = cmp::max(b, searcher.position);
                        SearchStep::Reject(a, b)
968
                    }
969
                    otherwise => otherwise,
970 971 972 973 974
                }
            }
        }
    }

975
    #[inline]
976 977
    fn next_match(&mut self) -> Option<(usize, usize)> {
        match self.searcher {
M
Mark Rousskov 已提交
978 979 980 981 982
            StrSearcherImpl::Empty(..) => loop {
                match self.next() {
                    SearchStep::Match(a, b) => return Some((a, b)),
                    SearchStep::Done => return None,
                    SearchStep::Reject(..) => {}
983
                }
M
Mark Rousskov 已提交
984
            },
985 986
            StrSearcherImpl::TwoWay(ref mut searcher) => {
                let is_long = searcher.memory == usize::MAX;
987 988
                // write out `true` and `false` cases to encourage the compiler
                // to specialize the two cases separately.
989
                if is_long {
M
Mark Rousskov 已提交
990 991 992 993 994
                    searcher.next::<MatchOnly>(
                        self.haystack.as_bytes(),
                        self.needle.as_bytes(),
                        true,
                    )
995
                } else {
M
Mark Rousskov 已提交
996 997 998 999 1000
                    searcher.next::<MatchOnly>(
                        self.haystack.as_bytes(),
                        self.needle.as_bytes(),
                        false,
                    )
1001
                }
1002 1003 1004
            }
        }
    }
1005
}
1006

1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023
unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
    #[inline]
    fn next_back(&mut self) -> SearchStep {
        match self.searcher {
            StrSearcherImpl::Empty(ref mut searcher) => {
                let is_match = searcher.is_match_bw;
                searcher.is_match_bw = !searcher.is_match_bw;
                let end = searcher.end;
                match self.haystack[..end].chars().next_back() {
                    _ if is_match => SearchStep::Match(end, end),
                    None => SearchStep::Done,
                    Some(ch) => {
                        searcher.end -= ch.len_utf8();
                        SearchStep::Reject(searcher.end, end)
                    }
                }
            }
1024 1025 1026
            StrSearcherImpl::TwoWay(ref mut searcher) => {
                if searcher.end == 0 {
                    return SearchStep::Done;
1027
                }
1028
                let is_long = searcher.memory == usize::MAX;
M
Mark Rousskov 已提交
1029 1030 1031 1032 1033
                match searcher.next_back::<RejectAndMatch>(
                    self.haystack.as_bytes(),
                    self.needle.as_bytes(),
                    is_long,
                ) {
1034 1035 1036 1037
                    SearchStep::Reject(mut a, b) => {
                        // skip to next char boundary
                        while !self.haystack.is_char_boundary(a) {
                            a -= 1;
1038
                        }
1039 1040
                        searcher.end = cmp::min(a, searcher.end);
                        SearchStep::Reject(a, b)
1041
                    }
1042
                    otherwise => otherwise,
1043 1044 1045 1046
                }
            }
        }
    }
1047 1048 1049 1050

    #[inline]
    fn next_match_back(&mut self) -> Option<(usize, usize)> {
        match self.searcher {
M
Mark Rousskov 已提交
1051 1052 1053 1054 1055
            StrSearcherImpl::Empty(..) => loop {
                match self.next_back() {
                    SearchStep::Match(a, b) => return Some((a, b)),
                    SearchStep::Done => return None,
                    SearchStep::Reject(..) => {}
1056
                }
M
Mark Rousskov 已提交
1057
            },
1058
            StrSearcherImpl::TwoWay(ref mut searcher) => {
1059
                let is_long = searcher.memory == usize::MAX;
1060
                // write out `true` and `false`, like `next_match`
1061
                if is_long {
M
Mark Rousskov 已提交
1062 1063 1064 1065 1066
                    searcher.next_back::<MatchOnly>(
                        self.haystack.as_bytes(),
                        self.needle.as_bytes(),
                        true,
                    )
1067
                } else {
M
Mark Rousskov 已提交
1068 1069 1070 1071 1072
                    searcher.next_back::<MatchOnly>(
                        self.haystack.as_bytes(),
                        self.needle.as_bytes(),
                        false,
                    )
1073
                }
1074 1075 1076
            }
        }
    }
1077 1078
}

/// The internal state of the two-way substring search algorithm.
#[derive(Clone, Debug)]
struct TwoWaySearcher {
    // constants
    /// critical factorization index
    crit_pos: usize,
    /// critical factorization index for reversed needle
    crit_pos_back: usize,
    /// period of the needle: exact in the short-period case, a lower-bound
    /// approximation in the long-period case (see `TwoWaySearcher::new`)
    period: usize,
    /// `byteset` is an extension (not part of the two way algorithm);
    /// it's a 64-bit "fingerprint" where each set bit `j` corresponds
    /// to a (byte & 63) == j present in the needle.
    byteset: u64,

    // variables
    /// cursor used by the forward search (`next`)
    position: usize,
    /// cursor used by the reverse search (`next_back`)
    end: usize,
    /// index into needle before which we have already matched;
    /// `usize::MAX` signals the long-period case
    memory: usize,
    /// index into needle after which we have already matched
    memory_back: usize,
}

/*
    This is the Two-Way search algorithm, which was introduced in the paper:
    Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.

    Here's some background information.

    A *word* is a string of symbols. The *length* of a word should be a familiar
    notion, and here we denote it for any word x by |x|.
    (We also allow for the possibility of the *empty word*, a word of length zero).

    If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
    *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
    For example, both 1 and 2 are periods for the string "aa". As another example,
    the only period of the string "abcd" is 4.

    We denote by period(x) the *smallest* period of x (provided that x is non-empty).
    This is always well-defined since every non-empty word x has at least one period,
    |x|. We sometimes call this *the period* of x.

    If u, v and x are words such that x = uv, where uv is the concatenation of u and
    v, then we say that (u, v) is a *factorization* of x.

    Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
    that both of the following hold

      - either w is a suffix of u or u is a suffix of w
      - either w is a prefix of v or v is a prefix of w

    then w is said to be a *repetition* for the factorization (u, v).

    Just to unpack this, there are four possibilities here. Let w = "abc". Then we
    might have:

      - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
      - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
      - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
      - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")

    Note that the word vu is a repetition for any factorization (u,v) of x = uv,
    so every factorization has at least one repetition.

    If x is a string and (u, v) is a factorization for x, then a *local period* for
    (u, v) is an integer r such that there is some word w such that |w| = r and w is
    a repetition for (u, v).

    We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
    call this *the local period* of (u, v). Provided that x = uv is non-empty, this
    is well-defined (because each non-empty word has at least one factorization, as
    noted above).

    It can be proven that the following is an equivalent definition of a local period
    for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
    all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
    defined. (i.e., i >= 0 and i + r < |x|).

    Using the above reformulation, it is easy to prove that

        1 <= local_period(u, v) <= period(uv)

    A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
    *critical factorization*.

    The algorithm hinges on the following theorem, which is stated without proof:

    **Critical Factorization Theorem** Any word x has at least one critical
    factorization (u, v) such that |u| < period(x).

    The purpose of maximal_suffix is to find such a critical factorization.

    If the period is short, compute another factorization x = u' v' to use
    for reverse search, chosen instead so that |v'| < period(x).
*/
impl TwoWaySearcher {
    fn new(needle: &[u8], end: usize) -> TwoWaySearcher {
        let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false);
        let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true);

M
Mark Rousskov 已提交
1180 1181 1182 1183 1184
        let (crit_pos, period) = if crit_pos_false > crit_pos_true {
            (crit_pos_false, period_false)
        } else {
            (crit_pos_true, period_true)
        };
1185 1186 1187 1188 1189 1190 1191 1192 1193 1194

        // A particularly readable explanation of what's going on here can be found
        // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
        // see the code for "Algorithm CP" on p. 323.
        //
        // What's going on is we have some critical factorization (u, v) of the
        // needle, and we want to determine whether u is a suffix of
        // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
        // "Algorithm CP2", which is optimized for when the period of the needle
        // is large.
1195
        if needle[..crit_pos] == needle[period..period + crit_pos] {
1196 1197 1198 1199 1200 1201 1202 1203 1204
            // short period case -- the period is exact
            // compute a separate critical factorization for the reversed needle
            // x = u' v' where |v'| < period(x).
            //
            // This is sped up by the period being known already.
            // Note that a case like x = "acba" may be factored exactly forwards
            // (crit_pos = 1, period = 3) while being factored with approximate
            // period in reverse (crit_pos = 2, period = 2). We use the given
            // reverse factorization but keep the exact period.
M
Mark Rousskov 已提交
1205 1206 1207 1208 1209
            let crit_pos_back = needle.len()
                - cmp::max(
                    TwoWaySearcher::reverse_maximal_suffix(needle, period, false),
                    TwoWaySearcher::reverse_maximal_suffix(needle, period, true),
                );
1210

1211
            TwoWaySearcher {
1212 1213 1214
                crit_pos,
                crit_pos_back,
                period,
1215
                byteset: Self::byteset_create(&needle[..period]),
1216 1217

                position: 0,
1218
                end,
1219 1220
                memory: 0,
                memory_back: needle.len(),
1221 1222
            }
        } else {
1223 1224
            // long period case -- we have an approximation to the actual period,
            // and don't use memorization.
1225 1226 1227 1228
            //
            // Approximate the period by lower bound max(|u|, |v|) + 1.
            // The critical factorization is efficient to use for both forward and
            // reverse search.
1229

1230
            TwoWaySearcher {
1231
                crit_pos,
1232
                crit_pos_back: crit_pos,
1233
                period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
1234
                byteset: Self::byteset_create(needle),
1235 1236

                position: 0,
1237
                end,
1238 1239
                memory: usize::MAX, // Dummy value to signify that the period is long
                memory_back: usize::MAX,
1240 1241 1242 1243
            }
        }
    }

1244 1245 1246 1247 1248
    #[inline]
    fn byteset_create(bytes: &[u8]) -> u64 {
        bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a)
    }

1249
    #[inline]
1250 1251 1252 1253 1254 1255 1256 1257 1258
    fn byteset_contains(&self, byte: u8) -> bool {
        (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0
    }

    // One of the main ideas of Two-Way is that we factorize the needle into
    // two halves, (u, v), and begin trying to find v in the haystack by scanning
    // left to right. If v matches, we try to match u by scanning right to left.
    // How far we can jump when we encounter a mismatch is all based on the fact
    // that (u, v) is a critical factorization for the needle.
1259
    #[inline]
M
Mark Rousskov 已提交
1260 1261 1262
    fn next<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output
    where
        S: TwoWayStrategy,
1263
    {
1264
        // `next()` uses `self.position` as its cursor
1265
        let old_pos = self.position;
1266
        let needle_last = needle.len() - 1;
1267 1268
        'search: loop {
            // Check that we have room to search in
1269 1270 1271 1272 1273 1274 1275 1276 1277
            // position + needle_last can not overflow if we assume slices
            // are bounded by isize's range.
            let tail_byte = match haystack.get(self.position + needle_last) {
                Some(&b) => b,
                None => {
                    self.position = haystack.len();
                    return S::rejecting(old_pos, self.position);
                }
            };
1278 1279 1280

            if S::use_early_reject() && old_pos != self.position {
                return S::rejecting(old_pos, self.position);
1281 1282 1283
            }

            // Quickly skip by large portions unrelated to our substring
1284
            if !self.byteset_contains(tail_byte) {
1285 1286 1287 1288 1289 1290 1291 1292
                self.position += needle.len();
                if !long_period {
                    self.memory = 0;
                }
                continue 'search;
            }

            // See if the right part of the needle matches
M
Mark Rousskov 已提交
1293 1294
            let start =
                if long_period { self.crit_pos } else { cmp::max(self.crit_pos, self.memory) };
1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325
            for i in start..needle.len() {
                if needle[i] != haystack[self.position + i] {
                    self.position += i - self.crit_pos + 1;
                    if !long_period {
                        self.memory = 0;
                    }
                    continue 'search;
                }
            }

            // See if the left part of the needle matches
            let start = if long_period { 0 } else { self.memory };
            for i in (start..self.crit_pos).rev() {
                if needle[i] != haystack[self.position + i] {
                    self.position += self.period;
                    if !long_period {
                        self.memory = needle.len() - self.period;
                    }
                    continue 'search;
                }
            }

            // We have found a match!
            let match_pos = self.position;

            // Note: add self.period instead of needle.len() to have overlapping matches
            self.position += needle.len();
            if !long_period {
                self.memory = 0; // set to needle.len() - self.period for overlapping matches
            }

1326
            return S::matching(match_pos, match_pos + needle.len());
1327 1328 1329 1330 1331
        }
    }

    // Follows the ideas in `next()`.
    //
1332
    // The definitions are symmetrical, with period(x) = period(reverse(x))
1333
    // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
1334 1335
    // is a critical factorization, so is (reverse(v), reverse(u)).
    //
1336 1337 1338
    // For the reverse case we have computed a critical factorization x = u' v'
    // (field `crit_pos_back`). We need |u| < period(x) for the forward case and
    // thus |v'| < period(x) for the reverse.
1339 1340
    //
    // To search in reverse through the haystack, we search forward through
1341
    // a reversed haystack with a reversed needle, matching first u' and then v'.
1342
    #[inline]
M
Mark Rousskov 已提交
1343 1344 1345
    fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output
    where
        S: TwoWayStrategy,
1346
    {
1347 1348
        // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()`
        // are independent.
1349
        let old_end = self.end;
1350 1351
        'search: loop {
            // Check that we have room to search in
1352 1353 1354 1355 1356 1357 1358 1359 1360 1361
            // end - needle.len() will wrap around when there is no more room,
            // but due to slice length limits it can never wrap all the way back
            // into the length of haystack.
            let front_byte = match haystack.get(self.end.wrapping_sub(needle.len())) {
                Some(&b) => b,
                None => {
                    self.end = 0;
                    return S::rejecting(0, old_end);
                }
            };
1362 1363 1364

            if S::use_early_reject() && old_end != self.end {
                return S::rejecting(self.end, old_end);
1365 1366 1367
            }

            // Quickly skip by large portions unrelated to our substring
1368
            if !self.byteset_contains(front_byte) {
1369
                self.end -= needle.len();
1370 1371 1372
                if !long_period {
                    self.memory_back = needle.len();
                }
1373 1374 1375 1376
                continue 'search;
            }

            // See if the left part of the needle matches
M
Mark Rousskov 已提交
1377 1378 1379 1380 1381
            let crit = if long_period {
                self.crit_pos_back
            } else {
                cmp::min(self.crit_pos_back, self.memory_back)
            };
1382
            for i in (0..crit).rev() {
1383
                if needle[i] != haystack[self.end - needle.len() + i] {
1384 1385 1386 1387
                    self.end -= self.crit_pos_back - i;
                    if !long_period {
                        self.memory_back = needle.len();
                    }
1388 1389 1390 1391 1392
                    continue 'search;
                }
            }

            // See if the right part of the needle matches
M
Mark Rousskov 已提交
1393
            let needle_end = if long_period { needle.len() } else { self.memory_back };
1394
            for i in self.crit_pos_back..needle_end {
1395 1396
                if needle[i] != haystack[self.end - needle.len() + i] {
                    self.end -= self.period;
1397 1398 1399
                    if !long_period {
                        self.memory_back = self.period;
                    }
1400 1401 1402 1403 1404 1405 1406 1407
                    continue 'search;
                }
            }

            // We have found a match!
            let match_pos = self.end - needle.len();
            // Note: sub self.period instead of needle.len() to have overlapping matches
            self.end -= needle.len();
1408 1409 1410
            if !long_period {
                self.memory_back = needle.len();
            }
1411

1412
            return S::matching(match_pos, match_pos + needle.len());
1413 1414 1415
        }
    }

1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427
    // Compute the maximal suffix of `arr`.
    //
    // The maximal suffix is a possible critical factorization (u, v) of `arr`.
    //
    // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
    // period of v.
    //
    // `order_greater` determines if lexical order is `<` or `>`. Both
    // orders must be computed -- the ordering with the largest `i` gives
    // a critical factorization.
    //
    // For long period cases, the resulting period is not exact (it is too short).
1428
    #[inline]
1429 1430 1431
    fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) {
        let mut left = 0; // Corresponds to i in the paper
        let mut right = 1; // Corresponds to j in the paper
1432
        let mut offset = 0; // Corresponds to k in the paper, but starting at 0
M
Mark Rousskov 已提交
1433
        // to match 0-based indexing.
1434 1435
        let mut period = 1; // Corresponds to p in the paper

1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451
        while let Some(&a) = arr.get(right + offset) {
            // `left` will be inbounds when `right` is.
            let b = arr[left + offset];
            if (a < b && !order_greater) || (a > b && order_greater) {
                // Suffix is smaller, period is entire prefix so far.
                right += offset + 1;
                offset = 0;
                period = right - left;
            } else if a == b {
                // Advance through repetition of the current period.
                if offset + 1 == period {
                    right += offset + 1;
                    offset = 0;
                } else {
                    offset += 1;
                }
1452
            } else {
1453 1454 1455 1456 1457
                // Suffix is larger, start over from current location.
                left = right;
                right += 1;
                offset = 0;
                period = 1;
1458
            }
1459 1460 1461 1462 1463 1464 1465 1466 1467
        }
        (left, period)
    }

    // Compute the maximal suffix of the reverse of `arr`.
    //
    // The maximal suffix is a possible critical factorization (u', v') of `arr`.
    //
    // Returns `i` where `i` is the starting index of v', from the back;
B
Bruce Mitchener 已提交
1468
    // returns immediately when a period of `known_period` is reached.
1469 1470 1471 1472 1473 1474
    //
    // `order_greater` determines if lexical order is `<` or `>`. Both
    // orders must be computed -- the ordering with the largest `i` gives
    // a critical factorization.
    //
    // For long period cases, the resulting period is not exact (it is too short).
M
Mark Rousskov 已提交
1475
    fn reverse_maximal_suffix(arr: &[u8], known_period: usize, order_greater: bool) -> usize {
1476 1477
        let mut left = 0; // Corresponds to i in the paper
        let mut right = 1; // Corresponds to j in the paper
1478
        let mut offset = 0; // Corresponds to k in the paper, but starting at 0
M
Mark Rousskov 已提交
1479
        // to match 0-based indexing.
1480 1481 1482 1483 1484 1485 1486
        let mut period = 1; // Corresponds to p in the paper
        let n = arr.len();

        while right + offset < n {
            let a = arr[n - (1 + right + offset)];
            let b = arr[n - (1 + left + offset)];
            if (a < b && !order_greater) || (a > b && order_greater) {
1487
                // Suffix is smaller, period is entire prefix so far.
1488 1489 1490
                right += offset + 1;
                offset = 0;
                period = right - left;
1491 1492
            } else if a == b {
                // Advance through repetition of the current period.
1493 1494 1495
                if offset + 1 == period {
                    right += offset + 1;
                    offset = 0;
1496 1497 1498 1499 1500 1501 1502
                } else {
                    offset += 1;
                }
            } else {
                // Suffix is larger, start over from current location.
                left = right;
                right += 1;
1503
                offset = 0;
1504 1505
                period = 1;
            }
1506 1507 1508
            if period == known_period {
                break;
            }
1509
        }
1510 1511
        debug_assert!(period <= known_period);
        left
1512 1513
    }
}
1514 1515 1516 1517 1518 1519

// TwoWayStrategy allows the algorithm to either skip non-matches as quickly
// as possible, or to work in a mode where it emits Rejects relatively quickly.
trait TwoWayStrategy {
    /// Value returned by the search step built with this strategy.
    type Output;
    /// Whether the search should return a reject as soon as its cursor has moved.
    fn use_early_reject() -> bool;
    /// Builds the output for a rejected byte range `a..b`.
    fn rejecting(a: usize, b: usize) -> Self::Output;
    /// Builds the output for a matched byte range `a..b`.
    fn matching(a: usize, b: usize) -> Self::Output;
}

/// Skip to match intervals as quickly as possible
///
/// Uninhabited type: it is only used as a type-level strategy parameter.
enum MatchOnly {}

impl TwoWayStrategy for MatchOnly {
    type Output = Option<(usize, usize)>;

    /// Rejects are never reported early; the search runs on until a match
    /// or the end of the haystack.
    #[inline]
    fn use_early_reject() -> bool {
        false
    }
    /// A rejected range carries no information in this mode.
    #[inline]
    fn rejecting(_a: usize, _b: usize) -> Self::Output {
        None
    }
    /// Reports the matched byte range `a..b`.
    #[inline]
    fn matching(a: usize, b: usize) -> Self::Output {
        Some((a, b))
    }
}

/// Emit Rejects regularly
///
/// Uninhabited type: it is only used as a type-level strategy parameter.
enum RejectAndMatch {}

impl TwoWayStrategy for RejectAndMatch {
    type Output = SearchStep;

    #[inline]
M
Mark Rousskov 已提交
1551 1552 1553
    fn use_early_reject() -> bool {
        true
    }
1554
    #[inline]
M
Mark Rousskov 已提交
1555 1556 1557
    fn rejecting(a: usize, b: usize) -> Self::Output {
        SearchStep::Reject(a, b)
    }
1558
    #[inline]
M
Mark Rousskov 已提交
1559 1560 1561
    fn matching(a: usize, b: usize) -> Self::Output {
        SearchStep::Match(a, b)
    }
1562
}