123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515 |
- //! The matcher: Can find substrings in a string that match any compiled regex
- #[cfg(feature = "no_std")]
- use std::prelude::*;
- use compile::{Token, Range};
- use std::borrow::Cow;
- use std::fmt;
- use std::rc::Rc;
- /// A regex matcher, ready to match stuff
- #[derive(Clone)]
- pub struct PosixRegex {
- pub(crate) search: Vec<Vec<(Token, Range)>>
- }
- impl PosixRegex {
- /// Match the string starting at the current position. This does not find
- /// substrings.
- pub fn matches_exact(self, input: &[u8]) -> Option<PosixRegexResult> {
- let mut groups = Vec::new();
- let mut matcher = PosixRegexMatcher {
- input,
- offset: 0,
- groups: &mut groups
- };
- let start = matcher.offset;
- if !matcher.matches_exact(self.search.iter().filter_map(|tokens| Branch::new(tokens)).collect()) {
- return None;
- }
- let end = matcher.offset;
- Some(PosixRegexResult {
- start,
- end,
- groups
- })
- }
- }
- #[derive(Clone)]
- struct Branch<'a> {
- index: usize,
- repeated: u32,
- tokens: &'a [(Token, Range)],
- path: Box<[(usize, usize)]>,
- group_ids: Rc<[usize]>,
- repeat_min: u32,
- repeat_max: Option<u32>,
- next: Option<Rc<Branch<'a>>>
- }
- impl<'a> fmt::Debug for Branch<'a> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- write!(f, "{:?}", self.get_token())
- }
- }
- impl<'a> Branch<'a> {
- fn new(tokens: &'a [(Token, Range)]) -> Option<Self> {
- if tokens.is_empty() {
- return None;
- }
- Some(Self {
- index: 0,
- repeated: 0,
- tokens: tokens,
- path: Vec::new().into(),
- group_ids: Vec::new().into(),
- repeat_min: 0,
- repeat_max: Some(0),
- next: None
- })
- }
- fn group(
- path: Box<[(usize, usize)]>,
- group_ids: Rc<[usize]>,
- tokens: &'a [(Token, Range)],
- range: Range,
- next: Option<Branch<'a>>
- ) -> Option<Self> {
- if tokens.is_empty() {
- return None;
- }
- Some(Self {
- index: 0,
- repeated: 0,
- tokens,
- path,
- group_ids,
- repeat_min: range.0.saturating_sub(1),
- repeat_max: range.1.map(|i| i.saturating_sub(1)),
- next: next.map(Rc::new)
- })
- }
- fn parent_tokens(&self) -> &[(Token, Range)] {
- let mut tokens = self.tokens;
- let len = self.path.len();
- if len > 0 {
- for &(index, variant) in &self.path[..len-1] {
- match tokens[index] {
- (Token::Group(ref inner), _) => tokens = &inner[variant],
- _ => panic!("non-group index in path")
- }
- }
- }
- tokens
- }
- fn tokens(&self) -> &[(Token, Range)] {
- let mut tokens = self.parent_tokens();
- if let Some(&(index, variant)) = self.path.last() {
- match tokens[index] {
- (Token::Group(ref inner), _) => tokens = &inner[variant],
- _ => panic!("non-group index in path")
- }
- }
- tokens
- }
- fn get_token(&self) -> &(Token, Range) {
- &self.tokens()[self.index]
- }
- fn next_branch(&self) -> Option<Self> {
- if self.repeat_min > 0 {
- // Don't add the next branch until we've repeated this one enough
- return None;
- }
- if self.index + 1 >= self.tokens().len() {
- return self.next.as_ref().map(|inner| (**inner).clone());
- }
- Some(Self {
- index: self.index + 1,
- repeated: 0,
- ..self.clone()
- })
- }
- fn add_repeats(&self, branches: &mut Vec<Branch<'a>>) {
- if self.repeat_max.map(|max| max == 0).unwrap_or(false) {
- return;
- }
- let tokens = self.parent_tokens();
- let &(index, _) = self.path.last().expect("add_repeats called on top level");
- match tokens[index] {
- (Token::Group(ref repeats), _) => {
- for alternative in 0..repeats.len() {
- let mut path = self.path.clone();
- path.last_mut().unwrap().1 = alternative;
- branches.push(Self {
- index: 0,
- path,
- repeated: 0,
- repeat_min: self.repeat_min.saturating_sub(1),
- repeat_max: self.repeat_max.map(|max| max - 1),
- ..self.clone()
- });
- }
- },
- _ => panic!("non-group index in path")
- }
- }
- /// Returns if this node is "explored" enough times,
- /// meaning it has repeated as many times as it want to and has nowhere to go next.
- fn is_explored(&self) -> bool {
- let mut branch = Cow::Borrowed(self);
- loop {
- if branch.repeat_min > 0 {
- // Group did not repeat enough times!
- return false;
- }
- let (_, Range(min, _)) = *branch.get_token();
- if branch.repeated < min {
- return false;
- }
- match branch.next_branch() {
- Some(next) => branch = Cow::Owned(next),
- None => break
- }
- }
- true
- }
- }
- struct PosixRegexMatcher<'a> {
- input: &'a [u8],
- offset: usize,
- groups: &'a mut Vec<(usize, usize)>
- }
- impl<'a> PosixRegexMatcher<'a> {
- fn expand<'b>(&mut self, branches: &[Branch<'b>]) -> Vec<Branch<'b>> {
- let mut insert = Vec::new();
- for branch in branches {
- let (ref token, range) = *branch.get_token();
- if let Token::Group(ref inner) = token {
- // Push the group's inner content as a new branch
- let group_id = self.groups.len();
- self.groups.push((self.offset, 0));
- let mut ids = Vec::with_capacity(branch.group_ids.len() + 1);
- ids.extend(&*branch.group_ids);
- ids.push(group_id);
- let ids = ids.into();
- for alternation in 0..inner.len() {
- let mut path = Vec::with_capacity(branch.path.len() + 1);
- path.extend(&*branch.path);
- path.push((branch.index, alternation));
- if let Some(branch) = Branch::group(
- path.into(),
- Rc::clone(&ids),
- branch.tokens,
- range,
- branch.next_branch()
- ) {
- insert.push(branch);
- }
- }
- }
- if branch.repeated >= range.0 {
- // Push the next element as a new branch
- if let Some(next) = branch.next_branch() {
- insert.push(next);
- }
- branch.add_repeats(&mut insert);
- }
- }
- if !insert.is_empty() {
- // Resolve recursively
- let mut new = self.expand(&insert);
- insert.append(&mut new);
- }
- insert
- }
- fn matches_exact(&mut self, mut branches: Vec<Branch>) -> bool {
- // Whether or not any branch, at any point, got fully explored. This
- // means at least one path of the regex successfully completed!
- let mut succeeded = false;
- let mut prev = None;
- loop {
- let next = self.input.get(self.offset).cloned();
- let mut index = 0;
- let mut remove = 0;
- let mut insert = self.expand(&branches);
- branches.append(&mut insert);
- loop {
- if index >= branches.len() {
- break;
- }
- if remove > 0 {
- // Just like Rust's `retain` function, shift all elements I
- // want to keep back and `truncate` when I'm done.
- branches.swap(index, index-remove);
- }
- let branch = &mut branches[index-remove];
- index += 1;
- let (ref token, Range(_, mut max)) = *branch.get_token();
- let mut token = token;
- let mut accepts = true;
- // Step 1: Handle zero-width stuff like ^ and \<
- loop {
- match token {
- Token::Start |
- Token::WordEnd |
- Token::WordStart => {
- // Should be cheap to clone since we already make sure
- // it's a type that doesn't hold any data.
- let original = token.clone();
- // Skip ahead to the next token.
- match branch.next_branch() {
- Some(next) => *branch = next,
- None => break
- }
- let (ref new_token, Range(_, new_max)) = *branch.get_token();
- token = new_token;
- max = new_max;
- accepts = match original {
- Token::Start => self.offset == 0,
- Token::WordEnd => next.map(::ctype::is_word_boundary).unwrap_or(true),
- Token::WordStart => prev.map(::ctype::is_word_boundary).unwrap_or(true),
- _ => unreachable!()
- };
- },
- _ => break
- }
- }
- // Step 2: Check if the token matches
- accepts = accepts && match *token {
- Token::Any => next.is_some(),
- Token::Char(c) => next == Some(c),
- Token::End => next.is_none(),
- Token::Group(_) => false, // <- handled separately
- Token::OneOf { invert, ref list } => if let Some(next) = next {
- list.iter().any(|c| c.matches(next)) == !invert
- } else { false },
- // These will only get called if they are encountered at
- // EOF (because next_branch returns None), for example
- // "abc\>" or "^". Then we simply want to return true as to
- // preserve the current `accepts` status.
- Token::Start |
- Token::WordEnd |
- Token::WordStart => true
- };
- if !accepts || max.map(|max| branch.repeated >= max).unwrap_or(false) {
- succeeded = succeeded || branch.is_explored();
- for &id in &*branch.group_ids {
- self.groups[id].1 = self.offset;
- }
- remove += 1;
- continue;
- }
- branch.repeated += 1;
- }
- let end = branches.len() - remove;
- branches.truncate(end);
- if branches.is_empty() {
- return succeeded;
- }
- if next.is_some() {
- self.offset += 1;
- prev = next;
- }
- }
- }
- }
- /// A single result, including start and end bounds
- #[derive(Debug, Clone, PartialEq, Eq)]
- pub struct PosixRegexResult {
- /// An offset in the input string to where the match started (inclusive)
- pub start: usize,
- /// An offset in the input string to where the match ended (exclusive)
- pub end: usize,
- /// Start/end offsets in the input to where a group matched
- pub groups: Vec<(usize, usize)>
- }
- #[cfg(test)]
- mod tests {
- #[cfg(feature = "bench")]
- extern crate test;
- #[cfg(feature = "bench")]
- use self::test::Bencher;
- use super::*;
- use ::PosixRegexBuilder;
- fn matches_exact(regex: &str, input: &str) -> Option<PosixRegexResult> {
- PosixRegexBuilder::new(regex.as_bytes())
- .with_default_classes()
- .compile()
- .expect("error compiling regex")
- .matches_exact(input.as_bytes())
- }
- #[test]
- fn basic() {
- assert!(matches_exact("abc", "abc").is_some());
- assert!(matches_exact("abc", "bbc").is_none());
- assert!(matches_exact("abc", "acc").is_none());
- assert!(matches_exact("abc", "abd").is_none());
- }
- #[test]
- fn repetitions() {
- assert!(matches_exact("abc*", "ab").is_some());
- assert!(matches_exact("abc*", "abc").is_some());
- assert!(matches_exact("abc*", "abccc").is_some());
- assert!(matches_exact(r"a\{1,2\}b", "b").is_none());
- assert!(matches_exact(r"a\{1,2\}b", "ab").is_some());
- assert!(matches_exact(r"a\{1,2\}b", "aab").is_some());
- assert!(matches_exact(r"a\{1,2\}b", "aaab").is_none());
- }
- #[test]
- fn any() {
- assert!(matches_exact(".*", "").is_some());
- assert!(matches_exact(".*b", "b").is_some());
- assert!(matches_exact(".*b", "ab").is_some());
- assert!(matches_exact(".*b", "aaaaab").is_some());
- assert!(matches_exact(".*b", "HELLO WORLD").is_none());
- assert!(matches_exact(".*b", "HELLO WORLDb").is_some());
- assert!(matches_exact("H.*O WORLD", "HELLO WORLD").is_some());
- assert!(matches_exact("H.*ORLD", "HELLO WORLD").is_some());
- }
- #[test]
- fn brackets() {
- assert!(matches_exact("[abc]*d", "abcd").is_some());
- assert!(matches_exact("[0-9]*d", "1234d").is_some());
- assert!(matches_exact("[[:digit:]]*d", "1234d").is_some());
- assert!(matches_exact("[[:digit:]]*d", "abcd").is_none());
- }
- #[test]
- fn alternations() {
- assert!(matches_exact(r"abc\|bcd", "abc").is_some());
- assert!(matches_exact(r"abc\|bcd", "bcd").is_some());
- assert!(matches_exact(r"abc\|bcd", "cde").is_none());
- assert!(matches_exact(r"[A-Z]\+\|yee", "").is_none());
- assert!(matches_exact(r"[A-Z]\+\|yee", "HELLO").is_some());
- assert!(matches_exact(r"[A-Z]\+\|yee", "yee").is_some());
- assert!(matches_exact(r"[A-Z]\+\|yee", "hello").is_none());
- }
- #[test]
- fn offsets() {
- assert_eq!(
- matches_exact("abc", "abcd"),
- Some(PosixRegexResult { start: 0, end: 3, groups: vec![] })
- );
- assert_eq!(
- matches_exact(r"[[:alpha:]]\+", "abcde12345"),
- Some(PosixRegexResult { start: 0, end: 5, groups: vec![] })
- );
- assert_eq!(
- matches_exact(r"a\(bc\)\+d", "abcbcd"),
- Some(PosixRegexResult { start: 0, end: 6, groups: vec![(1, 5)] })
- );
- assert_eq!(
- matches_exact(r"hello\( \(world\|universe\) :D\)\?!", "hello world :D!"),
- Some(PosixRegexResult { start: 0, end: 15, groups: vec![(5, 14), (6, 11)] })
- );
- assert_eq!(
- matches_exact(r"hello\( \(world\|universe\) :D\)\?", "hello world :D"),
- Some(PosixRegexResult { start: 0, end: 14, groups: vec![(5, 14), (6, 11)] })
- );
- assert_eq!(
- matches_exact(r"\(\<hello\>\) world", "hello world"),
- Some(PosixRegexResult { start: 0, end: 11, groups: vec![(0, 5)] })
- );
- }
- #[test]
- fn start_and_end() {
- assert!(matches_exact("^abc$", "abc").is_some());
- assert!(matches_exact("^bcd", "bcde").is_some());
- assert!(matches_exact("^bcd", "abcd").is_none());
- assert!(matches_exact("abc$", "abc").is_some());
- assert!(matches_exact("abc$", "abcd").is_none());
- assert!(matches_exact(r".*\(^\|a\)c", "c").is_some());
- assert!(matches_exact(r".*\(^\|a\)c", "ac").is_some());
- assert!(matches_exact(r".*\(^\|a\)c", "bc").is_none());
- // Tests if ^ can be repeated without issues
- assert!(matches_exact(".*^^a", "helloabc").is_none());
- assert!(matches_exact(".*^^a", "abc").is_some());
- }
- #[test]
- fn word_boundaries() {
- assert!(matches_exact(r"hello\>.world", "hello world").is_some());
- assert!(matches_exact(r"hello\>.world", "hello!world").is_some());
- assert!(matches_exact(r"hello\>.world", "hellooworld").is_none());
- assert!(matches_exact(r"hello.\<world", "hello world").is_some());
- assert!(matches_exact(r"hello.\<world", "hello!world").is_some());
- assert!(matches_exact(r"hello.\<world", "hellooworld").is_none());
- assert!(matches_exact(r".*\<hello\>", "hihello").is_none());
- assert!(matches_exact(r".*\<hello\>", "hi_hello").is_none());
- assert!(matches_exact(r".*\<hello\>", "hi hello").is_some());
- }
- #[test]
- fn groups() {
- assert!(matches_exact(r"\(a*\|b\|c\)d", "d").is_some());
- assert!(matches_exact(r"\(a*\|b\|c\)d", "aaaad").is_some());
- assert!(matches_exact(r"\(a*\|b\|c\)d", "bd").is_some());
- assert!(matches_exact(r"\(a*\|b\|c\)d", "bbbbbd").is_none());
- }
- #[test]
- fn repeating_groups() {
- assert!(matches_exact(r"\(a\|b\|c\)*d", "d").is_some());
- assert!(matches_exact(r"\(a\|b\|c\)*d", "aaaad").is_some());
- assert!(matches_exact(r"\(a\|b\|c\)*d", "bbbbd").is_some());
- assert!(matches_exact(r"\(a\|b\|c\)*d", "aabbd").is_some());
- assert!(matches_exact(r"\(a\|b\|c\)\{1,2\}d", "d").is_none());
- assert!(matches_exact(r"\(a\|b\|c\)\{1,2\}d", "ad").is_some());
- assert!(matches_exact(r"\(a\|b\|c\)\{1,2\}d", "abd").is_some());
- assert!(matches_exact(r"\(a\|b\|c\)\{1,2\}d", "abcd").is_none());
- assert!(matches_exact(r"\(a\|b\|c\)\{4\}d", "ababad").is_none());
- assert!(matches_exact(r"\(a\|b\|c\)\{4\}d", "ababd").is_some());
- assert!(matches_exact(r"\(a\|b\|c\)\{4\}d", "abad").is_none());
- }
- #[cfg(feature = "bench")]
- #[bench]
- fn speeeeed(b: &mut Bencher) {
- b.iter(|| {
- assert!(matches_exact(r"\(\(a*\|b\|c\) test\|yee\)", "aaaaa test").is_some());
- })
- }
- }
|