瀏覽代碼

Word boundaries

jD91mZM2 6 年之前
父節點
當前提交
c8f82a17fb
共有 3 個文件被更改,包括 74 次插入和 25 次删除
  1. 0 3
      README.md
  2. 4 0
      src/ctype.rs
  3. 70 22
      src/matcher.rs

+ 0 - 3
README.md

@@ -11,6 +11,3 @@ Regex compiler:
  - Alternative syntax for word boundaries: `[[:<:]]` and `[[:>:]]`
  - Short forms of character classes, like `\d`
  - Unnamed groups
-
-Matcher:
- - Word boundaries

+ 4 - 0
src/ctype.rs

@@ -34,3 +34,7 @@ pub fn is_upper(c: u8) -> bool {
 pub fn is_xdigit(c: u8) -> bool {
     is_digit(c) || (c >= b'a' && c <= b'f') || (c >= b'A' && c <= b'F')
 }
+
+pub fn is_word_boundary(c: u8) -> bool {
+    !is_alnum(c) && c != b'_'
+}

+ 70 - 22
src/matcher.rs

@@ -125,7 +125,7 @@ impl<'a> Branch<'a> {
 
         loop {
             if branch.repeat_min > 0 {
-                // Did not repeat enough times!
+                // Group did not repeat enough times!
                 return false;
             }
 
@@ -227,6 +227,11 @@ impl<'a> PosixRegexMatcher<'a> {
     }
 
     fn matches_exact(&mut self, mut branches: Vec<Branch>) -> bool {
+        // Whether or not any branch, at any point, got fully explored. This
+        // means at least one path of the regex successfully completed!
+        let mut succeeded = false;
+        let mut prev = None;
+
         loop {
             let next = self.input.get(self.offset).cloned();
             //println!();
@@ -237,10 +242,6 @@ impl<'a> PosixRegexMatcher<'a> {
             let mut insert = self.expand(&branches);
             branches.append(&mut insert);
 
-            // Whether or not any branch got fully explored. This means at
-            // least one path of the regex successfully completed!
-            let mut happy = false;
-
             //println!("Branches: {:?}", branches);
             loop {
                 if index >= branches.len() {
@@ -258,22 +259,39 @@ impl<'a> PosixRegexMatcher<'a> {
 
                 let mut accepts = true;
 
-                if let Token::Start = token {
-                    // Skip ahead to the next token.
-                    match branch.next_branch() {
-                        Some(next) => *branch = next,
-                        None => {
-                            remove += 1;
-                            continue;
-                        }
-                    };
-                    let (ref new_token, Range(_, new_max)) = *branch.get_token();
-                    token = new_token;
-                    max = new_max;
-
-                    accepts = self.offset == 0;
+                loop {
+                    match token {
+                        Token::Start |
+                        Token::WordEnd |
+                        Token::WordStart => {
+                            // Cheap to clone: the match arm above guarantees
+                            // this is a variant that doesn't hold any data.
+                            let original = token.clone();
+
+                            // Skip ahead to the next token.
+                            match branch.next_branch() {
+                                Some(next) => *branch = next,
+                                None => break
+                            }
+                            let (ref new_token, Range(_, new_max)) = *branch.get_token();
+                            token = new_token;
+                            max = new_max;
+
+                            //println!("Or {:?}", token);
+
+                            accepts = match original {
+                                Token::Start => self.offset == 0,
+                                Token::WordEnd => next.map(::ctype::is_word_boundary).unwrap_or(true),
+                                Token::WordStart => prev.map(::ctype::is_word_boundary).unwrap_or(true),
+                                _ => unreachable!()
+                            };
+                        },
+                        _ => break
+                    }
                 }
 
+                //println!("Accepts before: {:?}", accepts);
+
                 accepts = accepts && match *token {
                     Token::Any => next.is_some(),
                     Token::Char(c) => next == Some(c),
@@ -282,10 +300,20 @@ impl<'a> PosixRegexMatcher<'a> {
                     Token::OneOf { invert, ref list } => if let Some(next) = next {
                         list.iter().any(|c| c.matches(next)) == !invert
                     } else { false },
-                    _ => unimplemented!("TODO")
+
+                    // Only reached when the anchor is the last token of the
+                    // pattern, for example "abc\>" or "^". Then we simply
+                    // return true so as to preserve the current `accepts` status.
+                    Token::Start |
+                    Token::WordEnd |
+                    Token::WordStart => true
                 };
+
+                //println!("Accepts after: {:?}", accepts);
+
                 if !accepts || max.map(|max| branch.repeated >= max).unwrap_or(false) {
-                    happy = happy || branch.is_explored();
+                    succeeded = succeeded || branch.is_explored();
+                    //println!("-> Deleted! Succeeded: {}", succeeded);
                     for &id in &*branch.group_ids {
                         self.groups[id].1 = self.offset;
                     }
@@ -298,11 +326,12 @@ impl<'a> PosixRegexMatcher<'a> {
             branches.truncate(end);
 
             if branches.is_empty() {
-                return happy;
+                return succeeded;
             }
 
             if next.is_some() {
                 self.offset += 1;
+                prev = next;
             }
         }
     }
@@ -405,6 +434,7 @@ mod tests {
     #[test]
     fn start_and_end() {
         assert!(matches_exact("^abc$", "abc").is_some());
+        assert!(matches_exact("^bcd", "bcde").is_some());
         assert!(matches_exact("^bcd", "abcd").is_none());
         assert!(matches_exact("abc$", "abc").is_some());
         assert!(matches_exact("abc$", "abcd").is_none());
@@ -412,6 +442,24 @@ mod tests {
         assert!(matches_exact(r".*\(^\|a\)c", "c").is_some());
         assert!(matches_exact(r".*\(^\|a\)c", "ac").is_some());
         assert!(matches_exact(r".*\(^\|a\)c", "bc").is_none());
+
+        // Tests if ^ can be repeated without issues
+        assert!(matches_exact(".*^^a", "helloabc").is_none());
+        assert!(matches_exact(".*^^a", "abc").is_some());
+    }
+    #[test]
+    fn word_boundaries() {
+        assert!(matches_exact(r"hello\>.world", "hello world").is_some());
+        assert!(matches_exact(r"hello\>.world", "hello!world").is_some());
+        assert!(matches_exact(r"hello\>.world", "hellooworld").is_none());
+
+        assert!(matches_exact(r"hello.\<world", "hello world").is_some());
+        assert!(matches_exact(r"hello.\<world", "hello!world").is_some());
+        assert!(matches_exact(r"hello.\<world", "hellooworld").is_none());
+
+        assert!(matches_exact(r".*\<hello\>", "hihello").is_none());
+        assert!(matches_exact(r".*\<hello\>", "hi_hello").is_none());
+        assert!(matches_exact(r".*\<hello\>", "hi hello").is_some());
     }
     #[test]
     fn groups() {