소스 검색

Return group offsets

jD91mZM2 6 년 전
부모
커밋
d0d2618770
1개의 변경된 파일84개의 추가작업 그리고 39개의 파일을 삭제
  1. 84 39
      src/matcher.rs

+ 84 - 39
src/matcher.rs

@@ -18,11 +18,11 @@ impl PosixRegex {
     /// Match the string starting at the current position. This does not find
     /// substrings.
     pub fn matches_exact(self, input: &[u8]) -> Option<PosixRegexResult> {
-        // let mut groups = Vec::new();
+        let mut groups = Vec::new();
         let mut matcher = PosixRegexMatcher {
             input,
-            offset: 0
-            // groups: &mut groups
+            offset: 0,
+            groups: &mut groups
         };
         let start = matcher.offset;
         if !matcher.matches_exact(self.search.iter().filter_map(|tokens| Branch::new(tokens)).collect()) {
@@ -32,7 +32,8 @@ impl PosixRegex {
 
         Some(PosixRegexResult {
             start,
-            end
+            end,
+            groups
         })
     }
 }
@@ -66,6 +67,7 @@ struct Branch<'a> {
     repeated: u32,
     tokens: CowRc<'a, [(Token, Range)]>,
 
+    group_ids: Rc<[usize]>,
     repeat_min: u32,
     repeat_max: Option<u32>,
     repeats: Option<Rc<Vec<Rc<Vec<(Token, Range)>>>>>,
@@ -85,6 +87,7 @@ impl<'a> Branch<'a> {
             index: 0,
             repeated: 0,
             tokens: CowRc::Borrowed(tokens),
+            group_ids: Vec::new().into(),
             repeat_min: 0,
             repeat_max: Some(0),
             repeats: None,
@@ -92,6 +95,7 @@ impl<'a> Branch<'a> {
         })
     }
     fn group(
+        group_ids: Rc<[usize]>,
         tokens: Rc<Vec<(Token, Range)>>,
         range: Range,
         repeats: Rc<Vec<Rc<Vec<(Token, Range)>>>>,
@@ -104,6 +108,7 @@ impl<'a> Branch<'a> {
             index: 0,
             repeated: 0,
             tokens: CowRc::Owned(tokens),
+            group_ids,
             repeat_min: range.0.saturating_sub(1),
             repeat_max: range.1.map(|i| i.saturating_sub(1)),
             repeats: Some(repeats),
@@ -171,43 +176,57 @@ impl<'a> Branch<'a> {
     }
 }
 
-fn expand<'a>(branches: &[Branch<'a>]) -> Vec<Branch<'a>> {
-    let mut insert = Vec::new();
+struct PosixRegexMatcher<'a> {
+    input: &'a [u8],
+    offset: usize,
+    groups: &'a mut Vec<(usize, usize)>
+}
+impl<'a> PosixRegexMatcher<'a> {
+    fn expand<'b>(&mut self, branches: &[Branch<'b>]) -> Vec<Branch<'b>> {
+        let mut insert = Vec::new();
+
+        for branch in branches {
+            let (ref token, range) = *branch.get_token();
 
-    for branch in branches {
-        let (ref token, range) = *branch.get_token();
+            if let Token::Group(ref inner) = token {
+                let group_id = self.groups.len();
+                self.groups.push((self.offset, 0));
 
-        if let Token::Group(ref inner) = token {
-            let repeats = Rc::new(inner.iter().cloned().map(Rc::new).collect());
-            for alternation in &*repeats {
-                if let Some(branch) = Branch::group(Rc::clone(alternation), range, Rc::clone(&repeats), branch.next_branch()) {
-                    //println!("{:?} ---[G Cloned]--> {:?}", token, branch.get_token());
-                    insert.push(branch);
+                let repeats = Rc::new(inner.iter().cloned().map(Rc::new).collect());
+
+                let mut ids = Vec::with_capacity(branch.group_ids.len() + 1);
+                ids.extend(&*branch.group_ids);
+                ids.push(group_id);
+                let ids = ids.into();
+                for alternation in &*repeats {
+                    if let Some(branch) = Branch::group(
+                        Rc::clone(&ids),
+                        Rc::clone(alternation),
+                        range,
+                        Rc::clone(&repeats),
+                        branch.next_branch()
+                    ) {
+                        //println!("{:?} ---[G Cloned]--> {:?}", token, branch.get_token());
+                        insert.push(branch);
+                    }
                 }
             }
-        }
-        if branch.repeated >= range.0 {
-            if let Some(next) = branch.next_branch() {
-                //println!("{:?} ---[Cloned]--> {:?}", token, next.get_token());
-                insert.push(next);
+            if branch.repeated >= range.0 {
+                if let Some(next) = branch.next_branch() {
+                    //println!("{:?} ---[Cloned]--> {:?}", token, next.get_token());
+                    insert.push(next);
+                }
+                branch.add_repeats(&mut insert);
             }
-            branch.add_repeats(&mut insert);
         }
-    }
 
-    if !insert.is_empty() {
-        let mut new = expand(&insert);
-        insert.append(&mut new);
+        if !insert.is_empty() {
+            let mut new = self.expand(&insert);
+            insert.append(&mut new);
+        }
+        insert
     }
-    insert
-}
 
-struct PosixRegexMatcher<'a> {
-    input: &'a [u8],
-    offset: usize
-    // TODO: groups: &'a mut Vec<(usize, usize)>
-}
-impl<'a> PosixRegexMatcher<'a> {
     fn matches_exact(&mut self, mut branches: Vec<Branch>) -> bool {
         while let Some(&next) = self.input.get(self.offset) {
             //println!();
@@ -215,7 +234,7 @@ impl<'a> PosixRegexMatcher<'a> {
             let mut index = 0;
             let mut remove = 0;
 
-            let mut insert = expand(&branches);
+            let mut insert = self.expand(&branches);
             branches.append(&mut insert);
 
             // Whether or not all deleted branched got fully explored. This
@@ -246,8 +265,11 @@ impl<'a> PosixRegexMatcher<'a> {
                     _ => unimplemented!("TODO")
                 };
                 if !accepts || max.map(|max| branch.repeated > max).unwrap_or(false) {
-                    happy = happy && branch.is_explored();
                     //println!("-> Delete!");
+                    happy = happy && branch.is_explored();
+                    for &id in &*branch.group_ids {
+                        self.groups[id].1 = self.offset;
+                    }
                     remove += 1;
                     continue;
                 }
@@ -265,6 +287,9 @@ impl<'a> PosixRegexMatcher<'a> {
         //println!("Branches: {:?}", branches);
 
         for branch in &branches {
+            for &id in &*branch.group_ids {
+                self.groups[id].1 = self.offset;
+            }
             if branch.is_explored() {
                 return true;
             }
@@ -276,10 +301,12 @@ impl<'a> PosixRegexMatcher<'a> {
 /// A single result, including start and end bounds
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct PosixRegexResult {
-    /// An offset in the original string to where the match started (inclusive)
+    /// An offset in the input string to where the match started (inclusive)
     pub start: usize,
-    /// An offset in the original string to where the match ended (exclusive)
-    pub end: usize
+    /// An offset in the input string to where the match ended (exclusive)
+    pub end: usize,
+    /// Start/end offsets in the input to where a group matched
+    pub groups: Vec<(usize, usize)>
 }
 
 #[cfg(test)]
@@ -344,8 +371,26 @@ mod tests {
     }
     #[test]
     fn offsets() {
-        assert_eq!(matches_exact("abc", "abcd"), Some(PosixRegexResult { start: 0, end: 3 }));
-        assert_eq!(matches_exact(r"[[:alpha:]]\+", "abcde12345"), Some(PosixRegexResult { start: 0, end: 5 }));
+        assert_eq!(
+            matches_exact("abc", "abcd"),
+            Some(PosixRegexResult { start: 0, end: 3, groups: vec![] })
+        );
+        assert_eq!(
+            matches_exact(r"[[:alpha:]]\+", "abcde12345"),
+            Some(PosixRegexResult { start: 0, end: 5, groups: vec![] })
+        );
+        assert_eq!(
+            matches_exact(r"a\(bc\)\+d", "abcbcd"),
+            Some(PosixRegexResult { start: 0, end: 6, groups: vec![(1, 5)] })
+        );
+        assert_eq!(
+            matches_exact(r"hello\( \(world\|universe\) :D\)\?!", "hello world :D!"),
+            Some(PosixRegexResult { start: 0, end: 15, groups: vec![(5, 14), (6, 11)] })
+        );
+        assert_eq!(
+            matches_exact(r"hello\( \(world\|universe\) :D\)\?", "hello world :D"),
+            Some(PosixRegexResult { start: 0, end: 14, groups: vec![(5, 14), (6, 11)] })
+        );
     }
     //#[test]
     //fn start_and_end() {