Jelajahi Sumber

Fix empty group edge-case and remove mark_end

Apparently the end marker thingy isn't needed, because all nodes, even
optional ones, will be explored to the final node anyway I realized.
jD91mZM2 6 tahun lalu
induk
melakukan
b5826a73ab
3 mengubah file dengan 37 tambahan dan 112 penghapusan
  1. 3 5
      src/compile.rs
  2. 34 12
      src/matcher.rs
  3. 0 95
      src/tree.rs

+ 3 - 5
src/compile.rs

@@ -158,9 +158,7 @@ impl<'a> PosixRegexBuilder<'a> {
         self.builder.start_internal(Token::Root, Range(1, Some(1)));
         self.parse()?;
         self.builder.finish_internal();
-        let mut tree = self.builder.finish();
-        tree.mark_end();
-        Ok(tree)
+        Ok(self.builder.finish())
     }
 
     fn consume(&mut self, amount: usize) {
@@ -444,7 +442,7 @@ Root 1..1
     'y' 1..1
     'e' 1..1
     'e' 1..1
-    'e' 0.. ending
+    'e' 0..
 "
         );
         assert_eq!(
@@ -454,7 +452,7 @@ Root 1..1
   Alternative 1..1
     'y' 1..1
     'e' 1..1
-    'e' 0..1 ending
+    'e' 0..1
 "
         );
         assert_eq!(

+ 34 - 12
src/matcher.rs

@@ -130,7 +130,6 @@ impl<'a> PosixRegex<'a> {
         arena.push(TreeNode {
             token: Token::Group(0),
             range: Range(1, Some(1)),
-            end: false,
             parent: None,
             next_sibling: None,
             child: root
@@ -149,7 +148,6 @@ impl<'a> PosixRegex<'a> {
         arena.push(TreeNode {
             token: Token::InternalStart,
             range: Range(0, None),
-            end: false,
             parent: None,
             next_sibling: Some(group_id),
             child: None
@@ -235,6 +233,11 @@ impl<'a> Node<'a> {
                     index: 0,
                     len: end - start
                 });
+                if start == end {
+                    // Empty group, mark as repeated enough times
+                    let Range(min, _) = me.node().range;
+                    me.repeated += min;
+                }
             }
         }
         me
@@ -257,9 +260,11 @@ impl<'a> Node<'a> {
             _ => return
         };
         self.repeated += 1;
-        let parent = Rc::new(self);
+        let mut parent = Rc::new(self);
+        let mut empty = true;
         for alternative in parent.tree[parent.node].children(&parent.tree) {
             if let Some(node) = parent.tree[alternative].child {
+                empty = false;
                 branches.push(Self::prepare(Self {
                     tree: parent.tree,
                     parent: Some(Rc::clone(&parent)),
@@ -274,6 +279,17 @@ impl<'a> Node<'a> {
                 }));
             }
         }
+        if empty {
+            let mut parent = Rc::get_mut(&mut parent).expect("group empty but still there's a dangling reference");
+            for &open in &[true, false] {
+                parent.prev = parent.prev.push(GroupEvent {
+                    open,
+                    id,
+                    offset
+                });
+            }
+            parent.add_branches(branches, offset);
+        }
     }
     /// Get the internal token node without additional state metadata
     fn node(&self) -> &TreeNode {
@@ -400,9 +416,8 @@ impl<'a> Node<'a> {
             }
         }
     }
-    /// Returns true if this node is "finished", meaning it's reached one
-    /// possible end and continuing exploring is optional
-    fn is_finished(&self) -> bool {
+    /// Returns true if this node is the final node in the branch
+    fn is_final(&self) -> bool {
         let Range(min, _) = self.node().range;
         if self.repeated < min {
             return false;
@@ -427,10 +442,7 @@ impl<'a> Node<'a> {
             }
             next = current.parent.as_ref().map(|node| &**node);
         }
-        next
-            .and_then(|node| self.tree[node.node].next_sibling)
-            .map(|node| self.tree[node].end)
-            .unwrap_or(true)
+        next.and_then(|node| self.tree[node.node].next_sibling).is_none()
     }
 }
 
@@ -521,7 +533,7 @@ impl<'a> PosixRegexMatcher<'a> {
                                 branch.increment();
                                 branch.add_branches(&mut insert, self.offset);
                             }
-                            if branch.is_finished() {
+                            if branch.is_final() {
                                 succeeded = Some(branch.get_capturing_groups(self.max_groups, self.offset));
                             }
                             remove += 1;
@@ -586,7 +598,7 @@ impl<'a> PosixRegexMatcher<'a> {
                 if accepts {
                     branch.increment();
                 } else {
-                    if branch.is_finished() {
+                    if branch.is_final() {
                         let groups = branch.get_capturing_groups(self.max_groups, self.offset);
 
                         let mut add = true;
@@ -772,6 +784,10 @@ mod tests {
             matches_exact(r"\(a\|\(b\)\)*\(c\)", "aaac"),
             Some(abox![Some((0, 4)), Some((2, 3)), None, Some((3, 4))])
         );
+        assert_eq!(
+            matches_exact(r"a\(\)bc", "abc"),
+            Some(abox![Some((0, 3)), Some((1, 1))])
+        );
     }
     #[test]
     fn matches_is_lazy() {
@@ -859,8 +875,14 @@ mod tests {
         assert!(matches_exact(r"\([abc]\{2,3\}\)\1d", "abcbcd").is_none());
         assert!(matches_exact(r"\([abc]\{2,3\}\)\1d", "ababd").is_some());
         assert!(matches_exact(r"\([abc]\{2,3\}\)\1d", "abacd").is_none());
+
         assert!(matches_exact(r"\([[:alpha:]]\).*\1d", "hellohd").is_some());
         assert!(matches_exact(r"\([[:alpha:]]\).*\1d", "hellod").is_none());
+        assert!(matches_exact(r"\([[:alpha:]]\).*\1", "hello").is_none());
+        assert!(matches_exact(r"\([[:alpha:]]\).*\1", "helloh").is_some());
+
+        assert!(matches_exact(r"\(\)-\?\1d", "d").is_some());
+        assert!(matches_exact(r"\(\)-\?\1", "").is_some());
 
         // Just make sure this doesn't crash it (even though it should error
         // but I'm too lazy)

+ 0 - 95
src/tree.rs

@@ -23,7 +23,6 @@ impl From<NodeId> for usize {
 pub struct Node {
     pub token: Token,
     pub range: Range,
-    pub end: bool,
     pub parent: Option<NodeId>,
     pub next_sibling: Option<NodeId>,
     pub child: Option<NodeId>
@@ -36,9 +35,6 @@ impl Node {
 impl fmt::Debug for Node {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(f, "{:?} {:?}", self.token, self.range)?;
-        if self.end {
-            write!(f, " ending")?;
-        }
         Ok(())
     }
 }
@@ -72,7 +68,6 @@ impl TreeBuilder {
         self.arena.push(Node {
             token,
             range,
-            end: false,
             parent: self.parent,
             next_sibling: None,
             child: None
@@ -107,7 +102,6 @@ impl TreeBuilder {
             self.arena.push(Node {
                 token,
                 range,
-                end: false,
                 parent: self.parent,
                 next_sibling: None,
                 child: self.arena[usize::from(from)].next_sibling
@@ -119,7 +113,6 @@ impl TreeBuilder {
             self.arena.push(Node {
                 token,
                 range,
-                end: false,
                 parent: self.parent,
                 next_sibling: None,
                 child: self.arena[usize::from(parent)].child
@@ -131,7 +124,6 @@ impl TreeBuilder {
             self.arena.push(Node {
                 token,
                 range,
-                end: false,
                 parent: None,
                 next_sibling: None,
                 child: self.cursor
@@ -164,70 +156,6 @@ pub struct Tree {
     pub arena: Box<[Node]>,
     pub root: NodeId
 }
-impl Tree {
-    fn mark_end_of(&mut self, root: NodeId) {
-        // abc(de)? = (, )
-        // (ab)?c(de)? = (, )
-        // ab?c? = b, c
-        // ab?(c*) = b, c
-        //
-        // Algorithm: Find the first in a series of trailing optional nodes and
-        // mark all the nodes afterwards as endings as well, recursing any
-        // optional groups.
-        let mut next = Some(root);
-        while let Some(alternation) = next {
-            next = self.arena[usize::from(alternation)].next_sibling;
-
-            let mut end = None;
-            let mut next = self[alternation].child;
-            let mut nested: usize = 0;
-            'outer: while let Some(id) = next {
-                let node = &self[id];
-
-                // Mark the first optional node, or reset if it's not optional
-                let Range(min, _) = node.range;
-                if min == 0 {
-                    end = end.or(Some(id));
-                } else {
-                    if node.child.is_some() {
-                        // Recurse required groups
-                        nested += 1;
-                        next = node.child;
-                        continue;
-                    } else {
-                        end = None;
-                    }
-                }
-                let mut me = Some(node);
-                while me.map(|me| me.next_sibling.is_none()).unwrap_or(false) {
-                    match nested.checked_sub(1) {
-                        Some(new) => nested = new,
-                        None => break 'outer
-                    }
-                    me = me.unwrap().parent.map(|id| &self[id]);
-                }
-                next = me.and_then(|me| me.next_sibling);
-            }
-
-            // Mark all nodes after end as optional
-            let mut next = end;
-            while let Some(node) = next {
-                let node = &mut self[node];
-                next = node.next_sibling;
-                node.end = true;
-                if let Some(child) = node.child {
-                    // Find any ends if this node ends up expanded
-                    self.mark_end_of(child);
-                }
-            }
-        }
-    }
-    pub fn mark_end(&mut self) {
-        if let Some(alternative) = self[self.root].child {
-            self.mark_end_of(alternative);
-        }
-    }
-}
 impl Index<NodeId> for Tree {
     type Output = Node;
     fn index(&self, index: NodeId) -> &Node {
@@ -268,7 +196,6 @@ impl fmt::Debug for Tree {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use compile::PosixRegexBuilder;
 
     fn sanity_check(tree: &Tree) {
         let mut next = Some(tree.root);
@@ -359,28 +286,6 @@ Root 1..1
         . 1..1
   Alternative 1..1
     $ 1..1
-"
-        );
-    }
-    #[test]
-    fn mark_end() {
-        let tree = PosixRegexBuilder::new(br"a\?bc\?\(d*\)\|bb\?").compile_tokens().unwrap();
-        sanity_check(&tree);
-
-        assert_eq!(
-            format!("{:?}", tree),
-            "\
-Root 1..1
-  Alternative 1..1
-    'a' 0..1
-    'b' 1..1
-    'c' 0..1 ending
-    Group(1) 1..1 ending
-      Alternative 1..1
-        'd' 0.. ending
-  Alternative 1..1
-    'b' 1..1
-    'b' 0..1 ending
 "
         );
     }