瀏覽代碼

Use arena-based tree

This is a pretty big speedup, and I intend to keep optimizing it until
the code sucks less.
jD91mZM2 6 年之前
父節點
當前提交
22e4a6691e
共有 4 個文件被更改,包括 886 次插入 和 416 次删除
  1. 248 166
      src/compile.rs
  2. 1 0
      src/lib.rs
  3. 250 250
      src/matcher.rs
  4. 387 0
      src/tree.rs

+ 248 - 166
src/compile.rs

@@ -8,6 +8,7 @@ use std::borrow::Cow;
 use std::collections::HashMap;
 use std::fmt;
 use {ctype, PosixRegex};
+use tree::*;
 
 /// Repetition bounds, for example + is (1, None), and ? is (0, Some(1))
 #[derive(Clone, Copy, PartialEq, Eq)]
@@ -22,7 +23,7 @@ impl fmt::Debug for Range {
 }
 
 /// An item inside square brackets, like `[abc]` or `[[:digit:]]`
-#[derive(Clone, Debug, PartialEq, Eq)]
+#[derive(Clone, PartialEq, Eq)]
 pub enum Collation {
     Char(u8),
     Class(fn(u8) -> bool)
@@ -31,29 +32,41 @@ impl Collation {
     /// Compare this collation to a character
     pub fn matches(&self, other: u8, insensitive: bool) -> bool {
         match *self {
-            Collation::Char(me) if insensitive => me & !32 == other & !32,
+            Collation::Char(me) if insensitive => if ctype::is_alpha(me) && ctype::is_alpha(other) {
+                me | 32 == other | 32
+            } else {
+                me == other
+            },
             Collation::Char(me) => me == other,
             Collation::Class(f) => f(other)
         }
     }
 }
+impl fmt::Debug for Collation {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match *self {
+            Collation::Char(c) => write!(f, "{:?}", c as char),
+            Collation::Class(c) => write!(f, "{:p}", c),
+        }
+    }
+}
 
 /// A single "compiled" token, such as a `.` or a character literal
 #[derive(Clone, PartialEq, Eq)]
 pub enum Token {
+    /// Internal token used to find matches that might be anywhere in the text
     InternalStart,
 
+    Alternative,
     Any,
     Char(u8),
     End,
-    Group {
-        id: usize,
-        branches: Vec<Vec<(Token, Range)>>
-    },
+    Group(usize),
     OneOf {
         invert: bool,
         list: Vec<Collation>
     },
+    Root,
     Start,
     WordEnd,
     WordStart
@@ -63,11 +76,13 @@ impl fmt::Debug for Token {
         match *self {
             Token::InternalStart => write!(f, "<START>"),
 
+            Token::Alternative => write!(f, "Alternative"),
             Token::Any => write!(f, "."),
             Token::Char(c) => write!(f, "{:?}", c as char),
             Token::End => write!(f, "$"),
-            Token::Group { ref branches, .. } => write!(f, "Group({:?})", branches),
-            Token::OneOf { invert, ref list } => write!(f, "[invert: {}; {:?}]", invert, list),
+            Token::Group(id) => write!(f, "Group({})", id),
+            Token::OneOf { invert, ref list } => write!(f, "{{invert: {}, {:?}}}", invert, list),
+            Token::Root => write!(f, "Root"),
             Token::Start => write!(f, "^"),
             Token::WordEnd => write!(f, ">"),
             Token::WordStart => write!(f, "<")
@@ -93,7 +108,8 @@ pub enum Error {
 pub struct PosixRegexBuilder<'a> {
     input: &'a [u8],
     classes: HashMap<&'a [u8], fn(u8) -> bool>,
-    group_id: usize
+    group_id: usize,
+    builder: TreeBuilder
 }
 impl<'a> PosixRegexBuilder<'a> {
     /// Create a new instance that is ready to parse the regex `input`
@@ -101,7 +117,8 @@ impl<'a> PosixRegexBuilder<'a> {
         Self {
             input,
             classes: HashMap::new(),
-            group_id: 1
+            group_id: 1,
+            builder: TreeBuilder::default()
         }
     }
     /// Add a custom collation class, for use within square brackets (such as `[[:digit:]]`)
@@ -130,9 +147,17 @@ impl<'a> PosixRegexBuilder<'a> {
         self
     }
     /// "Compile" this regex to a struct ready to match input
-    pub fn compile(mut self) -> Result<PosixRegex<'static>, Error> {
-        let search = self.compile_tokens()?;
-        Ok(PosixRegex::new(Cow::Owned(search)))
+    pub fn compile(self) -> Result<PosixRegex<'static>, Error> {
+        let tree = self.compile_tokens()?;
+        Ok(PosixRegex::new(Cow::Owned(tree)))
+    }
+    pub fn compile_tokens(mut self) -> Result<Tree, Error> {
+        self.builder.start_internal(Token::Root, Range(1, Some(1)));
+        self.parse()?;
+        self.builder.finish_internal();
+        let mut tree = self.builder.finish();
+        tree.mark_end();
+        Ok(tree)
     }
 
     fn consume(&mut self, amount: usize) {
@@ -161,22 +186,53 @@ impl<'a> PosixRegexBuilder<'a> {
         self.consume(1);
         Ok(())
     }
-    pub fn compile_tokens(&mut self) -> Result<Vec<Vec<(Token, Range)>>, Error> {
-        let mut alternatives = Vec::new();
-        let mut chain: Vec<(Token, Range)> = Vec::new();
-
-        while let Some(&c) = self.input.first() {
-            self.consume(1);
+    fn parse_range(&mut self) -> Result<Range, Error> {
+        let mut range = Range(1, Some(1));
+        if let Some(&c) = self.input.first() {
+            let new = match c {
+                b'*' => Some((1, Range(0, None))),
+                b'\\' => match self.input.get(1) {
+                    Some(b'?') => Some((2, Range(0, Some(1)))),
+                    Some(b'+') => Some((2, Range(1, None))),
+                    Some(b'{') => {
+                        self.consume(2);
+                        let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
+                        let mut second = Some(first);
+                        if let Some(b',') = self.input.first() {
+                            self.consume(1);
+                            second = self.take_int()?;
+                        }
+                        if self.input.first() == Some(&b'}') {
+                            self.consume(1);
+                        } else if self.input.starts_with(br"\}") {
+                            self.consume(2);
+                        } else {
+                            return Err(Error::UnclosedRepetition);
+                        }
+                        if second.map(|second| first > second).unwrap_or(false) {
+                            return Err(Error::IllegalRange);
+                        }
+                        range = Range(first, second);
+                        None
+                    },
+                    _ => None
+                },
+                _ => None
+            };
+            if let Some((consume, new)) = new {
+                range = new;
+                self.consume(consume);
+            }
+        }
+        Ok(range)
+    }
+    fn parse(&mut self) -> Result<(), Error> {
+        self.builder.start_internal(Token::Alternative, Range(1, Some(1)));
+        while let Ok(c) = self.next() {
             let token = match c {
                 b'^' => Token::Start,
                 b'$' => Token::End,
                 b'.' => Token::Any,
-                b'*' => if let Some(last) = chain.last_mut() {
-                    last.1 = Range(0, None);
-                    continue;
-                } else {
-                    return Err(Error::LeadingRepetition);
-                },
                 b'[' => {
                     let mut list = Vec::new();
                     let invert = self.input.first() == Some(&b'^');
@@ -246,54 +302,21 @@ impl<'a> PosixRegexBuilder<'a> {
                     b'(' => {
                         let id = self.group_id;
                         self.group_id += 1;
-                        Token::Group {
-                            id,
-                            branches: self.compile_tokens()?
-                        }
+                        let checkpoint = self.builder.checkpoint();
+                        self.parse()?;
+                        let range = self.parse_range()?;
+                        self.builder.start_internal_at(checkpoint, Token::Group(id), range);
+                        self.builder.finish_internal();
+                        continue;
                     },
-                    b')' => {
-                        alternatives.push(chain);
-                        return Ok(alternatives);
-                    }
+                    b')' => break,
                     b'|' => {
-                        alternatives.push(chain);
-                        chain = Vec::new();
+                        self.builder.finish_internal();
+                        self.builder.start_internal(Token::Alternative, Range(1, Some(1)));
                         continue;
                     },
                     b'<' => Token::WordStart,
                     b'>' => Token::WordEnd,
-                    c@b'?' | c@b'+' => if let Some(last) = chain.last_mut() {
-                        last.1 = match c {
-                            b'?' => Range(0, Some(1)),
-                            b'+' => Range(1, None),
-                            _ => unreachable!(c)
-                        };
-                        continue;
-                    } else {
-                        return Err(Error::LeadingRepetition);
-                    },
-                    b'{' => if let Some(last) = chain.last_mut() {
-                        let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
-                        let mut second = Some(first);
-                        if let Some(b',') = self.input.first() {
-                            self.consume(1);
-                            second = self.take_int()?;
-                        }
-                        if self.input.first() == Some(&b'}') {
-                            self.consume(1);
-                        } else if self.input.starts_with(br"\}") {
-                            self.consume(2);
-                        } else {
-                            return Err(Error::UnclosedRepetition);
-                        }
-                        if second.map(|second| first > second).unwrap_or(false) {
-                            return Err(Error::IllegalRange);
-                        }
-                        last.1 = Range(first, second);
-                        continue;
-                    } else {
-                        return Err(Error::LeadingRepetition);
-                    },
                     b'a' => Token::OneOf { invert: false, list: vec![Collation::Class(ctype::is_alnum)] },
                     b'd' => Token::OneOf { invert: false, list: vec![Collation::Class(ctype::is_digit)] },
                     b's' => Token::OneOf { invert: false, list: vec![Collation::Class(ctype::is_space)] },
@@ -305,11 +328,11 @@ impl<'a> PosixRegexBuilder<'a> {
                 },
                 c => Token::Char(c)
             };
-            chain.push((token, Range(1, Some(1))));
+            let range = self.parse_range()?;
+            self.builder.leaf(token, range);
         }
-
-        alternatives.push(chain);
-        Ok(alternatives)
+        self.builder.finish_internal();
+        Ok(())
     }
 }
 
@@ -317,161 +340,220 @@ impl<'a> PosixRegexBuilder<'a> {
 mod tests {
     use super::*;
 
-    fn compile(input: &[u8]) -> Vec<(Token, Range)> {
-        PosixRegexBuilder::new(input)
-            .with_default_classes()
-            .compile_tokens()
-            .expect("error compiling regex")
-            .into_iter()
-            .next()
-            .unwrap()
-    }
-    fn t(t: Token) -> (Token, Range) {
-        (t, Range(1, Some(1)))
-    }
-    fn c(c: u8) -> (Token, Range) {
-        t(Token::Char(c))
+    fn compile(input: &[u8]) -> String {
+        format!(
+            "{:?}",
+            PosixRegexBuilder::new(input)
+                .with_default_classes()
+                .compile_tokens()
+                .expect("error compiling regex")
+        )
     }
 
     #[test]
     fn basic() {
-        assert_eq!(compile(b"abc"), &[c(b'a'), c(b'b'), c(b'c')]);
+        assert_eq!(
+            compile(b"abc"),
+            "\
+Root 1..1
+  Alternative 1..1
+    'a' 1..1
+    'b' 1..1
+    'c' 1..1
+"
+        );
     }
     #[test]
     fn groups() {
-        assert_eq!(compile(br"\(abc\|bcd\|cde\)"), &[t(Token::Group { id: 1, branches: vec![
-            vec![c(b'a'), c(b'b'), c(b'c')],
-            vec![c(b'b'), c(b'c'), c(b'd')],
-            vec![c(b'c'), c(b'd'), c(b'e')]
-        ]})]);
-        assert_eq!(compile(br"\(abc\|\(bcd\|cde\)\)"), &[
-            t(Token::Group { id: 1, branches: vec![
-                vec![c(b'a'), c(b'b'), c(b'c')],
-                vec![t(Token::Group { id: 2, branches: vec![
-                    vec![c(b'b'), c(b'c'), c(b'd')],
-                    vec![c(b'c'), c(b'd'), c(b'e')]
-                ]})]
-            ]})
-        ]);
+        assert_eq!(
+            compile(br"\(abc\|bcd\|cde\)"),
+            "\
+Root 1..1
+  Alternative 1..1
+    Group(1) 1..1
+      Alternative 1..1
+        'a' 1..1
+        'b' 1..1
+        'c' 1..1
+      Alternative 1..1
+        'b' 1..1
+        'c' 1..1
+        'd' 1..1
+      Alternative 1..1
+        'c' 1..1
+        'd' 1..1
+        'e' 1..1
+"
+        );
+        assert_eq!(
+            compile(br"\(abc\|\(bcd\|cde\)\)"),
+            "\
+Root 1..1
+  Alternative 1..1
+    Group(1) 1..1
+      Alternative 1..1
+        'a' 1..1
+        'b' 1..1
+        'c' 1..1
+      Alternative 1..1
+        Group(2) 1..1
+          Alternative 1..1
+            'b' 1..1
+            'c' 1..1
+            'd' 1..1
+          Alternative 1..1
+            'c' 1..1
+            'd' 1..1
+            'e' 1..1
+"
+        );
     }
     #[test]
     fn words() {
         assert_eq!(
             compile(br"\<word\>"),
-            &[t(Token::WordStart), c(b'w'), c(b'o'), c(b'r'), c(b'd'), t(Token::WordEnd)]
+            "\
+Root 1..1
+  Alternative 1..1
+    < 1..1
+    'w' 1..1
+    'o' 1..1
+    'r' 1..1
+    'd' 1..1
+    > 1..1
+"
         );
     }
     #[test]
     fn repetitions() {
         assert_eq!(
             compile(br"yeee*"),
-            &[c(b'y'), c(b'e'), c(b'e'), (Token::Char(b'e'), Range(0, None))]
+            "\
+Root 1..1
+  Alternative 1..1
+    'y' 1..1
+    'e' 1..1
+    'e' 1..1
+    'e' 0.. ending
+"
         );
         assert_eq!(
             compile(br"yee\?"),
-            &[c(b'y'), c(b'e'), (Token::Char(b'e'), Range(0, Some(1)))]
+            "\
+Root 1..1
+  Alternative 1..1
+    'y' 1..1
+    'e' 1..1
+    'e' 0..1 ending
+"
         );
         assert_eq!(
             compile(br"yee\+"),
-            &[c(b'y'), c(b'e'), (Token::Char(b'e'), Range(1, None))]
+            "\
+Root 1..1
+  Alternative 1..1
+    'y' 1..1
+    'e' 1..1
+    'e' 1..
+"
         );
         assert_eq!(
             compile(br"ye\{2}"),
-            &[c(b'y'), (Token::Char(b'e'), Range(2, Some(2)))]
+            "\
+Root 1..1
+  Alternative 1..1
+    'y' 1..1
+    'e' 2..2
+"
         );
         assert_eq!(
             compile(br"ye\{2,}"),
-            &[c(b'y'), (Token::Char(b'e'), Range(2, None))]
+            "\
+Root 1..1
+  Alternative 1..1
+    'y' 1..1
+    'e' 2..
+"
         );
         assert_eq!(
             compile(br"ye\{2,3}"),
-            &[c(b'y'), (Token::Char(b'e'), Range(2, Some(3)))]
+            "\
+Root 1..1
+  Alternative 1..1
+    'y' 1..1
+    'e' 2..3
+"
         );
     }
     #[test]
     fn bracket() {
         assert_eq!(
             compile(b"[abc]"),
-            &[t(Token::OneOf {
-                invert: false,
-                list: vec![
-                    Collation::Char(b'a'),
-                    Collation::Char(b'b'),
-                    Collation::Char(b'c')
-                ]
-            })]
+            "\
+Root 1..1
+  Alternative 1..1
+    {invert: false, ['a', 'b', 'c']} 1..1
+"
         );
         assert_eq!(
             compile(b"[^abc]"),
-            &[t(Token::OneOf {
-                invert: true,
-                list: vec![
-                    Collation::Char(b'a'),
-                    Collation::Char(b'b'),
-                    Collation::Char(b'c')
-                ]
-            })]
+            "\
+Root 1..1
+  Alternative 1..1
+    {invert: true, ['a', 'b', 'c']} 1..1
+"
         );
         assert_eq!(
             compile(b"[]] [^]]"),
-            &[
-                t(Token::OneOf { invert: false, list: vec![ Collation::Char(b']') ] }),
-                c(b' '),
-                t(Token::OneOf { invert: true,  list: vec![ Collation::Char(b']') ] }),
-            ]
+            "\
+Root 1..1
+  Alternative 1..1
+    {invert: false, [']']} 1..1
+    ' ' 1..1
+    {invert: true, [']']} 1..1
+"
         );
         assert_eq!(
             compile(b"[0-3] [a-c] [-1] [1-]"),
-            &[
-                t(Token::OneOf { invert: false, list: vec![
-                    Collation::Char(b'0'),
-                    Collation::Char(b'1'),
-                    Collation::Char(b'2'),
-                    Collation::Char(b'3')
-                ] }),
-                c(b' '),
-                t(Token::OneOf { invert: false, list: vec![
-                    Collation::Char(b'a'),
-                    Collation::Char(b'b'),
-                    Collation::Char(b'c')
-                ] }),
-                c(b' '),
-                t(Token::OneOf { invert: false, list: vec![
-                    Collation::Char(b'-'),
-                    Collation::Char(b'1')
-                ] }),
-                c(b' '),
-                t(Token::OneOf { invert: false, list: vec![
-                    Collation::Char(b'1'),
-                    Collation::Char(b'-')
-                ] })
-            ]
+            "\
+Root 1..1
+  Alternative 1..1
+    {invert: false, ['0', '1', '2', '3']} 1..1
+    ' ' 1..1
+    {invert: false, ['a', 'b', 'c']} 1..1
+    ' ' 1..1
+    {invert: false, ['-', '1']} 1..1
+    ' ' 1..1
+    {invert: false, ['1', '-']} 1..1
+"
         );
         assert_eq!(
             compile(b"[[.-.]-/]"),
-            &[
-                t(Token::OneOf { invert: false, list: vec![
-                    Collation::Char(b'-'),
-                    Collation::Char(b'.'),
-                    Collation::Char(b'/')
-                ] })
-            ]
+            "\
+Root 1..1
+  Alternative 1..1
+    {invert: false, ['-', '.', '/']} 1..1
+"
         );
         assert_eq!(
             compile(b"[[:digit:][:upper:]]"),
-            &[
-                t(Token::OneOf { invert: false, list: vec![
-                    Collation::Class(ctype::is_digit),
-                    Collation::Class(ctype::is_upper)
-                ] })
-            ]
+            format!("\
+Root 1..1
+  Alternative 1..1
+    {{invert: false, [{:p}, {:p}]}} 1..1
+", ctype::is_digit as fn(u8) -> bool, ctype::is_upper as fn(u8) -> bool)
         );
     }
     #[test]
     fn newline() {
         assert_eq!(
             compile(br"\r\n"),
-            &[c(b'\r'), c(b'\n')]
+            "\
+Root 1..1
+  Alternative 1..1
+    '\\r' 1..1
+    '\\n' 1..1
+"
         );
     }
 }

+ 1 - 0
src/lib.rs

@@ -26,6 +26,7 @@ mod std {
 pub mod compile;
 pub mod ctype;
 pub mod matcher;
+pub mod tree;
 
 pub use compile::PosixRegexBuilder;
 pub use matcher::PosixRegex;

+ 250 - 250
src/matcher.rs

@@ -8,11 +8,12 @@ use ctype;
 use std::borrow::Cow;
 use std::fmt;
 use std::rc::Rc;
+use tree::{*, Node as TreeNode};
 
 /// A regex matcher, ready to match stuff
 #[derive(Clone)]
 pub struct PosixRegex<'a> {
-    branches: Cow<'a, [Vec<(Token, Range)>]>,
+    tree: Cow<'a, Tree>,
     case_insensitive: bool,
     newline: bool,
     no_start: bool,
@@ -22,9 +23,9 @@ impl<'a> PosixRegex<'a> {
     /// Create a new matcher instance from the specified alternations. This
     /// should probably not be used and instead an instance should be obtained
     /// from `PosixRegexBuilder`, which also compiles a string into regex.
-    pub fn new(branches: Cow<'a, [Vec<(Token, Range)>]>) -> Self {
+    pub fn new(tree: Cow<'a, Tree>) -> Self {
         Self {
-            branches,
+            tree,
             case_insensitive: false,
             newline: false,
             no_start: false,
@@ -61,8 +62,24 @@ impl<'a> PosixRegex<'a> {
     /// `matches_exact` or in each match in `matches`.
     pub fn count_groups(&self) -> usize {
         let mut count = 1;
-        for branch in &*self.branches {
-            count += count_groups(branch);
+        let mut cursor = self.tree[self.tree.root].child;
+        while let Some(node) = cursor {
+            // Walk tree
+            let node = &self.tree[node];
+            if node.child.is_some() {
+                cursor = node.child;
+            } else {
+                let mut node = Some(node);
+                while node.map(|node| node.next_sibling.is_none()).unwrap_or(false) {
+                    node = node.unwrap().parent.map(|node| &self.tree[node]);
+                }
+                cursor = node.and_then(|node| node.next_sibling);
+            }
+
+            // Count groups
+            if let Token::Group(_) = node.token {
+                count += 1;
+            }
         }
         count
     }
@@ -74,12 +91,13 @@ impl<'a> PosixRegex<'a> {
             input,
             offset: 0
         };
-        let branches = self.branches.iter()
-            .filter_map(|tokens| Branch::new(true, tokens))
+        let groups = self.count_groups();
+        let tree = self.tree[self.tree.root].children(&self.tree)
+            .filter_map(|node| self.tree[node].child.map(|child| Node::new(&self.tree, child, groups)))
             .collect();
 
         let start = matcher.offset;
-        match matcher.matches_exact(branches) {
+        match matcher.matches_exact(tree) {
             None => None,
             Some(mut groups) => {
                 assert_eq!(groups[0], None);
@@ -96,17 +114,50 @@ impl<'a> PosixRegex<'a> {
             offset: 0
         };
 
-        let tokens = vec![
-            (Token::InternalStart, Range(0, None)),
-            (Token::Group { id: 0, branches: self.branches.to_vec() }, Range(1, Some(1)))
-        ];
-        let branches = vec![
-            Branch::new(false, &tokens).unwrap()
-        ];
+        let mut arena = self.tree.arena.to_vec();
+
+        let root = self.tree[self.tree.root].child;
+
+        // Wrap everything in group
+        let group_id = NodeId::from(arena.len());
+        arena.push(TreeNode {
+            token: Token::Group(0),
+            range: Range(1, Some(1)),
+            end: false,
+            parent: None,
+            next_sibling: None,
+            child: root
+        });
+
+        // Update parents
+        let mut cursor = root;
+        while let Some(node) = cursor {
+            let node = &mut arena[usize::from(node)];
+            cursor = node.next_sibling;
+            node.parent = Some(group_id);
+        }
+
+        // Push leading start
+        let start_id = NodeId::from(arena.len());
+        arena.push(TreeNode {
+            token: Token::InternalStart,
+            range: Range(0, None),
+            end: false,
+            parent: None,
+            next_sibling: Some(group_id),
+            child: None
+        });
+
+        let groups = self.count_groups();
+        let tree = Tree {
+            arena: arena.into_boxed_slice(),
+            root: start_id
+        };
+        let tree = vec![Node::new(&tree, tree.root, groups)];
 
         let mut matches = Vec::new();
         while max.map(|max| max > 0).unwrap_or(true) {
-            match matcher.matches_exact(branches.clone()) {
+            match matcher.matches_exact(tree.clone()) {
                 Some(groups) => matches.push(groups),
                 None => break
             }
@@ -116,19 +167,6 @@ impl<'a> PosixRegex<'a> {
     }
 }
 
-fn count_groups(tokens: &[(Token, Range)]) -> usize {
-    let mut groups = 0;
-    for (token, _) in tokens {
-        if let Token::Group { ref branches, .. } = token {
-            groups += 1;
-            for branch in branches {
-                groups += count_groups(branch);
-            }
-        }
-    }
-    groups
-}
-
 #[derive(Debug, Clone, PartialEq, Eq)]
 struct Group {
     index: usize,
@@ -137,90 +175,64 @@ struct Group {
 }
 
 #[derive(Clone)]
-struct Branch<'a> {
-    index: usize,
-    repeated: u32,
-    tokens: &'a [(Token, Range)],
-    path: Box<[Group]>,
+struct Node<'a> {
+    tree: &'a Tree,
+    parent: Option<Rc<Node<'a>>>,
+    node: NodeId,
     prev: Box<[Option<(usize, usize)>]>,
-
-    parent: Option<Rc<Branch<'a>>>
+    repeated: u32
 }
-impl<'a> fmt::Debug for Branch<'a> {
+impl<'a> fmt::Debug for Node<'a> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let (ref token, mut range) = *self.get_token();
+        let mut range = self.node().range;
         range.0 = range.0.saturating_sub(self.repeated);
         range.1 = range.1.map(|max| max.saturating_sub(self.repeated));
-        write!(f, "{:?}", (token, range))
+        write!(f, "{:?}", (&self.node().token, range))
     }
 }
-impl<'a> Branch<'a> {
-    fn new(exact: bool, tokens: &'a [(Token, Range)]) -> Option<Self> {
-        if tokens.is_empty() {
-            return None;
-        }
-        Some(Self {
-            index: 0,
-            repeated: 0,
-            tokens: tokens,
-            path: Box::new([]),
-            prev: vec![None; if exact { 1 } else { 0 } + count_groups(tokens)].into_boxed_slice(),
-
-            parent: None
-        })
-    }
-    fn group(
-        path: Box<[Group]>,
-        prev: Box<[Option<(usize, usize)>]>,
-        tokens: &'a [(Token, Range)],
-        mut parent: Branch<'a>
-    ) -> Option<Self> {
-        if tokens.is_empty() {
-            return None;
-        }
-        parent.repeated += 1;
-        Some(Self {
-            index: 0,
-            repeated: 0,
-            tokens,
-            path,
-            prev,
-            parent: Some(Rc::new(parent))
-        })
-    }
-    fn parent_tokens(&self) -> &[(Token, Range)] {
-        let mut tokens = self.tokens;
-
-        let len = self.path.len();
-        if len > 0 {
-            for group in &self.path[..len-1] {
-                match tokens[group.index] {
-                    (Token::Group { ref branches, .. }, _) => tokens = &branches[group.variant],
-                    _ => panic!("non-group index in path")
-                }
-            }
+impl<'a> Node<'a> {
+    fn new(tree: &'a Tree, node: NodeId, groups: usize) -> Self {
+        Self {
+            tree: tree,
+            parent: None,
+            node,
+            prev: vec![None; groups].into_boxed_slice(),
+            repeated: 0
         }
-
-        tokens
     }
-    fn tokens(&self) -> &[(Token, Range)] {
-        let mut tokens = self.parent_tokens();
-
-        if let Some(group) = self.path.last() {
-            match tokens[group.index] {
-                (Token::Group { ref branches, .. }, _) => tokens = &branches[group.variant],
-                _ => panic!("non-group index in path")
+    fn into_children(mut self, branches: &mut Vec<Node<'a>>, offset: usize) {
+        let id = match self.tree[self.node].token {
+            Token::Group(id) => id,
+            _ => return
+        };
+        self.repeated += 1;
+        let parent = Rc::new(self);
+        for alternative in parent.tree[parent.node].children(&parent.tree) {
+            if let Some(node) = parent.tree[alternative].child {
+                let mut prev = parent.prev.clone();
+                prev[id] = Some((offset, 0));
+                branches.push(Self {
+                    tree: parent.tree,
+                    parent: Some(Rc::clone(&parent)),
+                    node,
+                    prev,
+                    repeated: 0
+                });
             }
         }
-
-        tokens
     }
-    fn get_token(&self) -> &(Token, Range) {
-        &self.tokens()[self.index]
+    fn node(&self) -> &TreeNode {
+        &self.tree[self.node]
     }
     fn update_group_end(&mut self, offset: usize) {
-        for group in &mut *self.path {
-            self.prev[group.id].as_mut().unwrap().1 = offset;
+        let mut parent = self.node().parent;
+        while let Some(group) = parent {
+            let group = &self.tree[group];
+            parent = group.parent;
+            match group.token {
+                Token::Group(id) => self.prev[id].as_mut().unwrap().1 = offset,
+                _ => ()
+            }
         }
     }
     fn extend(&self, prev: &mut Box<[Option<(usize, usize)>]>) {
@@ -230,92 +242,83 @@ impl<'a> Branch<'a> {
             }
         }
     }
-    fn next_branch(&self) -> Option<Self> {
-        if self.index + 1 >= self.tokens().len() {
-            let parent = self.parent.as_ref()?;
-            let (_, Range(min, _)) = *parent.get_token();
-            // Don't add the next branch until we've repeated this one enough
-            if parent.repeated < min {
-                return None;
-            }
-
-            if let Some(mut next) = parent.next_branch() {
+    fn add_branches(&self, branches: &mut Vec<Node<'a>>, offset: usize) {
+        if let Some(next) = self.node().next_sibling {
+            branches.push(Self {
+                node: next,
+                repeated: 0,
+                ..self.clone()
+            });
+        } else {
+            let parent = match self.parent {
+                Some(ref parent) => parent,
+                None => return
+            };
+            let Range(min, _) = parent.node().range;
+
+            if parent.repeated >= min {
                 // Group is closing, migrate previous & current groups to next.
-                self.extend(&mut next.prev);
-
-                return Some(next);
-            }
-            return None;
-        }
-        Some(Self {
-            index: self.index + 1,
-            repeated: 0,
-            ..self.clone()
-        })
-    }
-    fn add_repeats(&self, branches: &mut Vec<Branch<'a>>, offset: usize) {
-        let mut branch = self;
-        loop {
-            if let (Token::Group { id, branches: ref alternatives }, Range(_, max)) = *branch.get_token() {
-                if max.map(|max| branch.repeated < max).unwrap_or(true) {
-                    for alternative in 0..alternatives.len() {
-                        let mut path = branch.path.to_vec();
-                        path.push(Group {
-                            variant: alternative,
-                            index: branch.index,
-                            id
-                        });
-
-                        let mut prev = self.prev.clone();
-                        prev[id].get_or_insert((0, 0)).0 = offset;
-
-                        if let Some(group) = Branch::group(
-                            path.into_boxed_slice(),
-                            prev,
-                            branch.tokens,
-                            branch.clone()
-                        ) {
-                            branches.push(group);
-                        }
-                    }
-                    break;
+                let mut parent = Some(parent);
+                while parent.map(|parent| parent.node().next_sibling.is_none()).unwrap_or(false) {
+                    parent = parent.unwrap().parent.as_ref();
+                }
+                if let Some((node, next)) = parent.and_then(|parent| parent.node().next_sibling.map(|node| (parent, node))) {
+                    let clone = (**node).clone();
+                    let mut prev = clone.prev;
+                    self.extend(&mut prev);
+                    branches.push(Self {
+                        node: next,
+                        repeated: 0,
+                        prev,
+                        ..clone
+                    });
                 }
             }
 
-            match branch.parent {
-                Some(ref new) => branch = new,
-                None => break
+            // Add repetitions
+            let mut parent = Some(parent);
+            while let Some(node) = parent {
+                parent = node.parent.as_ref();
+                let Range(_, max) = node.node().range;
+                if max.map(|max| node.repeated < max).unwrap_or(true) {
+                    let mut clone = (**node).clone();
+                    self.extend(&mut clone.prev);
+                    clone.into_children(branches, offset);
+                }
             }
         }
     }
-    /// Returns if this node is "explored" enough times,
-    /// meaning it has repeated as many times as it want to and has nowhere to go next.
-    fn is_explored(&self) -> bool {
-        let mut branch = Cow::Borrowed(self);
+    /// Returns true if this node is "finished", meaning it's reached one
+    /// possible end and continuing exploring is optional
+    fn is_finished(&self) -> bool {
+        let Range(min, _) = self.node().range;
+        if self.repeated < min {
+            return false;
+        }
 
-        loop {
-            {
-                let mut branch = &*branch;
-                while let Some(ref parent) = branch.parent {
-                    let (_, Range(min, _)) = *parent.get_token();
-                    if parent.repeated < min {
-                        // Group did not repeat enough times!
-                        return false;
-                    }
-                    branch = parent;
-                }
+        let mut next = Some(self);
+        while let Some(current) = next {
+            let mut node = current.node();
+            if node.token == Token::Alternative {
+                // Don't explore other alternatives
+                next = current.parent.as_ref().map(|node| &**node);
+                node = &self.tree[node.parent.expect("found root alternative")];
             }
-
-            let (_, Range(min, _)) = *branch.get_token();
-            if branch.repeated < min {
-                return false;
+            if let Token::Group(_) = node.token {
+                let Range(min, _) = node.range;
+                if current.repeated < min {
+                    return false;
+                }
             }
-            match branch.next_branch() {
-                Some(next) => branch = Cow::Owned(next),
-                None => break
+            if node.next_sibling.is_some() {
+                break;
             }
+            next = current.parent.as_ref().map(|node| &**node);
         }
-        true
+        next
+            .and_then(|node| self.tree[node.node].next_sibling)
+            .map(|node| self.tree[node].end)
+            .unwrap_or(true)
     }
 }
 
@@ -325,43 +328,22 @@ struct PosixRegexMatcher<'a> {
     offset: usize
 }
 impl<'a> PosixRegexMatcher<'a> {
-    fn expand<'b>(&mut self, branches: &mut [Branch<'b>]) -> Vec<Branch<'b>> {
+    fn expand<'b>(&mut self, branches: &mut [Node<'b>]) -> Vec<Node<'b>> {
         let mut insert = Vec::new();
 
         for branch in branches {
             branch.update_group_end(self.offset);
 
-            let (ref token, range) = *branch.get_token();
+            let node = branch.node();
 
-            if let Token::Group { id, branches: ref inner } = *token {
-                for alternation in 0..inner.len() {
-                    let mut path = Vec::with_capacity(branch.path.len() + 1);
-                    path.extend_from_slice(&branch.path);
-                    path.push(Group {
-                        index: branch.index,
-                        variant: alternation,
-                        id
-                    });
-
-                    let mut prev = branch.prev.clone();
-                    prev[id].get_or_insert((0, 0)).0 = self.offset;
-
-                    if let Some(branch) = Branch::group(
-                        path.into(),
-                        prev,
-                        branch.tokens,
-                        branch.clone()
-                    ) {
-                        insert.push(branch);
-                    }
-                }
+            if let Token::Group(_) = node.token {
+                branch.clone().into_children(&mut insert, self.offset);
             }
-            if branch.repeated >= range.0 {
+
+            let Range(min, _) = node.range;
+            if branch.repeated >= min {
                 // Push the next element as a new branch
-                if let Some(next) = branch.next_branch() {
-                    insert.push(next);
-                }
-                branch.add_repeats(&mut insert, self.offset);
+                branch.add_branches(&mut insert, self.offset);
             }
         }
 
@@ -372,7 +354,7 @@ impl<'a> PosixRegexMatcher<'a> {
         insert
     }
 
-    fn matches_exact(&mut self, mut branches: Vec<Branch>) -> Option<Box<[Option<(usize, usize)>]>> {
+    fn matches_exact(&mut self, mut branches: Vec<Node>) -> Option<Box<[Option<(usize, usize)>]>> {
         // Whether or not any branch, at any point, got fully explored. This
         // means at least one path of the regex successfully completed!
         let mut succeeded = None;
@@ -381,37 +363,30 @@ impl<'a> PosixRegexMatcher<'a> {
         loop {
             let next = self.input.get(self.offset).cloned();
 
-            let mut index = 0;
-            let mut remove = 0;
-
             let mut insert = self.expand(&mut branches);
             branches.append(&mut insert);
 
+            // Handle zero-width stuff
             loop {
-                if index >= branches.len() {
-                    break;
-                }
-                if remove > 0 {
-                    // Just like Rust's `retain` function, shift all elements I
-                    // want to keep back and `truncate` when I'm done.
-                    branches.swap(index, index-remove);
-                }
-                let branch = &mut branches[index-remove];
-                index += 1;
+                let mut index = 0;
+                let mut remove = 0;
+                let mut insert = Vec::new();
 
-                let (ref token, Range(_, mut max)) = *branch.get_token();
-                let mut token = token;
+                while index < branches.len() {
+                    if remove > 0 {
+                        branches.swap(index, index-remove);
+                    }
+                    let branch = &mut branches[index-remove];
+                    index += 1;
 
-                let mut accepts = true;
+                    let node = branch.node();
 
-                // Step 1: Handle zero-width stuff like ^ and \<
-                loop {
-                    match token {
+                    match node.token {
                         Token::End |
                         Token::Start |
                         Token::WordEnd |
                         Token::WordStart => {
-                            accepts = accepts && match token {
+                            let accepts = match node.token {
                                 Token::End =>
                                     (!self.base.no_end && next.is_none())
                                         || (self.base.newline && next == Some(b'\n')),
@@ -422,25 +397,46 @@ impl<'a> PosixRegexMatcher<'a> {
                                 Token::WordStart => prev.map(ctype::is_word_boundary).unwrap_or(true),
                                 _ => unreachable!()
                             };
-
-                            // Skip ahead to the next token.
-                            match branch.next_branch() {
-                                Some(next) => *branch = next,
-                                None => break
+                            if accepts {
+                                branch.repeated += 1;
+                                branch.add_branches(&mut insert, self.offset);
+                            }
+                            if branch.is_finished() {
+                                succeeded = Some(branch.clone());
                             }
-                            let (ref new_token, Range(_, new_max)) = *branch.get_token();
-                            token = new_token;
-                            max = new_max;
+                            remove += 1;
                         },
-                        _ => break
+                        _ => ()
                     }
                 }
+                branches.truncate(branches.len() - remove);
+
+                if insert.is_empty() {
+                    break;
+                }
+                let mut insert2 = self.expand(&mut insert);
+                branches.append(&mut insert);
+                branches.append(&mut insert2);
+            }
+
+            let mut index = 0;
+            let mut remove = 0;
+
+            // Handle stuff
+            while index < branches.len() {
+                if remove > 0 {
+                    // Just like Rust's `retain` function, shift all elements I
+                    // want to keep back and `truncate` when I'm done.
+                    branches.swap(index, index-remove);
+                }
+                let branch = &mut branches[index-remove];
+                index += 1;
 
-                // Step 2: Check if the token isn't repeated enough times already
-                accepts = accepts && max.map(|max| branch.repeated < max).unwrap_or(true);
+                let node = branch.node();
+                let Range(_, max) = node.range;
 
                 // Step 3: Check if the token matches
-                accepts = accepts && match *token {
+                let accepts = max.map(|max| branch.repeated < max).unwrap_or(true) && match node.token {
                     Token::InternalStart => next.is_some(),
                     Token::Group { .. } => false, // <- content is already expanded and handled
 
@@ -455,32 +451,29 @@ impl<'a> PosixRegexMatcher<'a> {
                         && list.iter().any(|c| c.matches(next, self.base.case_insensitive)) == !invert
                     } else { false },
 
-                    // These will only get called if they are encountered at
-                    // EOF (because next_branch returns None), for example
-                    // "abc\>" or "^". Then we simply want to return true as to
-                    // preserve the current `accepts` status.
-                    Token::End |
-                    Token::Start |
-                    Token::WordEnd |
-                    Token::WordStart => true
+                    Token::Alternative
+                    | Token::End
+                    | Token::Root
+                    | Token::Start
+                    | Token::WordEnd
+                    | Token::WordStart => unreachable!()
                 };
 
-                if !accepts {
-                    if branch.is_explored() {
+                if accepts {
+                    branch.repeated += 1
+                } else {
+                    if branch.is_finished() {
                         succeeded = Some(branch.clone());
                     }
                     remove += 1;
-                    continue;
                 }
-
-                branch.repeated += 1;
             }
             let end = branches.len() - remove;
             branches.truncate(end);
 
             if branches.is_empty() ||
                     // The internal start thing is lazy, not greedy:
-                    (succeeded.is_some() && branches.iter().all(|t| t.get_token().0 == Token::InternalStart)) {
+                    (succeeded.is_some() && branches.iter().all(|t| t.node().token == Token::InternalStart)) {
                 return succeeded.map(|branch| branch.prev);
             }
 
@@ -623,7 +616,7 @@ mod tests {
         );
         assert_eq!(
             matches(r"h\(i\)", "hello hi lol"),
-            vec!(abox![Some((6, 8)), Some((7, 8))])
+            vec![abox![Some((6, 8)), Some((7, 8))]]
         );
         assert_eq!(
             matches_exact(r"\(\([[:alpha:]]\)*\)", "abcdefg"),
@@ -642,6 +635,13 @@ mod tests {
             Some(abox![Some((0, 4)), Some((2, 3)), None, Some((3, 4))])
         );
     }
+    //FIXME #[test]
+    //FIXME fn matches_is_lazy() {
+    //FIXME     assert_eq!(
+    //FIXME         matches(r"\(hi\)\+", "hello hihi kek"),
+    //FIXME         vec![abox![Some((6, 10)), Some((6, 10))]]
+    //FIXME     );
+    //FIXME }
     #[test]
     fn start_and_end() {
         assert!(matches_exact("^abc$", "abc").is_some());

+ 387 - 0
src/tree.rs

@@ -0,0 +1,387 @@
+#[cfg(feature = "no_std")]
+use std::prelude::*;
+
+use std::fmt;
+use std::ops::{Index, IndexMut};
+
+use compile::{Token, Range};
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub struct NodeId(usize); // Position of a node inside the arena slice
+impl From<usize> for NodeId {
+    fn from(id: usize) -> Self {
+        NodeId(id) // Wraps a raw arena index without any bounds check
+    }
+}
+impl From<NodeId> for usize {
+    fn from(id: NodeId) -> usize {
+        id.0 // Unwraps back to the raw arena index
+    }
+}
+
+#[derive(Clone)]
+pub struct Node { // One entry of the tree arena
+    pub token: Token, // The token stored at this node
+    pub range: Range, // Repetition bounds attached to the token
+    pub end: bool, // Set by Tree::mark_end_of; shown as " ending" in Debug output
+    pub parent: Option<NodeId>, // Enclosing node, if any
+    pub next_sibling: Option<NodeId>, // Next node under the same parent, if any
+    pub child: Option<NodeId> // First child; later children follow via next_sibling
+}
+impl Node {
+    pub fn children<'a>(&self, arena: &'a Tree) -> NodeIter<'a> {
+        NodeIter(arena, self.child) // Iterator over child ids, starting at the first child
+    }
+}
+impl fmt::Debug for Node {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{:?} {:?}", self.token, self.range)?; // Token first, then its range
+        if self.end {
+            write!(f, " ending")?; // Suffix only present on nodes flagged as endings
+        }
+        Ok(())
+    }
+}
+
+pub struct NodeIter<'a>(&'a Tree, Option<NodeId>);
+impl<'a> Iterator for NodeIter<'a> {
+    type Item = NodeId; // Yields ids; index the tree with them to reach the nodes
+    fn next(&mut self) -> Option<Self::Item> {
+        if let Some(next) = self.1 {
+            self.1 = self.0[next].next_sibling; // Advance along the sibling chain
+            Some(next)
+        } else {
+            None // Sibling chain exhausted
+        }
+    }
+}
+
+pub struct Checkpoint { // Saved builder position, produced by TreeBuilder::checkpoint
+    cursor: Option<NodeId> // The builder cursor at the time the checkpoint was taken
+}
+
+#[derive(Default)]
+pub struct TreeBuilder { // Incrementally fills the node arena that finish() turns into a Tree
+    arena: Vec<Node>, // All nodes created so far, indexed by NodeId
+    parent: Option<NodeId>, // Internal node that newly inserted nodes are placed under
+    cursor: Option<NodeId> // Previous sibling at the current level; None right after opening a node
+}
+impl TreeBuilder {
+    fn insert(&mut self, token: Token, range: Range) -> NodeId { // Appends a node under the current parent
+        let id = NodeId::from(self.arena.len());
+        self.arena.push(Node {
+            token,
+            range,
+            end: false,
+            parent: self.parent,
+            next_sibling: None,
+            child: None
+        });
+        if let Some(prev) = self.cursor {
+            self.arena[usize::from(prev)].next_sibling = Some(id); // Chain after the previous sibling
+        }
+        if let Some(parent) = self.parent {
+            self.arena[usize::from(parent)].child = self.arena[usize::from(parent)].child.or(Some(id)); // Keeps an existing first child
+        }
+        id
+    }
+    pub fn leaf(&mut self, token: Token, range: Range) { // Adds a childless node and moves the cursor onto it
+        self.cursor = Some(self.insert(token, range));
+    }
+    pub fn start_internal(&mut self, token: Token, range: Range) { // Adds a node and opens it as the current parent
+        self.parent = Some(self.insert(token, range));
+        self.cursor = None;
+    }
+    pub fn finish_internal(&mut self) { // Closes the current parent, which then becomes the cursor
+        self.cursor = self.parent;
+        self.parent = self.parent.and_then(|parent| self.arena[usize::from(parent)].parent);
+    }
+    pub fn checkpoint(&self) -> Checkpoint { // Records the cursor for a later start_internal_at
+        Checkpoint {
+            cursor: self.cursor
+        }
+    }
+    pub fn start_internal_at(&mut self, checkpoint: Checkpoint, token: Token, range: Range) { // Wraps nodes added since the checkpoint in a new parent
+        let parent = if let Some(from) = checkpoint.cursor { // Case 1: the checkpoint had a cursor
+            let id = NodeId::from(self.arena.len());
+            self.arena.push(Node {
+                token,
+                range,
+                end: false,
+                parent: self.parent,
+                next_sibling: None,
+                child: self.arena[usize::from(from)].next_sibling // Adopt everything that followed the checkpoint node
+            });
+            self.arena[usize::from(from)].next_sibling = Some(id); // The new node takes their place in the chain
+            id
+        } else if let Some(parent) = self.parent { // Case 2: no cursor, but a parent is open
+            let id = NodeId::from(self.arena.len());
+            self.arena.push(Node {
+                token,
+                range,
+                end: false,
+                parent: self.parent,
+                next_sibling: None,
+                child: self.arena[usize::from(parent)].child // Adopt all current children of the open parent
+            });
+            self.arena[usize::from(parent)].child = Some(id); // The new node becomes that parent's child
+            id
+        } else { // Case 3: neither a checkpoint cursor nor an open parent
+            let id = NodeId::from(self.arena.len());
+            self.arena.push(Node {
+                token,
+                range,
+                end: false,
+                parent: None,
+                next_sibling: None,
+                child: self.cursor // Adopts the current cursor node as its child
+            });
+            id
+        };
+        // Re-parent every adopted child so it points at the new node
+        let mut next = self.arena[usize::from(parent)].child;
+        while let Some(node) = next {
+            let node = &mut self.arena[usize::from(node)];
+            next = node.next_sibling;
+            node.parent = Some(parent);
+        }
+        self.parent = Some(parent);
+        self.cursor = None;
+    }
+    pub fn finish(self) -> Tree { // Consumes the builder; the cursor node becomes the root
+        assert!(self.cursor.is_some(), "no item"); // Panics when no cursor is set
+        let cursor = self.cursor.unwrap();
+
+        Tree {
+            arena: self.arena.into_boxed_slice(),
+            root: cursor
+        }
+    }
+}
+
+#[derive(Clone)]
+pub struct Tree { // Node arena plus the id of its root node
+    pub arena: Box<[Node]>, // Node storage, indexed by NodeId
+    pub root: NodeId // Node where traversal of the tree starts
+}
+impl Tree {
+    fn mark_end_of(&mut self, root: NodeId) {
+        // abc(de)? = (, )
+        // (ab)?c(de)? = (, )
+        // ab?c? = b, c
+        // ab?(c*) = b, c
+        //
+        // Algorithm: Find the first in a series of trailing optional nodes and
+        // mark it and all the nodes after it as endings, recursing into any
+        // marked node that has children.
+        let mut next = Some(root);
+        while let Some(alternation) = next { // Visit `root` and each of its following siblings
+            next = self.arena[usize::from(alternation)].next_sibling;
+
+            let mut end = None; // First node of the trailing optional run, once found
+            let mut next = self[alternation].child;
+            let mut nested: usize = 0; // How many required groups have been descended into
+            'outer: while let Some(id) = next {
+                let node = &self[id];
+
+                // Mark the first optional node, or reset if it's not optional
+                let Range(min, _) = node.range;
+                if min == 0 {
+                    end = end.or(Some(id));
+                } else {
+                    if node.child.is_some() {
+                        // Descend into required groups
+                        nested += 1;
+                        next = node.child;
+                        continue;
+                    } else {
+                        end = None;
+                    }
+                }
+                let mut me = Some(node);
+                while me.map(|me| me.next_sibling.is_none()).unwrap_or(false) { // Last sibling: climb back out
+                    match nested.checked_sub(1) {
+                        Some(new) => nested = new,
+                        None => break 'outer
+                    }
+                    me = me.unwrap().parent.map(|id| &self[id]);
+                }
+                next = me.and_then(|me| me.next_sibling);
+            }
+
+            // Mark `end` and every sibling after it as an ending
+            let mut next = end;
+            while let Some(node) = next {
+                let node = &mut self[node];
+                next = node.next_sibling;
+                node.end = true;
+                if let Some(child) = node.child {
+                    // Find any ends if this node ends up expanded
+                    self.mark_end_of(child);
+                }
+            }
+        }
+    }
+    pub fn mark_end(&mut self) { // Sets the `end` flags, starting from the first child of the root
+        if let Some(alternative) = self[self.root].child {
+            self.mark_end_of(alternative);
+        }
+    }
+}
+impl Index<NodeId> for Tree {
+    type Output = Node;
+    fn index(&self, index: NodeId) -> &Node {
+        &self.arena[usize::from(index)] // Slice indexing: panics if the id is out of bounds
+    }
+}
+impl IndexMut<NodeId> for Tree {
+    fn index_mut(&mut self, index: NodeId) -> &mut Node {
+        &mut self.arena[usize::from(index)] // Same lookup as `index`, but mutable
+    }
+}
+impl fmt::Debug for Tree {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Writes one node per line, depth first
+        let mut next = Some(self.root);
+        let mut nested: usize = 0; // Current depth; each level indents by two spaces
+        'outer: while let Some(id) = next {
+            let node = &self[id];
+            writeln!(f, "{:indent$}{:?}", "", node, indent = nested * 2)?;
+            if node.child.is_some() {
+                nested += 1; // Descend to the first child
+                next = node.child;
+            } else {
+                let mut me = Some(node);
+                while me.map(|me| me.next_sibling.is_none()).unwrap_or(false) { // Climb until a sibling exists
+                    match nested.checked_sub(1) {
+                        Some(new) => nested = new,
+                        None => break 'outer
+                    }
+                    me = me.unwrap().parent.map(|id| &self[id]);
+                }
+                next = me.and_then(|me| me.next_sibling);
+            }
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use compile::PosixRegexBuilder;
+
+    fn sanity_check(tree: &Tree) { // Walks the tree and asserts each parent link matches the walk
+        let mut next = Some(tree.root);
+        let mut parent = None; // Parent expected for the node visited next
+        while let Some(id) = next {
+            let node = &tree[id];
+            assert_eq!(parent, node.parent);
+
+            if let Some(child) = node.child {
+                next = Some(child); // Go down first
+                parent = Some(id);
+            } else {
+                let mut node = Some(id);
+                while node.map(|node| tree[node].next_sibling.is_none()).unwrap_or(false) {
+                    node = tree[node.unwrap()].parent; // No sibling here: climb one level
+                }
+                next = node.and_then(|node| tree[node].next_sibling);
+                parent = node.and_then(|node| tree[node].parent);
+            }
+        }
+    }
+
+    #[test]
+    fn simple_builder() { // Builds a tree top-down with start_internal / finish_internal
+        let mut builder = TreeBuilder::default();
+        builder.start_internal(Token::Root, Range(1, Some(1)));
+            builder.start_internal(Token::Alternative, Range(1, Some(1)));
+                builder.leaf(Token::Start, Range(1, Some(1)));
+                builder.start_internal(Token::Group(1), Range(1, Some(1)));
+                    builder.start_internal(Token::Alternative, Range(1, Some(1)));
+                        builder.leaf(Token::Any, Range(1, Some(1)));
+                    builder.finish_internal();
+                builder.finish_internal();
+            builder.finish_internal();
+            builder.start_internal(Token::Alternative, Range(1, Some(1)));
+                builder.leaf(Token::End, Range(1, Some(1)));
+            builder.finish_internal();
+        builder.finish_internal();
+
+        let tree = builder.finish();
+        sanity_check(&tree);
+
+        assert_eq!(
+            format!("{:?}", tree),
+            "\
+Root 1..1
+  Alternative 1..1
+    ^ 1..1
+    Group(1) 1..1
+      Alternative 1..1
+        . 1..1
+  Alternative 1..1
+    $ 1..1
+"
+        );
+    }
+    #[test]
+    fn builder_checkpoint() { // Same expected output as simple_builder, built via checkpoints
+        let mut builder = TreeBuilder::default();
+        builder.start_internal(Token::Root, Range(1, Some(1)));
+            let mut alternation = builder.checkpoint();
+                builder.leaf(Token::Start, Range(1, Some(1)));
+                let group = builder.checkpoint();
+                    builder.start_internal(Token::Alternative, Range(1, Some(1)));
+                        builder.leaf(Token::Any, Range(1, Some(1)));
+                    builder.finish_internal();
+                builder.start_internal_at(group, Token::Group(1), Range(1, Some(1))); // Wraps the alternative above
+                builder.finish_internal();
+            builder.start_internal_at(alternation, Token::Alternative, Range(1, Some(1)));
+            builder.finish_internal();
+            alternation = builder.checkpoint();
+                builder.leaf(Token::End, Range(1, Some(1)));
+            builder.start_internal_at(alternation, Token::Alternative, Range(1, Some(1)));
+            builder.finish_internal();
+        builder.finish_internal();
+
+        let tree = builder.finish();
+        sanity_check(&tree);
+
+        assert_eq!(
+            format!("{:?}", tree),
+            "\
+Root 1..1
+  Alternative 1..1
+    ^ 1..1
+    Group(1) 1..1
+      Alternative 1..1
+        . 1..1
+  Alternative 1..1
+    $ 1..1
+"
+        );
+    }
+    #[test]
+    fn mark_end() { // Checks which nodes of a compiled pattern print as ending
+        let tree = PosixRegexBuilder::new(br"a\?bc\?\(d*\)\|bb\?").compile_tokens().unwrap();
+        sanity_check(&tree);
+
+        assert_eq!(
+            format!("{:?}", tree),
+            "\
+Root 1..1
+  Alternative 1..1
+    'a' 0..1
+    'b' 1..1
+    'c' 0..1 ending
+    Group(1) 1..1 ending
+      Alternative 1..1
+        'd' 0.. ending
+  Alternative 1..1
+    'b' 1..1
+    'b' 0..1 ending
+"
+        );
+    }
+}