浏览代码

Add support for extended regex

Jeremy Soller 1 周之前
父节点
当前提交
178c47d005
共有 1 个文件被更改，包括 291 次插入和 6 次删除
  1. 291 6
      src/compile.rs

+ 291 - 6
src/compile.rs

@@ -117,6 +117,7 @@ pub struct PosixRegexBuilder<'a> {
     classes: HashMap<&'a [u8], fn(u8) -> bool>,
     group_id: usize,
     builder: TreeBuilder,
+    extended: bool,
 }
 impl<'a> PosixRegexBuilder<'a> {
     /// Create a new instance that is ready to parse the regex `input`
@@ -126,6 +127,7 @@ impl<'a> PosixRegexBuilder<'a> {
             classes: HashMap::new(),
             group_id: 1,
             builder: TreeBuilder::default(),
+            extended: false,
         }
     }
     /// Add a custom collation class, for use within square brackets (such as `[[:digit:]]`)
@@ -153,6 +155,11 @@ impl<'a> PosixRegexBuilder<'a> {
 
         self
     }
+    /// Select POSIX extended regex syntax when `extended` is `true`; `false` keeps the default basic syntax
+    pub fn extended(mut self, extended: bool) -> Self {
+        self.extended = extended;
+        self
+    }
     /// "Compile" this regex to a struct ready to match input
     pub fn compile(self) -> Result<PosixRegex<'static>, Error> {
         let tree = self.compile_tokens()?;
@@ -203,9 +210,9 @@ impl<'a> PosixRegexBuilder<'a> {
             let new = match c {
                 b'*' => Some((1, Range(0, None))),
                 b'\\' => match self.input.get(1) {
-                    Some(b'?') => Some((2, Range(0, Some(1)))),
-                    Some(b'+') => Some((2, Range(1, None))),
-                    Some(b'{') => {
+                    Some(b'?') if !self.extended => Some((2, Range(0, Some(1)))),
+                    Some(b'+') if !self.extended => Some((2, Range(1, None))),
+                    Some(b'{') if !self.extended => {
                         self.consume(2);
                         let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
                         let mut second = Some(first);
@@ -228,6 +235,29 @@ impl<'a> PosixRegexBuilder<'a> {
                     }
                     _ => None,
                 },
+                b'?' if self.extended => Some((1, Range(0, Some(1)))),
+                b'+' if self.extended => Some((1, Range(1, None))),
+                b'{' if self.extended => {
+                    self.consume(1);
+                    let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
+                    let mut second = Some(first);
+                    if let Some(b',') = self.input.first() {
+                        self.consume(1);
+                        second = self.take_int()?;
+                    }
+                    if self.input.first() == Some(&b'}') {
+                        self.consume(1);
+                    } else if self.input.starts_with(br"\}") {
+                        self.consume(2);
+                    } else {
+                        return Err(Error::UnclosedRepetition);
+                    }
+                    if second.map(|second| first > second).unwrap_or(false) {
+                        return Err(Error::IllegalRange);
+                    }
+                    range = Range(first, second);
+                    None
+                }
                 _ => None,
             };
             if let Some((consume, new)) = new {
@@ -329,7 +359,7 @@ impl<'a> PosixRegexBuilder<'a> {
                     Token::BackRef(id)
                 }
                 b'\\' => match self.next()? {
-                    b'(' => {
+                    b'(' if !self.extended => {
                         let id = self.group_id;
                         self.group_id += 1;
                         let checkpoint = self.builder.checkpoint();
@@ -340,8 +370,8 @@ impl<'a> PosixRegexBuilder<'a> {
                         self.builder.finish_internal();
                         continue;
                     }
-                    b')' => break,
-                    b'|' => {
+                    b')' if !self.extended => break,
+                    b'|' if !self.extended => {
                         self.builder.finish_internal();
                         self.builder
                             .start_internal(Token::Alternative, Range(1, Some(1)));
@@ -370,6 +400,24 @@ impl<'a> PosixRegexBuilder<'a> {
                     b't' => Token::Char(b'\t'),
                     c => Token::Char(c),
                 },
+                b'(' if self.extended => {
+                    let id = self.group_id;
+                    self.group_id += 1;
+                    let checkpoint = self.builder.checkpoint();
+                    self.parse()?;
+                    let range = self.parse_range()?;
+                    self.builder
+                        .start_internal_at(checkpoint, Token::Group(id), range);
+                    self.builder.finish_internal();
+                    continue;
+                }
+                b')' if self.extended => break,
+                b'|' if self.extended => {
+                    self.builder.finish_internal();
+                    self.builder
+                        .start_internal(Token::Alternative, Range(1, Some(1)));
+                    continue;
+                }
                 c => Token::Char(c),
             };
             let range = self.parse_range()?;
@@ -617,6 +665,243 @@ Root 1..1
       Alternative 1..1
         {invert: false, ['a', 'b', 'c']} 1..1
     \\1 1..1
+"
+        )
+    }
+
+    fn compile_extended(input: &[u8]) -> String { // Debug-formats the token tree built from input in extended mode; panics if compilation fails
+        format!(
+            "{:?}",
+            PosixRegexBuilder::new(input)
+                .with_default_classes()
+                .extended(true)
+                .compile_tokens()
+                .expect("error compiling regex")
+        )
+    }
+
+    #[test] // Plain literal bytes compile to one char token each, all inside a single alternative
+    fn basic_extended() {
+        assert_eq!(
+            compile_extended(b"abc"),
+            "\
+Root 1..1
+  Alternative 1..1
+    'a' 1..1
+    'b' 1..1
+    'c' 1..1
+"
+        );
+    }
+    #[test] // Unescaped parentheses open numbered groups and an unescaped | separates alternatives
+    fn groups_extended() {
+        assert_eq!( // Single group with three alternatives
+            compile_extended(br"(abc|bcd|cde)"),
+            "\
+Root 1..1
+  Alternative 1..1
+    Group(1) 1..1
+      Alternative 1..1
+        'a' 1..1
+        'b' 1..1
+        'c' 1..1
+      Alternative 1..1
+        'b' 1..1
+        'c' 1..1
+        'd' 1..1
+      Alternative 1..1
+        'c' 1..1
+        'd' 1..1
+        'e' 1..1
+"
+        );
+        assert_eq!( // Nested group: the inner group receives the next id
+            compile_extended(br"(abc|(bcd|cde))"),
+            "\
+Root 1..1
+  Alternative 1..1
+    Group(1) 1..1
+      Alternative 1..1
+        'a' 1..1
+        'b' 1..1
+        'c' 1..1
+      Alternative 1..1
+        Group(2) 1..1
+          Alternative 1..1
+            'b' 1..1
+            'c' 1..1
+            'd' 1..1
+          Alternative 1..1
+            'c' 1..1
+            'd' 1..1
+            'e' 1..1
+"
+        );
+    }
+    #[test] // Escaped \< and \> compile to the < and > tokens placed around the literal word
+    fn words_extended() {
+        assert_eq!(
+            compile_extended(br"\<word\>"),
+            "\
+Root 1..1
+  Alternative 1..1
+    < 1..1
+    'w' 1..1
+    'o' 1..1
+    'r' 1..1
+    'd' 1..1
+    > 1..1
+"
+        );
+    }
+
+    #[test] // Unescaped repetition operators set the range printed after the preceding char
+    fn repetitions_extended() {
+        assert_eq!( // Star: zero or more
+            compile_extended(br"yeee*"),
+            "\
+Root 1..1
+  Alternative 1..1
+    'y' 1..1
+    'e' 1..1
+    'e' 1..1
+    'e' 0..
+"
+        );
+        assert_eq!( // Question mark: zero or one
+            compile_extended(br"yee?"),
+            "\
+Root 1..1
+  Alternative 1..1
+    'y' 1..1
+    'e' 1..1
+    'e' 0..1
+"
+        );
+        assert_eq!( // Plus: one or more
+            compile_extended(br"yee+"),
+            "\
+Root 1..1
+  Alternative 1..1
+    'y' 1..1
+    'e' 1..1
+    'e' 1..
+"
+        );
+        assert_eq!( // Exact count
+            compile_extended(br"ye{2}"),
+            "\
+Root 1..1
+  Alternative 1..1
+    'y' 1..1
+    'e' 2..2
+"
+        );
+        assert_eq!( // Lower bound only
+            compile_extended(br"ye{2,}"),
+            "\
+Root 1..1
+  Alternative 1..1
+    'y' 1..1
+    'e' 2..
+"
+        );
+        assert_eq!( // Lower and upper bound
+            compile_extended(br"ye{2,3}"),
+            "\
+Root 1..1
+  Alternative 1..1
+    'y' 1..1
+    'e' 2..3
+"
+        );
+    }
+    #[test] // Bracket expressions produce the same token output as expected here when compiled in extended mode
+    fn bracket_extended() {
+        assert_eq!( // Simple list of members
+            compile_extended(b"[abc]"),
+            "\
+Root 1..1
+  Alternative 1..1
+    {invert: false, ['a', 'b', 'c']} 1..1
+"
+        );
+        assert_eq!( // Leading caret sets invert to true
+            compile_extended(b"[^abc]"),
+            "\
+Root 1..1
+  Alternative 1..1
+    {invert: true, ['a', 'b', 'c']} 1..1
+"
+        );
+        assert_eq!( // A closing bracket placed first is a literal member
+            compile_extended(b"[]] [^]]"),
+            "\
+Root 1..1
+  Alternative 1..1
+    {invert: false, [']']} 1..1
+    ' ' 1..1
+    {invert: true, [']']} 1..1
+"
+        );
+        assert_eq!( // Ranges expand to each member; a dash at either edge is literal
+            compile_extended(b"[0-3] [a-c] [-1] [1-]"),
+            "\
+Root 1..1
+  Alternative 1..1
+    {invert: false, ['0', '1', '2', '3']} 1..1
+    ' ' 1..1
+    {invert: false, ['a', 'b', 'c']} 1..1
+    ' ' 1..1
+    {invert: false, ['-', '1']} 1..1
+    ' ' 1..1
+    {invert: false, ['1', '-']} 1..1
+"
+        );
+        assert_eq!( // A [.-.] element can start a range
+            compile_extended(b"[[.-.]-/]"),
+            "\
+Root 1..1
+  Alternative 1..1
+    {invert: false, ['-', '.', '/']} 1..1
+"
+        );
+        assert_eq!( // Named classes print as the pointers of their ctype predicate functions
+            compile_extended(b"[[:digit:][:upper:]]"),
+            format!(
+                "\
+Root 1..1
+  Alternative 1..1
+    {{invert: false, [{:p}, {:p}]}} 1..1
+",
+                ctype::is_digit as fn(u8) -> bool,
+                ctype::is_upper as fn(u8) -> bool
+            )
+        );
+    }
+    #[test] // The escapes \r and \n compile to the carriage return and line feed chars
+    fn newline_extended() {
+        assert_eq!(
+            compile_extended(br"\r\n"),
+            "\
+Root 1..1
+  Alternative 1..1
+    '\\r' 1..1
+    '\\n' 1..1
+"
+        );
+    }
+    #[test] // A \1 after an unescaped group compiles to a back-reference token following Group(1)
+    fn backref_extended() {
+        assert_eq!(
+            compile_extended(br"([abc])\1"),
+            "\
+Root 1..1
+  Alternative 1..1
+    Group(1) 1..1
+      Alternative 1..1
+        {invert: false, ['a', 'b', 'c']} 1..1
+    \\1 1..1
 "
         )
     }