//! The regex "compiler", which parses the regex itself. //! Produces a matcher ready to match input. #[cfg(feature = "no_std")] use std::prelude::*; use std::collections::HashMap; use std::fmt; use ::{ctype, PosixRegex}; /// Repetition bounds, for example + is (1, None), and ? is (0, Some(1)) #[derive(Clone, PartialEq, Eq)] pub struct Range(pub u32, pub Option); impl fmt::Debug for Range { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Range(start, None) => write!(f, "{}..", start), Range(start, Some(end)) => write!(f, "{}..{}", start, end), } } } /// An item inside square brackets, like [abc] or [[:digit:]] #[derive(Clone, Debug, PartialEq, Eq)] pub enum Collation { Char(u8), Class(fn(u8) -> bool) } impl Collation { /// Compare this collation to a character pub fn matches(&self, other: u8) -> bool { match *self { Collation::Char(me) => me == other, Collation::Class(f) => f(other) } } } /// A single "compiled" token, such as a `.` or a character literal #[derive(Clone, PartialEq, Eq)] pub enum Token { Any, Char(u8), End, Group(Vec>), OneOf { invert: bool, list: Vec }, Start, WordEnd, WordStart } impl fmt::Debug for Token { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { Token::Any => write!(f, "."), Token::Char(c) => write!(f, "{:?}", c as char), Token::End => write!(f, "$"), Token::Group(ref inner) => write!(f, "Group({:?})", inner), Token::OneOf { invert, ref list } => write!(f, "[invert: {}; {:?}]", invert, list), Token::Start => write!(f, "^"), Token::WordEnd => write!(f, ">"), Token::WordStart => write!(f, "<") } } } /// An error that occurred while compiling the regex #[derive(Clone, Debug, PartialEq, Eq)] pub enum Error { EOF, EmptyRepetition, Expected(u8, Option), IllegalRange, IntegerOverflow, LeadingRepetition, UnclosedRepetition, UnexpectedToken(u8), UnknownClass(Vec), UnknownCollation } /// A regex builder struct pub struct PosixRegexBuilder<'a> { input: &'a [u8], classes: HashMap<&'a [u8], fn(u8) -> bool> } impl<'a> PosixRegexBuilder<'a> { /// Create a new instance that is ready to parse the regex `input` pub fn new(input: &'a [u8]) -> Self { Self { input, classes: HashMap::new() } } /// Add a custom collation class, for use within square brackets (such as [[:digit:]]) pub fn with_class(mut self, name: &'a [u8], callback: fn(u8) -> bool) -> Self { self.classes.insert(name, callback); self } /// Add all the default collation classes, like [[:digit:]] and [[:alnum:]] pub fn with_default_classes(mut self) -> Self { self.classes.reserve(12); self.classes.insert(b"alnum", ctype::is_alnum); self.classes.insert(b"alpha", ctype::is_alpha); self.classes.insert(b"blank", ctype::is_blank); self.classes.insert(b"cntrl", ctype::is_cntrl); self.classes.insert(b"digit", ctype::is_digit); self.classes.insert(b"graph", ctype::is_graph); self.classes.insert(b"lower", ctype::is_lower); self.classes.insert(b"print", ctype::is_print); self.classes.insert(b"punct", ctype::is_punct); self.classes.insert(b"space", ctype::is_space); self.classes.insert(b"upper", ctype::is_upper); self.classes.insert(b"xdigit", ctype::is_xdigit); self } /// "Compile" this regex to a struct ready to match input pub fn compile(&mut self) -> Result { let search = self.compile_inner(true)?; Ok(PosixRegex { search }) } fn consume(&mut self, amount: usize) { self.input = &self.input[amount..]; } fn take_int(&mut self) -> Result, Error> { let mut out: Option = None; while let Some(&c @ b'0'..=b'9') = self.input.first() { self.consume(1); out = Some(out.unwrap_or(0) .checked_mul(10) .and_then(|out| out.checked_add((c - b'0') as u32)) .ok_or(Error::IntegerOverflow)?); } Ok(out) } fn next(&mut self) -> Result { self.input.first() .map(|&c| { self.consume(1); c }) .ok_or(Error::EOF) } fn expect(&mut self, c: u8) -> Result<(), Error> { if self.input.first() != Some(&c) { return Err(Error::Expected(c, self.input.first().cloned())); } self.consume(1); Ok(()) } fn compile_inner(&mut self, toplevel: bool) -> Result, Error> { let mut search: Vec<(Token, Range)> = Vec::new(); while let Some(&c) = self.input.first() { self.consume(1); let token = match c { b'^' => Token::Start, b'$' => Token::End, b'.' => Token::Any, b'*' => if let Some(last) = search.last_mut() { last.1 = Range(0, None); continue; } else { return Err(Error::LeadingRepetition); }, b'[' => { let mut list = Vec::new(); let invert = self.input.first() == Some(&b'^'); if invert { self.consume(1); } loop { let mut c = self.next()?; let mut push = true; if c == b'[' { // TODO: Handle collation characters properly, // because currently idk what they are and only // have the behavior of `grep` to go on. match self.next()? { b'.' => { c = self.next()?; self.expect(b'.')?; self.expect(b']')?; }, b'=' => { c = self.next()?; self.expect(b'=')?; self.expect(b']')?; }, b':' => { let end = self.input.iter().position(|&c| c == b':').ok_or(Error::EOF)?; let key = &self.input[..end]; let class = *self.classes.get(key).ok_or_else(|| Error::UnknownClass(key.to_vec()))?; self.consume(end + 1); self.expect(b']')?; list.push(Collation::Class(class)); push = false; }, _ => return Err(Error::UnknownCollation) } } if push { list.push(Collation::Char(c)); if self.input.first() == Some(&b'-') && self.input.get(1) != Some(&b']') { self.consume(1); let dest = self.next()?; for c in (c+1)..=dest { list.push(Collation::Char(c)); } } } if self.input.first() == Some(&b']') { self.consume(1); break; } } Token::OneOf { invert, list } }, b'\\' => match self.input.first() { None => return Err(Error::EOF), Some(b'|') | Some(b')') if !toplevel => return Ok(search), Some(&c @ b'|') | Some(&c @ b')') if toplevel => return Err(Error::UnexpectedToken(c)), Some(&c) => { self.consume(1); match c { b'(' => { let mut branches = Vec::new(); loop { let inner = self.compile_inner(false)?; branches.push(inner); match self.next()? { b'|' => (), b')' => break, _ => unreachable!() } } Token::Group(branches) }, b'<' => Token::WordStart, b'>' => Token::WordEnd, b'?' | b'+' => if let Some(last) = search.last_mut() { last.1 = match c { b'?' => Range(0, Some(1)), b'+' => Range(1, None), _ => unreachable!() }; continue; } else { return Err(Error::LeadingRepetition); }, b'{' => if let Some(last) = search.last_mut() { let first = self.take_int()?.ok_or(Error::EmptyRepetition)?; let mut second = Some(first); if let Some(b',') = self.input.first() { self.consume(1); second = self.take_int()?; } if self.input.first() == Some(&b'}') { self.consume(1); } else if self.input.starts_with(br"\}") { self.consume(2); } else { return Err(Error::UnclosedRepetition); } if second.map(|second| first > second).unwrap_or(false) { return Err(Error::IllegalRange); } last.1 = Range(first, second); continue; } else { return Err(Error::LeadingRepetition); }, c => Token::Char(c) } } }, c => Token::Char(c) }; search.push((token, Range(1, Some(1)))); } Ok(search) } } #[cfg(test)] mod tests { use super::*; fn compile(input: &[u8]) -> Vec<(Token, Range)> { PosixRegexBuilder::new(input) .with_default_classes() .compile() .expect("error compiling regex") .search } fn t(t: Token) -> (Token, Range) { (t, Range(1, Some(1))) } fn c(c: u8) -> (Token, Range) { t(Token::Char(c)) } #[test] fn basic() { assert_eq!(compile(b"abc"), &[c(b'a'), c(b'b'), c(b'c')]); } #[test] fn groups() { assert_eq!(compile(br"\(abc\|bcd\|cde\)"), &[t(Token::Group(vec![ vec![c(b'a'), c(b'b'), c(b'c')], vec![c(b'b'), c(b'c'), c(b'd')], vec![c(b'c'), c(b'd'), c(b'e')] ]))]); assert_eq!(compile(br"\(abc\|\(bcd\|cde\)\)"), &[ t(Token::Group(vec![ vec![c(b'a'), c(b'b'), c(b'c')], vec![t(Token::Group(vec![ vec![c(b'b'), c(b'c'), c(b'd')], vec![c(b'c'), c(b'd'), c(b'e')] ]))] ])) ]); } #[test] fn words() { assert_eq!( compile(br"\"), &[t(Token::WordStart), c(b'w'), c(b'o'), c(b'r'), c(b'd'), t(Token::WordEnd)] ); } #[test] fn repetitions() { assert_eq!( compile(br"yeee*"), &[c(b'y'), c(b'e'), c(b'e'), (Token::Char(b'e'), Range(0, None))] ); assert_eq!( compile(br"yee\?"), &[c(b'y'), c(b'e'), (Token::Char(b'e'), Range(0, Some(1)))] ); assert_eq!( compile(br"yee\+"), &[c(b'y'), c(b'e'), (Token::Char(b'e'), Range(1, None))] ); assert_eq!( compile(br"ye\{2}"), &[c(b'y'), (Token::Char(b'e'), Range(2, Some(2)))] ); assert_eq!( compile(br"ye\{2,}"), &[c(b'y'), (Token::Char(b'e'), Range(2, None))] ); assert_eq!( compile(br"ye\{2,3}"), &[c(b'y'), (Token::Char(b'e'), Range(2, Some(3)))] ); } #[test] fn bracket() { assert_eq!( compile(b"[abc]"), &[t(Token::OneOf { invert: false, list: vec![ Collation::Char(b'a'), Collation::Char(b'b'), Collation::Char(b'c') ] })] ); assert_eq!( compile(b"[^abc]"), &[t(Token::OneOf { invert: true, list: vec![ Collation::Char(b'a'), Collation::Char(b'b'), Collation::Char(b'c') ] })] ); assert_eq!( compile(b"[]] [^]]"), &[ t(Token::OneOf { invert: false, list: vec![ Collation::Char(b']') ] }), c(b' '), t(Token::OneOf { invert: true, list: vec![ Collation::Char(b']') ] }), ] ); assert_eq!( compile(b"[0-3] [a-c] [-1] [1-]"), &[ t(Token::OneOf { invert: false, list: vec![ Collation::Char(b'0'), Collation::Char(b'1'), Collation::Char(b'2'), Collation::Char(b'3') ] }), c(b' '), t(Token::OneOf { invert: false, list: vec![ Collation::Char(b'a'), Collation::Char(b'b'), Collation::Char(b'c') ] }), c(b' '), t(Token::OneOf { invert: false, list: vec![ Collation::Char(b'-'), Collation::Char(b'1') ] }), c(b' '), t(Token::OneOf { invert: false, list: vec![ Collation::Char(b'1'), Collation::Char(b'-') ] }) ] ); assert_eq!( compile(b"[[.-.]-/]"), &[ t(Token::OneOf { invert: false, list: vec![ Collation::Char(b'-'), Collation::Char(b'.'), Collation::Char(b'/') ] }) ] ); assert_eq!( compile(b"[[:digit:][:upper:]]"), &[ t(Token::OneOf { invert: false, list: vec![ Collation::Class(ctype::is_digit), Collation::Class(ctype::is_upper) ] }) ] ); } }