123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451 |
- //! The regex "compiler", which parses the regex itself.
- //! Produces a matcher ready to match input.
- #[cfg(feature = "no_std")]
- use std::prelude::*;
- use std::collections::HashMap;
- use std::fmt;
- use ::{ctype, PosixRegex};
- /// Repetition bounds, for example + is (1, None), and ? is (0, Some(1))
- #[derive(Clone, PartialEq, Eq)]
- pub struct Range(pub u32, pub Option<u32>);
- impl fmt::Debug for Range {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- match self {
- Range(start, None) => write!(f, "{}..", start),
- Range(start, Some(end)) => write!(f, "{}..{}", start, end),
- }
- }
- }
- /// An item inside square brackets, like [abc] or [[:digit:]]
- #[derive(Clone, Debug, PartialEq, Eq)]
- pub enum Collation {
- Char(u8),
- Class(fn(u8) -> bool)
- }
- impl Collation {
- /// Compare this collation to a character
- pub fn matches(&self, other: u8) -> bool {
- match *self {
- Collation::Char(me) => me == other,
- Collation::Class(f) => f(other)
- }
- }
- }
- /// A single "compiled" token, such as a `.` or a character literal
- #[derive(Clone, PartialEq, Eq)]
- pub enum Token {
- Any,
- Char(u8),
- End,
- Group(Vec<Vec<(Token, Range)>>),
- OneOf {
- invert: bool,
- list: Vec<Collation>
- },
- Start,
- WordEnd,
- WordStart
- }
- impl fmt::Debug for Token {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- match *self {
- Token::Any => write!(f, "."),
- Token::Char(c) => write!(f, "{:?}", c as char),
- Token::End => write!(f, "$"),
- Token::Group(ref inner) => write!(f, "Group({:?})", inner),
- Token::OneOf { invert, ref list } => write!(f, "[invert: {}; {:?}]", invert, list),
- Token::Start => write!(f, "^"),
- Token::WordEnd => write!(f, ">"),
- Token::WordStart => write!(f, "<")
- }
- }
- }
- /// An error that occurred while compiling the regex
- #[derive(Clone, Debug, PartialEq, Eq)]
- pub enum Error {
- EOF,
- EmptyRepetition,
- Expected(u8, Option<u8>),
- IllegalRange,
- IntegerOverflow,
- LeadingRepetition,
- UnclosedRepetition,
- UnexpectedToken(u8),
- UnknownClass(Vec<u8>),
- UnknownCollation
- }
- /// A regex builder struct
- pub struct PosixRegexBuilder<'a> {
- input: &'a [u8],
- classes: HashMap<&'a [u8], fn(u8) -> bool>
- }
- impl<'a> PosixRegexBuilder<'a> {
- /// Create a new instance that is ready to parse the regex `input`
- pub fn new(input: &'a [u8]) -> Self {
- Self {
- input,
- classes: HashMap::new()
- }
- }
- /// Add a custom collation class, for use within square brackets (such as [[:digit:]])
- pub fn with_class(mut self, name: &'a [u8], callback: fn(u8) -> bool) -> Self {
- self.classes.insert(name, callback);
- self
- }
- /// Add all the default collation classes, like [[:digit:]] and [[:alnum:]]
- pub fn with_default_classes(mut self) -> Self {
- self.classes.reserve(12);
- self.classes.insert(b"alnum", ctype::is_alnum);
- self.classes.insert(b"alpha", ctype::is_alpha);
- self.classes.insert(b"blank", ctype::is_blank);
- self.classes.insert(b"cntrl", ctype::is_cntrl);
- self.classes.insert(b"digit", ctype::is_digit);
- self.classes.insert(b"graph", ctype::is_graph);
- self.classes.insert(b"lower", ctype::is_lower);
- self.classes.insert(b"print", ctype::is_print);
- self.classes.insert(b"punct", ctype::is_punct);
- self.classes.insert(b"space", ctype::is_space);
- self.classes.insert(b"upper", ctype::is_upper);
- self.classes.insert(b"xdigit", ctype::is_xdigit);
- self
- }
- /// "Compile" this regex to a struct ready to match input
- pub fn compile(&mut self) -> Result<PosixRegex, Error> {
- let search = self.compile_inner(true)?;
- Ok(PosixRegex {
- search
- })
- }
- fn consume(&mut self, amount: usize) {
- self.input = &self.input[amount..];
- }
- fn take_int(&mut self) -> Result<Option<u32>, Error> {
- let mut out: Option<u32> = None;
- while let Some(&c @ b'0'..=b'9') = self.input.first() {
- self.consume(1);
- out = Some(out.unwrap_or(0)
- .checked_mul(10)
- .and_then(|out| out.checked_add((c - b'0') as u32))
- .ok_or(Error::IntegerOverflow)?);
- }
- Ok(out)
- }
- fn next(&mut self) -> Result<u8, Error> {
- self.input.first()
- .map(|&c| { self.consume(1); c })
- .ok_or(Error::EOF)
- }
- fn expect(&mut self, c: u8) -> Result<(), Error> {
- if self.input.first() != Some(&c) {
- return Err(Error::Expected(c, self.input.first().cloned()));
- }
- self.consume(1);
- Ok(())
- }
- fn compile_inner(&mut self, toplevel: bool) -> Result<Vec<(Token, Range)>, Error> {
- let mut search: Vec<(Token, Range)> = Vec::new();
- while let Some(&c) = self.input.first() {
- self.consume(1);
- let token = match c {
- b'^' => Token::Start,
- b'$' => Token::End,
- b'.' => Token::Any,
- b'*' => if let Some(last) = search.last_mut() {
- last.1 = Range(0, None);
- continue;
- } else {
- return Err(Error::LeadingRepetition);
- },
- b'[' => {
- let mut list = Vec::new();
- let invert = self.input.first() == Some(&b'^');
- if invert {
- self.consume(1);
- }
- loop {
- let mut c = self.next()?;
- let mut push = true;
- if c == b'[' {
- // TODO: Handle collation characters properly,
- // because currently idk what they are and only
- // have the behavior of `grep` to go on.
- match self.next()? {
- b'.' => {
- c = self.next()?;
- self.expect(b'.')?;
- self.expect(b']')?;
- },
- b'=' => {
- c = self.next()?;
- self.expect(b'=')?;
- self.expect(b']')?;
- },
- b':' => {
- let end = self.input.iter().position(|&c| c == b':').ok_or(Error::EOF)?;
- let key = &self.input[..end];
- let class = *self.classes.get(key).ok_or_else(|| Error::UnknownClass(key.to_vec()))?;
- self.consume(end + 1);
- self.expect(b']')?;
- list.push(Collation::Class(class));
- push = false;
- },
- _ => return Err(Error::UnknownCollation)
- }
- }
- if push {
- list.push(Collation::Char(c));
- if self.input.first() == Some(&b'-') && self.input.get(1) != Some(&b']') {
- self.consume(1);
- let dest = self.next()?;
- for c in (c+1)..=dest {
- list.push(Collation::Char(c));
- }
- }
- }
- if self.input.first() == Some(&b']') {
- self.consume(1);
- break;
- }
- }
- Token::OneOf {
- invert,
- list
- }
- },
- b'\\' => match self.input.first() {
- None => return Err(Error::EOF),
- Some(b'|') | Some(b')') if !toplevel => return Ok(search),
- Some(&c @ b'|') | Some(&c @ b')') if toplevel => return Err(Error::UnexpectedToken(c)),
- Some(&c) => {
- self.consume(1);
- match c {
- b'(' => {
- let mut branches = Vec::new();
- loop {
- let inner = self.compile_inner(false)?;
- branches.push(inner);
- match self.next()? {
- b'|' => (),
- b')' => break,
- _ => unreachable!()
- }
- }
- Token::Group(branches)
- },
- b'<' => Token::WordStart,
- b'>' => Token::WordEnd,
- b'?' | b'+' => if let Some(last) = search.last_mut() {
- last.1 = match c {
- b'?' => Range(0, Some(1)),
- b'+' => Range(1, None),
- _ => unreachable!()
- };
- continue;
- } else {
- return Err(Error::LeadingRepetition);
- },
- b'{' => if let Some(last) = search.last_mut() {
- let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
- let mut second = Some(first);
- if let Some(b',') = self.input.first() {
- self.consume(1);
- second = self.take_int()?;
- }
- if self.input.first() == Some(&b'}') {
- self.consume(1);
- } else if self.input.starts_with(br"\}") {
- self.consume(2);
- } else {
- return Err(Error::UnclosedRepetition);
- }
- if second.map(|second| first > second).unwrap_or(false) {
- return Err(Error::IllegalRange);
- }
- last.1 = Range(first, second);
- continue;
- } else {
- return Err(Error::LeadingRepetition);
- },
- c => Token::Char(c)
- }
- }
- },
- c => Token::Char(c)
- };
- search.push((token, Range(1, Some(1))));
- }
- Ok(search)
- }
- }
- #[cfg(test)]
- mod tests {
- use super::*;
- fn compile(input: &[u8]) -> Vec<(Token, Range)> {
- PosixRegexBuilder::new(input)
- .with_default_classes()
- .compile()
- .expect("error compiling regex")
- .search
- }
- fn t(t: Token) -> (Token, Range) {
- (t, Range(1, Some(1)))
- }
- fn c(c: u8) -> (Token, Range) {
- t(Token::Char(c))
- }
- #[test]
- fn basic() {
- assert_eq!(compile(b"abc"), &[c(b'a'), c(b'b'), c(b'c')]);
- }
- #[test]
- fn groups() {
- assert_eq!(compile(br"\(abc\|bcd\|cde\)"), &[t(Token::Group(vec![
- vec![c(b'a'), c(b'b'), c(b'c')],
- vec![c(b'b'), c(b'c'), c(b'd')],
- vec![c(b'c'), c(b'd'), c(b'e')]
- ]))]);
- assert_eq!(compile(br"\(abc\|\(bcd\|cde\)\)"), &[
- t(Token::Group(vec![
- vec![c(b'a'), c(b'b'), c(b'c')],
- vec![t(Token::Group(vec![
- vec![c(b'b'), c(b'c'), c(b'd')],
- vec![c(b'c'), c(b'd'), c(b'e')]
- ]))]
- ]))
- ]);
- }
- #[test]
- fn words() {
- assert_eq!(
- compile(br"\<word\>"),
- &[t(Token::WordStart), c(b'w'), c(b'o'), c(b'r'), c(b'd'), t(Token::WordEnd)]
- );
- }
- #[test]
- fn repetitions() {
- assert_eq!(
- compile(br"yeee*"),
- &[c(b'y'), c(b'e'), c(b'e'), (Token::Char(b'e'), Range(0, None))]
- );
- assert_eq!(
- compile(br"yee\?"),
- &[c(b'y'), c(b'e'), (Token::Char(b'e'), Range(0, Some(1)))]
- );
- assert_eq!(
- compile(br"yee\+"),
- &[c(b'y'), c(b'e'), (Token::Char(b'e'), Range(1, None))]
- );
- assert_eq!(
- compile(br"ye\{2}"),
- &[c(b'y'), (Token::Char(b'e'), Range(2, Some(2)))]
- );
- assert_eq!(
- compile(br"ye\{2,}"),
- &[c(b'y'), (Token::Char(b'e'), Range(2, None))]
- );
- assert_eq!(
- compile(br"ye\{2,3}"),
- &[c(b'y'), (Token::Char(b'e'), Range(2, Some(3)))]
- );
- }
- #[test]
- fn bracket() {
- assert_eq!(
- compile(b"[abc]"),
- &[t(Token::OneOf {
- invert: false,
- list: vec![
- Collation::Char(b'a'),
- Collation::Char(b'b'),
- Collation::Char(b'c')
- ]
- })]
- );
- assert_eq!(
- compile(b"[^abc]"),
- &[t(Token::OneOf {
- invert: true,
- list: vec![
- Collation::Char(b'a'),
- Collation::Char(b'b'),
- Collation::Char(b'c')
- ]
- })]
- );
- assert_eq!(
- compile(b"[]] [^]]"),
- &[
- t(Token::OneOf { invert: false, list: vec![ Collation::Char(b']') ] }),
- c(b' '),
- t(Token::OneOf { invert: true, list: vec![ Collation::Char(b']') ] }),
- ]
- );
- assert_eq!(
- compile(b"[0-3] [a-c] [-1] [1-]"),
- &[
- t(Token::OneOf { invert: false, list: vec![
- Collation::Char(b'0'),
- Collation::Char(b'1'),
- Collation::Char(b'2'),
- Collation::Char(b'3')
- ] }),
- c(b' '),
- t(Token::OneOf { invert: false, list: vec![
- Collation::Char(b'a'),
- Collation::Char(b'b'),
- Collation::Char(b'c')
- ] }),
- c(b' '),
- t(Token::OneOf { invert: false, list: vec![
- Collation::Char(b'-'),
- Collation::Char(b'1')
- ] }),
- c(b' '),
- t(Token::OneOf { invert: false, list: vec![
- Collation::Char(b'1'),
- Collation::Char(b'-')
- ] })
- ]
- );
- assert_eq!(
- compile(b"[[.-.]-/]"),
- &[
- t(Token::OneOf { invert: false, list: vec![
- Collation::Char(b'-'),
- Collation::Char(b'.'),
- Collation::Char(b'/')
- ] })
- ]
- );
- assert_eq!(
- compile(b"[[:digit:][:upper:]]"),
- &[
- t(Token::OneOf { invert: false, list: vec![
- Collation::Class(ctype::is_digit),
- Collation::Class(ctype::is_upper)
- ] })
- ]
- );
- }
- }
|