| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451 | //! The regex "compiler", which parses the regex itself.//! Produces a matcher ready to match input.#[cfg(feature = "no_std")]use std::prelude::*;use std::collections::HashMap;use std::fmt;use ::{ctype, PosixRegex};/// Repetition bounds, for example + is (1, None), and ? is (0, Some(1))#[derive(Clone, PartialEq, Eq)]pub struct Range(pub u32, pub Option<u32>);impl fmt::Debug for Range {    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {        match self {            Range(start, None) => write!(f, "{}..", start),            Range(start, Some(end)) => write!(f, "{}..{}", start, end),        }    }}/// An item inside square brackets, like [abc] or [[:digit:]]#[derive(Clone, Debug, PartialEq, Eq)]pub enum Collation {    Char(u8),    Class(fn(u8) -> bool)}impl Collation {    /// Compare this collation to a character    pub fn matches(&self, other: u8) -> bool {        match *self {            Collation::Char(me) => me == other,            Collation::Class(f) => f(other)        }    }}/// A single "compiled" token, such as a `.` or a character literal#[derive(Clone, PartialEq, Eq)]pub enum Token {    Any,    Char(u8),    End,    Group(Vec<Vec<(Token, Range)>>),    OneOf {        invert: bool,        list: Vec<Collation>    },    Start,    WordEnd,    WordStart}impl fmt::Debug for Token {    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {        match *self {            Token::Any => write!(f, "."),            Token::Char(c) => write!(f, "{:?}", c as char),            Token::End => write!(f, "$"),            Token::Group(ref inner) => write!(f, "Group({:?})", inner),            Token::OneOf { invert, ref list } => write!(f, "[invert: {}; {:?}]", invert, list),            Token::Start => write!(f, "^"),            Token::WordEnd => write!(f, ">"),            Token::WordStart => write!(f, "<")        }    }}/// An error that occurred while compiling the regex#[derive(Clone, Debug, PartialEq, Eq)]pub enum Error {    EOF,    EmptyRepetition,    Expected(u8, Option<u8>),    IllegalRange,    IntegerOverflow,    LeadingRepetition,    UnclosedRepetition,    UnexpectedToken(u8),    UnknownClass(Vec<u8>),    UnknownCollation}/// A regex builder structpub struct PosixRegexBuilder<'a> {    input: &'a [u8],    classes: HashMap<&'a [u8], fn(u8) -> bool>}impl<'a> PosixRegexBuilder<'a> {    /// Create a new instance that is ready to parse the regex `input`    pub fn new(input: &'a [u8]) -> Self {        Self {            input,            classes: HashMap::new()        }    }    /// Add a custom collation class, for use within square brackets (such as [[:digit:]])    pub fn with_class(mut self, name: &'a [u8], callback: fn(u8) -> bool) -> Self {        self.classes.insert(name, callback);        self    }    /// Add all the default collation classes, like [[:digit:]] and [[:alnum:]]    pub fn with_default_classes(mut self) -> Self {        self.classes.reserve(12);        self.classes.insert(b"alnum", ctype::is_alnum);        self.classes.insert(b"alpha", ctype::is_alpha);        self.classes.insert(b"blank", ctype::is_blank);        self.classes.insert(b"cntrl", ctype::is_cntrl);        self.classes.insert(b"digit", ctype::is_digit);        self.classes.insert(b"graph", ctype::is_graph);        self.classes.insert(b"lower", ctype::is_lower);        self.classes.insert(b"print", ctype::is_print);        self.classes.insert(b"punct", ctype::is_punct);        self.classes.insert(b"space", ctype::is_space);        self.classes.insert(b"upper", ctype::is_upper);        self.classes.insert(b"xdigit", ctype::is_xdigit);        self    }    /// "Compile" this regex to a struct ready to match input    pub fn compile(&mut self) -> Result<PosixRegex, Error> {        let search = self.compile_inner(true)?;        Ok(PosixRegex {            search        })    }    fn consume(&mut self, amount: usize) {        self.input = &self.input[amount..];    }    fn take_int(&mut self) -> Result<Option<u32>, Error> {        let mut out: Option<u32> = None;        while let Some(&c @ b'0'..=b'9') = self.input.first() {            self.consume(1);            out = Some(out.unwrap_or(0)                .checked_mul(10)                .and_then(|out| out.checked_add((c - b'0') as u32))                .ok_or(Error::IntegerOverflow)?);        }        Ok(out)    }    fn next(&mut self) -> Result<u8, Error> {        self.input.first()            .map(|&c| { self.consume(1); c })            .ok_or(Error::EOF)    }    fn expect(&mut self, c: u8) -> Result<(), Error> {        if self.input.first() != Some(&c) {            return Err(Error::Expected(c, self.input.first().cloned()));        }        self.consume(1);        Ok(())    }    fn compile_inner(&mut self, toplevel: bool) -> Result<Vec<(Token, Range)>, Error> {        let mut search: Vec<(Token, Range)> = Vec::new();        while let Some(&c) = self.input.first() {            self.consume(1);            let token = match c {                b'^' => Token::Start,                b'$' => Token::End,                b'.' => Token::Any,                b'*' => if let Some(last) = search.last_mut() {                    last.1 = Range(0, None);                    continue;                } else {                    return Err(Error::LeadingRepetition);                },                b'[' => {                    let mut list = Vec::new();                    let invert = self.input.first() == Some(&b'^');                    if invert {                        self.consume(1);                    }                    loop {                        let mut c = self.next()?;                        let mut push = true;                        if c == b'[' {                            // TODO: Handle collation characters properly,                            // because currently idk what they are and only                            // have the behavior of `grep` to go on.                            match self.next()? {                                b'.' => {                                    c = self.next()?;                                    self.expect(b'.')?;                                    self.expect(b']')?;                                },                                b'=' => {                                    c = self.next()?;                                    self.expect(b'=')?;                                    self.expect(b']')?;                                },                                b':' => {                                    let end = self.input.iter().position(|&c| c == b':').ok_or(Error::EOF)?;                                    let key = &self.input[..end];                                    let class = *self.classes.get(key).ok_or_else(|| Error::UnknownClass(key.to_vec()))?;                                    self.consume(end + 1);                                    self.expect(b']')?;                                    list.push(Collation::Class(class));                                    push = false;                                },                                _ => return Err(Error::UnknownCollation)                            }                        }                        if push {                            list.push(Collation::Char(c));                            if self.input.first() == Some(&b'-') && self.input.get(1) != Some(&b']') {                                self.consume(1);                                let dest = self.next()?;                                for c in (c+1)..=dest {                                    list.push(Collation::Char(c));                                }                            }                        }                        if self.input.first() == Some(&b']') {                            self.consume(1);                            break;                        }                    }                    Token::OneOf {                        invert,                        list                    }                },                b'\\' => match self.input.first() {                    None => return Err(Error::EOF),                    Some(b'|') | Some(b')') if !toplevel => return Ok(search),                    Some(&c @ b'|') | Some(&c @ b')') if toplevel => return Err(Error::UnexpectedToken(c)),                    Some(&c) => {                        self.consume(1);                        match c {                            b'(' => {                                let mut branches = Vec::new();                                loop {                                    let inner = self.compile_inner(false)?;                                    branches.push(inner);                                    match self.next()? {                                        b'|' => (),                                        b')' => break,                                        _ => unreachable!()                                    }                                }                                Token::Group(branches)                            },                            b'<' => Token::WordStart,                            b'>' => Token::WordEnd,                            b'?' | b'+' => if let Some(last) = search.last_mut() {                                last.1 = match c {                                    b'?' => Range(0, Some(1)),                                    b'+' => Range(1, None),                                    _ => unreachable!()                                };                                continue;                            } else {                                return Err(Error::LeadingRepetition);                            },                            b'{' => if let Some(last) = search.last_mut() {                                let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;                                let mut second = Some(first);                                if let Some(b',') = self.input.first() {                                    self.consume(1);                                    second = self.take_int()?;                                }                                if self.input.first() == Some(&b'}') {                                    self.consume(1);                                } else if self.input.starts_with(br"\}") {                                    self.consume(2);                                } else {                                    return Err(Error::UnclosedRepetition);                                }                                if second.map(|second| first > second).unwrap_or(false) {                                    return Err(Error::IllegalRange);                                }                                last.1 = Range(first, second);                                continue;                            } else {                                return Err(Error::LeadingRepetition);                            },                            c => Token::Char(c)                        }                    }                },                c => Token::Char(c)            };            search.push((token, Range(1, Some(1))));        }        Ok(search)    }}#[cfg(test)]mod tests {    use super::*;    fn compile(input: &[u8]) -> Vec<(Token, Range)> {        PosixRegexBuilder::new(input)            .with_default_classes()            .compile()            .expect("error compiling regex")            .search    }    fn t(t: Token) -> (Token, Range) {        (t, Range(1, Some(1)))    }    fn c(c: u8) -> (Token, Range) {        t(Token::Char(c))    }    #[test]    fn basic() {        assert_eq!(compile(b"abc"), &[c(b'a'), c(b'b'), c(b'c')]);    }    #[test]    fn groups() {        assert_eq!(compile(br"\(abc\|bcd\|cde\)"), &[t(Token::Group(vec![            vec![c(b'a'), c(b'b'), c(b'c')],            vec![c(b'b'), c(b'c'), c(b'd')],            vec![c(b'c'), c(b'd'), c(b'e')]        ]))]);        assert_eq!(compile(br"\(abc\|\(bcd\|cde\)\)"), &[            t(Token::Group(vec![                vec![c(b'a'), c(b'b'), c(b'c')],                vec![t(Token::Group(vec![                    vec![c(b'b'), c(b'c'), c(b'd')],                    vec![c(b'c'), c(b'd'), c(b'e')]                ]))]            ]))        ]);    }    #[test]    fn words() {        assert_eq!(            compile(br"\<word\>"),            &[t(Token::WordStart), c(b'w'), c(b'o'), c(b'r'), c(b'd'), t(Token::WordEnd)]        );    }    #[test]    fn repetitions() {        assert_eq!(            compile(br"yeee*"),            &[c(b'y'), c(b'e'), c(b'e'), (Token::Char(b'e'), Range(0, None))]        );        assert_eq!(            compile(br"yee\?"),            &[c(b'y'), c(b'e'), (Token::Char(b'e'), Range(0, Some(1)))]        );        assert_eq!(            compile(br"yee\+"),            &[c(b'y'), c(b'e'), (Token::Char(b'e'), Range(1, None))]        );        assert_eq!(            compile(br"ye\{2}"),            &[c(b'y'), (Token::Char(b'e'), Range(2, Some(2)))]        );        assert_eq!(            compile(br"ye\{2,}"),            &[c(b'y'), (Token::Char(b'e'), Range(2, None))]        );        assert_eq!(            compile(br"ye\{2,3}"),            &[c(b'y'), (Token::Char(b'e'), Range(2, Some(3)))]        );    }    #[test]    fn bracket() {        assert_eq!(            compile(b"[abc]"),            &[t(Token::OneOf {                invert: false,                list: vec![                    Collation::Char(b'a'),                    Collation::Char(b'b'),                    Collation::Char(b'c')                ]            })]        );        assert_eq!(            compile(b"[^abc]"),            &[t(Token::OneOf {                invert: true,                list: vec![                    Collation::Char(b'a'),                    Collation::Char(b'b'),                    Collation::Char(b'c')                ]            })]        );        assert_eq!(            compile(b"[]] [^]]"),            &[                t(Token::OneOf { invert: false, list: vec![ Collation::Char(b']') ] }),                c(b' '),                t(Token::OneOf { invert: true,  list: vec![ Collation::Char(b']') ] }),            ]        );        assert_eq!(            compile(b"[0-3] [a-c] [-1] [1-]"),            &[                t(Token::OneOf { invert: false, list: vec![                    Collation::Char(b'0'),                    Collation::Char(b'1'),                    Collation::Char(b'2'),                    Collation::Char(b'3')                ] }),                c(b' '),                t(Token::OneOf { invert: false, list: vec![                    Collation::Char(b'a'),                    Collation::Char(b'b'),                    Collation::Char(b'c')                ] }),                c(b' '),                t(Token::OneOf { invert: false, list: vec![                    Collation::Char(b'-'),                    Collation::Char(b'1')                ] }),                c(b' '),                t(Token::OneOf { invert: false, list: vec![                    Collation::Char(b'1'),                    Collation::Char(b'-')                ] })            ]        );        assert_eq!(            compile(b"[[.-.]-/]"),            &[                t(Token::OneOf { invert: false, list: vec![                    Collation::Char(b'-'),                    Collation::Char(b'.'),                    Collation::Char(b'/')                ] })            ]        );        assert_eq!(            compile(b"[[:digit:][:upper:]]"),            &[                t(Token::OneOf { invert: false, list: vec![                    Collation::Class(ctype::is_digit),                    Collation::Class(ctype::is_upper)                ] })            ]        );    }}
 |