123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624 |
- //! The regex "compiler", which parses the regex itself.
- //! Produces a matcher ready to match input.
- #[cfg(feature = "no_std")]
- use crate::std::collections::HashMap;
- #[cfg(not(feature = "no_std"))]
- use std::collections::HashMap;
- use crate::{ctype, tree::*, PosixRegex};
- extern crate alloc;
- use alloc::{borrow::Cow, vec, vec::Vec};
- use core::fmt;
- /// Repetition bounds, for example + is (1, None), and ? is (0, Some(1))
- #[derive(Clone, Copy, PartialEq, Eq)]
- pub struct Range(pub u32, pub Option<u32>);
- impl fmt::Debug for Range {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- match self {
- Range(start, None) => write!(f, "{}..", start),
- Range(start, Some(end)) => write!(f, "{}..{}", start, end),
- }
- }
- }
- /// An item inside square brackets, like `[abc]` or `[[:digit:]]`
- #[derive(Clone, PartialEq, Eq)]
- pub enum Collation {
- Char(u8),
- Class(fn(u8) -> bool),
- }
- impl Collation {
- /// Compare this collation to a character
- pub fn matches(&self, other: u8, insensitive: bool) -> bool {
- match *self {
- Collation::Char(me) if insensitive => {
- if ctype::is_alpha(me) && ctype::is_alpha(other) {
- me | 32 == other | 32
- } else {
- me == other
- }
- }
- Collation::Char(me) => me == other,
- Collation::Class(f) => f(other),
- }
- }
- }
- impl fmt::Debug for Collation {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- match *self {
- Collation::Char(c) => write!(f, "{:?}", c as char),
- Collation::Class(c) => write!(f, "{:p}", c),
- }
- }
- }
- /// A single "compiled" token, such as a `.` or a character literal
- #[derive(Clone, PartialEq, Eq)]
- pub enum Token {
- /// Internal token used to find matches that might be anywhere in the text
- InternalStart,
- Alternative,
- Any,
- BackRef(u32),
- Char(u8),
- End,
- Group(usize),
- OneOf {
- invert: bool,
- list: Vec<Collation>,
- },
- Root,
- Start,
- WordEnd,
- WordStart,
- }
- impl fmt::Debug for Token {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- match *self {
- Token::InternalStart => write!(f, "<START>"),
- Token::Alternative => write!(f, "Alternative"),
- Token::Any => write!(f, "."),
- Token::BackRef(id) => write!(f, "\\{}", id),
- Token::Char(c) => write!(f, "{:?}", c as char),
- Token::End => write!(f, "$"),
- Token::Group(id) => write!(f, "Group({})", id),
- Token::OneOf { invert, ref list } => write!(f, "{{invert: {}, {:?}}}", invert, list),
- Token::Root => write!(f, "Root"),
- Token::Start => write!(f, "^"),
- Token::WordEnd => write!(f, ">"),
- Token::WordStart => write!(f, "<"),
- }
- }
- }
- /// An error that occurred while compiling the regex
- #[derive(Clone, Debug, PartialEq, Eq)]
- pub enum Error {
- EOF,
- EmptyRepetition,
- Expected(u8, Option<u8>),
- IllegalRange,
- IntegerOverflow,
- InvalidBackRef(u32),
- LeadingRepetition,
- UnclosedRepetition,
- UnexpectedToken(u8),
- UnknownClass(Vec<u8>),
- UnknownCollation,
- }
- /// A regex builder struct
- pub struct PosixRegexBuilder<'a> {
- input: &'a [u8],
- classes: HashMap<&'a [u8], fn(u8) -> bool>,
- group_id: usize,
- builder: TreeBuilder,
- }
- impl<'a> PosixRegexBuilder<'a> {
- /// Create a new instance that is ready to parse the regex `input`
- pub fn new(input: &'a [u8]) -> Self {
- Self {
- input,
- classes: HashMap::new(),
- group_id: 1,
- builder: TreeBuilder::default(),
- }
- }
- /// Add a custom collation class, for use within square brackets (such as `[[:digit:]]`)
- pub fn with_class(mut self, name: &'a [u8], callback: fn(u8) -> bool) -> Self {
- self.classes.insert(name, callback);
- self
- }
- /// Add all the default collation classes, like `[[:digit:]]` and `[[:alnum:]]`
- pub fn with_default_classes(mut self) -> Self {
- #[cfg(not(feature = "no_std"))]
- self.classes.reserve(12);
- self.classes.insert(b"alnum", ctype::is_alnum);
- self.classes.insert(b"alpha", ctype::is_alpha);
- self.classes.insert(b"blank", ctype::is_blank);
- self.classes.insert(b"cntrl", ctype::is_cntrl);
- self.classes.insert(b"digit", ctype::is_digit);
- self.classes.insert(b"graph", ctype::is_graph);
- self.classes.insert(b"lower", ctype::is_lower);
- self.classes.insert(b"print", ctype::is_print);
- self.classes.insert(b"punct", ctype::is_punct);
- self.classes.insert(b"space", ctype::is_space);
- self.classes.insert(b"upper", ctype::is_upper);
- self.classes.insert(b"xdigit", ctype::is_xdigit);
- self
- }
- /// "Compile" this regex to a struct ready to match input
- pub fn compile(self) -> Result<PosixRegex<'static>, Error> {
- let tree = self.compile_tokens()?;
- Ok(PosixRegex::new(Cow::Owned(tree)))
- }
- pub fn compile_tokens(mut self) -> Result<Tree, Error> {
- self.builder.start_internal(Token::Root, Range(1, Some(1)));
- self.parse()?;
- self.builder.finish_internal();
- Ok(self.builder.finish())
- }
- fn consume(&mut self, amount: usize) {
- self.input = &self.input[amount..];
- }
- fn take_int(&mut self) -> Result<Option<u32>, Error> {
- let mut out: Option<u32> = None;
- while let Some(&c @ b'0'..=b'9') = self.input.first() {
- self.consume(1);
- out = Some(
- out.unwrap_or(0)
- .checked_mul(10)
- .and_then(|out| out.checked_add((c - b'0') as u32))
- .ok_or(Error::IntegerOverflow)?,
- );
- }
- Ok(out)
- }
- fn next(&mut self) -> Result<u8, Error> {
- self.input
- .first()
- .map(|&c| {
- self.consume(1);
- c
- })
- .ok_or(Error::EOF)
- }
- fn expect(&mut self, c: u8) -> Result<(), Error> {
- if self.input.first() != Some(&c) {
- return Err(Error::Expected(c, self.input.first().cloned()));
- }
- self.consume(1);
- Ok(())
- }
- fn parse_range(&mut self) -> Result<Range, Error> {
- let mut range = Range(1, Some(1));
- if let Some(&c) = self.input.first() {
- let new = match c {
- b'*' => Some((1, Range(0, None))),
- b'\\' => match self.input.get(1) {
- Some(b'?') => Some((2, Range(0, Some(1)))),
- Some(b'+') => Some((2, Range(1, None))),
- Some(b'{') => {
- self.consume(2);
- let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
- let mut second = Some(first);
- if let Some(b',') = self.input.first() {
- self.consume(1);
- second = self.take_int()?;
- }
- if self.input.first() == Some(&b'}') {
- self.consume(1);
- } else if self.input.starts_with(br"\}") {
- self.consume(2);
- } else {
- return Err(Error::UnclosedRepetition);
- }
- if second.map(|second| first > second).unwrap_or(false) {
- return Err(Error::IllegalRange);
- }
- range = Range(first, second);
- None
- }
- _ => None,
- },
- _ => None,
- };
- if let Some((consume, new)) = new {
- range = new;
- self.consume(consume);
- }
- }
- Ok(range)
- }
- fn parse(&mut self) -> Result<(), Error> {
- self.builder
- .start_internal(Token::Alternative, Range(1, Some(1)));
- while let Ok(c) = self.next() {
- let token = match c {
- b'^' => Token::Start,
- b'$' => Token::End,
- b'.' => Token::Any,
- b'[' => {
- let mut list = Vec::new();
- let invert = self.input.first() == Some(&b'^');
- if invert {
- self.consume(1);
- }
- loop {
- let mut c = self.next()?;
- let mut push = true;
- if c == b'[' {
- // TODO: Handle collation characters properly,
- // because currently idk what they are and only
- // have the behavior of `grep` to go on.
- match self.next()? {
- b'.' => {
- c = self.next()?;
- self.expect(b'.')?;
- self.expect(b']')?;
- }
- b'=' => {
- c = self.next()?;
- self.expect(b'=')?;
- self.expect(b']')?;
- }
- b':' => {
- let end = self
- .input
- .iter()
- .position(|&c| c == b':')
- .ok_or(Error::EOF)?;
- let key = &self.input[..end];
- let class = *self
- .classes
- .get(key)
- .ok_or_else(|| Error::UnknownClass(key.to_vec()))?;
- self.consume(end + 1);
- self.expect(b']')?;
- list.push(Collation::Class(class));
- push = false;
- }
- _ => return Err(Error::UnknownCollation),
- }
- }
- if push {
- list.push(Collation::Char(c));
- if self.input.first() == Some(&b'-') && self.input.get(1) != Some(&b']')
- {
- self.consume(1);
- let dest = self.next()?;
- for c in (c + 1)..=dest {
- list.push(Collation::Char(c));
- }
- }
- }
- if self.input.first() == Some(&b']') {
- self.consume(1);
- break;
- }
- }
- Token::OneOf { invert, list }
- }
- b'\\'
- if self
- .input
- .first()
- .map(|&c| c.is_ascii_digit())
- .unwrap_or(false) =>
- {
- let id = self.take_int()?.unwrap();
- if (id as usize) >= self.group_id {
- return Err(Error::InvalidBackRef(id));
- }
- Token::BackRef(id)
- }
- b'\\' => match self.next()? {
- b'(' => {
- let id = self.group_id;
- self.group_id += 1;
- let checkpoint = self.builder.checkpoint();
- self.parse()?;
- let range = self.parse_range()?;
- self.builder
- .start_internal_at(checkpoint, Token::Group(id), range);
- self.builder.finish_internal();
- continue;
- }
- b')' => break,
- b'|' => {
- self.builder.finish_internal();
- self.builder
- .start_internal(Token::Alternative, Range(1, Some(1)));
- continue;
- }
- b'<' => Token::WordStart,
- b'>' => Token::WordEnd,
- b'a' => Token::OneOf {
- invert: false,
- list: vec![Collation::Class(ctype::is_alnum)],
- },
- b'd' => Token::OneOf {
- invert: false,
- list: vec![Collation::Class(ctype::is_digit)],
- },
- b's' => Token::OneOf {
- invert: false,
- list: vec![Collation::Class(ctype::is_space)],
- },
- b'S' => Token::OneOf {
- invert: true,
- list: vec![Collation::Class(ctype::is_space)],
- },
- b'n' => Token::Char(b'\n'),
- b'r' => Token::Char(b'\r'),
- b't' => Token::Char(b'\t'),
- c => Token::Char(c),
- },
- c => Token::Char(c),
- };
- let range = self.parse_range()?;
- self.builder.leaf(token, range);
- }
- self.builder.finish_internal();
- Ok(())
- }
- }
- #[cfg(test)]
- mod tests {
- use super::*;
- use alloc::{format, string::String};
- fn compile(input: &[u8]) -> String {
- format!(
- "{:?}",
- PosixRegexBuilder::new(input)
- .with_default_classes()
- .compile_tokens()
- .expect("error compiling regex")
- )
- }
- #[test]
- fn basic() {
- assert_eq!(
- compile(b"abc"),
- "\
- Root 1..1
- Alternative 1..1
- 'a' 1..1
- 'b' 1..1
- 'c' 1..1
- "
- );
- }
- #[test]
- fn groups() {
- assert_eq!(
- compile(br"\(abc\|bcd\|cde\)"),
- "\
- Root 1..1
- Alternative 1..1
- Group(1) 1..1
- Alternative 1..1
- 'a' 1..1
- 'b' 1..1
- 'c' 1..1
- Alternative 1..1
- 'b' 1..1
- 'c' 1..1
- 'd' 1..1
- Alternative 1..1
- 'c' 1..1
- 'd' 1..1
- 'e' 1..1
- "
- );
- assert_eq!(
- compile(br"\(abc\|\(bcd\|cde\)\)"),
- "\
- Root 1..1
- Alternative 1..1
- Group(1) 1..1
- Alternative 1..1
- 'a' 1..1
- 'b' 1..1
- 'c' 1..1
- Alternative 1..1
- Group(2) 1..1
- Alternative 1..1
- 'b' 1..1
- 'c' 1..1
- 'd' 1..1
- Alternative 1..1
- 'c' 1..1
- 'd' 1..1
- 'e' 1..1
- "
- );
- }
- #[test]
- fn words() {
- assert_eq!(
- compile(br"\<word\>"),
- "\
- Root 1..1
- Alternative 1..1
- < 1..1
- 'w' 1..1
- 'o' 1..1
- 'r' 1..1
- 'd' 1..1
- > 1..1
- "
- );
- }
- #[test]
- fn repetitions() {
- assert_eq!(
- compile(br"yeee*"),
- "\
- Root 1..1
- Alternative 1..1
- 'y' 1..1
- 'e' 1..1
- 'e' 1..1
- 'e' 0..
- "
- );
- assert_eq!(
- compile(br"yee\?"),
- "\
- Root 1..1
- Alternative 1..1
- 'y' 1..1
- 'e' 1..1
- 'e' 0..1
- "
- );
- assert_eq!(
- compile(br"yee\+"),
- "\
- Root 1..1
- Alternative 1..1
- 'y' 1..1
- 'e' 1..1
- 'e' 1..
- "
- );
- assert_eq!(
- compile(br"ye\{2}"),
- "\
- Root 1..1
- Alternative 1..1
- 'y' 1..1
- 'e' 2..2
- "
- );
- assert_eq!(
- compile(br"ye\{2,}"),
- "\
- Root 1..1
- Alternative 1..1
- 'y' 1..1
- 'e' 2..
- "
- );
- assert_eq!(
- compile(br"ye\{2,3}"),
- "\
- Root 1..1
- Alternative 1..1
- 'y' 1..1
- 'e' 2..3
- "
- );
- }
- #[test]
- fn bracket() {
- assert_eq!(
- compile(b"[abc]"),
- "\
- Root 1..1
- Alternative 1..1
- {invert: false, ['a', 'b', 'c']} 1..1
- "
- );
- assert_eq!(
- compile(b"[^abc]"),
- "\
- Root 1..1
- Alternative 1..1
- {invert: true, ['a', 'b', 'c']} 1..1
- "
- );
- assert_eq!(
- compile(b"[]] [^]]"),
- "\
- Root 1..1
- Alternative 1..1
- {invert: false, [']']} 1..1
- ' ' 1..1
- {invert: true, [']']} 1..1
- "
- );
- assert_eq!(
- compile(b"[0-3] [a-c] [-1] [1-]"),
- "\
- Root 1..1
- Alternative 1..1
- {invert: false, ['0', '1', '2', '3']} 1..1
- ' ' 1..1
- {invert: false, ['a', 'b', 'c']} 1..1
- ' ' 1..1
- {invert: false, ['-', '1']} 1..1
- ' ' 1..1
- {invert: false, ['1', '-']} 1..1
- "
- );
- assert_eq!(
- compile(b"[[.-.]-/]"),
- "\
- Root 1..1
- Alternative 1..1
- {invert: false, ['-', '.', '/']} 1..1
- "
- );
- assert_eq!(
- compile(b"[[:digit:][:upper:]]"),
- format!(
- "\
- Root 1..1
- Alternative 1..1
- {{invert: false, [{:p}, {:p}]}} 1..1
- ",
- ctype::is_digit as fn(u8) -> bool,
- ctype::is_upper as fn(u8) -> bool
- )
- );
- }
- #[test]
- fn newline() {
- assert_eq!(
- compile(br"\r\n"),
- "\
- Root 1..1
- Alternative 1..1
- '\\r' 1..1
- '\\n' 1..1
- "
- );
- }
- #[test]
- fn backref() {
- assert_eq!(
- compile(br"\([abc]\)\1"),
- "\
- Root 1..1
- Alternative 1..1
- Group(1) 1..1
- Alternative 1..1
- {invert: false, ['a', 'b', 'c']} 1..1
- \\1 1..1
- "
- )
- }
- }
|