compile.rs 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314
  1. //! The regex "compiler", which parses the regex itself.
  2. //! Produces a matcher ready to match input.
  3. #[cfg(feature = "no_std")]
  4. use std::prelude::*;
  5. use std::borrow::Cow;
  6. use std::collections::HashMap;
  7. use std::fmt;
  8. use {ctype, PosixRegex};
  9. /// Repetition bounds, for example + is (1, None), and ? is (0, Some(1))
  10. #[derive(Clone, Copy, PartialEq, Eq)]
  11. pub struct Range(pub u32, pub Option<u32>);
  12. impl fmt::Debug for Range {
  13. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  14. match self {
  15. Range(start, None) => write!(f, "{}..", start),
  16. Range(start, Some(end)) => write!(f, "{}..{}", start, end),
  17. }
  18. }
  19. }
  20. /// An item inside square brackets, like `[abc]` or `[[:digit:]]`
  21. #[derive(Clone, Debug, PartialEq, Eq)]
  22. pub enum Collation {
  23. Char(u8),
  24. Class(fn(u8) -> bool)
  25. }
  26. impl Collation {
  27. /// Compare this collation to a character
  28. pub fn matches(&self, other: u8, insensitive: bool) -> bool {
  29. match *self {
  30. Collation::Char(me) if insensitive => me & !32 == other & !32,
  31. Collation::Char(me) => me == other,
  32. Collation::Class(f) => f(other)
  33. }
  34. }
  35. }
  36. /// A single "compiled" token, such as a `.` or a character literal
  37. #[derive(Clone, PartialEq, Eq)]
  38. pub enum Token {
  39. InternalStart,
  40. Any,
  41. Char(u8),
  42. End,
  43. Group {
  44. id: usize,
  45. branches: Vec<Vec<(Token, Range)>>
  46. },
  47. OneOf {
  48. invert: bool,
  49. list: Vec<Collation>
  50. },
  51. Start,
  52. WordEnd,
  53. WordStart
  54. }
  55. impl fmt::Debug for Token {
  56. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  57. match *self {
  58. Token::InternalStart => write!(f, "<START>"),
  59. Token::Any => write!(f, "."),
  60. Token::Char(c) => write!(f, "{:?}", c as char),
  61. Token::End => write!(f, "$"),
  62. Token::Group { ref branches, .. } => write!(f, "Group({:?})", branches),
  63. Token::OneOf { invert, ref list } => write!(f, "[invert: {}; {:?}]", invert, list),
  64. Token::Start => write!(f, "^"),
  65. Token::WordEnd => write!(f, ">"),
  66. Token::WordStart => write!(f, "<")
  67. }
  68. }
  69. }
  70. /// An error that occurred while compiling the regex
  71. #[derive(Clone, Debug, PartialEq, Eq)]
  72. pub enum Error {
  73. EOF,
  74. EmptyRepetition,
  75. Expected(u8, Option<u8>),
  76. IllegalRange,
  77. IntegerOverflow,
  78. LeadingRepetition,
  79. UnclosedRepetition,
  80. UnexpectedToken(u8),
  81. UnknownClass(Vec<u8>),
  82. UnknownCollation
  83. }
  84. /// A regex builder struct
  85. pub struct PosixRegexBuilder<'a> {
  86. input: &'a [u8],
  87. classes: HashMap<&'a [u8], fn(u8) -> bool>,
  88. group_id: usize
  89. }
  90. impl<'a> PosixRegexBuilder<'a> {
  91. /// Create a new instance that is ready to parse the regex `input`
  92. pub fn new(input: &'a [u8]) -> Self {
  93. Self {
  94. input,
  95. classes: HashMap::new(),
  96. group_id: 1
  97. }
  98. }
  99. /// Add a custom collation class, for use within square brackets (such as `[[:digit:]]`)
  100. pub fn with_class(mut self, name: &'a [u8], callback: fn(u8) -> bool) -> Self {
  101. self.classes.insert(name, callback);
  102. self
  103. }
  104. /// Add all the default collation classes, like `[[:digit:]]` and `[[:alnum:]]`
  105. pub fn with_default_classes(mut self) -> Self {
  106. #[cfg(not(feature = "no_std"))]
  107. self.classes.reserve(12);
  108. self.classes.insert(b"alnum", ctype::is_alnum);
  109. self.classes.insert(b"alpha", ctype::is_alpha);
  110. self.classes.insert(b"blank", ctype::is_blank);
  111. self.classes.insert(b"cntrl", ctype::is_cntrl);
  112. self.classes.insert(b"digit", ctype::is_digit);
  113. self.classes.insert(b"graph", ctype::is_graph);
  114. self.classes.insert(b"lower", ctype::is_lower);
  115. self.classes.insert(b"print", ctype::is_print);
  116. self.classes.insert(b"punct", ctype::is_punct);
  117. self.classes.insert(b"space", ctype::is_space);
  118. self.classes.insert(b"upper", ctype::is_upper);
  119. self.classes.insert(b"xdigit", ctype::is_xdigit);
  120. self
  121. }
  122. /// "Compile" this regex to a struct ready to match input
  123. pub fn compile(mut self) -> Result<PosixRegex<'static>, Error> {
  124. let search = self.compile_tokens()?;
  125. Ok(PosixRegex::new(Cow::Owned(search)))
  126. }
  127. fn consume(&mut self, amount: usize) {
  128. self.input = &self.input[amount..];
  129. }
  130. fn take_int(&mut self) -> Result<Option<u32>, Error> {
  131. let mut out: Option<u32> = None;
  132. while let Some(&c @ b'0'..=b'9') = self.input.first() {
  133. self.consume(1);
  134. out = Some(out.unwrap_or(0)
  135. .checked_mul(10)
  136. .and_then(|out| out.checked_add((c - b'0') as u32))
  137. .ok_or(Error::IntegerOverflow)?);
  138. }
  139. Ok(out)
  140. }
  141. fn next(&mut self) -> Result<u8, Error> {
  142. self.input.first()
  143. .map(|&c| { self.consume(1); c })
  144. .ok_or(Error::EOF)
  145. }
  146. fn expect(&mut self, c: u8) -> Result<(), Error> {
  147. if self.input.first() != Some(&c) {
  148. return Err(Error::Expected(c, self.input.first().cloned()));
  149. }
  150. self.consume(1);
  151. Ok(())
  152. }
  153. pub fn compile_tokens(&mut self) -> Result<Vec<Vec<(Token, Range)>>, Error> {
  154. let mut alternatives = Vec::new();
  155. let mut chain: Vec<(Token, Range)> = Vec::new();
  156. while let Some(&c) = self.input.first() {
  157. self.consume(1);
  158. let token = match c {
  159. b'^' => Token::Start,
  160. b'$' => Token::End,
  161. b'.' => Token::Any,
  162. b'*' => if let Some(last) = chain.last_mut() {
  163. last.1 = Range(0, None);
  164. continue;
  165. } else {
  166. return Err(Error::LeadingRepetition);
  167. },
  168. b'[' => {
  169. let mut list = Vec::new();
  170. let invert = self.input.first() == Some(&b'^');
  171. if invert {
  172. self.consume(1);
  173. }
  174. loop {
  175. let mut c = self.next()?;
  176. let mut push = true;
  177. if c == b'[' {
  178. // TODO: Handle collation characters properly,
  179. // because currently idk what they are and only
  180. // have the behavior of `grep` to go on.
  181. match self.next()? {
  182. b'.' => {
  183. c = self.next()?;
  184. self.expect(b'.')?;
  185. self.expect(b']')?;
  186. },
  187. b'=' => {
  188. c = self.next()?;
  189. self.expect(b'=')?;
  190. self.expect(b']')?;
  191. },
  192. b':' => {
  193. let end = self.input.iter().position(|&c| c == b':').ok_or(Error::EOF)?;
  194. let key = &self.input[..end];
  195. let class = *self.classes.get(key).ok_or_else(|| Error::UnknownClass(key.to_vec()))?;
  196. self.consume(end + 1);
  197. self.expect(b']')?;
  198. list.push(Collation::Class(class));
  199. push = false;
  200. },
  201. _ => return Err(Error::UnknownCollation)
  202. }
  203. }
  204. if push {
  205. list.push(Collation::Char(c));
  206. if self.input.first() == Some(&b'-') && self.input.get(1) != Some(&b']') {
  207. self.consume(1);
  208. let dest = self.next()?;
  209. for c in (c+1)..=dest {
  210. list.push(Collation::Char(c));
  211. }
  212. }
  213. }
  214. if self.input.first() == Some(&b']') {
  215. self.consume(1);
  216. break;
  217. }
  218. }
  219. Token::OneOf {
  220. invert,
  221. list
  222. }
  223. },
  224. b'\\' => match self.next()? {
  225. b'(' => {
  226. let id = self.group_id;
  227. self.group_id += 1;
  228. Token::Group {
  229. id,
  230. branches: self.compile_tokens()?
  231. }
  232. },
  233. b')' => {
  234. alternatives.push(chain);
  235. return Ok(alternatives);
  236. }
  237. b'|' => {
  238. alternatives.push(chain);
  239. chain = Vec::new();
  240. continue;
  241. },
  242. b'<' => Token::WordStart,
  243. b'>' => Token::WordEnd,
  244. c@b'?' | c@b'+' => if let Some(last) = chain.last_mut() {
  245. last.1 = match c {
  246. b'?' => Range(0, Some(1)),
  247. b'+' => Range(1, None),
  248. _ => unreachable!("{}", c)
  249. };
  250. continue;
  251. } else {
  252. return Err(Error::LeadingRepetition);
  253. },
  254. b'{' => if let Some(last) = chain.last_mut() {
  255. let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
  256. let mut second = Some(first);
  257. if let Some(b',') = self.input.first() {
  258. self.consume(1);
  259. second = self.take_int()?;
  260. }
  261. if self.input.first() == Some(&b'}') {
  262. self.consume(1);
  263. } else if self.input.starts_with(br"\}") {
  264. self.consume(2);
  265. } else {
  266. return Err(Error::UnclosedRepetition);
  267. }
  268. if second.map(|second| first > second).unwrap_or(false) {
  269. return Err(Error::IllegalRange);
  270. }
  271. last.1 = Range(first, second);
  272. continue;
  273. } else {
  274. return Err(Error::LeadingRepetition);
  275. },
  276. b'a' => Token::OneOf { invert: false, list: vec![Collation::Class(ctype::is_alnum)] },
  277. b'd' => Token::OneOf { invert: false, list: vec![Collation::Class(ctype::is_digit)] },
  278. b's' => Token::OneOf { invert: false, list: vec![Collation::Class(ctype::is_space)] },
  279. b'S' => Token::OneOf { invert: true, list: vec![Collation::Class(ctype::is_space)] },
  280. b'n' => Token::Char(b'\n'),
  281. b'r' => Token::Char(b'\r'),
  282. b't' => Token::Char(b'\t'),
  283. c => Token::Char(c)
  284. },
  285. c => Token::Char(c)
  286. };
  287. chain.push((token, Range(1, Some(1))));
  288. }
  289. alternatives.push(chain);
  290. Ok(alternatives)
  291. }
  292. }