compile.rs 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. //! The regex "compiler", which parses the regex itself.
  2. //! Produces a matcher ready to match input.
  3. #[cfg(feature = "no_std")]
  4. use std::prelude::*;
  5. use std::collections::HashMap;
  6. use std::fmt;
  7. use ::{ctype, PosixRegex};
  8. /// Repetition bounds, for example + is (1, None), and ? is (0, Some(1))
  9. #[derive(Clone, PartialEq, Eq)]
  10. pub struct Range(pub u32, pub Option<u32>);
  11. impl fmt::Debug for Range {
  12. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  13. match self {
  14. Range(start, None) => write!(f, "{}..", start),
  15. Range(start, Some(end)) => write!(f, "{}..{}", start, end),
  16. }
  17. }
  18. }
  19. /// An item inside square brackets, like [abc] or [[:digit:]]
  20. #[derive(Clone, Debug, PartialEq, Eq)]
  21. pub enum Collation {
  22. Char(u8),
  23. Class(fn(u8) -> bool)
  24. }
  25. impl Collation {
  26. /// Compare this collation to a character
  27. pub fn matches(&self, other: u8) -> bool {
  28. match *self {
  29. Collation::Char(me) => me == other,
  30. Collation::Class(f) => f(other)
  31. }
  32. }
  33. }
  34. /// A single "compiled" token, such as a `.` or a character literal
  35. #[derive(Clone, PartialEq, Eq)]
  36. pub enum Token {
  37. Any,
  38. Char(u8),
  39. End,
  40. Group(Vec<Vec<(Token, Range)>>),
  41. OneOf {
  42. invert: bool,
  43. list: Vec<Collation>
  44. },
  45. Start,
  46. WordEnd,
  47. WordStart
  48. }
  49. impl fmt::Debug for Token {
  50. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  51. match *self {
  52. Token::Any => write!(f, "."),
  53. Token::Char(c) => write!(f, "{:?}", c as char),
  54. Token::End => write!(f, "$"),
  55. Token::Group(ref inner) => write!(f, "Group({:?})", inner),
  56. Token::OneOf { invert, ref list } => write!(f, "[invert: {}; {:?}]", invert, list),
  57. Token::Start => write!(f, "^"),
  58. Token::WordEnd => write!(f, ">"),
  59. Token::WordStart => write!(f, "<")
  60. }
  61. }
  62. }
  63. /// An error that occurred while compiling the regex
  64. #[derive(Clone, Debug, PartialEq, Eq)]
  65. pub enum Error {
  66. EOF,
  67. EmptyRepetition,
  68. Expected(u8, Option<u8>),
  69. IllegalRange,
  70. IntegerOverflow,
  71. LeadingRepetition,
  72. UnclosedRepetition,
  73. UnexpectedToken(u8),
  74. UnknownClass(Vec<u8>),
  75. UnknownCollation
  76. }
  77. /// A regex builder struct
  78. pub struct PosixRegexBuilder<'a> {
  79. input: &'a [u8],
  80. classes: HashMap<&'a [u8], fn(u8) -> bool>
  81. }
  82. impl<'a> PosixRegexBuilder<'a> {
  83. /// Create a new instance that is ready to parse the regex `input`
  84. pub fn new(input: &'a [u8]) -> Self {
  85. Self {
  86. input,
  87. classes: HashMap::new()
  88. }
  89. }
  90. /// Add a custom collation class, for use within square brackets (such as [[:digit:]])
  91. pub fn with_class(mut self, name: &'a [u8], callback: fn(u8) -> bool) -> Self {
  92. self.classes.insert(name, callback);
  93. self
  94. }
  95. /// Add all the default collation classes, like [[:digit:]] and [[:alnum:]]
  96. pub fn with_default_classes(mut self) -> Self {
  97. self.classes.reserve(12);
  98. self.classes.insert(b"alnum", ctype::is_alnum);
  99. self.classes.insert(b"alpha", ctype::is_alpha);
  100. self.classes.insert(b"blank", ctype::is_blank);
  101. self.classes.insert(b"cntrl", ctype::is_cntrl);
  102. self.classes.insert(b"digit", ctype::is_digit);
  103. self.classes.insert(b"graph", ctype::is_graph);
  104. self.classes.insert(b"lower", ctype::is_lower);
  105. self.classes.insert(b"print", ctype::is_print);
  106. self.classes.insert(b"punct", ctype::is_punct);
  107. self.classes.insert(b"space", ctype::is_space);
  108. self.classes.insert(b"upper", ctype::is_upper);
  109. self.classes.insert(b"xdigit", ctype::is_xdigit);
  110. self
  111. }
  112. /// "Compile" this regex to a struct ready to match input
  113. pub fn compile(&mut self) -> Result<PosixRegex, Error> {
  114. let search = self.compile_inner(true)?;
  115. Ok(PosixRegex {
  116. search
  117. })
  118. }
  119. fn consume(&mut self, amount: usize) {
  120. self.input = &self.input[amount..];
  121. }
  122. fn take_int(&mut self) -> Result<Option<u32>, Error> {
  123. let mut out: Option<u32> = None;
  124. while let Some(&c @ b'0'..=b'9') = self.input.first() {
  125. self.consume(1);
  126. out = Some(out.unwrap_or(0)
  127. .checked_mul(10)
  128. .and_then(|out| out.checked_add((c - b'0') as u32))
  129. .ok_or(Error::IntegerOverflow)?);
  130. }
  131. Ok(out)
  132. }
  133. fn next(&mut self) -> Result<u8, Error> {
  134. self.input.first()
  135. .map(|&c| { self.consume(1); c })
  136. .ok_or(Error::EOF)
  137. }
  138. fn expect(&mut self, c: u8) -> Result<(), Error> {
  139. if self.input.first() != Some(&c) {
  140. return Err(Error::Expected(c, self.input.first().cloned()));
  141. }
  142. self.consume(1);
  143. Ok(())
  144. }
  145. fn compile_inner(&mut self, toplevel: bool) -> Result<Vec<(Token, Range)>, Error> {
  146. let mut search: Vec<(Token, Range)> = Vec::new();
  147. while let Some(&c) = self.input.first() {
  148. self.consume(1);
  149. let token = match c {
  150. b'^' => Token::Start,
  151. b'$' => Token::End,
  152. b'.' => Token::Any,
  153. b'*' => if let Some(last) = search.last_mut() {
  154. last.1 = Range(0, None);
  155. continue;
  156. } else {
  157. return Err(Error::LeadingRepetition);
  158. },
  159. b'[' => {
  160. let mut list = Vec::new();
  161. let invert = self.input.first() == Some(&b'^');
  162. if invert {
  163. self.consume(1);
  164. }
  165. loop {
  166. let mut c = self.next()?;
  167. let mut push = true;
  168. if c == b'[' {
  169. // TODO: Handle collation characters properly,
  170. // because currently idk what they are and only
  171. // have the behavior of `grep` to go on.
  172. match self.next()? {
  173. b'.' => {
  174. c = self.next()?;
  175. self.expect(b'.')?;
  176. self.expect(b']')?;
  177. },
  178. b'=' => {
  179. c = self.next()?;
  180. self.expect(b'=')?;
  181. self.expect(b']')?;
  182. },
  183. b':' => {
  184. let end = self.input.iter().position(|&c| c == b':').ok_or(Error::EOF)?;
  185. let key = &self.input[..end];
  186. let class = *self.classes.get(key).ok_or_else(|| Error::UnknownClass(key.to_vec()))?;
  187. self.consume(end + 1);
  188. self.expect(b']')?;
  189. list.push(Collation::Class(class));
  190. push = false;
  191. },
  192. _ => return Err(Error::UnknownCollation)
  193. }
  194. }
  195. if push {
  196. list.push(Collation::Char(c));
  197. if self.input.first() == Some(&b'-') && self.input.get(1) != Some(&b']') {
  198. self.consume(1);
  199. let dest = self.next()?;
  200. for c in (c+1)..=dest {
  201. list.push(Collation::Char(c));
  202. }
  203. }
  204. }
  205. if self.input.first() == Some(&b']') {
  206. self.consume(1);
  207. break;
  208. }
  209. }
  210. Token::OneOf {
  211. invert,
  212. list
  213. }
  214. },
  215. b'\\' => match self.input.first() {
  216. None => return Err(Error::EOF),
  217. Some(b'|') | Some(b')') if !toplevel => return Ok(search),
  218. Some(&c @ b'|') | Some(&c @ b')') if toplevel => return Err(Error::UnexpectedToken(c)),
  219. Some(&c) => {
  220. self.consume(1);
  221. match c {
  222. b'(' => {
  223. let mut branches = Vec::new();
  224. loop {
  225. let inner = self.compile_inner(false)?;
  226. branches.push(inner);
  227. match self.next()? {
  228. b'|' => (),
  229. b')' => break,
  230. _ => unreachable!()
  231. }
  232. }
  233. Token::Group(branches)
  234. },
  235. b'<' => Token::WordStart,
  236. b'>' => Token::WordEnd,
  237. b'?' | b'+' => if let Some(last) = search.last_mut() {
  238. last.1 = match c {
  239. b'?' => Range(0, Some(1)),
  240. b'+' => Range(1, None),
  241. _ => unreachable!()
  242. };
  243. continue;
  244. } else {
  245. return Err(Error::LeadingRepetition);
  246. },
  247. b'{' => if let Some(last) = search.last_mut() {
  248. let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
  249. let mut second = Some(first);
  250. if let Some(b',') = self.input.first() {
  251. self.consume(1);
  252. second = self.take_int()?;
  253. }
  254. if self.input.first() == Some(&b'}') {
  255. self.consume(1);
  256. } else if self.input.starts_with(br"\}") {
  257. self.consume(2);
  258. } else {
  259. return Err(Error::UnclosedRepetition);
  260. }
  261. if second.map(|second| first > second).unwrap_or(false) {
  262. return Err(Error::IllegalRange);
  263. }
  264. last.1 = Range(first, second);
  265. continue;
  266. } else {
  267. return Err(Error::LeadingRepetition);
  268. },
  269. c => Token::Char(c)
  270. }
  271. }
  272. },
  273. c => Token::Char(c)
  274. };
  275. search.push((token, Range(1, Some(1))));
  276. }
  277. Ok(search)
  278. }
  279. }
  280. #[cfg(test)]
  281. mod tests {
  282. use super::*;
  283. fn compile(input: &[u8]) -> Vec<(Token, Range)> {
  284. PosixRegexBuilder::new(input)
  285. .with_default_classes()
  286. .compile()
  287. .expect("error compiling regex")
  288. .search
  289. }
  290. fn t(t: Token) -> (Token, Range) {
  291. (t, Range(1, Some(1)))
  292. }
  293. fn c(c: u8) -> (Token, Range) {
  294. t(Token::Char(c))
  295. }
  296. #[test]
  297. fn basic() {
  298. assert_eq!(compile(b"abc"), &[c(b'a'), c(b'b'), c(b'c')]);
  299. }
  300. #[test]
  301. fn groups() {
  302. assert_eq!(compile(br"\(abc\|bcd\|cde\)"), &[t(Token::Group(vec![
  303. vec![c(b'a'), c(b'b'), c(b'c')],
  304. vec![c(b'b'), c(b'c'), c(b'd')],
  305. vec![c(b'c'), c(b'd'), c(b'e')]
  306. ]))]);
  307. assert_eq!(compile(br"\(abc\|\(bcd\|cde\)\)"), &[
  308. t(Token::Group(vec![
  309. vec![c(b'a'), c(b'b'), c(b'c')],
  310. vec![t(Token::Group(vec![
  311. vec![c(b'b'), c(b'c'), c(b'd')],
  312. vec![c(b'c'), c(b'd'), c(b'e')]
  313. ]))]
  314. ]))
  315. ]);
  316. }
  317. #[test]
  318. fn words() {
  319. assert_eq!(
  320. compile(br"\<word\>"),
  321. &[t(Token::WordStart), c(b'w'), c(b'o'), c(b'r'), c(b'd'), t(Token::WordEnd)]
  322. );
  323. }
  324. #[test]
  325. fn repetitions() {
  326. assert_eq!(
  327. compile(br"yeee*"),
  328. &[c(b'y'), c(b'e'), c(b'e'), (Token::Char(b'e'), Range(0, None))]
  329. );
  330. assert_eq!(
  331. compile(br"yee\?"),
  332. &[c(b'y'), c(b'e'), (Token::Char(b'e'), Range(0, Some(1)))]
  333. );
  334. assert_eq!(
  335. compile(br"yee\+"),
  336. &[c(b'y'), c(b'e'), (Token::Char(b'e'), Range(1, None))]
  337. );
  338. assert_eq!(
  339. compile(br"ye\{2}"),
  340. &[c(b'y'), (Token::Char(b'e'), Range(2, Some(2)))]
  341. );
  342. assert_eq!(
  343. compile(br"ye\{2,}"),
  344. &[c(b'y'), (Token::Char(b'e'), Range(2, None))]
  345. );
  346. assert_eq!(
  347. compile(br"ye\{2,3}"),
  348. &[c(b'y'), (Token::Char(b'e'), Range(2, Some(3)))]
  349. );
  350. }
  351. #[test]
  352. fn bracket() {
  353. assert_eq!(
  354. compile(b"[abc]"),
  355. &[t(Token::OneOf {
  356. invert: false,
  357. list: vec![
  358. Collation::Char(b'a'),
  359. Collation::Char(b'b'),
  360. Collation::Char(b'c')
  361. ]
  362. })]
  363. );
  364. assert_eq!(
  365. compile(b"[^abc]"),
  366. &[t(Token::OneOf {
  367. invert: true,
  368. list: vec![
  369. Collation::Char(b'a'),
  370. Collation::Char(b'b'),
  371. Collation::Char(b'c')
  372. ]
  373. })]
  374. );
  375. assert_eq!(
  376. compile(b"[]] [^]]"),
  377. &[
  378. t(Token::OneOf { invert: false, list: vec![ Collation::Char(b']') ] }),
  379. c(b' '),
  380. t(Token::OneOf { invert: true, list: vec![ Collation::Char(b']') ] }),
  381. ]
  382. );
  383. assert_eq!(
  384. compile(b"[0-3] [a-c] [-1] [1-]"),
  385. &[
  386. t(Token::OneOf { invert: false, list: vec![
  387. Collation::Char(b'0'),
  388. Collation::Char(b'1'),
  389. Collation::Char(b'2'),
  390. Collation::Char(b'3')
  391. ] }),
  392. c(b' '),
  393. t(Token::OneOf { invert: false, list: vec![
  394. Collation::Char(b'a'),
  395. Collation::Char(b'b'),
  396. Collation::Char(b'c')
  397. ] }),
  398. c(b' '),
  399. t(Token::OneOf { invert: false, list: vec![
  400. Collation::Char(b'-'),
  401. Collation::Char(b'1')
  402. ] }),
  403. c(b' '),
  404. t(Token::OneOf { invert: false, list: vec![
  405. Collation::Char(b'1'),
  406. Collation::Char(b'-')
  407. ] })
  408. ]
  409. );
  410. assert_eq!(
  411. compile(b"[[.-.]-/]"),
  412. &[
  413. t(Token::OneOf { invert: false, list: vec![
  414. Collation::Char(b'-'),
  415. Collation::Char(b'.'),
  416. Collation::Char(b'/')
  417. ] })
  418. ]
  419. );
  420. assert_eq!(
  421. compile(b"[[:digit:][:upper:]]"),
  422. &[
  423. t(Token::OneOf { invert: false, list: vec![
  424. Collation::Class(ctype::is_digit),
  425. Collation::Class(ctype::is_upper)
  426. ] })
  427. ]
  428. );
  429. }
  430. }