compile.rs 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624
  1. //! The regex "compiler", which parses the regex itself.
  2. //! Produces a matcher ready to match input.
  3. #[cfg(feature = "no_std")]
  4. use crate::std::collections::HashMap;
  5. #[cfg(not(feature = "no_std"))]
  6. use std::collections::HashMap;
  7. use crate::{ctype, tree::*, PosixRegex};
  8. extern crate alloc;
  9. use alloc::{borrow::Cow, vec, vec::Vec};
  10. use core::fmt;
  11. /// Repetition bounds, for example + is (1, None), and ? is (0, Some(1))
  12. #[derive(Clone, Copy, PartialEq, Eq)]
  13. pub struct Range(pub u32, pub Option<u32>);
  14. impl fmt::Debug for Range {
  15. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  16. match self {
  17. Range(start, None) => write!(f, "{}..", start),
  18. Range(start, Some(end)) => write!(f, "{}..{}", start, end),
  19. }
  20. }
  21. }
  22. /// An item inside square brackets, like `[abc]` or `[[:digit:]]`
  23. #[derive(Clone, PartialEq, Eq)]
  24. pub enum Collation {
  25. Char(u8),
  26. Class(fn(u8) -> bool),
  27. }
  28. impl Collation {
  29. /// Compare this collation to a character
  30. pub fn matches(&self, other: u8, insensitive: bool) -> bool {
  31. match *self {
  32. Collation::Char(me) if insensitive => {
  33. if ctype::is_alpha(me) && ctype::is_alpha(other) {
  34. me | 32 == other | 32
  35. } else {
  36. me == other
  37. }
  38. }
  39. Collation::Char(me) => me == other,
  40. Collation::Class(f) => f(other),
  41. }
  42. }
  43. }
  44. impl fmt::Debug for Collation {
  45. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  46. match *self {
  47. Collation::Char(c) => write!(f, "{:?}", c as char),
  48. Collation::Class(c) => write!(f, "{:p}", c),
  49. }
  50. }
  51. }
  52. /// A single "compiled" token, such as a `.` or a character literal
  53. #[derive(Clone, PartialEq, Eq)]
  54. pub enum Token {
  55. /// Internal token used to find matches that might be anywhere in the text
  56. InternalStart,
  57. Alternative,
  58. Any,
  59. BackRef(u32),
  60. Char(u8),
  61. End,
  62. Group(usize),
  63. OneOf {
  64. invert: bool,
  65. list: Vec<Collation>,
  66. },
  67. Root,
  68. Start,
  69. WordEnd,
  70. WordStart,
  71. }
  72. impl fmt::Debug for Token {
  73. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  74. match *self {
  75. Token::InternalStart => write!(f, "<START>"),
  76. Token::Alternative => write!(f, "Alternative"),
  77. Token::Any => write!(f, "."),
  78. Token::BackRef(id) => write!(f, "\\{}", id),
  79. Token::Char(c) => write!(f, "{:?}", c as char),
  80. Token::End => write!(f, "$"),
  81. Token::Group(id) => write!(f, "Group({})", id),
  82. Token::OneOf { invert, ref list } => write!(f, "{{invert: {}, {:?}}}", invert, list),
  83. Token::Root => write!(f, "Root"),
  84. Token::Start => write!(f, "^"),
  85. Token::WordEnd => write!(f, ">"),
  86. Token::WordStart => write!(f, "<"),
  87. }
  88. }
  89. }
  90. /// An error that occurred while compiling the regex
  91. #[derive(Clone, Debug, PartialEq, Eq)]
  92. pub enum Error {
  93. EOF,
  94. EmptyRepetition,
  95. Expected(u8, Option<u8>),
  96. IllegalRange,
  97. IntegerOverflow,
  98. InvalidBackRef(u32),
  99. LeadingRepetition,
  100. UnclosedRepetition,
  101. UnexpectedToken(u8),
  102. UnknownClass(Vec<u8>),
  103. UnknownCollation,
  104. }
  105. /// A regex builder struct
  106. pub struct PosixRegexBuilder<'a> {
  107. input: &'a [u8],
  108. classes: HashMap<&'a [u8], fn(u8) -> bool>,
  109. group_id: usize,
  110. builder: TreeBuilder,
  111. }
  112. impl<'a> PosixRegexBuilder<'a> {
  113. /// Create a new instance that is ready to parse the regex `input`
  114. pub fn new(input: &'a [u8]) -> Self {
  115. Self {
  116. input,
  117. classes: HashMap::new(),
  118. group_id: 1,
  119. builder: TreeBuilder::default(),
  120. }
  121. }
  122. /// Add a custom collation class, for use within square brackets (such as `[[:digit:]]`)
  123. pub fn with_class(mut self, name: &'a [u8], callback: fn(u8) -> bool) -> Self {
  124. self.classes.insert(name, callback);
  125. self
  126. }
  127. /// Add all the default collation classes, like `[[:digit:]]` and `[[:alnum:]]`
  128. pub fn with_default_classes(mut self) -> Self {
  129. #[cfg(not(feature = "no_std"))]
  130. self.classes.reserve(12);
  131. self.classes.insert(b"alnum", ctype::is_alnum);
  132. self.classes.insert(b"alpha", ctype::is_alpha);
  133. self.classes.insert(b"blank", ctype::is_blank);
  134. self.classes.insert(b"cntrl", ctype::is_cntrl);
  135. self.classes.insert(b"digit", ctype::is_digit);
  136. self.classes.insert(b"graph", ctype::is_graph);
  137. self.classes.insert(b"lower", ctype::is_lower);
  138. self.classes.insert(b"print", ctype::is_print);
  139. self.classes.insert(b"punct", ctype::is_punct);
  140. self.classes.insert(b"space", ctype::is_space);
  141. self.classes.insert(b"upper", ctype::is_upper);
  142. self.classes.insert(b"xdigit", ctype::is_xdigit);
  143. self
  144. }
  145. /// "Compile" this regex to a struct ready to match input
  146. pub fn compile(self) -> Result<PosixRegex<'static>, Error> {
  147. let tree = self.compile_tokens()?;
  148. Ok(PosixRegex::new(Cow::Owned(tree)))
  149. }
  150. pub fn compile_tokens(mut self) -> Result<Tree, Error> {
  151. self.builder.start_internal(Token::Root, Range(1, Some(1)));
  152. self.parse()?;
  153. self.builder.finish_internal();
  154. Ok(self.builder.finish())
  155. }
  156. fn consume(&mut self, amount: usize) {
  157. self.input = &self.input[amount..];
  158. }
  159. fn take_int(&mut self) -> Result<Option<u32>, Error> {
  160. let mut out: Option<u32> = None;
  161. while let Some(&c @ b'0'..=b'9') = self.input.first() {
  162. self.consume(1);
  163. out = Some(
  164. out.unwrap_or(0)
  165. .checked_mul(10)
  166. .and_then(|out| out.checked_add((c - b'0') as u32))
  167. .ok_or(Error::IntegerOverflow)?,
  168. );
  169. }
  170. Ok(out)
  171. }
  172. fn next(&mut self) -> Result<u8, Error> {
  173. self.input
  174. .first()
  175. .map(|&c| {
  176. self.consume(1);
  177. c
  178. })
  179. .ok_or(Error::EOF)
  180. }
  181. fn expect(&mut self, c: u8) -> Result<(), Error> {
  182. if self.input.first() != Some(&c) {
  183. return Err(Error::Expected(c, self.input.first().cloned()));
  184. }
  185. self.consume(1);
  186. Ok(())
  187. }
  188. fn parse_range(&mut self) -> Result<Range, Error> {
  189. let mut range = Range(1, Some(1));
  190. if let Some(&c) = self.input.first() {
  191. let new = match c {
  192. b'*' => Some((1, Range(0, None))),
  193. b'\\' => match self.input.get(1) {
  194. Some(b'?') => Some((2, Range(0, Some(1)))),
  195. Some(b'+') => Some((2, Range(1, None))),
  196. Some(b'{') => {
  197. self.consume(2);
  198. let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
  199. let mut second = Some(first);
  200. if let Some(b',') = self.input.first() {
  201. self.consume(1);
  202. second = self.take_int()?;
  203. }
  204. if self.input.first() == Some(&b'}') {
  205. self.consume(1);
  206. } else if self.input.starts_with(br"\}") {
  207. self.consume(2);
  208. } else {
  209. return Err(Error::UnclosedRepetition);
  210. }
  211. if second.map(|second| first > second).unwrap_or(false) {
  212. return Err(Error::IllegalRange);
  213. }
  214. range = Range(first, second);
  215. None
  216. }
  217. _ => None,
  218. },
  219. _ => None,
  220. };
  221. if let Some((consume, new)) = new {
  222. range = new;
  223. self.consume(consume);
  224. }
  225. }
  226. Ok(range)
  227. }
  228. fn parse(&mut self) -> Result<(), Error> {
  229. self.builder
  230. .start_internal(Token::Alternative, Range(1, Some(1)));
  231. while let Ok(c) = self.next() {
  232. let token = match c {
  233. b'^' => Token::Start,
  234. b'$' => Token::End,
  235. b'.' => Token::Any,
  236. b'[' => {
  237. let mut list = Vec::new();
  238. let invert = self.input.first() == Some(&b'^');
  239. if invert {
  240. self.consume(1);
  241. }
  242. loop {
  243. let mut c = self.next()?;
  244. let mut push = true;
  245. if c == b'[' {
  246. // TODO: Handle collation characters properly,
  247. // because currently idk what they are and only
  248. // have the behavior of `grep` to go on.
  249. match self.next()? {
  250. b'.' => {
  251. c = self.next()?;
  252. self.expect(b'.')?;
  253. self.expect(b']')?;
  254. }
  255. b'=' => {
  256. c = self.next()?;
  257. self.expect(b'=')?;
  258. self.expect(b']')?;
  259. }
  260. b':' => {
  261. let end = self
  262. .input
  263. .iter()
  264. .position(|&c| c == b':')
  265. .ok_or(Error::EOF)?;
  266. let key = &self.input[..end];
  267. let class = *self
  268. .classes
  269. .get(key)
  270. .ok_or_else(|| Error::UnknownClass(key.to_vec()))?;
  271. self.consume(end + 1);
  272. self.expect(b']')?;
  273. list.push(Collation::Class(class));
  274. push = false;
  275. }
  276. _ => return Err(Error::UnknownCollation),
  277. }
  278. }
  279. if push {
  280. list.push(Collation::Char(c));
  281. if self.input.first() == Some(&b'-') && self.input.get(1) != Some(&b']')
  282. {
  283. self.consume(1);
  284. let dest = self.next()?;
  285. for c in (c + 1)..=dest {
  286. list.push(Collation::Char(c));
  287. }
  288. }
  289. }
  290. if self.input.first() == Some(&b']') {
  291. self.consume(1);
  292. break;
  293. }
  294. }
  295. Token::OneOf { invert, list }
  296. }
  297. b'\\'
  298. if self
  299. .input
  300. .first()
  301. .map(|&c| c.is_ascii_digit())
  302. .unwrap_or(false) =>
  303. {
  304. let id = self.take_int()?.unwrap();
  305. if (id as usize) >= self.group_id {
  306. return Err(Error::InvalidBackRef(id));
  307. }
  308. Token::BackRef(id)
  309. }
  310. b'\\' => match self.next()? {
  311. b'(' => {
  312. let id = self.group_id;
  313. self.group_id += 1;
  314. let checkpoint = self.builder.checkpoint();
  315. self.parse()?;
  316. let range = self.parse_range()?;
  317. self.builder
  318. .start_internal_at(checkpoint, Token::Group(id), range);
  319. self.builder.finish_internal();
  320. continue;
  321. }
  322. b')' => break,
  323. b'|' => {
  324. self.builder.finish_internal();
  325. self.builder
  326. .start_internal(Token::Alternative, Range(1, Some(1)));
  327. continue;
  328. }
  329. b'<' => Token::WordStart,
  330. b'>' => Token::WordEnd,
  331. b'a' => Token::OneOf {
  332. invert: false,
  333. list: vec![Collation::Class(ctype::is_alnum)],
  334. },
  335. b'd' => Token::OneOf {
  336. invert: false,
  337. list: vec![Collation::Class(ctype::is_digit)],
  338. },
  339. b's' => Token::OneOf {
  340. invert: false,
  341. list: vec![Collation::Class(ctype::is_space)],
  342. },
  343. b'S' => Token::OneOf {
  344. invert: true,
  345. list: vec![Collation::Class(ctype::is_space)],
  346. },
  347. b'n' => Token::Char(b'\n'),
  348. b'r' => Token::Char(b'\r'),
  349. b't' => Token::Char(b'\t'),
  350. c => Token::Char(c),
  351. },
  352. c => Token::Char(c),
  353. };
  354. let range = self.parse_range()?;
  355. self.builder.leaf(token, range);
  356. }
  357. self.builder.finish_internal();
  358. Ok(())
  359. }
  360. }
  361. #[cfg(test)]
  362. mod tests {
  363. use super::*;
  364. use alloc::{format, string::String};
  365. fn compile(input: &[u8]) -> String {
  366. format!(
  367. "{:?}",
  368. PosixRegexBuilder::new(input)
  369. .with_default_classes()
  370. .compile_tokens()
  371. .expect("error compiling regex")
  372. )
  373. }
  374. #[test]
  375. fn basic() {
  376. assert_eq!(
  377. compile(b"abc"),
  378. "\
  379. Root 1..1
  380. Alternative 1..1
  381. 'a' 1..1
  382. 'b' 1..1
  383. 'c' 1..1
  384. "
  385. );
  386. }
  387. #[test]
  388. fn groups() {
  389. assert_eq!(
  390. compile(br"\(abc\|bcd\|cde\)"),
  391. "\
  392. Root 1..1
  393. Alternative 1..1
  394. Group(1) 1..1
  395. Alternative 1..1
  396. 'a' 1..1
  397. 'b' 1..1
  398. 'c' 1..1
  399. Alternative 1..1
  400. 'b' 1..1
  401. 'c' 1..1
  402. 'd' 1..1
  403. Alternative 1..1
  404. 'c' 1..1
  405. 'd' 1..1
  406. 'e' 1..1
  407. "
  408. );
  409. assert_eq!(
  410. compile(br"\(abc\|\(bcd\|cde\)\)"),
  411. "\
  412. Root 1..1
  413. Alternative 1..1
  414. Group(1) 1..1
  415. Alternative 1..1
  416. 'a' 1..1
  417. 'b' 1..1
  418. 'c' 1..1
  419. Alternative 1..1
  420. Group(2) 1..1
  421. Alternative 1..1
  422. 'b' 1..1
  423. 'c' 1..1
  424. 'd' 1..1
  425. Alternative 1..1
  426. 'c' 1..1
  427. 'd' 1..1
  428. 'e' 1..1
  429. "
  430. );
  431. }
  432. #[test]
  433. fn words() {
  434. assert_eq!(
  435. compile(br"\<word\>"),
  436. "\
  437. Root 1..1
  438. Alternative 1..1
  439. < 1..1
  440. 'w' 1..1
  441. 'o' 1..1
  442. 'r' 1..1
  443. 'd' 1..1
  444. > 1..1
  445. "
  446. );
  447. }
  448. #[test]
  449. fn repetitions() {
  450. assert_eq!(
  451. compile(br"yeee*"),
  452. "\
  453. Root 1..1
  454. Alternative 1..1
  455. 'y' 1..1
  456. 'e' 1..1
  457. 'e' 1..1
  458. 'e' 0..
  459. "
  460. );
  461. assert_eq!(
  462. compile(br"yee\?"),
  463. "\
  464. Root 1..1
  465. Alternative 1..1
  466. 'y' 1..1
  467. 'e' 1..1
  468. 'e' 0..1
  469. "
  470. );
  471. assert_eq!(
  472. compile(br"yee\+"),
  473. "\
  474. Root 1..1
  475. Alternative 1..1
  476. 'y' 1..1
  477. 'e' 1..1
  478. 'e' 1..
  479. "
  480. );
  481. assert_eq!(
  482. compile(br"ye\{2}"),
  483. "\
  484. Root 1..1
  485. Alternative 1..1
  486. 'y' 1..1
  487. 'e' 2..2
  488. "
  489. );
  490. assert_eq!(
  491. compile(br"ye\{2,}"),
  492. "\
  493. Root 1..1
  494. Alternative 1..1
  495. 'y' 1..1
  496. 'e' 2..
  497. "
  498. );
  499. assert_eq!(
  500. compile(br"ye\{2,3}"),
  501. "\
  502. Root 1..1
  503. Alternative 1..1
  504. 'y' 1..1
  505. 'e' 2..3
  506. "
  507. );
  508. }
  509. #[test]
  510. fn bracket() {
  511. assert_eq!(
  512. compile(b"[abc]"),
  513. "\
  514. Root 1..1
  515. Alternative 1..1
  516. {invert: false, ['a', 'b', 'c']} 1..1
  517. "
  518. );
  519. assert_eq!(
  520. compile(b"[^abc]"),
  521. "\
  522. Root 1..1
  523. Alternative 1..1
  524. {invert: true, ['a', 'b', 'c']} 1..1
  525. "
  526. );
  527. assert_eq!(
  528. compile(b"[]] [^]]"),
  529. "\
  530. Root 1..1
  531. Alternative 1..1
  532. {invert: false, [']']} 1..1
  533. ' ' 1..1
  534. {invert: true, [']']} 1..1
  535. "
  536. );
  537. assert_eq!(
  538. compile(b"[0-3] [a-c] [-1] [1-]"),
  539. "\
  540. Root 1..1
  541. Alternative 1..1
  542. {invert: false, ['0', '1', '2', '3']} 1..1
  543. ' ' 1..1
  544. {invert: false, ['a', 'b', 'c']} 1..1
  545. ' ' 1..1
  546. {invert: false, ['-', '1']} 1..1
  547. ' ' 1..1
  548. {invert: false, ['1', '-']} 1..1
  549. "
  550. );
  551. assert_eq!(
  552. compile(b"[[.-.]-/]"),
  553. "\
  554. Root 1..1
  555. Alternative 1..1
  556. {invert: false, ['-', '.', '/']} 1..1
  557. "
  558. );
  559. assert_eq!(
  560. compile(b"[[:digit:][:upper:]]"),
  561. format!(
  562. "\
  563. Root 1..1
  564. Alternative 1..1
  565. {{invert: false, [{:p}, {:p}]}} 1..1
  566. ",
  567. ctype::is_digit as fn(u8) -> bool,
  568. ctype::is_upper as fn(u8) -> bool
  569. )
  570. );
  571. }
  572. #[test]
  573. fn newline() {
  574. assert_eq!(
  575. compile(br"\r\n"),
  576. "\
  577. Root 1..1
  578. Alternative 1..1
  579. '\\r' 1..1
  580. '\\n' 1..1
  581. "
  582. );
  583. }
  584. #[test]
  585. fn backref() {
  586. assert_eq!(
  587. compile(br"\([abc]\)\1"),
  588. "\
  589. Root 1..1
  590. Alternative 1..1
  591. Group(1) 1..1
  592. Alternative 1..1
  593. {invert: false, ['a', 'b', 'c']} 1..1
  594. \\1 1..1
  595. "
  596. )
  597. }
  598. }