compile.rs 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908
  1. //! The regex "compiler", which parses the regex itself.
  2. //! Produces a matcher ready to match input.
  3. #[cfg(feature = "no_std")]
  4. use alloc::collections::BTreeMap as HashMap;
  5. #[cfg(not(feature = "no_std"))]
  6. use std::collections::HashMap;
  7. use alloc::{borrow::Cow, vec, vec::Vec};
  8. use core::fmt;
  9. use crate::{ctype, tree::*, PosixRegex};
  10. /// Repetition bounds, for example + is (1, None), and ? is (0, Some(1))
  11. #[derive(Clone, Copy, PartialEq, Eq)]
  12. pub struct Range(pub u32, pub Option<u32>);
  13. impl fmt::Debug for Range {
  14. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  15. match self {
  16. Range(start, None) => write!(f, "{}..", start),
  17. Range(start, Some(end)) => write!(f, "{}..{}", start, end),
  18. }
  19. }
  20. }
  21. /// An item inside square brackets, like `[abc]` or `[[:digit:]]`
  22. #[derive(Clone, PartialEq, Eq)]
  23. pub enum Collation {
  24. Char(u8),
  25. Class(fn(u8) -> bool),
  26. }
  27. impl Collation {
  28. /// Compare this collation to a character
  29. pub fn matches(&self, other: u8, insensitive: bool) -> bool {
  30. match *self {
  31. Collation::Char(me) if insensitive => {
  32. if ctype::is_alpha(me) && ctype::is_alpha(other) {
  33. me | 32 == other | 32
  34. } else {
  35. me == other
  36. }
  37. }
  38. Collation::Char(me) => me == other,
  39. Collation::Class(f) => f(other),
  40. }
  41. }
  42. }
  43. impl fmt::Debug for Collation {
  44. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  45. match *self {
  46. Collation::Char(c) => write!(f, "{:?}", c as char),
  47. Collation::Class(c) => write!(f, "{:p}", c),
  48. }
  49. }
  50. }
  51. /// A single "compiled" token, such as a `.` or a character literal
  52. #[derive(Clone, PartialEq, Eq)]
  53. pub enum Token {
  54. /// Internal token used to find matches that might be anywhere in the text
  55. InternalStart,
  56. Alternative,
  57. Any,
  58. BackRef(u32),
  59. Char(u8),
  60. End,
  61. Group(usize),
  62. OneOf {
  63. invert: bool,
  64. list: Vec<Collation>,
  65. },
  66. Root,
  67. Start,
  68. WordEnd,
  69. WordStart,
  70. }
  71. impl fmt::Debug for Token {
  72. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  73. match *self {
  74. Token::InternalStart => write!(f, "<START>"),
  75. Token::Alternative => write!(f, "Alternative"),
  76. Token::Any => write!(f, "."),
  77. Token::BackRef(id) => write!(f, "\\{}", id),
  78. Token::Char(c) => write!(f, "{:?}", c as char),
  79. Token::End => write!(f, "$"),
  80. Token::Group(id) => write!(f, "Group({})", id),
  81. Token::OneOf { invert, ref list } => write!(f, "{{invert: {}, {:?}}}", invert, list),
  82. Token::Root => write!(f, "Root"),
  83. Token::Start => write!(f, "^"),
  84. Token::WordEnd => write!(f, ">"),
  85. Token::WordStart => write!(f, "<"),
  86. }
  87. }
  88. }
  89. /// An error that occurred while compiling the regex
  90. #[derive(Clone, Debug, PartialEq, Eq)]
  91. pub enum Error {
  92. EOF,
  93. EmptyRepetition,
  94. Expected(u8, Option<u8>),
  95. IllegalRange,
  96. IntegerOverflow,
  97. InvalidBackRef(u32),
  98. LeadingRepetition,
  99. UnclosedRepetition,
  100. UnexpectedToken(u8),
  101. UnknownClass(Vec<u8>),
  102. UnknownCollation,
  103. }
  104. /// A regex builder struct
  105. pub struct PosixRegexBuilder<'a> {
  106. input: &'a [u8],
  107. classes: HashMap<&'a [u8], fn(u8) -> bool>,
  108. group_id: usize,
  109. builder: TreeBuilder,
  110. extended: bool,
  111. }
  112. impl<'a> PosixRegexBuilder<'a> {
  113. /// Create a new instance that is ready to parse the regex `input`
  114. pub fn new(input: &'a [u8]) -> Self {
  115. Self {
  116. input,
  117. classes: HashMap::new(),
  118. group_id: 1,
  119. builder: TreeBuilder::default(),
  120. extended: false,
  121. }
  122. }
  123. /// Add a custom collation class, for use within square brackets (such as `[[:digit:]]`)
  124. pub fn with_class(mut self, name: &'a [u8], callback: fn(u8) -> bool) -> Self {
  125. self.classes.insert(name, callback);
  126. self
  127. }
  128. /// Add all the default collation classes, like `[[:digit:]]` and `[[:alnum:]]`
  129. pub fn with_default_classes(mut self) -> Self {
  130. #[cfg(not(feature = "no_std"))]
  131. self.classes.reserve(12);
  132. self.classes.insert(b"alnum", ctype::is_alnum);
  133. self.classes.insert(b"alpha", ctype::is_alpha);
  134. self.classes.insert(b"blank", ctype::is_blank);
  135. self.classes.insert(b"cntrl", ctype::is_cntrl);
  136. self.classes.insert(b"digit", ctype::is_digit);
  137. self.classes.insert(b"graph", ctype::is_graph);
  138. self.classes.insert(b"lower", ctype::is_lower);
  139. self.classes.insert(b"print", ctype::is_print);
  140. self.classes.insert(b"punct", ctype::is_punct);
  141. self.classes.insert(b"space", ctype::is_space);
  142. self.classes.insert(b"upper", ctype::is_upper);
  143. self.classes.insert(b"xdigit", ctype::is_xdigit);
  144. self
  145. }
  146. /// Use POSIX extended regex
  147. pub fn extended(mut self, extended: bool) -> Self {
  148. self.extended = true;
  149. self
  150. }
  151. /// "Compile" this regex to a struct ready to match input
  152. pub fn compile(self) -> Result<PosixRegex<'static>, Error> {
  153. let tree = self.compile_tokens()?;
  154. Ok(PosixRegex::new(Cow::Owned(tree)))
  155. }
  156. pub fn compile_tokens(mut self) -> Result<Tree, Error> {
  157. self.builder.start_internal(Token::Root, Range(1, Some(1)));
  158. self.parse()?;
  159. self.builder.finish_internal();
  160. Ok(self.builder.finish())
  161. }
  162. fn consume(&mut self, amount: usize) {
  163. self.input = &self.input[amount..];
  164. }
  165. fn take_int(&mut self) -> Result<Option<u32>, Error> {
  166. let mut out: Option<u32> = None;
  167. while let Some(&c @ b'0'..=b'9') = self.input.first() {
  168. self.consume(1);
  169. out = Some(
  170. out.unwrap_or(0)
  171. .checked_mul(10)
  172. .and_then(|out| out.checked_add((c - b'0') as u32))
  173. .ok_or(Error::IntegerOverflow)?,
  174. );
  175. }
  176. Ok(out)
  177. }
  178. fn next(&mut self) -> Result<u8, Error> {
  179. self.input
  180. .first()
  181. .map(|&c| {
  182. self.consume(1);
  183. c
  184. })
  185. .ok_or(Error::EOF)
  186. }
  187. fn expect(&mut self, c: u8) -> Result<(), Error> {
  188. if self.input.first() != Some(&c) {
  189. return Err(Error::Expected(c, self.input.first().cloned()));
  190. }
  191. self.consume(1);
  192. Ok(())
  193. }
  194. fn parse_range(&mut self) -> Result<Range, Error> {
  195. let mut range = Range(1, Some(1));
  196. if let Some(&c) = self.input.first() {
  197. let new = match c {
  198. b'*' => Some((1, Range(0, None))),
  199. b'\\' => match self.input.get(1) {
  200. Some(b'?') if !self.extended => Some((2, Range(0, Some(1)))),
  201. Some(b'+') if !self.extended => Some((2, Range(1, None))),
  202. Some(b'{') if !self.extended => {
  203. self.consume(2);
  204. let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
  205. let mut second = Some(first);
  206. if let Some(b',') = self.input.first() {
  207. self.consume(1);
  208. second = self.take_int()?;
  209. }
  210. if self.input.first() == Some(&b'}') {
  211. self.consume(1);
  212. } else if self.input.starts_with(br"\}") {
  213. self.consume(2);
  214. } else {
  215. return Err(Error::UnclosedRepetition);
  216. }
  217. if second.map(|second| first > second).unwrap_or(false) {
  218. return Err(Error::IllegalRange);
  219. }
  220. range = Range(first, second);
  221. None
  222. }
  223. _ => None,
  224. },
  225. b'?' if self.extended => Some((1, Range(0, Some(1)))),
  226. b'+' if self.extended => Some((1, Range(1, None))),
  227. b'{' if self.extended => {
  228. self.consume(1);
  229. let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
  230. let mut second = Some(first);
  231. if let Some(b',') = self.input.first() {
  232. self.consume(1);
  233. second = self.take_int()?;
  234. }
  235. if self.input.first() == Some(&b'}') {
  236. self.consume(1);
  237. } else if self.input.starts_with(br"\}") {
  238. self.consume(2);
  239. } else {
  240. return Err(Error::UnclosedRepetition);
  241. }
  242. if second.map(|second| first > second).unwrap_or(false) {
  243. return Err(Error::IllegalRange);
  244. }
  245. range = Range(first, second);
  246. None
  247. }
  248. _ => None,
  249. };
  250. if let Some((consume, new)) = new {
  251. range = new;
  252. self.consume(consume);
  253. }
  254. }
  255. Ok(range)
  256. }
  257. fn parse(&mut self) -> Result<(), Error> {
  258. self.builder
  259. .start_internal(Token::Alternative, Range(1, Some(1)));
  260. while let Ok(c) = self.next() {
  261. let token = match c {
  262. b'^' => Token::Start,
  263. b'$' => Token::End,
  264. b'.' => Token::Any,
  265. b'[' => {
  266. let mut list = Vec::new();
  267. let invert = self.input.first() == Some(&b'^');
  268. if invert {
  269. self.consume(1);
  270. }
  271. loop {
  272. let mut c = self.next()?;
  273. let mut push = true;
  274. if c == b'[' {
  275. // TODO: Handle collation characters properly,
  276. // because currently idk what they are and only
  277. // have the behavior of `grep` to go on.
  278. match self.next()? {
  279. b'.' => {
  280. c = self.next()?;
  281. self.expect(b'.')?;
  282. self.expect(b']')?;
  283. }
  284. b'=' => {
  285. c = self.next()?;
  286. self.expect(b'=')?;
  287. self.expect(b']')?;
  288. }
  289. b':' => {
  290. let end = self
  291. .input
  292. .iter()
  293. .position(|&c| c == b':')
  294. .ok_or(Error::EOF)?;
  295. let key = &self.input[..end];
  296. let class = *self
  297. .classes
  298. .get(key)
  299. .ok_or_else(|| Error::UnknownClass(key.to_vec()))?;
  300. self.consume(end + 1);
  301. self.expect(b']')?;
  302. list.push(Collation::Class(class));
  303. push = false;
  304. }
  305. _ => return Err(Error::UnknownCollation),
  306. }
  307. }
  308. if push {
  309. list.push(Collation::Char(c));
  310. if self.input.first() == Some(&b'-') && self.input.get(1) != Some(&b']')
  311. {
  312. self.consume(1);
  313. let dest = self.next()?;
  314. for c in (c + 1)..=dest {
  315. list.push(Collation::Char(c));
  316. }
  317. }
  318. }
  319. if self.input.first() == Some(&b']') {
  320. self.consume(1);
  321. break;
  322. }
  323. }
  324. Token::OneOf { invert, list }
  325. }
  326. b'\\'
  327. if self
  328. .input
  329. .first()
  330. .map(|&c| c.is_ascii_digit())
  331. .unwrap_or(false) =>
  332. {
  333. let id = self.take_int()?.unwrap();
  334. if (id as usize) >= self.group_id {
  335. return Err(Error::InvalidBackRef(id));
  336. }
  337. Token::BackRef(id)
  338. }
  339. b'\\' => match self.next()? {
  340. b'(' if !self.extended => {
  341. let id = self.group_id;
  342. self.group_id += 1;
  343. let checkpoint = self.builder.checkpoint();
  344. self.parse()?;
  345. let range = self.parse_range()?;
  346. self.builder
  347. .start_internal_at(checkpoint, Token::Group(id), range);
  348. self.builder.finish_internal();
  349. continue;
  350. }
  351. b')' if !self.extended => break,
  352. b'|' if !self.extended => {
  353. self.builder.finish_internal();
  354. self.builder
  355. .start_internal(Token::Alternative, Range(1, Some(1)));
  356. continue;
  357. }
  358. b'<' => Token::WordStart,
  359. b'>' => Token::WordEnd,
  360. b'a' => Token::OneOf {
  361. invert: false,
  362. list: vec![Collation::Class(ctype::is_alnum)],
  363. },
  364. b'd' => Token::OneOf {
  365. invert: false,
  366. list: vec![Collation::Class(ctype::is_digit)],
  367. },
  368. b's' => Token::OneOf {
  369. invert: false,
  370. list: vec![Collation::Class(ctype::is_space)],
  371. },
  372. b'S' => Token::OneOf {
  373. invert: true,
  374. list: vec![Collation::Class(ctype::is_space)],
  375. },
  376. b'n' => Token::Char(b'\n'),
  377. b'r' => Token::Char(b'\r'),
  378. b't' => Token::Char(b'\t'),
  379. c => Token::Char(c),
  380. },
  381. b'(' if self.extended => {
  382. let id = self.group_id;
  383. self.group_id += 1;
  384. let checkpoint = self.builder.checkpoint();
  385. self.parse()?;
  386. let range = self.parse_range()?;
  387. self.builder
  388. .start_internal_at(checkpoint, Token::Group(id), range);
  389. self.builder.finish_internal();
  390. continue;
  391. }
  392. b')' if self.extended => break,
  393. b'|' if self.extended => {
  394. self.builder.finish_internal();
  395. self.builder
  396. .start_internal(Token::Alternative, Range(1, Some(1)));
  397. continue;
  398. }
  399. c => Token::Char(c),
  400. };
  401. let range = self.parse_range()?;
  402. self.builder.leaf(token, range);
  403. }
  404. self.builder.finish_internal();
  405. Ok(())
  406. }
  407. }
  408. #[cfg(test)]
  409. mod tests {
  410. use super::*;
  411. use alloc::{format, string::String};
  412. fn compile(input: &[u8]) -> String {
  413. format!(
  414. "{:?}",
  415. PosixRegexBuilder::new(input)
  416. .with_default_classes()
  417. .compile_tokens()
  418. .expect("error compiling regex")
  419. )
  420. }
  421. #[test]
  422. fn basic() {
  423. assert_eq!(
  424. compile(b"abc"),
  425. "\
  426. Root 1..1
  427. Alternative 1..1
  428. 'a' 1..1
  429. 'b' 1..1
  430. 'c' 1..1
  431. "
  432. );
  433. }
  434. #[test]
  435. fn groups() {
  436. assert_eq!(
  437. compile(br"\(abc\|bcd\|cde\)"),
  438. "\
  439. Root 1..1
  440. Alternative 1..1
  441. Group(1) 1..1
  442. Alternative 1..1
  443. 'a' 1..1
  444. 'b' 1..1
  445. 'c' 1..1
  446. Alternative 1..1
  447. 'b' 1..1
  448. 'c' 1..1
  449. 'd' 1..1
  450. Alternative 1..1
  451. 'c' 1..1
  452. 'd' 1..1
  453. 'e' 1..1
  454. "
  455. );
  456. assert_eq!(
  457. compile(br"\(abc\|\(bcd\|cde\)\)"),
  458. "\
  459. Root 1..1
  460. Alternative 1..1
  461. Group(1) 1..1
  462. Alternative 1..1
  463. 'a' 1..1
  464. 'b' 1..1
  465. 'c' 1..1
  466. Alternative 1..1
  467. Group(2) 1..1
  468. Alternative 1..1
  469. 'b' 1..1
  470. 'c' 1..1
  471. 'd' 1..1
  472. Alternative 1..1
  473. 'c' 1..1
  474. 'd' 1..1
  475. 'e' 1..1
  476. "
  477. );
  478. }
  479. #[test]
  480. fn words() {
  481. assert_eq!(
  482. compile(br"\<word\>"),
  483. "\
  484. Root 1..1
  485. Alternative 1..1
  486. < 1..1
  487. 'w' 1..1
  488. 'o' 1..1
  489. 'r' 1..1
  490. 'd' 1..1
  491. > 1..1
  492. "
  493. );
  494. }
  495. #[test]
  496. fn repetitions() {
  497. assert_eq!(
  498. compile(br"yeee*"),
  499. "\
  500. Root 1..1
  501. Alternative 1..1
  502. 'y' 1..1
  503. 'e' 1..1
  504. 'e' 1..1
  505. 'e' 0..
  506. "
  507. );
  508. assert_eq!(
  509. compile(br"yee\?"),
  510. "\
  511. Root 1..1
  512. Alternative 1..1
  513. 'y' 1..1
  514. 'e' 1..1
  515. 'e' 0..1
  516. "
  517. );
  518. assert_eq!(
  519. compile(br"yee\+"),
  520. "\
  521. Root 1..1
  522. Alternative 1..1
  523. 'y' 1..1
  524. 'e' 1..1
  525. 'e' 1..
  526. "
  527. );
  528. assert_eq!(
  529. compile(br"ye\{2}"),
  530. "\
  531. Root 1..1
  532. Alternative 1..1
  533. 'y' 1..1
  534. 'e' 2..2
  535. "
  536. );
  537. assert_eq!(
  538. compile(br"ye\{2,}"),
  539. "\
  540. Root 1..1
  541. Alternative 1..1
  542. 'y' 1..1
  543. 'e' 2..
  544. "
  545. );
  546. assert_eq!(
  547. compile(br"ye\{2,3}"),
  548. "\
  549. Root 1..1
  550. Alternative 1..1
  551. 'y' 1..1
  552. 'e' 2..3
  553. "
  554. );
  555. }
  556. #[test]
  557. fn bracket() {
  558. assert_eq!(
  559. compile(b"[abc]"),
  560. "\
  561. Root 1..1
  562. Alternative 1..1
  563. {invert: false, ['a', 'b', 'c']} 1..1
  564. "
  565. );
  566. assert_eq!(
  567. compile(b"[^abc]"),
  568. "\
  569. Root 1..1
  570. Alternative 1..1
  571. {invert: true, ['a', 'b', 'c']} 1..1
  572. "
  573. );
  574. assert_eq!(
  575. compile(b"[]] [^]]"),
  576. "\
  577. Root 1..1
  578. Alternative 1..1
  579. {invert: false, [']']} 1..1
  580. ' ' 1..1
  581. {invert: true, [']']} 1..1
  582. "
  583. );
  584. assert_eq!(
  585. compile(b"[0-3] [a-c] [-1] [1-]"),
  586. "\
  587. Root 1..1
  588. Alternative 1..1
  589. {invert: false, ['0', '1', '2', '3']} 1..1
  590. ' ' 1..1
  591. {invert: false, ['a', 'b', 'c']} 1..1
  592. ' ' 1..1
  593. {invert: false, ['-', '1']} 1..1
  594. ' ' 1..1
  595. {invert: false, ['1', '-']} 1..1
  596. "
  597. );
  598. assert_eq!(
  599. compile(b"[[.-.]-/]"),
  600. "\
  601. Root 1..1
  602. Alternative 1..1
  603. {invert: false, ['-', '.', '/']} 1..1
  604. "
  605. );
  606. assert_eq!(
  607. compile(b"[[:digit:][:upper:]]"),
  608. format!(
  609. "\
  610. Root 1..1
  611. Alternative 1..1
  612. {{invert: false, [{:p}, {:p}]}} 1..1
  613. ",
  614. ctype::is_digit as fn(u8) -> bool,
  615. ctype::is_upper as fn(u8) -> bool
  616. )
  617. );
  618. }
  619. #[test]
  620. fn newline() {
  621. assert_eq!(
  622. compile(br"\r\n"),
  623. "\
  624. Root 1..1
  625. Alternative 1..1
  626. '\\r' 1..1
  627. '\\n' 1..1
  628. "
  629. );
  630. }
  631. #[test]
  632. fn backref() {
  633. assert_eq!(
  634. compile(br"\([abc]\)\1"),
  635. "\
  636. Root 1..1
  637. Alternative 1..1
  638. Group(1) 1..1
  639. Alternative 1..1
  640. {invert: false, ['a', 'b', 'c']} 1..1
  641. \\1 1..1
  642. "
  643. )
  644. }
  645. fn compile_extended(input: &[u8]) -> String {
  646. format!(
  647. "{:?}",
  648. PosixRegexBuilder::new(input)
  649. .with_default_classes()
  650. .extended(true)
  651. .compile_tokens()
  652. .expect("error compiling regex")
  653. )
  654. }
  655. #[test]
  656. fn basic_extended() {
  657. assert_eq!(
  658. compile_extended(b"abc"),
  659. "\
  660. Root 1..1
  661. Alternative 1..1
  662. 'a' 1..1
  663. 'b' 1..1
  664. 'c' 1..1
  665. "
  666. );
  667. }
  668. #[test]
  669. fn groups_extended() {
  670. assert_eq!(
  671. compile_extended(br"(abc|bcd|cde)"),
  672. "\
  673. Root 1..1
  674. Alternative 1..1
  675. Group(1) 1..1
  676. Alternative 1..1
  677. 'a' 1..1
  678. 'b' 1..1
  679. 'c' 1..1
  680. Alternative 1..1
  681. 'b' 1..1
  682. 'c' 1..1
  683. 'd' 1..1
  684. Alternative 1..1
  685. 'c' 1..1
  686. 'd' 1..1
  687. 'e' 1..1
  688. "
  689. );
  690. assert_eq!(
  691. compile_extended(br"(abc|(bcd|cde))"),
  692. "\
  693. Root 1..1
  694. Alternative 1..1
  695. Group(1) 1..1
  696. Alternative 1..1
  697. 'a' 1..1
  698. 'b' 1..1
  699. 'c' 1..1
  700. Alternative 1..1
  701. Group(2) 1..1
  702. Alternative 1..1
  703. 'b' 1..1
  704. 'c' 1..1
  705. 'd' 1..1
  706. Alternative 1..1
  707. 'c' 1..1
  708. 'd' 1..1
  709. 'e' 1..1
  710. "
  711. );
  712. }
  713. #[test]
  714. fn words_extended() {
  715. assert_eq!(
  716. compile_extended(br"\<word\>"),
  717. "\
  718. Root 1..1
  719. Alternative 1..1
  720. < 1..1
  721. 'w' 1..1
  722. 'o' 1..1
  723. 'r' 1..1
  724. 'd' 1..1
  725. > 1..1
  726. "
  727. );
  728. }
  729. #[test]
  730. fn repetitions_extended() {
  731. assert_eq!(
  732. compile_extended(br"yeee*"),
  733. "\
  734. Root 1..1
  735. Alternative 1..1
  736. 'y' 1..1
  737. 'e' 1..1
  738. 'e' 1..1
  739. 'e' 0..
  740. "
  741. );
  742. assert_eq!(
  743. compile_extended(br"yee?"),
  744. "\
  745. Root 1..1
  746. Alternative 1..1
  747. 'y' 1..1
  748. 'e' 1..1
  749. 'e' 0..1
  750. "
  751. );
  752. assert_eq!(
  753. compile_extended(br"yee+"),
  754. "\
  755. Root 1..1
  756. Alternative 1..1
  757. 'y' 1..1
  758. 'e' 1..1
  759. 'e' 1..
  760. "
  761. );
  762. assert_eq!(
  763. compile_extended(br"ye{2}"),
  764. "\
  765. Root 1..1
  766. Alternative 1..1
  767. 'y' 1..1
  768. 'e' 2..2
  769. "
  770. );
  771. assert_eq!(
  772. compile_extended(br"ye{2,}"),
  773. "\
  774. Root 1..1
  775. Alternative 1..1
  776. 'y' 1..1
  777. 'e' 2..
  778. "
  779. );
  780. assert_eq!(
  781. compile_extended(br"ye{2,3}"),
  782. "\
  783. Root 1..1
  784. Alternative 1..1
  785. 'y' 1..1
  786. 'e' 2..3
  787. "
  788. );
  789. }
  790. #[test]
  791. fn bracket_extended() {
  792. assert_eq!(
  793. compile_extended(b"[abc]"),
  794. "\
  795. Root 1..1
  796. Alternative 1..1
  797. {invert: false, ['a', 'b', 'c']} 1..1
  798. "
  799. );
  800. assert_eq!(
  801. compile_extended(b"[^abc]"),
  802. "\
  803. Root 1..1
  804. Alternative 1..1
  805. {invert: true, ['a', 'b', 'c']} 1..1
  806. "
  807. );
  808. assert_eq!(
  809. compile_extended(b"[]] [^]]"),
  810. "\
  811. Root 1..1
  812. Alternative 1..1
  813. {invert: false, [']']} 1..1
  814. ' ' 1..1
  815. {invert: true, [']']} 1..1
  816. "
  817. );
  818. assert_eq!(
  819. compile_extended(b"[0-3] [a-c] [-1] [1-]"),
  820. "\
  821. Root 1..1
  822. Alternative 1..1
  823. {invert: false, ['0', '1', '2', '3']} 1..1
  824. ' ' 1..1
  825. {invert: false, ['a', 'b', 'c']} 1..1
  826. ' ' 1..1
  827. {invert: false, ['-', '1']} 1..1
  828. ' ' 1..1
  829. {invert: false, ['1', '-']} 1..1
  830. "
  831. );
  832. assert_eq!(
  833. compile_extended(b"[[.-.]-/]"),
  834. "\
  835. Root 1..1
  836. Alternative 1..1
  837. {invert: false, ['-', '.', '/']} 1..1
  838. "
  839. );
  840. assert_eq!(
  841. compile_extended(b"[[:digit:][:upper:]]"),
  842. format!(
  843. "\
  844. Root 1..1
  845. Alternative 1..1
  846. {{invert: false, [{:p}, {:p}]}} 1..1
  847. ",
  848. ctype::is_digit as fn(u8) -> bool,
  849. ctype::is_upper as fn(u8) -> bool
  850. )
  851. );
  852. }
  853. #[test]
  854. fn newline_extended() {
  855. assert_eq!(
  856. compile_extended(br"\r\n"),
  857. "\
  858. Root 1..1
  859. Alternative 1..1
  860. '\\r' 1..1
  861. '\\n' 1..1
  862. "
  863. );
  864. }
  865. #[test]
  866. fn backref_extended() {
  867. assert_eq!(
  868. compile_extended(br"([abc])\1"),
  869. "\
  870. Root 1..1
  871. Alternative 1..1
  872. Group(1) 1..1
  873. Alternative 1..1
  874. {invert: false, ['a', 'b', 'c']} 1..1
  875. \\1 1..1
  876. "
  877. )
  878. }
  879. }