matcher.rs 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515
  1. //! The matcher: Can find substrings in a string that match any compiled regex
  2. #[cfg(feature = "no_std")]
  3. use std::prelude::*;
  4. use compile::{Token, Range};
  5. use std::borrow::Cow;
  6. use std::fmt;
  7. use std::rc::Rc;
  8. /// A regex matcher, ready to match stuff
  9. #[derive(Clone)]
  10. pub struct PosixRegex {
  11. pub(crate) search: Vec<Vec<(Token, Range)>>
  12. }
  13. impl PosixRegex {
  14. /// Match the string starting at the current position. This does not find
  15. /// substrings.
  16. pub fn matches_exact(self, input: &[u8]) -> Option<PosixRegexResult> {
  17. let mut groups = Vec::new();
  18. let mut matcher = PosixRegexMatcher {
  19. input,
  20. offset: 0,
  21. groups: &mut groups
  22. };
  23. let start = matcher.offset;
  24. if !matcher.matches_exact(self.search.iter().filter_map(|tokens| Branch::new(tokens)).collect()) {
  25. return None;
  26. }
  27. let end = matcher.offset;
  28. Some(PosixRegexResult {
  29. start,
  30. end,
  31. groups
  32. })
  33. }
  34. }
  35. #[derive(Clone)]
  36. struct Branch<'a> {
  37. index: usize,
  38. repeated: u32,
  39. tokens: &'a [(Token, Range)],
  40. path: Box<[(usize, usize)]>,
  41. group_ids: Rc<[usize]>,
  42. repeat_min: u32,
  43. repeat_max: Option<u32>,
  44. next: Option<Rc<Branch<'a>>>
  45. }
  46. impl<'a> fmt::Debug for Branch<'a> {
  47. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  48. write!(f, "{:?}", self.get_token())
  49. }
  50. }
  51. impl<'a> Branch<'a> {
  52. fn new(tokens: &'a [(Token, Range)]) -> Option<Self> {
  53. if tokens.is_empty() {
  54. return None;
  55. }
  56. Some(Self {
  57. index: 0,
  58. repeated: 0,
  59. tokens: tokens,
  60. path: Vec::new().into(),
  61. group_ids: Vec::new().into(),
  62. repeat_min: 0,
  63. repeat_max: Some(0),
  64. next: None
  65. })
  66. }
  67. fn group(
  68. path: Box<[(usize, usize)]>,
  69. group_ids: Rc<[usize]>,
  70. tokens: &'a [(Token, Range)],
  71. range: Range,
  72. next: Option<Branch<'a>>
  73. ) -> Option<Self> {
  74. if tokens.is_empty() {
  75. return None;
  76. }
  77. Some(Self {
  78. index: 0,
  79. repeated: 0,
  80. tokens,
  81. path,
  82. group_ids,
  83. repeat_min: range.0.saturating_sub(1),
  84. repeat_max: range.1.map(|i| i.saturating_sub(1)),
  85. next: next.map(Rc::new)
  86. })
  87. }
  88. fn parent_tokens(&self) -> &[(Token, Range)] {
  89. let mut tokens = self.tokens;
  90. let len = self.path.len();
  91. if len > 0 {
  92. for &(index, variant) in &self.path[..len-1] {
  93. match tokens[index] {
  94. (Token::Group(ref inner), _) => tokens = &inner[variant],
  95. _ => panic!("non-group index in path")
  96. }
  97. }
  98. }
  99. tokens
  100. }
  101. fn tokens(&self) -> &[(Token, Range)] {
  102. let mut tokens = self.parent_tokens();
  103. if let Some(&(index, variant)) = self.path.last() {
  104. match tokens[index] {
  105. (Token::Group(ref inner), _) => tokens = &inner[variant],
  106. _ => panic!("non-group index in path")
  107. }
  108. }
  109. tokens
  110. }
  111. fn get_token(&self) -> &(Token, Range) {
  112. &self.tokens()[self.index]
  113. }
  114. fn next_branch(&self) -> Option<Self> {
  115. if self.repeat_min > 0 {
  116. // Don't add the next branch until we've repeated this one enough
  117. return None;
  118. }
  119. if self.index + 1 >= self.tokens().len() {
  120. return self.next.as_ref().map(|inner| (**inner).clone());
  121. }
  122. Some(Self {
  123. index: self.index + 1,
  124. repeated: 0,
  125. ..self.clone()
  126. })
  127. }
  128. fn add_repeats(&self, branches: &mut Vec<Branch<'a>>) {
  129. if self.repeat_max.map(|max| max == 0).unwrap_or(false) {
  130. return;
  131. }
  132. let tokens = self.parent_tokens();
  133. let &(index, _) = self.path.last().expect("add_repeats called on top level");
  134. match tokens[index] {
  135. (Token::Group(ref repeats), _) => {
  136. for alternative in 0..repeats.len() {
  137. let mut path = self.path.clone();
  138. path.last_mut().unwrap().1 = alternative;
  139. branches.push(Self {
  140. index: 0,
  141. path,
  142. repeated: 0,
  143. repeat_min: self.repeat_min.saturating_sub(1),
  144. repeat_max: self.repeat_max.map(|max| max - 1),
  145. ..self.clone()
  146. });
  147. }
  148. },
  149. _ => panic!("non-group index in path")
  150. }
  151. }
  152. /// Returns if this node is "explored" enough times,
  153. /// meaning it has repeated as many times as it want to and has nowhere to go next.
  154. fn is_explored(&self) -> bool {
  155. let mut branch = Cow::Borrowed(self);
  156. loop {
  157. if branch.repeat_min > 0 {
  158. // Group did not repeat enough times!
  159. return false;
  160. }
  161. let (_, Range(min, _)) = *branch.get_token();
  162. if branch.repeated < min {
  163. return false;
  164. }
  165. match branch.next_branch() {
  166. Some(next) => branch = Cow::Owned(next),
  167. None => break
  168. }
  169. }
  170. true
  171. }
  172. }
  173. struct PosixRegexMatcher<'a> {
  174. input: &'a [u8],
  175. offset: usize,
  176. groups: &'a mut Vec<(usize, usize)>
  177. }
  178. impl<'a> PosixRegexMatcher<'a> {
  179. fn expand<'b>(&mut self, branches: &[Branch<'b>]) -> Vec<Branch<'b>> {
  180. let mut insert = Vec::new();
  181. for branch in branches {
  182. let (ref token, range) = *branch.get_token();
  183. if let Token::Group(ref inner) = token {
  184. // Push the group's inner content as a new branch
  185. let group_id = self.groups.len();
  186. self.groups.push((self.offset, 0));
  187. let mut ids = Vec::with_capacity(branch.group_ids.len() + 1);
  188. ids.extend(&*branch.group_ids);
  189. ids.push(group_id);
  190. let ids = ids.into();
  191. for alternation in 0..inner.len() {
  192. let mut path = Vec::with_capacity(branch.path.len() + 1);
  193. path.extend(&*branch.path);
  194. path.push((branch.index, alternation));
  195. if let Some(branch) = Branch::group(
  196. path.into(),
  197. Rc::clone(&ids),
  198. branch.tokens,
  199. range,
  200. branch.next_branch()
  201. ) {
  202. insert.push(branch);
  203. }
  204. }
  205. }
  206. if branch.repeated >= range.0 {
  207. // Push the next element as a new branch
  208. if let Some(next) = branch.next_branch() {
  209. insert.push(next);
  210. }
  211. branch.add_repeats(&mut insert);
  212. }
  213. }
  214. if !insert.is_empty() {
  215. // Resolve recursively
  216. let mut new = self.expand(&insert);
  217. insert.append(&mut new);
  218. }
  219. insert
  220. }
  221. fn matches_exact(&mut self, mut branches: Vec<Branch>) -> bool {
  222. // Whether or not any branch, at any point, got fully explored. This
  223. // means at least one path of the regex successfully completed!
  224. let mut succeeded = false;
  225. let mut prev = None;
  226. loop {
  227. let next = self.input.get(self.offset).cloned();
  228. let mut index = 0;
  229. let mut remove = 0;
  230. let mut insert = self.expand(&branches);
  231. branches.append(&mut insert);
  232. loop {
  233. if index >= branches.len() {
  234. break;
  235. }
  236. if remove > 0 {
  237. // Just like Rust's `retain` function, shift all elements I
  238. // want to keep back and `truncate` when I'm done.
  239. branches.swap(index, index-remove);
  240. }
  241. let branch = &mut branches[index-remove];
  242. index += 1;
  243. let (ref token, Range(_, mut max)) = *branch.get_token();
  244. let mut token = token;
  245. let mut accepts = true;
  246. // Step 1: Handle zero-width stuff like ^ and \<
  247. loop {
  248. match token {
  249. Token::Start |
  250. Token::WordEnd |
  251. Token::WordStart => {
  252. // Should be cheap to clone since we already make sure
  253. // it's a type that doesn't hold any data.
  254. let original = token.clone();
  255. // Skip ahead to the next token.
  256. match branch.next_branch() {
  257. Some(next) => *branch = next,
  258. None => break
  259. }
  260. let (ref new_token, Range(_, new_max)) = *branch.get_token();
  261. token = new_token;
  262. max = new_max;
  263. accepts = match original {
  264. Token::Start => self.offset == 0,
  265. Token::WordEnd => next.map(::ctype::is_word_boundary).unwrap_or(true),
  266. Token::WordStart => prev.map(::ctype::is_word_boundary).unwrap_or(true),
  267. _ => unreachable!()
  268. };
  269. },
  270. _ => break
  271. }
  272. }
  273. // Step 2: Check if the token matches
  274. accepts = accepts && match *token {
  275. Token::Any => next.is_some(),
  276. Token::Char(c) => next == Some(c),
  277. Token::End => next.is_none(),
  278. Token::Group(_) => false, // <- handled separately
  279. Token::OneOf { invert, ref list } => if let Some(next) = next {
  280. list.iter().any(|c| c.matches(next)) == !invert
  281. } else { false },
  282. // These will only get called if they are encountered at
  283. // EOF (because next_branch returns None), for example
  284. // "abc\>" or "^". Then we simply want to return true as to
  285. // preserve the current `accepts` status.
  286. Token::Start |
  287. Token::WordEnd |
  288. Token::WordStart => true
  289. };
  290. if !accepts || max.map(|max| branch.repeated >= max).unwrap_or(false) {
  291. succeeded = succeeded || branch.is_explored();
  292. for &id in &*branch.group_ids {
  293. self.groups[id].1 = self.offset;
  294. }
  295. remove += 1;
  296. continue;
  297. }
  298. branch.repeated += 1;
  299. }
  300. let end = branches.len() - remove;
  301. branches.truncate(end);
  302. if branches.is_empty() {
  303. return succeeded;
  304. }
  305. if next.is_some() {
  306. self.offset += 1;
  307. prev = next;
  308. }
  309. }
  310. }
  311. }
  312. /// A single result, including start and end bounds
  313. #[derive(Debug, Clone, PartialEq, Eq)]
  314. pub struct PosixRegexResult {
  315. /// An offset in the input string to where the match started (inclusive)
  316. pub start: usize,
  317. /// An offset in the input string to where the match ended (exclusive)
  318. pub end: usize,
  319. /// Start/end offsets in the input to where a group matched
  320. pub groups: Vec<(usize, usize)>
  321. }
  322. #[cfg(test)]
  323. mod tests {
  324. #[cfg(feature = "bench")]
  325. extern crate test;
  326. #[cfg(feature = "bench")]
  327. use self::test::Bencher;
  328. use super::*;
  329. use ::PosixRegexBuilder;
  330. fn matches_exact(regex: &str, input: &str) -> Option<PosixRegexResult> {
  331. PosixRegexBuilder::new(regex.as_bytes())
  332. .with_default_classes()
  333. .compile()
  334. .expect("error compiling regex")
  335. .matches_exact(input.as_bytes())
  336. }
  337. #[test]
  338. fn basic() {
  339. assert!(matches_exact("abc", "abc").is_some());
  340. assert!(matches_exact("abc", "bbc").is_none());
  341. assert!(matches_exact("abc", "acc").is_none());
  342. assert!(matches_exact("abc", "abd").is_none());
  343. }
  344. #[test]
  345. fn repetitions() {
  346. assert!(matches_exact("abc*", "ab").is_some());
  347. assert!(matches_exact("abc*", "abc").is_some());
  348. assert!(matches_exact("abc*", "abccc").is_some());
  349. assert!(matches_exact(r"a\{1,2\}b", "b").is_none());
  350. assert!(matches_exact(r"a\{1,2\}b", "ab").is_some());
  351. assert!(matches_exact(r"a\{1,2\}b", "aab").is_some());
  352. assert!(matches_exact(r"a\{1,2\}b", "aaab").is_none());
  353. }
  354. #[test]
  355. fn any() {
  356. assert!(matches_exact(".*", "").is_some());
  357. assert!(matches_exact(".*b", "b").is_some());
  358. assert!(matches_exact(".*b", "ab").is_some());
  359. assert!(matches_exact(".*b", "aaaaab").is_some());
  360. assert!(matches_exact(".*b", "HELLO WORLD").is_none());
  361. assert!(matches_exact(".*b", "HELLO WORLDb").is_some());
  362. assert!(matches_exact("H.*O WORLD", "HELLO WORLD").is_some());
  363. assert!(matches_exact("H.*ORLD", "HELLO WORLD").is_some());
  364. }
  365. #[test]
  366. fn brackets() {
  367. assert!(matches_exact("[abc]*d", "abcd").is_some());
  368. assert!(matches_exact("[0-9]*d", "1234d").is_some());
  369. assert!(matches_exact("[[:digit:]]*d", "1234d").is_some());
  370. assert!(matches_exact("[[:digit:]]*d", "abcd").is_none());
  371. }
  372. #[test]
  373. fn alternations() {
  374. assert!(matches_exact(r"abc\|bcd", "abc").is_some());
  375. assert!(matches_exact(r"abc\|bcd", "bcd").is_some());
  376. assert!(matches_exact(r"abc\|bcd", "cde").is_none());
  377. assert!(matches_exact(r"[A-Z]\+\|yee", "").is_none());
  378. assert!(matches_exact(r"[A-Z]\+\|yee", "HELLO").is_some());
  379. assert!(matches_exact(r"[A-Z]\+\|yee", "yee").is_some());
  380. assert!(matches_exact(r"[A-Z]\+\|yee", "hello").is_none());
  381. }
  382. #[test]
  383. fn offsets() {
  384. assert_eq!(
  385. matches_exact("abc", "abcd"),
  386. Some(PosixRegexResult { start: 0, end: 3, groups: vec![] })
  387. );
  388. assert_eq!(
  389. matches_exact(r"[[:alpha:]]\+", "abcde12345"),
  390. Some(PosixRegexResult { start: 0, end: 5, groups: vec![] })
  391. );
  392. assert_eq!(
  393. matches_exact(r"a\(bc\)\+d", "abcbcd"),
  394. Some(PosixRegexResult { start: 0, end: 6, groups: vec![(1, 5)] })
  395. );
  396. assert_eq!(
  397. matches_exact(r"hello\( \(world\|universe\) :D\)\?!", "hello world :D!"),
  398. Some(PosixRegexResult { start: 0, end: 15, groups: vec![(5, 14), (6, 11)] })
  399. );
  400. assert_eq!(
  401. matches_exact(r"hello\( \(world\|universe\) :D\)\?", "hello world :D"),
  402. Some(PosixRegexResult { start: 0, end: 14, groups: vec![(5, 14), (6, 11)] })
  403. );
  404. assert_eq!(
  405. matches_exact(r"\(\<hello\>\) world", "hello world"),
  406. Some(PosixRegexResult { start: 0, end: 11, groups: vec![(0, 5)] })
  407. );
  408. }
  409. #[test]
  410. fn start_and_end() {
  411. assert!(matches_exact("^abc$", "abc").is_some());
  412. assert!(matches_exact("^bcd", "bcde").is_some());
  413. assert!(matches_exact("^bcd", "abcd").is_none());
  414. assert!(matches_exact("abc$", "abc").is_some());
  415. assert!(matches_exact("abc$", "abcd").is_none());
  416. assert!(matches_exact(r".*\(^\|a\)c", "c").is_some());
  417. assert!(matches_exact(r".*\(^\|a\)c", "ac").is_some());
  418. assert!(matches_exact(r".*\(^\|a\)c", "bc").is_none());
  419. // Tests if ^ can be repeated without issues
  420. assert!(matches_exact(".*^^a", "helloabc").is_none());
  421. assert!(matches_exact(".*^^a", "abc").is_some());
  422. }
  423. #[test]
  424. fn word_boundaries() {
  425. assert!(matches_exact(r"hello\>.world", "hello world").is_some());
  426. assert!(matches_exact(r"hello\>.world", "hello!world").is_some());
  427. assert!(matches_exact(r"hello\>.world", "hellooworld").is_none());
  428. assert!(matches_exact(r"hello.\<world", "hello world").is_some());
  429. assert!(matches_exact(r"hello.\<world", "hello!world").is_some());
  430. assert!(matches_exact(r"hello.\<world", "hellooworld").is_none());
  431. assert!(matches_exact(r".*\<hello\>", "hihello").is_none());
  432. assert!(matches_exact(r".*\<hello\>", "hi_hello").is_none());
  433. assert!(matches_exact(r".*\<hello\>", "hi hello").is_some());
  434. }
  435. #[test]
  436. fn groups() {
  437. assert!(matches_exact(r"\(a*\|b\|c\)d", "d").is_some());
  438. assert!(matches_exact(r"\(a*\|b\|c\)d", "aaaad").is_some());
  439. assert!(matches_exact(r"\(a*\|b\|c\)d", "bd").is_some());
  440. assert!(matches_exact(r"\(a*\|b\|c\)d", "bbbbbd").is_none());
  441. }
  442. #[test]
  443. fn repeating_groups() {
  444. assert!(matches_exact(r"\(a\|b\|c\)*d", "d").is_some());
  445. assert!(matches_exact(r"\(a\|b\|c\)*d", "aaaad").is_some());
  446. assert!(matches_exact(r"\(a\|b\|c\)*d", "bbbbd").is_some());
  447. assert!(matches_exact(r"\(a\|b\|c\)*d", "aabbd").is_some());
  448. assert!(matches_exact(r"\(a\|b\|c\)\{1,2\}d", "d").is_none());
  449. assert!(matches_exact(r"\(a\|b\|c\)\{1,2\}d", "ad").is_some());
  450. assert!(matches_exact(r"\(a\|b\|c\)\{1,2\}d", "abd").is_some());
  451. assert!(matches_exact(r"\(a\|b\|c\)\{1,2\}d", "abcd").is_none());
  452. assert!(matches_exact(r"\(a\|b\|c\)\{4\}d", "ababad").is_none());
  453. assert!(matches_exact(r"\(a\|b\|c\)\{4\}d", "ababd").is_some());
  454. assert!(matches_exact(r"\(a\|b\|c\)\{4\}d", "abad").is_none());
  455. }
  456. #[cfg(feature = "bench")]
  457. #[bench]
  458. fn speeeeed(b: &mut Bencher) {
  459. b.iter(|| {
  460. assert!(matches_exact(r"\(\(a*\|b\|c\) test\|yee\)", "aaaaa test").is_some());
  461. })
  462. }
  463. }