token.rs

use crate::error::Error;
use std::fmt;
use std::iter::Peekable;
use std::str::CharIndices;
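
/// A single token of input: punctuation, a double-quoted string, or a bare
/// word. Borrowed slices point into the original input string.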
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
pub enum Token<'a> {
    Dot,
    Comma,
    Semi,
    Exclamation,
    Question,
    Colon,
    Quote(&'a str),
    Word(&'a str),
}
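
// Displaying a token reproduces its surface form; `Quote` bodies are
// re-wrapped in double quotes.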
impl fmt::Display for Token<'_> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::Dot => write!(f, "."),
            Token::Comma => write!(f, ","),
            Token::Semi => write!(f, ";"),
            Token::Exclamation => write!(f, "!"),
            Token::Question => write!(f, "?"),
            Token::Colon => write!(f, ":"),
            Token::Quote(body) => write!(f, r#""{}""#, body),
            Token::Word(word) => write!(f, "{}", word),
        }
    }
}
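
/// A cursor over the input string that yields `Token`s on demand. Cloning
/// the tokenizer is cheap, so lookahead (see `peek_token`) works by cloning
/// and advancing the copy.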
#[derive(Clone, Debug)]
pub struct Tokenizer<'a> {
    input: &'a str,
    chars: Peekable<CharIndices<'a>>,
}
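
/// The ways tokenization can fail.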
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum ErrorKind {
    UnterminatedString,
    QuoteInWord,
    RawString,
}

impl std::error::Error for ErrorKind {}

impl fmt::Display for ErrorKind {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "{}",
            match self {
                ErrorKind::UnterminatedString => "unterminated string",
                ErrorKind::QuoteInWord => "quote in word",
                ErrorKind::RawString => "raw strings are not yet supported",
            }
        )
    }
}
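
// Test-only helper: extract the error position and the concrete `ErrorKind`
// from the type-erased `source` box.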
#[cfg(test)]
impl<'a> Error<'a> {
    fn position_and_kind(&self) -> (usize, ErrorKind) {
        (
            self.position,
            *self.source.downcast_ref::<ErrorKind>().unwrap(),
        )
    }
}

impl<'a> Tokenizer<'a> {
    pub fn new(input: &'a str) -> Tokenizer<'a> {
        Tokenizer {
            input,
            chars: input.char_indices().peekable(),
        }
    }
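
    /// Wraps `source` in a `crate::error::Error` annotated with the current
    /// byte position in the input.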
    pub fn error<T: 'static + std::error::Error>(&mut self, source: T) -> Error<'a> {
        Error {
            input: self.input,
            position: self.cur_pos(),
            source: Box::new(source),
        }
    }

    fn consume_whitespace(&mut self) {
        while self.cur().map_or(false, |c| c.1.is_whitespace()) {
            self.advance();
        }
    }
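
    // Returns the punctuation token under the cursor, if any, without
    // consuming it. `consume_punct` is the advancing variant.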
    fn cur_punct(&mut self) -> Option<Token<'static>> {
        let (_, ch) = self.cur()?;
        match ch {
            '.' => Some(Token::Dot),
            ',' => Some(Token::Comma),
            ':' => Some(Token::Colon),
            '!' => Some(Token::Exclamation),
            '?' => Some(Token::Question),
            ';' => Some(Token::Semi),
            _ => None,
        }
    }

    fn consume_punct(&mut self) -> Option<Token<'a>> {
        let x = self.cur_punct()?;
        self.advance();
        Some(x)
    }
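
    // Low-level cursor helpers: peek the current (position, char) pair, test
    // for end of input, advance one char, and slice the input from an earlier
    // position up to the cursor.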
    fn cur(&mut self) -> Option<(usize, char)> {
        self.chars.peek().cloned()
    }

    fn at_end(&mut self) -> bool {
        self.chars.peek().is_none()
    }

    fn advance(&mut self) -> Option<()> {
        self.chars.next()?;
        Some(())
    }

    fn cur_pos(&mut self) -> usize {
        self.cur().map_or(self.input.len(), |(pos, _)| pos)
    }

    fn str_from(&mut self, pos: usize) -> &'a str {
        &self.input[pos..self.cur_pos()]
    }
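
    // Consumes a double-quoted string if one starts at the cursor. Returns
    // `Ok(None)` when the cursor is not on a `"`, and an `UnterminatedString`
    // error if the closing quote never appears.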
    fn consume_string(&mut self) -> Result<Option<Token<'a>>, Error<'a>> {
        if let Some((_, '"')) = self.cur() {
            // okay
        } else {
            return Ok(None);
        }
        self.advance(); // eat "
        let start = self.cur_pos();
        loop {
            match self.cur() {
                Some((_, '"')) => break,
                Some(_) => self.advance(),
                None => return Err(self.error(ErrorKind::UnterminatedString)),
            };
        }
        let body = self.str_from(start);
        self.advance(); // eat final '"'
        Ok(Some(Token::Quote(body)))
    }

    pub fn position(&mut self) -> usize {
        self.cur_pos()
    }
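
    // Lookahead without consuming input: clone the whole tokenizer and run
    // `next_token` on the copy.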
    pub fn peek_token(&mut self) -> Result<Option<Token<'a>>, Error<'a>> {
        self.clone().next_token()
    }
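
    /// Returns the next token, or `Ok(None)` at end of input. Tries, in
    /// order: skipping whitespace, punctuation, a quoted string, and finally
    /// a bare word.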
    pub fn next_token(&mut self) -> Result<Option<Token<'a>>, Error<'a>> {
        self.consume_whitespace();
        if self.at_end() {
            return Ok(None);
        }
        if let Some(punct) = self.consume_punct() {
            return Ok(Some(punct));
        }
        if let Some(s) = self.consume_string()? {
            return Ok(Some(s));
        }
        // Attempt to consume a word from the input.
        // Stop if we encounter whitespace or punctuation.
        let start = self.cur_pos();
        while self.cur().map_or(false, |(_, ch)| {
            !(self.cur_punct().is_some() || ch.is_whitespace())
        }) {
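            // A `"` may not appear inside a word. If everything so far looks
            // like a raw-string opener (`r` followed by `#`s), report it as an
            // unsupported raw string instead of a stray quote.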
            if self.cur().unwrap().1 == '"' {
                let so_far = self.str_from(start);
                if so_far.starts_with('r') && so_far.chars().skip(1).all(|v| v == '#' || v == '"') {
                    return Err(self.error(ErrorKind::RawString));
                } else {
                    return Err(self.error(ErrorKind::QuoteInWord));
                }
            }
            self.advance();
        }
        Ok(Some(Token::Word(self.str_from(start))))
    }
}
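
// Test-only convenience: drain a Tokenizer into a Vec of tokens.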
#[cfg(test)]
fn tokenize<'a>(input: &'a str) -> Result<Vec<Token<'a>>, Error<'a>> {
    let mut tokens = Vec::new();
    let mut gen = Tokenizer::new(input);
    while let Some(tok) = gen.next_token()? {
        tokens.push(tok);
    }
    Ok(tokens)
}

#[test]
fn tokenize_1() {
    assert_eq!(
        tokenize("foo\t\r\n bar\nbaz").unwrap(),
        [Token::Word("foo"), Token::Word("bar"), Token::Word("baz"),]
    );
}

#[test]
fn tokenize_2() {
    assert_eq!(
        tokenize(",,,.,.,").unwrap(),
        [
            Token::Comma,
            Token::Comma,
            Token::Comma,
            Token::Dot,
            Token::Comma,
            Token::Dot,
            Token::Comma
        ]
    );
}

#[test]
fn tokenize_whitespace_dots() {
    assert_eq!(
        tokenize("baz . ,bar ").unwrap(),
        [
            Token::Word("baz"),
            Token::Dot,
            Token::Comma,
            Token::Word("bar")
        ]
    );
}

#[test]
fn tokenize_3() {
    assert_eq!(
        tokenize("bar, and -baz").unwrap(),
        [
            Token::Word("bar"),
            Token::Comma,
            Token::Word("and"),
            Token::Word("-baz"),
        ]
    );
}

#[test]
fn tokenize_4() {
    assert_eq!(
        tokenize(", , b").unwrap(),
        [Token::Comma, Token::Comma, Token::Word("b")]
    );
}

#[test]
fn tokenize_5() {
    assert_eq!(tokenize(r#""testing""#).unwrap(), [Token::Quote("testing")]);
}

#[test]
fn tokenize_6() {
    assert_eq!(
        tokenize(r#""testing"#).unwrap_err().position_and_kind(),
        (8, ErrorKind::UnterminatedString)
    );
}

#[test]
fn tokenize_7() {
    assert_eq!(
        tokenize(r#"wordy wordy word"quoteno"#)
            .unwrap_err()
            .position_and_kind(),
        (16, ErrorKind::QuoteInWord)
    );
}

#[test]
fn tokenize_raw_string_prohibit() {
    assert_eq!(
        tokenize(r##"r#""#"##).unwrap_err().position_and_kind(),
        (2, ErrorKind::RawString)
    );
}

#[test]
fn tokenize_raw_string_prohibit_1() {
    assert_eq!(
        tokenize(r##"map_of_arkansas_r#""#"##)
            .unwrap_err()
            .position_and_kind(),
        (18, ErrorKind::QuoteInWord)
    );
}
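
// Illustrative addition, not part of the original suite: a sketch of the
// tokenizer's end-to-end behavior on mixed input, exercising the word,
// quote, and punctuation paths of `next_token` together. The input and the
// test name `tokenize_mixed_example` are our own.
#[test]
fn tokenize_mixed_example() {
    assert_eq!(
        tokenize(r#"say "hi", world"#).unwrap(),
        [
            Token::Word("say"),
            Token::Quote("hi"),
            Token::Comma,
            Token::Word("world")
        ]
    );
}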