token.rs

use crate::error::Error;
use std::fmt;
use std::iter::Peekable;
use std::str::CharIndices;
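
/// A single token of input. `Quote` and `Word` borrow their text from
/// the original input string.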
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
pub enum Token<'a> {
    Dot,
    Comma,
    Semi,
    Exclamation,
    Question,
    Colon,
    EndOfLine,
    ParenLeft,
    ParenRight,
    Quote(&'a str),
    Word(&'a str),
}

impl fmt::Display for Token<'_> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::Dot => write!(f, "."),
            Token::Comma => write!(f, ","),
            Token::Semi => write!(f, ";"),
            Token::Exclamation => write!(f, "!"),
            Token::Question => write!(f, "?"),
            Token::Colon => write!(f, ":"),
            Token::ParenRight => write!(f, ")"),
            Token::ParenLeft => write!(f, "("),
            // `EndOfLine` is synthetic and has no textual representation.
            Token::EndOfLine => Ok(()),
            Token::Quote(body) => write!(f, r#""{}""#, body),
            Token::Word(word) => write!(f, "{}", word),
        }
    }
}
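
/// A cursor over the input that hands out `Token`s one at a time.
/// Cloning is cheap, which is what `peek_token` relies on.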
#[derive(Clone, Debug)]
pub struct Tokenizer<'a> {
    input: &'a str,
    chars: Peekable<CharIndices<'a>>,
    end_of_input_emitted: bool,
}
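
/// The ways tokenization itself can fail; wrapped into a positioned
/// `Error` via `Tokenizer::error`.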
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum ErrorKind {
    UnterminatedString,
    QuoteInWord,
    RawString,
}

impl std::error::Error for ErrorKind {}

impl fmt::Display for ErrorKind {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "{}",
            match self {
                ErrorKind::UnterminatedString => "unterminated string",
                ErrorKind::QuoteInWord => "quote in word",
                ErrorKind::RawString => "raw strings are not yet supported",
            }
        )
    }
}
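
// Test-only helper for asserting on both where an error occurred and
// which `ErrorKind` produced it.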
#[cfg(test)]
impl<'a> Error<'a> {
    fn position_and_kind(&self) -> (usize, ErrorKind) {
        (
            self.position,
            *self.source.downcast_ref::<ErrorKind>().unwrap(),
        )
    }
}

impl<'a> Tokenizer<'a> {
    pub fn new(input: &'a str) -> Tokenizer<'a> {
        Tokenizer {
            input,
            chars: input.char_indices().peekable(),
            end_of_input_emitted: false,
        }
    }
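
    /// Wraps `source` in an `Error` that records the current byte
    /// position within the input.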
    pub fn error<T: 'static + std::error::Error + Send>(&mut self, source: T) -> Error<'a> {
        Error {
            input: self.input,
            position: self.cur_pos(),
            source: Box::new(source),
        }
    }
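
    /// Skips whitespace, but not newlines: `\n` is significant and is
    /// tokenized as `EndOfLine`.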
    fn consume_whitespace(&mut self) {
        while self
            .cur()
            .map_or(false, |c| c.1 != '\n' && c.1.is_whitespace())
        {
            self.advance();
        }
    }
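
    /// Returns the punctuation token at the cursor, if any, without
    /// consuming it.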
    fn cur_punct(&mut self) -> Option<Token<'static>> {
        let (_, ch) = self.cur()?;
        match ch {
            '.' => Some(Token::Dot),
            ',' => Some(Token::Comma),
            ':' => Some(Token::Colon),
            '!' => Some(Token::Exclamation),
            '?' => Some(Token::Question),
            ';' => Some(Token::Semi),
            '\n' => Some(Token::EndOfLine),
            ')' => Some(Token::ParenRight),
            '(' => Some(Token::ParenLeft),
            _ => None,
        }
    }
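
    /// Like `cur_punct`, but advances past the punctuation on a match.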
    fn consume_punct(&mut self) -> Option<Token<'a>> {
        let x = self.cur_punct()?;
        self.advance();
        Some(x)
    }

    fn cur(&mut self) -> Option<(usize, char)> {
        self.chars.peek().cloned()
    }

    fn at_end(&mut self) -> bool {
        self.chars.peek().is_none()
    }

    fn advance(&mut self) -> Option<()> {
        let (_, _) = self.chars.next()?;
        Some(())
    }

    // Byte position of the cursor; equal to the input length at the end.
    fn cur_pos(&mut self) -> usize {
        self.cur().map_or(self.input.len(), |(pos, _)| pos)
    }

    // Slice of the input from `pos` up to (but excluding) the cursor.
    fn str_from(&mut self, pos: usize) -> &'a str {
        &self.input[pos..self.cur_pos()]
    }
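
    /// Consumes a double-quoted string and returns its body (without
    /// the quotes). Returns `Ok(None)` if the cursor is not on `"`.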
    fn consume_string(&mut self) -> Result<Option<Token<'a>>, Error<'a>> {
        if let Some((_, '"')) = self.cur() {
            // okay
        } else {
            return Ok(None);
        }
        self.advance(); // eat "
        let start = self.cur_pos();
        loop {
            match self.cur() {
                Some((_, '"')) => break,
                Some(_) => self.advance(),
                None => return Err(self.error(ErrorKind::UnterminatedString)),
            };
        }
        let body = self.str_from(start);
        self.advance(); // eat final '"'
        Ok(Some(Token::Quote(body)))
    }

    pub fn position(&mut self) -> usize {
        self.cur_pos()
    }
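
    /// Looks at the next token without consuming it, by running
    /// `next_token` on a clone of the tokenizer.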
    pub fn peek_token(&mut self) -> Result<Option<Token<'a>>, Error<'a>> {
        self.clone().next_token()
    }
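
    /// Returns the next token, or `Ok(None)` once the input is
    /// exhausted. A final `EndOfLine` is emitted exactly once at the
    /// end of input before `None`.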
    pub fn next_token(&mut self) -> Result<Option<Token<'a>>, Error<'a>> {
        self.consume_whitespace();
        if self.at_end() {
            if self.end_of_input_emitted {
                return Ok(None);
            } else {
                self.end_of_input_emitted = true;
                return Ok(Some(Token::EndOfLine));
            }
        }
        if let Some(punct) = self.consume_punct() {
            return Ok(Some(punct));
        }
        if let Some(s) = self.consume_string()? {
            return Ok(Some(s));
        }
        // Attempt to consume a word from the input.
        // Stop if we encounter whitespace or punctuation.
        let start = self.cur_pos();
        while self.cur().map_or(false, |(_, ch)| {
            !(self.cur_punct().is_some() || ch.is_whitespace())
        }) {
            if self.cur().unwrap().1 == '"' {
                let so_far = self.str_from(start);
                // A quote in the middle of a word is either the start of a
                // raw string literal (e.g. `r#"..."#`) or a stray quote;
                // both are rejected at the quote's position.
                if so_far.starts_with('r')
                    && so_far.chars().skip(1).all(|v| v == '#' || v == '"')
                {
                    return Err(self.error(ErrorKind::RawString));
                } else {
                    return Err(self.error(ErrorKind::QuoteInWord));
                }
            }
            self.advance();
        }
        Ok(Some(Token::Word(self.str_from(start))))
    }
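
    /// Consumes `token` if it is next; returns whether it did so.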
    pub fn eat_token(&mut self, token: Token<'a>) -> Result<bool, Error<'a>> {
        match self.peek_token()? {
            Some(next_tok) if next_tok == token => {
                self.next_token()?;
                Ok(true)
            }
            _ => Ok(false),
        }
    }
}
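
// Test helper: drains the tokenizer into a `Vec`, stopping at the
// first error.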
#[cfg(test)]
fn tokenize<'a>(input: &'a str) -> Result<Vec<Token<'a>>, Error<'a>> {
    let mut tokens = Vec::new();
    let mut gen = Tokenizer::new(input);
    while let Some(tok) = gen.next_token()? {
        tokens.push(tok);
    }
    Ok(tokens)
}

#[test]
fn tokenize_1() {
    assert_eq!(
        tokenize("foo\t\r\n\n bar\nbaz\n").unwrap(),
        [
            Token::Word("foo"),
            Token::EndOfLine,
            Token::EndOfLine,
            Token::Word("bar"),
            Token::EndOfLine,
            Token::Word("baz"),
            Token::EndOfLine,
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_2() {
    assert_eq!(
        tokenize(",,,.,.,").unwrap(),
        [
            Token::Comma,
            Token::Comma,
            Token::Comma,
            Token::Dot,
            Token::Comma,
            Token::Dot,
            Token::Comma,
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_whitespace_dots() {
    assert_eq!(
        tokenize("baz . ,bar ").unwrap(),
        [
            Token::Word("baz"),
            Token::Dot,
            Token::Comma,
            Token::Word("bar"),
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_3() {
    assert_eq!(
        tokenize("bar, and -baz").unwrap(),
        [
            Token::Word("bar"),
            Token::Comma,
            Token::Word("and"),
            Token::Word("-baz"),
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_4() {
    assert_eq!(
        tokenize(", , b").unwrap(),
        [
            Token::Comma,
            Token::Comma,
            Token::Word("b"),
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_5() {
    assert_eq!(
        tokenize(r#""testing""#).unwrap(),
        [Token::Quote("testing"), Token::EndOfLine]
    );
}

#[test]
fn tokenize_6() {
    assert_eq!(
        tokenize(r#""testing"#).unwrap_err().position_and_kind(),
        (8, ErrorKind::UnterminatedString)
    );
}

#[test]
fn tokenize_7() {
    assert_eq!(
        tokenize(r#"wordy wordy word"quoteno"#)
            .unwrap_err()
            .position_and_kind(),
        (16, ErrorKind::QuoteInWord)
    );
}

#[test]
fn tokenize_raw_string_prohibit() {
    assert_eq!(
        tokenize(r##"r#""#"##).unwrap_err().position_and_kind(),
        (2, ErrorKind::RawString)
    );
}

#[test]
fn tokenize_raw_string_prohibit_1() {
    assert_eq!(
        tokenize(r##"map_of_arkansas_r#""#"##)
            .unwrap_err()
            .position_and_kind(),
        (18, ErrorKind::QuoteInWord)
    );
}