// token.rs

use crate::error::Error;
use std::fmt;
use std::iter::Peekable;
use std::str::CharIndices;
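
/// A single lexical token. `Quote` and `Word` borrow their text from the
/// original input string.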
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
pub enum Token<'a> {
    Dot,
    Comma,
    Semi,
    Exclamation,
    Question,
    Colon,
    EndOfLine,
    ParenLeft,
    ParenRight,
    Quote(&'a str),
    Word(&'a str),
}

impl fmt::Display for Token<'_> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::Dot => write!(f, "."),
            Token::Comma => write!(f, ","),
            Token::Semi => write!(f, ";"),
            Token::Exclamation => write!(f, "!"),
            Token::Question => write!(f, "?"),
            Token::Colon => write!(f, ":"),
            Token::ParenRight => write!(f, ")"),
            Token::ParenLeft => write!(f, "("),
            Token::EndOfLine => Ok(()),
            Token::Quote(body) => write!(f, r#""{}""#, body),
            Token::Word(word) => write!(f, "{}", word),
        }
    }
}

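/// Streaming tokenizer over a borrowed input string. Cloning is cheap (a
/// reference plus a char iterator), which is what `peek_token` relies on.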
#[derive(Clone, Debug)]
pub struct Tokenizer<'a> {
    input: &'a str,
    chars: Peekable<CharIndices<'a>>,
    end_of_input_emitted: bool,
}

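/// The ways tokenization can fail; boxed into [`Error`] by `Tokenizer::error`.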
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum ErrorKind {
    UnterminatedString,
    QuoteInWord,
    RawString,
}

impl std::error::Error for ErrorKind {}

impl fmt::Display for ErrorKind {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "{}",
            match self {
                ErrorKind::UnterminatedString => "unterminated string",
                ErrorKind::QuoteInWord => "quote in word",
                ErrorKind::RawString => "raw strings are not yet supported",
            }
        )
    }
}

#[cfg(test)]
impl<'a> Error<'a> {
    fn position_and_kind(&self) -> (usize, ErrorKind) {
        (
            self.position,
            *self.source.downcast_ref::<ErrorKind>().unwrap(),
        )
    }
}

impl<'a> Tokenizer<'a> {
    pub fn new(input: &'a str) -> Tokenizer<'a> {
        Tokenizer {
            input,
            chars: input.char_indices().peekable(),
            end_of_input_emitted: false,
        }
    }

    pub fn error<T: 'static + std::error::Error + Send>(&mut self, source: T) -> Error<'a> {
        Error {
            input: self.input,
            position: self.cur_pos(),
            source: Box::new(source),
        }
    }

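    // Skip whitespace except newlines: a newline is significant and is emitted
    // as `Token::EndOfLine` rather than consumed here.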
    fn consume_whitespace(&mut self) {
        while self
            .cur()
            .map_or(false, |c| c.1 != '\n' && c.1.is_whitespace())
        {
            self.advance();
        }
    }

    fn cur_punct(&mut self) -> Option<Token<'static>> {
        let (_, ch) = self.cur()?;
        match ch {
            '.' => Some(Token::Dot),
            ',' => Some(Token::Comma),
            ':' => Some(Token::Colon),
            '!' => Some(Token::Exclamation),
            '?' => Some(Token::Question),
            ';' => Some(Token::Semi),
            '\n' => Some(Token::EndOfLine),
            ')' => Some(Token::ParenRight),
            '(' => Some(Token::ParenLeft),
            _ => None,
        }
    }

    fn consume_punct(&mut self) -> Option<Token<'a>> {
        let token = self.cur_punct()?;
        self.advance();
        Some(token)
    }

    fn cur(&mut self) -> Option<(usize, char)> {
        self.chars.peek().cloned()
    }

    fn at_end(&mut self) -> bool {
        self.chars.peek().is_none()
    }

    fn advance(&mut self) -> Option<()> {
        self.chars.next()?;
        Some(())
    }

    fn cur_pos(&mut self) -> usize {
        self.cur().map_or(self.input.len(), |(pos, _)| pos)
    }

    fn str_from(&mut self, pos: usize) -> &'a str {
        &self.input[pos..self.cur_pos()]
    }

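    // Consume a double-quoted string literal, if one starts at the current
    // position. The returned `Token::Quote` borrows the body between the
    // quotes; hitting end of input first is an `UnterminatedString` error.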
    fn consume_string(&mut self) -> Result<Option<Token<'a>>, Error<'a>> {
        if let Some((_, '"')) = self.cur() {
            // okay
        } else {
            return Ok(None);
        }
        self.advance(); // eat "
        let start = self.cur_pos();
        loop {
            match self.cur() {
                Some((_, '"')) => break,
                Some(_) => self.advance(),
                None => return Err(self.error(ErrorKind::UnterminatedString)),
            };
        }
        let body = self.str_from(start);
        self.advance(); // eat final '"'
        Ok(Some(Token::Quote(body)))
    }

    pub fn position(&mut self) -> usize {
        self.cur_pos()
    }

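    // Look ahead one token without consuming input, by advancing a clone of
    // the tokenizer instead of `self`.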
    pub fn peek_token(&mut self) -> Result<Option<Token<'a>>, Error<'a>> {
        self.clone().next_token()
    }

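    // Produce the next token. Precedence: a single synthetic `EndOfLine` at
    // end of input, then punctuation, then quoted strings, then bare words.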
    pub fn next_token(&mut self) -> Result<Option<Token<'a>>, Error<'a>> {
        self.consume_whitespace();
        if self.at_end() {
            if self.end_of_input_emitted {
                return Ok(None);
            } else {
                self.end_of_input_emitted = true;
                return Ok(Some(Token::EndOfLine));
            }
        }
        if let Some(punct) = self.consume_punct() {
            return Ok(Some(punct));
        }
        if let Some(s) = self.consume_string()? {
            return Ok(Some(s));
        }
        // Attempt to consume a word from the input.
        // Stop if we encounter whitespace or punctuation.
        let start = self.cur_pos();
        while self.cur().map_or(false, |(_, ch)| {
            !(self.cur_punct().is_some() || ch.is_whitespace())
        }) {
            if self.cur().unwrap().1 == '"' {
                let so_far = self.str_from(start);
                if so_far.starts_with('r') && so_far.chars().skip(1).all(|v| v == '#' || v == '"') {
                    return Err(self.error(ErrorKind::RawString));
                } else {
                    return Err(self.error(ErrorKind::QuoteInWord));
                }
            }
            self.advance();
        }
        Ok(Some(Token::Word(self.str_from(start))))
    }
}

#[cfg(test)]
fn tokenize<'a>(input: &'a str) -> Result<Vec<Token<'a>>, Error<'a>> {
    let mut tokens = Vec::new();
    let mut tokenizer = Tokenizer::new(input);
    while let Some(tok) = tokenizer.next_token()? {
        tokens.push(tok);
    }
    Ok(tokens)
}

#[test]
fn tokenize_1() {
    assert_eq!(
        tokenize("foo\t\r\n\n bar\nbaz\n").unwrap(),
        [
            Token::Word("foo"),
            Token::EndOfLine,
            Token::EndOfLine,
            Token::Word("bar"),
            Token::EndOfLine,
            Token::Word("baz"),
            Token::EndOfLine,
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_2() {
    assert_eq!(
        tokenize(",,,.,.,").unwrap(),
        [
            Token::Comma,
            Token::Comma,
            Token::Comma,
            Token::Dot,
            Token::Comma,
            Token::Dot,
            Token::Comma,
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_whitespace_dots() {
    assert_eq!(
        tokenize("baz . ,bar ").unwrap(),
        [
            Token::Word("baz"),
            Token::Dot,
            Token::Comma,
            Token::Word("bar"),
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_3() {
    assert_eq!(
        tokenize("bar, and -baz").unwrap(),
        [
            Token::Word("bar"),
            Token::Comma,
            Token::Word("and"),
            Token::Word("-baz"),
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_4() {
    assert_eq!(
        tokenize(", , b").unwrap(),
        [
            Token::Comma,
            Token::Comma,
            Token::Word("b"),
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_5() {
    assert_eq!(
        tokenize(r#""testing""#).unwrap(),
        [Token::Quote("testing"), Token::EndOfLine]
    );
}

#[test]
fn tokenize_6() {
    assert_eq!(
        tokenize(r#""testing"#).unwrap_err().position_and_kind(),
        (8, ErrorKind::UnterminatedString)
    );
}

#[test]
fn tokenize_7() {
    assert_eq!(
        tokenize(r#"wordy wordy word"quoteno"#)
            .unwrap_err()
            .position_and_kind(),
        (16, ErrorKind::QuoteInWord)
    );
}

#[test]
fn tokenize_raw_string_prohibit() {
    assert_eq!(
        tokenize(r##"r#""#"##).unwrap_err().position_and_kind(),
        (2, ErrorKind::RawString)
    );
}

#[test]
fn tokenize_raw_string_prohibit_1() {
    assert_eq!(
        tokenize(r##"map_of_arkansas_r#""#"##)
            .unwrap_err()
            .position_and_kind(),
        (18, ErrorKind::QuoteInWord)
    );
}
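
// An additional check of `peek_token`'s contract (a sketch; the test name and
// inputs are illustrative): peeking advances a clone rather than `self`, so
// the following `next_token` call must return the same token.
#[test]
fn peek_token_does_not_advance() {
    let mut tokenizer = Tokenizer::new("foo bar");
    assert_eq!(tokenizer.peek_token().unwrap(), Some(Token::Word("foo")));
    assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Word("foo")));
    assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Word("bar")));
}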