token.rs

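//! A small hand-rolled tokenizer that splits input into words, punctuation,
//! double-quoted strings, and end-of-line markers.
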
use crate::error::Error;
use std::fmt;
use std::iter::Peekable;
use std::str::CharIndices;

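/// A single parsed token: punctuation, a line/input boundary, a quoted
/// string body, or a bare word.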
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
pub enum Token<'a> {
    Dot,
    Comma,
    Semi,
    Exclamation,
    Question,
    Colon,
    EndOfLine,
    Quote(&'a str),
    Word(&'a str),
}

impl fmt::Display for Token<'_> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::Dot => write!(f, "."),
            Token::Comma => write!(f, ","),
            Token::Semi => write!(f, ";"),
            Token::Exclamation => write!(f, "!"),
            Token::Question => write!(f, "?"),
            Token::Colon => write!(f, ":"),
            Token::EndOfLine => Ok(()),
            Token::Quote(body) => write!(f, r#""{}""#, body),
            Token::Word(word) => write!(f, "{}", word),
        }
    }
}

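/// A streaming tokenizer over an input string.
///
/// A minimal driving loop, as a sketch (the `tokenize` helper in the tests
/// below does the same thing):
///
/// ```ignore
/// let mut tokenizer = Tokenizer::new("hello, world");
/// while let Some(token) = tokenizer.next_token()? {
///     println!("{}", token);
/// }
/// ```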
#[derive(Clone, Debug)]
pub struct Tokenizer<'a> {
    input: &'a str,
    chars: Peekable<CharIndices<'a>>,
    end_of_input_emitted: bool,
}

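/// The kinds of errors the tokenizer can produce; each is wrapped into a
/// positioned `crate::error::Error` by `Tokenizer::error`.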
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum ErrorKind {
    UnterminatedString,
    QuoteInWord,
    RawString,
}

impl std::error::Error for ErrorKind {}

impl fmt::Display for ErrorKind {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "{}",
            match self {
                ErrorKind::UnterminatedString => "unterminated string",
                ErrorKind::QuoteInWord => "quote in word",
                ErrorKind::RawString => "raw strings are not yet supported",
            }
        )
    }
}

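// Test-only convenience for asserting on an error's position and kind.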
#[cfg(test)]
impl<'a> Error<'a> {
    fn position_and_kind(&self) -> (usize, ErrorKind) {
        (
            self.position,
            *self.source.downcast_ref::<ErrorKind>().unwrap(),
        )
    }
}

impl<'a> Tokenizer<'a> {
    pub fn new(input: &'a str) -> Tokenizer<'a> {
        Tokenizer {
            input,
            chars: input.char_indices().peekable(),
            end_of_input_emitted: false,
        }
    }

    pub fn error<T: 'static + std::error::Error>(&mut self, source: T) -> Error<'a> {
        Error {
            input: self.input,
            position: self.cur_pos(),
            source: Box::new(source),
        }
    }

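    /// Skips whitespace, but not `'\n'`: newlines are significant and become
    /// `Token::EndOfLine`.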
    fn consume_whitespace(&mut self) {
        while self
            .cur()
            .map_or(false, |c| c.1 != '\n' && c.1.is_whitespace())
        {
            self.advance();
        }
    }

    fn cur_punct(&mut self) -> Option<Token<'static>> {
        let (_, ch) = self.cur()?;
        match ch {
            '.' => Some(Token::Dot),
            ',' => Some(Token::Comma),
            ':' => Some(Token::Colon),
            '!' => Some(Token::Exclamation),
            '?' => Some(Token::Question),
            ';' => Some(Token::Semi),
            '\n' => Some(Token::EndOfLine),
            _ => None,
        }
    }

    fn consume_punct(&mut self) -> Option<Token<'a>> {
        let x = self.cur_punct()?;
        self.advance();
        Some(x)
    }

    fn cur(&mut self) -> Option<(usize, char)> {
        self.chars.peek().cloned()
    }

    fn at_end(&mut self) -> bool {
        self.chars.peek().is_none()
    }

    fn advance(&mut self) -> Option<()> {
        let (_, _) = self.chars.next()?;
        Some(())
    }

    fn cur_pos(&mut self) -> usize {
        self.cur().map_or(self.input.len(), |(pos, _)| pos)
    }

    fn str_from(&mut self, pos: usize) -> &'a str {
        &self.input[pos..self.cur_pos()]
    }

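    /// Consumes a double-quoted string if one starts at the current position,
    /// returning its body without the surrounding quotes.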
    fn consume_string(&mut self) -> Result<Option<Token<'a>>, Error<'a>> {
        if let Some((_, '"')) = self.cur() {
            // okay
        } else {
            return Ok(None);
        }
        self.advance(); // eat "
        let start = self.cur_pos();
        loop {
            match self.cur() {
                Some((_, '"')) => break,
                Some(_) => self.advance(),
                None => return Err(self.error(ErrorKind::UnterminatedString)),
            };
        }
        let body = self.str_from(start);
        self.advance(); // eat final '"'
        Ok(Some(Token::Quote(body)))
    }

    pub fn position(&mut self) -> usize {
        self.cur_pos()
    }

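    /// Returns the next token without consuming it, by running `next_token`
    /// on a clone of the tokenizer.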
    pub fn peek_token(&mut self) -> Result<Option<Token<'a>>, Error<'a>> {
        self.clone().next_token()
    }

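    /// Returns the next token, or `Ok(None)` once the input is exhausted.
    /// A final `EndOfLine` is emitted exactly once at end of input.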
    pub fn next_token(&mut self) -> Result<Option<Token<'a>>, Error<'a>> {
        self.consume_whitespace();
        if self.at_end() {
            if self.end_of_input_emitted {
                return Ok(None);
            } else {
                self.end_of_input_emitted = true;
                return Ok(Some(Token::EndOfLine));
            }
        }
        if let Some(punct) = self.consume_punct() {
            return Ok(Some(punct));
        }
        if let Some(s) = self.consume_string()? {
            return Ok(Some(s));
        }
        // Attempt to consume a word from the input.
        // Stop if we encounter whitespace or punctuation.
        let start = self.cur_pos();
        while self.cur().map_or(false, |(_, ch)| {
            !(self.cur_punct().is_some() || ch.is_whitespace())
        }) {
            if self.cur().unwrap().1 == '"' {
                let so_far = self.str_from(start);
                // A quote may not appear inside a word; a word that looks like
                // a raw-string prefix (`r` followed only by `#`/`"`) gets a
                // dedicated error, since raw strings are not supported.
                if so_far.starts_with('r') && so_far.chars().skip(1).all(|v| v == '#' || v == '"') {
                    return Err(self.error(ErrorKind::RawString));
                } else {
                    return Err(self.error(ErrorKind::QuoteInWord));
                }
            }
            self.advance();
        }
        Ok(Some(Token::Word(self.str_from(start))))
    }
}

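// Test helper: drains a tokenizer into a `Vec` of tokens.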
#[cfg(test)]
fn tokenize<'a>(input: &'a str) -> Result<Vec<Token<'a>>, Error<'a>> {
    let mut tokens = Vec::new();
    let mut tokenizer = Tokenizer::new(input);
    while let Some(tok) = tokenizer.next_token()? {
        tokens.push(tok);
    }
    Ok(tokens)
}

#[test]
fn tokenize_1() {
    assert_eq!(
        tokenize("foo\t\r\n\n bar\nbaz\n").unwrap(),
        [
            Token::Word("foo"),
            Token::EndOfLine,
            Token::EndOfLine,
            Token::Word("bar"),
            Token::EndOfLine,
            Token::Word("baz"),
            Token::EndOfLine,
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_2() {
    assert_eq!(
        tokenize(",,,.,.,").unwrap(),
        [
            Token::Comma,
            Token::Comma,
            Token::Comma,
            Token::Dot,
            Token::Comma,
            Token::Dot,
            Token::Comma,
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_whitespace_dots() {
    assert_eq!(
        tokenize("baz . ,bar ").unwrap(),
        [
            Token::Word("baz"),
            Token::Dot,
            Token::Comma,
            Token::Word("bar"),
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_3() {
    assert_eq!(
        tokenize("bar, and -baz").unwrap(),
        [
            Token::Word("bar"),
            Token::Comma,
            Token::Word("and"),
            Token::Word("-baz"),
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_4() {
    assert_eq!(
        tokenize(", , b").unwrap(),
        [
            Token::Comma,
            Token::Comma,
            Token::Word("b"),
            Token::EndOfLine,
        ]
    );
}

#[test]
fn tokenize_5() {
    assert_eq!(
        tokenize(r#""testing""#).unwrap(),
        [Token::Quote("testing"), Token::EndOfLine]
    );
}

#[test]
fn tokenize_6() {
    assert_eq!(
        tokenize(r#""testing"#).unwrap_err().position_and_kind(),
        (8, ErrorKind::UnterminatedString)
    );
}

#[test]
fn tokenize_7() {
    assert_eq!(
        tokenize(r#"wordy wordy word"quoteno"#)
            .unwrap_err()
            .position_and_kind(),
        (16, ErrorKind::QuoteInWord)
    );
}

#[test]
fn tokenize_raw_string_prohibit() {
    assert_eq!(
        tokenize(r##"r#""#"##).unwrap_err().position_and_kind(),
        (2, ErrorKind::RawString)
    );
}

#[test]
fn tokenize_raw_string_prohibit_1() {
    assert_eq!(
        tokenize(r##"map_of_arkansas_r#""#"##)
            .unwrap_err()
            .position_and_kind(),
        (18, ErrorKind::QuoteInWord)
    );
}