
New streaming tokenizer

Mark Rousskov, 6 years ago
parent
commit
2973af0572
4 files changed, 195 insertions and 196 deletions
  1. parser/src/code_block.rs  +0 -0
  2. parser/src/label.rs  +2 -195
  3. parser/src/lib.rs  +3 -1
  4. parser/src/token.rs  +190 -0

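For context, a minimal usage sketch of the streaming tokenizer added in parser/src/token.rs below. The crate name `parser` (taken from the file paths) and the `main` wrapper are assumptions for illustration; the expected token sequence follows directly from the tokenizer's rules, where punctuation becomes its own token and whitespace only separates words.

    use parser::token::{Token, TokenStream};

    fn main() {
        // "+foo" and "-bar" stay single words: '+' and '-' are not
        // punctuation characters, while ':', ',' and '.' are.
        let stream = TokenStream::new("labels: +foo, -bar.");
        assert_eq!(
            stream,
            [
                Token::Word("labels"),
                Token::Colon,
                Token::Word("+foo"),
                Token::Comma,
                Token::Word("-bar"),
                Token::Dot,
            ]
        );
    }
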
+ 0 - 0
parser/src/code_block.rs


+ 2 - 195
parser/src/label.rs

@@ -5,7 +5,8 @@
 //! The grammar is as follows:
 //!
 //! ```text
-//! labels: <label-list>.
+//! Command: `labels: <label-list>.`
+//!
 //! <label-list>:
 //!  - <label-delta>
 //!  - <label-delta> and <label-list>
@@ -76,43 +77,6 @@ impl<'a> LabelDelta<'a> {
     }
 }
 
-#[derive(Debug, PartialEq, Eq, Copy, Clone)]
-pub enum Token<'a> {
-    Labels,
-    Comma,
-    Dot,
-    And,
-    Word(&'a str),
-}
-
-impl<'a> Token<'a> {
-    fn divide(self, split: char, tok: Token<'a>) -> Vec<Token<'a>> {
-        let word = if let Token::Word(word) = self {
-            word
-        } else {
-            return vec![self];
-        };
-        if !word.contains(split) {
-            return vec![self];
-        }
-        let mut toks = word
-            .split(split)
-            .flat_map(|w| vec![Token::Word(w), tok])
-            .collect::<Vec<_>>();
-        // strip last token that we inserted; it's not actually one we need/want.
-        assert_eq!(toks.pop(), Some(tok));
-        if word.ends_with(split) {
-            // strip empty string
-            assert_eq!(toks.pop(), Some(Token::Word("")));
-        }
-        if word.starts_with(split) {
-            // strip empty string
-            assert_eq!(toks.remove(0), Token::Word(""));
-        }
-        toks
-    }
-}
-
 #[derive(PartialEq, Eq, Debug)]
 pub enum ParseError<'a> {
     EmptyLabel,
@@ -123,87 +87,6 @@ pub enum ParseError<'a> {
     },
 }
 
-#[derive(Debug)]
-struct TokenStream<'a> {
-    tokens: Vec<Token<'a>>,
-    position: usize,
-}
-
-impl<'a> TokenStream<'a> {
-    fn new(input: &'a str) -> TokenStream<'a> {
-        let tokens = input
-            .split_whitespace()
-            .map(|word| Token::Word(word))
-            .flat_map(|tok| tok.divide(',', Token::Comma))
-            .flat_map(|tok| tok.divide('.', Token::Dot))
-            .map(|tok| {
-                if let Token::Word("and") = tok {
-                    Token::And
-                } else {
-                    tok
-                }
-            })
-            .flat_map(|tok| {
-                if let Token::Word(word) = tok {
-                    let split = "labels:";
-                    if word.starts_with(split) {
-                        if word == split {
-                            vec![Token::Labels]
-                        } else {
-                            vec![Token::Labels, Token::Word(&word[split.len()..])]
-                        }
-                    } else {
-                        vec![tok]
-                    }
-                } else {
-                    vec![tok]
-                }
-            })
-            .collect();
-        TokenStream {
-            tokens,
-            position: 0,
-        }
-    }
-
-    fn current(&self) -> Option<Token<'a>> {
-        self.tokens.get(self.position).cloned()
-    }
-
-    fn advance(&mut self) -> Result<(), ParseError<'a>> {
-        self.position += 1;
-        if self.position > self.tokens.len() {
-            return Err(ParseError::UnexpectedEnd);
-        }
-        Ok(())
-    }
-
-    fn eat(&mut self, tok: Token<'a>, expect: &'static str) -> Result<(), ParseError<'a>> {
-        if self.current() == Some(tok) {
-            self.advance()?;
-            return Ok(());
-        }
-
-        Err(ParseError::Unexpected {
-            found: self.current(),
-            expected: expect,
-        })
-    }
-
-    fn at_end(&self) -> bool {
-        self.position == self.tokens.len()
-    }
-}
-
-impl<'a, T> PartialEq<T> for TokenStream<'a>
-where
-    T: ?Sized + PartialEq<[Token<'a>]>,
-{
-    fn eq(&self, other: &T) -> bool {
-        other == &self.tokens[self.position..]
-    }
-}
-
 fn parse_command<'a>(input: &mut TokenStream<'a>) -> Result<Vec<LabelDelta<'a>>, ParseError<'a>> {
     input.eat(Token::Labels, "labels command start")?;
 
@@ -245,71 +128,6 @@ pub fn parse<'a>(input: &'a str) -> Result<Vec<LabelDelta<'a>>, ParseError<'a>>
     Ok(labels)
 }
 
-#[test]
-fn tokenize_1() {
-    assert_eq!(
-        TokenStream::new("foo\t\r\n bar\nbaz"),
-        [Token::Word("foo"), Token::Word("bar"), Token::Word("baz"),]
-    );
-}
-
-#[test]
-fn tokenize_2() {
-    assert_eq!(
-        TokenStream::new(",.,.,"),
-        [
-            Token::Comma,
-            Token::Dot,
-            Token::Comma,
-            Token::Dot,
-            Token::Comma
-        ]
-    );
-}
-
-#[test]
-fn tokenize_whitespace_dots() {
-    assert_eq!(
-        TokenStream::new("baz . ,bar "),
-        [
-            Token::Word("baz"),
-            Token::Dot,
-            Token::Comma,
-            Token::Word("bar")
-        ]
-    );
-}
-
-#[test]
-fn tokenize_3() {
-    assert_eq!(
-        TokenStream::new("bar, and -baz"),
-        [
-            Token::Word("bar"),
-            Token::Comma,
-            Token::And,
-            Token::Word("-baz"),
-        ]
-    );
-}
-
-#[test]
-fn tokenize_labels() {
-    assert_eq!(TokenStream::new("labels:"), [Token::Labels]);
-    assert_eq!(
-        TokenStream::new("foo labels:"),
-        [Token::Word("foo"), Token::Labels]
-    );
-    assert_eq!(
-        TokenStream::new("labels:T-compiler"),
-        [Token::Labels, Token::Word("T-compiler")]
-    );
-    assert_eq!(
-        TokenStream::new("barlabels:T-compiler"),
-        [Token::Word("barlabels:T-compiler")]
-    );
-}
-
 #[test]
 fn parse_simple() {
     assert_eq!(
@@ -350,17 +168,6 @@ fn parse_no_label_paragraph() {
     );
 }
 
-#[test]
-fn parse_nested_labels() {
-    assert_eq!(
-        parse("labels: +foo, bar, labels: oh no.."),
-        Err(ParseError::Unexpected {
-            found: Some(Token::Labels),
-            expected: "label delta"
-        }),
-    );
-}
-
 #[test]
 fn parse_multi_label() {
     let para = "

+ 3 - 1
parser/src/lib.rs

@@ -1 +1,3 @@
-pub mod label;
+//pub mod label;
+pub mod code_block;
+pub mod token;

+ 190 - 0
parser/src/token.rs

@@ -0,0 +1,190 @@
+use std::iter::Peekable;
+use std::str::CharIndices;
+
+#[derive(Debug, PartialEq, Eq, Copy, Clone)]
+pub enum Token<'a> {
+    Dot,
+    Comma,
+    Semi,
+    Exclamation,
+    Question,
+    Colon,
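+    // Reserved for quoted strings; not yet emitted by the tokenizer.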
+    Quote(&'a str),
+    Word(&'a str),
+}
+
+#[derive(Debug)]
+pub struct TokenStream<'a> {
+    tokens: Vec<Token<'a>>,
+    position: usize,
+}
+
+#[derive(Debug)]
+pub struct Tokenizer<'a> {
+    input: &'a str,
+    chars: Peekable<CharIndices<'a>>,
+}
+
+impl<'a> Tokenizer<'a> {
+    fn new(input: &'a str) -> Tokenizer<'a> {
+        Tokenizer {
+            input,
+            chars: input.char_indices().peekable(),
+        }
+    }
+
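+    // Skip whitespace so the next token starts at a non-space character.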
+    fn consume_whitespace(&mut self) {
+        while self.cur().map_or(false, |c| c.1.is_whitespace()) {
+            self.advance();
+        }
+    }
+
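+    // Map the current character to a punctuation token, without consuming it.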
+    fn cur_punct(&mut self) -> Option<Token<'static>> {
+        let (_, ch) = self.cur()?;
+        match ch {
+            '.' => Some(Token::Dot),
+            ',' => Some(Token::Comma),
+            ':' => Some(Token::Colon),
+            '!' => Some(Token::Exclamation),
+            '?' => Some(Token::Question),
+            ';' => Some(Token::Semi),
+            _ => None,
+        }
+    }
+
+    fn consume_punct(&mut self) -> Option<Token<'a>> {
+        let x = self.cur_punct()?;
+        self.advance();
+        Some(x)
+    }
+
+    fn cur(&mut self) -> Option<(usize, char)> {
+        self.chars.peek().cloned()
+    }
+
+    fn at_end(&mut self) -> bool {
+        self.chars.peek().is_none()
+    }
+
+    fn advance(&mut self) -> Option<()> {
+        let (_, _) = self.chars.next()?;
+        Some(())
+    }
+
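+    // Byte offset of the current character, or the input length at end of input.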
+    fn cur_pos(&mut self) -> usize {
+        self.cur().map_or(self.input.len(), |(pos, _)| pos)
+    }
+
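+    // Slice of the input from `pos` up to (not including) the current position.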
+    fn str_from(&mut self, pos: usize) -> &'a str {
+        &self.input[pos..self.cur_pos()]
+    }
+
+    fn next_token(&mut self) -> Option<Token<'a>> {
+        self.consume_whitespace();
+        if self.at_end() {
+            return None;
+        }
+        if let Some(punct) = self.consume_punct() {
+            return Some(punct);
+        }
+
+        // Attempt to consume a word from the input.
+        // Stop if we encounter whitespace or punctuation.
+        let start = self.cur_pos();
+        while self.cur().map_or(false, |(_, ch)| {
+            !(self.cur_punct().is_some() || ch.is_whitespace())
+        }) {
+            self.advance();
+        }
+        Some(Token::Word(self.str_from(start)))
+    }
+}
+
+impl<'a> TokenStream<'a> {
+    pub fn new(input: &'a str) -> TokenStream<'a> {
+        let mut tokens = Vec::new();
+        let mut gen = Tokenizer::new(input);
+        while let Some(tok) = gen.next_token() {
+            tokens.push(tok);
+        }
+        TokenStream {
+            tokens,
+            position: 0,
+        }
+    }
+
+    pub fn current(&self) -> Option<Token<'a>> {
+        self.tokens.get(self.position).cloned()
+    }
+
+    pub fn at_end(&self) -> bool {
+        self.position == self.tokens.len()
+    }
+}
+
+impl<'a, T> PartialEq<T> for TokenStream<'a>
+where
+    T: ?Sized + PartialEq<[Token<'a>]>,
+{
+    fn eq(&self, other: &T) -> bool {
+        other == &self.tokens[self.position..]
+    }
+}
+
+#[test]
+fn tokenize_1() {
+    assert_eq!(
+        TokenStream::new("foo\t\r\n bar\nbaz"),
+        [Token::Word("foo"), Token::Word("bar"), Token::Word("baz"),]
+    );
+}
+
+#[test]
+fn tokenize_2() {
+    assert_eq!(
+        TokenStream::new(",,,.,.,"),
+        [
+            Token::Comma,
+            Token::Comma,
+            Token::Comma,
+            Token::Dot,
+            Token::Comma,
+            Token::Dot,
+            Token::Comma
+        ]
+    );
+}
+
+#[test]
+fn tokenize_whitespace_dots() {
+    assert_eq!(
+        TokenStream::new("baz . ,bar "),
+        [
+            Token::Word("baz"),
+            Token::Dot,
+            Token::Comma,
+            Token::Word("bar")
+        ]
+    );
+}
+
+#[test]
+fn tokenize_3() {
+    assert_eq!(
+        TokenStream::new("bar, and -baz"),
+        [
+            Token::Word("bar"),
+            Token::Comma,
+            Token::Word("and"),
+            Token::Word("-baz"),
+        ]
+    );
+}
+
+#[test]
+fn tokenize_4() {
+    assert_eq!(
+        TokenStream::new(", , b"),
+        [Token::Comma, Token::Comma, Token::Word("b")]
+    );
+}