use std::{fmt::Debug, str::Chars};

/// Characters the lexer accepts as decimal digits.
static DECIMAL_NUMERICS: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];

/// A single lexical token produced by [`tokenize`].
///
/// The variant order is significant: the derived `PartialOrd`/`Ord` compare
/// variants in declaration order.
#[derive(Eq, PartialEq, Clone, PartialOrd, Ord)]
pub enum Token {
    // Values
    /// A word that is not one of the keywords below.
    Identifier(String),
    /// A run of decimal digits. The lexer never consumes a `.` into this
    /// token; a decimal point is emitted as a separate [`Token::Dot`].
    DecimalValue(String),
    /// Some character literal that was surrounded by 'single-quotes'.
    CharLit(String),
    /// Some string literal that was surrounded by "double-quotes".
    StringLit(String),

    // Keywords
    /// `let`
    LetKeyword,
    /// `mut`
    MutKeyword,
    /// `import`
    ImportKeyword,
    /// `return`
    ReturnKeyword,
    /// `fn`
    FnKeyword,
    /// `pub`
    PubKeyword,
    /// `as`
    AsKeyword,
    /// `->`
    Arrow,
    /// `if`
    If,
    /// `else`
    Else,
    /// `true`
    True,
    /// `false`
    False,
    /// `extern`
    Extern,
    /// `struct`
    Struct,
    /// `while`
    While,
    /// `for`
    For,
    /// `in`
    In,

    // Symbols
    /// `;`
    Semi,
    /// `=`
    Equals,
    /// `:`
    Colon,
    /// `+`
    Plus,
    /// `*`
    Star,
    /// `-`
    Minus,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `>`
    GreaterThan,
    /// `<`
    LessThan,
    /// `&`
    Et,
    /// `!`
    Exclamation,
    /// `(`
    ParenOpen,
    /// `)`
    ParenClose,
    /// `{`
    BraceOpen,
    /// `}`
    BraceClose,
    /// `[`
    BracketOpen,
    /// `]`
    BracketClose,
    /// `,`
    Comma,
    /// `.`
    Dot,

    /// Any character the lexer does not recognise.
    Unknown(char),
    /// End of input; always the last token returned by [`tokenize`].
    Eof,
}

impl Token {
    /// Binary-operator precedence of this token.
    ///
    /// Returns `10` for `+` and `-`, `20` for `*`, and `-1` for every other
    /// token.
    pub fn get_token_prec(&self) -> i8 {
        match self {
            Token::Plus | Token::Minus => 10,
            Token::Star => 20,
            _ => -1,
        }
    }

    /// Length in bytes of the token's textual form (see the `Display` impl).
    pub fn len(&self) -> usize {
        self.to_string().len()
    }
}

impl From<Token> for String {
    /// Converts the token using its `Debug` representation.
    fn from(value: Token) -> Self {
        format!("{:?}", value)
    }
}

/// Renders the token as source-like text. Literal tokens are re-quoted with
/// their (already unescaped) contents and `Eof` renders as the empty string.
///
/// `Token::to_string()` is provided through the blanket `ToString` impl.
impl std::fmt::Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text = match self {
            Token::Identifier(ident) => return f.write_str(ident),
            Token::DecimalValue(val) => return f.write_str(val),
            Token::CharLit(lit) => return write!(f, "'{}'", lit),
            Token::StringLit(lit) => return write!(f, "\"{}\"", lit),
            Token::Unknown(val) => return write!(f, "{}", val),
            Token::LetKeyword => "let",
            Token::MutKeyword => "mut",
            Token::ImportKeyword => "import",
            Token::ReturnKeyword => "return",
            Token::FnKeyword => "fn",
            Token::PubKeyword => "pub",
            Token::AsKeyword => "as",
            // Must match what the lexer consumes for this token (`-` then `>`).
            Token::Arrow => "->",
            Token::If => "if",
            Token::Else => "else",
            Token::True => "true",
            Token::False => "false",
            Token::Extern => "extern",
            Token::Struct => "struct",
            Token::While => "while",
            Token::For => "for",
            Token::In => "in",
            Token::Semi => ";",
            Token::Equals => "=",
            Token::Colon => ":",
            Token::Plus => "+",
            Token::Star => "*",
            Token::Minus => "-",
            Token::Slash => "/",
            Token::Percent => "%",
            Token::GreaterThan => ">",
            Token::LessThan => "<",
            Token::Et => "&",
            Token::Exclamation => "!",
            Token::ParenOpen => "(",
            Token::ParenClose => ")",
            Token::BraceOpen => "{",
            Token::BraceClose => "}",
            Token::BracketOpen => "[",
            Token::BracketClose => "]",
            Token::Comma => ",",
            Token::Dot => ".",
            Token::Eof => "",
        };
        f.write_str(text)
    }
}

/// `Debug` prints the same text as `Display`, except for unknown characters
/// which are wrapped as `Unknown('c')`.
impl std::fmt::Debug for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Unknown(value) => write!(f, "Unknown('{}')", value),
            _ => write!(f, "{}", self),
        }
    }
}

/// A token with a position
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct FullToken {
    pub token: Token,
    pub position: Position,
}

impl Debug for FullToken {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "Token({:?}) (Ln {}, Col {})",
            self.token, self.position.1, self.position.0
        )
    }
}

/// (Column, Line)
///
/// Positions are ordered by line first and column second.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub struct Position(pub u32, pub u32);

impl Position {
    /// Returns this position moved `num` columns to the right.
    pub fn add(&self, num: u32) -> Position {
        Position(self.0 + num, self.1)
    }

    /// Returns this position moved `num` columns to the left, stopping at
    /// column 0 instead of underflowing.
    pub fn sub(&self, num: u32) -> Position {
        Position(self.0.saturating_sub(num), self.1)
    }
}

impl Ord for Position {
    /// Line-major comparison: the line (`.1`) decides, the column (`.0`)
    /// breaks ties.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.1.cmp(&other.1).then_with(|| self.0.cmp(&other.0))
    }
}

impl PartialOrd for Position {
    // Delegates to `Ord` so that both orderings always agree.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

/// Character stream over the source text that tracks the current position.
pub struct Cursor<'a> {
    pub position: Position,
    pub char_stream: Chars<'a>,
}

impl<'a> Cursor<'a> {
    /// Consumes and returns the next character, updating `position`.
    ///
    /// A consumed `'\n'` bumps the line and resets the column to 0; after
    /// that the column is incremented by one on every call, including calls
    /// that return `None`.
    pub fn next(&mut self) -> Option<char> {
        let next = self.char_stream.next();
        if let Some('\n') = next {
            self.position.1 += 1;
            self.position.0 = 0;
        }
        self.position.0 += 1;
        next
    }

    /// Peeks at the next character without consuming it.
    fn first(&self) -> Option<char> {
        // `.next()` optimizes better than `.nth(0)`
        self.char_stream.clone().next()
    }

    /// Peeks at the character after the next one without consuming anything.
    #[allow(dead_code)] // Is this actually needed?
    fn second(&self) -> Option<char> {
        // `.next()` optimizes better than `.nth(1)`
        let mut stream = self.char_stream.clone();
        stream.next();
        stream.next()
    }
}

/// Take source text and produce a list of [`FullToken`]s from it, ie.
/// tokenizing it.
///
/// Whitespace and `//` line comments are skipped. Characters that match no
/// rule are emitted as [`Token::Unknown`]. The returned list always ends with
/// a [`Token::Eof`] token.
///
/// # Errors
///
/// Returns [`Error::MissingQuotation`] (carrying the position of the opening
/// quote) when a character or string literal is not closed before the end of
/// the input.
pub fn tokenize<T: Into<String>>(to_tokenize: T) -> Result<Vec<FullToken>, Error> {
    let to_tokenize = to_tokenize.into();
    let mut cursor = Cursor {
        char_stream: to_tokenize.chars(),
        position: Position(0, 1),
    };

    let mut tokens = Vec::new();

    while let Some(character) = cursor.next() {
        // Save "current" token first character position
        // NOTE(review): with this arithmetic, columns on the first line start
        // at 0 while columns after a newline start at 1, and `Eof` lands one
        // column past the end. Left as-is because callers may rely on it.
        let position = cursor.position.sub(1);

        let variant = match character {
            // Whitespace
            w if w.is_whitespace() => continue,
            // Comments: skip up to (not including) the line break
            '/' if cursor.first() == Some('/') => {
                while !matches!(cursor.first(), Some('\n') | None) {
                    cursor.next();
                }
                continue;
            }
            // Character and string literals
            '"' | '\'' => lex_quoted(&mut cursor, character, position)?,
            // "words"
            c if c.is_alphabetic() => lex_word(&mut cursor, c),
            // Decimals
            c if DECIMAL_NUMERICS.contains(&c) => lex_decimal(&mut cursor, c),
            '-' if cursor.first() == Some('>') => {
                cursor.next(); // Eat `>`
                Token::Arrow
            }
            // Single character tokens
            '=' => Token::Equals,
            ';' => Token::Semi,
            ':' => Token::Colon,
            '+' => Token::Plus,
            '*' => Token::Star,
            '-' => Token::Minus,
            '>' => Token::GreaterThan,
            '<' => Token::LessThan,
            '&' => Token::Et,
            '!' => Token::Exclamation,
            '(' => Token::ParenOpen,
            ')' => Token::ParenClose,
            '[' => Token::BracketOpen,
            ']' => Token::BracketClose,
            '{' => Token::BraceOpen,
            '}' => Token::BraceClose,
            ',' => Token::Comma,
            '.' => Token::Dot,
            '/' => Token::Slash,
            '%' => Token::Percent,
            // Unknown token
            value => Token::Unknown(value),
        };

        tokens.push(FullToken {
            token: variant,
            position,
        });
    }

    tokens.push(FullToken {
        token: Token::Eof,
        position: cursor.position,
    });

    Ok(tokens)
}

/// Lexes the remainder of a literal whose opening `quote` has already been
/// consumed. A backslash makes the following character go through
/// [`escape_char`], which also lets an escaped quote appear inside the
/// literal. `start` is the position reported if the literal is never closed.
fn lex_quoted(cursor: &mut Cursor<'_>, quote: char, start: Position) -> Result<Token, Error> {
    let mut value = String::new();
    let mut escape_next = false;

    while let Some(c) = cursor.next() {
        if escape_next {
            value.push(escape_char(c));
            escape_next = false;
        } else if c == '\\' {
            // Drop the backslash itself and translate the next character
            escape_next = true;
        } else if c == quote {
            return Ok(if quote == '\'' {
                Token::CharLit(value)
            } else {
                Token::StringLit(value)
            });
        } else {
            value.push(c);
        }
    }

    Err(Error::MissingQuotation(start))
}

/// Lexes a keyword or identifier whose first character `first` has already
/// been consumed. Continuation characters are ASCII alphanumerics and `_`.
fn lex_word(cursor: &mut Cursor<'_>, first: char) -> Token {
    let mut value = String::from(first);
    while let Some(c) = cursor.first() {
        if !(c.is_ascii_alphanumeric() || c == '_') {
            break;
        }
        value.push(c);
        cursor.next();
    }

    // Check for keywords
    match value.as_str() {
        "let" => Token::LetKeyword,
        "mut" => Token::MutKeyword,
        "import" => Token::ImportKeyword,
        "return" => Token::ReturnKeyword,
        "fn" => Token::FnKeyword,
        "if" => Token::If,
        "else" => Token::Else,
        "true" => Token::True,
        "false" => Token::False,
        "extern" => Token::Extern,
        "pub" => Token::PubKeyword,
        "struct" => Token::Struct,
        "as" => Token::AsKeyword,
        "for" => Token::For,
        "while" => Token::While,
        "in" => Token::In,
        _ => Token::Identifier(value),
    }
}

/// Lexes a run of decimal digits whose first digit `first` has already been
/// consumed.
fn lex_decimal(cursor: &mut Cursor<'_>, first: char) -> Token {
    let mut value = String::from(first);
    while let Some(c) = cursor.first() {
        if !DECIMAL_NUMERICS.contains(&c) {
            break;
        }
        value.push(c);
        cursor.next();
    }
    Token::DecimalValue(value)
}

/// Translates the character following a backslash: `t`, `n`, `r` and `0`
/// become tab, line feed, carriage return and NUL; anything else is returned
/// unchanged.
fn escape_char(c: char) -> char {
    match c {
        't' => '\t',
        'n' => '\n',
        'r' => '\r',
        '0' => '\0',
        _ => c,
    }
}

/// Errors reported by the lexer.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum Error {
    /// An invalid character together with its position.
    InvalidToken(char, Position),
    /// A literal that was opened at the given position but never closed.
    MissingQuotation(Position),
}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::InvalidToken(token, _) => write!(f, "Invalid token '{}' ", token),
            Error::MissingQuotation(_) => f.write_str("String literal is never finished!"),
        }
    }
}

impl std::error::Error for Error {}

impl Error {
    /// Position in the source text that this error refers to.
    pub fn get_position(&self) -> &Position {
        match self {
            Error::InvalidToken(_, pos) => pos,
            Error::MissingQuotation(pos) => pos,
        }
    }
}