use std::{fmt::Debug, ops::AddAssign, str::Chars}; static BINARY_NUMERICS: &[char] = &['0', '1']; static OCTAL_NUMERICS: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7']; static DECIMAL_NUMERICS: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']; static HEXADECIMAL_NUMERICS: &[char] = &[ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', ]; #[derive(Eq, PartialEq, Clone, PartialOrd, Ord, Hash, Debug)] pub enum Keyword { Function, End, Local, Return, If, Then, } impl Keyword { pub fn parse(from: &str) -> Option { Some(match from { "function" => Keyword::Function, "end" => Keyword::End, "local" => Keyword::Local, "return" => Keyword::Return, "if" => Keyword::If, "then" => Keyword::Then, _ => None?, }) } } impl ToString for Keyword { fn to_string(&self) -> String { match self { Keyword::Function => "function", Keyword::End => "end", Keyword::Local => "local", Keyword::Return => "return", Keyword::If => "if", Keyword::Then => "then", } .to_string() } } #[derive(Eq, PartialEq, Clone, PartialOrd, Ord, Hash, Debug)] pub enum Token { /// Word-like-values Word(String), Keyword(Keyword), /// Number in the decimal base DecimalValue(String), /// Integer number in the hexadecimal base HexadecimalValue(String), /// Integer number in the octal base OctalValue(String), /// Integer number in the binary base BinaryValue(String), /// Some string literal that was surrounded by "double-quotes". StringLit(String), /// Special one-character symbol Symbol(char), Whitespace(String), Comment(String), Eof, } impl From for String { fn from(value: Token) -> Self { format!("{:?}", value) } } impl Token { pub fn len(&self) -> usize { self.to_string().len() } } impl ToString for Token { fn to_string(&self) -> String { match &self { Token::Word(ident) => ident.clone(), Token::DecimalValue(val) => val.to_string(), Token::HexadecimalValue(val) => format!("0x{}", val), Token::OctalValue(val) => format!("0o{}", val), Token::BinaryValue(val) => format!("0b{}", val), Token::StringLit(lit) => format!("\"{}\"", lit), Token::Eof => String::new(), Token::Whitespace(val) => val.clone(), Token::Comment(val) => format!("--{}", val.clone()), Token::Symbol(val) => val.to_string(), Token::Keyword(keyword) => keyword.to_string(), } } } /// A token with a position #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct FullToken { pub token: Token, pub position: Position, } impl Debug for FullToken { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_fmt(format_args!("{:?} {:?}", self.token, self.position,)) } } /// (Column, Line) #[derive(Clone, Copy, Hash, PartialEq, Eq, Ord)] pub struct Position(pub u32, pub u32); impl Position { pub fn add(&self, num: u32) -> Position { Position(self.0 + num, self.1) } pub fn sub(&self, num: u32) -> Position { Position(self.0 - num, self.1) } } impl PartialOrd for Position { fn partial_cmp(&self, other: &Self) -> Option { match self.1.partial_cmp(&other.1) { Some(core::cmp::Ordering::Equal) => {} ord => return ord, } self.0.partial_cmp(&other.0) } } impl Debug for Position { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_fmt(format_args!("Ln {}, Col {}", self.1, self.0 + 1)) } } pub struct Cursor<'a> { pub position: Position, pub char_stream: Chars<'a>, } impl<'a> Cursor<'a> { pub fn next(&mut self) -> Option { let next = self.char_stream.next(); if let Some('\n') = next { self.position.1 += 1; self.position.0 = 0; } self.position.0 += 1; next } fn first(&mut self) -> Option { // `.next()` optimizes better than `.nth(0)` self.char_stream.clone().next() } #[allow(dead_code)] // Is this actually needed? fn second(&mut self) -> Option { // `.next()` optimizes better than `.nth(1)` let mut stream = self.char_stream.clone(); stream.next(); stream.next() } } /// Take source text and produce a list of [`FullToken`]s from it, ie. /// tokenizing it. pub fn tokenize>(to_tokenize: T) -> Result, Error> { let to_tokenize = to_tokenize.into(); let mut cursor = Cursor { char_stream: to_tokenize.chars(), position: Position(0, 1), }; let mut tokens = Vec::new(); while let Some(character) = &cursor.next() { // Save "current" token first character position let position = cursor.position.sub(1); let variant = match character { // Whitespace w if w.is_whitespace() => { let mut whitespace = String::from(*w); while let Some(w) = cursor.first() { if !w.is_whitespace() { break; } whitespace.push(cursor.next().unwrap()); } Token::Whitespace(whitespace) } // Comments '-' if cursor.first() == Some('-') => { cursor.next(); let mut comment = String::new(); while !matches!(cursor.first(), Some('\n') | None) { if let Some(c) = cursor.next() { comment.push(c); } } Token::Comment(comment) } '\"' => { let mut value = String::new(); let mut escape_next = false; while cursor.first().is_some() && (cursor.first() != Some(*character) || escape_next) { if cursor.first() == Some('\\') && !escape_next { cursor.next(); // Consume backslash and always add next character escape_next = true; } else { let c = &cursor.next().unwrap(); if escape_next { value += &escape_char(&c).to_string(); } else { value += &c.to_string(); } escape_next = false; } } if cursor.first() == Some(*character) { cursor.next(); } else { return Err(Error::MissingQuotation(position)); } match character { '\"' => Token::StringLit(value), _ => unreachable!(), } } // "words" c if c.is_alphabetic() => { let mut value = character.to_string(); while let Some(c) = cursor.first() { if !(c.is_ascii_alphanumeric() || c == '_') { break; } value += &c.to_string(); cursor.next(); } if let Some(keyword) = Keyword::parse(&value) { Token::Keyword(keyword) } else { Token::Word(value) } } // Decimals c if DECIMAL_NUMERICS.contains(c) => { let mut value = NumberType::Decimal(character.to_string()); let mut numerics = DECIMAL_NUMERICS; if let Some(second) = cursor.second() { if cursor.first() == Some('x') && HEXADECIMAL_NUMERICS .contains(&second.to_lowercase().next().unwrap_or('.')) { cursor.next(); value = NumberType::Hexadecimal(String::new()); numerics = HEXADECIMAL_NUMERICS; } else if cursor.first() == Some('o') && OCTAL_NUMERICS.contains(&second.to_lowercase().next().unwrap_or('.')) { cursor.next(); value = NumberType::Octal(String::new()); numerics = OCTAL_NUMERICS; } else if cursor.first() == Some('b') && BINARY_NUMERICS.contains(&second.to_lowercase().next().unwrap_or('.')) { cursor.next(); value = NumberType::Binary(String::new()); numerics = BINARY_NUMERICS; } } while let Some(c) = cursor.first() { if !numerics.contains(&c.to_lowercase().next().unwrap_or('.')) { break; } value += c; cursor.next(); } match value { NumberType::Decimal(dec) => Token::DecimalValue(dec), NumberType::Hexadecimal(hex) => Token::HexadecimalValue(hex), NumberType::Octal(oct) => Token::OctalValue(oct), NumberType::Binary(bin) => Token::BinaryValue(bin), } } // Some one-character token value => Token::Symbol(*value), }; tokens.push(FullToken { token: variant, position, }); } tokens.push(FullToken { token: Token::Eof, position: cursor.position, }); Ok(tokens) } fn escape_char(c: &char) -> char { match c { 't' => '\t', 'n' => '\n', 'r' => '\r', '0' => '\0', _ => *c, } } enum NumberType { Decimal(String), Hexadecimal(String), Octal(String), Binary(String), } impl AddAssign for NumberType { fn add_assign(&mut self, rhs: char) { *self = match self { NumberType::Decimal(val) => NumberType::Decimal(val.to_owned() + &rhs.to_string()), NumberType::Hexadecimal(val) => { NumberType::Hexadecimal(val.to_owned() + &rhs.to_string()) } NumberType::Octal(val) => NumberType::Octal(val.to_owned() + &rhs.to_string()), NumberType::Binary(val) => NumberType::Binary(val.to_owned() + &rhs.to_string()), }; } } #[derive(thiserror::Error, Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] pub enum Error { #[error("Invalid token '{}' ", .0)] InvalidToken(char, Position), #[error("String literal is never finished!")] MissingQuotation(Position), } impl Error { pub fn get_position(&self) -> &Position { match self { Error::InvalidToken(_, pos) => pos, Error::MissingQuotation(pos) => pos, } } }