2023-08-02 17:38:38 +02:00
|
|
|
use std::{fmt::Debug, str::Chars};
|
2023-07-27 16:40:12 +02:00
|
|
|
|
|
|
|
/// The ASCII decimal digits `0`-`9`.
///
/// `tokenize` checks membership in this slice both to detect the start of a
/// numeric literal and to decide whether the next character continues it.
static DECIMAL_NUMERICS: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];
|
|
|
|
|
2023-08-02 14:31:33 +02:00
|
|
|
/// A single lexical token, without any position information.
///
/// Variants fall into three groups: value-carrying tokens, keywords, and
/// symbols, followed by the end-of-input marker.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Token {
    // ----- Values -----
    /// A word that is not one of the reserved keywords; holds the word's text.
    Identifier(String),
    /// Number with at most one decimal point, kept as its source text.
    DecimalValue(String),

    // ----- Keywords -----
    /// The `let` keyword.
    LetKeyword,
    /// The `import` keyword.
    ImportKeyword,
    /// The `return` keyword.
    ReturnKeyword,
    /// The `fn` keyword.
    FnKeyword,
    /// The `->` arrow.
    Arrow,
    /// The `if` keyword.
    If,

    // ----- Symbols -----
    /// The `;` symbol.
    Semi,
    /// The `=` symbol.
    Equals,
    /// The `:` symbol.
    Colon,
    /// The `+` symbol.
    Plus,
    /// The `*` symbol.
    Times,
    /// The `-` symbol.
    Minus,

    /// The `>` symbol.
    GreaterThan,
    /// The `<` symbol.
    LessThan,
    /// The `&` symbol.
    Et,

    /// The `(` symbol.
    ParenOpen,
    /// The `)` symbol.
    ParenClose,
    /// The `{` symbol.
    BraceOpen,
    /// The `}` symbol.
    BraceClose,
    /// The `,` symbol.
    Comma,

    /// Marks the end of the token stream.
    Eof,
}
|
|
|
|
|
|
|
|
impl Token {
|
|
|
|
pub fn get_token_prec(&self) -> i8 {
|
|
|
|
match &self {
|
|
|
|
Token::Plus => 10,
|
2023-08-03 19:30:00 +02:00
|
|
|
Token::Minus => 10,
|
2023-08-02 14:31:33 +02:00
|
|
|
Token::Times => 20,
|
|
|
|
_ => -1,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-08-02 19:17:06 +02:00
|
|
|
/// Converts a token into its `Debug` rendering (e.g. `Identifier("x")`).
impl From<Token> for String {
    fn from(value: Token) -> Self {
        format!("{value:?}")
    }
}
|
|
|
|
|
2023-08-02 14:31:33 +02:00
|
|
|
#[derive(Clone)]
|
|
|
|
pub struct FullToken {
|
|
|
|
pub token: Token,
|
|
|
|
pub position: Position,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Debug for FullToken {
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
f.write_fmt(format_args!(
|
|
|
|
"{:?} (Ln {}, Col {})",
|
|
|
|
self.token, self.position.1, self.position.0
|
|
|
|
))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// A location in the source text, stored as `(column, line)`.
///
/// Index `0` is the column and index `1` is the line.
pub type Position = (u32, u32);

/// Walks the characters of a source string while tracking the current
/// [`Position`].
pub struct Cursor<'a> {
    /// Position of the most recently consumed character, as `(column, line)`.
    pub position: Position,
    char_stream: Chars<'a>,
}

impl<'a> Cursor<'a> {
    /// Consumes and returns the next character, updating `position`.
    ///
    /// A newline moves to the next line and resets the column to `0`, so the
    /// first character of every line is reported at column `1`. Any other
    /// result advances the column by one. That includes `None`, so the
    /// position reported at end of input is one past the last character.
    fn next(&mut self) -> Option<char> {
        let next = self.char_stream.next();
        match next {
            Some('\n') => {
                self.position.1 += 1;
                // Do not also bump the column here: the newline itself must
                // not occupy column 1 of the new line, otherwise every line
                // after the first starts counting at column 2.
                self.position.0 = 0;
            }
            _ => self.position.0 += 1,
        }
        next
    }

    /// Peeks at the next character without consuming it.
    fn first(&self) -> Option<char> {
        // `.next()` optimizes better than `.nth(0)`
        self.char_stream.clone().next()
    }

    /// Peeks at the character after the next one without consuming anything.
    #[allow(dead_code)] // Is this actually needed?
    fn second(&self) -> Option<char> {
        // `.next()` optimizes better than `.nth(1)`
        let mut stream = self.char_stream.clone();
        stream.next();
        stream.next()
    }
}
|
|
|
|
|
2023-08-02 18:17:57 +02:00
|
|
|
pub fn tokenize<T: Into<String>>(to_tokenize: T) -> Result<Vec<FullToken>, Error> {
|
2023-07-27 16:40:12 +02:00
|
|
|
let to_tokenize = to_tokenize.into();
|
|
|
|
let mut cursor = Cursor {
|
2023-08-02 14:31:33 +02:00
|
|
|
char_stream: to_tokenize.chars(),
|
2023-08-02 18:17:57 +02:00
|
|
|
position: (0, 1),
|
2023-07-27 16:40:12 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
let mut tokens = Vec::new();
|
|
|
|
|
2023-07-27 20:17:44 +02:00
|
|
|
while let Some(character) = &cursor.next() {
|
2023-07-27 16:40:12 +02:00
|
|
|
let variant = match character {
|
|
|
|
// Whitespace
|
|
|
|
w if w.is_whitespace() => continue,
|
|
|
|
// Comments
|
2023-08-02 14:31:33 +02:00
|
|
|
'/' if cursor.first() == Some('/') => {
|
|
|
|
while !matches!(cursor.first(), Some('\n')) {
|
2023-07-27 20:17:44 +02:00
|
|
|
cursor.next();
|
2023-07-27 16:40:12 +02:00
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// "words"
|
|
|
|
c if c.is_alphabetic() => {
|
|
|
|
let mut value = character.to_string();
|
2023-08-02 14:31:33 +02:00
|
|
|
while let Some(c) = cursor.first() {
|
2023-07-27 16:40:12 +02:00
|
|
|
if !c.is_ascii_alphanumeric() {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
value += &c.to_string();
|
2023-07-27 20:17:44 +02:00
|
|
|
cursor.next();
|
2023-07-27 16:40:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Check for keywords
|
|
|
|
let variant = match value.as_str() {
|
2023-07-27 20:17:44 +02:00
|
|
|
"let" => Token::LetKeyword,
|
2023-07-27 20:47:50 +02:00
|
|
|
"import" => Token::ImportKeyword,
|
2023-08-02 14:31:33 +02:00
|
|
|
"return" => Token::ReturnKeyword,
|
|
|
|
"fn" => Token::FnKeyword,
|
2023-08-03 20:24:57 +02:00
|
|
|
"if" => Token::If,
|
2023-07-27 20:17:44 +02:00
|
|
|
_ => Token::Identifier(value),
|
2023-07-27 16:40:12 +02:00
|
|
|
};
|
|
|
|
variant
|
|
|
|
}
|
|
|
|
// Decimals
|
|
|
|
c if DECIMAL_NUMERICS.contains(c) => {
|
|
|
|
let mut value = character.to_string();
|
2023-08-02 14:31:33 +02:00
|
|
|
while let Some(c) = cursor.first() {
|
|
|
|
if !DECIMAL_NUMERICS.contains(&c) {
|
2023-07-27 16:40:12 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
value += &c.to_string();
|
2023-07-27 20:17:44 +02:00
|
|
|
cursor.next();
|
2023-07-27 16:40:12 +02:00
|
|
|
}
|
2023-07-27 20:17:44 +02:00
|
|
|
Token::DecimalValue(value)
|
2023-07-27 16:40:12 +02:00
|
|
|
}
|
2023-08-03 19:30:00 +02:00
|
|
|
'-' if cursor.first() == Some('>') => {
|
|
|
|
cursor.next(); // Eat `>`
|
|
|
|
Token::Arrow
|
|
|
|
}
|
2023-07-27 16:40:12 +02:00
|
|
|
// Single character tokens
|
2023-07-27 20:17:44 +02:00
|
|
|
'=' => Token::Equals,
|
2023-08-02 14:31:33 +02:00
|
|
|
';' => Token::Semi,
|
2023-07-27 20:47:50 +02:00
|
|
|
':' => Token::Colon,
|
|
|
|
'+' => Token::Plus,
|
|
|
|
'*' => Token::Times,
|
2023-08-03 19:30:00 +02:00
|
|
|
'-' => Token::Minus,
|
|
|
|
'>' => Token::GreaterThan,
|
|
|
|
'<' => Token::LessThan,
|
2023-08-03 20:24:57 +02:00
|
|
|
'&' => Token::Et,
|
2023-07-27 20:47:50 +02:00
|
|
|
'(' => Token::ParenOpen,
|
|
|
|
')' => Token::ParenClose,
|
2023-08-02 14:31:33 +02:00
|
|
|
'{' => Token::BraceOpen,
|
|
|
|
'}' => Token::BraceClose,
|
2023-07-27 22:01:50 +02:00
|
|
|
',' => Token::Comma,
|
2023-07-27 16:40:12 +02:00
|
|
|
// Invalid token
|
2023-08-02 18:17:57 +02:00
|
|
|
_ => Err(Error::InvalidToken(*character, cursor.position))?,
|
2023-07-27 16:40:12 +02:00
|
|
|
};
|
|
|
|
|
2023-07-27 20:17:44 +02:00
|
|
|
tokens.push(FullToken {
|
|
|
|
token: variant,
|
2023-08-02 18:17:57 +02:00
|
|
|
position: cursor.position,
|
2023-07-27 20:17:44 +02:00
|
|
|
});
|
2023-07-27 16:40:12 +02:00
|
|
|
}
|
|
|
|
|
2023-07-27 20:17:44 +02:00
|
|
|
tokens.push(FullToken {
|
|
|
|
token: Token::Eof,
|
2023-08-02 18:17:57 +02:00
|
|
|
position: cursor.position,
|
2023-07-27 16:40:12 +02:00
|
|
|
});
|
|
|
|
|
|
|
|
Ok(tokens)
|
|
|
|
}
|
2023-08-02 18:17:57 +02:00
|
|
|
|
|
|
|
#[derive(thiserror::Error, Debug)]
|
|
|
|
pub enum Error {
|
|
|
|
#[error("Invalid token '{}' at Ln {}, Col {}", .0, (.1).1, (.1).0)]
|
|
|
|
InvalidToken(char, Position),
|
|
|
|
}
|