// reid-llvm/src/lexer.rs

use std::{fmt::Debug, str::Chars};

/// Characters accepted as digits of a decimal literal (ASCII `0` through `9`).
static DECIMAL_NUMERICS: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];

/// Every kind of token the lexer in this file can emit.
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum Token {
    // Values
    /// Word that is not one of the reserved keywords
    Identifier(String),
    /// Decimal literal kept as its source text (the tokenizer in this file
    /// only ever stores a run of ASCII digits here)
    DecimalValue(String),

    // Keywords
    /// `let`
    LetKeyword,
    /// `import`
    ImportKeyword,
    /// `return`
    ReturnKeyword,
    /// `fn`
    FnKeyword,
    /// `->`
    Arrow,
    /// `if`
    If,

    // Symbols
    /// `;`
    Semi,
    /// `=`
    Equals,
    /// `:`
    Colon,
    /// `+`
    Plus,
    /// `*`
    Times,
    /// `-`
    Minus,

    /// `>`
    GreaterThan,
    /// `<`
    LessThan,
    /// `&`
    Et,

    /// `(`
    ParenOpen,
    /// `)`
    ParenClose,
    /// `{`
    BraceOpen,
    /// `}`
    BraceClose,
    /// `,`
    Comma,

    /// Marker appended once after the last real token
    Eof,
}

impl Token {
    /// Precedence value of this token.
    ///
    /// `+` and `-` share the value `10`, `*` has `20`, and every other token
    /// yields `-1`.
    // NOTE(review): `-1` presumably means "not a binary operator"; the
    // consumer of this value is not part of this file, so confirm there.
    pub fn get_token_prec(&self) -> i8 {
        match self {
            Token::Plus | Token::Minus => 10,
            Token::Times => 20,
            _ => -1,
        }
    }
}

impl From<Token> for String {
    /// Renders the token through its derived `Debug` representation.
    fn from(token: Token) -> Self {
        format!("{token:?}")
    }
}

/// A token together with the cursor position that was recorded for it.
#[derive(Clone)]
pub struct FullToken {
    /// The lexed token itself.
    pub token: Token,
    /// Position stored with the token, laid out as `(column, line)`.
    pub position: Position,
}

impl Debug for FullToken {
    /// Writes the token's `Debug` form followed by `(Ln <line>, Col <column>)`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (column, line) = self.position;
        write!(f, "{:?} (Ln {}, Col {})", self.token, line, column)
    }
}

/// Source position laid out as `(column, line)`.
pub type Position = (u32, u32);

/// Character cursor over source text that keeps track of a [`Position`].
pub struct Cursor<'a> {
    /// Position as of the most recent call to `next`, as `(column, line)`.
    pub position: Position,
    /// Characters that have not been consumed yet.
    char_stream: Chars<'a>,
}

impl Cursor<'_> {
    /// Consumes and returns the next character, updating `position`.
    ///
    /// A `'\n'` increments the line and restarts the column. The column is
    /// then incremented on every call, including calls that return `None`
    /// because the input is exhausted.
    fn next(&mut self) -> Option<char> {
        let next = self.char_stream.next();
        if next == Some('\n') {
            self.position.1 += 1;
            self.position.0 = 0;
        }
        self.position.0 += 1;
        next
    }

    /// Returns the upcoming character without consuming it.
    ///
    /// Only a clone of the iterator is advanced, so a shared borrow suffices.
    fn first(&self) -> Option<char> {
        // `.next()` optimizes better than `.nth(0)`
        self.char_stream.clone().next()
    }

    /// Returns the character after the upcoming one without consuming anything.
    #[allow(dead_code)] // Is this actually needed?
    fn second(&self) -> Option<char> {
        // `.next()` optimizes better than `.nth(1)`
        let mut lookahead = self.char_stream.clone();
        lookahead.next();
        lookahead.next()
    }
}

pub fn tokenize<T: Into<String>>(to_tokenize: T) -> Result<Vec<FullToken>, Error> {
    let to_tokenize = to_tokenize.into();
    let mut cursor = Cursor {
        char_stream: to_tokenize.chars(),
        position: (0, 1),
    };

    let mut tokens = Vec::new();

    while let Some(character) = &cursor.next() {
        let variant = match character {
            // Whitespace
            w if w.is_whitespace() => continue,
            // Comments
            '/' if cursor.first() == Some('/') => {
                while !matches!(cursor.first(), Some('\n')) {
                    cursor.next();
                }
                continue;
            }
            // "words"
            c if c.is_alphabetic() => {
                let mut value = character.to_string();
                while let Some(c) = cursor.first() {
                    if !c.is_ascii_alphanumeric() {
                        break;
                    }
                    value += &c.to_string();
                    cursor.next();
                }

                // Check for keywords
                let variant = match value.as_str() {
                    "let" => Token::LetKeyword,
                    "import" => Token::ImportKeyword,
                    "return" => Token::ReturnKeyword,
                    "fn" => Token::FnKeyword,
                    "if" => Token::If,
                    _ => Token::Identifier(value),
                };
                variant
            }
            // Decimals
            c if DECIMAL_NUMERICS.contains(c) => {
                let mut value = character.to_string();
                while let Some(c) = cursor.first() {
                    if !DECIMAL_NUMERICS.contains(&c) {
                        break;
                    }
                    value += &c.to_string();
                    cursor.next();
                }
                Token::DecimalValue(value)
            }
            '-' if cursor.first() == Some('>') => {
                cursor.next(); // Eat `>`
                Token::Arrow
            }
            // Single character tokens
            '=' => Token::Equals,
            ';' => Token::Semi,
            ':' => Token::Colon,
            '+' => Token::Plus,
            '*' => Token::Times,
            '-' => Token::Minus,
            '>' => Token::GreaterThan,
            '<' => Token::LessThan,
            '&' => Token::Et,
            '(' => Token::ParenOpen,
            ')' => Token::ParenClose,
            '{' => Token::BraceOpen,
            '}' => Token::BraceClose,
            ',' => Token::Comma,
            // Invalid token
            _ => Err(Error::InvalidToken(*character, cursor.position))?,
        };

        tokens.push(FullToken {
            token: variant,
            position: cursor.position,
        });
    }

    tokens.push(FullToken {
        token: Token::Eof,
        position: cursor.position,
    });

    Ok(tokens)
}

/// Errors reported by the lexer in this file.
#[derive(thiserror::Error, Debug)]
pub enum Error {
    /// A character that does not start any token.
    ///
    /// Carries the offending character and the cursor position; the message
    /// prints the position's second component as the line and its first
    /// component as the column.
    #[error("Invalid token '{}' at Ln {}, Col {}", .0, (.1).1, (.1).0)]
    InvalidToken(char, Position),
}
Fix warnings 2023-08-02 17:38:38 +02:00			`use std::{fmt::Debug, str::Chars};`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00
			`static DECIMAL_NUMERICS: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];`

Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00			`#[derive(Debug, Eq, PartialEq, Clone)]`
			`pub enum Token {`
			`// Values`
			`Identifier(String),`
			`/// Number with at most one decimal point`
			`DecimalValue(String),`

			`// Keywords`
			/// `let`
			`LetKeyword,`
			/// `import`
			`ImportKeyword,`
			/// `return`
			`ReturnKeyword,`
			/// `fn`
			`FnKeyword,`
Add return types, function args 2023-08-03 19:30:00 +02:00			/// `->`
			`Arrow,`
Add parsing for if-statements 2023-08-03 20:24:57 +02:00			/// `if`
			`If,`
Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00
			`// Symbols`
			/// `;`
			`Semi,`
			/// `=`
			`Equals,`
			/// `:`
			`Colon,`
			/// `+`
			`Plus,`
			/// `*`
			`Times,`
Add return types, function args 2023-08-03 19:30:00 +02:00			/// `-`
			`Minus,`
Add parsing for if-statements 2023-08-03 20:24:57 +02:00
Add return types, function args 2023-08-03 19:30:00 +02:00			/// `>`
			`GreaterThan,`
			/// `<`
			`LessThan,`
Add parsing for if-statements 2023-08-03 20:24:57 +02:00			/// `&`
			`Et,`

Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00			/// `(`
			`ParenOpen,`
			/// `)`
			`ParenClose,`
			/// `{`
			`BraceOpen,`
			/// `}`
			`BraceClose,`
			/// `,`
			`Comma,`

			`Eof,`
			`}`

			`impl Token {`
			`pub fn get_token_prec(&self) -> i8 {`
			`match &self {`
			`Token::Plus => 10,`
Add return types, function args 2023-08-03 19:30:00 +02:00			`Token::Minus => 10,`
Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00			`Token::Times => 20,`
			`_ => -1,`
			`}`
			`}`
			`}`

Add more error handling 2023-08-02 19:17:06 +02:00			`impl From<Token> for String {`
			`fn from(value: Token) -> Self {`
			`format!("{:?}", value)`
Add errors for lexer and parser 2023-08-02 18:58:10 +02:00			`}`
			`}`

Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00			`#[derive(Clone)]`
			`pub struct FullToken {`
			`pub token: Token,`
			`pub position: Position,`
			`}`

			`impl Debug for FullToken {`
			`fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {`
			`f.write_fmt(format_args!(`
			`"{:?} (Ln {}, Col {})",`
			`self.token, self.position.1, self.position.0`
			`))`
			`}`
			`}`

			`pub type Position = (u32, u32);`

			`pub struct Cursor<'a> {`
			`pub position: Position,`
			`char_stream: Chars<'a>,`
			`}`

			`impl<'a> Cursor<'a> {`
			`fn next(&mut self) -> Option<char> {`
			`let next = self.char_stream.next();`
			`if let Some('\n') = next {`
			`self.position.1 += 1;`
			`self.position.0 = 0;`
			`}`
Add errors for lexer and parser 2023-08-02 18:58:10 +02:00			`self.position.0 += 1;`
Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00			`next`
			`}`

			`fn first(&mut self) -> Option<char> {`
			// `.next()` optimizes better than `.nth(0)`
			`self.char_stream.clone().next()`
			`}`

Fix warnings 2023-08-02 17:38:38 +02:00			`#[allow(dead_code)] // Is this actually needed?`
Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00			`fn second(&mut self) -> Option<char> {`
			// `.next()` optimizes better than `.nth(1)`
			`let mut stream = self.char_stream.clone();`
			`stream.next();`
			`stream.next()`
			`}`
			`}`

Add errors, move compiling to examples 2023-08-02 18:17:57 +02:00			`pub fn tokenize<T: Into<String>>(to_tokenize: T) -> Result<Vec<FullToken>, Error> {`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`let to_tokenize = to_tokenize.into();`
			`let mut cursor = Cursor {`
Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00			`char_stream: to_tokenize.chars(),`
Add errors, move compiling to examples 2023-08-02 18:17:57 +02:00			`position: (0, 1),`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`};`

			`let mut tokens = Vec::new();`

Add parser, token stream, successfully parse let statement 2023-07-27 20:17:44 +02:00			`while let Some(character) = &cursor.next() {`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`let variant = match character {`
			`// Whitespace`
			`w if w.is_whitespace() => continue,`
			`// Comments`
Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00			`'/' if cursor.first() == Some('/') => {`
			`while !matches!(cursor.first(), Some('\n')) {`
Add parser, token stream, successfully parse let statement 2023-07-27 20:17:44 +02:00			`cursor.next();`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`}`
			`continue;`
			`}`
			`// "words"`
			`c if c.is_alphabetic() => {`
			`let mut value = character.to_string();`
Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00			`while let Some(c) = cursor.first() {`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`if !c.is_ascii_alphanumeric() {`
			`break;`
			`}`
			`value += &c.to_string();`
Add parser, token stream, successfully parse let statement 2023-07-27 20:17:44 +02:00			`cursor.next();`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`}`

			`// Check for keywords`
			`let variant = match value.as_str() {`
Add parser, token stream, successfully parse let statement 2023-07-27 20:17:44 +02:00			`"let" => Token::LetKeyword,`
Add import statement parsing 2023-07-27 20:47:50 +02:00			`"import" => Token::ImportKeyword,`
Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00			`"return" => Token::ReturnKeyword,`
			`"fn" => Token::FnKeyword,`
Add parsing for if-statements 2023-08-03 20:24:57 +02:00			`"if" => Token::If,`
Add parser, token stream, successfully parse let statement 2023-07-27 20:17:44 +02:00			`_ => Token::Identifier(value),`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`};`
			`variant`
			`}`
			`// Decimals`
			`c if DECIMAL_NUMERICS.contains(c) => {`
			`let mut value = character.to_string();`
Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00			`while let Some(c) = cursor.first() {`
			`if !DECIMAL_NUMERICS.contains(&c) {`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`break;`
			`}`
			`value += &c.to_string();`
Add parser, token stream, successfully parse let statement 2023-07-27 20:17:44 +02:00			`cursor.next();`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`}`
Add parser, token stream, successfully parse let statement 2023-07-27 20:17:44 +02:00			`Token::DecimalValue(value)`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`}`
Add return types, function args 2023-08-03 19:30:00 +02:00			`'-' if cursor.first() == Some('>') => {`
			cursor.next(); // Eat `>`
			`Token::Arrow`
			`}`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`// Single character tokens`
Add parser, token stream, successfully parse let statement 2023-07-27 20:17:44 +02:00			`'=' => Token::Equals,`
Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00			`';' => Token::Semi,`
Add import statement parsing 2023-07-27 20:47:50 +02:00			`':' => Token::Colon,`
			`'+' => Token::Plus,`
			`'*' => Token::Times,`
Add return types, function args 2023-08-03 19:30:00 +02:00			`'-' => Token::Minus,`
			`'>' => Token::GreaterThan,`
			`'<' => Token::LessThan,`
Add parsing for if-statements 2023-08-03 20:24:57 +02:00			`'&' => Token::Et,`
Add import statement parsing 2023-07-27 20:47:50 +02:00			`'(' => Token::ParenOpen,`
			`')' => Token::ParenClose,`
Add preliminary codegen, update parsing to require functions 2023-08-02 14:31:33 +02:00			`'{' => Token::BraceOpen,`
			`'}' => Token::BraceClose,`
Add function calls to parsing, also Top Level Expressions 2023-07-27 22:01:50 +02:00			`',' => Token::Comma,`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`// Invalid token`
Add errors, move compiling to examples 2023-08-02 18:17:57 +02:00			`_ => Err(Error::InvalidToken(*character, cursor.position))?,`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`};`

Add parser, token stream, successfully parse let statement 2023-07-27 20:17:44 +02:00			`tokens.push(FullToken {`
			`token: variant,`
Add errors, move compiling to examples 2023-08-02 18:17:57 +02:00			`position: cursor.position,`
Add parser, token stream, successfully parse let statement 2023-07-27 20:17:44 +02:00			`});`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`}`

Add parser, token stream, successfully parse let statement 2023-07-27 20:17:44 +02:00			`tokens.push(FullToken {`
			`token: Token::Eof,`
Add errors, move compiling to examples 2023-08-02 18:17:57 +02:00			`position: cursor.position,`
Add simple tokenizer for easiest.reid 2023-07-27 16:40:12 +02:00			`});`

			`Ok(tokens)`
			`}`
Add errors, move compiling to examples 2023-08-02 18:17:57 +02:00
			`#[derive(thiserror::Error, Debug)]`
			`pub enum Error {`
			`#[error("Invalid token '{}' at Ln {}, Col {}", .0, (.1).1, (.1).0)]`
			`InvalidToken(char, Position),`
			`}`