commit cca69976dd86a43ed140da9f943ef429fb8e18b0
Author: sofia
Date:   Thu Jul 27 17:40:12 2023 +0300

    Add simple tokenizer for easiest.reid

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..df6e7a4
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "reid"
+version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..d0e8d97
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "reid"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
diff --git a/easiest.reid b/easiest.reid
new file mode 100644
index 0000000..5cd961a
--- /dev/null
+++ b/easiest.reid
@@ -0,0 +1,4 @@
+// Hello, comment here!
+let hello = 32;
+let beep =
+    hello ;
\ No newline at end of file
diff --git a/easy.reid b/easy.reid
new file mode 100644
index 0000000..71492ae
--- /dev/null
+++ b/easy.reid
@@ -0,0 +1,9 @@
+// Arithmetic, function calls and imports!
+
+import std::print;
+
+let arithmetic = 3 + 2 * 5 + 1 * 2;
+let multiplier = 5 * 2;
+
+let result = arithmetic * multiplier + arithmetic;
+print(result);
\ No newline at end of file
diff --git a/hard.reid b/hard.reid
new file mode 100644
index 0000000..8b7b6fb
--- /dev/null
+++ b/hard.reid
@@ -0,0 +1,8 @@
+// New types, type-casting
+
+import std::print;
+
+let text: string = "hello there!";
+let value: i16 = 123;
+
+print(text + (value as string));
\ No newline at end of file
diff --git a/medium.reid b/medium.reid
new file mode 100644
index 0000000..018c054
--- /dev/null
+++ b/medium.reid
@@ -0,0 +1,12 @@
+// if-statements, functions
+
+import std::print;
+
+fn fibonacci(value: i32) -> i32 {
+    if value < 3 {
+        return 1;
+    }
+    return fibonacci(value - 1) + fibonacci(value - 2);
+}
+
+print(fibonacci(15));
\ No newline at end of file
diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 0000000..3ecfd2b
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,138 @@
+use std::{fmt::Debug, iter::Peekable, str::Chars};
+
+pub static EASIEST: &str = include_str!("../easiest.reid");
+// pub static EASY: &str = include_str!("../easy.reid");
+// pub static MEDIUM: &str = include_str!("../medium.reid");
+// pub static HARD: &str = include_str!("../hard.reid");
+
+static DECIMAL_NUMERICS: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];
+
+pub fn tokenize<T: Into<String>>(to_tokenize: T) -> Result<Vec<Token>, String> {
+    let to_tokenize = to_tokenize.into();
+    let mut position = (0, 1);
+    let mut cursor = Cursor {
+        char_stream: to_tokenize.chars().peekable(),
+        position,
+    };
+
+    let mut tokens = Vec::new();
+
+    while let Some(character) = &cursor.consume() {
+        position.0 += 1;
+        if *character == '\n' {
+            position.1 += 1;
+            position.0 = 0;
+        }
+
+        let peek = cursor.peek();
+
+        let variant = match character {
+            // Whitespace
+            w if w.is_whitespace() => continue,
+            // Comments
+            '/' if peek == Some(&'/') => {
+                while !matches!(&cursor.peek(), Some('\n') | None) {
+                    cursor.consume();
+                }
+                continue;
+            }
+            // "words"
+            c if c.is_alphabetic() => {
+                let mut value = character.to_string();
+                while let Some(c) = &cursor.peek() {
+                    if !c.is_ascii_alphanumeric() {
+                        break;
+                    }
+                    value += &c.to_string();
+                    cursor.consume();
+                }
+
+                // Check for keywords
+                let variant = match value.as_str() {
+                    "let" => TokenVariant::LetKeyword,
+                    _ => TokenVariant::Identifier(value),
+                };
+                variant
+            }
+            // Decimals
+            c if DECIMAL_NUMERICS.contains(c) => {
+                let mut value = character.to_string();
+                while let Some(c) = &cursor.peek() {
+                    if !DECIMAL_NUMERICS.contains(c) {
+                        break;
+                    }
+                    value += &c.to_string();
+                    cursor.consume();
+                }
+                TokenVariant::DecimalValue(value)
+            }
+            // Single character tokens
+            '=' => TokenVariant::Equals,
+            ';' => TokenVariant::Semicolon,
+            // Invalid token
+            _ => Err(format!(
+                "Unknown token '{}' at {}, {}",
+                character, position.0, position.1
+            ))?,
+        };
+
+        tokens.push(Token { variant, position });
+    }
+
+    position.0 += 1;
+
+    tokens.push(Token {
+        variant: TokenVariant::Eof,
+        position,
+    });
+
+    Ok(tokens)
+}
+
+pub struct Token {
+    variant: TokenVariant,
+    position: Position,
+}
+
+impl Debug for Token {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_fmt(format_args!(
+            "{:?} (Ln {}, Col {})",
+            self.variant, self.position.1, self.position.0
+        ))
+    }
+}
+
+pub type Position = (u32, u32);
+
+#[derive(Debug)]
+pub enum TokenVariant {
+    LetKeyword,
+    Semicolon,
+    Equals,
+    Identifier(String),
+    /// Number with at most one decimal point
+    DecimalValue(String),
+    Eof,
+}
+
+pub struct Cursor<'a> {
+    pub position: Position,
+    char_stream: Peekable<Chars<'a>>,
+}
+
+impl<'a> Cursor<'a> {
+    fn consume(&mut self) -> Option<char> {
+        let next = self.char_stream.next();
+        self.position.0 += 1;
+        if let Some('\n') = next {
+            self.position.1 += 1;
+            self.position.0 = 0;
+        }
+        next
+    }
+
+    fn peek(&mut self) -> Option<&char> {
+        self.char_stream.peek()
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..23b42f3
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,9 @@
+use crate::lexer::EASIEST;
+
+mod lexer;
+
+fn main() {
+    let token_stream = lexer::tokenize(EASIEST).unwrap();
+
+    dbg!(&token_stream);
+}