From cca69976dd86a43ed140da9f943ef429fb8e18b0 Mon Sep 17 00:00:00 2001 From: sofia Date: Thu, 27 Jul 2023 17:40:12 +0300 Subject: [PATCH] Add simple tokenizer for easiest.reid --- .gitignore | 1 + Cargo.lock | 7 +++ Cargo.toml | 8 +++ easiest.reid | 4 ++ easy.reid | 9 ++++ hard.reid | 8 +++ medium.reid | 12 +++++ src/lexer.rs | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 9 ++++ 9 files changed, 196 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 easiest.reid create mode 100644 easy.reid create mode 100644 hard.reid create mode 100644 medium.reid create mode 100644 src/lexer.rs create mode 100644 src/main.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..df6e7a4 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "reid" +version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..d0e8d97 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "reid" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/easiest.reid b/easiest.reid new file mode 100644 index 0000000..5cd961a --- /dev/null +++ b/easiest.reid @@ -0,0 +1,4 @@ +// Hello, comment here! +let hello = 32; +let beep = + hello ; \ No newline at end of file diff --git a/easy.reid b/easy.reid new file mode 100644 index 0000000..71492ae --- /dev/null +++ b/easy.reid @@ -0,0 +1,9 @@ +// Arithmetic, function calls and imports! 
use std::{fmt::Debug, iter::Peekable, str::Chars};

/// The `easiest.reid` sample program: one comment and two `let` bindings.
pub static EASIEST: &str = "// Hello, comment here!\nlet hello = 32;\nlet beep =\n    hello ;";
// pub static EASY: &str = include_str!("../easy.reid");
// pub static MEDIUM: &str = include_str!("../medium.reid");
// pub static HARD: &str = include_str!("../hard.reid");

/// Splits source text into a flat list of tokens, always terminated by [`TokenVariant::Eof`].
///
/// Whitespace and `//` line comments are skipped. A word starts with an alphabetic character
/// and continues over alphanumeric ones; the word `let` becomes [`TokenVariant::LetKeyword`],
/// any other word an [`TokenVariant::Identifier`]. A run of ASCII digits becomes a
/// [`TokenVariant::DecimalValue`]. `=` and `;` are single-character tokens.
///
/// Every token records the 1-based column and line of its first character. The `Eof` token
/// is placed one column past the last consumed character.
///
/// # Errors
///
/// Returns `Err` with a message naming the offending character and its column and line as
/// soon as a character that starts no known token is encountered.
pub fn tokenize<T: Into<String>>(to_tokenize: T) -> Result<Vec<Token>, String> {
    let to_tokenize = to_tokenize.into();
    let mut cursor = Cursor {
        char_stream: to_tokenize.chars().peekable(),
        position: (0, 1),
    };

    let mut tokens = Vec::new();

    while let Some(character) = cursor.consume() {
        // The cursor has just stepped onto the token's first character, so its position is
        // the token's position. Taking it from the cursor (instead of counting separately)
        // keeps columns right after multi-character tokens and comments.
        let position = cursor.position;

        let variant = match character {
            // Whitespace
            w if w.is_whitespace() => continue,
            // Comments: skip up to, but not including, the newline. The loop also stops at
            // end of input, so a comment on the last line cannot hang the lexer.
            '/' if cursor.peek() == Some(&'/') => {
                while let Some(&next) = cursor.peek() {
                    if next == '\n' {
                        break;
                    }
                    cursor.consume();
                }
                continue;
            }
            // "words"
            c if c.is_alphabetic() => {
                let mut value = String::from(c);
                // Continue on the same (Unicode) character class the word started with, so
                // a non-ASCII letter does not split one word into several identifiers.
                cursor.consume_while(&mut value, |c| c.is_alphanumeric());

                // Check for keywords
                match value.as_str() {
                    "let" => TokenVariant::LetKeyword,
                    _ => TokenVariant::Identifier(value),
                }
            }
            // Decimals
            c if c.is_ascii_digit() => {
                let mut value = String::from(c);
                cursor.consume_while(&mut value, |c| c.is_ascii_digit());
                TokenVariant::DecimalValue(value)
            }
            // Single character tokens
            '=' => TokenVariant::Equals,
            ';' => TokenVariant::Semicolon,
            // Invalid token
            _ => {
                return Err(format!(
                    "Unknown token '{}' at {}, {}",
                    character, position.0, position.1
                ))
            }
        };

        tokens.push(Token { variant, position });
    }

    // End of input is reported one column past the last consumed character.
    let (column, line) = cursor.position;
    tokens.push(Token {
        variant: TokenVariant::Eof,
        position: (column + 1, line),
    });

    Ok(tokens)
}

/// A lexed token: what it is and where its first character sits in the source.
#[derive(Clone, PartialEq, Eq)]
pub struct Token {
    variant: TokenVariant,
    position: Position,
}

impl Debug for Token {
    /// Formats the token as `<variant> (Ln <line>, Col <column>)`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{:?} (Ln {}, Col {})",
            self.variant, self.position.1, self.position.0
        )
    }
}

/// A source location as `(column, line)`.
pub type Position = (u32, u32);

/// The kinds of token the lexer produces.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenVariant {
    /// The `let` keyword.
    LetKeyword,
    /// The `;` character.
    Semicolon,
    /// The `=` character.
    Equals,
    /// Any word that is not a keyword; carries the word's text.
    Identifier(String),
    /// A run of ASCII decimal digits, kept as its source text.
    DecimalValue(String),
    /// End of input; always the last token.
    Eof,
}

/// A peekable character stream that tracks the position of the last consumed character.
pub struct Cursor<'a> {
    /// `(column, line)` of the most recently consumed character. The column is 0 right
    /// after a newline (and before anything was consumed).
    pub position: Position,
    char_stream: Peekable<Chars<'a>>,
}

impl<'a> Cursor<'a> {
    /// Takes the next character off the stream and updates `position`.
    ///
    /// Returns `None` at end of input, in which case `position` is left untouched.
    fn consume(&mut self) -> Option<char> {
        let next = self.char_stream.next()?;
        if next == '\n' {
            self.position.1 += 1;
            self.position.0 = 0;
        } else {
            self.position.0 += 1;
        }
        Some(next)
    }

    /// Returns the next character without consuming it.
    fn peek(&mut self) -> Option<&char> {
        self.char_stream.peek()
    }

    /// Consumes characters for as long as `predicate` accepts the next one, appending each
    /// consumed character to `buffer`.
    fn consume_while(&mut self, buffer: &mut String, predicate: impl Fn(char) -> bool) {
        while let Some(&next) = self.char_stream.peek() {
            if !predicate(next) {
                break;
            }
            buffer.push(next);
            self.consume();
        }
    }
}
0000000..23b42f3 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,9 @@ +use crate::lexer::EASIEST; + +mod lexer; + +fn main() { + let token_stream = lexer::tokenize(EASIEST).unwrap(); + + dbg!(&token_stream); +}