ferrite-lua/src/token_stream/lexer.rs
2026-03-14 16:03:58 +02:00

364 lines
11 KiB
Rust

use std::{fmt::Debug, ops::AddAssign, str::Chars};
/// Digits accepted inside a binary literal.
static BINARY_NUMERICS: &[char] = &['0', '1'];

/// Digits accepted inside an octal literal.
static OCTAL_NUMERICS: &[char] = &[
    '0', '1', '2', '3', //
    '4', '5', '6', '7',
];

/// Digits accepted inside a decimal literal.
static DECIMAL_NUMERICS: &[char] = &[
    '0', '1', '2', '3', '4', //
    '5', '6', '7', '8', '9',
];

/// Digits accepted inside a hexadecimal literal.
///
/// Only the lowercase letters are listed: the lexer lowercases a candidate
/// character before looking it up in this table.
static HEXADECIMAL_NUMERICS: &[char] = &[
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', //
    'a', 'b', 'c', 'd', 'e', 'f',
];
/// Reserved words recognised by the lexer.
///
/// The variant order is significant: the derived `Ord`/`PartialOrd` compare
/// variants in declaration order.
#[derive(Eq, PartialEq, Clone, PartialOrd, Ord, Hash, Debug)]
pub enum Keyword {
    Function,
    End,
    Local,
    Return,
    If,
    Then,
}

impl Keyword {
    /// Returns the keyword spelled exactly as `from`, or `None` when `from`
    /// is not a keyword. The comparison is case-sensitive.
    pub fn parse(from: &str) -> Option<Keyword> {
        match from {
            "function" => Some(Keyword::Function),
            "end" => Some(Keyword::End),
            "local" => Some(Keyword::Local),
            "return" => Some(Keyword::Return),
            "if" => Some(Keyword::If),
            "then" => Some(Keyword::Then),
            _ => None,
        }
    }

    /// Returns the source spelling of the keyword without allocating.
    pub fn as_str(&self) -> &'static str {
        match self {
            Keyword::Function => "function",
            Keyword::End => "end",
            Keyword::Local => "local",
            Keyword::Return => "return",
            Keyword::If => "if",
            Keyword::Then => "then",
        }
    }
}

/// Writes the keyword exactly as it is spelled in source code.
///
/// Implementing `Display` (rather than `ToString` directly) keeps
/// `to_string()` available through the standard blanket impl and additionally
/// makes the type usable with `{}` in format strings.
impl std::fmt::Display for Keyword {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
/// A single lexical unit produced by the lexer.
#[derive(Eq, PartialEq, Clone, PartialOrd, Ord, Hash, Debug)]
pub enum Token {
    /// Word-like-values
    Word(String),
    /// A reserved word, see [`Keyword`]
    Keyword(Keyword),
    /// Number in the decimal base
    DecimalValue(String),
    /// Integer number in the hexadecimal base (digits only, without `0x`)
    HexadecimalValue(String),
    /// Integer number in the octal base (digits only, without `0o`)
    OctalValue(String),
    /// Integer number in the binary base (digits only, without `0b`)
    BinaryValue(String),
    /// Some string literal that was surrounded by "double-quotes".
    StringLit(String),
    /// Special one-character symbol
    Symbol(char),
    /// A run of whitespace characters
    Whitespace(String),
    /// Comment text, without the leading `--`
    Comment(String),
    /// End of input marker
    Eof,
}

/// Converts the token into its `Debug` representation.
///
/// Note that this is *not* the same text as `to_string()`, which yields the
/// source-like spelling of the token.
impl From<Token> for String {
    fn from(value: Token) -> Self {
        format!("{:?}", value)
    }
}

impl Token {
    /// Length in bytes of the token's textual form, i.e. of `to_string()`.
    pub fn len(&self) -> usize {
        self.to_string().len()
    }

    /// Returns `true` when the token's textual form is empty, which is the
    /// case for [`Token::Eof`].
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

/// Writes the token in a source-like spelling.
///
/// Numeric literals get their radix prefix back, comments get their `--`
/// prefix back and string literals are wrapped in double quotes. The content
/// of a string literal is written as stored; it is not re-escaped.
///
/// Implementing `Display` (rather than `ToString` directly) keeps
/// `to_string()` available through the standard blanket impl.
impl std::fmt::Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Token::Word(ident) => f.write_str(ident),
            Token::DecimalValue(val) => f.write_str(val),
            Token::HexadecimalValue(val) => write!(f, "0x{}", val),
            Token::OctalValue(val) => write!(f, "0o{}", val),
            Token::BinaryValue(val) => write!(f, "0b{}", val),
            Token::StringLit(lit) => write!(f, "\"{}\"", lit),
            Token::Eof => Ok(()),
            Token::Whitespace(val) => f.write_str(val),
            Token::Comment(val) => write!(f, "--{}", val),
            Token::Symbol(val) => write!(f, "{}", val),
            // Goes through `to_string()` so that this only relies on the
            // keyword being convertible to a string.
            Token::Keyword(keyword) => f.write_str(&keyword.to_string()),
        }
    }
}
/// A [`Token`] paired with the [`Position`] that was recorded for it.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct FullToken {
    /// The lexical unit itself
    pub token: Token,
    /// Where the token was found
    pub position: Position,
}

/// Debug output is the token's debug form, a space, then the position's
/// debug form.
impl Debug for FullToken {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:?} {:?}", self.token, self.position)
    }
}
/// (Column, Line)
///
/// Positions are ordered by line first and by column second.
#[derive(Clone, Copy, Hash, PartialEq, Eq)]
pub struct Position(pub u32, pub u32);

impl Position {
    /// Returns a position `num` columns to the right on the same line.
    pub fn add(&self, num: u32) -> Position {
        Position(self.0 + num, self.1)
    }

    /// Returns a position `num` columns to the left on the same line.
    ///
    /// `num` must not exceed the current column, otherwise the subtraction
    /// overflows.
    pub fn sub(&self, num: u32) -> Position {
        Position(self.0 - num, self.1)
    }
}

/// Total order: line first, column second.
///
/// This is written by hand because a derived `Ord` would compare the fields
/// in declaration order (column before line) and would therefore disagree
/// with `PartialOrd`.
impl Ord for Position {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        match self.1.cmp(&other.1) {
            std::cmp::Ordering::Equal => self.0.cmp(&other.0),
            ord => ord,
        }
    }
}

impl PartialOrd for Position {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        // Delegate so that `Ord` and `PartialOrd` can never drift apart.
        Some(self.cmp(other))
    }
}

/// Prints as `Ln <line>, Col <column + 1>`, i.e. the stored column is shown
/// one-based.
impl Debug for Position {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Ln {}, Col {}", self.1, self.0 + 1)
    }
}
pub struct Cursor<'a> {
pub position: Position,
pub char_stream: Chars<'a>,
}
impl<'a> Cursor<'a> {
pub fn next(&mut self) -> Option<char> {
let next = self.char_stream.next();
if let Some('\n') = next {
self.position.1 += 1;
self.position.0 = 0;
}
self.position.0 += 1;
next
}
fn first(&mut self) -> Option<char> {
// `.next()` optimizes better than `.nth(0)`
self.char_stream.clone().next()
}
#[allow(dead_code)] // Is this actually needed?
fn second(&mut self) -> Option<char> {
// `.next()` optimizes better than `.nth(1)`
let mut stream = self.char_stream.clone();
stream.next();
stream.next()
}
}
/// Take source text and produce a list of [`FullToken`]s from it, ie.
/// tokenizing it.
pub fn tokenize<T: Into<String>>(to_tokenize: T) -> Result<Vec<FullToken>, Error> {
let to_tokenize = to_tokenize.into();
let mut cursor = Cursor {
char_stream: to_tokenize.chars(),
position: Position(0, 1),
};
let mut tokens = Vec::new();
while let Some(character) = &cursor.next() {
// Save "current" token first character position
let position = cursor.position.sub(1);
let variant = match character {
// Whitespace
w if w.is_whitespace() => {
let mut whitespace = String::from(*w);
while let Some(w) = cursor.first() {
if !w.is_whitespace() {
break;
}
whitespace.push(cursor.next().unwrap());
}
Token::Whitespace(whitespace)
}
// Comments
'-' if cursor.first() == Some('-') => {
cursor.next();
let mut comment = String::new();
while !matches!(cursor.first(), Some('\n') | None) {
if let Some(c) = cursor.next() {
comment.push(c);
}
}
Token::Comment(comment)
}
'\"' => {
let mut value = String::new();
let mut escape_next = false;
while cursor.first().is_some()
&& (cursor.first() != Some(*character) || escape_next)
{
if cursor.first() == Some('\\') && !escape_next {
cursor.next(); // Consume backslash and always add next character
escape_next = true;
} else {
let c = &cursor.next().unwrap();
if escape_next {
value += &escape_char(&c).to_string();
} else {
value += &c.to_string();
}
escape_next = false;
}
}
if cursor.first() == Some(*character) {
cursor.next();
} else {
return Err(Error::MissingQuotation(position));
}
match character {
'\"' => Token::StringLit(value),
_ => unreachable!(),
}
}
// "words"
c if c.is_alphabetic() => {
let mut value = character.to_string();
while let Some(c) = cursor.first() {
if !(c.is_ascii_alphanumeric() || c == '_') {
break;
}
value += &c.to_string();
cursor.next();
}
if let Some(keyword) = Keyword::parse(&value) {
Token::Keyword(keyword)
} else {
Token::Word(value)
}
}
// Decimals
c if DECIMAL_NUMERICS.contains(c) => {
let mut value = NumberType::Decimal(character.to_string());
let mut numerics = DECIMAL_NUMERICS;
if let Some(second) = cursor.second() {
if cursor.first() == Some('x')
&& HEXADECIMAL_NUMERICS
.contains(&second.to_lowercase().next().unwrap_or('.'))
{
cursor.next();
value = NumberType::Hexadecimal(String::new());
numerics = HEXADECIMAL_NUMERICS;
} else if cursor.first() == Some('o')
&& OCTAL_NUMERICS.contains(&second.to_lowercase().next().unwrap_or('.'))
{
cursor.next();
value = NumberType::Octal(String::new());
numerics = OCTAL_NUMERICS;
} else if cursor.first() == Some('b')
&& BINARY_NUMERICS.contains(&second.to_lowercase().next().unwrap_or('.'))
{
cursor.next();
value = NumberType::Binary(String::new());
numerics = BINARY_NUMERICS;
}
}
while let Some(c) = cursor.first() {
if !numerics.contains(&c.to_lowercase().next().unwrap_or('.')) {
break;
}
value += c;
cursor.next();
}
match value {
NumberType::Decimal(dec) => Token::DecimalValue(dec),
NumberType::Hexadecimal(hex) => Token::HexadecimalValue(hex),
NumberType::Octal(oct) => Token::OctalValue(oct),
NumberType::Binary(bin) => Token::BinaryValue(bin),
}
}
// Some one-character token
value => Token::Symbol(*value),
};
tokens.push(FullToken {
token: variant,
position,
});
}
tokens.push(FullToken {
token: Token::Eof,
position: cursor.position,
});
Ok(tokens)
}
/// Translates the character that follows a backslash into the character it
/// stands for.
///
/// `t`, `n`, `r` and `0` become tab, line feed, carriage return and NUL.
/// Every other character is returned unchanged.
fn escape_char(c: &char) -> char {
    /// (character after the backslash, character it denotes)
    const ESCAPES: [(char, char); 4] = [('t', '\t'), ('n', '\n'), ('r', '\r'), ('0', '\0')];

    ESCAPES
        .iter()
        .find(|pair| pair.0 == *c)
        .map_or(*c, |pair| pair.1)
}
/// Digits of a numeric literal that is being lexed, tagged with its base.
enum NumberType {
    Decimal(String),
    Hexadecimal(String),
    Octal(String),
    Binary(String),
}

/// `number += c` appends the digit `c` and keeps the base unchanged.
impl AddAssign<char> for NumberType {
    fn add_assign(&mut self, rhs: char) {
        // Push in place instead of rebuilding the whole string for every
        // appended digit.
        match self {
            NumberType::Decimal(digits)
            | NumberType::Hexadecimal(digits)
            | NumberType::Octal(digits)
            | NumberType::Binary(digits) => digits.push(rhs),
        }
    }
}
/// Errors that lexing can report. Each variant carries the [`Position`] it
/// refers to.
#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum Error {
    /// A character that cannot start any token.
    #[error("Invalid token '{}' ", .0)]
    InvalidToken(char, Position),
    /// A string literal that is still open at the end of the input.
    #[error("String literal is never finished!")]
    MissingQuotation(Position),
}

impl Error {
    /// Returns the position stored in the error, whichever variant it is.
    pub fn get_position(&self) -> &Position {
        match self {
            Error::InvalidToken(_, position) | Error::MissingQuotation(position) => position,
        }
    }
}