397 lines
12 KiB
Rust
397 lines
12 KiB
Rust
use std::{fmt::Debug, ops::AddAssign, str::Chars};
|
|
|
|
/// Digit characters accepted in a binary literal.
static BINARY_NUMERICS: &[char] = &['0', '1'];

/// Digit characters accepted in an octal literal.
static OCTAL_NUMERICS: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7'];

/// Digit characters accepted in a decimal literal.
static DECIMAL_NUMERICS: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];

/// Digit characters accepted in a hexadecimal literal.
///
/// Only lowercase letters are listed; callers lowercase the candidate
/// character before looking it up here.
static HEXADECIMAL_NUMERICS: &[char] = &[
    // decimal digits
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // letter digits
    'a', 'b', 'c', 'd', 'e', 'f',
];
|
|
|
|
/// Words that [`Keyword::parse`] recognises as keywords rather than plain
/// identifiers.
#[derive(Eq, PartialEq, Clone, PartialOrd, Ord, Hash, Debug)]
pub enum Keyword {
    Function,
    End,
    Local,
    Global,
    Return,
    If,
    ElseIf,
    Else,
    Then,
    True,
    False,
    Nil,
    Not,
    For,
    Do,
    Break,
    GoTo,
}

impl Keyword {
    /// Looks up the keyword spelled exactly as `from`.
    ///
    /// The comparison is case-sensitive. Returns `None` when `from` is not a
    /// keyword.
    pub fn parse(from: &str) -> Option<Keyword> {
        let keyword = match from {
            "function" => Keyword::Function,
            "end" => Keyword::End,
            "local" => Keyword::Local,
            "global" => Keyword::Global,
            "return" => Keyword::Return,
            "if" => Keyword::If,
            "elseif" => Keyword::ElseIf,
            "else" => Keyword::Else,
            "then" => Keyword::Then,
            "true" => Keyword::True,
            "false" => Keyword::False,
            "nil" => Keyword::Nil,
            "not" => Keyword::Not,
            "for" => Keyword::For,
            "do" => Keyword::Do,
            "break" => Keyword::Break,
            "goto" => Keyword::GoTo,
            _ => return None,
        };
        Some(keyword)
    }

    /// Source spelling of the keyword.
    ///
    /// This must stay the exact inverse of [`Keyword::parse`]: every string
    /// returned here has to parse back into the same variant.
    fn as_str(&self) -> &'static str {
        match self {
            Keyword::Function => "function",
            Keyword::End => "end",
            Keyword::Local => "local",
            Keyword::Global => "global",
            Keyword::Return => "return",
            Keyword::If => "if",
            // Previously rendered as "elif", which `parse` does not accept.
            Keyword::ElseIf => "elseif",
            Keyword::Else => "else",
            Keyword::Then => "then",
            Keyword::True => "true",
            Keyword::False => "false",
            Keyword::Nil => "nil",
            Keyword::Not => "not",
            Keyword::For => "for",
            Keyword::Do => "do",
            Keyword::Break => "break",
            Keyword::GoTo => "goto",
        }
    }
}

/// Renders the keyword as it is spelled in source text.
///
/// Implementing `Display` (rather than `ToString` directly) keeps
/// `keyword.to_string()` working through the standard blanket impl and
/// additionally allows `{}` formatting.
impl std::fmt::Display for Keyword {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
|
|
|
|
#[derive(Eq, PartialEq, Clone, PartialOrd, Ord, Hash, Debug)]
|
|
pub enum Token {
|
|
/// Word-like-values
|
|
Word(String),
|
|
Keyword(Keyword),
|
|
/// Number in the decimal base
|
|
DecimalValue(String),
|
|
/// Integer number in the hexadecimal base
|
|
HexadecimalValue(String),
|
|
/// Integer number in the octal base
|
|
OctalValue(String),
|
|
/// Integer number in the binary base
|
|
BinaryValue(String),
|
|
/// Some string literal that was surrounded by "double-quotes".
|
|
StringLit(String),
|
|
|
|
/// Special one-character symbol
|
|
Symbol(char),
|
|
|
|
Whitespace(String),
|
|
Comment(String),
|
|
Eof,
|
|
}
|
|
|
|
/// Converts a token into its `Debug` rendering.
///
/// Note that this is the derived `{:?}` output of the token, which is not the
/// same text as `Token::to_string` produces.
impl From<Token> for String {
    fn from(token: Token) -> Self {
        let rendered = format!("{:?}", token);
        rendered
    }
}
|
|
|
|
impl Token {
|
|
pub fn len(&self) -> usize {
|
|
self.to_string().len()
|
|
}
|
|
}
|
|
|
|
impl ToString for Token {
|
|
fn to_string(&self) -> String {
|
|
match &self {
|
|
Token::Word(ident) => ident.clone(),
|
|
Token::DecimalValue(val) => val.to_string(),
|
|
Token::HexadecimalValue(val) => format!("0x{}", val),
|
|
Token::OctalValue(val) => format!("0o{}", val),
|
|
Token::BinaryValue(val) => format!("0b{}", val),
|
|
Token::StringLit(lit) => format!("\"{}\"", lit),
|
|
Token::Eof => String::new(),
|
|
Token::Whitespace(val) => val.clone(),
|
|
Token::Comment(val) => format!("--{}", val.clone()),
|
|
Token::Symbol(val) => val.to_string(),
|
|
Token::Keyword(keyword) => keyword.to_string(),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// A token with a position
|
|
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
|
pub struct FullToken {
|
|
pub token: Token,
|
|
pub position: Position,
|
|
}
|
|
|
|
impl Debug for FullToken {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
f.write_fmt(format_args!("{:?} {:?}", self.token, self.position,))
|
|
}
|
|
}
|
|
|
|
/// (Column, Line)
///
/// Positions are ordered by line first and column second. `Ord` is
/// implemented by hand because deriving it would compare the fields in
/// declaration order (column first), which disagreed with `PartialOrd`.
#[derive(Clone, Copy, Hash, PartialEq, Eq)]
pub struct Position(pub u32, pub u32);

impl Position {
    /// Returns this position moved `num` columns to the right on the same
    /// line.
    pub fn add(&self, num: u32) -> Position {
        Position(self.0 + num, self.1)
    }

    /// Returns this position moved `num` columns to the left on the same
    /// line.
    ///
    /// The column stops at 0 instead of underflowing when `num` is larger
    /// than the current column.
    pub fn sub(&self, num: u32) -> Position {
        Position(self.0.saturating_sub(num), self.1)
    }
}

impl Ord for Position {
    /// Compares by line, then by column.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.1
            .cmp(&other.1)
            .then_with(|| self.0.cmp(&other.0))
    }
}

impl PartialOrd for Position {
    /// Delegates to [`Ord::cmp`] so both orderings always agree.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

/// Formats as `Ln <line>, Col <column + 1>`.
impl Debug for Position {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Ln {}, Col {}", self.1, self.0 + 1)
    }
}
|
|
|
|
pub struct Cursor<'a> {
|
|
pub position: Position,
|
|
pub char_stream: Chars<'a>,
|
|
}
|
|
|
|
impl<'a> Cursor<'a> {
|
|
pub fn next(&mut self) -> Option<char> {
|
|
let next = self.char_stream.next();
|
|
if let Some('\n') = next {
|
|
self.position.1 += 1;
|
|
self.position.0 = 0;
|
|
}
|
|
self.position.0 += 1;
|
|
next
|
|
}
|
|
|
|
fn first(&mut self) -> Option<char> {
|
|
// `.next()` optimizes better than `.nth(0)`
|
|
self.char_stream.clone().next()
|
|
}
|
|
|
|
#[allow(dead_code)] // Is this actually needed?
|
|
fn second(&mut self) -> Option<char> {
|
|
// `.next()` optimizes better than `.nth(1)`
|
|
let mut stream = self.char_stream.clone();
|
|
stream.next();
|
|
stream.next()
|
|
}
|
|
}
|
|
|
|
/// Take source text and produce a list of [`FullToken`]s from it, ie.
|
|
/// tokenizing it.
|
|
pub fn tokenize<T: Into<String>>(to_tokenize: T) -> Result<Vec<FullToken>, Error> {
|
|
let to_tokenize = to_tokenize.into();
|
|
let mut cursor = Cursor {
|
|
char_stream: to_tokenize.chars(),
|
|
position: Position(0, 1),
|
|
};
|
|
|
|
let mut tokens = Vec::new();
|
|
|
|
while let Some(character) = &cursor.next() {
|
|
// Save "current" token first character position
|
|
let position = cursor.position.sub(1);
|
|
|
|
let variant = match character {
|
|
// Whitespace
|
|
w if w.is_whitespace() => {
|
|
let mut whitespace = String::from(*w);
|
|
while let Some(w) = cursor.first() {
|
|
if !w.is_whitespace() {
|
|
break;
|
|
}
|
|
whitespace.push(cursor.next().unwrap());
|
|
}
|
|
Token::Whitespace(whitespace)
|
|
}
|
|
// Comments
|
|
'-' if cursor.first() == Some('-') => {
|
|
cursor.next();
|
|
|
|
let mut comment = String::new();
|
|
while !matches!(cursor.first(), Some('\n') | None) {
|
|
if let Some(c) = cursor.next() {
|
|
comment.push(c);
|
|
}
|
|
}
|
|
Token::Comment(comment)
|
|
}
|
|
'\"' => {
|
|
let mut value = String::new();
|
|
let mut escape_next = false;
|
|
while cursor.first().is_some()
|
|
&& (cursor.first() != Some(*character) || escape_next)
|
|
{
|
|
if cursor.first() == Some('\\') && !escape_next {
|
|
cursor.next(); // Consume backslash and always add next character
|
|
escape_next = true;
|
|
} else {
|
|
let c = &cursor.next().unwrap();
|
|
if escape_next {
|
|
value += &escape_char(&c).to_string();
|
|
} else {
|
|
value += &c.to_string();
|
|
}
|
|
escape_next = false;
|
|
}
|
|
}
|
|
if cursor.first() == Some(*character) {
|
|
cursor.next();
|
|
} else {
|
|
return Err(Error::MissingQuotation(position));
|
|
}
|
|
match character {
|
|
'\"' => Token::StringLit(value),
|
|
_ => unreachable!(),
|
|
}
|
|
}
|
|
// "words"
|
|
c if c.is_alphabetic() => {
|
|
let mut value = character.to_string();
|
|
while let Some(c) = cursor.first() {
|
|
if !(c.is_ascii_alphanumeric() || c == '_') {
|
|
break;
|
|
}
|
|
value += &c.to_string();
|
|
cursor.next();
|
|
}
|
|
|
|
if let Some(keyword) = Keyword::parse(&value) {
|
|
Token::Keyword(keyword)
|
|
} else {
|
|
Token::Word(value)
|
|
}
|
|
}
|
|
// Decimals
|
|
c if DECIMAL_NUMERICS.contains(c) => {
|
|
let mut value = NumberType::Decimal(character.to_string());
|
|
let mut numerics = DECIMAL_NUMERICS;
|
|
if let Some(second) = cursor.second() {
|
|
if cursor.first() == Some('x')
|
|
&& HEXADECIMAL_NUMERICS
|
|
.contains(&second.to_lowercase().next().unwrap_or('.'))
|
|
{
|
|
cursor.next();
|
|
value = NumberType::Hexadecimal(String::new());
|
|
numerics = HEXADECIMAL_NUMERICS;
|
|
} else if cursor.first() == Some('o')
|
|
&& OCTAL_NUMERICS.contains(&second.to_lowercase().next().unwrap_or('.'))
|
|
{
|
|
cursor.next();
|
|
value = NumberType::Octal(String::new());
|
|
numerics = OCTAL_NUMERICS;
|
|
} else if cursor.first() == Some('b')
|
|
&& BINARY_NUMERICS.contains(&second.to_lowercase().next().unwrap_or('.'))
|
|
{
|
|
cursor.next();
|
|
value = NumberType::Binary(String::new());
|
|
numerics = BINARY_NUMERICS;
|
|
}
|
|
}
|
|
while let Some(c) = cursor.first() {
|
|
if !numerics.contains(&c.to_lowercase().next().unwrap_or('.')) {
|
|
break;
|
|
}
|
|
value += c;
|
|
cursor.next();
|
|
}
|
|
match value {
|
|
NumberType::Decimal(dec) => Token::DecimalValue(dec),
|
|
NumberType::Hexadecimal(hex) => Token::HexadecimalValue(hex),
|
|
NumberType::Octal(oct) => Token::OctalValue(oct),
|
|
NumberType::Binary(bin) => Token::BinaryValue(bin),
|
|
}
|
|
}
|
|
// Some one-character token
|
|
value => Token::Symbol(*value),
|
|
};
|
|
|
|
tokens.push(FullToken {
|
|
token: variant,
|
|
position,
|
|
});
|
|
}
|
|
|
|
tokens.push(FullToken {
|
|
token: Token::Eof,
|
|
position: cursor.position,
|
|
});
|
|
|
|
Ok(tokens)
|
|
}
|
|
|
|
/// Translates the character that follows a backslash into the character it
/// stands for.
///
/// `t`, `n`, `r` and `0` map to tab, newline, carriage return and NUL. Every
/// other character is returned unchanged.
fn escape_char(c: &char) -> char {
    let escaped = *c;
    match escaped {
        '0' => '\0',
        'n' => '\n',
        'r' => '\r',
        't' => '\t',
        other => other,
    }
}
|
|
|
|
/// Digits of a numeric literal collected so far, tagged with the base they
/// are written in.
enum NumberType {
    Decimal(String),
    Hexadecimal(String),
    Octal(String),
    Binary(String),
}

/// `number += digit` appends `digit` to the collected digits and keeps the
/// base unchanged.
impl AddAssign<char> for NumberType {
    fn add_assign(&mut self, rhs: char) {
        // Push in place: the previous version cloned the whole digit string
        // and allocated a temporary `String` for every appended character.
        match self {
            NumberType::Decimal(digits)
            | NumberType::Hexadecimal(digits)
            | NumberType::Octal(digits)
            | NumberType::Binary(digits) => digits.push(rhs),
        }
    }
}
|
|
|
|
#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
|
pub enum Error {
|
|
#[error("Invalid token '{}' ", .0)]
|
|
InvalidToken(char, Position),
|
|
#[error("String literal is never finished!")]
|
|
MissingQuotation(Position),
|
|
}
|
|
|
|
impl Error {
|
|
pub fn get_position(&self) -> &Position {
|
|
match self {
|
|
Error::InvalidToken(_, pos) => pos,
|
|
Error::MissingQuotation(pos) => pos,
|
|
}
|
|
}
|
|
}
|