253 lines
7.9 KiB
C++
253 lines
7.9 KiB
C++
#include "tokens.h"
|
|
|
|
#include <string>
|
|
#include <cctype>
|
|
#include <vector>
|
|
#include <iostream>
|
|
#include <sstream>
|
|
#include <optional>
|
|
|
|
/// Reports whether `character` is a space, tab, line feed or carriage return.
///
/// The argument is taken by value: it is only read, so there is no reason to
/// demand a mutable lvalue, and temporaries / const characters are accepted.
///
/// @param character  The character to classify.
/// @return true for ' ', '\t', '\n' and '\r'; false for anything else.
static bool iswhitespace(char character) {
    switch (character) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
|
|
|
|
/// Decodes the body of a backslash escape sequence (the text that follows
/// the backslash).
///
/// Recognised forms:
///   - "n", "r", "t"  -> line feed, carriage return, tab
///   - "\\"           -> a single backslash
///   - one to three octal digits (0-7) -> the character with that code
///
/// @param inspected  Escape body without the leading backslash.
/// @return The decoded character, or an empty optional when `inspected` is
///         empty or is not one of the forms above.
static std::optional<char> get_escaped(std::string_view inspected) {
    if (inspected == "n")
        return '\n';
    if (inspected == "r")
        return '\r';
    if (inspected == "t")
        return '\t';
    if (inspected == "\\")
        return '\\';

    // An octal escape is between one and three digits long.
    if (inspected.empty() || inspected.size() > 3)
        return {};

    unsigned int value = 0;
    for (char digit : inspected) {
        // Only 0-7 are octal digits; '8' and '9' must be rejected.
        if (digit < '0' || digit > '7')
            return {};
        value = value * 8 + static_cast<unsigned int>(digit - '0');
    }

    // Codes above 0377 do not fit in a char and are truncated by the cast.
    return static_cast<char>(value);
}
|
|
|
|
namespace token {
|
|
/// Maps a token type to its display name.
///
/// @param type  The token type to describe.
/// @return The name used in diagnostics for `type`, or "Unknown" when the
///         value is not one of the cases handled here.
std::string type_name(Type& type) {
    const char* label = "Unknown";

    switch (type) {
    case token::Type::Ident:         label = "Ident";      break;
    case token::Type::Symbol:        label = "Symbol";     break;
    case token::Type::LiteralInt:    label = "LiteralInt"; break;
    case token::Type::LiteralStr:    label = "LiteralStr"; break;

    case token::Type::ReturnKeyword: label = "Return";     break;
    case token::Type::IfKeyword:     label = "If";         break;
    case token::Type::ElseKeyword:   label = "Else";       break;

    case token::Type::Whitespace:    label = "Whitespace"; break;

    case token::Type::Eof:           label = "EOF";        break;

    default:
        break;
    }

    return label;
}
|
|
|
|
std::string Token::formatted() {
|
|
std::stringstream out{ "" };
|
|
out << type_name(this->type);
|
|
out << "(" << this->content << ")";
|
|
out << " at line " << this->metadata.start.line + 1
|
|
<< " col " << this->metadata.start.col + 1;
|
|
|
|
return out.str();
|
|
}
|
|
|
|
/// Writes the token's formatted() description to `stream`.
///
/// @return `stream`, so insertions can be chained.
std::ostream& operator<<(std::ostream& stream, Token& token) {
    return stream << token.formatted();
}
|
|
|
|
/// Combines two source spans into a single span covering both.
///
/// The result starts at whichever of the two start positions comes first and
/// ends at whichever of the two end positions comes last. Positions are
/// ordered by line first and by column second, so the line and column of each
/// resulting position always come from the same original position.
///
/// @param meta   First span; its filename is used for the result.
/// @param other  Second span.
/// @return The merged span.
Metadata operator+(Metadata meta, Metadata other) {
    // Strict ordering of positions: earlier line wins, then earlier column.
    const auto precedes = [](const Position& lhs, const Position& rhs) {
        return lhs.line < rhs.line
            || (lhs.line == rhs.line && lhs.col < rhs.col);
    };

    return Metadata{
        precedes(other.start, meta.start) ? other.start : meta.start,
        // The end of the merged span must come from the operands' *end*
        // positions, not from their starts.
        precedes(meta.end, other.end) ? other.end : meta.end,
        meta.filename };
}
|
|
|
|
/// Returns a copy of `meta` whose end column is moved `length` columns to the
/// right; the start position, end line and filename are carried over as-is.
///
/// @param meta    Span to extend.
/// @param length  Number of columns to add to the end column.
/// @return The extended span.
Metadata operator+(Metadata& meta, int length) {
    Position extended_end{ meta.end.line, meta.end.col + length };
    return Metadata{ meta.start, extended_end, meta.filename };
}
|
|
|
|
/// Creates a stream over `tokens` with the cursor on the first token.
TokenStream::TokenStream(std::vector<Token>& tokens)
    : m_tokens(tokens), m_position(0) {}
|
|
|
|
/// Returns the token `length` positions away from the cursor without moving
/// the cursor.
///
/// @param length  Offset relative to the current position (may be negative).
/// @return A copy of the token at that position, or an Eof token with empty
///         content and metadata when the position is outside the stream.
Token TokenStream::peek(int length) {
    const int new_pos = m_position + length;

    // Valid indices are [0, size). The upper bound must be checked with >=,
    // otherwise new_pos == size would index one past the last element.
    if (new_pos < 0 || new_pos >= static_cast<int>(m_tokens.size())) {
        return Token{ Type::Eof, {}, {} };
    }
    return m_tokens[new_pos];
}
|
|
|
|
/// Returns the token at the cursor without consuming it; shorthand for
/// peek(0).
Token TokenStream::peek() {
    return peek(0);
}
|
|
|
|
/// Consumes and returns the token at the cursor.
///
/// After stepping past the returned token, the cursor also skips every
/// Whitespace token that follows, stopping at the first non-whitespace token
/// or at the end of the stream.
///
/// @return The token that was at the cursor when the call was made.
Token TokenStream::next() {
    Token current = peek(0);

    // Step over the returned token, then over any run of whitespace.
    for (++m_position;
         m_position < static_cast<int>(m_tokens.size()) && peek().type == Type::Whitespace;
         ++m_position) {
    }

    return current;
}
|
|
|
|
/// Consumes the next token and requires it to have the given type.
///
/// @param type  The token type that must come next.
/// @return The consumed token.
/// @throws std::runtime_error naming the expected type and the token that
///         was found instead, when the types differ.
Token TokenStream::expect(Type type) {
    Token candidate = next();

    if (candidate.type != type) {
        throw std::runtime_error("Expected " + type_name(type) + ", got " + candidate.formatted());
    }
    return candidate;
}
|
|
|
|
/// Consumes the next token and requires it to have both the given type and
/// the given content.
///
/// @param type     The token type that must come next.
/// @param content  The exact text the token must carry.
/// @return The consumed token.
/// @throws std::runtime_error naming the expected type/content and the token
///         that was found instead, when either does not match.
Token TokenStream::expect(Type type, std::string_view content) {
    Token candidate = next();

    const bool matches = candidate.type == type && candidate.content == content;
    if (!matches) {
        throw std::runtime_error("Expected " + type_name(type) + "(" + std::string{ content } + "), got " + candidate.formatted());
    }
    return candidate;
}
|
|
|
|
/// Returns the metadata of the token at the cursor (the one peek(0) yields).
Metadata TokenStream::metadata() {
    Token current = peek(0);
    return current.metadata;
}
|
|
|
|
std::vector<token::Token> tokenize(std::string_view text, std::string filename) {
|
|
std::vector<token::Token> tokens{};
|
|
|
|
uint32_t line = 0;
|
|
uint32_t line_start = 0;
|
|
|
|
int text_length = static_cast<int>(text.length());
|
|
|
|
for (int i = 0; i < text_length;) {
|
|
Position position{ line, i - line_start };
|
|
Metadata meta{ position, position, filename };
|
|
|
|
char c = text[i];
|
|
|
|
if (std::isdigit(c)) {
|
|
std::string content{};
|
|
do {
|
|
content += c;
|
|
if ((i + 1) >= text_length) break;
|
|
c = text[++i];
|
|
} while (std::isdigit(c));
|
|
tokens.push_back(token::Token{ token::Type::LiteralInt, content, meta + content.size() });
|
|
}
|
|
else if (c == '\"') {
|
|
std::string content{};
|
|
c = text[++i]; // Skip initial "
|
|
do {
|
|
if (c == '\\') {
|
|
std::string escaped_content{};
|
|
if ((i + 1) >= text_length) break;
|
|
auto potential = get_escaped(escaped_content + text[++i]);
|
|
while (potential.has_value() && (i + 1) < text_length) {
|
|
escaped_content += text[i];
|
|
potential = get_escaped(escaped_content + text[++i]);
|
|
}
|
|
if (escaped_content.size() > 0) {
|
|
auto escaped = get_escaped(escaped_content);
|
|
if (escaped.has_value())
|
|
content += *escaped;
|
|
}
|
|
|
|
c = text[i];
|
|
}
|
|
else {
|
|
content += c;
|
|
if ((i + 1) >= text_length) break;
|
|
c = text[++i];
|
|
}
|
|
} while (c != '\"');
|
|
i++; // Skip second "
|
|
tokens.push_back(token::Token{ token::Type::LiteralStr, content, meta + (content.size() + 2) });
|
|
}
|
|
else if (std::isalpha(c) || c == '_') {
|
|
std::string content{};
|
|
do {
|
|
content += c;
|
|
if ((i + 1) >= text_length) break;
|
|
c = text[++i];
|
|
} while (std::isalnum(c) || c == '_');
|
|
|
|
token::Type type = token::Type::Ident;
|
|
if (content == "return") {
|
|
type = token::Type::ReturnKeyword;
|
|
}
|
|
else if (content == "if") {
|
|
type = token::Type::IfKeyword;
|
|
}
|
|
else if (content == "else") {
|
|
type = token::Type::ElseKeyword;
|
|
}
|
|
tokens.push_back(token::Token{ type, content, meta + content.size() });
|
|
}
|
|
else if (iswhitespace(c)) {
|
|
std::string content{};
|
|
do {
|
|
if (c == '\n') {
|
|
line++;
|
|
line_start = i + 1;
|
|
}
|
|
content += c;
|
|
if ((i + 1) >= text_length) break;
|
|
c = text[++i];
|
|
} while (iswhitespace(c));
|
|
tokens.push_back(token::Token{ token::Type::Whitespace, content, meta + content.size() });
|
|
}
|
|
else {
|
|
tokens.push_back(token::Token{ token::Type::Symbol, std::string{c}, meta });
|
|
i++;
|
|
}
|
|
}
|
|
|
|
Position position{ line, static_cast<uint32_t>(text.length()) - line_start };
|
|
Metadata meta{ position, position, filename };
|
|
|
|
tokens.push_back(token::Token{ token::Type::Eof, {}, meta });
|
|
|
|
return tokens;
|
|
}
|
|
} |