c-compiler/src/tokens.cpp
2026-04-16 20:45:38 +03:00

253 lines
7.9 KiB
C++

#include "tokens.h"

#include <algorithm>
#include <cctype>
#include <cstdint>
#include <iostream>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>
// True for the whitespace characters the tokenizer groups into Whitespace
// tokens: space, tab, newline, carriage return (deliberately narrower than
// std::isspace, which also matches '\v' and '\f').
// Fix: take the char by value — the previous non-const `char&` parameter
// mutated nothing and rejected rvalue/const arguments for no reason.
static bool iswhitespace(char character) {
    return character == ' '
        || character == '\t'
        || character == '\n'
        || character == '\r';
}
// Translates the body of an escape sequence (the text AFTER the backslash)
// into the character it denotes. Recognizes \n, \r, \t, \\, \" and \' plus
// octal escapes of one to three octal digits (e.g. "101" -> 'A').
// Returns an empty optional when `inspected` is not a valid escape body.
// Fixes over the previous version:
//  - removed a leftover std::cout debug print fired for every candidate char;
//  - octal digits are 0-7, but the old check (`stoi(...) > 8`) let '8' through
//    and the subsequent octal stream extraction then failed, yielding '\0';
//  - added the \" and \' escapes, without which callers scanning string
//    literals terminated the literal at an escaped quote;
//  - rejects the empty string instead of feeding it to a stream extraction.
static std::optional<char> get_escaped(std::string_view inspected) {
    if (inspected == "n")
        return '\n';
    if (inspected == "r")
        return '\r';
    if (inspected == "t")
        return '\t';
    if (inspected == "\\")
        return '\\';
    if (inspected == "\"")
        return '"';
    if (inspected == "'")
        return '\'';
    if (inspected.empty() || inspected.size() > 3)
        return {};
    // Octal escape: accumulate the value directly; no stream or stoi needed.
    unsigned int value = 0;
    for (char c : inspected) {
        if (c < '0' || c > '7')
            return {};
        value = value * 8 + static_cast<unsigned int>(c - '0');
    }
    // Values above 255 wrap via the cast, matching the old static_cast.
    return static_cast<char>(value);
}
namespace token {
// Human-readable name for a token type; used when formatting diagnostics.
// Any value not listed below (if the enum gains members) reads "Unknown".
std::string type_name(Type& type) {
    switch (type) {
    case Type::Ident:         return "Ident";
    case Type::Symbol:        return "Symbol";
    case Type::LiteralInt:    return "LiteralInt";
    case Type::LiteralStr:    return "LiteralStr";
    case Type::ReturnKeyword: return "Return";
    case Type::IfKeyword:     return "If";
    case Type::ElseKeyword:   return "Else";
    case Type::Whitespace:    return "Whitespace";
    case Type::Eof:           return "EOF";
    default:                  break;
    }
    return "Unknown";
}
std::string Token::formatted() {
std::stringstream out{ "" };
out << type_name(this->type);
out << "(" << this->content << ")";
out << " at line " << this->metadata.start.line + 1
<< " col " << this->metadata.start.col + 1;
return out.str();
}
// Stream insertion delegates entirely to Token::formatted().
std::ostream& operator<<(std::ostream& stream, Token& token) {
    return stream << token.formatted();
}
// Merges two source spans into the smallest span covering both; the filename
// is taken from the left operand.
// Fixes: the end position was previously computed from the two *start*
// positions, so the merged span never extended to either operand's end.
// Positions are now compared lexicographically (line first, then column) —
// taking min/max of lines and columns independently could pair one operand's
// line with the other operand's column.
Metadata operator+(Metadata meta, Metadata other) {
    Position start = meta.start;
    if (other.start.line < start.line
        || (other.start.line == start.line && other.start.col < start.col)) {
        start = other.start;
    }
    Position end = meta.end;
    if (other.end.line > end.line
        || (other.end.line == end.line && other.end.col > end.col)) {
        end = other.end;
    }
    return Metadata{ start, end, meta.filename };
}
// Widens a span by `length` columns: the end column moves right, while the
// start position and filename are carried over unchanged.
Metadata operator+(Metadata& meta, int length) {
    Position extended_end{ meta.end.line, meta.end.col + length };
    return Metadata{ meta.start, extended_end, meta.filename };
}
// Wraps a token vector for sequential consumption; the cursor starts at
// token 0.
// NOTE(review): whether m_tokens copies or aliases `tokens` depends on the
// member's declaration in tokens.h (not visible here) — confirm before
// relying on the caller's vector outliving this stream.
TokenStream::TokenStream(std::vector<Token>& tokens)
: m_tokens{ tokens }, m_position{ 0 } {
};
// Returns the token `length` positions ahead of (or, for negative offsets,
// behind) the cursor without moving it. Out-of-range lookups yield a
// synthetic Eof token instead of touching the vector.
// Fix: the upper bound must be >=, not > — new_pos == size() previously
// indexed one past the end of m_tokens (undefined behavior).
Token TokenStream::peek(int length) {
    int new_pos = m_position + length;
    if (new_pos < 0 || new_pos >= static_cast<int>(m_tokens.size())) {
        return Token{ Type::Eof, {}, {} };
    }
    return m_tokens[new_pos];
}
// Peeks at the current token — equivalent to peek(0).
Token TokenStream::peek() {
    return peek(0);
}
// Consumes and returns the current token, then advances the cursor past any
// run of Whitespace tokens so the following peek()/next() lands on content.
Token TokenStream::next() {
    Token consumed = peek(0);
    ++m_position;
    while (m_position < static_cast<int>(m_tokens.size())
        && peek().type == Type::Whitespace) {
        ++m_position;
    }
    return consumed;
}
// Consumes the next token and verifies its type.
// Throws std::runtime_error describing the mismatch otherwise.
Token TokenStream::expect(Type type) {
    Token got = next();
    if (got.type != type) {
        throw std::runtime_error("Expected " + type_name(type) + ", got " + got.formatted());
    }
    return got;
}
// Consumes the next token and verifies both its type and its exact text.
// Throws std::runtime_error describing the mismatch otherwise.
Token TokenStream::expect(Type type, std::string_view content) {
    Token got = next();
    if (got.type != type || got.content != content) {
        throw std::runtime_error("Expected " + type_name(type) + "(" + std::string{ content } + "), got " + got.formatted());
    }
    return got;
}
// Source metadata of the current (not yet consumed) token.
Metadata TokenStream::metadata() {
    return peek().metadata;
}
// Splits `text` into a token vector, tracking zero-based line/column
// positions for diagnostics, and always appends a trailing Eof token.
// Whitespace is emitted as explicit Whitespace tokens (TokenStream::next()
// skips them).
//
// Fixes over the previous version:
//  - the do/while scanners broke out WITHOUT advancing `i` when the input
//    ended mid-token, so any text ending in a digit, identifier char, or
//    whitespace (e.g. a trailing newline) spun forever re-lexing the last
//    character; every scanner below advances `i` past each consumed char,
//    guaranteeing the outer loop makes progress;
//  - <cctype> classifiers are called with an unsigned char cast — passing a
//    negative plain char is undefined behavior.
std::vector<token::Token> tokenize(std::string_view text, std::string filename) {
    std::vector<token::Token> tokens{};
    uint32_t line = 0;
    uint32_t line_start = 0; // index of the first character of the current line
    int text_length = static_cast<int>(text.length());
    for (int i = 0; i < text_length;) {
        Position position{ line, i - line_start };
        Metadata meta{ position, position, filename };
        char c = text[i];
        if (std::isdigit(static_cast<unsigned char>(c))) {
            // Integer literal: a maximal run of decimal digits.
            std::string content{};
            while (i < text_length && std::isdigit(static_cast<unsigned char>(text[i]))) {
                content += text[i++];
            }
            tokens.push_back(token::Token{ token::Type::LiteralInt, content, meta + content.size() });
        }
        else if (c == '\"') {
            // String literal: consume through the closing quote, translating
            // backslash escapes via get_escaped().
            std::string content{};
            i++; // skip the opening "
            while (i < text_length && text[i] != '\"') {
                if (text[i] == '\\' && (i + 1) < text_length) {
                    i++; // move to the first character after the backslash
                    // Greedily extend the escape while it still names a valid
                    // sequence (handles multi-digit octal escapes like \101).
                    std::string escape{};
                    std::optional<char> translated{};
                    while (i < text_length) {
                        auto attempt = get_escaped(escape + text[i]);
                        if (!attempt.has_value())
                            break;
                        translated = attempt;
                        escape += text[i++];
                    }
                    if (translated.has_value())
                        content += *translated;
                    else if (i < text_length && text[i] != '\"')
                        content += text[i++]; // unknown escape: keep the char itself
                }
                else {
                    content += text[i++];
                }
            }
            if (i < text_length)
                i++; // skip the closing "
            // +2 accounts for the two quote characters not stored in content.
            tokens.push_back(token::Token{ token::Type::LiteralStr, content, meta + (content.size() + 2) });
        }
        else if (std::isalpha(static_cast<unsigned char>(c)) || c == '_') {
            // Identifier or keyword: [A-Za-z_][A-Za-z0-9_]*
            std::string content{};
            while (i < text_length
                && (std::isalnum(static_cast<unsigned char>(text[i])) || text[i] == '_')) {
                content += text[i++];
            }
            token::Type type = token::Type::Ident;
            if (content == "return") {
                type = token::Type::ReturnKeyword;
            }
            else if (content == "if") {
                type = token::Type::IfKeyword;
            }
            else if (content == "else") {
                type = token::Type::ElseKeyword;
            }
            tokens.push_back(token::Token{ type, content, meta + content.size() });
        }
        else if (iswhitespace(c)) {
            // Whitespace run; newlines advance the line/column bookkeeping.
            std::string content{};
            while (i < text_length) {
                char ws = text[i];
                if (!iswhitespace(ws))
                    break;
                if (ws == '\n') {
                    line++;
                    line_start = i + 1;
                }
                content += ws;
                i++;
            }
            tokens.push_back(token::Token{ token::Type::Whitespace, content, meta + content.size() });
        }
        else {
            // Any other single character becomes its own Symbol token.
            tokens.push_back(token::Token{ token::Type::Symbol, std::string{c}, meta });
            i++;
        }
    }
    // Trailing Eof marks end-of-input for TokenStream consumers.
    Position position{ line, static_cast<uint32_t>(text.length()) - line_start };
    Metadata meta{ position, position, filename };
    tokens.push_back(token::Token{ token::Type::Eof, {}, meta });
    return tokens;
}
}