Compare commits

...

10 Commits

Author SHA1 Message Date
866d78e6d1 Compile vararg 2026-04-11 23:18:13 +03:00
8646d5c0d0 Parse varargs 2026-04-11 22:18:02 +03:00
3f61e3749e Allow anonymous parameters 2026-04-11 22:11:52 +03:00
5dc6e8ca16 Modify printf to take a number additionally 2026-04-11 22:07:48 +03:00
96e4d6232f Fix tokenization of long words 2026-04-11 22:06:15 +03:00
bd76e8676f Add literal strings 2026-04-11 22:03:38 +03:00
ac7731446e Add pointer types 2026-04-11 21:45:54 +03:00
6855360a97 Add void and char types 2026-04-11 21:40:59 +03:00
7cf752f67b Add forward declarations 2026-04-11 21:34:44 +03:00
07c0059c5c Remove unused forward-declaration 2026-04-11 19:18:46 +03:00
11 changed files with 261 additions and 52 deletions

59
README.md Normal file
View File

@ -0,0 +1,59 @@
# Simple C-compiler
This is a simple work-in-progress C-compiler (or C-like) written in C++ with the
goal of learning to write C++ better in the future, and to provide reference for
my knowledge about modern C++ programming.
At the time of writing, a simple Fibonacci sequence program can already be
compiled and executed; see [`test.c`](./test.c).
As far as compiler-design goes, this project still falls behind my other
project, [Reid-LLVM](https://git.teascade.net/teascade/reid-llvm), which is
significantly more capable and more robust.
## Structure of the program
The program is structured into several different stages, all of which are
orchestrated via `main.cpp`.
Currently the stages are as follows:
1. Firstly, the program is **tokenized**. This stage could also be called the
lexer, depending on your preference. In this stage, the source code for the
program is transformed into discrete tokens, which are easier to work with
during the parsing phase than raw text. The code for this stage is mostly in
[`src/tokens.cpp`](src/tokens.cpp).
2. **TODO:** Preprocessing stage hasn't yet been developed, but it will go here.
3. Then the program is **parsed**. This is the stage where the tokens from the
previous stage(s) are converted into an Abstract Syntax Tree (AST), which is
a format that is easier for the computer to process. The AST itself lives in
[`src/ast.h`](src/ast.h), and the code for the parsing phase lives in
[`src/parsing.cpp`](src/parsing.cpp).
4. **TODO:** Typechecking phase hasn't yet been developed, but it will go here.
5. Finally the program is **compiled**, or in other words **code-generated**,
hence why this is the **codegen** stage. This is where the AST from the
previous stages is taken and LLVM Intermediate Representation is produced
using LLVM-bindings.
## Compiling and running the program
In order to compile the program, you need the following:
- CMake
- C++20 (or newer) capable compiler
- LLVM 21.1.0 or newer
And in order to run the compiled program, you also need:
- LLVM 21.1.0 or newer (as it is dynamically linked)
- `whereis`-utility in `$PATH`
- `ld`-utility in `$PATH`
Then, to compile the program you run:
```sh
cmake -Bbuild
make -C build
```
and to run the program, simply run `./build/llvm_c_compiler`. This will read a
file called `test.c` from `$PWD` and produce two files (`test.o` and `test`).
Of these, `test` is the resulting executable, compiled from the original
`test.c`.

View File

@ -9,6 +9,12 @@ namespace AST {
return out.str();
}
/// @brief Renders this string literal as text.
/// @return The stored value wrapped in double quotes. The value is emitted
///         exactly as stored; no escaping is applied.
std::string StringLiteralExpression::formatted() {
    std::string quoted{ "\"" };
    quoted += this->m_value;
    quoted += "\"";
    return quoted;
}
std::string ValueReferenceExpression::formatted() {
return this->m_name;
}
@ -80,17 +86,26 @@ namespace AST {
for (auto& param : this->m_params) {
if (counter++ > 0)
out << ", ";
out << param.second->formatted() << " " << param.first;
out << param.second->formatted();
if (param.first) {
out << " " << *param.first;
}
}
if (this->m_is_vararg) {
if (counter > 0)
out << ", ";
out << "...";
}
out << ") -> ";
out << this->m_return_ty->formatted();
out << " {\n";
for (auto& statement : this->m_statements) {
out << " " << statement->formatted() << "\n";
if (this->m_statements) {
out << " {\n";
for (auto& statement : *this->m_statements) {
out << " " << statement->formatted() << "\n";
}
out << "}";
}
out << "}";
return out.str();
}
}

View File

@ -41,6 +41,16 @@ namespace AST {
virtual codegen::StackValue codegen(codegen::Builder& builder, codegen::Scope& scope) override;
};
/// @brief AST node representing a string literal expression.
class StringLiteralExpression : public Expression {
private:
    /// Contents of the literal, as passed to the constructor.
    std::string m_value;
public:
    /// @param meta  Source metadata, forwarded to the Expression base.
    /// @param value Contents of the literal. Taken by value and moved into the
    ///              node so the string is not copied a second time.
    StringLiteralExpression(token::Metadata meta, std::string value)
        : Expression{ meta }, m_value{ std::move(value) } {}
    virtual ~StringLiteralExpression() override = default;
    /// @brief Textual rendering of this node.
    virtual std::string formatted() override;
    /// @brief Generates code for this literal using the given builder and scope.
    virtual codegen::StackValue codegen(codegen::Builder& builder, codegen::Scope& scope) override;
};
class ValueReferenceExpression : public Expression {
private:
std::string m_name;
@ -165,19 +175,22 @@ namespace AST {
class Function : public TopLevelStatement {
private:
std::unique_ptr<types::Type> m_return_ty;
std::vector<std::pair<std::string, std::unique_ptr<types::Type>>> m_params;
std::vector<std::pair<std::optional<std::string>, std::unique_ptr<types::Type>>> m_params;
bool m_is_vararg;
std::string m_name;
std::vector<std::unique_ptr<Statement>> m_statements;
std::optional<std::vector<std::unique_ptr<Statement>>> m_statements;
public:
Function(
token::Metadata meta,
std::unique_ptr<types::Type> return_ty,
std::vector<std::pair<std::string, std::unique_ptr<types::Type>>> params,
std::vector<std::pair<std::optional<std::string>, std::unique_ptr<types::Type>>> params,
bool is_vararg,
std::string name,
std::vector<std::unique_ptr<Statement>> statements)
std::optional<std::vector<std::unique_ptr<Statement>>> statements)
: TopLevelStatement{ meta }
, m_return_ty{ std::move(return_ty) }
, m_params{ std::move(params) }
, m_is_vararg{ is_vararg }
, m_name{ name }
, m_statements{ std::move(statements) } {
}

View File

@ -22,7 +22,19 @@ namespace AST {
return codegen::StackValue{
llvm::ConstantInt::get(ty, this->m_value),
std::unique_ptr<types::Type>{stack_type}
std::unique_ptr<types::Type>{stack_type}
};
}
/// @brief Generates code for a string literal.
///
/// Emits the literal through `CreateGlobalString` and pairs the resulting
/// value with a pointer-to-`Char` type.
/// @param builder Builder whose IR builder is used to create the global string.
/// @return The created value together with its `Char*` type.
codegen::StackValue StringLiteralExpression::codegen(codegen::Builder& builder, codegen::Scope&) {
    // Take ownership of the type as soon as it is allocated, so it cannot leak
    // if anything between here and the return fails.
    std::unique_ptr<types::Type> stack_type{
        new types::PointerType{ std::make_unique<types::FundamentalType>(types::FundamentalTypeKind::Char) }
    };

    // Construct the StringRef from the std::string itself rather than from
    // c_str(): the length is taken from the string instead of being rescanned,
    // and contents after an embedded NUL are not silently dropped.
    const llvm::StringRef str{ this->m_value };

    return codegen::StackValue{
        builder.builder->CreateGlobalString(str),
        std::move(stack_type),
    };
}
@ -175,7 +187,7 @@ namespace AST {
param_ty_ptrs.push_back(std::move(param.second));
}
auto fn_ty_ptr = std::shared_ptr<types::Type>{ new types::FunctionType{ ret_ty_ptr, param_ty_ptrs } };
auto fn_ty_ptr = std::shared_ptr<types::Type>{ new types::FunctionType{ ret_ty_ptr, param_ty_ptrs, this->m_is_vararg } };
auto fn_ty = fn_ty_ptr->codegen(builder);
auto function = llvm::Function::Create(
@ -187,24 +199,28 @@ namespace AST {
scope.values[this->m_name] = codegen::StackValue{ function, fn_ty_ptr };
auto BB = llvm::BasicBlock::Create(*builder.context, "entry", function, nullptr);
builder.block = BB;
if (this->m_statements) {
auto BB = llvm::BasicBlock::Create(*builder.context, "entry", function, nullptr);
builder.block = BB;
codegen::Scope inner_scope{ scope };
codegen::Scope inner_scope{ scope };
int counter = 0;
for (auto& param : this->m_params) {
auto param_ty_ptr = param_ty_ptrs[counter];
auto arg = function->getArg(counter++);
arg->setName(param.first);
inner_scope.values[param.first] = codegen::StackValue{
arg,
param_ty_ptr,
};
}
int counter = 0;
for (auto& param : this->m_params) {
auto param_ty_ptr = param_ty_ptrs[counter];
auto arg = function->getArg(counter++);
if (param.first) {
arg->setName(*param.first);
inner_scope.values[*param.first] = codegen::StackValue{
arg,
param_ty_ptr,
};
}
}
for (auto& statement : this->m_statements) {
statement->codegen(builder, inner_scope);
for (auto& statement : *this->m_statements) {
statement->codegen(builder, inner_scope);
}
}
llvm::verifyFunction(*function);
@ -220,6 +236,10 @@ namespace types {
return builder.builder->getInt32Ty();
case FundamentalTypeKind::Bool:
return builder.builder->getInt1Ty();
case FundamentalTypeKind::Char:
return builder.builder->getInt8Ty();
case FundamentalTypeKind::Void:
return builder.builder->getVoidTy();
default:
return builder.builder->getVoidTy();
}
@ -234,7 +254,7 @@ namespace types {
auto ret_ty = this->m_ret_ty->codegen(builder);
return llvm::FunctionType::get(ret_ty, params, false);
return llvm::FunctionType::get(ret_ty, params, this->m_vararg);
}
llvm::Type* PointerType::codegen(codegen::Builder& builder) {

View File

@ -20,8 +20,6 @@
#include "tokens.h"
#include "parsing.h"
void llvm_hello_world();
std::string read_file(std::string_view filepath) {
std::ifstream input{ std::string{filepath} };
if (!input) {
@ -161,11 +159,18 @@ std::optional<CompileOutput> compile(std::string_view in_filename) {
};
}
/// @brief Stateless deleter for `std::unique_ptr<FILE, ClosePipeDeleter>`
///        handles that were opened with `popen()`.
struct ClosePipeDeleter {
    /// @brief Closes the given pipe stream with `pclose()`.
    ///
    /// The exit status returned by `pclose()` is discarded. The call operator
    /// is `noexcept` because it only calls a C function and is meant to run
    /// during destruction of the owning smart pointer.
    /// @param file Stream obtained from `popen()`; a null pointer is ignored.
    void operator()(FILE* file) const noexcept {
        if (file != nullptr) {
            pclose(file);
        }
    }
};
/// @brief Executes a given command and returns the output as std::string
std::string exec(const char* cmd) {
std::array<char, 128> buffer;
std::string result;
std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd, "r"), pclose);
std::unique_ptr<FILE, ClosePipeDeleter> pipe(popen(cmd, "r"), ClosePipeDeleter{});
if (!pipe) {
throw std::runtime_error("popen() failed!");
}

View File

@ -12,10 +12,37 @@ namespace parsing {
try {
auto token = inner.expect(token::Type::Ident);
// TODO eventually make this be potentially more than one word
std::string type_name = token.content;
std::unique_ptr<types::Type> returned{};
if (type_name == "int") {
auto ty = new types::FundamentalType{ types::FundamentalTypeKind::Int };
returned = std::unique_ptr<types::Type>{ ty };
}
else if (type_name == "char") {
auto ty = new types::FundamentalType{ types::FundamentalTypeKind::Char };
returned = std::unique_ptr<types::Type>{ ty };
}
else if (type_name == "void") {
auto ty = new types::FundamentalType{ types::FundamentalTypeKind::Void };
returned = std::unique_ptr<types::Type>{ ty };
}
else {
throw std::runtime_error("Expected type name, got " + type_name);
}
while (inner.peek().type == token::Type::Symbol && inner.peek().content == "*") {
inner.next();
auto ty = new types::PointerType{ std::move(returned) };
returned = std::unique_ptr<types::Type>{ ty };
}
stream.m_position = inner.m_position;
auto ty = new types::FundamentalType{ types::FundamentalTypeKind::Int };
return std::unique_ptr<types::Type>{ ty };
return returned;
}
catch (std::runtime_error& error) {
return std::string{ error.what() };
@ -32,6 +59,12 @@ namespace parsing {
auto expr = new AST::IntLiteralExpression{ token.metadata, std::stoi(token.content) };
return std::unique_ptr<AST::Expression>{ expr };
}
else if (token.type == token::Type::LiteralStr) {
stream.m_position = inner.m_position;
auto expr = new AST::StringLiteralExpression{ token.metadata, token.content };
return std::unique_ptr<AST::Expression>{ expr };
}
else if (token.type == token::Type::Ident) {
stream.m_position = inner.m_position;
@ -241,33 +274,60 @@ namespace parsing {
auto name_token = inner.expect(token::Type::Ident);
inner.expect(token::Type::Symbol, "(");
std::vector<std::pair<std::string, std::unique_ptr<types::Type>>> params;
std::vector<std::pair<std::optional<std::string>, std::unique_ptr<types::Type>>> params;
bool is_vararg = false;
while (inner.peek().content != ")") {
if (params.size() > 0) {
inner.expect(token::Type::Symbol, ",");
}
if (inner.peek().content == ".") {
inner.next();
inner.expect(token::Type::Symbol, ".");
inner.expect(token::Type::Symbol, ".");
is_vararg = true;
break;
}
auto param_ty = parse_type(inner).unwrap();
auto param_name = inner.expect(token::Type::Ident);
params.push_back(std::pair(param_name.content, std::move(param_ty)));
std::optional<std::string> param_name{};
if (inner.peek().type == token::Type::Ident) {
param_name = inner.expect(token::Type::Ident).content;
}
params.push_back(std::pair(param_name, std::move(param_ty)));
}
inner.expect(token::Type::Symbol, ")");
inner.expect(token::Type::Symbol, "{");
std::vector<std::unique_ptr<AST::Statement>> statements{};
std::optional<std::vector<std::unique_ptr<AST::Statement>>> statements{};
if (inner.peek().content == "{") {
inner.expect(token::Type::Symbol, "{");
auto statement = parse_statement(inner);
while (statement.ok()) {
statements.push_back(statement.unwrap());
statement = parse_statement(inner);
std::vector<std::unique_ptr<AST::Statement>> statement_list{};
auto statement = parse_statement(inner);
while (statement.ok()) {
statement_list.push_back(statement.unwrap());
statement = parse_statement(inner);
}
statements = std::optional{ std::move(statement_list) };
inner.expect(token::Type::Symbol, "}");
}
else {
inner.expect(token::Type::Symbol, ";");
}
inner.expect(token::Type::Symbol, "}");
stream.m_position = inner.m_position;
auto fun = new AST::Function{ before_meta + stream.metadata(), std::move(type), std::move(params), name_token.content, std::move(statements) };
auto fun = new AST::Function{
before_meta + stream.metadata(),
std::move(type),
std::move(params),
is_vararg,
name_token.content,
std::move(statements)
};
return std::unique_ptr<AST::TopLevelStatement>{ fun };
}
catch (std::runtime_error& error) {

View File

@ -23,6 +23,8 @@ namespace token {
return "Symbol";
case token::Type::LiteralInt:
return "LiteralInt";
case token::Type::LiteralStr:
return "LiteralStr";
case token::Type::ReturnKeyword:
return "Return";
@ -126,7 +128,9 @@ namespace token {
uint32_t line = 0;
uint32_t line_start = 0;
for (int i = 0; i < static_cast<int>(text.length());) {
int text_length = static_cast<int>(text.length());
for (int i = 0; i < text_length;) {
Position position{ line, i - line_start };
Metadata meta{ position, position, filename };
@ -136,14 +140,27 @@ namespace token {
std::string content{};
do {
content += c;
if ((i + 1) >= text_length) break;
c = text[++i];
} while (std::isdigit(c));
tokens.push_back(token::Token{ token::Type::LiteralInt, content, meta + content.size() });
}
else if (c == '\"') {
std::string content{};
c = text[++i]; // Skip initial "
do {
content += c;
if ((i + 1) >= text_length) break;
c = text[++i];
} while (c != '\"');
i++; // Skip second "
tokens.push_back(token::Token{ token::Type::LiteralStr, content, meta + (content.size() + 2) });
}
else if (std::isalpha(c)) {
std::string content{};
do {
content += c;
if ((i + 1) >= text_length) break;
c = text[++i];
} while (std::isalnum(c));
@ -167,6 +184,7 @@ namespace token {
line_start = i + 1;
}
content += c;
if ((i + 1) >= text_length) break;
c = text[++i];
} while (iswhitespace(c));
// tokens.push_back(token::Token{ token::Type::Whitespace, content });

View File

@ -11,6 +11,7 @@ namespace token {
Ident,
Symbol,
LiteralInt,
LiteralStr,
ReturnKeyword,
IfKeyword,

View File

@ -42,6 +42,12 @@ namespace types {
switch (this->m_ty) {
case FundamentalTypeKind::Int:
return "Int";
case FundamentalTypeKind::Bool:
return "Bool";
case FundamentalTypeKind::Char:
return "Char";
case FundamentalTypeKind::Void:
return "Void";
default:
return "Unknown";
}
@ -103,6 +109,12 @@ namespace types {
out << param->formatted();
}
if (this->m_vararg) {
if (counter > 0)
out << ", ";
out << "...";
}
out << ") -> " << this->m_ret_ty->formatted();
return out.str();
}
@ -134,7 +146,7 @@ namespace types {
std::string PointerType::formatted() {
std::stringstream out{ "" };
out << this->m_inner << "*";
out << this->m_inner->formatted() << "*";
return out.str();
}

View File

@ -20,7 +20,9 @@ namespace types {
enum FundamentalTypeKind {
Int,
Bool
Bool,
Char,
Void,
};
class Type {
@ -57,9 +59,10 @@ namespace types {
private:
std::shared_ptr<Type> m_ret_ty;
std::vector<std::shared_ptr<Type>> m_param_tys;
bool m_vararg;
public:
FunctionType(std::shared_ptr<Type> ret_ty, std::vector<std::shared_ptr<Type>> param_tys)
: m_ret_ty{ std::move(ret_ty) }, m_param_tys{ std::move(param_tys) } {
FunctionType(std::shared_ptr<Type> ret_ty, std::vector<std::shared_ptr<Type>> param_tys, bool vararg)
: m_ret_ty{ std::move(ret_ty) }, m_param_tys{ std::move(param_tys) }, m_vararg{ vararg } {
}
virtual ~FunctionType() override = default;
virtual std::string formatted() override;

5
test.c
View File

@ -1,3 +1,5 @@
void printf(char*, ...);
int fibonacci(int n) {
if (n < 2)
return 1;
@ -5,5 +7,6 @@ int fibonacci(int n) {
}
int main() {
return fibonacci(10);
printf("10th fibonacci number is %d!", fibonacci(10));
return 0;
}