diff --git a/.gitignore b/.gitignore index e660fd9..f59093e 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,7 @@ bin/ +.vscode +*.o +*.tab.hpp +*.tab.cpp +*.yy.cpp +*.output diff --git a/Makefile b/Makefile index 6b5074e..0cae049 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,28 @@ -CPPFLAGS += -std=c++20 -W -Wall -g -I include +CPPFLAGS += -std=c++20 -W -Wall -g -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -I include -.PHONY: default +CPPFILES := $(wildcard src/*.cpp) +DEPENDENCIES := $(patsubst %.cpp,%.d,$(CPPFILES)) +-include $(DEPENDENCIES) +OBJFILES := $(patsubst %.cpp,%.o,$(CPPFILES)) +OBJFILES += src/lexer.yy.o src/parser.tab.o + + +.PHONY: default clean with_coverage coverage default: bin/c_compiler -bin/c_compiler : src/cli.cpp src/compiler.cpp +bin/c_compiler : $(OBJFILES) @mkdir -p bin - g++ $(CPPFLAGS) -o bin/c_compiler $^ + g++ $(CPPFLAGS) -o $@ $^ + +%.o: %.cpp Makefile + g++ $(CPPFLAGS) -MMD -MP -c $< -o $@ + +src/parser.tab.cpp src/parser.tab.hpp: src/parser.y + bison -v -d src/parser.y -o src/parser.tab.cpp + +src/lexer.yy.cpp : src/lexer.flex src/parser.tab.hpp + flex -o src/lexer.yy.cpp src/lexer.flex with_coverage : CPPFLAGS += --coverage with_coverage : bin/c_compiler @@ -25,3 +41,8 @@ clean : @rm -rf coverage @find . -name "*.o" -delete @rm -rf bin/* + @rm -f src/*.tab.hpp + @rm -f src/*.tab.cpp + @rm -f src/*.yy.cpp + @rm -f src/*.output + diff --git a/README.md b/README.md index d31db1b..5b8e358 100644 --- a/README.md +++ b/README.md @@ -35,10 +35,9 @@ Changelog * Directly linked to ANSI C parser and lexer. * Added a "Getting started" guide and incorporated last year's feedback from Ed. * Changed the 10% of the grade (previously only for time management) to also account for code design to reward thoughtful planning. - * Improved the skeleton compiler to be more advanced by providing lexer and parser to hopefully jump-start progress and avoid unnecessary debugging. 
[WIP] + * Improved the skeleton compiler to be more advanced by integrating lexer and parser to hopefully jump-start progress and avoid unnecessary debugging. * Covered assembler directives in more details by showcasing the meaning behind an example assembly program, because that topic had always caused confusion in the past years. [WIP] - * New for 2022/2023: * Target architecture is now RISC-V rather than MIPS, in order to align with the modernised Instruction Architectures half of the module. diff --git a/compiler_tests/_example/example.c b/compiler_tests/_example/example.c index 674e87b..7594893 100644 --- a/compiler_tests/_example/example.c +++ b/compiler_tests/_example/example.c @@ -1,4 +1,4 @@ int f() { - return 5; + return; } diff --git a/docs/assembler_directives.md b/docs/assembler_directives.md new file mode 100644 index 0000000..31b392f --- /dev/null +++ b/docs/assembler_directives.md @@ -0,0 +1,11 @@ +Assembler directives +==================== +"The assembler implements a number of directives that control the assembly of instructions into an object file. These directives give the ability to include arbitrary data in the object file, control exporting of symbols, selection of sections, alignment of data, assembly options for compression, position dependent and position independent code" - quote from [RISC-V Assembler Reference](https://michaeljclark.github.io/asm.html). + +The linked guide explains in detail all available directives, but fortunately you only need a very small subset to start with and even the more advanced features only require a few additional directives. While [Godbolt](https://godbolt.org/z/vMMnWbsff) emits some directives, to see all of them (more than you actually need) you are advised to run: + +```riscv64-unknown-elf-gcc -std=c90 -pedantic -ansi -O0 -march=rv32imfd -mabi=ilp32d -S [source-file.c] -o [dest-file.s]```. 
+ +The picture below offers a quick walk-through of a very simple program with detailed annotations describing the meaning behind the included directives. Some of them are crucial (e.g. section specifiers, labels, data emitting) while others not so much (e.g. file attributes, compiler identifier, symbol types) - you will get a feel for them during the development of the compiler. Most importantly, you only need to set the correct section and provide function directives as long as you deal with local variables. **In other words, you can postpone studying this document in detail until you decide to deal with global variables.** + +![Assembler directives](./assembler_directives.png) diff --git a/docs/assembler_directives.png b/docs/assembler_directives.png new file mode 100644 index 0000000..aadf5e1 Binary files /dev/null and b/docs/assembler_directives.png differ diff --git a/docs/basic_compiler.md b/docs/basic_compiler.md new file mode 100644 index 0000000..a156cfc --- /dev/null +++ b/docs/basic_compiler.md @@ -0,0 +1,22 @@ +Basic compiler +============== + +For the first time ever, you are provided with a basic compiler that can lex, parse and generate (incorrect) code for the following program: +``` +int f() { + return; +} +``` + +The output assembly is hardcoded, so that the basic compiler passes one of the provided test cases. However, having a functioning compiler should allow you to hopefully jump-start the development of the actually interesting parts of this coursework while avoiding the common early pitfalls that students have faced in previous years. It should also allow you to better understand the underlying C90 grammar and have an easier time when adding new features. + +The provided basic compiler is able to traverse the following AST related to the above program. In order to expand its capabilities, you should develop the parser and the corresponding code generation at the same time - do not try to fully implement one before the other. 
+ +![int_main_return_tree](./int_main_return_tree.png) + + +The lexer and parser are loosely based on the "official" grammar covered [here](https://www.lysator.liu.se/c/ANSI-C-grammar-l.html) and [here](https://www.lysator.liu.se/c/ANSI-C-grammar-y.html) respectively. While they should suffice for a significant portion of features, you might need to improve them to implement the more advanced ones. If you find the grammar too complicated to understand, it is also perfectly fine to create your own simple grammar and build upon it as you add more features. + +You can follow the patterns introduced for the code generation part of the basic compiler, but you might find adjusting them to your needs to be better in the long run. You are recommended to follow the coding style that best suits you while hopefully picking up strong design skills throughout the development of your compiler. + + diff --git a/docs/c_compiler.md b/docs/c_compiler.md index 2cd09a3..ec75eb9 100644 --- a/docs/c_compiler.md +++ b/docs/c_compiler.md @@ -1,7 +1,7 @@ Main coursework: A compiler for the C language ============================================== -Your program should read C source code from a file, and write RISC-V assembly to another file. +Your program should read C source code from a file, and write corresponding RISC-V assembly to another file. Environment ----------- @@ -10,26 +10,25 @@ Environment Developing your compiler ------------------------ -If you wish to use C++, then a basic framework for building your compiler has been provided. +If you wish to use C++, then a basic framework for building your compiler has been provided. You are strongly recommended to check out its structure [here](./basic_compiler.md). Source files can be found in the [./src](../src) folder and header files can be found in the [./include](../include) folder. 
-You can test your compiler against the provided test-suite by running `./test.sh` from the top of this repo; the output should look as follows: +You can test your compiler against the provided test-suite by running [`./test.sh`](../test.sh) from the top of this repo; the output should look as follows: ```console -root@host:/workspaces/langproc-env# ./test.sh - -g++ -std=c++20 -W -Wall -g -I include -o bin/c_compiler src/cli.cpp src/compiler.cpp - +> ./test.sh +> compiler_tests/_example/example.c > Pass compiler_tests/array/declare_global.c > Fail: simulation did not exit with exit-code 0 +... ``` -By default, the first `_example/example.c` test should be passing. +By default, the first [`_example/example.c`](../compiler_tests/_example/example.c) test should be passing. -This basic framework ignores the source input file and always produces the same assembly, which loads the value `5` into `a0`. +This basic framework is only able to compile a very simple program, as described [here](./basic_compiler.md). Program build and execution --------------------------- @@ -47,16 +46,14 @@ You can assume that the command-line arguments will always be in this order, and Input ----- -The input file will be pre-processed [ANSI C](https://en.wikipedia.org/wiki/ANSI_C), also called C90 or C89. It's what's generally thought of as "classic" or "normal" C, but not the _really_ old one without function prototypes (you may never have come across that). C90 is still often used in embedded systems, and pretty much the entire Linux kernel is in C90. +The input file will be pre-processed [ANSI C](https://en.wikipedia.org/wiki/ANSI_C), also called C90 or C89. It is what is generally thought of as "classic" or "normal" C, but not the _really_ old one without function prototypes (you may never have come across that). C90 is still often used in embedded systems, and pretty much the entire Linux kernel is in C90. 
-You've mainly been taught C++, but you're probably aware of C as a subset of C++ without classes, which is a good mental model. Your programs (lexer, parser and compiler) will never be given code that has different parsing or execution semantics under C and C++ (so, for example, I won't give you code that uses `class` as an identifier). +You have mainly been taught C++, but you are probably aware of C as a subset of C++ without classes, which is a good mental model. Your programs (lexer, parser and compiler) will never be given code that has different parsing or execution semantics under C and C++ (so, for example, I will not give you code that uses `class` as an identifier). -The source code will not contain any compiler-specific or platform-specific extensions. If you pre-process a typical program (see later), you'll see many things such as `__attribute__` or `__declspec` coming from the system headers. You will not need to deal with any of these. +The source code will not contain any compiler-specific or platform-specific extensions. If you pre-process a typical program (see later), you will see many things such as `__attribute__` or `__declspec` coming from the system headers. You will not need to deal with any of these. The test inputs will be a set of files of increasing complexity and variety. The test inputs will not have syntax errors or other programming errors, so your code does not need to handle these gracefully. -[This is the "official" C90 grammar](https://www.lysator.liu.se/c/ANSI-C-grammar-y.html), presented in the form of a Yacc parser file without any specific actions linked to each rule. There is also a [corresponding Lex lexer file](https://www.lysator.liu.se/c/ANSI-C-grammar-l.html) attached. You do not need to use everything that is in there, but it can help to give you an idea of the AST constructs that you need. 
If you find the grammar too complicated to understand, it is also perfectly fine to create your own simple grammar and build upon it as you add more features. - Features ------- @@ -162,10 +159,13 @@ I then use spike to simulate the executable on RISC-V, like so: This command should produce the exit code `0`. +Assembler directives +--------------- +[You will need to consider assembler directives in your output](./assembler_directives.md) Useful links ------------ -* [Godbolt](https://godbolt.org/z/vMMnWbsff) - Great tool for viewing what a real (`gcc` in this case) RISC-V compiler would produce for a given snippet of C code. This link is pre-configured for the correct architecture (`RV32IMFD`) and ABI (`ILP32D`) that the coursework targets. Code optimisation is also disabled to best mimic what you might want your compiler to output. You can replicate Godbolt locally by running `riscv64-unknown-elf-gcc -std=c90 -pedantic -ansi -O0 -march=rv32imfd -mabi=ilp32d -S [source-file.c] -o [dest-file.s]`, which might make debugging easier for some. +* [Godbolt](https://godbolt.org/z/vMMnWbsff) - Great tool for viewing what a real (`gcc` in this case) RISC-V compiler would produce for a given snippet of C code. This link is pre-configured for the correct architecture (`RV32IMFD`) and ABI (`ILP32D`) that the coursework targets. Code optimisation is also disabled to best mimic what you might want your compiler to output. You can replicate Godbolt locally by running `riscv64-unknown-elf-gcc -std=c90 -pedantic -ansi -O0 -march=rv32imfd -mabi=ilp32d -S [source-file.c] -o [dest-file.s]`, which might make debugging and directives analysis easier for some. * [Interactive RISC-V simulator](https://creatorsim.github.io/creator) - Might be helpful when trying to work out the behaviour of certain instructions that Godbolt emits. 
@@ -175,7 +175,7 @@ Useful links * [RISC-V Assembler Reference](https://michaeljclark.github.io/asm.html) - Very useful resource containing information about structuring your output assembly files and most importantly the assembler directives - if you don't know the meaning behind `.data`, `.text`, or `.word` then definitely check this out as well as experiment with Godbolt to see how it actually emits them. -Getting started +Getting started --------------- [How to get started? (previous students' perspectives)](./starting_guide.md) diff --git a/docs/environment_guide.md b/docs/environment_guide.md index 82d66c2..1a371ed 100644 --- a/docs/environment_guide.md +++ b/docs/environment_guide.md @@ -10,10 +10,10 @@ Many students develop their compiler in VS Code, as this has good support for co ### VS Code + Docker (the most popular option) 1) Install [Docker Desktop](https://www.docker.com/products/docker-desktop/). If you are on Apple M1/M2, make sure to choose the Apple Silicon download. -2) Open VS Code and install the [Dev Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension -3) Open the folder containing this file, in VS Code +2) Open VS Code and install the [Dev Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension. +3) Open the top folder of your repo (`/langproc-cw-.../`), in VS Code. 4) Open the Command Palette in VS Code. You can do this by the shortcut `Ctrl + Shift + P` on Windows or `Cmd + Shift + P` on Mac. Alternatively, you can access this from `View -> Command Palette`. -5) Enter `>Dev Containers: Reopen in Container` into the Command Palette +5) Enter `>Dev Containers: Reopen in Container` into the Command Palette. 6) After a delay -- depending on how fast your Internet connection can download ~1GB -- you will now be in the container environment. 
For those interested, VS Code reads the container configuration from the [.devcontainer/devcontainer.json](.devcontainer/devcontainer.json) file. 7) Test that your tools are correctly setup by running `./toolchain_test.sh` in the VS Code terminal, accessible via `Terminal -> New Terminal`. Your output should look as follows: @@ -39,9 +39,9 @@ Many students develop their compiler in VS Code, as this has good support for co > Warning for Windows users: if you are running Windows and use this method, you may experience errors related to the line endings of your files. Windows uses the special characters CRLF (`\r\n`) to represent the end of a line, whereas Linux uses just LF (`\n`). As such, if you edit these files on Windows they are most likely to be saved using CRLF. See if you can change your editor to use LF file endings or, even better, see if your editor supports [EditorConfig](https://editorconfig.org/), which standardises formatting across all files based on the [.editorconfig](.editorconfig) file in the same folder as this file. 1) Install [Docker](https://www.docker.com/products/docker-desktop/). If you are on Apple M1/M2, make sure to choose the Apple Silicon download. -2) Open a terminal (Powershell on Windows; Terminal on Mac) to the folder containing this file +2) Open a terminal (Powershell on Windows; Terminal on Mac) to the folder containing this file. 3) Inside that terminal, run `docker build -t compilers_image .` -4) Once that completes, run `docker run --rm -it -v "${PWD}:/code" -w "/code" --name "compilers_env" compilers_image` +4) Once that completes, run `docker run --rm -it -v "${PWD}:/code" -w "/code" --name "compilers_env" compilers_image`. 5) You should now be inside the LangProc tools container, where you can run `./toolchain_test.sh` inside the `/code` folder to check that your tools are working correctly. Note that the folder containing this file, as well as any subdirectories, are mounted inside this container under the path `/code`. 
The output of running the command should look as follows: ```console diff --git a/docs/int_main_return_tree.png b/docs/int_main_return_tree.png new file mode 100644 index 0000000..f4986c1 Binary files /dev/null and b/docs/int_main_return_tree.png differ diff --git a/include/ast.hpp b/include/ast.hpp new file mode 100644 index 0000000..39d64b7 --- /dev/null +++ b/include/ast.hpp @@ -0,0 +1,17 @@ +#ifndef AST_HPP +#define AST_HPP + +#include +#include +#include + +#include "ast_direct_declarator.hpp" +#include "ast_function_definition.hpp" +#include "ast_identifier.hpp" +#include "ast_jump_statement.hpp" +#include "ast_node.hpp" +#include "ast_type_specifier.hpp" + +extern Node* parseAST(std::string file_name); + +#endif diff --git a/include/ast_context.hpp b/include/ast_context.hpp new file mode 100644 index 0000000..87022b5 --- /dev/null +++ b/include/ast_context.hpp @@ -0,0 +1,9 @@ +#ifndef AST_CONTEXT +#define AST_CONTEXT + +// An object of class Context is passed between AST nodes during compilation to provide adequate context +class Context { + /* TODO decide what goes inside here */ +}; + +#endif diff --git a/include/ast_direct_declarator.hpp b/include/ast_direct_declarator.hpp new file mode 100644 index 0000000..7a5f298 --- /dev/null +++ b/include/ast_direct_declarator.hpp @@ -0,0 +1,13 @@ +#ifndef AST_DIRECT_DECLARATOR +#define AST_DIRECT_DECLARATOR + +#include "ast_node.hpp" + +class DirectDeclarator : public Node { +public: + DirectDeclarator(Node* identifier); + ~DirectDeclarator() {}; + void emitRISC(std::ostream &stream, Context &context) const; +}; + +#endif diff --git a/include/ast_function_definition.hpp b/include/ast_function_definition.hpp new file mode 100644 index 0000000..3b26724 --- /dev/null +++ b/include/ast_function_definition.hpp @@ -0,0 +1,13 @@ +#ifndef AST_FUNCTION_DEFINITION_HPP +#define AST_FUNCTION_DEFINITION_HPP + +#include "ast_node.hpp" + +class FunctionDefinition : public Node { +public: + FunctionDefinition(Node* 
declaration_specifiers, Node* declarator, Node* compound_statement); + ~FunctionDefinition() {}; + void emitRISC(std::ostream &stream, Context &context) const; +}; + +#endif diff --git a/include/ast_identifier.hpp b/include/ast_identifier.hpp new file mode 100644 index 0000000..65e70a1 --- /dev/null +++ b/include/ast_identifier.hpp @@ -0,0 +1,15 @@ +#ifndef AST_IDENTIFIER +#define AST_IDENTIFIER + +#include "ast_node.hpp" + +class Identifier : public Node { +private: + std::string* identifier; +public: + Identifier(std::string* _identifier) : identifier(_identifier) {}; + ~Identifier() {delete identifier;}; + void emitRISC(std::ostream &stream, Context &context) const; +}; + +#endif diff --git a/include/ast_jump_statement.hpp b/include/ast_jump_statement.hpp new file mode 100644 index 0000000..1f25076 --- /dev/null +++ b/include/ast_jump_statement.hpp @@ -0,0 +1,13 @@ +#ifndef AST_JUMP_STATEMENT +#define AST_JUMP_STATEMENT + +#include "ast_node.hpp" + +class JumpStatement : public Node { +public: + JumpStatement() {}; + ~JumpStatement() {}; + void emitRISC(std::ostream &stream, Context &context) const; +}; + +#endif diff --git a/include/ast_node.hpp b/include/ast_node.hpp new file mode 100644 index 0000000..08bd764 --- /dev/null +++ b/include/ast_node.hpp @@ -0,0 +1,19 @@ +#ifndef AST_NODE_HPP +#define AST_NODE_HPP + +#include +#include + +#include "ast_context.hpp" + +class Node { +protected: + std::vector branches; + +public: + Node() {}; + virtual ~Node(); + virtual void emitRISC(std::ostream &stream, Context &context) const = 0; +}; + +#endif diff --git a/include/ast_type_specifier.hpp b/include/ast_type_specifier.hpp new file mode 100644 index 0000000..238eeae --- /dev/null +++ b/include/ast_type_specifier.hpp @@ -0,0 +1,15 @@ +#ifndef AST_TYPE_SPECIFIER +#define AST_TYPE_SPECIFIER + +#include "ast_node.hpp" + +class TypeSpecifier : public Node { +private: + std::string type; +public: + TypeSpecifier(std::string _type) : type(_type) {}; + ~TypeSpecifier() {}; 
+ void emitRISC(std::ostream &stream, Context &context) const {}; +}; + +#endif diff --git a/include/cli.h b/include/cli.h index 10ce52f..afaf3d4 100644 --- a/include/cli.h +++ b/include/cli.h @@ -4,6 +4,6 @@ #include #include -int parse_command_line_args(int argc, char **argv, std::string &sourcePath, std::string &outputPath); +int parseCommandLineArgs(int argc, char **argv, std::string &source_path, std::string &output_path); #endif diff --git a/src/ast_context.cpp b/src/ast_context.cpp new file mode 100644 index 0000000..4f0b361 --- /dev/null +++ b/src/ast_context.cpp @@ -0,0 +1 @@ +#include "ast_context.hpp" diff --git a/src/ast_direct_declarator.cpp b/src/ast_direct_declarator.cpp new file mode 100644 index 0000000..f07ec30 --- /dev/null +++ b/src/ast_direct_declarator.cpp @@ -0,0 +1,11 @@ +#include "ast_direct_declarator.hpp" + +DirectDeclarator::DirectDeclarator(Node* identifier) { + branches.insert(branches.end(), {identifier}); +} + +void DirectDeclarator::emitRISC(std::ostream &stream, Context &context) const { + // Emit identifier + branches[0]->emitRISC(stream, context); + stream << ":" << std::endl; +} diff --git a/src/ast_function_definition.cpp b/src/ast_function_definition.cpp new file mode 100644 index 0000000..117cf66 --- /dev/null +++ b/src/ast_function_definition.cpp @@ -0,0 +1,13 @@ +#include "ast_function_definition.hpp" + +FunctionDefinition::FunctionDefinition(Node* declaration_specifiers, Node* declarator, Node* compound_statement) { + branches.insert(branches.end(), {declaration_specifiers, declarator, compound_statement}); +} + +void FunctionDefinition::emitRISC(std::ostream &stream, Context &context) const { + // Emit declarator + branches[1]->emitRISC(stream, context); + + // Emit compound_statement + branches[2]->emitRISC(stream, context); +} diff --git a/src/ast_identifier.cpp b/src/ast_identifier.cpp new file mode 100644 index 0000000..4ef98e7 --- /dev/null +++ b/src/ast_identifier.cpp @@ -0,0 +1,5 @@ +#include "ast_identifier.hpp" + 
+void Identifier::emitRISC(std::ostream &stream, Context &context) const { + stream << *identifier; +} diff --git a/src/ast_jump_statement.cpp b/src/ast_jump_statement.cpp new file mode 100644 index 0000000..55dcea7 --- /dev/null +++ b/src/ast_jump_statement.cpp @@ -0,0 +1,10 @@ +#include "ast_jump_statement.hpp" + +void JumpStatement::emitRISC(std::ostream &stream, Context &context) const { + // TODO these lines are hardcoded for the example test to pass, you have to correct them + stream << "addi t0, zero, 0" << std::endl; + stream << "addi t0, t0, 5" << std::endl; + stream << "add a0, zero, t0" << std::endl; + stream << "ret" << std::endl; + //------------------------------------------------------------------------------------- +} diff --git a/src/ast_node.cpp b/src/ast_node.cpp new file mode 100644 index 0000000..4381172 --- /dev/null +++ b/src/ast_node.cpp @@ -0,0 +1,7 @@ +#include "ast_node.hpp" + +Node::~Node() { + for (unsigned i = 0; i < branches.size(); i++){ + delete branches[i]; + } +} diff --git a/src/ast_type_specifier.cpp b/src/ast_type_specifier.cpp new file mode 100644 index 0000000..fddbedc --- /dev/null +++ b/src/ast_type_specifier.cpp @@ -0,0 +1,2 @@ +#include "ast_type_specifier.hpp" + diff --git a/src/cli.cpp b/src/cli.cpp index 8c82799..7cb83b1 100644 --- a/src/cli.cpp +++ b/src/cli.cpp @@ -1,62 +1,52 @@ #include -int parse_command_line_args(int argc, char **argv, std::string &sourcePath, std::string &outputPath) -{ - std::string input = ""; +int parseCommandLineArgs(int argc, char **argv, std::string &source_path, std::string &output_path) { + std::string input = ""; - if ((argc <= 1) || (argv[argc - 1] == NULL) || (argv[argc - 1][0] == '-')) - { - std::cerr << "No command line arguments were provided" << std::endl; - return 1; - } - else - { - input = argv[argc - 1]; - } + if ((argc <= 1) || (argv[argc - 1] == NULL) || (argv[argc - 1][0] == '-')) { + std::cerr << "No command line arguments were provided" << std::endl; + return 1; + } + else 
{ + input = argv[argc - 1]; + } - // Prevent opterr messages from being outputted. - opterr = 0; + // Prevent opterr messages from being outputted. + opterr = 0; - // bin/c_compiler -S [source-file.c] -o [dest-file.s] - int opt; - while ((opt = getopt(argc, argv, "S:o:")) != -1) - { - switch (opt) - { - case 'S': - sourcePath = std::string(optarg); - break; - case 'o': - outputPath = std::string(optarg); - break; - case '?': - if (optopt == 'S' || optopt == 'o') - { - fprintf(stderr, "Option -%c requires an argument.\n", optopt); - } - else if (isprint(optopt)) - { - fprintf(stderr, "Unknown option `-%c'.\n", optopt); - } - else - { - fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt); - } - return 1; - } + // bin/c_compiler -S [source-file.c] -o [dest-file.s] + int opt; + while ((opt = getopt(argc, argv, "S:o:")) != -1) { + switch (opt) { + case 'S': + source_path = std::string(optarg); + break; + case 'o': + output_path = std::string(optarg); + break; + case '?': + if (optopt == 'S' || optopt == 'o') { + fprintf(stderr, "Option -%c requires an argument.\n", optopt); + } + else if (isprint(optopt)) { + fprintf(stderr, "Unknown option `-%c'.\n", optopt); + } + else { + fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt); + } + return 1; } + } - if (sourcePath.length() == 0) - { - std::cerr << "The source path -S argument was not set." << std::endl; - return 1; - } + if (source_path.length() == 0) { + std::cerr << "The source path -S argument was not set." << std::endl; + return 1; + } - if (outputPath.length() == 0) - { - std::cerr << "The output path -o argument was not set." << std::endl; - return 1; - } + if (output_path.length() == 0) { + std::cerr << "The output path -o argument was not set." 
<< std::endl; + return 1; + } - return 0; + return 0; } diff --git a/src/compiler.cpp b/src/compiler.cpp index 41d4220..b298d60 100644 --- a/src/compiler.cpp +++ b/src/compiler.cpp @@ -3,52 +3,38 @@ #include #include "cli.h" - -void compile(std::ostream &w) -{ - w << ".text" << std::endl; - w << ".globl f" << std::endl; - w << std::endl; - - w << "f:" << std::endl; - w << "addi t0, zero, 0" << std::endl; - w << "addi t0, t0, 5" << std::endl; - w << "add a0, zero, t0" << std::endl; - w << "ret" << std::endl; -} - -// TODO: uncomment the below if you're using Flex/Bison. -// extern FILE *yyin; +#include "ast.hpp" int main(int argc, char **argv) { - // Parse CLI arguments, to fetch the values of the source and output files. - std::string sourcePath = ""; - std::string outputPath = ""; - if (parse_command_line_args(argc, argv, sourcePath, outputPath)) - { - return 1; - } - - // TODO: uncomment the below lines if you're using Flex/Bison. - // This configures Flex to look at sourcePath instead of - // reading from stdin. - // yyin = fopen(sourcePath, "r"); - // if (yyin == NULL) - // { - // perror("Could not open source file"); - // return 1; - // } - - // Open the output file in truncation mode (to overwrite the contents) - std::ofstream output; - output.open(outputPath, std::ios::trunc); - - // Compile the input - std::cout << "Compiling: " << sourcePath << std::endl; - compile(output); - std::cout << "Compiled to: " << outputPath << std::endl; - - output.close(); - return 0; + // Parse CLI arguments, to fetch the values of the source and output files. 
+    std::string source_path = "";
+    std::string output_path = "";
+    if (parseCommandLineArgs(argc, argv, source_path, output_path)) {
+        return 1;
+    }
+
+    // Parse input and generate AST
+    Node* root = parseAST(source_path);
+
+    // Open the output file in truncation mode (to overwrite the contents)
+    std::ofstream output;
+    output.open(output_path, std::ios::trunc);
+
+    // Emit assembler directives
+    // TODO these are just examples ones, make sure you understand the concept of directives and correct them
+    std::vector directives = {"text", "globl f"};
+    for (auto directive : directives) {
+        output << "." << directive << "\n";
+    }
+    output << std::endl;
+
+    // Do actual compilation
+    Context context;
+    root->emitRISC(output, context);
+
+    // Close output file
+    output.close();
+
+    return 0;
 }
diff --git a/src/lexer.flex b/src/lexer.flex
new file mode 100644
index 0000000..120fe38
--- /dev/null
+++ b/src/lexer.flex
@@ -0,0 +1,152 @@
+%option noyywrap
+
+%{
+  // A lot of this lexer is based off the ANSI C grammar:
+  // https://www.lysator.liu.se/c/ANSI-C-grammar-l.html#MUL-ASSIGN
+  // Avoid error "error: `fileno' was not declared in this scope"
+  extern "C" int fileno(FILE *stream);
+
+  #include "parser.tab.hpp"
+
+  // Returns the value of the first character of a character constant such as
+  // 'a', '\n', '\101', '\x41' or L'a'; text is the whole lexeme, quotes included.
+  // strtol() cannot be applied to the lexeme itself: it begins with a quote (or L),
+  // so the conversion would stop at once and always yield 0.
+  static int charConstantValue(const char *text)
+  {
+    if (*text == 'L') { ++text; }                         // optional wide-character prefix
+    ++text;                                               // step over the opening quote
+    if (*text != '\\') { return (unsigned char)*text; }   // plain, unescaped character
+    ++text;                                               // step over the backslash
+    switch (*text) {
+      case 'a': return '\a';
+      case 'b': return '\b';
+      case 'f': return '\f';
+      case 'n': return '\n';
+      case 'r': return '\r';
+      case 't': return '\t';
+      case 'v': return '\v';
+      case 'x': return (int)strtol(text + 1, NULL, 16);   // hex digits, stops at the closing quote
+      case '0': case '1': case '2': case '3':
+      case '4': case '5': case '6': case '7':
+        return (int)strtol(text, NULL, 8);                // octal digits, stops at the closing quote
+      default:  return (unsigned char)*text;              // escaped backslash/quote/question mark, unknown escapes
+    }
+  }
+%}
+
+D			[0-9]
+L			[a-zA-Z_]
+H			[a-fA-F0-9]
+E			[Ee][+-]?{D}+
+FS			(f|F|l|L)
+IS			(u|U|l|L)*
+
+%%
+"/*"([^*]|\*+[^*/])*\*+"/"	{/* consumes comment - TODO you might want to process and emit it in your assembly for debugging */}
+
+"auto"			{return(AUTO);}
+"break"			{return(BREAK);}
+"case"			{return(CASE);}
+"char"			{return(CHAR);}
+"const"			{return(CONST);}
+"continue"		{return(CONTINUE);}
+"default"		{return(DEFAULT);}
+"do"			{return(DO);}
+"double"		{return(DOUBLE);}
+"else"			{return(ELSE);}
+"enum"			{return(ENUM);}
+"extern"		{return(EXTERN);}
+"float"			{return(FLOAT);}
+"for"			{return(FOR);}
+"goto"			{return(GOTO);}
+"if"			{return(IF);}
+"int"			{return(INT);}
+"long"			{return(LONG);}
+"register"		{return(REGISTER);}
+"return"		{return(RETURN);}
+"short"			{return(SHORT);}
+"signed"		{return(SIGNED);}
+"sizeof"		{return(SIZEOF);}
+"static"		{return(STATIC);}
+"struct"		{return(STRUCT);}
+"switch"		{return(SWITCH);}
+"typedef"		{return(TYPEDEF);}
+"union"			{return(UNION);}
+"unsigned"		{return(UNSIGNED);}
+"void"			{return(VOID);}
+"volatile"		{return(VOLATILE);}
+"while"			{return(WHILE);}
+
+{L}({L}|{D})*		{/* heap-allocated copy of the lexeme, handed to the parser through yylval */ yylval.string = new std::string(yytext); return(IDENTIFIER);}
+
+0[xX]{H}+{IS}?		{yylval.number_int = (int)strtol(yytext, NULL, 0); return(INT_CONSTANT);}
+0{D}+{IS}?		{yylval.number_int = (int)strtol(yytext, NULL, 0); return(INT_CONSTANT);}
+{D}+{IS}?		{yylval.number_int = (int)strtol(yytext, NULL, 0); return(INT_CONSTANT);}
+L?'(\\.|[^\\'])+'	{/* decode the quoted character instead of calling strtol on the quote */ yylval.number_int = charConstantValue(yytext); return(INT_CONSTANT);}
+
+{D}+{E}{FS}?		{yylval.number_float = strtod(yytext, NULL); return(FLOAT_CONSTANT);}
+{D}*"."{D}+({E})?{FS}?	{yylval.number_float = strtod(yytext, NULL); return(FLOAT_CONSTANT);}
+{D}+"."{D}*({E})?{FS}?	{yylval.number_float = strtod(yytext, NULL); return(FLOAT_CONSTANT);}
+
+L?\"(\\.|[^\\"])*\"	{/* TODO process string literal */; return(STRING_LITERAL);}
+
+"..."			{return(ELLIPSIS);}
+">>="			{return(RIGHT_ASSIGN);}
+"<<="			{return(LEFT_ASSIGN);}
+"+="			{return(ADD_ASSIGN);}
+"-="			{return(SUB_ASSIGN);}
+"*="			{return(MUL_ASSIGN);}
+"/="			{return(DIV_ASSIGN);}
+"%="			{return(MOD_ASSIGN);}
+"&="			{return(AND_ASSIGN);}
+"^="			{return(XOR_ASSIGN);}
+"|="			{return(OR_ASSIGN);}
+">>"			{return(RIGHT_OP);}
+"<<"			{return(LEFT_OP);}
+"++"			{return(INC_OP);}
+"--"			{return(DEC_OP);}
+"->"			{return(PTR_OP);}
+"&&"			{return(AND_OP);}
+"||"			{return(OR_OP);}
+"<="			{return(LE_OP);}
+">="			{return(GE_OP);}
+"=="			{return(EQ_OP);}
+"!="			{return(NE_OP);}
+";"			{return(';');}
+("{"|"<%")		{return('{');}
+("}"|"%>")		{return('}');}
+","			{return(',');}
+":"			{return(':');}
+"="			{return('=');}
+"("			{return('(');}
+")"			{return(')');}
+("["|"<:")		{return('[');}
+("]"|":>")		{return(']');}
+"."			{return('.');}
+"&"			{return('&');}
+"!"			{return('!');}
+"~"			{return('~');}
+"-"			{return('-');}
+"+"			{return('+');}
+"*"			{return('*');}
+"/"			{return('/');}
+"%"			{return('%');}
+"<"			{return('<');}
+">"			{return('>');}
+"^"			{return('^');}
+"|"			{return('|');}
+"?"			{return('?');}
+
+[ \a\b\t\v\f\n\r]		{/* ignore new lines and special sequences */}
+.			{/* ignore bad characters */}
+
+%%
+
+// Error callback declared in parser.y so that the Bison-generated parser can call it.
+// Prints the message to stderr and terminates the process with exit status 1.
+void yyerror (char const *s)
+{
+  fprintf(stderr, "Lexing error: %s\n", s);
+  exit(1);
+}
diff --git a/src/parser.y b/src/parser.y
new file mode 100644
index 0000000..caa24f5
--- /dev/null
+++ b/src/parser.y
@@ -0,0 +1,445 @@
+%code requires{
+  #include "ast.hpp"
+
+  extern Node *g_root; // A way of getting the AST out
+  extern FILE *yyin;
+
+  // This is to fix problems when generating C++
+  // We are declaring the functions provided by Flex, so
+  // that Bison generated code can call them.
+  int yylex(void);
+  void yyerror(const char *);
+}
+
+// Represents the value associated with any kind of AST node.
+%union{
+  Node* node;
+  int number_int;
+  double number_float;
+  std::string* string;
+  yytokentype token;
+}
+
+%token IDENTIFIER INT_CONSTANT FLOAT_CONSTANT STRING_LITERAL
+%token PTR_OP INC_OP DEC_OP LEFT_OP RIGHT_OP LE_OP GE_OP EQ_OP NE_OP AND_OP OR_OP
+%token MUL_ASSIGN DIV_ASSIGN MOD_ASSIGN ADD_ASSIGN SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN XOR_ASSIGN OR_ASSIGN
+%token TYPE_NAME TYPEDEF EXTERN STATIC AUTO REGISTER SIZEOF
+%token CHAR SHORT INT LONG SIGNED UNSIGNED FLOAT DOUBLE CONST VOLATILE VOID
+%token STRUCT UNION ENUM ELLIPSIS
+%token CASE DEFAULT IF ELSE SWITCH WHILE DO FOR GOTO CONTINUE BREAK RETURN
+
+%type <node> translation_unit external_declaration function_definition primary_expression postfix_expression argument_expression_list
+%type <node> unary_expression cast_expression multiplicative_expression additive_expression shift_expression relational_expression
+%type <node> equality_expression and_expression exclusive_or_expression inclusive_or_expression logical_and_expression logical_or_expression
+%type <node> conditional_expression assignment_expression expression constant_expression declaration declaration_specifiers init_declarator_list
+%type <node> init_declarator type_specifier struct_specifier struct_declaration_list struct_declaration specifier_qualifier_list struct_declarator_list
+%type <node> struct_declarator enum_specifier enumerator_list enumerator declarator direct_declarator pointer parameter_list parameter_declaration
+%type <node> identifier_list type_name abstract_declarator direct_abstract_declarator initializer initializer_list statement labeled_statement
+%type <node> compound_statement declaration_list statement_list expression_statement selection_statement iteration_statement jump_statement
+
+%type <string> unary_operator assignment_operator storage_class_specifier /* NOTE(review): no action reads these values yet; tag assumed, confirm */
+
+%type <number_int> INT_CONSTANT STRING_LITERAL
+%type <number_float> FLOAT_CONSTANT
+%type <string> IDENTIFIER
+
+// The value tags above name the %union member that carries each symbol's semantic value.
+%start ROOT
+%%
+
+ROOT
+  : translation_unit {g_root = $1;}
+
+translation_unit
+  : external_declaration {$$ = $1;}
+  | translation_unit external_declaration /* TODO no action yet: with Bison's default ($$ = $1) later declarations are dropped */
+  ;
+
+external_declaration
+  : function_definition {$$ = $1;}
+  | declaration
+  ;
+
+function_definition
+  : declaration_specifiers declarator declaration_list compound_statement
+  | declaration_specifiers declarator compound_statement {$$ = new FunctionDefinition($1, $2, $3);}
+  | declarator declaration_list compound_statement
+  | declarator compound_statement
+  ;
+
+// Expression grammar: the alternatives below have no semantic actions yet.
+primary_expression
+  : IDENTIFIER
+  | INT_CONSTANT
+  | FLOAT_CONSTANT
+  | STRING_LITERAL
+  | '(' expression ')'
+  ;
+
+postfix_expression
+  : primary_expression
+  | postfix_expression '[' expression ']'
+  | postfix_expression '(' ')'
+  | postfix_expression '(' argument_expression_list ')'
+  | postfix_expression '.' IDENTIFIER
+  | postfix_expression PTR_OP IDENTIFIER
+  | postfix_expression INC_OP
+  | postfix_expression DEC_OP
+  ;
+
+argument_expression_list
+  : assignment_expression
+  | argument_expression_list ',' assignment_expression
+  ;
+
+unary_expression
+  : postfix_expression
+  | INC_OP unary_expression
+  | DEC_OP unary_expression
+  | unary_operator cast_expression
+  | SIZEOF unary_expression
+  | SIZEOF '(' type_name ')'
+  ;
+
+unary_operator
+  : '&'
+  | '*'
+  | '+'
+  | '-'
+  | '~'
+  | '!'
+  ;
+
+cast_expression
+  : unary_expression
+  | '(' type_name ')' cast_expression
+  ;
+
+multiplicative_expression
+  : cast_expression
+  | multiplicative_expression '*' cast_expression
+  | multiplicative_expression '/' cast_expression
+  | multiplicative_expression '%' cast_expression
+  ;
+
+additive_expression
+  : multiplicative_expression
+  | additive_expression '+' multiplicative_expression
+  | additive_expression '-' multiplicative_expression
+  ;
+
+shift_expression
+  : additive_expression
+  | shift_expression LEFT_OP additive_expression
+  | shift_expression RIGHT_OP additive_expression
+  ;
+
+relational_expression
+  : shift_expression
+  | relational_expression '<' shift_expression
+  | relational_expression '>' shift_expression
+  | relational_expression LE_OP shift_expression
+  | relational_expression GE_OP shift_expression
+  ;
+
+equality_expression
+  : relational_expression
+  | equality_expression EQ_OP relational_expression
+  | equality_expression NE_OP relational_expression
+  ;
+
+and_expression
+  : equality_expression
+  | and_expression '&' equality_expression
+  ;
+
+exclusive_or_expression
+  : and_expression
+  | exclusive_or_expression '^' and_expression
+  ;
+
+inclusive_or_expression
+  : exclusive_or_expression
+  | inclusive_or_expression '|' exclusive_or_expression
+  ;
+
+logical_and_expression
+  : inclusive_or_expression
+  | logical_and_expression AND_OP inclusive_or_expression
+  ;
+
+logical_or_expression
+  : logical_and_expression
+  | logical_or_expression OR_OP logical_and_expression
+  ;
+
+conditional_expression
+  : logical_or_expression
+  | logical_or_expression '?' expression ':' conditional_expression
+  ;
+
+assignment_expression
+  : conditional_expression
+  | unary_expression assignment_operator assignment_expression
+  ;
+
+assignment_operator
+  : '='
+  | MUL_ASSIGN
+  | DIV_ASSIGN
+  | MOD_ASSIGN
+  | ADD_ASSIGN
+  | SUB_ASSIGN
+  | LEFT_ASSIGN
+  | RIGHT_ASSIGN
+  | AND_ASSIGN
+  | XOR_ASSIGN
+  | OR_ASSIGN
+  ;
+
+expression
+  : assignment_expression
+  | expression ',' assignment_expression
+  ;
+
+constant_expression
+  : conditional_expression
+  ;
+
+declaration
+  : declaration_specifiers ';'
+  | declaration_specifiers init_declarator_list ';'
+  ;
+
+declaration_specifiers
+  : storage_class_specifier
+  | storage_class_specifier declaration_specifiers
+  | type_specifier {$$ = $1;}
+  | type_specifier declaration_specifiers
+  ;
+
+init_declarator_list
+  : init_declarator
+  | init_declarator_list ',' init_declarator
+  ;
+
+init_declarator
+  : declarator
+  | declarator '=' initializer
+  ;
+
+storage_class_specifier
+  : TYPEDEF
+  | EXTERN
+  | STATIC
+  | AUTO
+  | REGISTER
+  ;
+
+type_specifier
+  : VOID
+  | CHAR
+  | SHORT
+  | INT {$$ = new TypeSpecifier("int");}
+  | LONG
+  | FLOAT
+  | DOUBLE
+  | SIGNED
+  | UNSIGNED
+  | struct_specifier
+  | enum_specifier
+  | TYPE_NAME
+  ;
+
+struct_specifier
+  : STRUCT IDENTIFIER '{' struct_declaration_list '}'
+  | STRUCT '{' struct_declaration_list '}'
+  | STRUCT IDENTIFIER
+  ;
+
+struct_declaration_list
+  : struct_declaration
+  | struct_declaration_list struct_declaration
+  ;
+
+struct_declaration
+  : specifier_qualifier_list struct_declarator_list ';'
+  ;
+
+specifier_qualifier_list
+  : type_specifier specifier_qualifier_list
+  | type_specifier
+  ;
+
+struct_declarator_list
+  : struct_declarator
+  | struct_declarator_list ',' struct_declarator
+  ;
+
+struct_declarator
+  : declarator
+  | ':' constant_expression
+  | declarator ':' constant_expression
+  ;
+
+enum_specifier
+  : ENUM '{' enumerator_list '}'
+  | ENUM IDENTIFIER '{' enumerator_list '}'
+  | ENUM IDENTIFIER
+  ;
+
+enumerator_list
+  : enumerator
+  | enumerator_list ',' enumerator
+  ;
+
+enumerator
+  : IDENTIFIER
+  | IDENTIFIER '=' constant_expression
+  ;
+
+declarator
+  : pointer direct_declarator
+  | direct_declarator {$$ = $1;}
+  ;
+
+direct_declarator
+  : IDENTIFIER {$$ = new Identifier($1);}
+  | '(' declarator ')'
+  | direct_declarator '[' constant_expression ']'
+  | direct_declarator '[' ']'
+  | direct_declarator '(' parameter_list ')'
+  | direct_declarator '(' identifier_list ')'
+  | direct_declarator '(' ')' {$$ = new DirectDeclarator($1);}
+  ;
+
+pointer
+  : '*'
+  | '*' pointer
+  ;
+
+parameter_list
+  : parameter_declaration
+  | parameter_list ',' parameter_declaration
+  ;
+
+parameter_declaration
+  : declaration_specifiers declarator
+  | declaration_specifiers abstract_declarator
+  | declaration_specifiers
+  ;
+
+identifier_list
+  : IDENTIFIER
+  | identifier_list ',' IDENTIFIER
+  ;
+
+type_name
+  : specifier_qualifier_list
+  | specifier_qualifier_list abstract_declarator
+  ;
+
+abstract_declarator
+  : pointer
+  | direct_abstract_declarator
+  | pointer direct_abstract_declarator
+  ;
+
+direct_abstract_declarator
+  : '(' abstract_declarator ')'
+  | '[' ']'
+  | '[' constant_expression ']'
+  | direct_abstract_declarator '[' ']'
+  | direct_abstract_declarator '[' constant_expression ']'
+  | '(' ')'
+  | '(' parameter_list ')'
+  | direct_abstract_declarator '(' ')'
+  | direct_abstract_declarator '(' parameter_list ')'
+  ;
+
+initializer
+  : assignment_expression
+  | '{' initializer_list '}'
+  | '{' initializer_list ',' '}'
+  ;
+
+initializer_list
+  : initializer
+  | initializer_list ',' initializer
+  ;
+
+statement
+  : labeled_statement
+  | compound_statement
+  | expression_statement
+  | selection_statement
+  | iteration_statement
+  | jump_statement {$$ = $1;}
+  ;
+
+labeled_statement
+  : IDENTIFIER ':' statement
+  | CASE constant_expression ':' statement
+  | DEFAULT ':' statement
+  ;
+
+compound_statement
+  : '{' '}'
+  | '{' statement_list '}' {$$ = $2;}
+  | '{' declaration_list '}'
+  | '{' declaration_list statement_list '}'
+  ;
+
+declaration_list
+  : declaration
+  | declaration_list declaration
+  ;
+
+statement_list
+  : statement {$$ = $1;}
+  | statement_list statement
+  ;
+
+expression_statement
+  : ';'
+  | expression ';' {$$ = $1;}
+  ;
+
+selection_statement
+  : IF '(' expression ')' statement
+  | IF '(' expression ')' statement ELSE statement
+  | SWITCH '(' expression ')' statement
+  ;
+
+iteration_statement
+  : WHILE '(' expression ')' statement
+  | DO statement WHILE '(' expression ')' ';'
+  | FOR '(' expression_statement expression_statement ')' statement
+  | FOR '(' expression_statement expression_statement expression ')' statement
+  ;
+
+jump_statement
+  : GOTO IDENTIFIER ';'
+  | CONTINUE ';'
+  | BREAK ';'
+  | RETURN ';' {$$ = new JumpStatement();}
+  | RETURN expression ';'
+  ;
+
+%%
+
+Node *g_root;  // AST root; assigned by the ROOT rule's action during yyparse()
+
+// Opens file_name as the scanner input, runs the parser and returns g_root; exits with status 1 if the file cannot be opened.
+Node *parseAST(std::string file_name)
+{
+  yyin = fopen(file_name.c_str(), "r");
+  if(yyin == NULL){
+    std::cerr << "Couldn't open input file: " << file_name << std::endl;
+    exit(1);
+  }
+  g_root = NULL;  // cleared so the returned value comes only from this parse
+  yyparse();
+  fclose(yyin);   // release the input handle once parsing has finished
+  return g_root;
+}