From 9627cdc496c73bfb525475f29e6dd89b7e70f13d Mon Sep 17 00:00:00 2001 From: Akuli Date: Thu, 23 Mar 2023 00:24:23 +0200 Subject: [PATCH] Improve handling bad bytes in source files (#354) --- compare_compilers.sh | 4 ++-- self_hosted/parses_wrong.txt | 1 - self_hosted/runs_wrong.txt | 1 - self_hosted/tokenizer.jou | 7 ++++++- self_hosted/tokenizes_wrong.txt | 2 -- src/tokenize.c | 2 ++ tests/syntax_error/tab.jou | 3 +++ 7 files changed, 13 insertions(+), 7 deletions(-) delete mode 100644 self_hosted/tokenizes_wrong.txt create mode 100644 tests/syntax_error/tab.jou diff --git a/compare_compilers.sh b/compare_compilers.sh index aa2845c2..3fb96281 100755 --- a/compare_compilers.sh +++ b/compare_compilers.sh @@ -38,7 +38,7 @@ function append_line() local line="$2" echo -e " ${YELLOW}Adding $line to $file${RESET}" - if grep -q $'\r' $error_list_file; then + if [ -f $file ] && grep -q $'\r' $file; then # CRLF line endings (likely Windows, but depends on git settings) printf "%s\r\n" "$line" >> "$file" else @@ -91,7 +91,7 @@ for action in tokenize parse run; do (./jou $flag $file || true) &> tmp/compare_compilers/compiler_written_in_c.txt (./self_hosted_compiler $flag $file || true) &> tmp/compare_compilers/self_hosted.txt - if grep -qxF $file <(cat $error_list_file | tr -d '\r'); then + if [ -f $error_list_file ] && grep -qxF $file <(cat $error_list_file | tr -d '\r'); then # The file is skipped, so the two compilers should behave differently if diff tmp/compare_compilers/compiler_written_in_c.txt tmp/compare_compilers/self_hosted.txt >/dev/null; then if [ $fix = yes ]; then diff --git a/self_hosted/parses_wrong.txt b/self_hosted/parses_wrong.txt index 29a26fb1..da3fe42f 100644 --- a/self_hosted/parses_wrong.txt +++ b/self_hosted/parses_wrong.txt @@ -14,7 +14,6 @@ tests/should_succeed/local_import.jou tests/should_succeed/method.jou tests/should_succeed/printf.jou tests/should_succeed/sizeof.jou -tests/syntax_error/bad_byte.jou tests/wrong_type/arrow_operator_not_pointer_method.jou tests/wrong_type/float_and_double.jou tests/should_succeed/linked_list.jou diff --git a/self_hosted/runs_wrong.txt b/self_hosted/runs_wrong.txt index 7d0f66d3..c06fb2a5 100644 --- a/self_hosted/runs_wrong.txt +++ b/self_hosted/runs_wrong.txt @@ -53,7 +53,6 @@ tests/should_succeed/undefined_value_warning.jou tests/should_succeed/unreachable_warning.jou tests/should_succeed/unused_import.jou tests/syntax_error/bad_addressof.jou -tests/syntax_error/bad_byte.jou tests/syntax_error/self_outside_class.jou tests/wrong_type/arrow_operator_not_pointer_method.jou tests/wrong_type/assign_to_deref_non_pointer.jou diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou index 4d8b2c6f..bff1f69e 100644 --- a/self_hosted/tokenizer.jou +++ b/self_hosted/tokenizer.jou @@ -495,11 +495,16 @@ class Tokenizer: self->unread_byte(b) token.kind = TokenKind::Operator token.short_string = self->read_operator() + elif b == '\t': + fail(self->location, "Jou files cannot contain tab characters (use 4 spaces for indentation)") elif b == '\0': token.kind = TokenKind::EndOfFile else: message: byte[100] - sprintf(message, "unexpected byte %#02x", b) + if b < 0x80 and isprint(b) != 0: + sprintf(message, "unexpected byte '%c' (%#02x)", b, b) + else: + sprintf(message, "unexpected byte %#02x", b) fail(self->location, message) self->handle_parentheses(&token) diff --git a/self_hosted/tokenizes_wrong.txt b/self_hosted/tokenizes_wrong.txt deleted file mode 100644 index 9e7dbc5e..00000000 --- a/self_hosted/tokenizes_wrong.txt +++ /dev/null @@ -1,2 +0,0 @@ -# This is a list of files that are not yet supported by the tokenizer of the self-hosted compiler. -tests/syntax_error/bad_byte.jou diff --git a/src/tokenize.c b/src/tokenize.c index a8f919ac..e3744a89 100644 --- a/src/tokenize.c +++ b/src/tokenize.c @@ -445,6 +445,8 @@ static Token read_token(struct State *st) break; case '\'': t.type = TOKEN_CHAR; t.data.char_value = read_char_literal(st); break; case '"': t.type = TOKEN_STRING; t.data.string_value = read_string(st, '"', NULL); break; + case '\t': + fail_with_error(st->location, "Jou files cannot contain tab characters (use 4 spaces for indentation)"); default: if(is_identifier_or_number_byte(c)) { read_identifier_or_number(st, c, &t.data.name); diff --git a/tests/syntax_error/tab.jou b/tests/syntax_error/tab.jou new file mode 100644 index 00000000..38597b52 --- /dev/null +++ b/tests/syntax_error/tab.jou @@ -0,0 +1,3 @@ +# This file contains a tab character. Make sure your editor doesn't ruin it. +def main() -> int: + return 0 # Error: Jou files cannot contain tab characters (use 4 spaces for indentation)