diff --git a/src/Compilation.zig b/src/Compilation.zig
index a3b60e21..1e13c78b 100644
--- a/src/Compilation.zig
+++ b/src/Compilation.zig
@@ -120,8 +120,6 @@ types: struct {
     int16: Type = .{ .specifier = .invalid },
     int64: Type = .{ .specifier = .invalid },
 } = .{},
-/// Mapping from Source.Id to byte offset of first non-utf8 byte
-invalid_utf8_locs: std.AutoHashMapUnmanaged(Source.Id, u32) = .{},
 string_interner: StringInterner = .{},
 
 pub fn init(gpa: Allocator) Compilation {
@@ -153,7 +151,6 @@ pub fn deinit(comp: *Compilation) void {
     comp.pragma_handlers.deinit();
     comp.generated_buf.deinit();
     comp.builtins.deinit(comp.gpa);
-    comp.invalid_utf8_locs.deinit(comp.gpa);
     comp.string_interner.deinit(comp.gpa);
 }
 
@@ -949,7 +946,7 @@ pub fn getSource(comp: *const Compilation, id: Source.Id) Source {
 }
 
 /// Creates a Source from the contents of `reader` and adds it to the Compilation
-/// Performs newline splicing, line-ending normalization to '\n', and UTF-8 validation.
+/// Performs newline splicing and line-ending normalization to '\n'.
 /// caller retains ownership of `path`
 /// `expected_size` will be allocated to hold the contents of `reader` and *must* be at least
 /// as large as the entire contents of `reader`.
@@ -1092,9 +1089,6 @@ pub fn addSourceFromReader(comp: *Compilation, reader: anytype, path: []const u8
     };
 
     try comp.sources.put(duped_path, source);
-    if (source.offsetOfInvalidUtf8()) |offset| {
-        try comp.invalid_utf8_locs.putNoClobber(comp.gpa, source_id, offset);
-    }
 
     return source;
 }
@@ -1460,32 +1454,26 @@ test "ignore BOM at beginning of file" {
     const BOM = "\xEF\xBB\xBF";
 
     const Test = struct {
-        fn run(buf: []const u8, input_type: enum { valid_utf8, invalid_utf8 }) !void {
+        fn run(buf: []const u8) !void {
             var comp = Compilation.init(std.testing.allocator);
             defer comp.deinit();
 
             var buf_reader = std.io.fixedBufferStream(buf);
             const source = try comp.addSourceFromReader(buf_reader.reader(), "file.c", @intCast(buf.len));
 
-            switch (input_type) {
-                .valid_utf8 => {
-                    const expected_output = if (mem.startsWith(u8, buf, BOM)) buf[BOM.len..] else buf;
-                    try std.testing.expectEqualStrings(expected_output, source.buf);
-                    try std.testing.expect(!comp.invalid_utf8_locs.contains(source.id));
-                },
-                .invalid_utf8 => try std.testing.expect(comp.invalid_utf8_locs.contains(source.id)),
-            }
+            const expected_output = if (mem.startsWith(u8, buf, BOM)) buf[BOM.len..] else buf;
+            try std.testing.expectEqualStrings(expected_output, source.buf);
         }
     };
-    try Test.run(BOM, .valid_utf8);
-    try Test.run(BOM ++ "x", .valid_utf8);
-    try Test.run("x" ++ BOM, .valid_utf8);
-    try Test.run(BOM ++ " ", .valid_utf8);
-    try Test.run(BOM ++ "\n", .valid_utf8);
-    try Test.run(BOM ++ "\\", .valid_utf8);
-
-    try Test.run(BOM[0..1] ++ "x", .invalid_utf8);
-    try Test.run(BOM[0..2] ++ "x", .invalid_utf8);
-    try Test.run(BOM[1..] ++ "x", .invalid_utf8);
-    try Test.run(BOM[2..] ++ "x", .invalid_utf8);
+    try Test.run(BOM);
+    try Test.run(BOM ++ "x");
+    try Test.run("x" ++ BOM);
+    try Test.run(BOM ++ " ");
+    try Test.run(BOM ++ "\n");
+    try Test.run(BOM ++ "\\");
+
+    try Test.run(BOM[0..1] ++ "x");
+    try Test.run(BOM[0..2] ++ "x");
+    try Test.run(BOM[1..] ++ "x");
+    try Test.run(BOM[2..] ++ "x");
 }
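
With invalid_utf8_locs gone, addSourceFromReader no longer scans whole files for invalid UTF-8 up front; bad bytes are only diagnosed if they reach the parser inside an extended identifier token (see Parser.zig below). A caller that still wants eager whole-file validation can get it from std.unicode alone. A minimal sketch, using only standard-library calls (the helper name is hypothetical, not part of this patch):

const std = @import("std");

/// Byte offset of the first invalid UTF-8 sequence in `buf`, or null if the
/// whole buffer is valid; roughly what Source.offsetOfInvalidUtf8 reported.
fn firstInvalidUtf8Offset(buf: []const u8) ?u32 {
    var i: u32 = 0;
    while (i < buf.len) {
        // An invalid lead byte, a truncated sequence, or a bad continuation
        // byte all report the offset of the sequence's first byte.
        const len = std.unicode.utf8ByteSequenceLength(buf[i]) catch return i;
        if (i + len > buf.len) return i;
        _ = std.unicode.utf8Decode(buf[i .. i + len]) catch return i;
        i += len;
    }
    return null;
}

test "firstInvalidUtf8Offset" {
    try std.testing.expectEqual(@as(?u32, null), firstInvalidUtf8Offset("abc\xEF\xBB\xBF"));
    try std.testing.expectEqual(@as(?u32, 1), firstInvalidUtf8Offset("a\xD6b"));
}
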
++ "x"); } diff --git a/src/Diagnostics.zig b/src/Diagnostics.zig index 68fa38b7..d3a5a5ca 100644 --- a/src/Diagnostics.zig +++ b/src/Diagnostics.zig @@ -1474,6 +1474,16 @@ const messages = struct { const opt = "c99-compat"; const kind = .off; }; + pub const unexpected_character = struct { + const msg = "unexpected character 4}>"; + const extra = .actual_codepoint; + const kind = .@"error"; + }; + pub const invalid_identifier_start_char = struct { + const msg = "character 4}> not allowed at the start of an identifier"; + const extra = .actual_codepoint; + const kind = .@"error"; + }; pub const unicode_zero_width = struct { const msg = "identifier contains Unicode character 4}> that is invisible in some environments"; const opt = "unicode-homoglyph"; diff --git a/src/LangOpts.zig b/src/LangOpts.zig index 4cdfd3de..1755db58 100644 --- a/src/LangOpts.zig +++ b/src/LangOpts.zig @@ -1,5 +1,6 @@ const std = @import("std"); const DiagnosticTag = @import("Diagnostics.zig").Tag; +const CharInfo = @import("CharInfo.zig"); const LangOpts = @This(); @@ -85,6 +86,20 @@ pub const Standard = enum { .c2x, .gnu2x => "202311L", }; } + + pub fn codepointAllowedInIdentifier(standard: Standard, codepoint: u21, is_start: bool) bool { + if (is_start) { + return if (standard.atLeast(.c11)) + CharInfo.isC11IdChar(codepoint) and !CharInfo.isC11DisallowedInitialIdChar(codepoint) + else + CharInfo.isC99IdChar(codepoint) and !CharInfo.isC99DisallowedInitialIDChar(codepoint); + } else { + return if (standard.atLeast(.c11)) + CharInfo.isC11IdChar(codepoint) + else + CharInfo.isC99IdChar(codepoint); + } + } }; emulate: Compiler = .clang, diff --git a/src/Parser.zig b/src/Parser.zig index 461aef57..011e4f60 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -186,15 +186,18 @@ string_ids: struct { ucontext_t: StringId, }, -fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Location) Compilation.Error!bool { - if (codepoint <= 0x7F) return false; - var diagnosed = false; +/// Checks codepoint for various pedantic warnings +/// Returns true if diagnostic issued +fn checkIdentifierCodepointWarnings(comp: *Compilation, codepoint: u21, loc: Source.Location) Compilation.Error!bool { + assert(codepoint >= 0x80); + + const err_start = comp.diag.list.items.len; + if (!CharInfo.isC99IdChar(codepoint)) { try comp.diag.add(.{ .tag = .c99_compat, .loc = loc, }, &.{}); - diagnosed = true; } if (CharInfo.isInvisible(codepoint)) { try comp.diag.add(.{ @@ -202,7 +205,6 @@ fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Loca .loc = loc, .extra = .{ .actual_codepoint = codepoint }, }, &.{}); - diagnosed = true; } if (CharInfo.homoglyph(codepoint)) |resembles| { try comp.diag.add(.{ @@ -210,31 +212,78 @@ fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Loca .loc = loc, .extra = .{ .codepoints = .{ .actual = codepoint, .resembles = resembles } }, }, &.{}); - diagnosed = true; } - return diagnosed; + return comp.diag.list.items.len != err_start; +} + +/// Issues diagnostics for the current extended identifier token +/// Return value indicates whether the token should be considered an identifier +/// true means consider the token to actually be an identifier +/// false means it is not +fn validateExtendedIdentifier(p: *Parser) !bool { + assert(p.tok_ids[p.tok_i] == .extended_identifier); + + const slice = p.tokSlice(p.tok_i); + const view = std.unicode.Utf8View.init(slice) catch { + try p.errTok(.invalid_utf8, p.tok_i); + return error.FatalError; + }; + var it = 
diff --git a/src/Parser.zig b/src/Parser.zig
index 461aef57..011e4f60 100644
--- a/src/Parser.zig
+++ b/src/Parser.zig
@@ -186,15 +186,18 @@ string_ids: struct {
     ucontext_t: StringId,
 },
 
-fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Location) Compilation.Error!bool {
-    if (codepoint <= 0x7F) return false;
-    var diagnosed = false;
+/// Checks a codepoint for various pedantic warnings.
+/// Returns true if a diagnostic was issued.
+fn checkIdentifierCodepointWarnings(comp: *Compilation, codepoint: u21, loc: Source.Location) Compilation.Error!bool {
+    assert(codepoint >= 0x80);
+
+    const err_start = comp.diag.list.items.len;
+
     if (!CharInfo.isC99IdChar(codepoint)) {
         try comp.diag.add(.{
             .tag = .c99_compat,
             .loc = loc,
         }, &.{});
-        diagnosed = true;
     }
     if (CharInfo.isInvisible(codepoint)) {
         try comp.diag.add(.{
@@ -202,7 +205,6 @@ fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Loca
             .loc = loc,
             .extra = .{ .actual_codepoint = codepoint },
         }, &.{});
-        diagnosed = true;
     }
     if (CharInfo.homoglyph(codepoint)) |resembles| {
         try comp.diag.add(.{
@@ -210,31 +212,76 @@ fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Loca
             .loc = loc,
             .extra = .{ .codepoints = .{ .actual = codepoint, .resembles = resembles } },
         }, &.{});
-        diagnosed = true;
     }
-    return diagnosed;
+    return comp.diag.list.items.len != err_start;
 }
 
+/// Issues diagnostics for the current extended identifier token.
+/// Returns true if the token should be treated as an identifier, false if it should not.
+fn validateExtendedIdentifier(p: *Parser) !bool {
+    assert(p.tok_ids[p.tok_i] == .extended_identifier);
+
+    const slice = p.tokSlice(p.tok_i);
+    const view = std.unicode.Utf8View.init(slice) catch {
+        try p.errTok(.invalid_utf8, p.tok_i);
+        return error.FatalError;
+    };
+    var it = view.iterator();
+
+    var valid_identifier = true;
+    var warned = false;
+    var len: usize = 0;
+    var invalid_char: u21 = undefined;
+    var loc = p.pp.tokens.items(.loc)[p.tok_i];
+
+    const standard = p.comp.langopts.standard;
+    while (it.nextCodepoint()) |codepoint| {
+        defer {
+            len += 1;
+            loc.byte_offset += std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
+        }
+        if (codepoint == '$') {
+            warned = true;
+            try p.comp.diag.add(.{
+                .tag = .dollar_in_identifier_extension,
+                .loc = loc,
+            }, &.{});
+        }
+
+        if (codepoint <= 0x7F) continue;
+        if (!valid_identifier) continue;
+
+        const allowed = standard.codepointAllowedInIdentifier(codepoint, len == 0);
+        if (!allowed) {
+            invalid_char = codepoint;
+            valid_identifier = false;
+            continue;
+        }
+
+        if (!warned) {
+            warned = try checkIdentifierCodepointWarnings(p.comp, codepoint, loc);
+        }
+    }
+
+    if (!valid_identifier) {
+        if (len == 1) {
+            try p.errExtra(.unexpected_character, p.tok_i, .{ .actual_codepoint = invalid_char });
+            return false;
+        } else {
+            try p.errExtra(.invalid_identifier_start_char, p.tok_i, .{ .actual_codepoint = invalid_char });
+        }
+    }
+
+    return true;
+}
+
 fn eatIdentifier(p: *Parser) !?TokenIndex {
     switch (p.tok_ids[p.tok_i]) {
         .identifier => {},
         .extended_identifier => {
-            const slice = p.tokSlice(p.tok_i);
-            var it = std.unicode.Utf8View.initUnchecked(slice).iterator();
-            var loc = p.pp.tokens.items(.loc)[p.tok_i];
-
-            if (mem.indexOfScalar(u8, slice, '$')) |i| {
-                loc.byte_offset += @intCast(i);
-                try p.comp.diag.add(.{
-                    .tag = .dollar_in_identifier_extension,
-                    .loc = loc,
-                }, &.{});
-                loc = p.pp.tokens.items(.loc)[p.tok_i];
-            }
-
-            while (it.nextCodepoint()) |c| {
-                if (try checkIdentifierCodepoint(p.comp, c, loc)) break;
-                loc.byte_offset += std.unicode.utf8CodepointSequenceLength(c) catch unreachable;
+            if (!try p.validateExtendedIdentifier()) {
+                p.tok_i += 1;
+                return null;
             }
         },
         else => return null,
diff --git a/src/Preprocessor.zig b/src/Preprocessor.zig
index 95758ae3..227fa7db 100644
--- a/src/Preprocessor.zig
+++ b/src/Preprocessor.zig
@@ -226,14 +226,6 @@ fn findIncludeGuard(pp: *Preprocessor, source: Source) ?[]const u8 {
 }
 
 fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!Token {
-    if (pp.comp.invalid_utf8_locs.get(source.id)) |offset| {
-        try pp.comp.diag.add(.{
-            .tag = .invalid_utf8,
-            // Todo: compute line number
-            .loc = .{ .id = source.id, .byte_offset = offset },
-        }, &.{});
-        return error.FatalError;
-    }
     var guard_name = pp.findIncludeGuard(source);
 
     pp.preprocess_count += 1;
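
The recovery policy at the bottom of validateExtendedIdentifier is easy to miss inside the loop: a token consisting of a single disallowed codepoint is diagnosed as an unexpected character and dropped, while a longer token containing a disallowed codepoint is diagnosed but still handed to the parser as an identifier so parsing can make progress. Distilled into a standalone decision function (a hypothetical model, not part of the patch):

const Recovery = enum { treat_as_identifier, skip_token };

/// Mirrors the final branch of validateExtendedIdentifier.
fn recoveryFor(codepoint_count: usize, has_disallowed_codepoint: bool) Recovery {
    if (!has_disallowed_codepoint) return .treat_as_identifier;
    // A lone disallowed character gets .unexpected_character and is skipped.
    if (codepoint_count == 1) return .skip_token;
    // Longer tokens get .invalid_identifier_start_char but are still treated
    // as identifiers so the parser can continue.
    return .treat_as_identifier;
}
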
diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig
index 28de3c2c..1e9008c7 100644
--- a/src/Tokenizer.zig
+++ b/src/Tokenizer.zig
@@ -3,8 +3,6 @@ const assert = std.debug.assert;
 const Compilation = @import("Compilation.zig");
 const Source = @import("Source.zig");
 const LangOpts = @import("LangOpts.zig");
-const CharInfo = @import("CharInfo.zig");
-const unicode = @import("unicode.zig");
 
 const Tokenizer = @This();
 
@@ -817,24 +815,6 @@ pub const Token = struct {
         };
     }
 
-    /// Check if codepoint may appear in specified context
-    /// does not check basic character set chars because the tokenizer handles them separately to keep the common
-    /// case on the fast path
-    pub fn mayAppearInIdent(comp: *const Compilation, codepoint: u21, where: enum { start, inside }) bool {
-        if (codepoint == '$') return comp.langopts.dollars_in_identifiers;
-        if (codepoint <= 0x7F) return false;
-        return switch (where) {
-            .start => if (comp.langopts.standard.atLeast(.c11))
-                CharInfo.isC11IdChar(codepoint) and !CharInfo.isC11DisallowedInitialIdChar(codepoint)
-            else
-                CharInfo.isC99IdChar(codepoint) and !CharInfo.isC99DisallowedInitialIDChar(codepoint),
-            .inside => if (comp.langopts.standard.atLeast(.c11))
-                CharInfo.isC11IdChar(codepoint)
-            else
-                CharInfo.isC99IdChar(codepoint),
-        };
-    }
-
     const all_kws = std.ComptimeStringMap(Id, .{
         .{ "auto", auto: {
             @setEvalBranchQuota(3000);
@@ -1038,18 +1018,10 @@ pub fn next(self: *Tokenizer) Token {
     var return_state = state;
     var counter: u32 = 0;
 
-    var codepoint_len: u3 = undefined;
+    var codepoint_len: u32 = undefined;
     while (self.index < self.buf.len) : (self.index += codepoint_len) {
-        // Source files get checked for valid utf-8 before being tokenized so it is safe to use
-        // these versions.
-        codepoint_len = unicode.utf8ByteSequenceLength_unsafe(self.buf[self.index]);
-        const c: u21 = switch (codepoint_len) {
-            1 => @as(u21, self.buf[self.index]),
-            2 => unicode.utf8Decode2_unsafe(self.buf[self.index..]),
-            3 => unicode.utf8Decode3_unsafe(self.buf[self.index..]),
-            4 => unicode.utf8Decode4_unsafe(self.buf[self.index..]),
-            else => unreachable,
-        };
+        codepoint_len = 1;
+        const c = self.buf[self.index];
         switch (state) {
             .start => switch (c) {
                 '\n' => {
@@ -1137,13 +1109,19 @@ pub fn next(self: *Tokenizer) Token {
                 '#' => state = .hash,
                 '0'...'9' => state = .pp_num,
                 '\t', '\x0B', '\x0C', ' ' => state = .whitespace,
-                else => if (Token.mayAppearInIdent(self.comp, c, .start)) {
+                '$' => if (self.comp.langopts.dollars_in_identifiers) {
                     state = .extended_identifier;
                 } else {
                     id = .invalid;
                     self.index += codepoint_len;
                     break;
                 },
+                0x80...0xFF => state = .extended_identifier,
+                else => {
+                    id = .invalid;
+                    self.index += codepoint_len;
+                    break;
+                },
             },
             .whitespace => switch (c) {
                 '\t', '\x0B', '\x0C', ' ' => {},
@@ -1311,12 +1289,16 @@ pub fn next(self: *Tokenizer) Token {
             },
             .identifier, .extended_identifier => switch (c) {
                 'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
-                else => {
-                    if (!Token.mayAppearInIdent(self.comp, c, .inside)) {
-                        id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
-                        break;
-                    }
+                '$' => if (self.comp.langopts.dollars_in_identifiers) {
                     state = .extended_identifier;
+                } else {
+                    id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
+                    break;
+                },
+                0x80...0xFF => state = .extended_identifier,
+                else => {
+                    id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
+                    break;
+                },
             },
             .equal => switch (c) {
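
The tokenizer no longer decodes UTF-8 at all: codepoint_len is now always 1, and any byte in 0x80...0xFF optimistically extends an extended_identifier, with the real codepoint-level checks deferred to Parser.validateExtendedIdentifier. Ignoring state bookkeeping, the new byte classification inside an identifier reduces to this sketch (hypothetical helper, not part of the patch):

/// ASCII identifier bytes continue the token, '$' depends on the
/// dollars_in_identifiers option, and every non-ASCII byte is accepted
/// unconditionally, even bytes that cannot occur in valid UTF-8.
fn continuesIdentifier(byte: u8, dollars_in_identifiers: bool) bool {
    return switch (byte) {
        'a'...'z', 'A'...'Z', '_', '0'...'9' => true,
        '$' => dollars_in_identifiers,
        0x80...0xFF => true,
        else => false,
    };
}
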
diff --git a/src/unicode.zig b/src/unicode.zig
deleted file mode 100644
index 3b467f2a..00000000
--- a/src/unicode.zig
+++ /dev/null
@@ -1,41 +0,0 @@
-//! Copied from https://github.com/ziglang/zig/blob/6f0807f50f4e946bb850e746beaa5d6556cf7750/lib/std/unicode.zig
-//! with all safety checks removed. These functions must only be called with known-good buffers that have already
-//! been validated as being legitimate UTF8-encoded data, otherwise undefined behavior will occur.
-
-pub fn utf8ByteSequenceLength_unsafe(first_byte: u8) u3 {
-    return switch (first_byte) {
-        0b0000_0000...0b0111_1111 => 1,
-        0b1100_0000...0b1101_1111 => 2,
-        0b1110_0000...0b1110_1111 => 3,
-        0b1111_0000...0b1111_0111 => 4,
-        else => unreachable,
-    };
-}
-
-pub fn utf8Decode2_unsafe(bytes: []const u8) u21 {
-    var value: u21 = bytes[0] & 0b00011111;
-    value <<= 6;
-    return value | (bytes[1] & 0b00111111);
-}
-
-pub fn utf8Decode3_unsafe(bytes: []const u8) u21 {
-    var value: u21 = bytes[0] & 0b00001111;
-
-    value <<= 6;
-    value |= bytes[1] & 0b00111111;
-
-    value <<= 6;
-    return value | (bytes[2] & 0b00111111);
-}
-
-pub fn utf8Decode4_unsafe(bytes: []const u8) u21 {
-    var value: u21 = bytes[0] & 0b00000111;
-    value <<= 6;
-    value |= bytes[1] & 0b00111111;
-
-    value <<= 6;
-    value |= bytes[2] & 0b00111111;
-
-    value <<= 6;
-    return value | (bytes[3] & 0b00111111);
-}
diff --git a/test/cases/extended identifiers c11.c b/test/cases/extended identifiers c11.c
index c56f0aba..0009b35f 100644
--- a/test/cases/extended identifiers c11.c
+++ b/test/cases/extended identifiers c11.c
@@ -40,7 +40,9 @@ int あああ = "あああ" * 1;
     "extended identifiers c11.c:22:10: warning: identifier contains Unicode character that is invisible in some environments [-Wunicode-homoglyph]" \
     "extended identifiers c11.c:23:14: warning: using this character in an identifier is incompatible with C99 [-Wc99-compat]" \
     "extended identifiers c11.c:23:14: warning: treating Unicode character as identifier character rather than as ')' symbol [-Wunicode-homoglyph]" \
-    "extended identifiers c11.c:29:8: error: expected identifier or '{'" \
-    "extended identifiers c11.c:33:9: warning: declaration does not declare anything [-Wmissing-declaration]" \
-    "extended identifiers c11.c:33:9: error: expected ';', found invalid bytes" \
+    "extended identifiers c11.c:29:8: error: unexpected character " \
+    "extended identifiers c11.c:29:1: warning: declaration does not declare anything [-Wmissing-declaration]" \
+    "extended identifiers c11.c:33:9: error: unexpected character " \
+    "extended identifiers c11.c:33:10: warning: declaration does not declare anything [-Wmissing-declaration]" \
     "extended identifiers c11.c:36:18: error: invalid operands to binary expression ('char *' and 'int')" \
+
diff --git a/test/cases/extended identifiers c99.c b/test/cases/extended identifiers c99.c
index 4ac71fb1..b44dcf05 100644
--- a/test/cases/extended identifiers c99.c
+++ b/test/cases/extended identifiers c99.c
@@ -26,5 +26,7 @@ int Uǿ1 = 0;
 int Lǿ = 0;
 int Lǿ1 = 0;
 
-#define EXPECTED_ERRORS "extended identifiers c99.c:10:9: error: expected identifier or '('" \
-    "extended identifiers c99.c:16:9: error: expected identifier or '('" \
+#define EXPECTED_ERRORS "extended identifiers c99.c:10:9: error: unexpected character " \
+    "extended identifiers c99.c:10:11: error: expected identifier or '('" \
+    "extended identifiers c99.c:16:9: error: character not allowed at the start of an identifier" \
+
diff --git a/test/cases/latin1.c b/test/cases/latin1.c
index d6d67bf0..7bf16fbf 100644
--- a/test/cases/latin1.c
+++ b/test/cases/latin1.c
@@ -2,3 +2,5 @@ int main(void) {
     char * = "ABC";
     return 0;
 }
+
+#define EXPECTED_ERRORS "latin1.c:2:11: error: source file is not valid UTF-8"
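
latin1.c shows where the invalid_utf8 error now surfaces: not from a whole-file scan, but from std.unicode.Utf8View.init failing on the offending token inside validateExtendedIdentifier, which is why the diagnostic now carries an exact line and column. For reference, a minimal demonstration of the failure mode:

const std = @import("std");

test "Utf8View rejects Latin-1 bytes" {
    // 0xD6 is 'Ö' in Latin-1, but as UTF-8 it is a two-byte lead byte with no
    // continuation byte following it, so validation fails.
    try std.testing.expectError(error.InvalidUtf8, std.unicode.Utf8View.init("\xD6"));
}
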
diff --git a/test/runner.zig b/test/runner.zig
index cdd040e0..221433fe 100644
--- a/test/runner.zig
+++ b/test/runner.zig
@@ -67,14 +67,7 @@ fn testOne(allocator: std.mem.Allocator, path: []const u8, self_exe_dir: []const
     _ = try pp.preprocess(builtin_macros);
     _ = try pp.preprocess(user_macros);
 
-    const eof = pp.preprocess(file) catch |err| {
-        if (!std.unicode.utf8ValidateSlice(file.buf)) {
-            if (comp.diag.list.items.len > 0 and comp.diag.list.items[comp.diag.list.items.len - 1].tag == .invalid_utf8) {
-                return;
-            }
-        }
-        return err;
-    };
+    const eof = try pp.preprocess(file);
     try pp.tokens.append(allocator, eof);
 
     var tree = try aro.Parser.parse(&pp);
@@ -217,15 +210,6 @@ pub fn main() !void {
         _ = try pp.preprocess(builtin_macros);
         _ = try pp.preprocess(user_macros);
         const eof = pp.preprocess(file) catch |err| {
-            if (!std.unicode.utf8ValidateSlice(file.buf)) {
-                // non-utf8 files are not preprocessed, so we can't use EXPECTED_ERRORS; instead we
-                // check that the most recent error is .invalid_utf8
-                if (comp.diag.list.items.len > 0 and comp.diag.list.items[comp.diag.list.items.len - 1].tag == .invalid_utf8) {
-                    _ = comp.diag.list.pop();
-                    continue;
-                }
-            }
-
             fail_count += 1;
            progress.log("could not preprocess file '{s}': {s}\n", .{ path, @errorName(err) });
             continue;
@@ -284,7 +268,15 @@ pub fn main() !void {
 
         const expected_types = pp.defines.get("EXPECTED_TYPES");
 
-        var tree = try aro.Parser.parse(&pp);
+        var tree = aro.Parser.parse(&pp) catch |err| switch (err) {
+            error.FatalError => {
+                if (try checkExpectedErrors(&pp, &progress, &buf)) |some| {
+                    if (some) ok_count += 1 else fail_count += 1;
+                }
+                continue;
+            },
+            else => |e| return e,
+        };
         defer tree.deinit();
 
         const ast_path = try std.fs.path.join(gpa, &.{ args[1], "ast", std.fs.path.basename(path) });