Source: don't require UTF-8 encoding for source files
This removes the requirement that source files be UTF-8 encoded
for tokenization and preprocessing. Only the parser cares about
the source encoding, so we defer any checks until then.
ehaas committed Sep 28, 2023
1 parent 5688dbc commit 8c9c4f5
Showing 11 changed files with 152 additions and 159 deletions.
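The practical effect, as a minimal sketch (an illustrative test, not part of this commit, using the `addSourceFromReader` API exercised in the diffs below): a source containing bytes that are not valid UTF-8 now loads without complaint, and the bytes only matter if they later reach the parser inside an extended identifier.

```zig
const std = @import("std");
const Compilation = @import("Compilation.zig");

test "non-UTF-8 bytes survive source loading" {
    var comp = Compilation.init(std.testing.allocator);
    defer comp.deinit();

    // 0xFF can never appear in well-formed UTF-8. Previously its offset was
    // recorded in invalid_utf8_locs and preprocessing failed fatally; now
    // the byte is kept as-is.
    const contents = "// stray byte: \xff\nint x;\n";
    var buf_reader = std.io.fixedBufferStream(contents);
    const source = try comp.addSourceFromReader(buf_reader.reader(), "file.c", @intCast(contents.len));
    try std.testing.expectEqualStrings(contents, source.buf);
}
```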
42 changes: 15 additions & 27 deletions src/Compilation.zig
@@ -120,8 +120,6 @@ types: struct {
int16: Type = .{ .specifier = .invalid },
int64: Type = .{ .specifier = .invalid },
} = .{},
/// Mapping from Source.Id to byte offset of first non-utf8 byte
invalid_utf8_locs: std.AutoHashMapUnmanaged(Source.Id, u32) = .{},
string_interner: StringInterner = .{},

pub fn init(gpa: Allocator) Compilation {
@@ -153,7 +151,6 @@ pub fn deinit(comp: *Compilation) void {
comp.pragma_handlers.deinit();
comp.generated_buf.deinit();
comp.builtins.deinit(comp.gpa);
comp.invalid_utf8_locs.deinit(comp.gpa);
comp.string_interner.deinit(comp.gpa);
}

@@ -949,7 +946,7 @@ pub fn getSource(comp: *const Compilation, id: Source.Id) Source {
}

/// Creates a Source from the contents of `reader` and adds it to the Compilation
/// Performs newline splicing, line-ending normalization to '\n', and UTF-8 validation.
/// Performs newline splicing and line-ending normalization to '\n'
/// caller retains ownership of `path`
/// `expected_size` will be allocated to hold the contents of `reader` and *must* be at least
/// as large as the entire contents of `reader`.
@@ -1092,9 +1089,6 @@ pub fn addSourceFromReader(comp: *Compilation, reader: anytype, path: []const u8
};

try comp.sources.put(duped_path, source);
if (source.offsetOfInvalidUtf8()) |offset| {
try comp.invalid_utf8_locs.putNoClobber(comp.gpa, source_id, offset);
}
return source;
}

@@ -1460,32 +1454,26 @@ test "ignore BOM at beginning of file" {
const BOM = "\xEF\xBB\xBF";

const Test = struct {
fn run(buf: []const u8, input_type: enum { valid_utf8, invalid_utf8 }) !void {
fn run(buf: []const u8) !void {
var comp = Compilation.init(std.testing.allocator);
defer comp.deinit();

var buf_reader = std.io.fixedBufferStream(buf);
const source = try comp.addSourceFromReader(buf_reader.reader(), "file.c", @intCast(buf.len));
switch (input_type) {
.valid_utf8 => {
const expected_output = if (mem.startsWith(u8, buf, BOM)) buf[BOM.len..] else buf;
try std.testing.expectEqualStrings(expected_output, source.buf);
try std.testing.expect(!comp.invalid_utf8_locs.contains(source.id));
},
.invalid_utf8 => try std.testing.expect(comp.invalid_utf8_locs.contains(source.id)),
}
const expected_output = if (mem.startsWith(u8, buf, BOM)) buf[BOM.len..] else buf;
try std.testing.expectEqualStrings(expected_output, source.buf);
}
};

try Test.run(BOM, .valid_utf8);
try Test.run(BOM ++ "x", .valid_utf8);
try Test.run("x" ++ BOM, .valid_utf8);
try Test.run(BOM ++ " ", .valid_utf8);
try Test.run(BOM ++ "\n", .valid_utf8);
try Test.run(BOM ++ "\\", .valid_utf8);

try Test.run(BOM[0..1] ++ "x", .invalid_utf8);
try Test.run(BOM[0..2] ++ "x", .invalid_utf8);
try Test.run(BOM[1..] ++ "x", .invalid_utf8);
try Test.run(BOM[2..] ++ "x", .invalid_utf8);
try Test.run(BOM);
try Test.run(BOM ++ "x");
try Test.run("x" ++ BOM);
try Test.run(BOM ++ " ");
try Test.run(BOM ++ "\n");
try Test.run(BOM ++ "\\");

try Test.run(BOM[0..1] ++ "x");
try Test.run(BOM[0..2] ++ "x");
try Test.run(BOM[1..] ++ "x");
try Test.run(BOM[2..] ++ "x");
}
10 changes: 10 additions & 0 deletions src/Diagnostics.zig
@@ -1474,6 +1474,16 @@ const messages = struct {
const opt = "c99-compat";
const kind = .off;
};
pub const unexpected_character = struct {
const msg = "unexpected character <U+{X:0>4}>";
const extra = .actual_codepoint;
const kind = .@"error";
};
pub const invalid_identifier_start_char = struct {
const msg = "character <U+{X:0>4}> not allowed at the start of an identifier";
const extra = .actual_codepoint;
const kind = .@"error";
};
pub const unicode_zero_width = struct {
const msg = "identifier contains Unicode character <U+{X:0>4}> that is invisible in some environments";
const opt = "unicode-homoglyph";
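The `{X:0>4}` specifier in these messages zero-pads the codepoint to at least four uppercase hex digits, giving the conventional `<U+XXXX>` rendering. A quick sketch of the interpolation (illustrative only, using plain `std.fmt` rather than the diagnostics machinery):

```zig
const std = @import("std");

test "codepoint message formatting" {
    // .actual_codepoint is a u21; "{X:0>4}" prints it as uppercase hex,
    // left-padded with zeros to a minimum width of four.
    var buf: [64]u8 = undefined;
    const rendered = try std.fmt.bufPrint(&buf, "unexpected character <U+{X:0>4}>", .{@as(u21, 0x0300)});
    try std.testing.expectEqualStrings("unexpected character <U+0300>", rendered);
}
```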
15 changes: 15 additions & 0 deletions src/LangOpts.zig
@@ -1,5 +1,6 @@
const std = @import("std");
const DiagnosticTag = @import("Diagnostics.zig").Tag;
const CharInfo = @import("CharInfo.zig");

const LangOpts = @This();

@@ -85,6 +86,20 @@ pub const Standard = enum {
.c2x, .gnu2x => "202311L",
};
}

pub fn codepointAllowedInIdentifier(standard: Standard, codepoint: u21, is_start: bool) bool {
if (is_start) {
return if (standard.atLeast(.c11))
CharInfo.isC11IdChar(codepoint) and !CharInfo.isC11DisallowedInitialIdChar(codepoint)
else
CharInfo.isC99IdChar(codepoint) and !CharInfo.isC99DisallowedInitialIDChar(codepoint);
} else {
return if (standard.atLeast(.c11))
CharInfo.isC11IdChar(codepoint)
else
CharInfo.isC99IdChar(codepoint);
}
}
};

emulate: Compiler = .clang,
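Hypothetical usage of the new helper (not a test from this commit; the concrete codepoints assume `CharInfo`'s tables follow the C standard's Annex D ranges): U+00E9 (é) may appear anywhere in an identifier, while U+0300 (a combining accent) may continue one but not start it.

```zig
const std = @import("std");
const LangOpts = @import("LangOpts.zig");

test "codepointAllowedInIdentifier" {
    const standard: LangOpts.Standard = .c11;
    try std.testing.expect(standard.codepointAllowedInIdentifier(0x00E9, true)); // é can start an identifier
    try std.testing.expect(standard.codepointAllowedInIdentifier(0x0300, false)); // combining char can continue one
    try std.testing.expect(!standard.codepointAllowedInIdentifier(0x0300, true)); // ...but not start one
}
```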
95 changes: 72 additions & 23 deletions src/Parser.zig
@@ -186,55 +186,104 @@ string_ids: struct {
ucontext_t: StringId,
},

fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Location) Compilation.Error!bool {
if (codepoint <= 0x7F) return false;
var diagnosed = false;
/// Checks codepoint for various pedantic warnings
/// Returns true if diagnostic issued
fn checkIdentifierCodepointWarnings(comp: *Compilation, codepoint: u21, loc: Source.Location) Compilation.Error!bool {
assert(codepoint >= 0x80);

const err_start = comp.diag.list.items.len;

if (!CharInfo.isC99IdChar(codepoint)) {
try comp.diag.add(.{
.tag = .c99_compat,
.loc = loc,
}, &.{});
diagnosed = true;
}
if (CharInfo.isInvisible(codepoint)) {
try comp.diag.add(.{
.tag = .unicode_zero_width,
.loc = loc,
.extra = .{ .actual_codepoint = codepoint },
}, &.{});
diagnosed = true;
}
if (CharInfo.homoglyph(codepoint)) |resembles| {
try comp.diag.add(.{
.tag = .unicode_homoglyph,
.loc = loc,
.extra = .{ .codepoints = .{ .actual = codepoint, .resembles = resembles } },
}, &.{});
diagnosed = true;
}
return diagnosed;
return comp.diag.list.items.len != err_start;
}

/// Issues diagnostics for the current extended identifier token
/// Return value indicates whether the token should be considered an identifier
/// true means consider the token to actually be an identifier
/// false means it is not
fn validateExtendedIdentifier(p: *Parser) !bool {
assert(p.tok_ids[p.tok_i] == .extended_identifier);

const slice = p.tokSlice(p.tok_i);
const view = std.unicode.Utf8View.init(slice) catch {
try p.errTok(.invalid_utf8, p.tok_i);
return error.FatalError;
};
var it = view.iterator();

var valid_identifier = true;
var warned = false;
var len: usize = 0;
var invalid_char: u21 = undefined;
var loc = p.pp.tokens.items(.loc)[p.tok_i];

const standard = p.comp.langopts.standard;
while (it.nextCodepoint()) |codepoint| {
defer {
len += 1;
loc.byte_offset += std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
}
if (codepoint == '$') {
warned = true;
try p.comp.diag.add(.{
.tag = .dollar_in_identifier_extension,
.loc = loc,
}, &.{});
}

if (codepoint <= 0x7F) continue;
if (!valid_identifier) continue;

const allowed = standard.codepointAllowedInIdentifier(codepoint, len == 0);
if (!allowed) {
invalid_char = codepoint;
valid_identifier = false;
continue;
}

if (!warned) {
warned = try checkIdentifierCodepointWarnings(p.comp, codepoint, loc);
}
}

if (!valid_identifier) {
if (len == 1) {
try p.errExtra(.unexpected_character, p.tok_i, .{ .actual_codepoint = invalid_char });
return false;
} else {
try p.errExtra(.invalid_identifier_start_char, p.tok_i, .{ .actual_codepoint = invalid_char });
}
}

return true;
}

fn eatIdentifier(p: *Parser) !?TokenIndex {
switch (p.tok_ids[p.tok_i]) {
.identifier => {},
.extended_identifier => {
const slice = p.tokSlice(p.tok_i);
var it = std.unicode.Utf8View.initUnchecked(slice).iterator();
var loc = p.pp.tokens.items(.loc)[p.tok_i];

if (mem.indexOfScalar(u8, slice, '$')) |i| {
loc.byte_offset += @intCast(i);
try p.comp.diag.add(.{
.tag = .dollar_in_identifier_extension,
.loc = loc,
}, &.{});
loc = p.pp.tokens.items(.loc)[p.tok_i];
}

while (it.nextCodepoint()) |c| {
if (try checkIdentifierCodepoint(p.comp, c, loc)) break;
loc.byte_offset += std.unicode.utf8CodepointSequenceLength(c) catch unreachable;
if (!try p.validateExtendedIdentifier()) {
p.tok_i += 1;
return null;
}
},
else => return null,
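The control flow in `validateExtendedIdentifier` has three outcomes worth spelling out: invalid UTF-8 in the token is a fatal error; a token that is a single disallowed codepoint is diagnosed as `unexpected_character` and not treated as an identifier; and a longer token containing a disallowed codepoint gets `invalid_identifier_start_char` but is still treated as an identifier. A condensed sketch of that decision (a hypothetical stand-alone helper, not the parser's actual code; `allowed` stands in for `standard.codepointAllowedInIdentifier`):

```zig
const std = @import("std");

const Outcome = enum { fatal_invalid_utf8, not_an_identifier, identifier };

fn classify(slice: []const u8, comptime allowed: fn (u21, bool) bool) Outcome {
    const view = std.unicode.Utf8View.init(slice) catch return .fatal_invalid_utf8;
    var it = view.iterator();
    var len: usize = 0;
    var valid = true;
    while (it.nextCodepoint()) |codepoint| : (len += 1) {
        if (codepoint > 0x7F and !allowed(codepoint, len == 0)) valid = false;
    }
    if (valid) return .identifier;
    // A lone disallowed codepoint is rejected outright; in a longer token it
    // only produces an error diagnostic and the token is still used.
    return if (len == 1) .not_an_identifier else .identifier;
}

test "classify" {
    const asciiOnly = struct {
        fn allowed(cp: u21, is_start: bool) bool {
            _ = is_start;
            return cp < 0x80;
        }
    }.allowed;
    try std.testing.expectEqual(Outcome.fatal_invalid_utf8, classify("\xff", asciiOnly));
    try std.testing.expectEqual(Outcome.not_an_identifier, classify("é", asciiOnly));
    try std.testing.expectEqual(Outcome.identifier, classify("aé", asciiOnly));
}
```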
8 changes: 0 additions & 8 deletions src/Preprocessor.zig
@@ -226,14 +226,6 @@ fn findIncludeGuard(pp: *Preprocessor, source: Source) ?[]const u8 {
}

fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!Token {
if (pp.comp.invalid_utf8_locs.get(source.id)) |offset| {
try pp.comp.diag.add(.{
.tag = .invalid_utf8,
// Todo: compute line number
.loc = .{ .id = source.id, .byte_offset = offset },
}, &.{});
return error.FatalError;
}
var guard_name = pp.findIncludeGuard(source);

pp.preprocess_count += 1;
56 changes: 19 additions & 37 deletions src/Tokenizer.zig
@@ -3,8 +3,6 @@ const assert = std.debug.assert;
const Compilation = @import("Compilation.zig");
const Source = @import("Source.zig");
const LangOpts = @import("LangOpts.zig");
const CharInfo = @import("CharInfo.zig");
const unicode = @import("unicode.zig");

const Tokenizer = @This();

@@ -817,24 +815,6 @@ pub const Token = struct {
};
}

/// Check if codepoint may appear in specified context
/// does not check basic character set chars because the tokenizer handles them separately to keep the common
/// case on the fast path
pub fn mayAppearInIdent(comp: *const Compilation, codepoint: u21, where: enum { start, inside }) bool {
if (codepoint == '$') return comp.langopts.dollars_in_identifiers;
if (codepoint <= 0x7F) return false;
return switch (where) {
.start => if (comp.langopts.standard.atLeast(.c11))
CharInfo.isC11IdChar(codepoint) and !CharInfo.isC11DisallowedInitialIdChar(codepoint)
else
CharInfo.isC99IdChar(codepoint) and !CharInfo.isC99DisallowedInitialIDChar(codepoint),
.inside => if (comp.langopts.standard.atLeast(.c11))
CharInfo.isC11IdChar(codepoint)
else
CharInfo.isC99IdChar(codepoint),
};
}

const all_kws = std.ComptimeStringMap(Id, .{
.{ "auto", auto: {
@setEvalBranchQuota(3000);
@@ -1038,18 +1018,10 @@ pub fn next(self: *Tokenizer) Token {

var return_state = state;
var counter: u32 = 0;
var codepoint_len: u3 = undefined;
var codepoint_len: u32 = undefined;
while (self.index < self.buf.len) : (self.index += codepoint_len) {
// Source files get checked for valid utf-8 before being tokenized so it is safe to use
// these versions.
codepoint_len = unicode.utf8ByteSequenceLength_unsafe(self.buf[self.index]);
const c: u21 = switch (codepoint_len) {
1 => @as(u21, self.buf[self.index]),
2 => unicode.utf8Decode2_unsafe(self.buf[self.index..]),
3 => unicode.utf8Decode3_unsafe(self.buf[self.index..]),
4 => unicode.utf8Decode4_unsafe(self.buf[self.index..]),
else => unreachable,
};
codepoint_len = 1;
const c = self.buf[self.index];
switch (state) {
.start => switch (c) {
'\n' => {
@@ -1137,13 +1109,19 @@
'#' => state = .hash,
'0'...'9' => state = .pp_num,
'\t', '\x0B', '\x0C', ' ' => state = .whitespace,
else => if (Token.mayAppearInIdent(self.comp, c, .start)) {
'$' => if (self.comp.langopts.dollars_in_identifiers) {
state = .extended_identifier;
} else {
id = .invalid;
self.index += codepoint_len;
break;
},
0x80...0xFF => state = .extended_identifier,
else => {
id = .invalid;
self.index += codepoint_len;
break;
},
},
.whitespace => switch (c) {
'\t', '\x0B', '\x0C', ' ' => {},
@@ -1311,12 +1289,16 @@
},
.identifier, .extended_identifier => switch (c) {
'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
else => {
if (!Token.mayAppearInIdent(self.comp, c, .inside)) {
id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
break;
}
'$' => if (self.comp.langopts.dollars_in_identifiers) {
state = .extended_identifier;
} else {
id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
break;
},
0x80...0xFF => state = .extended_identifier,
else => {
id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
break;
},
},
.equal => switch (c) {
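With the table-driven decoding gone, the tokenizer never decodes codepoints itself: any byte in 0x80...0xFF simply starts or continues an extended identifier. A sketch of the new behavior (not a test from this commit, and assuming `Tokenizer`'s field-literal initialization with `.buf`, `.comp`, and `.source` as used elsewhere in the repo):

```zig
const std = @import("std");
const Compilation = @import("Compilation.zig");
const Tokenizer = @import("Tokenizer.zig");

test "tokenizer no longer decodes UTF-8" {
    var comp = Compilation.init(std.testing.allocator);
    defer comp.deinit();

    // "é" (0xC3 0xA9) followed by a lone 0xFF: every byte is >= 0x80, so the
    // whole run becomes one .extended_identifier token even though it is not
    // valid UTF-8. The parser, not the tokenizer, will reject it.
    const contents = "é\xff";
    var buf_reader = std.io.fixedBufferStream(contents);
    const source = try comp.addSourceFromReader(buf_reader.reader(), "file.c", @intCast(contents.len));

    var tokenizer = Tokenizer{ .buf = source.buf, .comp = &comp, .source = source.id };
    const tok = tokenizer.next();
    try std.testing.expect(tok.id == .extended_identifier);
}
```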