diff --git a/src/Compilation.zig b/src/Compilation.zig
index a3b60e21..1e13c78b 100644
--- a/src/Compilation.zig
+++ b/src/Compilation.zig
@@ -120,8 +120,6 @@ types: struct {
     int16: Type = .{ .specifier = .invalid },
     int64: Type = .{ .specifier = .invalid },
 } = .{},
-/// Mapping from Source.Id to byte offset of first non-utf8 byte
-invalid_utf8_locs: std.AutoHashMapUnmanaged(Source.Id, u32) = .{},
 string_interner: StringInterner = .{},
 
 pub fn init(gpa: Allocator) Compilation {
@@ -153,7 +151,6 @@ pub fn deinit(comp: *Compilation) void {
     comp.pragma_handlers.deinit();
     comp.generated_buf.deinit();
     comp.builtins.deinit(comp.gpa);
-    comp.invalid_utf8_locs.deinit(comp.gpa);
     comp.string_interner.deinit(comp.gpa);
 }
 
@@ -949,7 +946,7 @@ pub fn getSource(comp: *const Compilation, id: Source.Id) Source {
 }
 
 /// Creates a Source from the contents of `reader` and adds it to the Compilation
-/// Performs newline splicing, line-ending normalization to '\n', and UTF-8 validation.
+/// Performs newline splicing and line-ending normalization to '\n'.
 /// caller retains ownership of `path`
 /// `expected_size` will be allocated to hold the contents of `reader` and *must* be at least
 /// as large as the entire contents of `reader`.
@@ -1092,9 +1089,6 @@ pub fn addSourceFromReader(comp: *Compilation, reader: anytype, path: []const u8
     };
 
     try comp.sources.put(duped_path, source);
-    if (source.offsetOfInvalidUtf8()) |offset| {
-        try comp.invalid_utf8_locs.putNoClobber(comp.gpa, source_id, offset);
-    }
 
     return source;
 }
@@ -1460,32 +1454,26 @@ test "ignore BOM at beginning of file" {
     const BOM = "\xEF\xBB\xBF";
 
     const Test = struct {
-        fn run(buf: []const u8, input_type: enum { valid_utf8, invalid_utf8 }) !void {
+        fn run(buf: []const u8) !void {
             var comp = Compilation.init(std.testing.allocator);
             defer comp.deinit();
 
             var buf_reader = std.io.fixedBufferStream(buf);
             const source = try comp.addSourceFromReader(buf_reader.reader(), "file.c", @intCast(buf.len));
 
-            switch (input_type) {
-                .valid_utf8 => {
-                    const expected_output = if (mem.startsWith(u8, buf, BOM)) buf[BOM.len..] else buf;
-                    try std.testing.expectEqualStrings(expected_output, source.buf);
-                    try std.testing.expect(!comp.invalid_utf8_locs.contains(source.id));
-                },
-                .invalid_utf8 => try std.testing.expect(comp.invalid_utf8_locs.contains(source.id)),
-            }
+            const expected_output = if (mem.startsWith(u8, buf, BOM)) buf[BOM.len..] else buf;
+            try std.testing.expectEqualStrings(expected_output, source.buf);
         }
     };
-    try Test.run(BOM, .valid_utf8);
-    try Test.run(BOM ++ "x", .valid_utf8);
-    try Test.run("x" ++ BOM, .valid_utf8);
-    try Test.run(BOM ++ " ", .valid_utf8);
-    try Test.run(BOM ++ "\n", .valid_utf8);
-    try Test.run(BOM ++ "\\", .valid_utf8);
-
-    try Test.run(BOM[0..1] ++ "x", .invalid_utf8);
-    try Test.run(BOM[0..2] ++ "x", .invalid_utf8);
-    try Test.run(BOM[1..] ++ "x", .invalid_utf8);
-    try Test.run(BOM[2..] ++ "x", .invalid_utf8);
+    try Test.run(BOM);
+    try Test.run(BOM ++ "x");
+    try Test.run("x" ++ BOM);
+    try Test.run(BOM ++ " ");
+    try Test.run(BOM ++ "\n");
+    try Test.run(BOM ++ "\\");
+
+    try Test.run(BOM[0..1] ++ "x");
+    try Test.run(BOM[0..2] ++ "x");
+    try Test.run(BOM[1..] ++ "x");
+    try Test.run(BOM[2..] ++ "x");
 }
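
With invalid_utf8_locs gone, addSourceFromReader no longer scans whole files for invalid UTF-8 up front; bad bytes are only diagnosed if they reach the parser inside an extended identifier token (see Parser.zig below). A caller that still wants eager whole-file validation can get it from std.unicode alone. A minimal sketch, using only standard-library calls (the helper name is hypothetical, not part of this patch):

const std = @import("std");

/// Byte offset of the first invalid UTF-8 sequence in `buf`, or null if the
/// whole buffer is valid; roughly what Source.offsetOfInvalidUtf8 reported.
fn firstInvalidUtf8Offset(buf: []const u8) ?u32 {
    var i: u32 = 0;
    while (i < buf.len) {
        // An invalid lead byte, a truncated sequence, or a bad continuation
        // byte all report the offset of the sequence's first byte.
        const len = std.unicode.utf8ByteSequenceLength(buf[i]) catch return i;
        if (i + len > buf.len) return i;
        _ = std.unicode.utf8Decode(buf[i .. i + len]) catch return i;
        i += len;
    }
    return null;
}

test "firstInvalidUtf8Offset" {
    try std.testing.expectEqual(@as(?u32, null), firstInvalidUtf8Offset("abc\xEF\xBB\xBF"));
    try std.testing.expectEqual(@as(?u32, 1), firstInvalidUtf8Offset("a\xD6b"));
}
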
++ "x"); } diff --git a/src/Diagnostics.zig b/src/Diagnostics.zig index 68fa38b7..d3a5a5ca 100644 --- a/src/Diagnostics.zig +++ b/src/Diagnostics.zig @@ -1474,6 +1474,16 @@ const messages = struct { const opt = "c99-compat"; const kind = .off; }; + pub const unexpected_character = struct { + const msg = "unexpected character 4}>"; + const extra = .actual_codepoint; + const kind = .@"error"; + }; + pub const invalid_identifier_start_char = struct { + const msg = "character 4}> not allowed at the start of an identifier"; + const extra = .actual_codepoint; + const kind = .@"error"; + }; pub const unicode_zero_width = struct { const msg = "identifier contains Unicode character 4}> that is invisible in some environments"; const opt = "unicode-homoglyph"; diff --git a/src/LangOpts.zig b/src/LangOpts.zig index 4cdfd3de..1755db58 100644 --- a/src/LangOpts.zig +++ b/src/LangOpts.zig @@ -1,5 +1,6 @@ const std = @import("std"); const DiagnosticTag = @import("Diagnostics.zig").Tag; +const CharInfo = @import("CharInfo.zig"); const LangOpts = @This(); @@ -85,6 +86,20 @@ pub const Standard = enum { .c2x, .gnu2x => "202311L", }; } + + pub fn codepointAllowedInIdentifier(standard: Standard, codepoint: u21, is_start: bool) bool { + if (is_start) { + return if (standard.atLeast(.c11)) + CharInfo.isC11IdChar(codepoint) and !CharInfo.isC11DisallowedInitialIdChar(codepoint) + else + CharInfo.isC99IdChar(codepoint) and !CharInfo.isC99DisallowedInitialIDChar(codepoint); + } else { + return if (standard.atLeast(.c11)) + CharInfo.isC11IdChar(codepoint) + else + CharInfo.isC99IdChar(codepoint); + } + } }; emulate: Compiler = .clang, diff --git a/src/Parser.zig b/src/Parser.zig index 461aef57..011e4f60 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -186,15 +186,18 @@ string_ids: struct { ucontext_t: StringId, }, -fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Location) Compilation.Error!bool { - if (codepoint <= 0x7F) return false; - var diagnosed = false; +/// Checks codepoint for various pedantic warnings +/// Returns true if diagnostic issued +fn checkIdentifierCodepointWarnings(comp: *Compilation, codepoint: u21, loc: Source.Location) Compilation.Error!bool { + assert(codepoint >= 0x80); + + const err_start = comp.diag.list.items.len; + if (!CharInfo.isC99IdChar(codepoint)) { try comp.diag.add(.{ .tag = .c99_compat, .loc = loc, }, &.{}); - diagnosed = true; } if (CharInfo.isInvisible(codepoint)) { try comp.diag.add(.{ @@ -202,7 +205,6 @@ fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Loca .loc = loc, .extra = .{ .actual_codepoint = codepoint }, }, &.{}); - diagnosed = true; } if (CharInfo.homoglyph(codepoint)) |resembles| { try comp.diag.add(.{ @@ -210,31 +212,78 @@ fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Loca .loc = loc, .extra = .{ .codepoints = .{ .actual = codepoint, .resembles = resembles } }, }, &.{}); - diagnosed = true; } - return diagnosed; + return comp.diag.list.items.len != err_start; +} + +/// Issues diagnostics for the current extended identifier token +/// Return value indicates whether the token should be considered an identifier +/// true means consider the token to actually be an identifier +/// false means it is not +fn validateExtendedIdentifier(p: *Parser) !bool { + assert(p.tok_ids[p.tok_i] == .extended_identifier); + + const slice = p.tokSlice(p.tok_i); + const view = std.unicode.Utf8View.init(slice) catch { + try p.errTok(.invalid_utf8, p.tok_i); + return error.FatalError; + }; + var it = 
diff --git a/src/Parser.zig b/src/Parser.zig
index 461aef57..011e4f60 100644
--- a/src/Parser.zig
+++ b/src/Parser.zig
@@ -186,15 +186,18 @@ string_ids: struct {
     ucontext_t: StringId,
 },
 
-fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Location) Compilation.Error!bool {
-    if (codepoint <= 0x7F) return false;
-    var diagnosed = false;
+/// Checks a codepoint for various pedantic warnings.
+/// Returns true if a diagnostic was issued.
+fn checkIdentifierCodepointWarnings(comp: *Compilation, codepoint: u21, loc: Source.Location) Compilation.Error!bool {
+    assert(codepoint >= 0x80);
+
+    const err_start = comp.diag.list.items.len;
+
     if (!CharInfo.isC99IdChar(codepoint)) {
         try comp.diag.add(.{
             .tag = .c99_compat,
             .loc = loc,
         }, &.{});
-        diagnosed = true;
     }
     if (CharInfo.isInvisible(codepoint)) {
         try comp.diag.add(.{
@@ -202,7 +205,6 @@ fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Loca
             .loc = loc,
             .extra = .{ .actual_codepoint = codepoint },
         }, &.{});
-        diagnosed = true;
     }
     if (CharInfo.homoglyph(codepoint)) |resembles| {
         try comp.diag.add(.{
@@ -210,31 +212,76 @@ fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Loca
             .loc = loc,
             .extra = .{ .codepoints = .{ .actual = codepoint, .resembles = resembles } },
         }, &.{});
-        diagnosed = true;
     }
-    return diagnosed;
+    return comp.diag.list.items.len != err_start;
 }
 
+/// Issues diagnostics for the current extended identifier token.
+/// Returns true if the token should be treated as an identifier, false if it should not.
+fn validateExtendedIdentifier(p: *Parser) !bool {
+    assert(p.tok_ids[p.tok_i] == .extended_identifier);
+
+    const slice = p.tokSlice(p.tok_i);
+    const view = std.unicode.Utf8View.init(slice) catch {
+        try p.errTok(.invalid_utf8, p.tok_i);
+        return error.FatalError;
+    };
+    var it = view.iterator();
+
+    var valid_identifier = true;
+    var warned = false;
+    var len: usize = 0;
+    var invalid_char: u21 = undefined;
+    var loc = p.pp.tokens.items(.loc)[p.tok_i];
+
+    const standard = p.comp.langopts.standard;
+    while (it.nextCodepoint()) |codepoint| {
+        defer {
+            len += 1;
+            loc.byte_offset += std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
+        }
+        if (codepoint == '$') {
+            warned = true;
+            try p.comp.diag.add(.{
+                .tag = .dollar_in_identifier_extension,
+                .loc = loc,
+            }, &.{});
+        }
+
+        if (codepoint <= 0x7F) continue;
+        if (!valid_identifier) continue;
+
+        const allowed = standard.codepointAllowedInIdentifier(codepoint, len == 0);
+        if (!allowed) {
+            invalid_char = codepoint;
+            valid_identifier = false;
+            continue;
+        }
+
+        if (!warned) {
+            warned = try checkIdentifierCodepointWarnings(p.comp, codepoint, loc);
+        }
+    }
+
+    if (!valid_identifier) {
+        if (len == 1) {
+            try p.errExtra(.unexpected_character, p.tok_i, .{ .actual_codepoint = invalid_char });
+            return false;
+        } else {
+            try p.errExtra(.invalid_identifier_start_char, p.tok_i, .{ .actual_codepoint = invalid_char });
+        }
+    }
+
+    return true;
+}
+
 fn eatIdentifier(p: *Parser) !?TokenIndex {
     switch (p.tok_ids[p.tok_i]) {
         .identifier => {},
         .extended_identifier => {
-            const slice = p.tokSlice(p.tok_i);
-            var it = std.unicode.Utf8View.initUnchecked(slice).iterator();
-            var loc = p.pp.tokens.items(.loc)[p.tok_i];
-
-            if (mem.indexOfScalar(u8, slice, '$')) |i| {
-                loc.byte_offset += @intCast(i);
-                try p.comp.diag.add(.{
-                    .tag = .dollar_in_identifier_extension,
-                    .loc = loc,
-                }, &.{});
-                loc = p.pp.tokens.items(.loc)[p.tok_i];
-            }
-
-            while (it.nextCodepoint()) |c| {
-                if (try checkIdentifierCodepoint(p.comp, c, loc)) break;
-                loc.byte_offset += std.unicode.utf8CodepointSequenceLength(c) catch unreachable;
+            if (!try p.validateExtendedIdentifier()) {
+                p.tok_i += 1;
+                return null;
             }
         },
         else => return null,
diff --git a/src/Preprocessor.zig b/src/Preprocessor.zig
index 95758ae3..227fa7db 100644
--- a/src/Preprocessor.zig
+++ b/src/Preprocessor.zig
@@ -226,14 +226,6 @@ fn findIncludeGuard(pp: *Preprocessor, source: Source) ?[]const u8 {
 }
 
 fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!Token {
-    if (pp.comp.invalid_utf8_locs.get(source.id)) |offset| {
-        try pp.comp.diag.add(.{
-            .tag = .invalid_utf8,
-            // Todo: compute line number
-            .loc = .{ .id = source.id, .byte_offset = offset },
-        }, &.{});
-        return error.FatalError;
-    }
     var guard_name = pp.findIncludeGuard(source);
 
     pp.preprocess_count += 1;
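
The recovery policy at the bottom of validateExtendedIdentifier is easy to miss inside the loop: a token consisting of a single disallowed codepoint is diagnosed as an unexpected character and dropped, while a longer token containing a disallowed codepoint is diagnosed but still handed to the parser as an identifier so parsing can make progress. Distilled into a standalone decision function (a hypothetical model, not part of the patch):

const Recovery = enum { treat_as_identifier, skip_token };

/// Mirrors the final branch of validateExtendedIdentifier.
fn recoveryFor(codepoint_count: usize, has_disallowed_codepoint: bool) Recovery {
    if (!has_disallowed_codepoint) return .treat_as_identifier;
    // A lone disallowed character gets .unexpected_character and is skipped.
    if (codepoint_count == 1) return .skip_token;
    // Longer tokens get .invalid_identifier_start_char but are still treated
    // as identifiers so the parser can continue.
    return .treat_as_identifier;
}
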
diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig
index 28de3c2c..1e9008c7 100644
--- a/src/Tokenizer.zig
+++ b/src/Tokenizer.zig
@@ -3,8 +3,6 @@ const assert = std.debug.assert;
 const Compilation = @import("Compilation.zig");
 const Source = @import("Source.zig");
 const LangOpts = @import("LangOpts.zig");
-const CharInfo = @import("CharInfo.zig");
-const unicode = @import("unicode.zig");
 
 const Tokenizer = @This();
 
@@ -817,24 +815,6 @@ pub const Token = struct {
         };
     }
 
-    /// Check if codepoint may appear in specified context
-    /// does not check basic character set chars because the tokenizer handles them separately to keep the common
-    /// case on the fast path
-    pub fn mayAppearInIdent(comp: *const Compilation, codepoint: u21, where: enum { start, inside }) bool {
-        if (codepoint == '$') return comp.langopts.dollars_in_identifiers;
-        if (codepoint <= 0x7F) return false;
-        return switch (where) {
-            .start => if (comp.langopts.standard.atLeast(.c11))
-                CharInfo.isC11IdChar(codepoint) and !CharInfo.isC11DisallowedInitialIdChar(codepoint)
-            else
-                CharInfo.isC99IdChar(codepoint) and !CharInfo.isC99DisallowedInitialIDChar(codepoint),
-            .inside => if (comp.langopts.standard.atLeast(.c11))
-                CharInfo.isC11IdChar(codepoint)
-            else
-                CharInfo.isC99IdChar(codepoint),
-        };
-    }
-
     const all_kws = std.ComptimeStringMap(Id, .{
         .{ "auto", auto: {
             @setEvalBranchQuota(3000);
@@ -1038,18 +1018,10 @@ pub fn next(self: *Tokenizer) Token {
     var return_state = state;
     var counter: u32 = 0;
 
-    var codepoint_len: u3 = undefined;
+    var codepoint_len: u32 = undefined;
     while (self.index < self.buf.len) : (self.index += codepoint_len) {
-        // Source files get checked for valid utf-8 before being tokenized so it is safe to use
-        // these versions.
-        codepoint_len = unicode.utf8ByteSequenceLength_unsafe(self.buf[self.index]);
-        const c: u21 = switch (codepoint_len) {
-            1 => @as(u21, self.buf[self.index]),
-            2 => unicode.utf8Decode2_unsafe(self.buf[self.index..]),
-            3 => unicode.utf8Decode3_unsafe(self.buf[self.index..]),
-            4 => unicode.utf8Decode4_unsafe(self.buf[self.index..]),
-            else => unreachable,
-        };
+        codepoint_len = 1;
+        const c = self.buf[self.index];
         switch (state) {
             .start => switch (c) {
                 '\n' => {
@@ -1137,13 +1109,19 @@ pub fn next(self: *Tokenizer) Token {
                 '#' => state = .hash,
                 '0'...'9' => state = .pp_num,
                 '\t', '\x0B', '\x0C', ' ' => state = .whitespace,
-                else => if (Token.mayAppearInIdent(self.comp, c, .start)) {
+                '$' => if (self.comp.langopts.dollars_in_identifiers) {
                     state = .extended_identifier;
                 } else {
                     id = .invalid;
                     self.index += codepoint_len;
                     break;
                 },
+                0x80...0xFF => state = .extended_identifier,
+                else => {
+                    id = .invalid;
+                    self.index += codepoint_len;
+                    break;
+                },
             },
             .whitespace => switch (c) {
                 '\t', '\x0B', '\x0C', ' ' => {},
@@ -1311,12 +1289,16 @@ pub fn next(self: *Tokenizer) Token {
             },
             .identifier, .extended_identifier => switch (c) {
                 'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
-                else => {
-                    if (!Token.mayAppearInIdent(self.comp, c, .inside)) {
-                        id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
-                        break;
-                    }
+                '$' => if (self.comp.langopts.dollars_in_identifiers) {
                     state = .extended_identifier;
+                } else {
+                    id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
+                    break;
+                },
+                0x80...0xFF => state = .extended_identifier,
+                else => {
+                    id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
+                    break;
+                },
             },
             .equal => switch (c) {
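
The tokenizer no longer decodes UTF-8 at all: codepoint_len is now always 1, and any byte in 0x80...0xFF optimistically extends an extended_identifier, with the real codepoint-level checks deferred to Parser.validateExtendedIdentifier. Ignoring state bookkeeping, the new byte classification inside an identifier reduces to this sketch (hypothetical helper, not part of the patch):

/// ASCII identifier bytes continue the token, '$' depends on the
/// dollars_in_identifiers option, and every non-ASCII byte is accepted
/// unconditionally, even bytes that cannot occur in valid UTF-8.
fn continuesIdentifier(byte: u8, dollars_in_identifiers: bool) bool {
    return switch (byte) {
        'a'...'z', 'A'...'Z', '_', '0'...'9' => true,
        '$' => dollars_in_identifiers,
        0x80...0xFF => true,
        else => false,
    };
}
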
diff --git a/src/unicode.zig b/src/unicode.zig
deleted file mode 100644
index 3b467f2a..00000000
--- a/src/unicode.zig
+++ /dev/null
@@ -1,41 +0,0 @@
-//! Copied from https://github.com/ziglang/zig/blob/6f0807f50f4e946bb850e746beaa5d6556cf7750/lib/std/unicode.zig
-//! with all safety checks removed. These functions must only be called with known-good buffers that have already
-//! been validated as being legitimate UTF8-encoded data, otherwise undefined behavior will occur.
-
-pub fn utf8ByteSequenceLength_unsafe(first_byte: u8) u3 {
-    return switch (first_byte) {
-        0b0000_0000...0b0111_1111 => 1,
-        0b1100_0000...0b1101_1111 => 2,
-        0b1110_0000...0b1110_1111 => 3,
-        0b1111_0000...0b1111_0111 => 4,
-        else => unreachable,
-    };
-}
-
-pub fn utf8Decode2_unsafe(bytes: []const u8) u21 {
-    var value: u21 = bytes[0] & 0b00011111;
-    value <<= 6;
-    return value | (bytes[1] & 0b00111111);
-}
-
-pub fn utf8Decode3_unsafe(bytes: []const u8) u21 {
-    var value: u21 = bytes[0] & 0b00001111;
-
-    value <<= 6;
-    value |= bytes[1] & 0b00111111;
-
-    value <<= 6;
-    return value | (bytes[2] & 0b00111111);
-}
-
-pub fn utf8Decode4_unsafe(bytes: []const u8) u21 {
-    var value: u21 = bytes[0] & 0b00000111;
-    value <<= 6;
-    value |= bytes[1] & 0b00111111;
-
-    value <<= 6;
-    value |= bytes[2] & 0b00111111;
-
-    value <<= 6;
-    return value | (bytes[3] & 0b00111111);
-}
diff --git a/test/cases/extended identifiers c11.c b/test/cases/extended identifiers c11.c
index c56f0aba..0009b35f 100644
--- a/test/cases/extended identifiers c11.c
+++ b/test/cases/extended identifiers c11.c
@@ -40,7 +40,9 @@ int あああ = "あああ" * 1;
     "extended identifiers c11.c:22:10: warning: identifier contains Unicode character that is invisible in some environments [-Wunicode-homoglyph]" \
     "extended identifiers c11.c:23:14: warning: using this character in an identifier is incompatible with C99 [-Wc99-compat]" \
     "extended identifiers c11.c:23:14: warning: treating Unicode character as identifier character rather than as ')' symbol [-Wunicode-homoglyph]" \
-    "extended identifiers c11.c:29:8: error: expected identifier or '{'" \
-    "extended identifiers c11.c:33:9: warning: declaration does not declare anything [-Wmissing-declaration]" \
-    "extended identifiers c11.c:33:9: error: expected ';', found invalid bytes" \
+    "extended identifiers c11.c:29:8: error: unexpected character " \
+    "extended identifiers c11.c:29:1: warning: declaration does not declare anything [-Wmissing-declaration]" \
+    "extended identifiers c11.c:33:9: error: unexpected character " \
+    "extended identifiers c11.c:33:10: warning: declaration does not declare anything [-Wmissing-declaration]" \
     "extended identifiers c11.c:36:18: error: invalid operands to binary expression ('char *' and 'int')" \
+
diff --git a/test/cases/extended identifiers c99.c b/test/cases/extended identifiers c99.c
index 4ac71fb1..b44dcf05 100644
--- a/test/cases/extended identifiers c99.c
+++ b/test/cases/extended identifiers c99.c
@@ -26,5 +26,7 @@ int Uǿ1 = 0;
 int Lǿ = 0;
 int Lǿ1 = 0;
 
-#define EXPECTED_ERRORS "extended identifiers c99.c:10:9: error: expected identifier or '('" \
-    "extended identifiers c99.c:16:9: error: expected identifier or '('" \
+#define EXPECTED_ERRORS "extended identifiers c99.c:10:9: error: unexpected character " \
+    "extended identifiers c99.c:10:11: error: expected identifier or '('" \
+    "extended identifiers c99.c:16:9: error: character not allowed at the start of an identifier" \
+
diff --git a/test/cases/latin1.c b/test/cases/latin1.c
index d6d67bf0..7bf16fbf 100644
--- a/test/cases/latin1.c
+++ b/test/cases/latin1.c
@@ -2,3 +2,5 @@ int main(void) {
     char * = "ABC";
     return 0;
 }
+
+#define EXPECTED_ERRORS "latin1.c:2:11: error: source file is not valid UTF-8"
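
latin1.c shows where the invalid_utf8 error now surfaces: not from a whole-file scan, but from std.unicode.Utf8View.init failing on the offending token inside validateExtendedIdentifier, which is why the diagnostic now carries an exact line and column. For reference, a minimal demonstration of the failure mode:

const std = @import("std");

test "Utf8View rejects Latin-1 bytes" {
    // 0xD6 is 'Ö' in Latin-1, but as UTF-8 it is a two-byte lead byte with no
    // continuation byte following it, so validation fails.
    try std.testing.expectError(error.InvalidUtf8, std.unicode.Utf8View.init("\xD6"));
}
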
diff --git a/test/runner.zig b/test/runner.zig
index cdd040e0..221433fe 100644
--- a/test/runner.zig
+++ b/test/runner.zig
@@ -67,14 +67,7 @@ fn testOne(allocator: std.mem.Allocator, path: []const u8, self_exe_dir: []const
     _ = try pp.preprocess(builtin_macros);
     _ = try pp.preprocess(user_macros);
 
-    const eof = pp.preprocess(file) catch |err| {
-        if (!std.unicode.utf8ValidateSlice(file.buf)) {
-            if (comp.diag.list.items.len > 0 and comp.diag.list.items[comp.diag.list.items.len - 1].tag == .invalid_utf8) {
-                return;
-            }
-        }
-        return err;
-    };
+    const eof = try pp.preprocess(file);
     try pp.tokens.append(allocator, eof);
 
     var tree = try aro.Parser.parse(&pp);
@@ -217,15 +210,6 @@ pub fn main() !void {
         _ = try pp.preprocess(builtin_macros);
         _ = try pp.preprocess(user_macros);
         const eof = pp.preprocess(file) catch |err| {
-            if (!std.unicode.utf8ValidateSlice(file.buf)) {
-                // non-utf8 files are not preprocessed, so we can't use EXPECTED_ERRORS; instead we
-                // check that the most recent error is .invalid_utf8
-                if (comp.diag.list.items.len > 0 and comp.diag.list.items[comp.diag.list.items.len - 1].tag == .invalid_utf8) {
-                    _ = comp.diag.list.pop();
-                    continue;
-                }
-            }
-
             fail_count += 1;
            progress.log("could not preprocess file '{s}': {s}\n", .{ path, @errorName(err) });
             continue;
@@ -284,7 +268,15 @@ pub fn main() !void {
 
         const expected_types = pp.defines.get("EXPECTED_TYPES");
 
-        var tree = try aro.Parser.parse(&pp);
+        var tree = aro.Parser.parse(&pp) catch |err| switch (err) {
+            error.FatalError => {
+                if (try checkExpectedErrors(&pp, &progress, &buf)) |some| {
+                    if (some) ok_count += 1 else fail_count += 1;
+                }
+                continue;
+            },
+            else => |e| return e,
+        };
         defer tree.deinit();
 
         const ast_path = try std.fs.path.join(gpa, &.{ args[1], "ast", std.fs.path.basename(path) });