Source: don't require UTF-8 encoding for source files
This removes the requirement that source files be UTF-8 encoded
for tokenization and preprocessing. Only the parser cares about
the source encoding, so we defer any checks until then.
ehaas committed Sep 28, 2023
1 parent 5688dbc commit 8c9c4f5
Showing 11 changed files with 152 additions and 159 deletions.
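The practical effect, as a minimal sketch (an illustrative test, not part of this commit, using the `addSourceFromReader` API exercised in the diffs below): a source containing bytes that are not valid UTF-8 now loads without complaint, and the bytes only matter if they later reach the parser inside an extended identifier.

```zig
const std = @import("std");
const Compilation = @import("Compilation.zig");

test "non-UTF-8 bytes survive source loading" {
    var comp = Compilation.init(std.testing.allocator);
    defer comp.deinit();

    // 0xFF can never appear in well-formed UTF-8. Previously its offset was
    // recorded in invalid_utf8_locs and preprocessing failed fatally; now
    // the byte is kept as-is.
    const contents = "// stray byte: \xff\nint x;\n";
    var buf_reader = std.io.fixedBufferStream(contents);
    const source = try comp.addSourceFromReader(buf_reader.reader(), "file.c", @intCast(contents.len));
    try std.testing.expectEqualStrings(contents, source.buf);
}
```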
42 changes: 15 additions & 27 deletions src/Compilation.zig
@@ -120,8 +120,6 @@ types: struct {
int16: Type = .{ .specifier = .invalid },
int64: Type = .{ .specifier = .invalid },
} = .{},
/// Mapping from Source.Id to byte offset of first non-utf8 byte
invalid_utf8_locs: std.AutoHashMapUnmanaged(Source.Id, u32) = .{},
string_interner: StringInterner = .{},

pub fn init(gpa: Allocator) Compilation {
@@ -153,7 +151,6 @@ pub fn deinit(comp: *Compilation) void {
comp.pragma_handlers.deinit();
comp.generated_buf.deinit();
comp.builtins.deinit(comp.gpa);
comp.invalid_utf8_locs.deinit(comp.gpa);
comp.string_interner.deinit(comp.gpa);
}

@@ -949,7 +946,7 @@ pub fn getSource(comp: *const Compilation, id: Source.Id) Source {
}

/// Creates a Source from the contents of `reader` and adds it to the Compilation
/// Performs newline splicing, line-ending normalization to '\n', and UTF-8 validation.
/// Performs newline splicing and line-ending normalization to '\n'
/// caller retains ownership of `path`
/// `expected_size` will be allocated to hold the contents of `reader` and *must* be at least
/// as large as the entire contents of `reader`.
@@ -1092,9 +1089,6 @@ pub fn addSourceFromReader(comp: *Compilation, reader: anytype, path: []const u8
};

try comp.sources.put(duped_path, source);
if (source.offsetOfInvalidUtf8()) |offset| {
try comp.invalid_utf8_locs.putNoClobber(comp.gpa, source_id, offset);
}
return source;
}

@@ -1460,32 +1454,26 @@ test "ignore BOM at beginning of file" {
const BOM = "\xEF\xBB\xBF";

const Test = struct {
fn run(buf: []const u8, input_type: enum { valid_utf8, invalid_utf8 }) !void {
fn run(buf: []const u8) !void {
var comp = Compilation.init(std.testing.allocator);
defer comp.deinit();

var buf_reader = std.io.fixedBufferStream(buf);
const source = try comp.addSourceFromReader(buf_reader.reader(), "file.c", @intCast(buf.len));
switch (input_type) {
.valid_utf8 => {
const expected_output = if (mem.startsWith(u8, buf, BOM)) buf[BOM.len..] else buf;
try std.testing.expectEqualStrings(expected_output, source.buf);
try std.testing.expect(!comp.invalid_utf8_locs.contains(source.id));
},
.invalid_utf8 => try std.testing.expect(comp.invalid_utf8_locs.contains(source.id)),
}
const expected_output = if (mem.startsWith(u8, buf, BOM)) buf[BOM.len..] else buf;
try std.testing.expectEqualStrings(expected_output, source.buf);
}
};

try Test.run(BOM, .valid_utf8);
try Test.run(BOM ++ "x", .valid_utf8);
try Test.run("x" ++ BOM, .valid_utf8);
try Test.run(BOM ++ " ", .valid_utf8);
try Test.run(BOM ++ "\n", .valid_utf8);
try Test.run(BOM ++ "\\", .valid_utf8);

try Test.run(BOM[0..1] ++ "x", .invalid_utf8);
try Test.run(BOM[0..2] ++ "x", .invalid_utf8);
try Test.run(BOM[1..] ++ "x", .invalid_utf8);
try Test.run(BOM[2..] ++ "x", .invalid_utf8);
try Test.run(BOM);
try Test.run(BOM ++ "x");
try Test.run("x" ++ BOM);
try Test.run(BOM ++ " ");
try Test.run(BOM ++ "\n");
try Test.run(BOM ++ "\\");

try Test.run(BOM[0..1] ++ "x");
try Test.run(BOM[0..2] ++ "x");
try Test.run(BOM[1..] ++ "x");
try Test.run(BOM[2..] ++ "x");
}
10 changes: 10 additions & 0 deletions src/Diagnostics.zig
@@ -1474,6 +1474,16 @@ const messages = struct {
const opt = "c99-compat";
const kind = .off;
};
pub const unexpected_character = struct {
const msg = "unexpected character <U+{X:0>4}>";
const extra = .actual_codepoint;
const kind = .@"error";
};
pub const invalid_identifier_start_char = struct {
const msg = "character <U+{X:0>4}> not allowed at the start of an identifier";
const extra = .actual_codepoint;
const kind = .@"error";
};
pub const unicode_zero_width = struct {
const msg = "identifier contains Unicode character <U+{X:0>4}> that is invisible in some environments";
const opt = "unicode-homoglyph";
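The `{X:0>4}` specifier in these messages zero-pads the codepoint to at least four uppercase hex digits, giving the conventional `<U+XXXX>` rendering. A quick sketch of the interpolation (illustrative only, using plain `std.fmt` rather than the diagnostics machinery):

```zig
const std = @import("std");

test "codepoint message formatting" {
    // .actual_codepoint is a u21; "{X:0>4}" prints it as uppercase hex,
    // left-padded with zeros to a minimum width of four.
    var buf: [64]u8 = undefined;
    const rendered = try std.fmt.bufPrint(&buf, "unexpected character <U+{X:0>4}>", .{@as(u21, 0x0300)});
    try std.testing.expectEqualStrings("unexpected character <U+0300>", rendered);
}
```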
15 changes: 15 additions & 0 deletions src/LangOpts.zig
@@ -1,5 +1,6 @@
const std = @import("std");
const DiagnosticTag = @import("Diagnostics.zig").Tag;
const CharInfo = @import("CharInfo.zig");

const LangOpts = @This();

@@ -85,6 +86,20 @@ pub const Standard = enum {
.c2x, .gnu2x => "202311L",
};
}

pub fn codepointAllowedInIdentifier(standard: Standard, codepoint: u21, is_start: bool) bool {
if (is_start) {
return if (standard.atLeast(.c11))
CharInfo.isC11IdChar(codepoint) and !CharInfo.isC11DisallowedInitialIdChar(codepoint)
else
CharInfo.isC99IdChar(codepoint) and !CharInfo.isC99DisallowedInitialIDChar(codepoint);
} else {
return if (standard.atLeast(.c11))
CharInfo.isC11IdChar(codepoint)
else
CharInfo.isC99IdChar(codepoint);
}
}
};

emulate: Compiler = .clang,
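Hypothetical usage of the new helper (not a test from this commit; the concrete codepoints assume `CharInfo`'s tables follow the C standard's Annex D ranges): U+00E9 (é) may appear anywhere in an identifier, while U+0300 (a combining accent) may continue one but not start it.

```zig
const std = @import("std");
const LangOpts = @import("LangOpts.zig");

test "codepointAllowedInIdentifier" {
    const standard: LangOpts.Standard = .c11;
    try std.testing.expect(standard.codepointAllowedInIdentifier(0x00E9, true)); // é can start an identifier
    try std.testing.expect(standard.codepointAllowedInIdentifier(0x0300, false)); // combining char can continue one
    try std.testing.expect(!standard.codepointAllowedInIdentifier(0x0300, true)); // ...but not start one
}
```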
95 changes: 72 additions & 23 deletions src/Parser.zig
@@ -186,55 +186,104 @@ string_ids: struct {
ucontext_t: StringId,
},

fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Location) Compilation.Error!bool {
if (codepoint <= 0x7F) return false;
var diagnosed = false;
/// Checks codepoint for various pedantic warnings
/// Returns true if diagnostic issued
fn checkIdentifierCodepointWarnings(comp: *Compilation, codepoint: u21, loc: Source.Location) Compilation.Error!bool {
assert(codepoint >= 0x80);

const err_start = comp.diag.list.items.len;

if (!CharInfo.isC99IdChar(codepoint)) {
try comp.diag.add(.{
.tag = .c99_compat,
.loc = loc,
}, &.{});
diagnosed = true;
}
if (CharInfo.isInvisible(codepoint)) {
try comp.diag.add(.{
.tag = .unicode_zero_width,
.loc = loc,
.extra = .{ .actual_codepoint = codepoint },
}, &.{});
diagnosed = true;
}
if (CharInfo.homoglyph(codepoint)) |resembles| {
try comp.diag.add(.{
.tag = .unicode_homoglyph,
.loc = loc,
.extra = .{ .codepoints = .{ .actual = codepoint, .resembles = resembles } },
}, &.{});
diagnosed = true;
}
return diagnosed;
return comp.diag.list.items.len != err_start;
}

/// Issues diagnostics for the current extended identifier token
/// Return value indicates whether the token should be considered an identifier
/// true means consider the token to actually be an identifier
/// false means it is not
fn validateExtendedIdentifier(p: *Parser) !bool {
assert(p.tok_ids[p.tok_i] == .extended_identifier);

const slice = p.tokSlice(p.tok_i);
const view = std.unicode.Utf8View.init(slice) catch {
try p.errTok(.invalid_utf8, p.tok_i);
return error.FatalError;
};
var it = view.iterator();

var valid_identifier = true;
var warned = false;
var len: usize = 0;
var invalid_char: u21 = undefined;
var loc = p.pp.tokens.items(.loc)[p.tok_i];

const standard = p.comp.langopts.standard;
while (it.nextCodepoint()) |codepoint| {
defer {
len += 1;
loc.byte_offset += std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
}
if (codepoint == '$') {
warned = true;
try p.comp.diag.add(.{
.tag = .dollar_in_identifier_extension,
.loc = loc,
}, &.{});
}

if (codepoint <= 0x7F) continue;
if (!valid_identifier) continue;

const allowed = standard.codepointAllowedInIdentifier(codepoint, len == 0);
if (!allowed) {
invalid_char = codepoint;
valid_identifier = false;
continue;
}

if (!warned) {
warned = try checkIdentifierCodepointWarnings(p.comp, codepoint, loc);
}
}

if (!valid_identifier) {
if (len == 1) {
try p.errExtra(.unexpected_character, p.tok_i, .{ .actual_codepoint = invalid_char });
return false;
} else {
try p.errExtra(.invalid_identifier_start_char, p.tok_i, .{ .actual_codepoint = invalid_char });
}
}

return true;
}

fn eatIdentifier(p: *Parser) !?TokenIndex {
switch (p.tok_ids[p.tok_i]) {
.identifier => {},
.extended_identifier => {
const slice = p.tokSlice(p.tok_i);
var it = std.unicode.Utf8View.initUnchecked(slice).iterator();
var loc = p.pp.tokens.items(.loc)[p.tok_i];

if (mem.indexOfScalar(u8, slice, '$')) |i| {
loc.byte_offset += @intCast(i);
try p.comp.diag.add(.{
.tag = .dollar_in_identifier_extension,
.loc = loc,
}, &.{});
loc = p.pp.tokens.items(.loc)[p.tok_i];
}

while (it.nextCodepoint()) |c| {
if (try checkIdentifierCodepoint(p.comp, c, loc)) break;
loc.byte_offset += std.unicode.utf8CodepointSequenceLength(c) catch unreachable;
if (!try p.validateExtendedIdentifier()) {
p.tok_i += 1;
return null;
}
},
else => return null,
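The control flow in `validateExtendedIdentifier` has three outcomes worth spelling out: invalid UTF-8 in the token is a fatal error; a token that is a single disallowed codepoint is diagnosed as `unexpected_character` and not treated as an identifier; and a longer token containing a disallowed codepoint gets `invalid_identifier_start_char` but is still treated as an identifier. A condensed sketch of that decision (a hypothetical stand-alone helper, not the parser's actual code; `allowed` stands in for `standard.codepointAllowedInIdentifier`):

```zig
const std = @import("std");

const Outcome = enum { fatal_invalid_utf8, not_an_identifier, identifier };

fn classify(slice: []const u8, comptime allowed: fn (u21, bool) bool) Outcome {
    const view = std.unicode.Utf8View.init(slice) catch return .fatal_invalid_utf8;
    var it = view.iterator();
    var len: usize = 0;
    var valid = true;
    while (it.nextCodepoint()) |codepoint| : (len += 1) {
        if (codepoint > 0x7F and !allowed(codepoint, len == 0)) valid = false;
    }
    if (valid) return .identifier;
    // A lone disallowed codepoint is rejected outright; in a longer token it
    // only produces an error diagnostic and the token is still used.
    return if (len == 1) .not_an_identifier else .identifier;
}

test "classify" {
    const asciiOnly = struct {
        fn allowed(cp: u21, is_start: bool) bool {
            _ = is_start;
            return cp < 0x80;
        }
    }.allowed;
    try std.testing.expectEqual(Outcome.fatal_invalid_utf8, classify("\xff", asciiOnly));
    try std.testing.expectEqual(Outcome.not_an_identifier, classify("é", asciiOnly));
    try std.testing.expectEqual(Outcome.identifier, classify("aé", asciiOnly));
}
```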
8 changes: 0 additions & 8 deletions src/Preprocessor.zig
@@ -226,14 +226,6 @@ fn findIncludeGuard(pp: *Preprocessor, source: Source) ?[]const u8 {
}

fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!Token {
if (pp.comp.invalid_utf8_locs.get(source.id)) |offset| {
try pp.comp.diag.add(.{
.tag = .invalid_utf8,
// Todo: compute line number
.loc = .{ .id = source.id, .byte_offset = offset },
}, &.{});
return error.FatalError;
}
var guard_name = pp.findIncludeGuard(source);

pp.preprocess_count += 1;
56 changes: 19 additions & 37 deletions src/Tokenizer.zig
@@ -3,8 +3,6 @@ const assert = std.debug.assert;
const Compilation = @import("Compilation.zig");
const Source = @import("Source.zig");
const LangOpts = @import("LangOpts.zig");
const CharInfo = @import("CharInfo.zig");
const unicode = @import("unicode.zig");

const Tokenizer = @This();

@@ -817,24 +815,6 @@ pub const Token = struct {
};
}

/// Check if codepoint may appear in specified context
/// does not check basic character set chars because the tokenizer handles them separately to keep the common
/// case on the fast path
pub fn mayAppearInIdent(comp: *const Compilation, codepoint: u21, where: enum { start, inside }) bool {
if (codepoint == '$') return comp.langopts.dollars_in_identifiers;
if (codepoint <= 0x7F) return false;
return switch (where) {
.start => if (comp.langopts.standard.atLeast(.c11))
CharInfo.isC11IdChar(codepoint) and !CharInfo.isC11DisallowedInitialIdChar(codepoint)
else
CharInfo.isC99IdChar(codepoint) and !CharInfo.isC99DisallowedInitialIDChar(codepoint),
.inside => if (comp.langopts.standard.atLeast(.c11))
CharInfo.isC11IdChar(codepoint)
else
CharInfo.isC99IdChar(codepoint),
};
}

const all_kws = std.ComptimeStringMap(Id, .{
.{ "auto", auto: {
@setEvalBranchQuota(3000);
@@ -1038,18 +1018,10 @@ pub fn next(self: *Tokenizer) Token {

var return_state = state;
var counter: u32 = 0;
var codepoint_len: u3 = undefined;
var codepoint_len: u32 = undefined;
while (self.index < self.buf.len) : (self.index += codepoint_len) {
// Source files get checked for valid utf-8 before being tokenized so it is safe to use
// these versions.
codepoint_len = unicode.utf8ByteSequenceLength_unsafe(self.buf[self.index]);
const c: u21 = switch (codepoint_len) {
1 => @as(u21, self.buf[self.index]),
2 => unicode.utf8Decode2_unsafe(self.buf[self.index..]),
3 => unicode.utf8Decode3_unsafe(self.buf[self.index..]),
4 => unicode.utf8Decode4_unsafe(self.buf[self.index..]),
else => unreachable,
};
codepoint_len = 1;
const c = self.buf[self.index];
switch (state) {
.start => switch (c) {
'\n' => {
@@ -1137,13 +1109,19 @@
'#' => state = .hash,
'0'...'9' => state = .pp_num,
'\t', '\x0B', '\x0C', ' ' => state = .whitespace,
else => if (Token.mayAppearInIdent(self.comp, c, .start)) {
'$' => if (self.comp.langopts.dollars_in_identifiers) {
state = .extended_identifier;
} else {
id = .invalid;
self.index += codepoint_len;
break;
},
0x80...0xFF => state = .extended_identifier,
else => {
id = .invalid;
self.index += codepoint_len;
break;
},
},
.whitespace => switch (c) {
'\t', '\x0B', '\x0C', ' ' => {},
@@ -1311,12 +1289,16 @@
},
.identifier, .extended_identifier => switch (c) {
'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
else => {
if (!Token.mayAppearInIdent(self.comp, c, .inside)) {
id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
break;
}
'$' => if (self.comp.langopts.dollars_in_identifiers) {
state = .extended_identifier;
} else {
id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
break;
},
0x80...0xFF => state = .extended_identifier,
else => {
id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
break;
},
},
.equal => switch (c) {
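With the table-driven decoding gone, the tokenizer never decodes codepoints itself: any byte in 0x80...0xFF simply starts or continues an extended identifier. A sketch of the new behavior (not a test from this commit, and assuming `Tokenizer`'s field-literal initialization with `.buf`, `.comp`, and `.source` as used elsewhere in the repo):

```zig
const std = @import("std");
const Compilation = @import("Compilation.zig");
const Tokenizer = @import("Tokenizer.zig");

test "tokenizer no longer decodes UTF-8" {
    var comp = Compilation.init(std.testing.allocator);
    defer comp.deinit();

    // "é" (0xC3 0xA9) followed by a lone 0xFF: every byte is >= 0x80, so the
    // whole run becomes one .extended_identifier token even though it is not
    // valid UTF-8. The parser, not the tokenizer, will reject it.
    const contents = "é\xff";
    var buf_reader = std.io.fixedBufferStream(contents);
    const source = try comp.addSourceFromReader(buf_reader.reader(), "file.c", @intCast(contents.len));

    var tokenizer = Tokenizer{ .buf = source.buf, .comp = &comp, .source = source.id };
    const tok = tokenizer.next();
    try std.testing.expect(tok.id == .extended_identifier);
}
```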