From 3777b2501f560352d911fdd95333b0603df7940d Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 12 Oct 2023 14:59:11 -0700 Subject: [PATCH 01/26] Tokenizer: remove return_state --- src/Tokenizer.zig | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig index 4e37d764..6cfe4656 100644 --- a/src/Tokenizer.zig +++ b/src/Tokenizer.zig @@ -1038,7 +1038,6 @@ pub fn next(self: *Tokenizer) Token { var start = self.index; var id: Token.Id = .eof; - var return_state = state; var counter: u32 = 0; while (self.index < self.buf.len) : (self.index += 1) { const c = self.buf[self.index]; @@ -1219,7 +1218,6 @@ pub fn next(self: *Tokenizer) Token { }, .string_literal => switch (c) { '\\' => { - return_state = .string_literal; state = if (self.path_escapes) .path_escape else .escape_sequence; }, '"' => { @@ -1268,7 +1266,7 @@ pub fn next(self: *Tokenizer) Token { }, .escape_sequence => switch (c) { '\'', '"', '?', '\\', 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v' => { - state = return_state; + state = .string_literal; }, '\r', '\n' => unreachable, // removed by line splicing '0'...'7' => { @@ -1292,24 +1290,24 @@ pub fn next(self: *Tokenizer) Token { .octal_escape => switch (c) { '0'...'7' => { counter += 1; - if (counter == 3) state = return_state; + if (counter == 3) state = .string_literal; }, else => { self.index -= 1; - state = return_state; + state = .string_literal; }, }, .hex_escape => switch (c) { '0'...'9', 'a'...'f', 'A'...'F' => {}, else => { self.index -= 1; - state = return_state; + state = .string_literal; }, }, .unicode_escape => switch (c) { '0'...'9', 'a'...'f', 'A'...'F' => { counter -= 1; - if (counter == 0) state = return_state; + if (counter == 0) state = .string_literal; }, else => { id = .invalid; From e3fb67940efcbcd3378f85ac69005f7d7874f161 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Wed, 11 Oct 2023 21:13:07 -0700 Subject: [PATCH 02/26] Parser: add string literal classification --- src/CharLiteral.zig | 72 ++++++++++++++++++++++++++++++--------------- src/Parser.zig | 48 ++++++++---------------------- 2 files changed, 61 insertions(+), 59 deletions(-) diff --git a/src/CharLiteral.zig b/src/CharLiteral.zig index 7c47ac7f..454f6474 100644 --- a/src/CharLiteral.zig +++ b/src/CharLiteral.zig @@ -19,30 +19,56 @@ const CharDiagnostic = struct { extra: Diagnostics.Message.Extra, }; -pub const Kind = enum { +pub const StringKind = enum { char, wide, utf_8, utf_16, utf_32, - pub fn classify(id: Tokenizer.Token.Id) Kind { + pub fn classify(id: Tokenizer.Token.Id) ?StringKind { return switch (id) { - .char_literal, - .string_literal, - => .char, - .char_literal_utf_8, - .string_literal_utf_8, - => .utf_8, - .char_literal_wide, - .string_literal_wide, - => .wide, - .char_literal_utf_16, - .string_literal_utf_16, - => .utf_16, - .char_literal_utf_32, - .string_literal_utf_32, - => .utf_32, + .string_literal => .char, + .string_literal_utf_8 => .utf_8, + .string_literal_wide => .wide, + .string_literal_utf_16 => .utf_16, + .string_literal_utf_32 => .utf_32, + else => null, + }; + } + + pub fn concat(self: StringKind, other: StringKind) !StringKind { + if (self == other) return self; // can always concat with own kind + if (self == .char) return other; // char + X -> X + if (other == .char) return self; // X + char -> X + return error.CannotConcat; + } + + pub fn charKind(self: StringKind) CharKind { + return switch (self) { + .char => .char, + .wide => .wide, + .utf_8 => .utf_8, + .utf_16 => .utf_16, + .utf_32 => .utf_32, + }; + } +}; + +pub const CharKind = enum { + char, + wide, + utf_8, + utf_16, + utf_32, + + pub fn classify(id: Tokenizer.Token.Id) CharKind { + return switch (id) { + .char_literal => .char, + .char_literal_utf_8 => .utf_8, + .char_literal_wide => .wide, + .char_literal_utf_16 => .utf_16, + .char_literal_utf_32 => .utf_32, else => unreachable, }; } @@ -51,7 +77,7 @@ pub const Kind = enum { /// May be smaller than the largest value that can be represented. /// For example u8 char literals may only specify 0-127 via literals or /// character escapes, but may specify up to \xFF via hex escapes. - pub fn maxCodepoint(kind: Kind, comp: *const Compilation) u21 { + pub fn maxCodepoint(kind: CharKind, comp: *const Compilation) u21 { return @intCast(switch (kind) { .char => std.math.maxInt(u7), .wide => @min(0x10FFFF, comp.types.wchar.maxInt(comp)), @@ -62,7 +88,7 @@ pub const Kind = enum { } /// Largest integer that can be represented by this character kind - pub fn maxInt(kind: Kind, comp: *const Compilation) u32 { + pub fn maxInt(kind: CharKind, comp: *const Compilation) u32 { return @intCast(switch (kind) { .char, .utf_8 => std.math.maxInt(u8), .wide => comp.types.wchar.maxInt(comp), @@ -71,7 +97,7 @@ pub const Kind = enum { }); } - pub fn charLiteralType(kind: Kind, comp: *const Compilation) Type { + pub fn charLiteralType(kind: CharKind, comp: *const Compilation) Type { return switch (kind) { .char => Type.int, .wide => comp.types.wchar, @@ -83,7 +109,7 @@ pub const Kind = enum { /// Return the actual contents of the string literal with leading / trailing quotes and /// specifiers removed - pub fn contentSlice(kind: Kind, delimited: []const u8) []const u8 { + pub fn contentSlice(kind: CharKind, delimited: []const u8) []const u8 { const end = delimited.len - 1; // remove trailing quote return switch (kind) { .char => delimited[1..end], @@ -98,13 +124,13 @@ pub const Kind = enum { pub const Parser = struct { literal: []const u8, i: usize = 0, - kind: Kind, + kind: CharKind, /// We only want to issue a max of 1 error per char literal errored: bool = false, errors: std.BoundedArray(CharDiagnostic, 4) = .{}, comp: *const Compilation, - pub fn init(literal: []const u8, kind: Kind, comp: *const Compilation) Parser { + pub fn init(literal: []const u8, kind: CharKind, comp: *const Compilation) Parser { return .{ .literal = literal, .comp = comp, diff --git a/src/Parser.zig b/src/Parser.zig index 93da1d3e..38f79056 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -7524,41 +7524,17 @@ fn makePredefinedIdentifier(p: *Parser, start: u32) !Result { fn stringLiteral(p: *Parser) Error!Result { var start = p.tok_i; - // use 1 for wchar_t - var width: ?u8 = null; - var is_u8_literal = false; - while (true) { - switch (p.tok_ids[p.tok_i]) { - .string_literal => {}, - .string_literal_utf_16 => if (width) |some| { - if (some != 16) try p.err(.unsupported_str_cat); - } else { - width = 16; - }, - .string_literal_utf_8 => { - is_u8_literal = true; - if (width) |some| { - if (some != 8) try p.err(.unsupported_str_cat); - } else { - width = 8; - } - }, - .string_literal_utf_32 => if (width) |some| { - if (some != 32) try p.err(.unsupported_str_cat); - } else { - width = 32; - }, - .string_literal_wide => if (width) |some| { - if (some != 1) try p.err(.unsupported_str_cat); - } else { - width = 1; - }, - else => break, - } - p.tok_i += 1; + var string_kind = CharLiteral.StringKind.classify(p.tok_ids[start]).?; + p.tok_i += 1; + while (true) : (p.tok_i += 1) { + const next = CharLiteral.StringKind.classify(p.tok_ids[p.tok_i]) orelse break; + string_kind = string_kind.concat(next) catch { + try p.err(.unsupported_str_cat); + while (p.tok_ids[p.tok_i].isStringLiteral()) : (p.tok_i += 1) {} + break; + }; } - if (width == null) width = 8; - if (width.? != 8) return p.todo("unicode string literals"); + if (string_kind != .char and string_kind != .utf_8) return p.todo("unicode string literals"); const string_start = p.retained_strings.items.len; while (start < p.tok_i) : (start += 1) { @@ -7600,7 +7576,7 @@ fn stringLiteral(p: *Parser) Error!Result { const slice = p.retained_strings.items[string_start..]; const arr_ty = try p.arena.create(Type.Array); - const specifier: Type.Specifier = if (is_u8_literal and p.comp.langopts.hasChar8_T()) .uchar else .char; + const specifier: Type.Specifier = if (string_kind == .utf_8 and p.comp.langopts.hasChar8_T()) .uchar else .char; arr_ty.* = .{ .elem = .{ .specifier = specifier }, .len = slice.len }; var res: Result = .{ @@ -7647,7 +7623,7 @@ fn parseUnicodeEscape(p: *Parser, tok: TokenIndex, count: u8, slice: []const u8, fn charLiteral(p: *Parser) Error!Result { defer p.tok_i += 1; const tok_id = p.tok_ids[p.tok_i]; - const char_kind = CharLiteral.Kind.classify(tok_id); + const char_kind = CharLiteral.CharKind.classify(tok_id); var val: u32 = 0; const slice = char_kind.contentSlice(p.tokSlice(p.tok_i)); From b377602fd76348247c80caa9d816645955e52b79 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Fri, 13 Oct 2023 16:50:50 -0700 Subject: [PATCH 03/26] Source: handle invalid utf-8 encoding in lineCol --- src/Source.zig | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Source.zig b/src/Source.zig index c7f401cf..6986f88b 100644 --- a/src/Source.zig +++ b/src/Source.zig @@ -74,7 +74,10 @@ pub fn lineCol(source: Source, loc: Location) LineCol { i += 1; continue; }; - const cp = std.unicode.utf8Decode(source.buf[i..][0..len]) catch unreachable; + const cp = std.unicode.utf8Decode(source.buf[i..][0..len]) catch { + i += 1; + continue; + }; width += codepointWidth(cp); i += len; } From 3e32c035ac726951d05511a3c149811cc718630e Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 12 Oct 2023 19:05:49 -0700 Subject: [PATCH 04/26] Parser: add unicode and wide string literal support --- src/CharLiteral.zig | 49 ++++++++++-- src/Parser.zig | 159 ++++++++++++++++++++------------------ src/Tokenizer.zig | 57 +------------- test/cases/strings.c | 14 ++-- test/cases/wide strings.c | 55 +++++++++++++ 5 files changed, 192 insertions(+), 142 deletions(-) create mode 100644 test/cases/wide strings.c diff --git a/src/CharLiteral.zig b/src/CharLiteral.zig index 454f6474..c8b5f416 100644 --- a/src/CharLiteral.zig +++ b/src/CharLiteral.zig @@ -6,8 +6,10 @@ const Tokenizer = @import("Tokenizer.zig"); const mem = std.mem; pub const Item = union(enum) { - /// decoded escape + /// decoded hex or character escape value: u32, + /// validated unicode codepoint + codepoint: u21, /// Char literal in the source text is not utf8 encoded improperly_encoded: []const u8, /// 1 or more unescaped bytes @@ -53,6 +55,37 @@ pub const StringKind = enum { .utf_32 => .utf_32, }; } + + pub fn contentSlice(kind: StringKind, delimited: []const u8) []const u8 { + const end = delimited.len - 1; // remove trailing quote + return switch (kind) { + .char => delimited[1..end], + .wide => delimited[2..end], + .utf_8 => delimited[3..end], + .utf_16 => delimited[2..end], + .utf_32 => delimited[2..end], + }; + } + + pub fn charUnitSize(kind: StringKind, comp: *const Compilation) usize { + return switch (kind) { + .char => 1, + .wide => @intCast(comp.types.wchar.sizeof(comp).?), + .utf_8 => 1, + .utf_16 => 2, + .utf_32 => 4, + }; + } + + /// Required alignment within aro (on compiler host); not on compilation target + pub fn internalStorageAlignment(kind: StringKind, comp: *const Compilation) usize { + return switch (kind.charUnitSize(comp)) { + 1 => @alignOf(u8), + 2 => @alignOf(u16), + 4 => @alignOf(u32), + else => unreachable, + }; + } }; pub const CharKind = enum { @@ -125,16 +158,18 @@ pub const Parser = struct { literal: []const u8, i: usize = 0, kind: CharKind, + max_codepoint: u21, /// We only want to issue a max of 1 error per char literal errored: bool = false, errors: std.BoundedArray(CharDiagnostic, 4) = .{}, comp: *const Compilation, - pub fn init(literal: []const u8, kind: CharKind, comp: *const Compilation) Parser { + pub fn init(literal: []const u8, kind: CharKind, max_codepoint: u21, comp: *const Compilation) Parser { return .{ .literal = literal, .comp = comp, .kind = kind, + .max_codepoint = max_codepoint, }; } @@ -160,9 +195,9 @@ pub const Parser = struct { const view = std.unicode.Utf8View.init(unescaped_slice) catch { if (self.kind != .char) { self.err(.illegal_char_encoding_error, .{ .none = {} }); - } else { - self.warn(.illegal_char_encoding_warning, .{ .none = {} }); + return null; } + self.warn(.illegal_char_encoding_warning, .{ .none = {} }); return .{ .improperly_encoded = self.literal[start..self.i] }; }; return .{ .utf8_text = view }; @@ -220,8 +255,9 @@ pub const Parser = struct { return null; } - if (val > self.kind.maxCodepoint(self.comp)) { + if (val > self.max_codepoint) { self.err(.char_too_large, .{ .none = {} }); + return null; } if (val < 0xA0 and (val != '$' and val != '@' and val != '`')) { @@ -242,7 +278,7 @@ pub const Parser = struct { } self.warn(.c89_ucn_in_literal, .{ .none = {} }); - return .{ .value = val }; + return .{ .codepoint = @intCast(val) }; } fn parseEscapedChar(self: *Parser) Item { @@ -302,6 +338,7 @@ pub const Parser = struct { } if (overflowed or val > self.kind.maxInt(self.comp)) { self.err(.escape_sequence_overflow, .{ .unsigned = 0 }); + return 0; } if (count == 0) { std.debug.assert(base == .hex); diff --git a/src/Parser.zig b/src/Parser.zig index 38f79056..deefccba 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -7531,95 +7531,104 @@ fn stringLiteral(p: *Parser) Error!Result { string_kind = string_kind.concat(next) catch { try p.err(.unsupported_str_cat); while (p.tok_ids[p.tok_i].isStringLiteral()) : (p.tok_i += 1) {} - break; + return error.ParsingFailed; }; } - if (string_kind != .char and string_kind != .utf_8) return p.todo("unicode string literals"); + const char_width = string_kind.charUnitSize(p.comp); + + const retain_start = mem.alignForward(usize, p.retained_strings.items.len, string_kind.internalStorageAlignment(p.comp)); + try p.retained_strings.resize(retain_start); - const string_start = p.retained_strings.items.len; while (start < p.tok_i) : (start += 1) { - var slice = p.tokSlice(start); - slice = slice[0 .. slice.len - 1]; - var i = mem.indexOf(u8, slice, "\"").? + 1; - try p.retained_strings.ensureUnusedCapacity(slice.len); - while (i < slice.len) : (i += 1) { - switch (slice[i]) { - '\\' => { - i += 1; - switch (slice[i]) { - '\n' => i += 1, - '\r' => i += 2, - '\'', '\"', '\\', '?' => |c| p.retained_strings.appendAssumeCapacity(c), - 'n' => p.retained_strings.appendAssumeCapacity('\n'), - 'r' => p.retained_strings.appendAssumeCapacity('\r'), - 't' => p.retained_strings.appendAssumeCapacity('\t'), - 'a' => p.retained_strings.appendAssumeCapacity(0x07), - 'b' => p.retained_strings.appendAssumeCapacity(0x08), - 'e' => { - try p.errExtra(.non_standard_escape_char, start, .{ .invalid_escape = .{ .char = 'e', .offset = @intCast(i) } }); - p.retained_strings.appendAssumeCapacity(0x1B); - }, - 'f' => p.retained_strings.appendAssumeCapacity(0x0C), - 'v' => p.retained_strings.appendAssumeCapacity(0x0B), - 'x' => p.retained_strings.appendAssumeCapacity(try p.parseNumberEscape(start, 16, slice, &i)), - '0'...'7' => p.retained_strings.appendAssumeCapacity(try p.parseNumberEscape(start, 8, slice, &i)), - 'u' => try p.parseUnicodeEscape(start, 4, slice, &i), - 'U' => try p.parseUnicodeEscape(start, 8, slice, &i), - else => unreachable, - } - }, - else => |c| p.retained_strings.appendAssumeCapacity(c), - } + const this_kind = CharLiteral.StringKind.classify(p.tok_ids[start]).?; + const slice = this_kind.contentSlice(p.tokSlice(start)); + var char_literal_parser = CharLiteral.Parser.init(slice, this_kind.charKind(), 0x10ffff, p.comp); + + try p.retained_strings.ensureUnusedCapacity((slice.len + 1) * char_width); // +1 for null terminator + while (char_literal_parser.next()) |item| switch (item) { + .value => |v| { + switch (char_width) { + 1 => p.retained_strings.appendAssumeCapacity(@intCast(v)), + 2 => { + const word: u16 = @intCast(v); + p.retained_strings.appendSliceAssumeCapacity(mem.asBytes(&word)); + }, + 4 => p.retained_strings.appendSliceAssumeCapacity(mem.asBytes(&v)), + else => unreachable, + } + }, + .codepoint => |c| { + switch (char_width) { + 1 => { + var buf: [4]u8 = undefined; + const written = std.unicode.utf8Encode(c, &buf) catch unreachable; + const encoded = buf[0..written]; + p.retained_strings.appendSliceAssumeCapacity(encoded); + }, + 2 => { + var utf16_buf: [2]u16 = undefined; + var utf8_buf: [4]u8 = undefined; + const utf8_written = std.unicode.utf8Encode(c, &utf8_buf) catch unreachable; + const utf16_written = std.unicode.utf8ToUtf16Le(&utf16_buf, utf8_buf[0..utf8_written]) catch unreachable; + const bytes = std.mem.sliceAsBytes(utf16_buf[0..utf16_written]); + p.retained_strings.appendSliceAssumeCapacity(bytes); + }, + 4 => { + const val: u32 = c; + p.retained_strings.appendSliceAssumeCapacity(mem.asBytes(&val)); + }, + else => unreachable, + } + }, + .improperly_encoded => |bytes| p.retained_strings.appendSliceAssumeCapacity(bytes), + .utf8_text => |view| { + switch (char_width) { + 1 => p.retained_strings.appendSliceAssumeCapacity(view.bytes), + 2 => { + var capacity_slice: []align(@alignOf(u16)) u8 = @alignCast(p.retained_strings.unusedCapacitySlice()); + const dest_len = if (capacity_slice.len % 2 == 0) capacity_slice.len else capacity_slice.len - 1; + var dest = std.mem.bytesAsSlice(u16, capacity_slice[0..dest_len]); + const words_written = std.unicode.utf8ToUtf16Le(dest, view.bytes) catch unreachable; + p.retained_strings.items.len += words_written * 2; + }, + 4 => { + var it = view.iterator(); + while (it.nextCodepoint()) |codepoint| { + const val: u32 = codepoint; + p.retained_strings.appendSliceAssumeCapacity(mem.asBytes(&val)); + } + }, + else => unreachable, + } + }, + }; + for (char_literal_parser.errors.constSlice()) |item| { + try p.errExtra(item.tag, p.tok_i, item.extra); } } - try p.retained_strings.append(0); - const slice = p.retained_strings.items[string_start..]; + p.retained_strings.appendNTimesAssumeCapacity(0, char_width); + const slice = p.retained_strings.items[retain_start..]; const arr_ty = try p.arena.create(Type.Array); - const specifier: Type.Specifier = if (string_kind == .utf_8 and p.comp.langopts.hasChar8_T()) .uchar else .char; + const specifier: Type.Specifier = switch (string_kind) { + .char => .char, + .utf_8 => if (p.comp.langopts.hasChar8_T()) .uchar else .char, + else => string_kind.charKind().charLiteralType(p.comp).specifier, + }; - arr_ty.* = .{ .elem = .{ .specifier = specifier }, .len = slice.len }; + arr_ty.* = .{ .elem = .{ .specifier = specifier }, .len = @divExact(slice.len, char_width) }; var res: Result = .{ .ty = .{ .specifier = .array, .data = .{ .array = arr_ty }, }, - .val = Value.bytes(@intCast(string_start), @intCast(p.retained_strings.items.len)), + .val = Value.bytes(@intCast(retain_start), @intCast(p.retained_strings.items.len)), }; res.node = try p.addNode(.{ .tag = .string_literal_expr, .ty = res.ty, .data = undefined }); if (!p.in_macro) try p.value_map.put(res.node, res.val); return res; } -fn parseNumberEscape(p: *Parser, tok: TokenIndex, base: u8, slice: []const u8, i: *usize) !u8 { - if (base == 16) i.* += 1; // skip x - var char: u8 = 0; - var reported = false; - while (i.* < slice.len) : (i.* += 1) { - const val = std.fmt.charToDigit(slice[i.*], base) catch break; // validated by Tokenizer - const product, const overflowed = @mulWithOverflow(char, base); - if (overflowed != 0 and !reported) { - try p.errExtra(.escape_sequence_overflow, tok, .{ .unsigned = i.* }); - reported = true; - } - char = product + val; - } - i.* -= 1; - return char; -} - -fn parseUnicodeEscape(p: *Parser, tok: TokenIndex, count: u8, slice: []const u8, i: *usize) !void { - const c = std.fmt.parseInt(u21, slice[i.* + 1 ..][0..count], 16) catch 0x110000; // count validated by tokenizer - i.* += count + 1; - if (!std.unicode.utf8ValidCodepoint(c) or (c < 0xa0 and c != '$' and c != '@' and c != '`')) { - try p.errExtra(.invalid_universal_character, tok, .{ .unsigned = i.* - count - 2 }); - return; - } - var buf: [4]u8 = undefined; - const to_write = std.unicode.utf8Encode(c, &buf) catch unreachable; // validated above - p.retained_strings.appendSliceAssumeCapacity(buf[0..to_write]); -} - fn charLiteral(p: *Parser) Error!Result { defer p.tok_i += 1; const tok_id = p.tok_ids[p.tok_i]; @@ -7632,7 +7641,8 @@ fn charLiteral(p: *Parser) Error!Result { // fast path: single unescaped ASCII char val = slice[0]; } else { - var char_literal_parser = CharLiteral.Parser.init(slice, char_kind, p.comp); + const max_codepoint = char_kind.maxCodepoint(p.comp); + var char_literal_parser = CharLiteral.Parser.init(slice, char_kind, max_codepoint, p.comp); const max_chars_expected = 4; var stack_fallback = std.heap.stackFallback(max_chars_expected * @sizeOf(u32), p.comp.gpa); @@ -7640,20 +7650,21 @@ fn charLiteral(p: *Parser) Error!Result { defer chars.deinit(); while (char_literal_parser.next()) |item| switch (item) { - .value => |c| try chars.append(c), + .value => |v| try chars.append(v), + .codepoint => |c| try chars.append(c), .improperly_encoded => |s| { try chars.ensureUnusedCapacity(s.len); for (s) |c| chars.appendAssumeCapacity(c); }, .utf8_text => |view| { var it = view.iterator(); - var max_codepoint: u21 = 0; + var max_codepoint_seen: u21 = 0; try chars.ensureUnusedCapacity(view.bytes.len); while (it.nextCodepoint()) |c| { - max_codepoint = @max(max_codepoint, c); + max_codepoint_seen = @max(max_codepoint_seen, c); chars.appendAssumeCapacity(c); } - if (max_codepoint > char_kind.maxCodepoint(p.comp)) { + if (max_codepoint_seen > max_codepoint) { char_literal_parser.err(.char_too_large, .{ .none = {} }); } }, diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig index 6cfe4656..5fbf3a61 100644 --- a/src/Tokenizer.zig +++ b/src/Tokenizer.zig @@ -1001,9 +1001,6 @@ pub fn next(self: *Tokenizer) Token { char_literal, char_escape_sequence, escape_sequence, - octal_escape, - hex_escape, - unicode_escape, identifier, extended_identifier, equal, @@ -1038,7 +1035,6 @@ pub fn next(self: *Tokenizer) Token { var start = self.index; var id: Token.Id = .eof; - var counter: u32 = 0; while (self.index < self.buf.len) : (self.index += 1) { const c = self.buf[self.index]; switch (state) { @@ -1265,54 +1261,8 @@ pub fn next(self: *Tokenizer) Token { else => state = .char_literal, }, .escape_sequence => switch (c) { - '\'', '"', '?', '\\', 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v' => { - state = .string_literal; - }, '\r', '\n' => unreachable, // removed by line splicing - '0'...'7' => { - counter = 1; - state = .octal_escape; - }, - 'x' => state = .hex_escape, - 'u' => { - counter = 4; - state = .unicode_escape; - }, - 'U' => { - counter = 8; - state = .unicode_escape; - }, - else => { - id = .invalid; - break; - }, - }, - .octal_escape => switch (c) { - '0'...'7' => { - counter += 1; - if (counter == 3) state = .string_literal; - }, - else => { - self.index -= 1; - state = .string_literal; - }, - }, - .hex_escape => switch (c) { - '0'...'9', 'a'...'f', 'A'...'F' => {}, - else => { - self.index -= 1; - state = .string_literal; - }, - }, - .unicode_escape => switch (c) { - '0'...'9', 'a'...'f', 'A'...'F' => { - counter -= 1; - if (counter == 0) state = .string_literal; - }, - else => { - id = .invalid; - break; - }, + else => state = .string_literal, }, .identifier, .extended_identifier => switch (c) { 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, @@ -1737,9 +1687,6 @@ pub fn next(self: *Tokenizer) Token { .char_literal, .escape_sequence, .char_escape_sequence, - .octal_escape, - .hex_escape, - .unicode_escape, .multi_line_comment, .multi_line_comment_asterisk, => id = .invalid, @@ -2112,7 +2059,7 @@ test "extended identifiers" { try expectTokens("0x0\u{E0000}", &.{ .pp_num, .extended_identifier }); try expectTokens("\"\\0\u{E0000}\"", &.{.string_literal}); try expectTokens("\"\\x\u{E0000}\"", &.{.string_literal}); - try expectTokens("\"\\u\u{E0000}\"", &.{ .invalid, .extended_identifier, .invalid }); + try expectTokens("\"\\u\u{E0000}\"", &.{.string_literal}); try expectTokens("1e\u{E0000}", &.{ .pp_num, .extended_identifier }); try expectTokens("1e1\u{E0000}", &.{ .pp_num, .extended_identifier }); } diff --git a/test/cases/strings.c b/test/cases/strings.c index 402f4633..9bcda7e6 100644 --- a/test/cases/strings.c +++ b/test/cases/strings.c @@ -15,12 +15,12 @@ _Static_assert(1, "\u0060"); _Static_assert(1, "aaァ\e[1;"); #pragma GCC diagnostic pop -#define EXPECTED_ERRORS "strings.c:2:30: error: escape sequence out of range" \ - "strings.c:4:20: error: invalid universal character" \ - "strings.c:5:20: error: invalid universal character" \ +#define EXPECTED_ERRORS "strings.c:2:36: error: escape sequence out of range" \ + "strings.c:4:31: error: invalid universal character" \ + "strings.c:5:27: error: character 'b' cannot be specified by a universal character name" \ "strings.c:7:9: warning: multi-character character constant [-Wmultichar]" \ "strings.c:7:9: warning: character constant too long for its type" \ - "strings.c:9:20: error: invalid universal character" \ - "strings.c:10:20: error: invalid universal character" \ - "strings.c:11:20: error: invalid universal character" \ - "strings.c:15:24: warning: use of non-standard escape character '\\e' [-Wpedantic]" \ + "strings.c:9:27: error: invalid universal character" \ + "strings.c:10:31: error: invalid universal character" \ + "strings.c:11:31: error: invalid universal character" \ + "strings.c:15:4: warning: use of non-standard escape character '\\e' [-Wpedantic]" \ diff --git a/test/cases/wide strings.c b/test/cases/wide strings.c new file mode 100644 index 00000000..c483fe32 --- /dev/null +++ b/test/cases/wide strings.c @@ -0,0 +1,55 @@ +//aro-args -std=c2x +#include +typedef __WCHAR_TYPE__ wchar_t; + +uint8_t b[] = u8""; +_Static_assert(sizeof(b) == sizeof(uint8_t[1])); +char c[] = ""; +_Static_assert(sizeof(c) == 1); +wchar_t d[] = L""; +_Static_assert(sizeof(d) == sizeof(wchar_t[1])); +uint16_t e[] = u""; +_Static_assert(sizeof(e) == sizeof(uint16_t[1])); +uint32_t f[] = U""; +_Static_assert(sizeof(f) == sizeof(uint32_t[1])); + +uint16_t A[] = u"abc"; +_Static_assert(sizeof(A) == 8); + +uint32_t B[] = U"ABC"; +_Static_assert(sizeof(B) == 16); + +wchar_t C[] = L"ABC"; +_Static_assert(sizeof(C) == sizeof(wchar_t) * 4); + +uint16_t D[] = u"a" U"b"; + +uint16_t E[] = u"a" u"bc"; +_Static_assert(sizeof(E) == 8); + +uint32_t F[] = U"A" "BC"; +_Static_assert(sizeof(F) == 16); + +uint16_t G[] = u"🤗"; +_Static_assert(sizeof(G) == 6); + +uint16_t H[] = u"\U0001F917"; +_Static_assert(sizeof(H) == 6); + +uint32_t I[] = U"🤗"; +_Static_assert(sizeof(I) == 8); + +uint8_t J[] = u8"🤗"; +_Static_assert(sizeof(J) == 5); + +uint8_t K[] = u8"\U0001F917"; +_Static_assert(sizeof(K) == 5); + +uint16_t L[] = u"\xFFFFF"; + +uint8_t M[] = u8"\xFFF"; + +#define EXPECTED_ERRORS "wide strings.c:25:21: error: unsupported string literal concatenation" \ + "wide strings.c:48:26: error: escape sequence out of range" \ + "wide strings.c:50:24: error: escape sequence out of range" \ + From ae473a33bb898c12f603469523b700b8bed3d033 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Mon, 16 Oct 2023 09:01:40 -0700 Subject: [PATCH 05/26] Tokenizer: rename escape_sequence to string_escape_sequence --- src/Tokenizer.zig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig index 5fbf3a61..efaa60f2 100644 --- a/src/Tokenizer.zig +++ b/src/Tokenizer.zig @@ -1000,7 +1000,7 @@ pub fn next(self: *Tokenizer) Token { char_literal_start, char_literal, char_escape_sequence, - escape_sequence, + string_escape_sequence, identifier, extended_identifier, equal, @@ -1214,7 +1214,7 @@ pub fn next(self: *Tokenizer) Token { }, .string_literal => switch (c) { '\\' => { - state = if (self.path_escapes) .path_escape else .escape_sequence; + state = if (self.path_escapes) .path_escape else .string_escape_sequence; }, '"' => { self.index += 1; @@ -1260,7 +1260,7 @@ pub fn next(self: *Tokenizer) Token { '\r', '\n' => unreachable, // removed by line splicing else => state = .char_literal, }, - .escape_sequence => switch (c) { + .string_escape_sequence => switch (c) { '\r', '\n' => unreachable, // removed by line splicing else => state = .string_literal, }, @@ -1685,7 +1685,7 @@ pub fn next(self: *Tokenizer) Token { .path_escape, .char_literal_start, .char_literal, - .escape_sequence, + .string_escape_sequence, .char_escape_sequence, .multi_line_comment, .multi_line_comment_asterisk, From 38f771dbc385041d93ae0b789b6aafa4f32f599b Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Mon, 16 Oct 2023 09:29:06 -0700 Subject: [PATCH 06/26] CharLiteral: make char unit size an enum --- src/CharLiteral.zig | 23 ++++++++++++----------- src/Compilation.zig | 14 ++++++++++++++ src/Parser.zig | 27 ++++++++++++--------------- 3 files changed, 38 insertions(+), 26 deletions(-) diff --git a/src/CharLiteral.zig b/src/CharLiteral.zig index c8b5f416..69c73046 100644 --- a/src/CharLiteral.zig +++ b/src/CharLiteral.zig @@ -67,23 +67,24 @@ pub const StringKind = enum { }; } - pub fn charUnitSize(kind: StringKind, comp: *const Compilation) usize { + pub fn charUnitSize(kind: StringKind, comp: *const Compilation) Compilation.CharUnitSize { return switch (kind) { - .char => 1, - .wide => @intCast(comp.types.wchar.sizeof(comp).?), - .utf_8 => 1, - .utf_16 => 2, - .utf_32 => 4, + .char => .@"1", + .wide => switch (comp.types.wchar.sizeof(comp).?) { + 2 => .@"2", + 4 => .@"4", + else => unreachable, + }, + .utf_8 => .@"1", + .utf_16 => .@"2", + .utf_32 => .@"4", }; } - /// Required alignment within aro (on compiler host); not on compilation target + /// Required alignment within aro (on compiler host) for writing to retained_strings pub fn internalStorageAlignment(kind: StringKind, comp: *const Compilation) usize { return switch (kind.charUnitSize(comp)) { - 1 => @alignOf(u8), - 2 => @alignOf(u16), - 4 => @alignOf(u32), - else => unreachable, + inline else => |size| @alignOf(size.Type()), }; } }; diff --git a/src/Compilation.zig b/src/Compilation.zig index c0c3126a..cd79c994 100644 --- a/src/Compilation.zig +++ b/src/Compilation.zig @@ -1428,6 +1428,20 @@ pub fn hasBuiltinFunction(comp: *const Compilation, builtin: Builtin) bool { } } +pub const CharUnitSize = enum(usize) { + @"1" = 1, + @"2" = 2, + @"4" = 4, + + pub fn Type(comptime self: CharUnitSize) type { + return switch (self) { + .@"1" => u8, + .@"2" => u16, + .@"4" => u32, + }; + } +}; + pub const renderErrors = Diagnostics.render; test "addSourceFromReader" { diff --git a/src/Parser.zig b/src/Parser.zig index deefccba..a04bce6b 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -7544,28 +7544,27 @@ fn stringLiteral(p: *Parser) Error!Result { const slice = this_kind.contentSlice(p.tokSlice(start)); var char_literal_parser = CharLiteral.Parser.init(slice, this_kind.charKind(), 0x10ffff, p.comp); - try p.retained_strings.ensureUnusedCapacity((slice.len + 1) * char_width); // +1 for null terminator + try p.retained_strings.ensureUnusedCapacity((slice.len + 1) * @intFromEnum(char_width)); // +1 for null terminator while (char_literal_parser.next()) |item| switch (item) { .value => |v| { switch (char_width) { - 1 => p.retained_strings.appendAssumeCapacity(@intCast(v)), - 2 => { + .@"1" => p.retained_strings.appendAssumeCapacity(@intCast(v)), + .@"2" => { const word: u16 = @intCast(v); p.retained_strings.appendSliceAssumeCapacity(mem.asBytes(&word)); }, - 4 => p.retained_strings.appendSliceAssumeCapacity(mem.asBytes(&v)), - else => unreachable, + .@"4" => p.retained_strings.appendSliceAssumeCapacity(mem.asBytes(&v)), } }, .codepoint => |c| { switch (char_width) { - 1 => { + .@"1" => { var buf: [4]u8 = undefined; const written = std.unicode.utf8Encode(c, &buf) catch unreachable; const encoded = buf[0..written]; p.retained_strings.appendSliceAssumeCapacity(encoded); }, - 2 => { + .@"2" => { var utf16_buf: [2]u16 = undefined; var utf8_buf: [4]u8 = undefined; const utf8_written = std.unicode.utf8Encode(c, &utf8_buf) catch unreachable; @@ -7573,32 +7572,30 @@ fn stringLiteral(p: *Parser) Error!Result { const bytes = std.mem.sliceAsBytes(utf16_buf[0..utf16_written]); p.retained_strings.appendSliceAssumeCapacity(bytes); }, - 4 => { + .@"4" => { const val: u32 = c; p.retained_strings.appendSliceAssumeCapacity(mem.asBytes(&val)); }, - else => unreachable, } }, .improperly_encoded => |bytes| p.retained_strings.appendSliceAssumeCapacity(bytes), .utf8_text => |view| { switch (char_width) { - 1 => p.retained_strings.appendSliceAssumeCapacity(view.bytes), - 2 => { + .@"1" => p.retained_strings.appendSliceAssumeCapacity(view.bytes), + .@"2" => { var capacity_slice: []align(@alignOf(u16)) u8 = @alignCast(p.retained_strings.unusedCapacitySlice()); const dest_len = if (capacity_slice.len % 2 == 0) capacity_slice.len else capacity_slice.len - 1; var dest = std.mem.bytesAsSlice(u16, capacity_slice[0..dest_len]); const words_written = std.unicode.utf8ToUtf16Le(dest, view.bytes) catch unreachable; p.retained_strings.items.len += words_written * 2; }, - 4 => { + .@"4" => { var it = view.iterator(); while (it.nextCodepoint()) |codepoint| { const val: u32 = codepoint; p.retained_strings.appendSliceAssumeCapacity(mem.asBytes(&val)); } }, - else => unreachable, } }, }; @@ -7606,7 +7603,7 @@ fn stringLiteral(p: *Parser) Error!Result { try p.errExtra(item.tag, p.tok_i, item.extra); } } - p.retained_strings.appendNTimesAssumeCapacity(0, char_width); + p.retained_strings.appendNTimesAssumeCapacity(0, @intFromEnum(char_width)); const slice = p.retained_strings.items[retain_start..]; const arr_ty = try p.arena.create(Type.Array); @@ -7616,7 +7613,7 @@ fn stringLiteral(p: *Parser) Error!Result { else => string_kind.charKind().charLiteralType(p.comp).specifier, }; - arr_ty.* = .{ .elem = .{ .specifier = specifier }, .len = @divExact(slice.len, char_width) }; + arr_ty.* = .{ .elem = .{ .specifier = specifier }, .len = @divExact(slice.len, @intFromEnum(char_width)) }; var res: Result = .{ .ty = .{ .specifier = .array, From 8375ac8639bf79a0d998f46378076f7a42023d54 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Mon, 16 Oct 2023 09:34:12 -0700 Subject: [PATCH 07/26] CharLiteral: remove duplication in contentSlice --- src/CharLiteral.zig | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/CharLiteral.zig b/src/CharLiteral.zig index 69c73046..c70cc3fd 100644 --- a/src/CharLiteral.zig +++ b/src/CharLiteral.zig @@ -21,6 +21,18 @@ const CharDiagnostic = struct { extra: Diagnostics.Message.Extra, }; +/// This assumes StringKind and CharKind have the same enum tags and C-source prefix +fn contentSliceInternal(comptime T: type, kind: T, delimited: []const u8) []const u8 { + const end = delimited.len - 1; // remove trailing quote + return switch (kind) { + .char => delimited[1..end], + .wide => delimited[2..end], + .utf_8 => delimited[3..end], + .utf_16 => delimited[2..end], + .utf_32 => delimited[2..end], + }; +} + pub const StringKind = enum { char, wide, @@ -57,14 +69,7 @@ pub const StringKind = enum { } pub fn contentSlice(kind: StringKind, delimited: []const u8) []const u8 { - const end = delimited.len - 1; // remove trailing quote - return switch (kind) { - .char => delimited[1..end], - .wide => delimited[2..end], - .utf_8 => delimited[3..end], - .utf_16 => delimited[2..end], - .utf_32 => delimited[2..end], - }; + return contentSliceInternal(StringKind, kind, delimited); } pub fn charUnitSize(kind: StringKind, comp: *const Compilation) Compilation.CharUnitSize { @@ -144,14 +149,7 @@ pub const CharKind = enum { /// Return the actual contents of the string literal with leading / trailing quotes and /// specifiers removed pub fn contentSlice(kind: CharKind, delimited: []const u8) []const u8 { - const end = delimited.len - 1; // remove trailing quote - return switch (kind) { - .char => delimited[1..end], - .wide => delimited[2..end], - .utf_8 => delimited[3..end], - .utf_16 => delimited[2..end], - .utf_32 => delimited[2..end], - }; + return contentSliceInternal(CharKind, kind, delimited); } }; From 149463d5530f5122a032a24658dbadb3cf50d8e1 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Mon, 16 Oct 2023 23:52:49 -0700 Subject: [PATCH 08/26] Value: support all string types when writing / dumping strings --- src/Attribute.zig | 9 ++++++++- src/Compilation.zig | 2 +- src/Diagnostics.zig | 5 +++++ src/Ir.zig | 2 +- src/Parser.zig | 14 +++++--------- src/Tree.zig | 15 ++------------- src/Value.zig | 38 +++++++++++++++++++++++++++++++++++--- src/codegen/x86_64.zig | 2 +- test/cases/wide strings.c | 5 +++++ 9 files changed, 63 insertions(+), 29 deletions(-) diff --git a/src/Attribute.zig b/src/Attribute.zig index 966d240c..f45067df 100644 --- a/src/Attribute.zig +++ b/src/Attribute.zig @@ -263,10 +263,17 @@ fn diagnoseField( .bytes => { const bytes = val.data.bytes.trim(1); // remove null terminator if (wanted == Value.ByteRange) { + std.debug.assert(node.tag == .string_literal_expr); + if (!node.ty.elemType().is(.char) and !node.ty.elemType().is(.uchar)) { + return Diagnostics.Message{ + .tag = .attribute_requires_string, + .extra = .{ .str = decl.name }, + }; + } @field(@field(arguments, decl.name), field.name) = bytes; return null; } else if (@typeInfo(wanted) == .Enum and @hasDecl(wanted, "opts") and wanted.opts.enum_kind == .string) { - const str = bytes.slice(strings); + const str = bytes.slice(strings, .@"1"); if (std.meta.stringToEnum(wanted, str)) |enum_val| { @field(@field(arguments, decl.name), field.name) = enum_val; return null; diff --git a/src/Compilation.zig b/src/Compilation.zig index cd79c994..0f053c4b 100644 --- a/src/Compilation.zig +++ b/src/Compilation.zig @@ -1428,7 +1428,7 @@ pub fn hasBuiltinFunction(comp: *const Compilation, builtin: Builtin) bool { } } -pub const CharUnitSize = enum(usize) { +pub const CharUnitSize = enum(u32) { @"1" = 1, @"2" = 2, @"4" = 4, diff --git a/src/Diagnostics.zig b/src/Diagnostics.zig index 7c8ae4fd..52e096f0 100644 --- a/src/Diagnostics.zig +++ b/src/Diagnostics.zig @@ -2510,6 +2510,11 @@ const messages = struct { const opt = "unknown-escape-sequence"; const extra = .invalid_escape; }; + pub const attribute_requires_string = struct { + const msg = "attribute '{s}' requires an ordinary string"; + const kind = .@"error"; + const extra = .str; + }; }; list: std.ArrayListUnmanaged(Message) = .{}, diff --git a/src/Ir.zig b/src/Ir.zig index 43739252..4c45f78b 100644 --- a/src/Ir.zig +++ b/src/Ir.zig @@ -552,7 +552,7 @@ fn writeValue(ir: Ir, val_ref: Interner.Ref, color: bool, w: anytype) !void { switch (v.tag) { .unavailable => try w.writeAll(" unavailable"), .int => try w.print("{d}", .{v.data.int}), - .bytes => try w.print("\"{s}\"", .{v.data.bytes.slice(ir.strings)}), + .bytes => try w.print("\"{s}\"", .{v.data.bytes.slice(ir.strings, .@"1")}), // std.fmt does @as instead of @floatCast .float => try w.print("{d}", .{@as(f64, @floatCast(v.data.float))}), else => try w.print("({s})", .{@tagName(v.tag)}), diff --git a/src/Parser.zig b/src/Parser.zig index a04bce6b..7c1e8678 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -495,7 +495,7 @@ fn checkDeprecatedUnavailable(p: *Parser, ty: Type, usage_tok: TokenIndex, decl_ /// Returned slice is invalidated if additional strings are added to p.retained_strings fn retainedString(p: *Parser, range: Value.ByteRange) []const u8 { - return range.slice(p.retained_strings.items); + return range.slice(p.retained_strings.items, .@"1"); } fn errDeprecated(p: *Parser, tag: Diagnostics.Tag, tok_i: TokenIndex, msg: ?Value.ByteRange) Compilation.Error!void { @@ -1153,17 +1153,13 @@ fn staticAssertMessage(p: *Parser, cond_node: NodeIndex, message: Result) !?[]co try buf.appendSlice(")'"); } if (message.node != .none) { + assert(p.nodes.items(.tag)[@intFromEnum(message.node)] == .string_literal_expr); if (buf.items.len > 0) { try buf.append(' '); } - const data = message.val.data.bytes; - try buf.ensureUnusedCapacity(data.len()); - try Tree.dumpStr( - p.retained_strings.items, - data, - p.nodes.items(.tag)[@intFromEnum(message.node)], - buf.writer(), - ); + const byte_range = message.val.data.bytes; + try buf.ensureUnusedCapacity(byte_range.len()); + try byte_range.dumpString(message.ty, p.comp, p.retained_strings.items, buf.writer()); } return try p.comp.diag.arena.allocator().dupe(u8, buf.items); } diff --git a/src/Tree.zig b/src/Tree.zig index 5c25ec66..f6335014 100644 --- a/src/Tree.zig +++ b/src/Tree.zig @@ -656,17 +656,6 @@ pub fn isLvalExtra(nodes: Node.List.Slice, extra: []const NodeIndex, value_map: } } -pub fn dumpStr(retained_strings: []const u8, range: Value.ByteRange, tag: Tag, writer: anytype) !void { - switch (tag) { - .string_literal_expr => { - const lit_range = range.trim(1); // remove null-terminator - const str = lit_range.slice(retained_strings); - try writer.print("\"{}\"", .{std.zig.fmtEscapes(str)}); - }, - else => unreachable, - } -} - pub fn tokSlice(tree: Tree, tok_i: TokenIndex) []const u8 { if (tree.tokens.items(.id)[tok_i].lexeme()) |some| return some; const loc = tree.tokens.items(.loc)[tok_i]; @@ -716,8 +705,8 @@ fn dumpAttribute(attr: Attribute, strings: []const u8, writer: anytype) !void { try writer.writeAll(f.name); try writer.writeAll(": "); switch (f.type) { - Value.ByteRange => try writer.print("\"{s}\"", .{@field(args, f.name).slice(strings)}), - ?Value.ByteRange => try writer.print("\"{?s}\"", .{if (@field(args, f.name)) |range| range.slice(strings) else null}), + Value.ByteRange => try writer.print("\"{s}\"", .{@field(args, f.name).slice(strings, .@"1")}), + ?Value.ByteRange => try writer.print("\"{?s}\"", .{if (@field(args, f.name)) |range| range.slice(strings, .@"1") else null}), else => switch (@typeInfo(f.type)) { .Enum => try writer.writeAll(@tagName(@field(args, f.name))), else => try writer.print("{any}", .{@field(args, f.name)}), diff --git a/src/Value.zig b/src/Value.zig index 58a058c3..1577db93 100644 --- a/src/Value.zig +++ b/src/Value.zig @@ -18,8 +18,40 @@ pub const ByteRange = struct { return .{ .start = self.start, .end = self.end - amount }; } - pub fn slice(self: ByteRange, all_bytes: []const u8) []const u8 { - return all_bytes[self.start..self.end]; + pub fn slice(self: ByteRange, all_bytes: []const u8, comptime size: Compilation.CharUnitSize) []const size.Type() { + switch (size) { + inline else => |sz| { + const aligned: []align(@alignOf(sz.Type())) const u8 = @alignCast(all_bytes[self.start..self.end]); + return std.mem.bytesAsSlice(sz.Type(), aligned); + }, + } + } + + pub fn dumpString(range: ByteRange, ty: Type, comp: *const Compilation, strings: []const u8, w: anytype) !void { + const size: Compilation.CharUnitSize = @enumFromInt(ty.elemType().sizeof(comp).?); + const without_null = range.trim(@intFromEnum(size)); + switch (size) { + inline .@"1", .@"2" => |sz| { + const data_slice = without_null.slice(strings, sz); + const formatter = if (sz == .@"1") std.zig.fmtEscapes(data_slice) else std.unicode.fmtUtf16le(data_slice); + try w.print("\"{}\"", .{formatter}); + }, + .@"4" => { + try w.writeByte('"'); + const data_slice = without_null.slice(strings, .@"4"); + var buf: [4]u8 = undefined; + for (data_slice) |item| { + if (item <= std.math.maxInt(u21) and std.unicode.utf8ValidCodepoint(@intCast(item))) { + const codepoint: u21 = @intCast(item); + const written = std.unicode.utf8Encode(codepoint, &buf) catch unreachable; + try w.print("{s}", .{buf[0..written]}); + } else { + try w.print("\\x{x}", .{item}); + } + } + try w.writeByte('"'); + }, + } } }; @@ -593,7 +625,7 @@ pub fn dump(v: Value, ty: Type, comp: *Compilation, strings: []const u8, w: anyt } else { try w.print("{d}", .{v.signExtend(ty, comp)}); }, - .bytes => try w.print("\"{s}\"", .{v.data.bytes.slice(strings)}), + .bytes => try v.data.bytes.dumpString(ty, comp, strings, w), // std.fmt does @as instead of @floatCast .float => try w.print("{d}", .{@as(f64, @floatCast(v.data.float))}), else => try w.print("({s})", .{@tagName(v.tag)}), diff --git a/src/codegen/x86_64.zig b/src/codegen/x86_64.zig index bc8b43df..aa96b4df 100644 --- a/src/codegen/x86_64.zig +++ b/src/codegen/x86_64.zig @@ -177,7 +177,7 @@ fn genNode(func: *Fn, node: NodeIndex) Codegen.Error!Value { .int_literal => return Value{ .immediate = @bitCast(data.int) }, .string_literal_expr => { const range = func.c.tree.value_map.get(node).?.data.bytes; - const str_bytes = range.slice(func.c.tree.strings); + const str_bytes = range.slice(func.c.tree.strings, .@"1"); const section = try func.c.obj.getSection(.strings); const start = section.items.len; try section.appendSlice(str_bytes); diff --git a/test/cases/wide strings.c b/test/cases/wide strings.c index c483fe32..8495674f 100644 --- a/test/cases/wide strings.c +++ b/test/cases/wide strings.c @@ -49,7 +49,12 @@ uint16_t L[] = u"\xFFFFF"; uint8_t M[] = u8"\xFFF"; +_Static_assert(1 == 2, u"😬\U0001f62c"); +_Static_assert(1 == 2, U"😬\U0001f62c"); + #define EXPECTED_ERRORS "wide strings.c:25:21: error: unsupported string literal concatenation" \ "wide strings.c:48:26: error: escape sequence out of range" \ "wide strings.c:50:24: error: escape sequence out of range" \ + "wide strings.c:52:1: error: static assertion failed \"😬😬\"" \ + "wide strings.c:53:1: error: static assertion failed \"😬😬\"" \ From 4e73f225b611993086a61eae65f2f95116ad6ddd Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 19 Oct 2023 09:02:01 -0700 Subject: [PATCH 09/26] Attribute: fix crash when attribute expects a string and doesn't get it --- src/Attribute.zig | 2 +- test/cases/attributes.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Attribute.zig b/src/Attribute.zig index f45067df..23e038d4 100644 --- a/src/Attribute.zig +++ b/src/Attribute.zig @@ -60,7 +60,7 @@ pub const ArgumentType = enum { fn fromType(comptime T: type) ArgumentType { return switch (T) { - []const u8 => .string, + Value.ByteRange => .string, Identifier => .identifier, u32 => .int, Alignment => .alignment, diff --git a/test/cases/attributes.c b/test/cases/attributes.c index 73b59567..2363b8eb 100644 --- a/test/cases/attributes.c +++ b/test/cases/attributes.c @@ -107,6 +107,8 @@ typedef struct { __attribute__((aligned(32))) char aligned_arr[] = {1, 2, 3}; _Static_assert(sizeof(aligned_arr) == 3, ""); +__attribute__((section(1))) int Z; + __attribute__(()) // test attribute at eof #define TESTS_SKIPPED 1 @@ -119,4 +121,5 @@ __attribute__(()) // test attribute at eof "attributes.c:36:5: error: fallthrough annotation does not directly precede switch label" \ "attributes.c:40:20: error: 'noreturn' attribute cannot be applied to a statement" \ "attributes.c:76:6: error: cannot call non function type 'int'" \ - "attributes.c:110:18: error: expected identifier or '('" \ + "attributes.c:110:24: error: Attribute argument is invalid, expected a string but got an integer constant" \ + "attributes.c:112:18: error: expected identifier or '('" \ From ed04af42596345736ad985d29a94f5df1a224302 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 19 Oct 2023 09:10:03 -0700 Subject: [PATCH 10/26] Parser: rename retainedString to better describe its purpose --- src/Parser.zig | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Parser.zig b/src/Parser.zig index 7c1e8678..423886ba 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -468,7 +468,7 @@ fn checkDeprecatedUnavailable(p: *Parser, ty: Type, usage_tok: TokenIndex, decl_ defer p.strings.items.len = strings_top; const w = p.strings.writer(); - const msg_str = p.retainedString(@"error".msg); + const msg_str = p.attributeMessageString(@"error".msg); try w.print("call to '{s}' declared with attribute error: {s}", .{ p.tokSlice(@"error".__name_tok), msg_str }); const str = try p.comp.diag.arena.allocator().dupe(u8, p.strings.items[strings_top..]); try p.errStr(.error_attribute, usage_tok, str); @@ -478,7 +478,7 @@ fn checkDeprecatedUnavailable(p: *Parser, ty: Type, usage_tok: TokenIndex, decl_ defer p.strings.items.len = strings_top; const w = p.strings.writer(); - const msg_str = p.retainedString(warning.msg); + const msg_str = p.attributeMessageString(warning.msg); try w.print("call to '{s}' declared with attribute warning: {s}", .{ p.tokSlice(warning.__name_tok), msg_str }); const str = try p.comp.diag.arena.allocator().dupe(u8, p.strings.items[strings_top..]); try p.errStr(.warning_attribute, usage_tok, str); @@ -493,8 +493,9 @@ fn checkDeprecatedUnavailable(p: *Parser, ty: Type, usage_tok: TokenIndex, decl_ } } +/// Assumes that the specified range was created by an ordinary or `u8` string literal /// Returned slice is invalidated if additional strings are added to p.retained_strings -fn retainedString(p: *Parser, range: Value.ByteRange) []const u8 { +fn attributeMessageString(p: *Parser, range: Value.ByteRange) []const u8 { return range.slice(p.retained_strings.items, .@"1"); } @@ -511,7 +512,7 @@ fn errDeprecated(p: *Parser, tag: Diagnostics.Tag, tok_i: TokenIndex, msg: ?Valu }; try w.writeAll(reason); if (msg) |m| { - const str = p.retainedString(m); + const str = p.attributeMessageString(m); try w.print(": {s}", .{str}); } const str = try p.comp.diag.arena.allocator().dupe(u8, p.strings.items[strings_top..]); From fc5bd3a5457dece7b186360e6608aef7f4519c1a Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 19 Oct 2023 09:28:03 -0700 Subject: [PATCH 11/26] CharLiteral: ensure that errors aren't dropped if we have a lot of warnings --- src/CharLiteral.zig | 6 +++++- test/cases/wide strings.c | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/CharLiteral.zig b/src/CharLiteral.zig index c70cc3fd..d937a03f 100644 --- a/src/CharLiteral.zig +++ b/src/CharLiteral.zig @@ -175,7 +175,11 @@ pub const Parser = struct { pub fn err(self: *Parser, tag: Diagnostics.Tag, extra: Diagnostics.Message.Extra) void { if (self.errored) return; self.errored = true; - self.errors.append(.{ .tag = tag, .extra = extra }) catch {}; + const diagnostic = .{ .tag = tag, .extra = extra }; + self.errors.append(diagnostic) catch { + _ = self.errors.pop(); + self.errors.append(diagnostic) catch unreachable; + }; } pub fn warn(self: *Parser, tag: Diagnostics.Tag, extra: Diagnostics.Message.Extra) void { diff --git a/test/cases/wide strings.c b/test/cases/wide strings.c index 8495674f..782214e1 100644 --- a/test/cases/wide strings.c +++ b/test/cases/wide strings.c @@ -52,9 +52,12 @@ uint8_t M[] = u8"\xFFF"; _Static_assert(1 == 2, u"😬\U0001f62c"); _Static_assert(1 == 2, U"😬\U0001f62c"); +char foo[] = "\u0020\u0020\u0020\u0020\xFFFFFFFF"; + #define EXPECTED_ERRORS "wide strings.c:25:21: error: unsupported string literal concatenation" \ "wide strings.c:48:26: error: escape sequence out of range" \ "wide strings.c:50:24: error: escape sequence out of range" \ "wide strings.c:52:1: error: static assertion failed \"😬😬\"" \ "wide strings.c:53:1: error: static assertion failed \"😬😬\"" \ + "wide strings.c:55:50: error: escape sequence out of range" \ From 17b3f51936e556472bcbbd9d94b1199765bf1df9 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 19 Oct 2023 09:33:32 -0700 Subject: [PATCH 12/26] Parser: use resize API for retained_strings --- src/Parser.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Parser.zig b/src/Parser.zig index 423886ba..870e3ea4 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -7584,7 +7584,7 @@ fn stringLiteral(p: *Parser) Error!Result { const dest_len = if (capacity_slice.len % 2 == 0) capacity_slice.len else capacity_slice.len - 1; var dest = std.mem.bytesAsSlice(u16, capacity_slice[0..dest_len]); const words_written = std.unicode.utf8ToUtf16Le(dest, view.bytes) catch unreachable; - p.retained_strings.items.len += words_written * 2; + p.retained_strings.resize(p.retained_strings.items.len + words_written * 2) catch unreachable; }, .@"4" => { var it = view.iterator(); From a6fb754967dbba648b8432e26ff46d0d7fa794c1 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 19 Oct 2023 09:40:35 -0700 Subject: [PATCH 13/26] CharLiteral: add a function for string literal element type --- src/CharLiteral.zig | 8 ++++++++ src/Parser.zig | 8 +------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/CharLiteral.zig b/src/CharLiteral.zig index d937a03f..faa19a20 100644 --- a/src/CharLiteral.zig +++ b/src/CharLiteral.zig @@ -92,6 +92,14 @@ pub const StringKind = enum { inline else => |size| @alignOf(size.Type()), }; } + + pub fn elementType(kind: StringKind, comp: *const Compilation) Type { + return switch (kind) { + .char => .{ .specifier = .char }, + .utf_8 => if (comp.langopts.hasChar8_T()) .{ .specifier = .uchar } else .{ .specifier = .char }, + else => kind.charKind().charLiteralType(comp), + }; + } }; pub const CharKind = enum { diff --git a/src/Parser.zig b/src/Parser.zig index 870e3ea4..29e8f899 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -7604,13 +7604,7 @@ fn stringLiteral(p: *Parser) Error!Result { const slice = p.retained_strings.items[retain_start..]; const arr_ty = try p.arena.create(Type.Array); - const specifier: Type.Specifier = switch (string_kind) { - .char => .char, - .utf_8 => if (p.comp.langopts.hasChar8_T()) .uchar else .char, - else => string_kind.charKind().charLiteralType(p.comp).specifier, - }; - - arr_ty.* = .{ .elem = .{ .specifier = specifier }, .len = @divExact(slice.len, @intFromEnum(char_width)) }; + arr_ty.* = .{ .elem = string_kind.elementType(p.comp), .len = @divExact(slice.len, @intFromEnum(char_width)) }; var res: Result = .{ .ty = .{ .specifier = .array, From 38d51b5cf7bcaa44791d7fbe2bc335aecace9ae5 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 19 Oct 2023 10:06:19 -0700 Subject: [PATCH 14/26] Parser: improve string error message locations --- src/CharLiteral.zig | 15 ++++++++++++--- src/Parser.zig | 18 +++++++++--------- test/cases/strings.c | 16 ++++++++-------- test/cases/wide character constants.c | 6 +++--- test/cases/wide strings.c | 6 +++--- 5 files changed, 35 insertions(+), 26 deletions(-) diff --git a/src/CharLiteral.zig b/src/CharLiteral.zig index faa19a20..479143f6 100644 --- a/src/CharLiteral.zig +++ b/src/CharLiteral.zig @@ -180,6 +180,14 @@ pub const Parser = struct { }; } + fn prefixLen(self: *const Parser) usize { + return switch (self.kind) { + .char => 0, + .utf_8 => 2, + .wide, .utf_16, .utf_32 => 1, + }; + } + pub fn err(self: *Parser, tag: Diagnostics.Tag, extra: Diagnostics.Message.Extra) void { if (self.errored) return; self.errored = true; @@ -252,7 +260,7 @@ pub const Parser = struct { self.i += expected_len; if (overflowed) { - self.err(.escape_sequence_overflow, .{ .unsigned = start }); + self.err(.escape_sequence_overflow, .{ .unsigned = start + self.prefixLen() }); return null; } @@ -262,7 +270,7 @@ pub const Parser = struct { } if (val > std.math.maxInt(u21) or !std.unicode.utf8ValidCodepoint(@intCast(val))) { - self.err(.invalid_universal_character, .{ .unsigned = start }); + self.err(.invalid_universal_character, .{ .unsigned = start + self.prefixLen() }); return null; } @@ -332,6 +340,7 @@ pub const Parser = struct { var val: u32 = 0; var count: usize = 0; var overflowed = false; + const start = self.i; defer self.i += count; const slice = switch (base) { .octal => self.literal[self.i..@min(self.literal.len, self.i + 3)], // max 3 chars @@ -348,7 +357,7 @@ pub const Parser = struct { count += 1; } if (overflowed or val > self.kind.maxInt(self.comp)) { - self.err(.escape_sequence_overflow, .{ .unsigned = 0 }); + self.err(.escape_sequence_overflow, .{ .unsigned = start + self.prefixLen() }); return 0; } if (count == 0) { diff --git a/src/Parser.zig b/src/Parser.zig index 29e8f899..6965c7be 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -7520,25 +7520,25 @@ fn makePredefinedIdentifier(p: *Parser, start: u32) !Result { } fn stringLiteral(p: *Parser) Error!Result { - var start = p.tok_i; - var string_kind = CharLiteral.StringKind.classify(p.tok_ids[start]).?; - p.tok_i += 1; - while (true) : (p.tok_i += 1) { - const next = CharLiteral.StringKind.classify(p.tok_ids[p.tok_i]) orelse break; + var string_end = p.tok_i; + var string_kind: CharLiteral.StringKind = .char; + while (CharLiteral.StringKind.classify(p.tok_ids[string_end])) |next| : (string_end += 1) { string_kind = string_kind.concat(next) catch { - try p.err(.unsupported_str_cat); + try p.errTok(.unsupported_str_cat, string_end); while (p.tok_ids[p.tok_i].isStringLiteral()) : (p.tok_i += 1) {} return error.ParsingFailed; }; } + assert(string_end > p.tok_i); + const char_width = string_kind.charUnitSize(p.comp); const retain_start = mem.alignForward(usize, p.retained_strings.items.len, string_kind.internalStorageAlignment(p.comp)); try p.retained_strings.resize(retain_start); - while (start < p.tok_i) : (start += 1) { - const this_kind = CharLiteral.StringKind.classify(p.tok_ids[start]).?; - const slice = this_kind.contentSlice(p.tokSlice(start)); + while (p.tok_i < string_end) : (p.tok_i += 1) { + const this_kind = CharLiteral.StringKind.classify(p.tok_ids[p.tok_i]).?; + const slice = this_kind.contentSlice(p.tokSlice(p.tok_i)); var char_literal_parser = CharLiteral.Parser.init(slice, this_kind.charKind(), 0x10ffff, p.comp); try p.retained_strings.ensureUnusedCapacity((slice.len + 1) * @intFromEnum(char_width)); // +1 for null terminator diff --git a/test/cases/strings.c b/test/cases/strings.c index 9bcda7e6..148c5e1b 100644 --- a/test/cases/strings.c +++ b/test/cases/strings.c @@ -1,5 +1,5 @@ _Static_assert(1, "foo" "\n" "bar"); -_Static_assert(1, "foo" "\x606262 "); +_Static_assert(1, "foo" "abc\x606262 "); _Static_assert(1, "\000062"); _Static_assert(1, "\U00110000"); _Static_assert(1, "\u0062"); @@ -15,12 +15,12 @@ _Static_assert(1, "\u0060"); _Static_assert(1, "aaァ\e[1;"); #pragma GCC diagnostic pop -#define EXPECTED_ERRORS "strings.c:2:36: error: escape sequence out of range" \ - "strings.c:4:31: error: invalid universal character" \ - "strings.c:5:27: error: character 'b' cannot be specified by a universal character name" \ +#define EXPECTED_ERRORS "strings.c:2:29: error: escape sequence out of range" \ + "strings.c:4:19: error: invalid universal character" \ + "strings.c:5:19: error: character 'b' cannot be specified by a universal character name" \ "strings.c:7:9: warning: multi-character character constant [-Wmultichar]" \ "strings.c:7:9: warning: character constant too long for its type" \ - "strings.c:9:27: error: invalid universal character" \ - "strings.c:10:31: error: invalid universal character" \ - "strings.c:11:31: error: invalid universal character" \ - "strings.c:15:4: warning: use of non-standard escape character '\\e' [-Wpedantic]" \ + "strings.c:9:19: error: invalid universal character" \ + "strings.c:10:19: error: invalid universal character" \ + "strings.c:11:19: error: invalid universal character" \ + "strings.c:15:23: warning: use of non-standard escape character '\\e' [-Wpedantic]" \ diff --git a/test/cases/wide character constants.c b/test/cases/wide character constants.c index 99d512d4..c174fbef 100644 --- a/test/cases/wide character constants.c +++ b/test/cases/wide character constants.c @@ -55,13 +55,13 @@ int Z = 'ABC\D'; "wide character constants.c:10:16: error: wide character literals may not contain multiple characters" \ "wide character constants.c:11:16: error: Unicode character literals may not contain multiple characters" \ "wide character constants.c:14:16: warning: multi-character character constant [-Wfour-char-constants]" \ - "wide character constants.c:20:19: error: escape sequence out of range" \ + "wide character constants.c:20:21: error: escape sequence out of range" \ "wide character constants.c:22:19: error: character too large for enclosing character literal type" \ - "wide character constants.c:25:19: error: invalid universal character" \ + "wide character constants.c:25:20: error: invalid universal character" \ "wide character constants.c:26:19: error: character too large for enclosing character literal type" \ "wide character constants.c:27:19: error: Unicode character literals may not contain multiple characters" \ "wide character constants.c:28:19: error: Unicode character literals may not contain multiple characters" \ - "wide character constants.c:29:19: error: escape sequence out of range" \ + "wide character constants.c:29:20: error: escape sequence out of range" \ "wide character constants.c:33:9: error: Unicode character literals may not contain multiple characters" \ "wide character constants.c:35:9: error: character too large for enclosing character literal type" \ "wide character constants.c:36:9: error: character 'A' cannot be specified by a universal character name" \ diff --git a/test/cases/wide strings.c b/test/cases/wide strings.c index 782214e1..b8bb471b 100644 --- a/test/cases/wide strings.c +++ b/test/cases/wide strings.c @@ -55,9 +55,9 @@ _Static_assert(1 == 2, U"😬\U0001f62c"); char foo[] = "\u0020\u0020\u0020\u0020\xFFFFFFFF"; #define EXPECTED_ERRORS "wide strings.c:25:21: error: unsupported string literal concatenation" \ - "wide strings.c:48:26: error: escape sequence out of range" \ - "wide strings.c:50:24: error: escape sequence out of range" \ + "wide strings.c:48:18: error: escape sequence out of range" \ + "wide strings.c:50:18: error: escape sequence out of range" \ "wide strings.c:52:1: error: static assertion failed \"😬😬\"" \ "wide strings.c:53:1: error: static assertion failed \"😬😬\"" \ - "wide strings.c:55:50: error: escape sequence out of range" \ + "wide strings.c:55:39: error: escape sequence out of range" \ From bcb58ade5a6d75256926fc346fc11f449d903d9f Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 19 Oct 2023 17:00:52 -0700 Subject: [PATCH 15/26] tests: Add a few more wide string tests --- test/cases/wide strings.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/cases/wide strings.c b/test/cases/wide strings.c index b8bb471b..74fc1fb7 100644 --- a/test/cases/wide strings.c +++ b/test/cases/wide strings.c @@ -54,6 +54,16 @@ _Static_assert(1 == 2, U"😬\U0001f62c"); char foo[] = "\u0020\u0020\u0020\u0020\xFFFFFFFF"; +wchar_t N[] = "word" L"" "a"; +_Static_assert(sizeof(N) == sizeof(wchar_t) * 6); +uint32_t O[] = "word" U"" "a"; +_Static_assert(sizeof(O) == sizeof(uint32_t) * 6); +uint16_t P[] = "word" u"" "a"; +_Static_assert(sizeof(P) == sizeof(uint16_t) * 6); + +uint32_t Q[] = U"abc\ndef\xFFghi"; +_Static_assert(sizeof(Q) == sizeof(uint32_t) * 12); + #define EXPECTED_ERRORS "wide strings.c:25:21: error: unsupported string literal concatenation" \ "wide strings.c:48:18: error: escape sequence out of range" \ "wide strings.c:50:18: error: escape sequence out of range" \ From 6aed87f834a18758d759bda24993cf41ff610e80 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 19 Oct 2023 17:05:56 -0700 Subject: [PATCH 16/26] Parser: use alignBackward to find destination slice len --- src/Parser.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Parser.zig b/src/Parser.zig index 6965c7be..a89b0ea0 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -7581,7 +7581,7 @@ fn stringLiteral(p: *Parser) Error!Result { .@"1" => p.retained_strings.appendSliceAssumeCapacity(view.bytes), .@"2" => { var capacity_slice: []align(@alignOf(u16)) u8 = @alignCast(p.retained_strings.unusedCapacitySlice()); - const dest_len = if (capacity_slice.len % 2 == 0) capacity_slice.len else capacity_slice.len - 1; + const dest_len = std.mem.alignBackward(usize, capacity_slice.len, 2); var dest = std.mem.bytesAsSlice(u16, capacity_slice[0..dest_len]); const words_written = std.unicode.utf8ToUtf16Le(dest, view.bytes) catch unreachable; p.retained_strings.resize(p.retained_strings.items.len + words_written * 2) catch unreachable; From b8c3aaa4953460677df2dcdeeeabb53d31e86e0d Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 19 Oct 2023 22:32:55 -0700 Subject: [PATCH 17/26] Compilation: define __STDC_UTF_16__ and __STDC_UTF_32__ --- src/Compilation.zig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Compilation.zig b/src/Compilation.zig index 0f053c4b..49846b8a 100644 --- a/src/Compilation.zig +++ b/src/Compilation.zig @@ -238,6 +238,8 @@ pub fn generateBuiltinMacros(comp: *Compilation) !Source { \\#define __STDC_NO_COMPLEX__ 1 \\#define __STDC_NO_THREADS__ 1 \\#define __STDC_NO_VLA__ 1 + \\#define __STDC_UTF_16__ 1 + \\#define __STDC_UTF_32__ 1 \\ ); if (comp.langopts.standard.StdCVersionMacro()) |stdc_version| { From f3be68ed491364d4d49ea3b547023bba72c1c8ac Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Fri, 20 Oct 2023 11:22:33 -0700 Subject: [PATCH 18/26] CharLiteral: unify StringKind and CharKind Add a test to ensure string literals can't be concatenated with char literals. --- src/CharLiteral.zig | 161 ++++++++++++++++---------------------- src/Parser.zig | 10 +-- test/cases/wide strings.c | 6 ++ 3 files changed, 80 insertions(+), 97 deletions(-) diff --git a/src/CharLiteral.zig b/src/CharLiteral.zig index 479143f6..4b301c4d 100644 --- a/src/CharLiteral.zig +++ b/src/CharLiteral.zig @@ -21,110 +21,48 @@ const CharDiagnostic = struct { extra: Diagnostics.Message.Extra, }; -/// This assumes StringKind and CharKind have the same enum tags and C-source prefix -fn contentSliceInternal(comptime T: type, kind: T, delimited: []const u8) []const u8 { - const end = delimited.len - 1; // remove trailing quote - return switch (kind) { - .char => delimited[1..end], - .wide => delimited[2..end], - .utf_8 => delimited[3..end], - .utf_16 => delimited[2..end], - .utf_32 => delimited[2..end], - }; -} - -pub const StringKind = enum { +pub const Kind = enum { char, wide, utf_8, utf_16, utf_32, - pub fn classify(id: Tokenizer.Token.Id) ?StringKind { - return switch (id) { - .string_literal => .char, - .string_literal_utf_8 => .utf_8, - .string_literal_wide => .wide, - .string_literal_utf_16 => .utf_16, - .string_literal_utf_32 => .utf_32, - else => null, + pub fn classify(id: Tokenizer.Token.Id, context: enum { string_literal, char_literal }) ?Kind { + return switch (context) { + .string_literal => switch (id) { + .string_literal => .char, + .string_literal_utf_8 => .utf_8, + .string_literal_wide => .wide, + .string_literal_utf_16 => .utf_16, + .string_literal_utf_32 => .utf_32, + else => null, + }, + .char_literal => switch (id) { + .char_literal => .char, + .char_literal_utf_8 => .utf_8, + .char_literal_wide => .wide, + .char_literal_utf_16 => .utf_16, + .char_literal_utf_32 => .utf_32, + else => null, + }, }; } - pub fn concat(self: StringKind, other: StringKind) !StringKind { + /// Should only be called for string literals. Determines the result kind of two adjacent string + /// literals + pub fn concat(self: Kind, other: Kind) !Kind { if (self == other) return self; // can always concat with own kind if (self == .char) return other; // char + X -> X if (other == .char) return self; // X + char -> X return error.CannotConcat; } - pub fn charKind(self: StringKind) CharKind { - return switch (self) { - .char => .char, - .wide => .wide, - .utf_8 => .utf_8, - .utf_16 => .utf_16, - .utf_32 => .utf_32, - }; - } - - pub fn contentSlice(kind: StringKind, delimited: []const u8) []const u8 { - return contentSliceInternal(StringKind, kind, delimited); - } - - pub fn charUnitSize(kind: StringKind, comp: *const Compilation) Compilation.CharUnitSize { - return switch (kind) { - .char => .@"1", - .wide => switch (comp.types.wchar.sizeof(comp).?) { - 2 => .@"2", - 4 => .@"4", - else => unreachable, - }, - .utf_8 => .@"1", - .utf_16 => .@"2", - .utf_32 => .@"4", - }; - } - - /// Required alignment within aro (on compiler host) for writing to retained_strings - pub fn internalStorageAlignment(kind: StringKind, comp: *const Compilation) usize { - return switch (kind.charUnitSize(comp)) { - inline else => |size| @alignOf(size.Type()), - }; - } - - pub fn elementType(kind: StringKind, comp: *const Compilation) Type { - return switch (kind) { - .char => .{ .specifier = .char }, - .utf_8 => if (comp.langopts.hasChar8_T()) .{ .specifier = .uchar } else .{ .specifier = .char }, - else => kind.charKind().charLiteralType(comp), - }; - } -}; - -pub const CharKind = enum { - char, - wide, - utf_8, - utf_16, - utf_32, - - pub fn classify(id: Tokenizer.Token.Id) CharKind { - return switch (id) { - .char_literal => .char, - .char_literal_utf_8 => .utf_8, - .char_literal_wide => .wide, - .char_literal_utf_16 => .utf_16, - .char_literal_utf_32 => .utf_32, - else => unreachable, - }; - } - /// Largest unicode codepoint that can be represented by this character kind /// May be smaller than the largest value that can be represented. /// For example u8 char literals may only specify 0-127 via literals or /// character escapes, but may specify up to \xFF via hex escapes. - pub fn maxCodepoint(kind: CharKind, comp: *const Compilation) u21 { + pub fn maxCodepoint(kind: Kind, comp: *const Compilation) u21 { return @intCast(switch (kind) { .char => std.math.maxInt(u7), .wide => @min(0x10FFFF, comp.types.wchar.maxInt(comp)), @@ -135,7 +73,7 @@ pub const CharKind = enum { } /// Largest integer that can be represented by this character kind - pub fn maxInt(kind: CharKind, comp: *const Compilation) u32 { + pub fn maxInt(kind: Kind, comp: *const Compilation) u32 { return @intCast(switch (kind) { .char, .utf_8 => std.math.maxInt(u8), .wide => comp.types.wchar.maxInt(comp), @@ -144,7 +82,8 @@ pub const CharKind = enum { }); } - pub fn charLiteralType(kind: CharKind, comp: *const Compilation) Type { + /// The C type of a character literal of this kind + pub fn charLiteralType(kind: Kind, comp: *const Compilation) Type { return switch (kind) { .char => Type.int, .wide => comp.types.wchar, @@ -154,24 +93,62 @@ pub const CharKind = enum { }; } - /// Return the actual contents of the string literal with leading / trailing quotes and + /// Return the actual contents of the literal with leading / trailing quotes and /// specifiers removed - pub fn contentSlice(kind: CharKind, delimited: []const u8) []const u8 { - return contentSliceInternal(CharKind, kind, delimited); + pub fn contentSlice(kind: Kind, delimited: []const u8) []const u8 { + const end = delimited.len - 1; // remove trailing quote + return switch (kind) { + .char => delimited[1..end], + .wide => delimited[2..end], + .utf_8 => delimited[3..end], + .utf_16 => delimited[2..end], + .utf_32 => delimited[2..end], + }; + } + + /// The size of a character unit for a string literal of this kind + pub fn charUnitSize(kind: Kind, comp: *const Compilation) Compilation.CharUnitSize { + return switch (kind) { + .char => .@"1", + .wide => switch (comp.types.wchar.sizeof(comp).?) { + 2 => .@"2", + 4 => .@"4", + else => unreachable, + }, + .utf_8 => .@"1", + .utf_16 => .@"2", + .utf_32 => .@"4", + }; + } + + /// Required alignment within aro (on compiler host) for writing to retained_strings + pub fn internalStorageAlignment(kind: Kind, comp: *const Compilation) usize { + return switch (kind.charUnitSize(comp)) { + inline else => |size| @alignOf(size.Type()), + }; + } + + /// The C type of an element of a string literal of this kind + pub fn elementType(kind: Kind, comp: *const Compilation) Type { + return switch (kind) { + .char => .{ .specifier = .char }, + .utf_8 => if (comp.langopts.hasChar8_T()) .{ .specifier = .uchar } else .{ .specifier = .char }, + else => kind.charLiteralType(comp), + }; } }; pub const Parser = struct { literal: []const u8, i: usize = 0, - kind: CharKind, + kind: Kind, max_codepoint: u21, /// We only want to issue a max of 1 error per char literal errored: bool = false, errors: std.BoundedArray(CharDiagnostic, 4) = .{}, comp: *const Compilation, - pub fn init(literal: []const u8, kind: CharKind, max_codepoint: u21, comp: *const Compilation) Parser { + pub fn init(literal: []const u8, kind: Kind, max_codepoint: u21, comp: *const Compilation) Parser { return .{ .literal = literal, .comp = comp, diff --git a/src/Parser.zig b/src/Parser.zig index a89b0ea0..2c83d23b 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -7521,8 +7521,8 @@ fn makePredefinedIdentifier(p: *Parser, start: u32) !Result { fn stringLiteral(p: *Parser) Error!Result { var string_end = p.tok_i; - var string_kind: CharLiteral.StringKind = .char; - while (CharLiteral.StringKind.classify(p.tok_ids[string_end])) |next| : (string_end += 1) { + var string_kind: CharLiteral.Kind = .char; + while (CharLiteral.Kind.classify(p.tok_ids[string_end], .string_literal)) |next| : (string_end += 1) { string_kind = string_kind.concat(next) catch { try p.errTok(.unsupported_str_cat, string_end); while (p.tok_ids[p.tok_i].isStringLiteral()) : (p.tok_i += 1) {} @@ -7537,9 +7537,9 @@ fn stringLiteral(p: *Parser) Error!Result { try p.retained_strings.resize(retain_start); while (p.tok_i < string_end) : (p.tok_i += 1) { - const this_kind = CharLiteral.StringKind.classify(p.tok_ids[p.tok_i]).?; + const this_kind = CharLiteral.Kind.classify(p.tok_ids[p.tok_i], .string_literal).?; const slice = this_kind.contentSlice(p.tokSlice(p.tok_i)); - var char_literal_parser = CharLiteral.Parser.init(slice, this_kind.charKind(), 0x10ffff, p.comp); + var char_literal_parser = CharLiteral.Parser.init(slice, this_kind, 0x10ffff, p.comp); try p.retained_strings.ensureUnusedCapacity((slice.len + 1) * @intFromEnum(char_width)); // +1 for null terminator while (char_literal_parser.next()) |item| switch (item) { @@ -7620,7 +7620,7 @@ fn stringLiteral(p: *Parser) Error!Result { fn charLiteral(p: *Parser) Error!Result { defer p.tok_i += 1; const tok_id = p.tok_ids[p.tok_i]; - const char_kind = CharLiteral.CharKind.classify(tok_id); + const char_kind = CharLiteral.Kind.classify(tok_id, .char_literal).?; var val: u32 = 0; const slice = char_kind.contentSlice(p.tokSlice(p.tok_i)); diff --git a/test/cases/wide strings.c b/test/cases/wide strings.c index 74fc1fb7..2e37b07d 100644 --- a/test/cases/wide strings.c +++ b/test/cases/wide strings.c @@ -64,10 +64,16 @@ _Static_assert(sizeof(P) == sizeof(uint16_t) * 6); uint32_t Q[] = U"abc\ndef\xFFghi"; _Static_assert(sizeof(Q) == sizeof(uint32_t) * 12); +uint32_t R[] = U"a" U'b'; +uint32_t S[] = U'a'; +uint32_t T[] = { U'a', U'b'}; + #define EXPECTED_ERRORS "wide strings.c:25:21: error: unsupported string literal concatenation" \ "wide strings.c:48:18: error: escape sequence out of range" \ "wide strings.c:50:18: error: escape sequence out of range" \ "wide strings.c:52:1: error: static assertion failed \"😬😬\"" \ "wide strings.c:53:1: error: static assertion failed \"😬😬\"" \ "wide strings.c:55:39: error: escape sequence out of range" \ + "wide strings.c:67:21: error: expected ';', found 'a character literal'" \ + "wide strings.c:68:16: error: array initializer must be an initializer list or wide string literal" \ From 257f048926d819409dd94c73faea98e117b70875 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Fri, 20 Oct 2023 11:28:45 -0700 Subject: [PATCH 19/26] CharLiteral: rename to TextLiteral This better reflects that it now handles character literals and string literals --- src/Parser.zig | 14 +++++++------- src/{CharLiteral.zig => TextLiteral.zig} | 2 ++ 2 files changed, 9 insertions(+), 7 deletions(-) rename src/{CharLiteral.zig => TextLiteral.zig} (99%) diff --git a/src/Parser.zig b/src/Parser.zig index 2c83d23b..6aa23c2e 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -17,7 +17,7 @@ const NodeList = std.ArrayList(NodeIndex); const InitList = @import("InitList.zig"); const Attribute = @import("Attribute.zig"); const CharInfo = @import("CharInfo.zig"); -const CharLiteral = @import("CharLiteral.zig"); +const TextLiteral = @import("TextLiteral.zig"); const Value = @import("Value.zig"); const SymbolStack = @import("SymbolStack.zig"); const Symbol = SymbolStack.Symbol; @@ -7521,8 +7521,8 @@ fn makePredefinedIdentifier(p: *Parser, start: u32) !Result { fn stringLiteral(p: *Parser) Error!Result { var string_end = p.tok_i; - var string_kind: CharLiteral.Kind = .char; - while (CharLiteral.Kind.classify(p.tok_ids[string_end], .string_literal)) |next| : (string_end += 1) { + var string_kind: TextLiteral.Kind = .char; + while (TextLiteral.Kind.classify(p.tok_ids[string_end], .string_literal)) |next| : (string_end += 1) { string_kind = string_kind.concat(next) catch { try p.errTok(.unsupported_str_cat, string_end); while (p.tok_ids[p.tok_i].isStringLiteral()) : (p.tok_i += 1) {} @@ -7537,9 +7537,9 @@ fn stringLiteral(p: *Parser) Error!Result { try p.retained_strings.resize(retain_start); while (p.tok_i < string_end) : (p.tok_i += 1) { - const this_kind = CharLiteral.Kind.classify(p.tok_ids[p.tok_i], .string_literal).?; + const this_kind = TextLiteral.Kind.classify(p.tok_ids[p.tok_i], .string_literal).?; const slice = this_kind.contentSlice(p.tokSlice(p.tok_i)); - var char_literal_parser = CharLiteral.Parser.init(slice, this_kind, 0x10ffff, p.comp); + var char_literal_parser = TextLiteral.Parser.init(slice, this_kind, 0x10ffff, p.comp); try p.retained_strings.ensureUnusedCapacity((slice.len + 1) * @intFromEnum(char_width)); // +1 for null terminator while (char_literal_parser.next()) |item| switch (item) { @@ -7620,7 +7620,7 @@ fn stringLiteral(p: *Parser) Error!Result { fn charLiteral(p: *Parser) Error!Result { defer p.tok_i += 1; const tok_id = p.tok_ids[p.tok_i]; - const char_kind = CharLiteral.Kind.classify(tok_id, .char_literal).?; + const char_kind = TextLiteral.Kind.classify(tok_id, .char_literal).?; var val: u32 = 0; const slice = char_kind.contentSlice(p.tokSlice(p.tok_i)); @@ -7630,7 +7630,7 @@ fn charLiteral(p: *Parser) Error!Result { val = slice[0]; } else { const max_codepoint = char_kind.maxCodepoint(p.comp); - var char_literal_parser = CharLiteral.Parser.init(slice, char_kind, max_codepoint, p.comp); + var char_literal_parser = TextLiteral.Parser.init(slice, char_kind, max_codepoint, p.comp); const max_chars_expected = 4; var stack_fallback = std.heap.stackFallback(max_chars_expected * @sizeOf(u32), p.comp.gpa); diff --git a/src/CharLiteral.zig b/src/TextLiteral.zig similarity index 99% rename from src/CharLiteral.zig rename to src/TextLiteral.zig index 4b301c4d..679488c7 100644 --- a/src/CharLiteral.zig +++ b/src/TextLiteral.zig @@ -1,3 +1,5 @@ +//! Parsing and classification of string and character literals + const std = @import("std"); const Compilation = @import("Compilation.zig"); const Type = @import("Type.zig"); From 23f0096ba3086af7753772918e7c6665bea8e630 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Fri, 20 Oct 2023 15:37:53 -0700 Subject: [PATCH 20/26] Tokenizer: add token id for unterminated string literals --- src/Diagnostics.zig | 10 ++++++++++ src/Parser.zig | 9 ++++++++- src/Preprocessor.zig | 9 +++++++++ src/TextLiteral.zig | 11 +++++++++++ src/Tokenizer.zig | 10 ++++++++-- test/cases/stringify invalid.c | 3 ++- test/cases/unterminated string literal.c | 11 +++++++++++ 7 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 test/cases/unterminated string literal.c diff --git a/src/Diagnostics.zig b/src/Diagnostics.zig index 52e096f0..2e4e549f 100644 --- a/src/Diagnostics.zig +++ b/src/Diagnostics.zig @@ -179,6 +179,7 @@ pub const Options = packed struct { @"invalid-source-encoding": Kind = .default, @"four-char-constants": Kind = .default, @"unknown-escape-sequence": Kind = .default, + @"invalid-pp-token": Kind = .default, }; const messages = struct { @@ -2515,6 +2516,15 @@ const messages = struct { const kind = .@"error"; const extra = .str; }; + pub const unterminated_string_literal_warning = struct { + const msg = "missing terminating '\"' character"; + const kind = .warning; + const opt = "invalid-pp-token"; + }; + pub const unterminated_string_literal_error = struct { + const msg = "missing terminating '\"' character"; + const kind = .@"error"; + }; }; list: std.ArrayListUnmanaged(Message) = .{}, diff --git a/src/Parser.zig b/src/Parser.zig index 6aa23c2e..619830a1 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -1181,6 +1181,7 @@ fn staticAssert(p: *Parser) Error!bool { .string_literal_utf_8, .string_literal_utf_32, .string_literal_wide, + .unterminated_string_literal, => try p.stringLiteral(), else => { try p.err(.expected_str_literal); @@ -3950,7 +3951,7 @@ fn assembly(p: *Parser, kind: enum { global, decl_label, stmt }) Error!?NodeInde fn asmStr(p: *Parser) Error!Result { var i = p.tok_i; while (true) : (i += 1) switch (p.tok_ids[i]) { - .string_literal => {}, + .string_literal, .unterminated_string_literal => {}, .string_literal_utf_16, .string_literal_utf_8, .string_literal_utf_32 => { try p.errStr(.invalid_asm_str, p.tok_i, "unicode"); return error.ParsingFailed; @@ -7458,6 +7459,7 @@ fn primaryExpr(p: *Parser) Error!Result { .string_literal_utf_8, .string_literal_utf_32, .string_literal_wide, + .unterminated_string_literal, => return p.stringLiteral(), .char_literal, .char_literal_utf_8, @@ -7528,6 +7530,11 @@ fn stringLiteral(p: *Parser) Error!Result { while (p.tok_ids[p.tok_i].isStringLiteral()) : (p.tok_i += 1) {} return error.ParsingFailed; }; + if (string_kind == .unterminated) { + try p.errTok(.unterminated_string_literal_error, string_end); + p.tok_i = string_end + 1; + return error.ParsingFailed; + } } assert(string_end > p.tok_i); diff --git a/src/Preprocessor.zig b/src/Preprocessor.zig index ca1b41ce..abb3b9de 100644 --- a/src/Preprocessor.zig +++ b/src/Preprocessor.zig @@ -635,6 +635,9 @@ fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!Token { if (tok.id.isMacroIdentifier() and pp.poisoned_identifiers.get(pp.tokSlice(tok)) != null) { try pp.err(tok, .poisoned_identifier); } + if (tok.id == .unterminated_string_literal) { + try pp.err(tok, .unterminated_string_literal_warning); + } // Add the token to the buffer doing any necessary expansions. start_of_line = false; try pp.expandMacro(&tokenizer, tok); @@ -2187,6 +2190,9 @@ fn define(pp: *Preprocessor, tokenizer: *Tokenizer) Error!void { need_ws = false; try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); } + if (tok.id == .unterminated_string_literal) { + try pp.err(tok, .unterminated_string_literal_warning); + } try pp.token_buf.append(tok); }, } @@ -2324,6 +2330,9 @@ fn defineFn(pp: *Preprocessor, tokenizer: *Tokenizer, macro_name: RawToken, l_pa try pp.token_buf.append(tok); }, else => { + if (tok.id == .unterminated_string_literal) { + try pp.err(tok, .unterminated_string_literal_warning); + } if (tok.id != .whitespace and need_ws) { need_ws = false; try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); diff --git a/src/TextLiteral.zig b/src/TextLiteral.zig index 679488c7..4364a1d8 100644 --- a/src/TextLiteral.zig +++ b/src/TextLiteral.zig @@ -29,6 +29,8 @@ pub const Kind = enum { utf_8, utf_16, utf_32, + /// Error kind that halts parsing + unterminated, pub fn classify(id: Tokenizer.Token.Id, context: enum { string_literal, char_literal }) ?Kind { return switch (context) { @@ -38,6 +40,7 @@ pub const Kind = enum { .string_literal_wide => .wide, .string_literal_utf_16 => .utf_16, .string_literal_utf_32 => .utf_32, + .unterminated_string_literal => .unterminated, else => null, }, .char_literal => switch (id) { @@ -54,6 +57,7 @@ pub const Kind = enum { /// Should only be called for string literals. Determines the result kind of two adjacent string /// literals pub fn concat(self: Kind, other: Kind) !Kind { + if (self == .unterminated or other == .unterminated) return .unterminated; if (self == other) return self; // can always concat with own kind if (self == .char) return other; // char + X -> X if (other == .char) return self; // X + char -> X @@ -71,6 +75,7 @@ pub const Kind = enum { .utf_8 => std.math.maxInt(u7), .utf_16 => std.math.maxInt(u16), .utf_32 => 0x10FFFF, + .unterminated => unreachable, }); } @@ -81,6 +86,7 @@ pub const Kind = enum { .wide => comp.types.wchar.maxInt(comp), .utf_16 => std.math.maxInt(u16), .utf_32 => std.math.maxInt(u32), + .unterminated => unreachable, }); } @@ -92,6 +98,7 @@ pub const Kind = enum { .utf_8 => .{ .specifier = .uchar }, .utf_16 => comp.types.uint_least16_t, .utf_32 => comp.types.uint_least32_t, + .unterminated => unreachable, }; } @@ -105,6 +112,7 @@ pub const Kind = enum { .utf_8 => delimited[3..end], .utf_16 => delimited[2..end], .utf_32 => delimited[2..end], + .unterminated => unreachable, }; } @@ -120,6 +128,7 @@ pub const Kind = enum { .utf_8 => .@"1", .utf_16 => .@"2", .utf_32 => .@"4", + .unterminated => unreachable, }; } @@ -133,6 +142,7 @@ pub const Kind = enum { /// The C type of an element of a string literal of this kind pub fn elementType(kind: Kind, comp: *const Compilation) Type { return switch (kind) { + .unterminated => unreachable, .char => .{ .specifier = .char }, .utf_8 => if (comp.langopts.hasChar8_T()) .{ .specifier = .uchar } else .{ .specifier = .char }, else => kind.charLiteralType(comp), @@ -161,6 +171,7 @@ pub const Parser = struct { fn prefixLen(self: *const Parser) usize { return switch (self.kind) { + .unterminated => unreachable, .char => 0, .utf_8 => 2, .wide, .utf_16, .utf_32 => 1, diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig index efaa60f2..d5afc58c 100644 --- a/src/Tokenizer.zig +++ b/src/Tokenizer.zig @@ -30,6 +30,10 @@ pub const Token = struct { string_literal_utf_32, string_literal_wide, + /// Any string literal with an embedded newline or EOF + /// Always a parser error + unterminated_string_literal, + // only generated by preprocessor macro_string, @@ -480,6 +484,7 @@ pub const Token = struct { .string_literal_utf_8, .string_literal_utf_32, .string_literal_wide, + .unterminated_string_literal, .char_literal, .char_literal_utf_8, .char_literal_utf_16, @@ -1221,7 +1226,7 @@ pub fn next(self: *Tokenizer) Token { break; }, '\n' => { - id = .invalid; + id = .unterminated_string_literal; break; }, '\r' => unreachable, @@ -1681,7 +1686,6 @@ pub fn next(self: *Tokenizer) Token { .u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.comp, self.buf[start..self.index]), .extended_identifier => id = .extended_identifier, .period2, - .string_literal, .path_escape, .char_literal_start, .char_literal, @@ -1691,6 +1695,8 @@ pub fn next(self: *Tokenizer) Token { .multi_line_comment_asterisk, => id = .invalid, + .string_literal => id = .unterminated_string_literal, + .whitespace => id = .whitespace, .multi_line_comment_done => id = .whitespace, diff --git a/test/cases/stringify invalid.c b/test/cases/stringify invalid.c index 84e3d172..66fb3d7f 100644 --- a/test/cases/stringify invalid.c +++ b/test/cases/stringify invalid.c @@ -1,7 +1,8 @@ // clang also reports: warning: missing terminating '"' character [-Winvalid-pp-token] #define TESTS_SKIPPED 1 -#define EXPECTED_ERRORS "stringify invalid.c:15:1: error: expected ';', found '}'" +#define EXPECTED_ERRORS "stringify invalid.c:12:20: warning: missing terminating '\"' character [-Winvalid-pp-token]" \ + "stringify invalid.c:16:1: error: expected ';', found '}'" void foo(void) { diff --git a/test/cases/unterminated string literal.c b/test/cases/unterminated string literal.c new file mode 100644 index 00000000..d5bbcc9b --- /dev/null +++ b/test/cases/unterminated string literal.c @@ -0,0 +1,11 @@ +#define EXPECTED_ERRORS "unterminated string literal.c:9:12: warning: missing terminating '\"' character [-Winvalid-pp-token]" \ + "unterminated string literal.c:10:20: warning: missing terminating '\"' character [-Winvalid-pp-token]" \ + "unterminated string literal.c:11:12: warning: missing terminating '\"' character [-Winvalid-pp-token]" \ + "unterminated string literal.c:9:12: error: missing terminating '\"' character" \ + "unterminated string literal.c:10:20: error: missing terminating '\"' character" \ + "unterminated string literal.c:11:12: error: missing terminating '\"' character" \ + + +char A[] = "hello +char B[] = "hello" "world +char C[] = " \ No newline at end of file From 4947bd052d725b03d5a607a5091773389c19f7ac Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Fri, 20 Oct 2023 16:17:26 -0700 Subject: [PATCH 21/26] Tokenizer: add ids for empty and unterminated char literals --- src/Diagnostics.zig | 18 +++++++++++++ src/Parser.zig | 15 ++++++++++- src/Preprocessor.zig | 37 ++++++++++++++++++++------ src/Tokenizer.zig | 30 ++++++++++++++------- test/cases/unterminated char literal.c | 17 ++++++++++++ 5 files changed, 99 insertions(+), 18 deletions(-) create mode 100644 test/cases/unterminated char literal.c diff --git a/src/Diagnostics.zig b/src/Diagnostics.zig index 2e4e549f..4623a341 100644 --- a/src/Diagnostics.zig +++ b/src/Diagnostics.zig @@ -2525,6 +2525,24 @@ const messages = struct { const msg = "missing terminating '\"' character"; const kind = .@"error"; }; + pub const empty_char_literal_warning = struct { + const msg = "empty character constant"; + const kind = .warning; + const opt = "invalid-pp-token"; + }; + pub const empty_char_literal_error = struct { + const msg = "empty character constant"; + const kind = .@"error"; + }; + pub const unterminated_char_literal_warning = struct { + const msg = "missing terminating ' character"; + const kind = .warning; + const opt = "invalid-pp-token"; + }; + pub const unterminated_char_literal_error = struct { + const msg = "missing terminating ' character"; + const kind = .@"error"; + }; }; list: std.ArrayListUnmanaged(Message) = .{}, diff --git a/src/Parser.zig b/src/Parser.zig index 619830a1..660fbf10 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -7466,6 +7466,8 @@ fn primaryExpr(p: *Parser) Error!Result { .char_literal_utf_16, .char_literal_utf_32, .char_literal_wide, + .empty_char_literal, + .unterminated_char_literal, => return p.charLiteral(), .zero => { p.tok_i += 1; @@ -7627,7 +7629,18 @@ fn stringLiteral(p: *Parser) Error!Result { fn charLiteral(p: *Parser) Error!Result { defer p.tok_i += 1; const tok_id = p.tok_ids[p.tok_i]; - const char_kind = TextLiteral.Kind.classify(tok_id, .char_literal).?; + const char_kind = TextLiteral.Kind.classify(tok_id, .char_literal) orelse { + if (tok_id == .empty_char_literal) { + try p.err(.empty_char_literal_error); + } else if (tok_id == .unterminated_char_literal) { + try p.err(.unterminated_char_literal_error); + } else unreachable; + return .{ + .ty = Type.int, + .val = Value.int(0), + .node = try p.addNode(.{ .tag = .char_literal, .ty = Type.int, .data = undefined }), + }; + }; var val: u32 = 0; const slice = char_kind.contentSlice(p.tokSlice(p.tok_i)); diff --git a/src/Preprocessor.zig b/src/Preprocessor.zig index abb3b9de..f8688fa8 100644 --- a/src/Preprocessor.zig +++ b/src/Preprocessor.zig @@ -635,8 +635,11 @@ fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!Token { if (tok.id.isMacroIdentifier() and pp.poisoned_identifiers.get(pp.tokSlice(tok)) != null) { try pp.err(tok, .poisoned_identifier); } - if (tok.id == .unterminated_string_literal) { - try pp.err(tok, .unterminated_string_literal_warning); + switch (tok.id) { + .unterminated_string_literal => try pp.err(tok, .unterminated_string_literal_warning), + .empty_char_literal => try pp.err(tok, .empty_char_literal_warning), + .unterminated_char_literal => try pp.err(tok, .unterminated_char_literal_warning), + else => {}, } // Add the token to the buffer doing any necessary expansions. start_of_line = false; @@ -2185,14 +2188,23 @@ fn define(pp: *Preprocessor, tokenizer: *Tokenizer) Error!void { try pp.token_buf.append(tok); }, .whitespace => need_ws = true, + .unterminated_string_literal => { + try pp.err(tok, .unterminated_string_literal_warning); + try pp.token_buf.append(tok); + }, + .unterminated_char_literal => { + try pp.err(tok, .unterminated_char_literal_warning); + try pp.token_buf.append(tok); + }, + .empty_char_literal => { + try pp.err(tok, .empty_char_literal_warning); + try pp.token_buf.append(tok); + }, else => { if (tok.id != .whitespace and need_ws) { need_ws = false; try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); } - if (tok.id == .unterminated_string_literal) { - try pp.err(tok, .unterminated_string_literal_warning); - } try pp.token_buf.append(tok); }, } @@ -2329,10 +2341,19 @@ fn defineFn(pp: *Preprocessor, tokenizer: *Tokenizer, macro_name: RawToken, l_pa } try pp.token_buf.append(tok); }, + .unterminated_string_literal => { + try pp.err(tok, .unterminated_string_literal_warning); + try pp.token_buf.append(tok); + }, + .unterminated_char_literal => { + try pp.err(tok, .unterminated_char_literal_warning); + try pp.token_buf.append(tok); + }, + .empty_char_literal => { + try pp.err(tok, .empty_char_literal_warning); + try pp.token_buf.append(tok); + }, else => { - if (tok.id == .unterminated_string_literal) { - try pp.err(tok, .unterminated_string_literal_warning); - } if (tok.id != .whitespace and need_ws) { need_ws = false; try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig index d5afc58c..6629a2b0 100644 --- a/src/Tokenizer.zig +++ b/src/Tokenizer.zig @@ -31,7 +31,7 @@ pub const Token = struct { string_literal_wide, /// Any string literal with an embedded newline or EOF - /// Always a parser error + /// Always a parser error; by default just a warning from preprocessor unterminated_string_literal, // only generated by preprocessor @@ -44,6 +44,14 @@ pub const Token = struct { char_literal_utf_32, char_literal_wide, + /// Any character literal with nothing inside the quotes + /// Always a parser error; by default just a warning from preprocessor + empty_char_literal, + + /// Any character literal with an embedded newline or EOF + /// Always a parser error; by default just a warning from preprocessor + unterminated_char_literal, + /// Integer literal tokens generated by preprocessor. one, zero, @@ -485,6 +493,8 @@ pub const Token = struct { .string_literal_utf_32, .string_literal_wide, .unterminated_string_literal, + .unterminated_char_literal, + .empty_char_literal, .char_literal, .char_literal_utf_8, .char_literal_utf_16, @@ -1239,8 +1249,13 @@ pub fn next(self: *Tokenizer) Token { '\\' => { state = .char_escape_sequence; }, - '\'', '\n' => { - id = .invalid; + '\'' => { + id = .empty_char_literal; + self.index += 1; + break; + }, + '\n' => { + id = .unterminated_char_literal; break; }, else => { @@ -1256,7 +1271,7 @@ pub fn next(self: *Tokenizer) Token { break; }, '\n' => { - id = .invalid; + id = .unterminated_char_literal; break; }, else => {}, @@ -1687,15 +1702,12 @@ pub fn next(self: *Tokenizer) Token { .extended_identifier => id = .extended_identifier, .period2, .path_escape, - .char_literal_start, - .char_literal, - .string_escape_sequence, - .char_escape_sequence, .multi_line_comment, .multi_line_comment_asterisk, => id = .invalid, - .string_literal => id = .unterminated_string_literal, + .char_escape_sequence, .char_literal, .char_literal_start => id = .unterminated_char_literal, + .string_escape_sequence, .string_literal => id = .unterminated_string_literal, .whitespace => id = .whitespace, .multi_line_comment_done => id = .whitespace, diff --git a/test/cases/unterminated char literal.c b/test/cases/unterminated char literal.c new file mode 100644 index 00000000..951ca6e4 --- /dev/null +++ b/test/cases/unterminated char literal.c @@ -0,0 +1,17 @@ +#define A 'b +#define B '' +#define C(X) '' +#define D(X) 'A + +#define EXPECTED_ERRORS "unterminated char literal.c:1:11: warning: missing terminating ' character [-Winvalid-pp-token]" \ + "unterminated char literal.c:2:11: warning: empty character constant [-Winvalid-pp-token]" \ + "unterminated char literal.c:3:14: warning: empty character constant [-Winvalid-pp-token]" \ + "unterminated char literal.c:4:14: warning: missing terminating ' character [-Winvalid-pp-token]" \ + "unterminated char literal.c:16:10: warning: empty character constant [-Winvalid-pp-token]" \ + "unterminated char literal.c:17:10: warning: missing terminating ' character [-Winvalid-pp-token]" \ + "unterminated char literal.c:16:10: error: empty character constant" \ + "unterminated char literal.c:17:10: error: missing terminating ' character" \ + "unterminated char literal.c:17:11: error: expected ';' before end of file" \ + +char c = u8''; +char d = ' \ No newline at end of file From c80861429caed66667d9a9579ddb85d40e5206a0 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Fri, 20 Oct 2023 18:20:21 -0700 Subject: [PATCH 22/26] Preprocessor: Add an error for unterminated comment --- src/Diagnostics.zig | 4 ++++ src/Preprocessor.zig | 10 ++++++++++ src/Tokenizer.zig | 8 +++++++- test/cases/unterminated comment.c | 4 ++++ 4 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 test/cases/unterminated comment.c diff --git a/src/Diagnostics.zig b/src/Diagnostics.zig index 4623a341..64ab6ca5 100644 --- a/src/Diagnostics.zig +++ b/src/Diagnostics.zig @@ -2543,6 +2543,10 @@ const messages = struct { const msg = "missing terminating ' character"; const kind = .@"error"; }; + pub const unterminated_comment = struct { + const msg = "unterminated comment"; + const kind = .@"error"; + }; }; list: std.ArrayListUnmanaged(Message) = .{}, diff --git a/src/Preprocessor.zig b/src/Preprocessor.zig index f8688fa8..d1b47b05 100644 --- a/src/Preprocessor.zig +++ b/src/Preprocessor.zig @@ -639,6 +639,10 @@ fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!Token { .unterminated_string_literal => try pp.err(tok, .unterminated_string_literal_warning), .empty_char_literal => try pp.err(tok, .empty_char_literal_warning), .unterminated_char_literal => try pp.err(tok, .unterminated_char_literal_warning), + .unterminated_comment => { + try pp.err(tok, .unterminated_comment); + continue; + }, else => {}, } // Add the token to the buffer doing any necessary expansions. @@ -2200,6 +2204,9 @@ fn define(pp: *Preprocessor, tokenizer: *Tokenizer) Error!void { try pp.err(tok, .empty_char_literal_warning); try pp.token_buf.append(tok); }, + .unterminated_comment => { + try pp.err(tok, .unterminated_comment); + }, else => { if (tok.id != .whitespace and need_ws) { need_ws = false; @@ -2353,6 +2360,9 @@ fn defineFn(pp: *Preprocessor, tokenizer: *Tokenizer, macro_name: RawToken, l_pa try pp.err(tok, .empty_char_literal_warning); try pp.token_buf.append(tok); }, + .unterminated_comment => { + try pp.err(tok, .unterminated_comment); + }, else => { if (tok.id != .whitespace and need_ws) { need_ws = false; diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig index 6629a2b0..dcf33a7e 100644 --- a/src/Tokenizer.zig +++ b/src/Tokenizer.zig @@ -52,6 +52,9 @@ pub const Token = struct { /// Always a parser error; by default just a warning from preprocessor unterminated_char_literal, + /// `/* */` style comment without a closing `*/` before EOF + unterminated_comment, + /// Integer literal tokens generated by preprocessor. one, zero, @@ -482,6 +485,7 @@ pub const Token = struct { return switch (id) { .include_start, .include_resume, + .unterminated_comment, // Fatal error; parsing should not be attempted => unreachable, .invalid, @@ -1702,9 +1706,11 @@ pub fn next(self: *Tokenizer) Token { .extended_identifier => id = .extended_identifier, .period2, .path_escape, + => id = .invalid, + .multi_line_comment, .multi_line_comment_asterisk, - => id = .invalid, + => id = .unterminated_comment, .char_escape_sequence, .char_literal, .char_literal_start => id = .unterminated_char_literal, .string_escape_sequence, .string_literal => id = .unterminated_string_literal, diff --git a/test/cases/unterminated comment.c b/test/cases/unterminated comment.c new file mode 100644 index 00000000..91b22923 --- /dev/null +++ b/test/cases/unterminated comment.c @@ -0,0 +1,4 @@ +#define EXPECTED_ERRORS "unterminated comment.c:4:7: error: unterminated comment" \ + "unterminated comment.c:4:6: error: expected ';' before end of file" \ + +int x /** \ No newline at end of file From e60d5928641afc6318103a418832175e1f4a3ece Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Fri, 20 Oct 2023 22:30:17 -0700 Subject: [PATCH 23/26] Tokenizer: handle 2 periods at end of file --- src/Tokenizer.zig | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig index dcf33a7e..993f2c6b 100644 --- a/src/Tokenizer.zig +++ b/src/Tokenizer.zig @@ -1704,9 +1704,12 @@ pub fn next(self: *Tokenizer) Token { .start, .line_comment => {}, .u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.comp, self.buf[start..self.index]), .extended_identifier => id = .extended_identifier, - .period2, - .path_escape, - => id = .invalid, + .path_escape => id = .invalid, + + .period2 => { + self.index -= 1; + id = .period; + }, .multi_line_comment, .multi_line_comment_asterisk, From 37c8622630e47097b408a2989500fa9eb429a0f7 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Fri, 20 Oct 2023 22:43:50 -0700 Subject: [PATCH 24/26] Tokenizer: remove path_escape --- src/Preprocessor.zig | 20 +++++--------------- src/Tokenizer.zig | 9 +-------- 2 files changed, 6 insertions(+), 23 deletions(-) diff --git a/src/Preprocessor.zig b/src/Preprocessor.zig index d1b47b05..8b035633 100644 --- a/src/Preprocessor.zig +++ b/src/Preprocessor.zig @@ -1249,7 +1249,7 @@ fn reconstructIncludeString(pp: *Preprocessor, param_toks: []const Token) !?[]co } for (params) |tok| { - const str = pp.expandedSliceExtra(tok, .preserve_macro_ws, false); + const str = pp.expandedSliceExtra(tok, .preserve_macro_ws); try pp.char_buf.appendSlice(str); } @@ -1995,12 +1995,7 @@ fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroErr } } -fn expandedSliceExtra( - pp: *const Preprocessor, - tok: Token, - macro_ws_handling: enum { single_macro_ws, preserve_macro_ws }, - path_escapes: bool, -) []const u8 { +fn expandedSliceExtra(pp: *const Preprocessor, tok: Token, macro_ws_handling: enum { single_macro_ws, preserve_macro_ws }) []const u8 { if (tok.id.lexeme()) |some| { if (!tok.id.allowsDigraphs(pp.comp) and !(tok.id == .macro_ws and macro_ws_handling == .preserve_macro_ws)) return some; } @@ -2009,7 +2004,6 @@ fn expandedSliceExtra( .comp = pp.comp, .index = tok.loc.byte_offset, .source = .generated, - .path_escapes = path_escapes, }; if (tok.id == .macro_string) { while (true) : (tmp_tokenizer.index += 1) { @@ -2023,7 +2017,7 @@ fn expandedSliceExtra( /// Get expanded token source string. pub fn expandedSlice(pp: *Preprocessor, tok: Token) []const u8 { - return pp.expandedSliceExtra(tok, .single_macro_ws, false); + return pp.expandedSliceExtra(tok, .single_macro_ws); } /// Concat two tokens and add the result to pp.generated @@ -2408,8 +2402,6 @@ fn defineFn(pp: *Preprocessor, tokenizer: *Tokenizer, macro_name: RawToken, l_pa /// Handle an #embed directive fn embed(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!void { - tokenizer.path_escapes = true; - defer tokenizer.path_escapes = false; const first = tokenizer.nextNoWS(); const filename_tok = pp.findIncludeFilenameToken(first, tokenizer, .expect_nl_eof) catch |er| switch (er) { error.InvalidInclude => return, @@ -2417,7 +2409,7 @@ fn embed(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!void { }; // Check for empty filename. - const tok_slice = pp.expandedSliceExtra(filename_tok, .single_macro_ws, true); + const tok_slice = pp.expandedSliceExtra(filename_tok, .single_macro_ws); if (tok_slice.len < 3) { try pp.err(first, .empty_filename); return; @@ -2459,8 +2451,6 @@ fn embed(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!void { // Handle a #include directive. fn include(pp: *Preprocessor, tokenizer: *Tokenizer, which: Compilation.WhichInclude) MacroError!void { - tokenizer.path_escapes = true; - defer tokenizer.path_escapes = false; const first = tokenizer.nextNoWS(); const new_source = findIncludeSource(pp, tokenizer, first, which) catch |er| switch (er) { error.InvalidInclude => return, @@ -2626,7 +2616,7 @@ fn findIncludeSource(pp: *Preprocessor, tokenizer: *Tokenizer, first: RawToken, const filename_tok = try pp.findIncludeFilenameToken(first, tokenizer, .expect_nl_eof); // Check for empty filename. - const tok_slice = pp.expandedSliceExtra(filename_tok, .single_macro_ws, true); + const tok_slice = pp.expandedSliceExtra(filename_tok, .single_macro_ws); if (tok_slice.len < 3) { try pp.err(first, .empty_filename); return error.InvalidInclude; diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig index 993f2c6b..3c9a85a0 100644 --- a/src/Tokenizer.zig +++ b/src/Tokenizer.zig @@ -1003,8 +1003,6 @@ index: u32 = 0, source: Source.Id, comp: *const Compilation, line: u32 = 1, -/// Used to parse include strings with Windows style paths. -path_escapes: bool = false, pub fn next(self: *Tokenizer) Token { var state: enum { @@ -1015,7 +1013,6 @@ pub fn next(self: *Tokenizer) Token { U, L, string_literal, - path_escape, char_literal_start, char_literal, char_escape_sequence, @@ -1233,7 +1230,7 @@ pub fn next(self: *Tokenizer) Token { }, .string_literal => switch (c) { '\\' => { - state = if (self.path_escapes) .path_escape else .string_escape_sequence; + state = .string_escape_sequence; }, '"' => { self.index += 1; @@ -1246,9 +1243,6 @@ pub fn next(self: *Tokenizer) Token { '\r' => unreachable, else => {}, }, - .path_escape => { - state = .string_literal; - }, .char_literal_start => switch (c) { '\\' => { state = .char_escape_sequence; @@ -1704,7 +1698,6 @@ pub fn next(self: *Tokenizer) Token { .start, .line_comment => {}, .u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.comp, self.buf[start..self.index]), .extended_identifier => id = .extended_identifier, - .path_escape => id = .invalid, .period2 => { self.index -= 1; From 81be5c731709283939ab625a12a0ca466ab5dfce Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Fri, 20 Oct 2023 23:25:01 -0700 Subject: [PATCH 25/26] Preprocessor: remove some code duplication for invalid token handling --- src/Preprocessor.zig | 57 ++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/src/Preprocessor.zig b/src/Preprocessor.zig index 8b035633..b0e28f9e 100644 --- a/src/Preprocessor.zig +++ b/src/Preprocessor.zig @@ -266,6 +266,15 @@ pub fn addIncludeResume(pp: *Preprocessor, source: Source.Id, offset: u32, line: } }); } +fn invalidTokenDiagnostic(tok_id: Token.Id) Diagnostics.Tag { + return switch (tok_id) { + .unterminated_string_literal => .unterminated_string_literal_warning, + .empty_char_literal => .empty_char_literal_warning, + .unterminated_char_literal => .unterminated_char_literal_warning, + else => unreachable, + }; +} + /// Return the name of the #ifndef guard macro that starts a source, if any. fn findIncludeGuard(pp: *Preprocessor, source: Source) ?[]const u8 { var tokenizer = Tokenizer{ @@ -631,20 +640,16 @@ fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!Token { } return tokFromRaw(tok); }, + .unterminated_string_literal, .unterminated_char_literal, .empty_char_literal => |tag| { + start_of_line = false; + try pp.err(tok, invalidTokenDiagnostic(tag)); + try pp.expandMacro(&tokenizer, tok); + }, + .unterminated_comment => try pp.err(tok, .unterminated_comment), else => { if (tok.id.isMacroIdentifier() and pp.poisoned_identifiers.get(pp.tokSlice(tok)) != null) { try pp.err(tok, .poisoned_identifier); } - switch (tok.id) { - .unterminated_string_literal => try pp.err(tok, .unterminated_string_literal_warning), - .empty_char_literal => try pp.err(tok, .empty_char_literal_warning), - .unterminated_char_literal => try pp.err(tok, .unterminated_char_literal_warning), - .unterminated_comment => { - try pp.err(tok, .unterminated_comment); - continue; - }, - else => {}, - } // Add the token to the buffer doing any necessary expansions. start_of_line = false; try pp.expandMacro(&tokenizer, tok); @@ -2186,21 +2191,11 @@ fn define(pp: *Preprocessor, tokenizer: *Tokenizer) Error!void { try pp.token_buf.append(tok); }, .whitespace => need_ws = true, - .unterminated_string_literal => { - try pp.err(tok, .unterminated_string_literal_warning); + .unterminated_string_literal, .unterminated_char_literal, .empty_char_literal => |tag| { + try pp.err(tok, invalidTokenDiagnostic(tag)); try pp.token_buf.append(tok); }, - .unterminated_char_literal => { - try pp.err(tok, .unterminated_char_literal_warning); - try pp.token_buf.append(tok); - }, - .empty_char_literal => { - try pp.err(tok, .empty_char_literal_warning); - try pp.token_buf.append(tok); - }, - .unterminated_comment => { - try pp.err(tok, .unterminated_comment); - }, + .unterminated_comment => try pp.err(tok, .unterminated_comment), else => { if (tok.id != .whitespace and need_ws) { need_ws = false; @@ -2342,21 +2337,11 @@ fn defineFn(pp: *Preprocessor, tokenizer: *Tokenizer, macro_name: RawToken, l_pa } try pp.token_buf.append(tok); }, - .unterminated_string_literal => { - try pp.err(tok, .unterminated_string_literal_warning); + .unterminated_string_literal, .unterminated_char_literal, .empty_char_literal => |tag| { + try pp.err(tok, invalidTokenDiagnostic(tag)); try pp.token_buf.append(tok); }, - .unterminated_char_literal => { - try pp.err(tok, .unterminated_char_literal_warning); - try pp.token_buf.append(tok); - }, - .empty_char_literal => { - try pp.err(tok, .empty_char_literal_warning); - try pp.token_buf.append(tok); - }, - .unterminated_comment => { - try pp.err(tok, .unterminated_comment); - }, + .unterminated_comment => try pp.err(tok, .unterminated_comment), else => { if (tok.id != .whitespace and need_ws) { need_ws = false; From 05a73293c9350494ca74a3643589a8c4dea1a5f3 Mon Sep 17 00:00:00 2001 From: Veikka Tuominen Date: Sat, 21 Oct 2023 12:34:00 +0300 Subject: [PATCH 26/26] remove outdated TESTS_SKIPPED --- test/cases/stringify invalid.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/test/cases/stringify invalid.c b/test/cases/stringify invalid.c index 66fb3d7f..6d551d29 100644 --- a/test/cases/stringify invalid.c +++ b/test/cases/stringify invalid.c @@ -1,8 +1,5 @@ -// clang also reports: warning: missing terminating '"' character [-Winvalid-pp-token] -#define TESTS_SKIPPED 1 - -#define EXPECTED_ERRORS "stringify invalid.c:12:20: warning: missing terminating '\"' character [-Winvalid-pp-token]" \ - "stringify invalid.c:16:1: error: expected ';', found '}'" +#define EXPECTED_ERRORS "stringify invalid.c:9:20: warning: missing terminating '\"' character [-Winvalid-pp-token]" \ + "stringify invalid.c:13:1: error: expected ';', found '}'" void foo(void) {