diff --git a/src/Attribute.zig b/src/Attribute.zig index 966d240c..23e038d4 100644 --- a/src/Attribute.zig +++ b/src/Attribute.zig @@ -60,7 +60,7 @@ pub const ArgumentType = enum { fn fromType(comptime T: type) ArgumentType { return switch (T) { - []const u8 => .string, + Value.ByteRange => .string, Identifier => .identifier, u32 => .int, Alignment => .alignment, @@ -263,10 +263,17 @@ fn diagnoseField( .bytes => { const bytes = val.data.bytes.trim(1); // remove null terminator if (wanted == Value.ByteRange) { + std.debug.assert(node.tag == .string_literal_expr); + if (!node.ty.elemType().is(.char) and !node.ty.elemType().is(.uchar)) { + return Diagnostics.Message{ + .tag = .attribute_requires_string, + .extra = .{ .str = decl.name }, + }; + } @field(@field(arguments, decl.name), field.name) = bytes; return null; } else if (@typeInfo(wanted) == .Enum and @hasDecl(wanted, "opts") and wanted.opts.enum_kind == .string) { - const str = bytes.slice(strings); + const str = bytes.slice(strings, .@"1"); if (std.meta.stringToEnum(wanted, str)) |enum_val| { @field(@field(arguments, decl.name), field.name) = enum_val; return null; diff --git a/src/Compilation.zig b/src/Compilation.zig index c0c3126a..49846b8a 100644 --- a/src/Compilation.zig +++ b/src/Compilation.zig @@ -238,6 +238,8 @@ pub fn generateBuiltinMacros(comp: *Compilation) !Source { \\#define __STDC_NO_COMPLEX__ 1 \\#define __STDC_NO_THREADS__ 1 \\#define __STDC_NO_VLA__ 1 + \\#define __STDC_UTF_16__ 1 + \\#define __STDC_UTF_32__ 1 \\ ); if (comp.langopts.standard.StdCVersionMacro()) |stdc_version| { @@ -1428,6 +1430,20 @@ pub fn hasBuiltinFunction(comp: *const Compilation, builtin: Builtin) bool { } } +pub const CharUnitSize = enum(u32) { + @"1" = 1, + @"2" = 2, + @"4" = 4, + + pub fn Type(comptime self: CharUnitSize) type { + return switch (self) { + .@"1" => u8, + .@"2" => u16, + .@"4" => u32, + }; + } +}; + pub const renderErrors = Diagnostics.render; test "addSourceFromReader" { diff --git a/src/Diagnostics.zig b/src/Diagnostics.zig index 7c8ae4fd..64ab6ca5 100644 --- a/src/Diagnostics.zig +++ b/src/Diagnostics.zig @@ -179,6 +179,7 @@ pub const Options = packed struct { @"invalid-source-encoding": Kind = .default, @"four-char-constants": Kind = .default, @"unknown-escape-sequence": Kind = .default, + @"invalid-pp-token": Kind = .default, }; const messages = struct { @@ -2510,6 +2511,42 @@ const messages = struct { const opt = "unknown-escape-sequence"; const extra = .invalid_escape; }; + pub const attribute_requires_string = struct { + const msg = "attribute '{s}' requires an ordinary string"; + const kind = .@"error"; + const extra = .str; + }; + pub const unterminated_string_literal_warning = struct { + const msg = "missing terminating '\"' character"; + const kind = .warning; + const opt = "invalid-pp-token"; + }; + pub const unterminated_string_literal_error = struct { + const msg = "missing terminating '\"' character"; + const kind = .@"error"; + }; + pub const empty_char_literal_warning = struct { + const msg = "empty character constant"; + const kind = .warning; + const opt = "invalid-pp-token"; + }; + pub const empty_char_literal_error = struct { + const msg = "empty character constant"; + const kind = .@"error"; + }; + pub const unterminated_char_literal_warning = struct { + const msg = "missing terminating ' character"; + const kind = .warning; + const opt = "invalid-pp-token"; + }; + pub const unterminated_char_literal_error = struct { + const msg = "missing terminating ' character"; + const kind = .@"error"; + }; + pub 
const unterminated_comment = struct { + const msg = "unterminated comment"; + const kind = .@"error"; + }; }; list: std.ArrayListUnmanaged(Message) = .{}, diff --git a/src/Ir.zig b/src/Ir.zig index 43739252..4c45f78b 100644 --- a/src/Ir.zig +++ b/src/Ir.zig @@ -552,7 +552,7 @@ fn writeValue(ir: Ir, val_ref: Interner.Ref, color: bool, w: anytype) !void { switch (v.tag) { .unavailable => try w.writeAll(" unavailable"), .int => try w.print("{d}", .{v.data.int}), - .bytes => try w.print("\"{s}\"", .{v.data.bytes.slice(ir.strings)}), + .bytes => try w.print("\"{s}\"", .{v.data.bytes.slice(ir.strings, .@"1")}), // std.fmt does @as instead of @floatCast .float => try w.print("{d}", .{@as(f64, @floatCast(v.data.float))}), else => try w.print("({s})", .{@tagName(v.tag)}), diff --git a/src/Parser.zig b/src/Parser.zig index 93da1d3e..660fbf10 100644 --- a/src/Parser.zig +++ b/src/Parser.zig @@ -17,7 +17,7 @@ const NodeList = std.ArrayList(NodeIndex); const InitList = @import("InitList.zig"); const Attribute = @import("Attribute.zig"); const CharInfo = @import("CharInfo.zig"); -const CharLiteral = @import("CharLiteral.zig"); +const TextLiteral = @import("TextLiteral.zig"); const Value = @import("Value.zig"); const SymbolStack = @import("SymbolStack.zig"); const Symbol = SymbolStack.Symbol; @@ -468,7 +468,7 @@ fn checkDeprecatedUnavailable(p: *Parser, ty: Type, usage_tok: TokenIndex, decl_ defer p.strings.items.len = strings_top; const w = p.strings.writer(); - const msg_str = p.retainedString(@"error".msg); + const msg_str = p.attributeMessageString(@"error".msg); try w.print("call to '{s}' declared with attribute error: {s}", .{ p.tokSlice(@"error".__name_tok), msg_str }); const str = try p.comp.diag.arena.allocator().dupe(u8, p.strings.items[strings_top..]); try p.errStr(.error_attribute, usage_tok, str); @@ -478,7 +478,7 @@ fn checkDeprecatedUnavailable(p: *Parser, ty: Type, usage_tok: TokenIndex, decl_ defer p.strings.items.len = strings_top; const w = p.strings.writer(); - const msg_str = p.retainedString(warning.msg); + const msg_str = p.attributeMessageString(warning.msg); try w.print("call to '{s}' declared with attribute warning: {s}", .{ p.tokSlice(warning.__name_tok), msg_str }); const str = try p.comp.diag.arena.allocator().dupe(u8, p.strings.items[strings_top..]); try p.errStr(.warning_attribute, usage_tok, str); @@ -493,9 +493,10 @@ fn checkDeprecatedUnavailable(p: *Parser, ty: Type, usage_tok: TokenIndex, decl_ } } +/// Assumes that the specified range was created by an ordinary or `u8` string literal /// Returned slice is invalidated if additional strings are added to p.retained_strings -fn retainedString(p: *Parser, range: Value.ByteRange) []const u8 { - return range.slice(p.retained_strings.items); +fn attributeMessageString(p: *Parser, range: Value.ByteRange) []const u8 { + return range.slice(p.retained_strings.items, .@"1"); } fn errDeprecated(p: *Parser, tag: Diagnostics.Tag, tok_i: TokenIndex, msg: ?Value.ByteRange) Compilation.Error!void { @@ -511,7 +512,7 @@ fn errDeprecated(p: *Parser, tag: Diagnostics.Tag, tok_i: TokenIndex, msg: ?Valu }; try w.writeAll(reason); if (msg) |m| { - const str = p.retainedString(m); + const str = p.attributeMessageString(m); try w.print(": {s}", .{str}); } const str = try p.comp.diag.arena.allocator().dupe(u8, p.strings.items[strings_top..]); @@ -1153,17 +1154,13 @@ fn staticAssertMessage(p: *Parser, cond_node: NodeIndex, message: Result) !?[]co try buf.appendSlice(")'"); } if (message.node != .none) { + 
assert(p.nodes.items(.tag)[@intFromEnum(message.node)] == .string_literal_expr); if (buf.items.len > 0) { try buf.append(' '); } - const data = message.val.data.bytes; - try buf.ensureUnusedCapacity(data.len()); - try Tree.dumpStr( - p.retained_strings.items, - data, - p.nodes.items(.tag)[@intFromEnum(message.node)], - buf.writer(), - ); + const byte_range = message.val.data.bytes; + try buf.ensureUnusedCapacity(byte_range.len()); + try byte_range.dumpString(message.ty, p.comp, p.retained_strings.items, buf.writer()); } return try p.comp.diag.arena.allocator().dupe(u8, buf.items); } @@ -1184,6 +1181,7 @@ fn staticAssert(p: *Parser) Error!bool { .string_literal_utf_8, .string_literal_utf_32, .string_literal_wide, + .unterminated_string_literal, => try p.stringLiteral(), else => { try p.err(.expected_str_literal); @@ -3953,7 +3951,7 @@ fn assembly(p: *Parser, kind: enum { global, decl_label, stmt }) Error!?NodeInde fn asmStr(p: *Parser) Error!Result { var i = p.tok_i; while (true) : (i += 1) switch (p.tok_ids[i]) { - .string_literal => {}, + .string_literal, .unterminated_string_literal => {}, .string_literal_utf_16, .string_literal_utf_8, .string_literal_utf_32 => { try p.errStr(.invalid_asm_str, p.tok_i, "unicode"); return error.ParsingFailed; @@ -7461,12 +7459,15 @@ fn primaryExpr(p: *Parser) Error!Result { .string_literal_utf_8, .string_literal_utf_32, .string_literal_wide, + .unterminated_string_literal, => return p.stringLiteral(), .char_literal, .char_literal_utf_8, .char_literal_utf_16, .char_literal_utf_32, .char_literal_wide, + .empty_char_literal, + .unterminated_char_literal, => return p.charLiteral(), .zero => { p.tok_i += 1; @@ -7523,131 +7524,123 @@ fn makePredefinedIdentifier(p: *Parser, start: u32) !Result { } fn stringLiteral(p: *Parser) Error!Result { - var start = p.tok_i; - // use 1 for wchar_t - var width: ?u8 = null; - var is_u8_literal = false; - while (true) { - switch (p.tok_ids[p.tok_i]) { - .string_literal => {}, - .string_literal_utf_16 => if (width) |some| { - if (some != 16) try p.err(.unsupported_str_cat); - } else { - width = 16; - }, - .string_literal_utf_8 => { - is_u8_literal = true; - if (width) |some| { - if (some != 8) try p.err(.unsupported_str_cat); - } else { - width = 8; + var string_end = p.tok_i; + var string_kind: TextLiteral.Kind = .char; + while (TextLiteral.Kind.classify(p.tok_ids[string_end], .string_literal)) |next| : (string_end += 1) { + string_kind = string_kind.concat(next) catch { + try p.errTok(.unsupported_str_cat, string_end); + while (p.tok_ids[p.tok_i].isStringLiteral()) : (p.tok_i += 1) {} + return error.ParsingFailed; + }; + if (string_kind == .unterminated) { + try p.errTok(.unterminated_string_literal_error, string_end); + p.tok_i = string_end + 1; + return error.ParsingFailed; + } + } + assert(string_end > p.tok_i); + + const char_width = string_kind.charUnitSize(p.comp); + + const retain_start = mem.alignForward(usize, p.retained_strings.items.len, string_kind.internalStorageAlignment(p.comp)); + try p.retained_strings.resize(retain_start); + + while (p.tok_i < string_end) : (p.tok_i += 1) { + const this_kind = TextLiteral.Kind.classify(p.tok_ids[p.tok_i], .string_literal).?; + const slice = this_kind.contentSlice(p.tokSlice(p.tok_i)); + var char_literal_parser = TextLiteral.Parser.init(slice, this_kind, 0x10ffff, p.comp); + + try p.retained_strings.ensureUnusedCapacity((slice.len + 1) * @intFromEnum(char_width)); // +1 for null terminator + while (char_literal_parser.next()) |item| switch (item) { + .value => |v| { + switch 
(char_width) { + .@"1" => p.retained_strings.appendAssumeCapacity(@intCast(v)), + .@"2" => { + const word: u16 = @intCast(v); + p.retained_strings.appendSliceAssumeCapacity(mem.asBytes(&word)); + }, + .@"4" => p.retained_strings.appendSliceAssumeCapacity(mem.asBytes(&v)), } }, - .string_literal_utf_32 => if (width) |some| { - if (some != 32) try p.err(.unsupported_str_cat); - } else { - width = 32; + .codepoint => |c| { + switch (char_width) { + .@"1" => { + var buf: [4]u8 = undefined; + const written = std.unicode.utf8Encode(c, &buf) catch unreachable; + const encoded = buf[0..written]; + p.retained_strings.appendSliceAssumeCapacity(encoded); + }, + .@"2" => { + var utf16_buf: [2]u16 = undefined; + var utf8_buf: [4]u8 = undefined; + const utf8_written = std.unicode.utf8Encode(c, &utf8_buf) catch unreachable; + const utf16_written = std.unicode.utf8ToUtf16Le(&utf16_buf, utf8_buf[0..utf8_written]) catch unreachable; + const bytes = std.mem.sliceAsBytes(utf16_buf[0..utf16_written]); + p.retained_strings.appendSliceAssumeCapacity(bytes); + }, + .@"4" => { + const val: u32 = c; + p.retained_strings.appendSliceAssumeCapacity(mem.asBytes(&val)); + }, + } }, - .string_literal_wide => if (width) |some| { - if (some != 1) try p.err(.unsupported_str_cat); - } else { - width = 1; + .improperly_encoded => |bytes| p.retained_strings.appendSliceAssumeCapacity(bytes), + .utf8_text => |view| { + switch (char_width) { + .@"1" => p.retained_strings.appendSliceAssumeCapacity(view.bytes), + .@"2" => { + var capacity_slice: []align(@alignOf(u16)) u8 = @alignCast(p.retained_strings.unusedCapacitySlice()); + const dest_len = std.mem.alignBackward(usize, capacity_slice.len, 2); + var dest = std.mem.bytesAsSlice(u16, capacity_slice[0..dest_len]); + const words_written = std.unicode.utf8ToUtf16Le(dest, view.bytes) catch unreachable; + p.retained_strings.resize(p.retained_strings.items.len + words_written * 2) catch unreachable; + }, + .@"4" => { + var it = view.iterator(); + while (it.nextCodepoint()) |codepoint| { + const val: u32 = codepoint; + p.retained_strings.appendSliceAssumeCapacity(mem.asBytes(&val)); + } + }, + } }, - else => break, - } - p.tok_i += 1; - } - if (width == null) width = 8; - if (width.? != 8) return p.todo("unicode string literals"); - - const string_start = p.retained_strings.items.len; - while (start < p.tok_i) : (start += 1) { - var slice = p.tokSlice(start); - slice = slice[0 .. slice.len - 1]; - var i = mem.indexOf(u8, slice, "\"").? + 1; - try p.retained_strings.ensureUnusedCapacity(slice.len); - while (i < slice.len) : (i += 1) { - switch (slice[i]) { - '\\' => { - i += 1; - switch (slice[i]) { - '\n' => i += 1, - '\r' => i += 2, - '\'', '\"', '\\', '?' 
=> |c| p.retained_strings.appendAssumeCapacity(c), - 'n' => p.retained_strings.appendAssumeCapacity('\n'), - 'r' => p.retained_strings.appendAssumeCapacity('\r'), - 't' => p.retained_strings.appendAssumeCapacity('\t'), - 'a' => p.retained_strings.appendAssumeCapacity(0x07), - 'b' => p.retained_strings.appendAssumeCapacity(0x08), - 'e' => { - try p.errExtra(.non_standard_escape_char, start, .{ .invalid_escape = .{ .char = 'e', .offset = @intCast(i) } }); - p.retained_strings.appendAssumeCapacity(0x1B); - }, - 'f' => p.retained_strings.appendAssumeCapacity(0x0C), - 'v' => p.retained_strings.appendAssumeCapacity(0x0B), - 'x' => p.retained_strings.appendAssumeCapacity(try p.parseNumberEscape(start, 16, slice, &i)), - '0'...'7' => p.retained_strings.appendAssumeCapacity(try p.parseNumberEscape(start, 8, slice, &i)), - 'u' => try p.parseUnicodeEscape(start, 4, slice, &i), - 'U' => try p.parseUnicodeEscape(start, 8, slice, &i), - else => unreachable, - } - }, - else => |c| p.retained_strings.appendAssumeCapacity(c), - } + }; + for (char_literal_parser.errors.constSlice()) |item| { + try p.errExtra(item.tag, p.tok_i, item.extra); } } - try p.retained_strings.append(0); - const slice = p.retained_strings.items[string_start..]; + p.retained_strings.appendNTimesAssumeCapacity(0, @intFromEnum(char_width)); + const slice = p.retained_strings.items[retain_start..]; const arr_ty = try p.arena.create(Type.Array); - const specifier: Type.Specifier = if (is_u8_literal and p.comp.langopts.hasChar8_T()) .uchar else .char; - - arr_ty.* = .{ .elem = .{ .specifier = specifier }, .len = slice.len }; + arr_ty.* = .{ .elem = string_kind.elementType(p.comp), .len = @divExact(slice.len, @intFromEnum(char_width)) }; var res: Result = .{ .ty = .{ .specifier = .array, .data = .{ .array = arr_ty }, }, - .val = Value.bytes(@intCast(string_start), @intCast(p.retained_strings.items.len)), + .val = Value.bytes(@intCast(retain_start), @intCast(p.retained_strings.items.len)), }; res.node = try p.addNode(.{ .tag = .string_literal_expr, .ty = res.ty, .data = undefined }); if (!p.in_macro) try p.value_map.put(res.node, res.val); return res; } -fn parseNumberEscape(p: *Parser, tok: TokenIndex, base: u8, slice: []const u8, i: *usize) !u8 { - if (base == 16) i.* += 1; // skip x - var char: u8 = 0; - var reported = false; - while (i.* < slice.len) : (i.* += 1) { - const val = std.fmt.charToDigit(slice[i.*], base) catch break; // validated by Tokenizer - const product, const overflowed = @mulWithOverflow(char, base); - if (overflowed != 0 and !reported) { - try p.errExtra(.escape_sequence_overflow, tok, .{ .unsigned = i.* }); - reported = true; - } - char = product + val; - } - i.* -= 1; - return char; -} - -fn parseUnicodeEscape(p: *Parser, tok: TokenIndex, count: u8, slice: []const u8, i: *usize) !void { - const c = std.fmt.parseInt(u21, slice[i.* + 1 ..][0..count], 16) catch 0x110000; // count validated by tokenizer - i.* += count + 1; - if (!std.unicode.utf8ValidCodepoint(c) or (c < 0xa0 and c != '$' and c != '@' and c != '`')) { - try p.errExtra(.invalid_universal_character, tok, .{ .unsigned = i.* - count - 2 }); - return; - } - var buf: [4]u8 = undefined; - const to_write = std.unicode.utf8Encode(c, &buf) catch unreachable; // validated above - p.retained_strings.appendSliceAssumeCapacity(buf[0..to_write]); -} - fn charLiteral(p: *Parser) Error!Result { defer p.tok_i += 1; const tok_id = p.tok_ids[p.tok_i]; - const char_kind = CharLiteral.Kind.classify(tok_id); + const char_kind = TextLiteral.Kind.classify(tok_id, 
.char_literal) orelse { + if (tok_id == .empty_char_literal) { + try p.err(.empty_char_literal_error); + } else if (tok_id == .unterminated_char_literal) { + try p.err(.unterminated_char_literal_error); + } else unreachable; + return .{ + .ty = Type.int, + .val = Value.int(0), + .node = try p.addNode(.{ .tag = .char_literal, .ty = Type.int, .data = undefined }), + }; + }; var val: u32 = 0; const slice = char_kind.contentSlice(p.tokSlice(p.tok_i)); @@ -7656,7 +7649,8 @@ fn charLiteral(p: *Parser) Error!Result { // fast path: single unescaped ASCII char val = slice[0]; } else { - var char_literal_parser = CharLiteral.Parser.init(slice, char_kind, p.comp); + const max_codepoint = char_kind.maxCodepoint(p.comp); + var char_literal_parser = TextLiteral.Parser.init(slice, char_kind, max_codepoint, p.comp); const max_chars_expected = 4; var stack_fallback = std.heap.stackFallback(max_chars_expected * @sizeOf(u32), p.comp.gpa); @@ -7664,20 +7658,21 @@ fn charLiteral(p: *Parser) Error!Result { defer chars.deinit(); while (char_literal_parser.next()) |item| switch (item) { - .value => |c| try chars.append(c), + .value => |v| try chars.append(v), + .codepoint => |c| try chars.append(c), .improperly_encoded => |s| { try chars.ensureUnusedCapacity(s.len); for (s) |c| chars.appendAssumeCapacity(c); }, .utf8_text => |view| { var it = view.iterator(); - var max_codepoint: u21 = 0; + var max_codepoint_seen: u21 = 0; try chars.ensureUnusedCapacity(view.bytes.len); while (it.nextCodepoint()) |c| { - max_codepoint = @max(max_codepoint, c); + max_codepoint_seen = @max(max_codepoint_seen, c); chars.appendAssumeCapacity(c); } - if (max_codepoint > char_kind.maxCodepoint(p.comp)) { + if (max_codepoint_seen > max_codepoint) { char_literal_parser.err(.char_too_large, .{ .none = {} }); } }, diff --git a/src/Preprocessor.zig b/src/Preprocessor.zig index ca1b41ce..b0e28f9e 100644 --- a/src/Preprocessor.zig +++ b/src/Preprocessor.zig @@ -266,6 +266,15 @@ pub fn addIncludeResume(pp: *Preprocessor, source: Source.Id, offset: u32, line: } }); } +fn invalidTokenDiagnostic(tok_id: Token.Id) Diagnostics.Tag { + return switch (tok_id) { + .unterminated_string_literal => .unterminated_string_literal_warning, + .empty_char_literal => .empty_char_literal_warning, + .unterminated_char_literal => .unterminated_char_literal_warning, + else => unreachable, + }; +} + /// Return the name of the #ifndef guard macro that starts a source, if any. 
fn findIncludeGuard(pp: *Preprocessor, source: Source) ?[]const u8 { var tokenizer = Tokenizer{ @@ -631,6 +640,12 @@ fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!Token { } return tokFromRaw(tok); }, + .unterminated_string_literal, .unterminated_char_literal, .empty_char_literal => |tag| { + start_of_line = false; + try pp.err(tok, invalidTokenDiagnostic(tag)); + try pp.expandMacro(&tokenizer, tok); + }, + .unterminated_comment => try pp.err(tok, .unterminated_comment), else => { if (tok.id.isMacroIdentifier() and pp.poisoned_identifiers.get(pp.tokSlice(tok)) != null) { try pp.err(tok, .poisoned_identifier); @@ -1239,7 +1254,7 @@ fn reconstructIncludeString(pp: *Preprocessor, param_toks: []const Token) !?[]co } for (params) |tok| { - const str = pp.expandedSliceExtra(tok, .preserve_macro_ws, false); + const str = pp.expandedSliceExtra(tok, .preserve_macro_ws); try pp.char_buf.appendSlice(str); } @@ -1985,12 +2000,7 @@ fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroErr } } -fn expandedSliceExtra( - pp: *const Preprocessor, - tok: Token, - macro_ws_handling: enum { single_macro_ws, preserve_macro_ws }, - path_escapes: bool, -) []const u8 { +fn expandedSliceExtra(pp: *const Preprocessor, tok: Token, macro_ws_handling: enum { single_macro_ws, preserve_macro_ws }) []const u8 { if (tok.id.lexeme()) |some| { if (!tok.id.allowsDigraphs(pp.comp) and !(tok.id == .macro_ws and macro_ws_handling == .preserve_macro_ws)) return some; } @@ -1999,7 +2009,6 @@ fn expandedSliceExtra( .comp = pp.comp, .index = tok.loc.byte_offset, .source = .generated, - .path_escapes = path_escapes, }; if (tok.id == .macro_string) { while (true) : (tmp_tokenizer.index += 1) { @@ -2013,7 +2022,7 @@ fn expandedSliceExtra( /// Get expanded token source string. pub fn expandedSlice(pp: *Preprocessor, tok: Token) []const u8 { - return pp.expandedSliceExtra(tok, .single_macro_ws, false); + return pp.expandedSliceExtra(tok, .single_macro_ws); } /// Concat two tokens and add the result to pp.generated @@ -2182,6 +2191,11 @@ fn define(pp: *Preprocessor, tokenizer: *Tokenizer) Error!void { try pp.token_buf.append(tok); }, .whitespace => need_ws = true, + .unterminated_string_literal, .unterminated_char_literal, .empty_char_literal => |tag| { + try pp.err(tok, invalidTokenDiagnostic(tag)); + try pp.token_buf.append(tok); + }, + .unterminated_comment => try pp.err(tok, .unterminated_comment), else => { if (tok.id != .whitespace and need_ws) { need_ws = false; @@ -2323,6 +2337,11 @@ fn defineFn(pp: *Preprocessor, tokenizer: *Tokenizer, macro_name: RawToken, l_pa } try pp.token_buf.append(tok); }, + .unterminated_string_literal, .unterminated_char_literal, .empty_char_literal => |tag| { + try pp.err(tok, invalidTokenDiagnostic(tag)); + try pp.token_buf.append(tok); + }, + .unterminated_comment => try pp.err(tok, .unterminated_comment), else => { if (tok.id != .whitespace and need_ws) { need_ws = false; @@ -2368,8 +2387,6 @@ fn defineFn(pp: *Preprocessor, tokenizer: *Tokenizer, macro_name: RawToken, l_pa /// Handle an #embed directive fn embed(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!void { - tokenizer.path_escapes = true; - defer tokenizer.path_escapes = false; const first = tokenizer.nextNoWS(); const filename_tok = pp.findIncludeFilenameToken(first, tokenizer, .expect_nl_eof) catch |er| switch (er) { error.InvalidInclude => return, @@ -2377,7 +2394,7 @@ fn embed(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!void { }; // Check for empty filename. 
- const tok_slice = pp.expandedSliceExtra(filename_tok, .single_macro_ws, true); + const tok_slice = pp.expandedSliceExtra(filename_tok, .single_macro_ws); if (tok_slice.len < 3) { try pp.err(first, .empty_filename); return; @@ -2419,8 +2436,6 @@ fn embed(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!void { // Handle a #include directive. fn include(pp: *Preprocessor, tokenizer: *Tokenizer, which: Compilation.WhichInclude) MacroError!void { - tokenizer.path_escapes = true; - defer tokenizer.path_escapes = false; const first = tokenizer.nextNoWS(); const new_source = findIncludeSource(pp, tokenizer, first, which) catch |er| switch (er) { error.InvalidInclude => return, @@ -2586,7 +2601,7 @@ fn findIncludeSource(pp: *Preprocessor, tokenizer: *Tokenizer, first: RawToken, const filename_tok = try pp.findIncludeFilenameToken(first, tokenizer, .expect_nl_eof); // Check for empty filename. - const tok_slice = pp.expandedSliceExtra(filename_tok, .single_macro_ws, true); + const tok_slice = pp.expandedSliceExtra(filename_tok, .single_macro_ws); if (tok_slice.len < 3) { try pp.err(first, .empty_filename); return error.InvalidInclude; diff --git a/src/Source.zig b/src/Source.zig index c7f401cf..6986f88b 100644 --- a/src/Source.zig +++ b/src/Source.zig @@ -74,7 +74,10 @@ pub fn lineCol(source: Source, loc: Location) LineCol { i += 1; continue; }; - const cp = std.unicode.utf8Decode(source.buf[i..][0..len]) catch unreachable; + const cp = std.unicode.utf8Decode(source.buf[i..][0..len]) catch { + i += 1; + continue; + }; width += codepointWidth(cp); i += len; } diff --git a/src/CharLiteral.zig b/src/TextLiteral.zig similarity index 68% rename from src/CharLiteral.zig rename to src/TextLiteral.zig index 7c47ac7f..4364a1d8 100644 --- a/src/CharLiteral.zig +++ b/src/TextLiteral.zig @@ -1,3 +1,5 @@ +//! Parsing and classification of string and character literals + const std = @import("std"); const Compilation = @import("Compilation.zig"); const Type = @import("Type.zig"); @@ -6,8 +8,10 @@ const Tokenizer = @import("Tokenizer.zig"); const mem = std.mem; pub const Item = union(enum) { - /// decoded escape + /// decoded hex or character escape value: u32, + /// validated unicode codepoint + codepoint: u21, /// Char literal in the source text is not utf8 encoded improperly_encoded: []const u8, /// 1 or more unescaped bytes @@ -25,28 +29,41 @@ pub const Kind = enum { utf_8, utf_16, utf_32, - - pub fn classify(id: Tokenizer.Token.Id) Kind { - return switch (id) { - .char_literal, - .string_literal, - => .char, - .char_literal_utf_8, - .string_literal_utf_8, - => .utf_8, - .char_literal_wide, - .string_literal_wide, - => .wide, - .char_literal_utf_16, - .string_literal_utf_16, - => .utf_16, - .char_literal_utf_32, - .string_literal_utf_32, - => .utf_32, - else => unreachable, + /// Error kind that halts parsing + unterminated, + + pub fn classify(id: Tokenizer.Token.Id, context: enum { string_literal, char_literal }) ?Kind { + return switch (context) { + .string_literal => switch (id) { + .string_literal => .char, + .string_literal_utf_8 => .utf_8, + .string_literal_wide => .wide, + .string_literal_utf_16 => .utf_16, + .string_literal_utf_32 => .utf_32, + .unterminated_string_literal => .unterminated, + else => null, + }, + .char_literal => switch (id) { + .char_literal => .char, + .char_literal_utf_8 => .utf_8, + .char_literal_wide => .wide, + .char_literal_utf_16 => .utf_16, + .char_literal_utf_32 => .utf_32, + else => null, + }, }; } + /// Should only be called for string literals. 
Determines the result kind of two adjacent string + /// literals + pub fn concat(self: Kind, other: Kind) !Kind { + if (self == .unterminated or other == .unterminated) return .unterminated; + if (self == other) return self; // can always concat with own kind + if (self == .char) return other; // char + X -> X + if (other == .char) return self; // X + char -> X + return error.CannotConcat; + } + /// Largest unicode codepoint that can be represented by this character kind /// May be smaller than the largest value that can be represented. /// For example u8 char literals may only specify 0-127 via literals or @@ -58,6 +75,7 @@ pub const Kind = enum { .utf_8 => std.math.maxInt(u7), .utf_16 => std.math.maxInt(u16), .utf_32 => 0x10FFFF, + .unterminated => unreachable, }); } @@ -68,9 +86,11 @@ pub const Kind = enum { .wide => comp.types.wchar.maxInt(comp), .utf_16 => std.math.maxInt(u16), .utf_32 => std.math.maxInt(u32), + .unterminated => unreachable, }); } + /// The C type of a character literal of this kind pub fn charLiteralType(kind: Kind, comp: *const Compilation) Type { return switch (kind) { .char => Type.int, @@ -78,10 +98,11 @@ pub const Kind = enum { .utf_8 => .{ .specifier = .uchar }, .utf_16 => comp.types.uint_least16_t, .utf_32 => comp.types.uint_least32_t, + .unterminated => unreachable, }; } - /// Return the actual contents of the string literal with leading / trailing quotes and + /// Return the actual contents of the literal with leading / trailing quotes and /// specifiers removed pub fn contentSlice(kind: Kind, delimited: []const u8) []const u8 { const end = delimited.len - 1; // remove trailing quote @@ -91,6 +112,40 @@ pub const Kind = enum { .utf_8 => delimited[3..end], .utf_16 => delimited[2..end], .utf_32 => delimited[2..end], + .unterminated => unreachable, + }; + } + + /// The size of a character unit for a string literal of this kind + pub fn charUnitSize(kind: Kind, comp: *const Compilation) Compilation.CharUnitSize { + return switch (kind) { + .char => .@"1", + .wide => switch (comp.types.wchar.sizeof(comp).?) 
{ + 2 => .@"2", + 4 => .@"4", + else => unreachable, + }, + .utf_8 => .@"1", + .utf_16 => .@"2", + .utf_32 => .@"4", + .unterminated => unreachable, + }; + } + + /// Required alignment within aro (on compiler host) for writing to retained_strings + pub fn internalStorageAlignment(kind: Kind, comp: *const Compilation) usize { + return switch (kind.charUnitSize(comp)) { + inline else => |size| @alignOf(size.Type()), + }; + } + + /// The C type of an element of a string literal of this kind + pub fn elementType(kind: Kind, comp: *const Compilation) Type { + return switch (kind) { + .unterminated => unreachable, + .char => .{ .specifier = .char }, + .utf_8 => if (comp.langopts.hasChar8_T()) .{ .specifier = .uchar } else .{ .specifier = .char }, + else => kind.charLiteralType(comp), }; } }; @@ -99,23 +154,38 @@ pub const Parser = struct { literal: []const u8, i: usize = 0, kind: Kind, + max_codepoint: u21, /// We only want to issue a max of 1 error per char literal errored: bool = false, errors: std.BoundedArray(CharDiagnostic, 4) = .{}, comp: *const Compilation, - pub fn init(literal: []const u8, kind: Kind, comp: *const Compilation) Parser { + pub fn init(literal: []const u8, kind: Kind, max_codepoint: u21, comp: *const Compilation) Parser { return .{ .literal = literal, .comp = comp, .kind = kind, + .max_codepoint = max_codepoint, + }; + } + + fn prefixLen(self: *const Parser) usize { + return switch (self.kind) { + .unterminated => unreachable, + .char => 0, + .utf_8 => 2, + .wide, .utf_16, .utf_32 => 1, }; } pub fn err(self: *Parser, tag: Diagnostics.Tag, extra: Diagnostics.Message.Extra) void { if (self.errored) return; self.errored = true; - self.errors.append(.{ .tag = tag, .extra = extra }) catch {}; + const diagnostic = .{ .tag = tag, .extra = extra }; + self.errors.append(diagnostic) catch { + _ = self.errors.pop(); + self.errors.append(diagnostic) catch unreachable; + }; } pub fn warn(self: *Parser, tag: Diagnostics.Tag, extra: Diagnostics.Message.Extra) void { @@ -134,9 +204,9 @@ pub const Parser = struct { const view = std.unicode.Utf8View.init(unescaped_slice) catch { if (self.kind != .char) { self.err(.illegal_char_encoding_error, .{ .none = {} }); - } else { - self.warn(.illegal_char_encoding_warning, .{ .none = {} }); + return null; } + self.warn(.illegal_char_encoding_warning, .{ .none = {} }); return .{ .improperly_encoded = self.literal[start..self.i] }; }; return .{ .utf8_text = view }; @@ -180,7 +250,7 @@ pub const Parser = struct { self.i += expected_len; if (overflowed) { - self.err(.escape_sequence_overflow, .{ .unsigned = start }); + self.err(.escape_sequence_overflow, .{ .unsigned = start + self.prefixLen() }); return null; } @@ -190,12 +260,13 @@ pub const Parser = struct { } if (val > std.math.maxInt(u21) or !std.unicode.utf8ValidCodepoint(@intCast(val))) { - self.err(.invalid_universal_character, .{ .unsigned = start }); + self.err(.invalid_universal_character, .{ .unsigned = start + self.prefixLen() }); return null; } - if (val > self.kind.maxCodepoint(self.comp)) { + if (val > self.max_codepoint) { self.err(.char_too_large, .{ .none = {} }); + return null; } if (val < 0xA0 and (val != '$' and val != '@' and val != '`')) { @@ -216,7 +287,7 @@ pub const Parser = struct { } self.warn(.c89_ucn_in_literal, .{ .none = {} }); - return .{ .value = val }; + return .{ .codepoint = @intCast(val) }; } fn parseEscapedChar(self: *Parser) Item { @@ -259,6 +330,7 @@ pub const Parser = struct { var val: u32 = 0; var count: usize = 0; var overflowed = false; + const start = 
self.i; defer self.i += count; const slice = switch (base) { .octal => self.literal[self.i..@min(self.literal.len, self.i + 3)], // max 3 chars @@ -275,7 +347,8 @@ pub const Parser = struct { count += 1; } if (overflowed or val > self.kind.maxInt(self.comp)) { - self.err(.escape_sequence_overflow, .{ .unsigned = 0 }); + self.err(.escape_sequence_overflow, .{ .unsigned = start + self.prefixLen() }); + return 0; } if (count == 0) { std.debug.assert(base == .hex); diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig index 4e37d764..3c9a85a0 100644 --- a/src/Tokenizer.zig +++ b/src/Tokenizer.zig @@ -30,6 +30,10 @@ pub const Token = struct { string_literal_utf_32, string_literal_wide, + /// Any string literal with an embedded newline or EOF + /// Always a parser error; by default just a warning from preprocessor + unterminated_string_literal, + // only generated by preprocessor macro_string, @@ -40,6 +44,17 @@ pub const Token = struct { char_literal_utf_32, char_literal_wide, + /// Any character literal with nothing inside the quotes + /// Always a parser error; by default just a warning from preprocessor + empty_char_literal, + + /// Any character literal with an embedded newline or EOF + /// Always a parser error; by default just a warning from preprocessor + unterminated_char_literal, + + /// `/* */` style comment without a closing `*/` before EOF + unterminated_comment, + /// Integer literal tokens generated by preprocessor. one, zero, @@ -470,6 +485,7 @@ pub const Token = struct { return switch (id) { .include_start, .include_resume, + .unterminated_comment, // Fatal error; parsing should not be attempted => unreachable, .invalid, @@ -480,6 +496,9 @@ pub const Token = struct { .string_literal_utf_8, .string_literal_utf_32, .string_literal_wide, + .unterminated_string_literal, + .unterminated_char_literal, + .empty_char_literal, .char_literal, .char_literal_utf_8, .char_literal_utf_16, @@ -984,8 +1003,6 @@ index: u32 = 0, source: Source.Id, comp: *const Compilation, line: u32 = 1, -/// Used to parse include strings with Windows style paths. 
-path_escapes: bool = false, pub fn next(self: *Tokenizer) Token { var state: enum { @@ -996,14 +1013,10 @@ pub fn next(self: *Tokenizer) Token { U, L, string_literal, - path_escape, char_literal_start, char_literal, char_escape_sequence, - escape_sequence, - octal_escape, - hex_escape, - unicode_escape, + string_escape_sequence, identifier, extended_identifier, equal, @@ -1038,8 +1051,6 @@ pub fn next(self: *Tokenizer) Token { var start = self.index; var id: Token.Id = .eof; - var return_state = state; - var counter: u32 = 0; while (self.index < self.buf.len) : (self.index += 1) { const c = self.buf[self.index]; switch (state) { @@ -1219,29 +1230,30 @@ pub fn next(self: *Tokenizer) Token { }, .string_literal => switch (c) { '\\' => { - return_state = .string_literal; - state = if (self.path_escapes) .path_escape else .escape_sequence; + state = .string_escape_sequence; }, '"' => { self.index += 1; break; }, '\n' => { - id = .invalid; + id = .unterminated_string_literal; break; }, '\r' => unreachable, else => {}, }, - .path_escape => { - state = .string_literal; - }, .char_literal_start => switch (c) { '\\' => { state = .char_escape_sequence; }, - '\'', '\n' => { - id = .invalid; + '\'' => { + id = .empty_char_literal; + self.index += 1; + break; + }, + '\n' => { + id = .unterminated_char_literal; break; }, else => { @@ -1257,7 +1269,7 @@ pub fn next(self: *Tokenizer) Token { break; }, '\n' => { - id = .invalid; + id = .unterminated_char_literal; break; }, else => {}, @@ -1266,55 +1278,9 @@ pub fn next(self: *Tokenizer) Token { '\r', '\n' => unreachable, // removed by line splicing else => state = .char_literal, }, - .escape_sequence => switch (c) { - '\'', '"', '?', '\\', 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v' => { - state = return_state; - }, + .string_escape_sequence => switch (c) { '\r', '\n' => unreachable, // removed by line splicing - '0'...'7' => { - counter = 1; - state = .octal_escape; - }, - 'x' => state = .hex_escape, - 'u' => { - counter = 4; - state = .unicode_escape; - }, - 'U' => { - counter = 8; - state = .unicode_escape; - }, - else => { - id = .invalid; - break; - }, - }, - .octal_escape => switch (c) { - '0'...'7' => { - counter += 1; - if (counter == 3) state = return_state; - }, - else => { - self.index -= 1; - state = return_state; - }, - }, - .hex_escape => switch (c) { - '0'...'9', 'a'...'f', 'A'...'F' => {}, - else => { - self.index -= 1; - state = return_state; - }, - }, - .unicode_escape => switch (c) { - '0'...'9', 'a'...'f', 'A'...'F' => { - counter -= 1; - if (counter == 0) state = return_state; - }, - else => { - id = .invalid; - break; - }, + else => state = .string_literal, }, .identifier, .extended_identifier => switch (c) { 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, @@ -1732,19 +1698,18 @@ pub fn next(self: *Tokenizer) Token { .start, .line_comment => {}, .u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.comp, self.buf[start..self.index]), .extended_identifier => id = .extended_identifier, - .period2, - .string_literal, - .path_escape, - .char_literal_start, - .char_literal, - .escape_sequence, - .char_escape_sequence, - .octal_escape, - .hex_escape, - .unicode_escape, + + .period2 => { + self.index -= 1; + id = .period; + }, + .multi_line_comment, .multi_line_comment_asterisk, - => id = .invalid, + => id = .unterminated_comment, + + .char_escape_sequence, .char_literal, .char_literal_start => id = .unterminated_char_literal, + .string_escape_sequence, .string_literal => id = .unterminated_string_literal, .whitespace => id = .whitespace, 
.multi_line_comment_done => id = .whitespace, @@ -2114,7 +2079,7 @@ test "extended identifiers" { try expectTokens("0x0\u{E0000}", &.{ .pp_num, .extended_identifier }); try expectTokens("\"\\0\u{E0000}\"", &.{.string_literal}); try expectTokens("\"\\x\u{E0000}\"", &.{.string_literal}); - try expectTokens("\"\\u\u{E0000}\"", &.{ .invalid, .extended_identifier, .invalid }); + try expectTokens("\"\\u\u{E0000}\"", &.{.string_literal}); try expectTokens("1e\u{E0000}", &.{ .pp_num, .extended_identifier }); try expectTokens("1e1\u{E0000}", &.{ .pp_num, .extended_identifier }); } diff --git a/src/Tree.zig b/src/Tree.zig index 5c25ec66..f6335014 100644 --- a/src/Tree.zig +++ b/src/Tree.zig @@ -656,17 +656,6 @@ pub fn isLvalExtra(nodes: Node.List.Slice, extra: []const NodeIndex, value_map: } } -pub fn dumpStr(retained_strings: []const u8, range: Value.ByteRange, tag: Tag, writer: anytype) !void { - switch (tag) { - .string_literal_expr => { - const lit_range = range.trim(1); // remove null-terminator - const str = lit_range.slice(retained_strings); - try writer.print("\"{}\"", .{std.zig.fmtEscapes(str)}); - }, - else => unreachable, - } -} - pub fn tokSlice(tree: Tree, tok_i: TokenIndex) []const u8 { if (tree.tokens.items(.id)[tok_i].lexeme()) |some| return some; const loc = tree.tokens.items(.loc)[tok_i]; @@ -716,8 +705,8 @@ fn dumpAttribute(attr: Attribute, strings: []const u8, writer: anytype) !void { try writer.writeAll(f.name); try writer.writeAll(": "); switch (f.type) { - Value.ByteRange => try writer.print("\"{s}\"", .{@field(args, f.name).slice(strings)}), - ?Value.ByteRange => try writer.print("\"{?s}\"", .{if (@field(args, f.name)) |range| range.slice(strings) else null}), + Value.ByteRange => try writer.print("\"{s}\"", .{@field(args, f.name).slice(strings, .@"1")}), + ?Value.ByteRange => try writer.print("\"{?s}\"", .{if (@field(args, f.name)) |range| range.slice(strings, .@"1") else null}), else => switch (@typeInfo(f.type)) { .Enum => try writer.writeAll(@tagName(@field(args, f.name))), else => try writer.print("{any}", .{@field(args, f.name)}), diff --git a/src/Value.zig b/src/Value.zig index 58a058c3..1577db93 100644 --- a/src/Value.zig +++ b/src/Value.zig @@ -18,8 +18,40 @@ pub const ByteRange = struct { return .{ .start = self.start, .end = self.end - amount }; } - pub fn slice(self: ByteRange, all_bytes: []const u8) []const u8 { - return all_bytes[self.start..self.end]; + pub fn slice(self: ByteRange, all_bytes: []const u8, comptime size: Compilation.CharUnitSize) []const size.Type() { + switch (size) { + inline else => |sz| { + const aligned: []align(@alignOf(sz.Type())) const u8 = @alignCast(all_bytes[self.start..self.end]); + return std.mem.bytesAsSlice(sz.Type(), aligned); + }, + } + } + + pub fn dumpString(range: ByteRange, ty: Type, comp: *const Compilation, strings: []const u8, w: anytype) !void { + const size: Compilation.CharUnitSize = @enumFromInt(ty.elemType().sizeof(comp).?); + const without_null = range.trim(@intFromEnum(size)); + switch (size) { + inline .@"1", .@"2" => |sz| { + const data_slice = without_null.slice(strings, sz); + const formatter = if (sz == .@"1") std.zig.fmtEscapes(data_slice) else std.unicode.fmtUtf16le(data_slice); + try w.print("\"{}\"", .{formatter}); + }, + .@"4" => { + try w.writeByte('"'); + const data_slice = without_null.slice(strings, .@"4"); + var buf: [4]u8 = undefined; + for (data_slice) |item| { + if (item <= std.math.maxInt(u21) and std.unicode.utf8ValidCodepoint(@intCast(item))) { + const codepoint: u21 = @intCast(item); + const 
written = std.unicode.utf8Encode(codepoint, &buf) catch unreachable; + try w.print("{s}", .{buf[0..written]}); + } else { + try w.print("\\x{x}", .{item}); + } + } + try w.writeByte('"'); + }, + } } }; @@ -593,7 +625,7 @@ pub fn dump(v: Value, ty: Type, comp: *Compilation, strings: []const u8, w: anyt } else { try w.print("{d}", .{v.signExtend(ty, comp)}); }, - .bytes => try w.print("\"{s}\"", .{v.data.bytes.slice(strings)}), + .bytes => try v.data.bytes.dumpString(ty, comp, strings, w), // std.fmt does @as instead of @floatCast .float => try w.print("{d}", .{@as(f64, @floatCast(v.data.float))}), else => try w.print("({s})", .{@tagName(v.tag)}), diff --git a/src/codegen/x86_64.zig b/src/codegen/x86_64.zig index bc8b43df..aa96b4df 100644 --- a/src/codegen/x86_64.zig +++ b/src/codegen/x86_64.zig @@ -177,7 +177,7 @@ fn genNode(func: *Fn, node: NodeIndex) Codegen.Error!Value { .int_literal => return Value{ .immediate = @bitCast(data.int) }, .string_literal_expr => { const range = func.c.tree.value_map.get(node).?.data.bytes; - const str_bytes = range.slice(func.c.tree.strings); + const str_bytes = range.slice(func.c.tree.strings, .@"1"); const section = try func.c.obj.getSection(.strings); const start = section.items.len; try section.appendSlice(str_bytes); diff --git a/test/cases/attributes.c b/test/cases/attributes.c index 73b59567..2363b8eb 100644 --- a/test/cases/attributes.c +++ b/test/cases/attributes.c @@ -107,6 +107,8 @@ typedef struct { __attribute__((aligned(32))) char aligned_arr[] = {1, 2, 3}; _Static_assert(sizeof(aligned_arr) == 3, ""); +__attribute__((section(1))) int Z; + __attribute__(()) // test attribute at eof #define TESTS_SKIPPED 1 @@ -119,4 +121,5 @@ __attribute__(()) // test attribute at eof "attributes.c:36:5: error: fallthrough annotation does not directly precede switch label" \ "attributes.c:40:20: error: 'noreturn' attribute cannot be applied to a statement" \ "attributes.c:76:6: error: cannot call non function type 'int'" \ - "attributes.c:110:18: error: expected identifier or '('" \ + "attributes.c:110:24: error: Attribute argument is invalid, expected a string but got an integer constant" \ + "attributes.c:112:18: error: expected identifier or '('" \ diff --git a/test/cases/stringify invalid.c b/test/cases/stringify invalid.c index 84e3d172..6d551d29 100644 --- a/test/cases/stringify invalid.c +++ b/test/cases/stringify invalid.c @@ -1,7 +1,5 @@ -// clang also reports: warning: missing terminating '"' character [-Winvalid-pp-token] -#define TESTS_SKIPPED 1 - -#define EXPECTED_ERRORS "stringify invalid.c:15:1: error: expected ';', found '}'" +#define EXPECTED_ERRORS "stringify invalid.c:9:20: warning: missing terminating '\"' character [-Winvalid-pp-token]" \ + "stringify invalid.c:13:1: error: expected ';', found '}'" void foo(void) { diff --git a/test/cases/strings.c b/test/cases/strings.c index 402f4633..148c5e1b 100644 --- a/test/cases/strings.c +++ b/test/cases/strings.c @@ -1,5 +1,5 @@ _Static_assert(1, "foo" "\n" "bar"); -_Static_assert(1, "foo" "\x606262 "); +_Static_assert(1, "foo" "abc\x606262 "); _Static_assert(1, "\000062"); _Static_assert(1, "\U00110000"); _Static_assert(1, "\u0062"); @@ -15,12 +15,12 @@ _Static_assert(1, "\u0060"); _Static_assert(1, "aaァ\e[1;"); #pragma GCC diagnostic pop -#define EXPECTED_ERRORS "strings.c:2:30: error: escape sequence out of range" \ - "strings.c:4:20: error: invalid universal character" \ - "strings.c:5:20: error: invalid universal character" \ +#define EXPECTED_ERRORS "strings.c:2:29: error: escape sequence out 
of range" \ + "strings.c:4:19: error: invalid universal character" \ + "strings.c:5:19: error: character 'b' cannot be specified by a universal character name" \ "strings.c:7:9: warning: multi-character character constant [-Wmultichar]" \ "strings.c:7:9: warning: character constant too long for its type" \ - "strings.c:9:20: error: invalid universal character" \ - "strings.c:10:20: error: invalid universal character" \ - "strings.c:11:20: error: invalid universal character" \ - "strings.c:15:24: warning: use of non-standard escape character '\\e' [-Wpedantic]" \ + "strings.c:9:19: error: invalid universal character" \ + "strings.c:10:19: error: invalid universal character" \ + "strings.c:11:19: error: invalid universal character" \ + "strings.c:15:23: warning: use of non-standard escape character '\\e' [-Wpedantic]" \ diff --git a/test/cases/unterminated char literal.c b/test/cases/unterminated char literal.c new file mode 100644 index 00000000..951ca6e4 --- /dev/null +++ b/test/cases/unterminated char literal.c @@ -0,0 +1,17 @@ +#define A 'b +#define B '' +#define C(X) '' +#define D(X) 'A + +#define EXPECTED_ERRORS "unterminated char literal.c:1:11: warning: missing terminating ' character [-Winvalid-pp-token]" \ + "unterminated char literal.c:2:11: warning: empty character constant [-Winvalid-pp-token]" \ + "unterminated char literal.c:3:14: warning: empty character constant [-Winvalid-pp-token]" \ + "unterminated char literal.c:4:14: warning: missing terminating ' character [-Winvalid-pp-token]" \ + "unterminated char literal.c:16:10: warning: empty character constant [-Winvalid-pp-token]" \ + "unterminated char literal.c:17:10: warning: missing terminating ' character [-Winvalid-pp-token]" \ + "unterminated char literal.c:16:10: error: empty character constant" \ + "unterminated char literal.c:17:10: error: missing terminating ' character" \ + "unterminated char literal.c:17:11: error: expected ';' before end of file" \ + +char c = u8''; +char d = ' \ No newline at end of file diff --git a/test/cases/unterminated comment.c b/test/cases/unterminated comment.c new file mode 100644 index 00000000..91b22923 --- /dev/null +++ b/test/cases/unterminated comment.c @@ -0,0 +1,4 @@ +#define EXPECTED_ERRORS "unterminated comment.c:4:7: error: unterminated comment" \ + "unterminated comment.c:4:6: error: expected ';' before end of file" \ + +int x /** \ No newline at end of file diff --git a/test/cases/unterminated string literal.c b/test/cases/unterminated string literal.c new file mode 100644 index 00000000..d5bbcc9b --- /dev/null +++ b/test/cases/unterminated string literal.c @@ -0,0 +1,11 @@ +#define EXPECTED_ERRORS "unterminated string literal.c:9:12: warning: missing terminating '\"' character [-Winvalid-pp-token]" \ + "unterminated string literal.c:10:20: warning: missing terminating '\"' character [-Winvalid-pp-token]" \ + "unterminated string literal.c:11:12: warning: missing terminating '\"' character [-Winvalid-pp-token]" \ + "unterminated string literal.c:9:12: error: missing terminating '\"' character" \ + "unterminated string literal.c:10:20: error: missing terminating '\"' character" \ + "unterminated string literal.c:11:12: error: missing terminating '\"' character" \ + + +char A[] = "hello +char B[] = "hello" "world +char C[] = " \ No newline at end of file diff --git a/test/cases/wide character constants.c b/test/cases/wide character constants.c index 99d512d4..c174fbef 100644 --- a/test/cases/wide character constants.c +++ b/test/cases/wide character constants.c @@ -55,13 +55,13 
@@ int Z = 'ABC\D'; "wide character constants.c:10:16: error: wide character literals may not contain multiple characters" \ "wide character constants.c:11:16: error: Unicode character literals may not contain multiple characters" \ "wide character constants.c:14:16: warning: multi-character character constant [-Wfour-char-constants]" \ - "wide character constants.c:20:19: error: escape sequence out of range" \ + "wide character constants.c:20:21: error: escape sequence out of range" \ "wide character constants.c:22:19: error: character too large for enclosing character literal type" \ - "wide character constants.c:25:19: error: invalid universal character" \ + "wide character constants.c:25:20: error: invalid universal character" \ "wide character constants.c:26:19: error: character too large for enclosing character literal type" \ "wide character constants.c:27:19: error: Unicode character literals may not contain multiple characters" \ "wide character constants.c:28:19: error: Unicode character literals may not contain multiple characters" \ - "wide character constants.c:29:19: error: escape sequence out of range" \ + "wide character constants.c:29:20: error: escape sequence out of range" \ "wide character constants.c:33:9: error: Unicode character literals may not contain multiple characters" \ "wide character constants.c:35:9: error: character too large for enclosing character literal type" \ "wide character constants.c:36:9: error: character 'A' cannot be specified by a universal character name" \ diff --git a/test/cases/wide strings.c b/test/cases/wide strings.c new file mode 100644 index 00000000..2e37b07d --- /dev/null +++ b/test/cases/wide strings.c @@ -0,0 +1,79 @@ +//aro-args -std=c2x +#include <stdint.h> +typedef __WCHAR_TYPE__ wchar_t; + +uint8_t b[] = u8""; +_Static_assert(sizeof(b) == sizeof(uint8_t[1])); +char c[] = ""; +_Static_assert(sizeof(c) == 1); +wchar_t d[] = L""; +_Static_assert(sizeof(d) == sizeof(wchar_t[1])); +uint16_t e[] = u""; +_Static_assert(sizeof(e) == sizeof(uint16_t[1])); +uint32_t f[] = U""; +_Static_assert(sizeof(f) == sizeof(uint32_t[1])); + +uint16_t A[] = u"abc"; +_Static_assert(sizeof(A) == 8); + +uint32_t B[] = U"ABC"; +_Static_assert(sizeof(B) == 16); + +wchar_t C[] = L"ABC"; +_Static_assert(sizeof(C) == sizeof(wchar_t) * 4); + +uint16_t D[] = u"a" U"b"; + +uint16_t E[] = u"a" u"bc"; +_Static_assert(sizeof(E) == 8); + +uint32_t F[] = U"A" "BC"; +_Static_assert(sizeof(F) == 16); + +uint16_t G[] = u"🤗"; +_Static_assert(sizeof(G) == 6); + +uint16_t H[] = u"\U0001F917"; +_Static_assert(sizeof(H) == 6); + +uint32_t I[] = U"🤗"; +_Static_assert(sizeof(I) == 8); + +uint8_t J[] = u8"🤗"; +_Static_assert(sizeof(J) == 5); + +uint8_t K[] = u8"\U0001F917"; +_Static_assert(sizeof(K) == 5); + +uint16_t L[] = u"\xFFFFF"; + +uint8_t M[] = u8"\xFFF"; + +_Static_assert(1 == 2, u"😬\U0001f62c"); +_Static_assert(1 == 2, U"😬\U0001f62c"); + +char foo[] = "\u0020\u0020\u0020\u0020\xFFFFFFFF"; + +wchar_t N[] = "word" L"" "a"; +_Static_assert(sizeof(N) == sizeof(wchar_t) * 6); +uint32_t O[] = "word" U"" "a"; +_Static_assert(sizeof(O) == sizeof(uint32_t) * 6); +uint16_t P[] = "word" u"" "a"; +_Static_assert(sizeof(P) == sizeof(uint16_t) * 6); + +uint32_t Q[] = U"abc\ndef\xFFghi"; +_Static_assert(sizeof(Q) == sizeof(uint32_t) * 12); + +uint32_t R[] = U"a" U'b'; +uint32_t S[] = U'a'; +uint32_t T[] = { U'a', U'b'}; + +#define EXPECTED_ERRORS "wide strings.c:25:21: error: unsupported string literal concatenation" \ + "wide strings.c:48:18: error: escape sequence out of range" \ + "wide 
strings.c:50:18: error: escape sequence out of range" \ + "wide strings.c:52:1: error: static assertion failed \"😬😬\"" \ + "wide strings.c:53:1: error: static assertion failed \"😬😬\"" \ + "wide strings.c:55:39: error: escape sequence out of range" \ + "wide strings.c:67:21: error: expected ';', found 'a character literal'" \ + "wide strings.c:68:16: error: array initializer must be an initializer list or wide string literal" \ +
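
Reviewer note (a standalone sketch, not part of the patch): the concatenation rule added in src/TextLiteral.zig is small enough to check in isolation. The Kind enum below is a local copy of the patch's logic, not an import of the real module, and the `zig test` cases mirror the interesting lines of the new wide strings.c test.

const std = @import("std");

const Kind = enum {
    char,
    wide,
    utf_8,
    utf_16,
    utf_32,
    unterminated,

    // Same rule as TextLiteral.Kind.concat in the patch: `char` combines with
    // every other kind; two distinct non-char kinds cannot be mixed.
    fn concat(self: Kind, other: Kind) !Kind {
        if (self == .unterminated or other == .unterminated) return .unterminated;
        if (self == other) return self; // can always concat with own kind
        if (self == .char) return other; // char + X -> X
        if (other == .char) return self; // X + char -> X
        return error.CannotConcat;
    }
};

test "adjacent string literal kinds" {
    try std.testing.expectEqual(Kind.utf_16, try Kind.char.concat(.utf_16)); // "word" u"" -> char16_t[]
    try std.testing.expectEqual(Kind.wide, try Kind.wide.concat(.char)); // L"" "a" -> wchar_t[]
    try std.testing.expectError(error.CannotConcat, Kind.utf_16.concat(.utf_32)); // u"a" U"b" -> error
}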
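
In the same spirit, a minimal sketch of the ByteRange.slice change in src/Value.zig: string data stays raw bytes in retained_strings, and wide literals are read back by reinterpreting those bytes at the element width given by the new Compilation.CharUnitSize. `unitSlice` below is a hypothetical local stand-in for ByteRange.slice without the start/end bookkeeping; the enum is copied from the patch.

const std = @import("std");

const CharUnitSize = enum(u32) {
    @"1" = 1,
    @"2" = 2,
    @"4" = 4,

    fn Type(comptime self: CharUnitSize) type {
        return switch (self) {
            .@"1" => u8,
            .@"2" => u16,
            .@"4" => u32,
        };
    }
};

// Reinterpret stored bytes at the requested unit width, as ByteRange.slice does.
fn unitSlice(all_bytes: []const u8, comptime size: CharUnitSize) []const size.Type() {
    const aligned: []align(@alignOf(size.Type())) const u8 = @alignCast(all_bytes);
    return std.mem.bytesAsSlice(size.Type(), aligned);
}

test "read a UTF-16 literal back as u16 code units" {
    // Build the stored bytes the way the parser does: native-endian u16 units
    // plus a u16 null terminator. This is also why the patch aligns retain_start
    // with internalStorageAlignment before writing.
    const units = [_]u16{ 'h', 'i', 0 };
    var stored: [6]u8 align(2) = undefined;
    @memcpy(&stored, std.mem.sliceAsBytes(&units));
    try std.testing.expectEqualSlices(u16, &units, unitSlice(&stored, .@"2"));
}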
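
Lastly, the width-2 codepoint path in Parser.stringLiteral goes through UTF-8 on its way to UTF-16LE. Assuming only documented std.unicode behavior, this shows why the new test asserts sizeof(G) == 6 for u"🤗": the emoji takes a surrogate pair (four bytes) plus a two-byte null terminator.

const std = @import("std");

test "codepoint to UTF-16LE, as in the width-2 string path" {
    var utf8_buf: [4]u8 = undefined;
    var utf16_buf: [2]u16 = undefined;
    const c: u21 = 0x1F917; // U+1F917, outside the BMP
    const utf8_len = try std.unicode.utf8Encode(c, &utf8_buf);
    const utf16_len = try std.unicode.utf8ToUtf16Le(&utf16_buf, utf8_buf[0..utf8_len]);
    // Two u16 code units (a surrogate pair) occupy four bytes in retained_strings.
    try std.testing.expectEqual(@as(usize, 2), utf16_len);
    try std.testing.expectEqual(@as(usize, 4), std.mem.sliceAsBytes(utf16_buf[0..utf16_len]).len);
}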