diff --git a/build/GenerateDef.zig b/build/GenerateDef.zig
index 9c6a4d6c..508a629f 100644
--- a/build/GenerateDef.zig
+++ b/build/GenerateDef.zig
@@ -50,8 +50,8 @@ pub fn create(owner: *std.Build, options: Options) std.Build.Module.Import {
     };
 }
 
-fn make(step: *Step, prog_node: std.Progress.Node) !void {
-    _ = prog_node;
+fn make(step: *Step, options: std.Build.Step.MakeOptions) !void {
+    _ = options;
     const b = step.owner;
     const self: *GenerateDef = @fieldParentPtr("step", step);
     const arena = b.allocator;
diff --git a/src/aro.zig b/src/aro.zig
index a9b76e0e..73f105f4 100644
--- a/src/aro.zig
+++ b/src/aro.zig
@@ -35,5 +35,6 @@ test {
     _ = @import("aro/target.zig");
     _ = @import("aro/Tokenizer.zig");
     _ = @import("aro/toolchains/Linux.zig");
+    _ = @import("aro/Treap.zig");
     _ = @import("aro/Value.zig");
 }
diff --git a/src/aro/Diagnostics/messages.def b/src/aro/Diagnostics/messages.def
index 119e72a2..7700e1b8 100644
--- a/src/aro/Diagnostics/messages.def
+++ b/src/aro/Diagnostics/messages.def
@@ -2509,3 +2509,13 @@ auto_type_self_initialized
     .msg = "variable '{s}' declared with deduced type '__auto_type' cannot appear in its own initializer"
     .extra = .str
     .kind = .@"error"
+
+expected_left_angle_bracket
+    .msg = "expected '<' but got '{s}'"
+    .extra = .str
+    .kind = .@"error"
+
+closing_paren_after
+    .msg = "expected '(' after '{s}'"
+    .extra = .str
+    .kind = .@"error"
diff --git a/src/aro/Parser.zig b/src/aro/Parser.zig
index 270d0a33..17c43e25 100644
--- a/src/aro/Parser.zig
+++ b/src/aro/Parser.zig
@@ -7097,6 +7097,10 @@ fn unExpr(p: *Parser) Error!Result {
             return operand;
         },
         .plus_plus => {
+            if (p.in_macro) {
+                try p.err(.invalid_preproc_operator);
+                return error.ParsingFailed;
+            }
             p.tok_i += 1;
 
             var operand = try p.castExpr();
@@ -7123,6 +7127,10 @@ fn unExpr(p: *Parser) Error!Result {
             return operand;
         },
         .minus_minus => {
+            if (p.in_macro) {
+                try p.err(.invalid_preproc_operator);
+                return error.ParsingFailed;
+            }
             p.tok_i += 1;
 
             var operand = try p.castExpr();
@@ -7423,6 +7431,10 @@ fn suffixExpr(p: *Parser, lhs: Result) Error!Result {
     switch (p.tok_ids[p.tok_i]) {
         .l_paren => return p.callExpr(lhs),
         .plus_plus => {
+            if (p.in_macro) {
+                try p.err(.invalid_preproc_operator);
+                return error.ParsingFailed;
+            }
             defer p.tok_i += 1;
 
             var operand = lhs;
@@ -7441,6 +7453,10 @@ fn suffixExpr(p: *Parser, lhs: Result) Error!Result {
             return operand;
         },
         .minus_minus => {
+            if (p.in_macro) {
+                try p.err(.invalid_preproc_operator);
+                return error.ParsingFailed;
+            }
             defer p.tok_i += 1;
 
             var operand = lhs;
@@ -7459,6 +7475,10 @@ fn suffixExpr(p: *Parser, lhs: Result) Error!Result {
             return operand;
         },
         .l_bracket => {
+            if (p.in_macro) {
+                try p.err(.invalid_preproc_operator);
+                return error.ParsingFailed;
+            }
             const l_bracket = p.tok_i;
             p.tok_i += 1;
             var index = try p.expr();
@@ -7495,11 +7515,19 @@ fn suffixExpr(p: *Parser, lhs: Result) Error!Result {
             return ptr;
         },
         .period => {
+            if (p.in_macro) {
+                try p.err(.invalid_preproc_operator);
+                return error.ParsingFailed;
+            }
             p.tok_i += 1;
             const name = try p.expectIdentifier();
             return p.fieldAccess(lhs, name, false);
         },
         .arrow => {
+            if (p.in_macro) {
+                try p.err(.invalid_preproc_operator);
+                return error.ParsingFailed;
+            }
             p.tok_i += 1;
             const name = try p.expectIdentifier();
             if (lhs.ty.isArray()) {
@@ -8039,6 +8067,11 @@ fn makePredefinedIdentifier(p: *Parser, strings_top: usize) !Result {
 }
 
 fn stringLiteral(p: *Parser) Error!Result {
+    if (p.in_macro) {
+        try p.err(.invalid_preproc_expr_start);
+        return error.ParsingFailed;
+    }
+
     var string_end = p.tok_i;
     var string_kind: text_literal.Kind = .char;
     while (text_literal.Kind.classify(p.tok_ids[string_end], .string_literal)) |next| : (string_end += 1) {
diff --git a/src/aro/Pragma.zig b/src/aro/Pragma.zig
index 279ac5f0..3f698c31 100644
--- a/src/aro/Pragma.zig
+++ b/src/aro/Pragma.zig
@@ -57,8 +57,8 @@ pub fn pasteTokens(pp: *Preprocessor, start_idx: TokenIndex) ![]const u8 {
             .r_paren => rparen_count += 1,
             .string_literal => {
                 if (rparen_count != 0) return error.ExpectedStringLiteral;
-                const str = pp.expandedSlice(tok);
-                try pp.char_buf.appendSlice(str[1 .. str.len - 1]);
+                const str = pp.tokSlice(tok);
+                try pp.char_buf.appendSlice(pp.gpa, str[1 .. str.len - 1]);
             },
             else => return error.ExpectedStringLiteral,
         }
diff --git a/src/aro/Preprocessor.zig b/src/aro/Preprocessor.zig
index 3fd98882..13008292 100644
--- a/src/aro/Preprocessor.zig
+++ b/src/aro/Preprocessor.zig
@@ -14,10 +14,13 @@ const Token = Tree.Token;
 const TokenWithExpansionLocs = Tree.TokenWithExpansionLocs;
 const Attribute = @import("Attribute.zig");
 const features = @import("features.zig");
-const Hideset = @import("Hideset.zig");
+const OldPreprocessor = @import("Preprocessor.zig");
+const Treap = @import("Treap.zig");
+const ParamMap = std.StringHashMapUnmanaged(PreprocessorToken);
 const DefineMap = std.StringHashMapUnmanaged(Macro);
-const RawTokenList = std.ArrayList(RawToken);
+
+const TokenList = std.ArrayListUnmanaged(PreprocessorToken);
 
 const max_include_depth = 200;
 
 /// Errors that can be returned when expanding a macro.
@@ -25,40 +28,72 @@ const max_include_depth = 200;
 /// it is handled there and doesn't escape that function
 const MacroError = Error || error{StopPreprocessing};
 
+const PreprocessingError = Error || error{PreprocessingFailed};
+
+const SpecialMacroFn = fn (*Preprocessor, PreprocessorToken) Error!void;
+
+fn Range(comptime T: type) type {
+    return struct {
+        const Self = @This();
+        const Item = T;
+
+        start: u32,
+        end: u32,
+        const empty: Self = .{ .start = 0, .end = 0 };
+
+        fn len(self: Self) u32 {
+            return self.end - self.start;
+        }
+
+        fn slice(self: Self, items: []const Item) []const Item {
+            return items[self.start..self.end];
+        }
+    };
+}
+
+/// Each macro argument is a list of tokens (represented as a range of Preprocessor.macro_arg_tokens)
+const MacroArg = Range(PreprocessorToken);
+
+/// List of MacroArg's for a macro invocation (represented as a range of Preprocessor.macro_args)
+const MacroArgList = Range(MacroArg);
+
+const PreprocessorToken = TokenWithExpansionLocs;
+
 const Macro = struct {
-    /// Parameters of the function type macro
-    params: []const []const u8,
+    /// Tokens constituting the macro body
+    tokens: []const PreprocessorToken,
 
-    /// Token constituting the macro body
-    tokens: []const RawToken,
+    /// Number of arguments for function-like macros
+    nargs: usize,
 
     /// If the function type macro has variable number of arguments
     var_args: bool,
 
-    /// Is a function type macro
-    is_func: bool,
-
-    /// Is a predefined macro
-    is_builtin: bool = false,
-
     /// Location of macro in the source
     loc: Source.Location,
 
+    kind: Kind,
+
+    const Kind = union(enum) { object, func, special: *const SpecialMacroFn };
+
     fn eql(a: Macro, b: Macro, pp: *Preprocessor) bool {
+        if ((a.kind == .object and b.kind != .object) or (a.kind == .func and b.kind != .func)) return false;
+        if (!std.meta.eql(a.kind, b.kind)) return false;
         if (a.tokens.len != b.tokens.len) return false;
-        if (a.is_builtin != b.is_builtin) return false;
         for (a.tokens, b.tokens) |a_tok, b_tok| if (!tokEql(pp, a_tok, b_tok)) return false;
-        if (a.is_func and b.is_func) {
+        if (a.kind == .func) {
             if (a.var_args != b.var_args) return false;
-            if (a.params.len != b.params.len) return false;
-            for (a.params, b.params) |a_param, b_param| if (!mem.eql(u8, a_param, b_param)) return false;
         }
         return true;
     }
 
-    fn tokEql(pp: *Preprocessor, a: RawToken, b: RawToken) bool {
+    fn tokEql(pp: *Preprocessor, a: PreprocessorToken, b: PreprocessorToken) bool {
         return mem.eql(u8, pp.tokSlice(a), pp.tokSlice(b));
     }
 };
@@ -78,27 +113,15 @@ const TokenState = struct {
 comp: *Compilation,
 gpa: mem.Allocator,
 arena: std.heap.ArenaAllocator,
-defines: DefineMap = .{},
-/// Do not directly mutate this; use addToken / addTokenAssumeCapacity / ensureTotalTokenCapacity / ensureUnusedTokenCapacity
-tokens: Token.List = .{},
+
+tokens: std.MultiArrayList(Token) = .{},
 /// Do not directly mutate this; must be kept in sync with `tokens`
 expansion_entries: std.MultiArrayList(ExpansionEntry) = .{},
-token_buf: RawTokenList,
-char_buf: std.ArrayList(u8),
-/// Counter that is incremented each time preprocess() is called
-/// Can be used to distinguish multiple preprocessings of the same file
-preprocess_count: u32 = 0,
-generated_line: u32 = 1,
-add_expansion_nl: u32 = 0,
-include_depth: u8 = 0,
-counter: u32 = 0,
-expansion_source_loc: Source.Location = undefined,
-poisoned_identifiers: std.StringHashMap(void),
+
 /// Map from Source.Id to macro name in the `#ifndef` condition which guards the source, if any
 include_guards: std.AutoHashMapUnmanaged(Source.Id, []const u8) = .{},
-/// Memory is retained to avoid allocation on every single token.
-top_expansion_buf: ExpandBuf,
+char_buf: std.ArrayListUnmanaged(u8) = .{},
 
 /// Dump current state to stderr.
 verbose: bool = false,
@@ -107,7 +130,35 @@ preserve_whitespace: bool = false,
 /// linemarker tokens. Must be .none unless in -E mode (parser does not handle linemarkers)
 linemarkers: Linemarkers = .none,
 
-hideset: Hideset,
+tokenizers: std.ArrayListUnmanaged(Tokenizer) = .{},
+
+expansion_bufs: std.ArrayListUnmanaged(TokenList) = .{},
+
+defines: DefineMap = .{},
+
+generated_line: u32 = 1,
+
+counter: u32 = 0,
+
+if_level: u8 = 0,
+
+preprocess_count: u32 = 0,
+
+poisoned_identifiers: std.StringHashMap(void),
+
+if_kind: std.PackedIntArray(u2, 256) = blk: {
+    @setEvalBranchQuota(2000);
+    break :blk std.PackedIntArray(u2, 256).initAllTo(0);
+},
+
+guard_stack: std.ArrayListUnmanaged(?[]const u8) = .{},
+
+macro_arg_tokens: std.ArrayListUnmanaged(MacroArg.Item) = .{},
+macro_args: std.ArrayListUnmanaged(MacroArgList.Item) = .{},
+
+safe_strings: std.StringHashMapUnmanaged(void) = .{},
+
+treap: Treap,
 
 pub const parse = Parser.parse;
 
@@ -125,1186 +176,438 @@ pub fn init(comp: *Compilation) Preprocessor {
         .comp = comp,
         .gpa = comp.gpa,
         .arena = std.heap.ArenaAllocator.init(comp.gpa),
-        .token_buf = RawTokenList.init(comp.gpa),
-        .char_buf = std.ArrayList(u8).init(comp.gpa),
         .poisoned_identifiers = std.StringHashMap(void).init(comp.gpa),
-        .top_expansion_buf = ExpandBuf.init(comp.gpa),
-        .hideset = .{ .comp = comp },
+        .treap = Treap.init(comp.gpa),
     };
     comp.pragmaEvent(.before_preprocess);
     return pp;
 }
 
-/// Initialize Preprocessor with builtin macros.
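
Note on the new data layout above: macro arguments are no longer individually heap-allocated token slices. A `MacroArg` is just a `(start, end)` index pair into `pp.macro_arg_tokens`, and a `MacroArgList` is the same thing one level up over `pp.macro_args`, so a whole invocation can be discarded by truncating two flat arrays instead of freeing per-argument allocations (presumably the motivation here). A minimal standalone sketch of that range-into-backing-array pattern; the `Tok` type and the test are illustrative, not the PR's actual API:

```zig
const std = @import("std");

/// Simplified stand-in for the PR's `Range(T)`: a span of items in a shared
/// backing array, identified only by start/end indices.
fn Range(comptime T: type) type {
    return struct {
        const Self = @This();
        start: u32,
        end: u32,

        fn slice(self: Self, items: []const T) []const T {
            return items[self.start..self.end];
        }
    };
}

test "macro arguments as ranges into a shared token buffer" {
    const Tok = u8; // hypothetical token type; the real code uses PreprocessorToken
    const gpa = std.testing.allocator;

    var arg_tokens: std.ArrayListUnmanaged(Tok) = .{};
    defer arg_tokens.deinit(gpa);

    // Record one macro argument: remember where it starts, append its
    // tokens, then keep only the (start, end) pair.
    const start: u32 = @intCast(arg_tokens.items.len);
    try arg_tokens.appendSlice(gpa, "abc");
    const arg = Range(Tok){ .start = start, .end = @intCast(arg_tokens.items.len) };

    try std.testing.expectEqualSlices(Tok, "abc", arg.slice(arg_tokens.items));
}
```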
-pub fn initDefault(comp: *Compilation) !Preprocessor { - var pp = init(comp); - errdefer pp.deinit(); - try pp.addBuiltinMacros(); - return pp; -} - -const builtin_macros = struct { - const args = [1][]const u8{"X"}; - - const has_attribute = [1]RawToken{.{ - .id = .macro_param_has_attribute, - .source = .generated, - }}; - const has_c_attribute = [1]RawToken{.{ - .id = .macro_param_has_c_attribute, - .source = .generated, - }}; - const has_declspec_attribute = [1]RawToken{.{ - .id = .macro_param_has_declspec_attribute, - .source = .generated, - }}; - const has_warning = [1]RawToken{.{ - .id = .macro_param_has_warning, - .source = .generated, - }}; - const has_feature = [1]RawToken{.{ - .id = .macro_param_has_feature, - .source = .generated, - }}; - const has_extension = [1]RawToken{.{ - .id = .macro_param_has_extension, - .source = .generated, - }}; - const has_builtin = [1]RawToken{.{ - .id = .macro_param_has_builtin, - .source = .generated, - }}; - const has_include = [1]RawToken{.{ - .id = .macro_param_has_include, - .source = .generated, - }}; - const has_include_next = [1]RawToken{.{ - .id = .macro_param_has_include_next, - .source = .generated, - }}; - const has_embed = [1]RawToken{.{ - .id = .macro_param_has_embed, - .source = .generated, - }}; - - const is_identifier = [1]RawToken{.{ - .id = .macro_param_is_identifier, - .source = .generated, - }}; - - const pragma_operator = [1]RawToken{.{ - .id = .macro_param_pragma_operator, - .source = .generated, - }}; - - const file = [1]RawToken{.{ - .id = .macro_file, - .source = .generated, - }}; - const line = [1]RawToken{.{ - .id = .macro_line, - .source = .generated, - }}; - const counter = [1]RawToken{.{ - .id = .macro_counter, - .source = .generated, - }}; -}; - -fn addBuiltinMacro(pp: *Preprocessor, name: []const u8, is_func: bool, tokens: []const RawToken) !void { +fn addBuiltinMacro(pp: *Preprocessor, name: []const u8, func: *const SpecialMacroFn) !void { try pp.defines.putNoClobber(pp.gpa, name, .{ - .params = &builtin_macros.args, - .tokens = tokens, + .tokens = &.{}, .var_args = false, - .is_func = is_func, .loc = .{ .id = .generated }, - .is_builtin = true, + .kind = .{ .special = func }, + .nargs = 0, }); } -pub fn addBuiltinMacros(pp: *Preprocessor) !void { - try pp.addBuiltinMacro("__has_attribute", true, &builtin_macros.has_attribute); - try pp.addBuiltinMacro("__has_c_attribute", true, &builtin_macros.has_c_attribute); - try pp.addBuiltinMacro("__has_declspec_attribute", true, &builtin_macros.has_declspec_attribute); - try pp.addBuiltinMacro("__has_warning", true, &builtin_macros.has_warning); - try pp.addBuiltinMacro("__has_feature", true, &builtin_macros.has_feature); - try pp.addBuiltinMacro("__has_extension", true, &builtin_macros.has_extension); - try pp.addBuiltinMacro("__has_builtin", true, &builtin_macros.has_builtin); - try pp.addBuiltinMacro("__has_include", true, &builtin_macros.has_include); - try pp.addBuiltinMacro("__has_include_next", true, &builtin_macros.has_include_next); - try pp.addBuiltinMacro("__has_embed", true, &builtin_macros.has_embed); - try pp.addBuiltinMacro("__is_identifier", true, &builtin_macros.is_identifier); - try pp.addBuiltinMacro("_Pragma", true, &builtin_macros.pragma_operator); - - try pp.addBuiltinMacro("__FILE__", false, &builtin_macros.file); - try pp.addBuiltinMacro("__LINE__", false, &builtin_macros.line); - try pp.addBuiltinMacro("__COUNTER__", false, &builtin_macros.counter); +fn handleLineMacro(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + const start = 
pp.comp.generated_buf.items.len; + const source = pp.comp.getSource(tok.loc.id); + const w = pp.comp.generated_buf.writer(pp.gpa); + try w.print("{d}\n", .{source.physicalLine(tok.loc)}); + const pasted_tok = try pp.makeGeneratedToken(start, .pp_num, tok); + return pp.ungetToken(pasted_tok); } -pub fn deinit(pp: *Preprocessor) void { - pp.defines.deinit(pp.gpa); - pp.tokens.deinit(pp.gpa); - pp.arena.deinit(); - pp.token_buf.deinit(); - pp.char_buf.deinit(); - pp.poisoned_identifiers.deinit(); - pp.include_guards.deinit(pp.gpa); - pp.top_expansion_buf.deinit(); - pp.hideset.deinit(); - for (pp.expansion_entries.items(.locs)) |locs| TokenWithExpansionLocs.free(locs, pp.gpa); - pp.expansion_entries.deinit(pp.gpa); +fn handleFileMacro(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + const start = pp.comp.generated_buf.items.len; + const source = pp.comp.getSource(tok.loc.id); + const w = pp.comp.generated_buf.writer(pp.gpa); + try w.print("\"{s}\"\n", .{source.path}); + const pasted_tok = try pp.makeGeneratedToken(start, .string_literal, tok); + return pp.ungetToken(pasted_tok); } -/// Free buffers that are not needed after preprocessing -fn clearBuffers(pp: *Preprocessor) void { - pp.token_buf.clearAndFree(); - pp.char_buf.clearAndFree(); - pp.top_expansion_buf.clearAndFree(); - pp.hideset.clearAndFree(); +fn handleCounterMacro(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + defer pp.counter += 1; + const start = pp.comp.generated_buf.items.len; + const w = pp.comp.generated_buf.writer(pp.gpa); + try w.print("{d}\n", .{pp.counter}); + const pasted_tok = try pp.makeGeneratedToken(start, .pp_num, tok); + return pp.ungetToken(pasted_tok); } -pub fn expansionSlice(pp: *Preprocessor, tok: Tree.TokenIndex) []Source.Location { - const S = struct { - fn order_token_index(context: void, lhs: Tree.TokenIndex, rhs: Tree.TokenIndex) std.math.Order { - _ = context; - return std.math.order(lhs, rhs); - } - }; +fn makeGeneratedToken(pp: *Preprocessor, start: usize, id: Token.Id, source: PreprocessorToken) !PreprocessorToken { + var pasted_token = PreprocessorToken{ .id = id, .flags = source.flags, .loc = .{ + .id = .generated, + .byte_offset = @intCast(start), + .line = pp.generated_line, + } }; + pp.generated_line += 1; + try pasted_token.addExpansionLocation(pp.gpa, source.loc); + // try pasted_token.addExpansionLocation(pp.gpa, source.expansionSlice()); TODO + return pasted_token; +} - const indices = pp.expansion_entries.items(.idx); - const idx = std.sort.binarySearch(Tree.TokenIndex, tok, indices, {}, S.order_token_index) orelse return &.{}; - const locs = pp.expansion_entries.items(.locs)[idx]; - var i: usize = 0; - while (locs[i].id != .unused) : (i += 1) {} - return locs[0..i]; +fn errStr(pp: *Preprocessor, tok: PreprocessorToken, tag: Diagnostics.Tag, str: []const u8) !void { + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = tok.loc, + .extra = .{ .str = str }, + }, &.{}); // todo expansion slice } -/// Preprocess a compilation unit of sources into a parsable list of tokens. 
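
The builtin macros (`__FILE__`, `__LINE__`, `__COUNTER__`, the `__has_*` family) are now `special` macros whose handlers run at expansion time: each one prints its expansion text into `comp.generated_buf`, wraps the new bytes in a token via `makeGeneratedToken`, and pushes that token back onto the input with `ungetToken`. A simplified standalone model of that handler pattern, using a toy `GeneratedToken` type rather than the PR's real one:

```zig
const std = @import("std");

// Toy model of a "generated" token: it carries no text of its own, only a
// byte offset into a shared generated buffer (like comp.generated_buf).
const GeneratedToken = struct {
    byte_offset: u32,
    len: u32,

    fn text(self: GeneratedToken, buf: []const u8) []const u8 {
        return buf[self.byte_offset..][0..self.len];
    }
};

test "builtin macro handler pattern: print into a buffer, hand back a token" {
    const gpa = std.testing.allocator;
    var generated_buf: std.ArrayListUnmanaged(u8) = .{};
    defer generated_buf.deinit(gpa);

    // Rough equivalent of handleCounterMacro: format the value, remember
    // where it starts, and build a token pointing at the new text.
    var counter: u32 = 7;
    const start: u32 = @intCast(generated_buf.items.len);
    try generated_buf.writer(gpa).print("{d}\n", .{counter});
    counter += 1;

    const tok = GeneratedToken{
        .byte_offset = start,
        .len = @intCast(generated_buf.items.len - start),
    };
    try std.testing.expectEqualStrings("7\n", tok.text(generated_buf.items));
}
```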
-pub fn preprocessSources(pp: *Preprocessor, sources: []const Source) Error!void { - assert(sources.len > 1); - const first = sources[0]; - try pp.addIncludeStart(first); - for (sources[1..]) |header| { - try pp.addIncludeStart(header); - _ = try pp.preprocess(header); +fn errTok(pp: *Preprocessor, tok: PreprocessorToken, tag: Diagnostics.Tag) !void { + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = tok.loc, + .extra = .{ .none = {} }, + }, &.{}); // todo expansion slice +} + +fn expectClosing(pp: *Preprocessor, opening: PreprocessorToken, id: Token.Id) !void { + // todo: fix expect + const item = try pp.expect(id, .closing_paren); + if (item.id != id) { + try pp.errTok(opening, .to_match_paren); } - try pp.addIncludeResume(first.id, 0, 1); - const eof = try pp.preprocess(first); - try pp.addToken(eof); - pp.clearBuffers(); } -/// Preprocess a source file, returns eof token. -pub fn preprocess(pp: *Preprocessor, source: Source) Error!TokenWithExpansionLocs { - const eof = pp.preprocessExtra(source) catch |er| switch (er) { - // This cannot occur in the main file and is handled in `include`. - error.StopPreprocessing => unreachable, - else => |e| return e, - }; - try eof.checkMsEof(source, pp.comp); - return eof; +fn tokFromBool(b: bool) PreprocessorToken { + return if (b) PreprocessorToken.one else PreprocessorToken.zero; } -/// Tokenize a file without any preprocessing, returns eof token. -pub fn tokenize(pp: *Preprocessor, source: Source) Error!Token { - assert(pp.linemarkers == .none); - assert(pp.preserve_whitespace == false); - var tokenizer = Tokenizer{ - .buf = source.buf, - .comp = pp.comp, - .source = source.id, - }; +fn handleHasAttribute(pp: *Preprocessor, ident_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(ident_tok); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); - // Estimate how many new tokens this source will contain. 
- const estimated_token_count = source.buf.len / 8; - try pp.ensureTotalTokenCapacity(pp.tokens.len + estimated_token_count); + const has_attr = Attribute.fromString(.gnu, null, pp.tokSlice(attr_name)) != null; + return pp.ungetToken(tokFromBool(has_attr)); +} + +fn handleHasCAttribute(pp: *Preprocessor, ident_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(ident_tok); + var r: TokenList = .{}; + defer r.deinit(pp.gpa); + var tok: PreprocessorToken = undefined; while (true) { - const tok = tokenizer.next(); - if (tok.id == .eof) return tokFromRaw(tok); - try pp.addToken(tokFromRaw(tok)); + tok = try pp.readToken(); + if (tok.id == .comment) continue; + if (tok.id.isDirectiveEnd() or tok.id == .r_paren) break; + try r.append(pp.gpa, tok); } + try pp.expectClosing(l_paren, .r_paren); } -pub fn addIncludeStart(pp: *Preprocessor, source: Source) !void { - if (pp.linemarkers == .none) return; - try pp.addToken(.{ .id = .include_start, .loc = .{ - .id = source.id, - .byte_offset = std.math.maxInt(u32), - .line = 1, - } }); -} +fn handleHasDeclSpecAttribute(pp: *Preprocessor, ident_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(ident_tok); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); -pub fn addIncludeResume(pp: *Preprocessor, source: Source.Id, offset: u32, line: u32) !void { - if (pp.linemarkers == .none) return; - try pp.addToken(.{ .id = .include_resume, .loc = .{ - .id = source, - .byte_offset = offset, - .line = line, - } }); + const ident_str = pp.tokSlice(attr_name); + const has_attr = if (pp.comp.langopts.declspec_attrs) Attribute.fromString(.declspec, null, ident_str) != null else false; + return pp.ungetToken(tokFromBool(has_attr)); } -fn invalidTokenDiagnostic(tok_id: Token.Id) Diagnostics.Tag { - return switch (tok_id) { - .unterminated_string_literal => .unterminated_string_literal_warning, - .empty_char_literal => .empty_char_literal_warning, - .unterminated_char_literal => .unterminated_char_literal_warning, - else => unreachable, - }; -} +fn handleHasFeature(pp: *Preprocessor, ident_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(ident_tok); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); -/// Return the name of the #ifndef guard macro that starts a source, if any. 
-fn findIncludeGuard(pp: *Preprocessor, source: Source) ?[]const u8 { - var tokenizer = Tokenizer{ - .buf = source.buf, - .langopts = pp.comp.langopts, - .source = source.id, - }; - var hash = tokenizer.nextNoWS(); - while (hash.id == .nl) hash = tokenizer.nextNoWS(); - if (hash.id != .hash) return null; - const ifndef = tokenizer.nextNoWS(); - if (ifndef.id != .keyword_ifndef) return null; - const guard = tokenizer.nextNoWS(); - if (guard.id != .identifier) return null; - return pp.tokSlice(guard); + const ident_str = pp.tokSlice(attr_name); + const has_feature = features.hasFeature(pp.comp, ident_str); + return pp.ungetToken(tokFromBool(has_feature)); } -fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!TokenWithExpansionLocs { - var guard_name = pp.findIncludeGuard(source); +fn handleHasExtension(pp: *Preprocessor, ident_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(ident_tok); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); - pp.preprocess_count += 1; - var tokenizer = Tokenizer{ - .buf = source.buf, - .langopts = pp.comp.langopts, - .source = source.id, - }; + const ident_str = pp.tokSlice(attr_name); + const has_extension = features.hasExtension(pp.comp, ident_str); + return pp.ungetToken(tokFromBool(has_extension)); +} - // Estimate how many new tokens this source will contain. - const estimated_token_count = source.buf.len / 8; - try pp.ensureTotalTokenCapacity(pp.tokens.len + estimated_token_count); +fn handleHasBuiltin(pp: *Preprocessor, ident_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(ident_tok); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); - var if_level: u8 = 0; - var if_kind = std.PackedIntArray(u2, 256).init([1]u2{0} ** 256); - const until_else = 0; - const until_endif = 1; - const until_endif_seen_else = 2; + const ident_str = pp.tokSlice(attr_name); + const has_builtin = pp.comp.hasBuiltin(ident_str); + return pp.ungetToken(tokFromBool(has_builtin)); +} + +fn handleHasWarning(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(macro_tok); + const start = pp.char_buf.items.len; + defer pp.char_buf.items.len = start; - var start_of_line = true; while (true) { - var tok = tokenizer.next(); + const tok = try pp.readExpandNewline(); switch (tok.id) { - .hash => if (!start_of_line) try pp.addToken(tokFromRaw(tok)) else { - const directive = tokenizer.nextNoWS(); - switch (directive.id) { - .keyword_error, .keyword_warning => { - // #error tokens.. - pp.top_expansion_buf.items.len = 0; - const char_top = pp.char_buf.items.len; - defer pp.char_buf.items.len = char_top; - - while (true) { - tok = tokenizer.next(); - if (tok.id == .nl or tok.id == .eof) break; - if (tok.id == .whitespace) tok.id = .macro_ws; - try pp.top_expansion_buf.append(tokFromRaw(tok)); - } - try pp.stringify(pp.top_expansion_buf.items); - const slice = pp.char_buf.items[char_top + 1 .. 
pp.char_buf.items.len - 2]; - const duped = try pp.comp.diagnostics.arena.allocator().dupe(u8, slice); - - try pp.comp.addDiagnostic(.{ - .tag = if (directive.id == .keyword_error) .error_directive else .warning_directive, - .loc = .{ .id = tok.source, .byte_offset = directive.start, .line = directive.line }, - .extra = .{ .str = duped }, - }, &.{}); - }, - .keyword_if => { - const sum, const overflowed = @addWithOverflow(if_level, 1); - if (overflowed != 0) - return pp.fatal(directive, "too many #if nestings", .{}); - if_level = sum; - - if (try pp.expr(&tokenizer)) { - if_kind.set(if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #if", .{}); - } - } else { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #if", .{}); - } - } - }, - .keyword_ifdef => { - const sum, const overflowed = @addWithOverflow(if_level, 1); - if (overflowed != 0) - return pp.fatal(directive, "too many #if nestings", .{}); - if_level = sum; - - const macro_name = (try pp.expectMacroName(&tokenizer)) orelse continue; - try pp.expectNl(&tokenizer); - if (pp.defines.get(macro_name) != null) { - if_kind.set(if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #ifdef", .{}); - } - } else { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #ifdef", .{}); - } - } - }, - .keyword_ifndef => { - const sum, const overflowed = @addWithOverflow(if_level, 1); - if (overflowed != 0) - return pp.fatal(directive, "too many #if nestings", .{}); - if_level = sum; - - const macro_name = (try pp.expectMacroName(&tokenizer)) orelse continue; - try pp.expectNl(&tokenizer); - if (pp.defines.get(macro_name) == null) { - if_kind.set(if_level, until_endif); - } else { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - } - }, - .keyword_elif => { - if (if_level == 0) { - try pp.err(directive, .elif_without_if); - if_level += 1; - if_kind.set(if_level, until_else); - } else if (if_level == 1) { - guard_name = null; - } - switch (if_kind.get(if_level)) { - until_else => if (try pp.expr(&tokenizer)) { - if_kind.set(if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #elif", .{}); - } - } else { - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elif", .{}); - } - }, - until_endif => try pp.skip(&tokenizer, .until_endif), - until_endif_seen_else => { - try pp.err(directive, .elif_after_else); - skipToNl(&tokenizer); - }, - else => unreachable, - } - }, - .keyword_elifdef => { - if (if_level == 0) { - try pp.err(directive, .elifdef_without_if); - if_level += 1; - if_kind.set(if_level, until_else); - } else if (if_level == 1) { - guard_name = null; - } - switch (if_kind.get(if_level)) { - until_else => { - const macro_name = try pp.expectMacroName(&tokenizer); - if (macro_name == null) { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elifdef", .{}); - } - } else { - try pp.expectNl(&tokenizer); - if (pp.defines.get(macro_name.?) 
!= null) { - if_kind.set(if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #elifdef", .{}); - } - } else { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elifdef", .{}); - } - } - } - }, - until_endif => try pp.skip(&tokenizer, .until_endif), - until_endif_seen_else => { - try pp.err(directive, .elifdef_after_else); - skipToNl(&tokenizer); - }, - else => unreachable, - } - }, - .keyword_elifndef => { - if (if_level == 0) { - try pp.err(directive, .elifdef_without_if); - if_level += 1; - if_kind.set(if_level, until_else); - } else if (if_level == 1) { - guard_name = null; - } - switch (if_kind.get(if_level)) { - until_else => { - const macro_name = try pp.expectMacroName(&tokenizer); - if (macro_name == null) { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elifndef", .{}); - } - } else { - try pp.expectNl(&tokenizer); - if (pp.defines.get(macro_name.?) == null) { - if_kind.set(if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #elifndef", .{}); - } - } else { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elifndef", .{}); - } - } - } - }, - until_endif => try pp.skip(&tokenizer, .until_endif), - until_endif_seen_else => { - try pp.err(directive, .elifdef_after_else); - skipToNl(&tokenizer); - }, - else => unreachable, - } - }, - .keyword_else => { - try pp.expectNl(&tokenizer); - if (if_level == 0) { - try pp.err(directive, .else_without_if); - continue; - } else if (if_level == 1) { - guard_name = null; - } - switch (if_kind.get(if_level)) { - until_else => { - if_kind.set(if_level, until_endif_seen_else); - if (pp.verbose) { - pp.verboseLog(directive, "#else branch here", .{}); - } - }, - until_endif => try pp.skip(&tokenizer, .until_endif_seen_else), - until_endif_seen_else => { - try pp.err(directive, .else_after_else); - skipToNl(&tokenizer); - }, - else => unreachable, - } - }, - .keyword_endif => { - try pp.expectNl(&tokenizer); - if (if_level == 0) { - guard_name = null; - try pp.err(directive, .endif_without_if); - continue; - } else if (if_level == 1) { - const saved_tokenizer = tokenizer; - defer tokenizer = saved_tokenizer; - - var next = tokenizer.nextNoWS(); - while (next.id == .nl) : (next = tokenizer.nextNoWS()) {} - if (next.id != .eof) guard_name = null; - } - if_level -= 1; - }, - .keyword_define => try pp.define(&tokenizer), - .keyword_undef => { - const macro_name = (try pp.expectMacroName(&tokenizer)) orelse continue; - - _ = pp.defines.remove(macro_name); - try pp.expectNl(&tokenizer); - }, - .keyword_include => { - try pp.include(&tokenizer, .first); - continue; - }, - .keyword_include_next => { - try pp.comp.addDiagnostic(.{ - .tag = .include_next, - .loc = .{ .id = tok.source, .byte_offset = directive.start, .line = directive.line }, - }, &.{}); - if (pp.include_depth == 0) { - try pp.comp.addDiagnostic(.{ - .tag = .include_next_outside_header, - .loc = .{ .id = tok.source, .byte_offset = directive.start, .line = directive.line }, - }, &.{}); - try pp.include(&tokenizer, .first); - } else { - try pp.include(&tokenizer, .next); - } - }, - .keyword_embed => try pp.embed(&tokenizer), - .keyword_pragma => { - try pp.pragma(&tokenizer, directive, null, &.{}); - continue; - }, 
- .keyword_line => { - // #line number "file" - const digits = tokenizer.nextNoWS(); - if (digits.id != .pp_num) try pp.err(digits, .line_simple_digit); - // TODO: validate that the pp_num token is solely digits - - if (digits.id == .eof or digits.id == .nl) continue; - const name = tokenizer.nextNoWS(); - if (name.id == .eof or name.id == .nl) continue; - if (name.id != .string_literal) try pp.err(name, .line_invalid_filename); - try pp.expectNl(&tokenizer); - }, - .pp_num => { - // # number "file" flags - // TODO: validate that the pp_num token is solely digits - // if not, emit `GNU line marker directive requires a simple digit sequence` - const name = tokenizer.nextNoWS(); - if (name.id == .eof or name.id == .nl) continue; - if (name.id != .string_literal) try pp.err(name, .line_invalid_filename); - - const flag_1 = tokenizer.nextNoWS(); - if (flag_1.id == .eof or flag_1.id == .nl) continue; - const flag_2 = tokenizer.nextNoWS(); - if (flag_2.id == .eof or flag_2.id == .nl) continue; - const flag_3 = tokenizer.nextNoWS(); - if (flag_3.id == .eof or flag_3.id == .nl) continue; - const flag_4 = tokenizer.nextNoWS(); - if (flag_4.id == .eof or flag_4.id == .nl) continue; - try pp.expectNl(&tokenizer); - }, - .nl => {}, - .eof => { - if (if_level != 0) try pp.err(tok, .unterminated_conditional_directive); - return tokFromRaw(directive); - }, - else => { - try pp.err(tok, .invalid_preprocessing_directive); - skipToNl(&tokenizer); - }, - } - if (pp.preserve_whitespace) { - tok.id = .nl; - try pp.addToken(tokFromRaw(tok)); - } + .nl, .eof => { + try pp.errTok(tok, .unterminated_macro_arg_list); + return pp.ungetToken(PreprocessorToken.zero); }, - .whitespace => if (pp.preserve_whitespace) try pp.addToken(tokFromRaw(tok)), - .nl => { - start_of_line = true; - if (pp.preserve_whitespace) try pp.addToken(tokFromRaw(tok)); - }, - .eof => { - if (if_level != 0) try pp.err(tok, .unterminated_conditional_directive); - // The following check needs to occur here and not at the top of the function - // because a pragma may change the level during preprocessing - if (source.buf.len > 0 and source.buf[source.buf.len - 1] != '\n') { - try pp.err(tok, .newline_eof); - } - if (guard_name) |name| { - if (try pp.include_guards.fetchPut(pp.gpa, source.id, name)) |prev| { - assert(mem.eql(u8, name, prev.value)); - } - } - return tokFromRaw(tok); - }, - .unterminated_string_literal, .unterminated_char_literal, .empty_char_literal => |tag| { - start_of_line = false; - try pp.err(tok, invalidTokenDiagnostic(tag)); - try pp.expandMacro(&tokenizer, tok); + .r_paren => break, + .string_literal => { + const string = pp.tokSlice(tok); + try pp.char_buf.appendSlice(pp.gpa, string[1 .. string.len - 1]); }, - .unterminated_comment => try pp.err(tok, .unterminated_comment), else => { - if (tok.id.isMacroIdentifier() and pp.poisoned_identifiers.get(pp.tokSlice(tok)) != null) { - try pp.err(tok, .poisoned_identifier); - } - // Add the token to the buffer doing any necessary expansions. 
- start_of_line = false; - try pp.expandMacro(&tokenizer, tok); + pp.skipToNl(); + try pp.errTok(tok, .missing_paren_param_list); + try pp.errTok(l_paren, .to_match_paren); + return pp.ungetToken(PreprocessorToken.zero); }, } } + const actual_param = pp.char_buf.items[start..]; + if (actual_param.len == 0) { + try pp.comp.addDiagnostic(.{ + .tag = .expected_arguments, + .loc = macro_tok.loc, + .extra = .{ .arguments = .{ .expected = 1, .actual = 0 } }, + }, &.{}); // todo expansion slice + return pp.ungetToken(PreprocessorToken.zero); + } + if (!mem.startsWith(u8, actual_param, "-W")) { + try pp.errStr(l_paren, .malformed_warning_check, "__has_warning"); + return pp.ungetToken(PreprocessorToken.zero); + } + const warning_name = actual_param[2..]; + const exists = Diagnostics.warningExists(warning_name); + return pp.ungetToken(tokFromBool(exists)); } -/// Get raw token source string. -/// Returned slice is invalidated when comp.generated_buf is updated. -pub fn tokSlice(pp: *Preprocessor, token: anytype) []const u8 { - if (token.id.lexeme()) |some| return some; - const source = pp.comp.getSource(token.source); - return source.buf[token.start..token.end]; +fn handleHasInclude(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + return pp.handleHasIncludeExtra(macro_tok, .first); } -/// Convert a token from the Tokenizer into a token used by the parser. -fn tokFromRaw(raw: RawToken) TokenWithExpansionLocs { - return .{ - .id = raw.id, - .loc = .{ - .id = raw.source, - .byte_offset = raw.start, - .line = raw.line, - }, - }; +fn handleHasIncludeNext(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + return pp.handleHasIncludeExtra(macro_tok, .next); } -fn err(pp: *Preprocessor, raw: RawToken, tag: Diagnostics.Tag) !void { - try pp.comp.addDiagnostic(.{ - .tag = tag, - .loc = .{ - .id = raw.source, - .byte_offset = raw.start, - .line = raw.line, - }, - }, &.{}); +fn handleHasIncludeExtra(pp: *Preprocessor, macro_tok: PreprocessorToken, which: Compilation.WhichInclude) Error!void { + const l_paren = pp.getToken(); + if (l_paren.id != .l_paren) { + pp.skipToNl(); + return; + } + + var is_std: bool = undefined; + const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { + error.InvalidInclude => return pp.ungetToken(PreprocessorToken.zero), + else => |e| return e, + }; + try pp.expectClosing(l_paren, .r_paren); + + const filename = include_str[1 .. 
include_str.len - 1]; + const include_type: Compilation.IncludeType = switch (include_str[0]) { + '"' => .quotes, + '<' => .angle_brackets, + else => unreachable, + }; + + if (which == .first or pp.includeDepth() == 0) { + if (which == .next) { + try pp.comp.addDiagnostic(.{ + .tag = .include_next_outside_header, + .loc = macro_tok.loc, + }, &.{}); + } + const has = try pp.comp.hasInclude(filename, macro_tok.loc.id, include_type, .first); + return pp.ungetToken(tokFromBool(has)); + } + const has = try pp.comp.hasInclude(filename, macro_tok.loc.id, include_type, .next); + return pp.ungetToken(tokFromBool(has)); } -fn errStr(pp: *Preprocessor, tok: TokenWithExpansionLocs, tag: Diagnostics.Tag, str: []const u8) !void { - try pp.comp.addDiagnostic(.{ - .tag = tag, - .loc = tok.loc, - .extra = .{ .str = str }, - }, tok.expansionSlice()); +fn includeDepth(pp: *Preprocessor) usize { + return pp.tokenizers.items.len - 1; } -fn fatal(pp: *Preprocessor, raw: RawToken, comptime fmt: []const u8, args: anytype) Compilation.Error { - try pp.comp.diagnostics.list.append(pp.gpa, .{ - .tag = .cli_error, - .kind = .@"fatal error", - .extra = .{ .str = try std.fmt.allocPrint(pp.comp.diagnostics.arena.allocator(), fmt, args) }, - .loc = .{ - .id = raw.source, - .byte_offset = raw.start, - .line = raw.line, - }, - }); - return error.FatalError; +fn hasEmbedValue(contents_arg: ?[]const u8) []const u8 { + const contents = contents_arg orelse return "0\n"; + if (contents.len == 0) return "2\n"; + return "1\n"; } -fn fatalNotFound(pp: *Preprocessor, tok: TokenWithExpansionLocs, filename: []const u8) Compilation.Error { - const old = pp.comp.diagnostics.fatal_errors; - pp.comp.diagnostics.fatal_errors = true; - defer pp.comp.diagnostics.fatal_errors = old; - - try pp.comp.diagnostics.addExtra(pp.comp.langopts, .{ .tag = .cli_error, .loc = tok.loc, .extra = .{ - .str = try std.fmt.allocPrint(pp.comp.diagnostics.arena.allocator(), "'{s}' not found", .{filename}), - } }, tok.expansionSlice(), false); - unreachable; // addExtra should've returned FatalError -} - -fn verboseLog(pp: *Preprocessor, raw: RawToken, comptime fmt: []const u8, args: anytype) void { - const source = pp.comp.getSource(raw.source); - const line_col = source.lineCol(.{ .id = raw.source, .line = raw.line, .byte_offset = raw.start }); - - const stderr = std.io.getStdErr().writer(); - var buf_writer = std.io.bufferedWriter(stderr); - const writer = buf_writer.writer(); - defer buf_writer.flush() catch {}; - writer.print("{s}:{d}:{d}: ", .{ source.path, line_col.line_no, line_col.col }) catch return; - writer.print(fmt, args) catch return; - writer.writeByte('\n') catch return; - writer.writeAll(line_col.line) catch return; - writer.writeByte('\n') catch return; -} - -/// Consume next token, error if it is not an identifier. -fn expectMacroName(pp: *Preprocessor, tokenizer: *Tokenizer) Error!?[]const u8 { - const macro_name = tokenizer.nextNoWS(); - if (!macro_name.id.isMacroIdentifier()) { - try pp.err(macro_name, .macro_name_missing); - skipToNl(tokenizer); - return null; +/// TODO: handle limit/prefix/suffix/etc +fn handleHasEmbed(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + const l_paren = pp.getToken(); + if (l_paren.id != .l_paren) { + pp.skipToNl(); + return; } - return pp.tokSlice(macro_name); -} -/// Skip until after a newline, error if extra tokens before it. 
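
`hasEmbedValue` above maps the result of the resource lookup to the values C23 defines for `__has_embed`: `0` (`__STDC_EMBED_NOT_FOUND__`) when the resource cannot be found, `1` (`__STDC_EMBED_FOUND__`) when it is found and non-empty, and `2` (`__STDC_EMBED_EMPTY__`) when it is found but empty. A small test of a local copy of that mapping:

```zig
const std = @import("std");

// Local copy of the PR's hasEmbedValue mapping, mirroring C23 __has_embed:
// 0 = resource not found, 1 = found and non-empty, 2 = found but empty.
fn hasEmbedValue(contents_arg: ?[]const u8) []const u8 {
    const contents = contents_arg orelse return "0\n";
    if (contents.len == 0) return "2\n";
    return "1\n";
}

test "__has_embed result values" {
    try std.testing.expectEqualStrings("0\n", hasEmbedValue(null));
    try std.testing.expectEqualStrings("2\n", hasEmbedValue(""));
    try std.testing.expectEqualStrings("1\n", hasEmbedValue("\x89PNG"));
}
```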
-fn expectNl(pp: *Preprocessor, tokenizer: *Tokenizer) Error!void { - var sent_err = false; - while (true) { - const tok = tokenizer.next(); - if (tok.id == .nl or tok.id == .eof) return; - if (tok.id == .whitespace or tok.id == .comment) continue; - if (!sent_err) { - sent_err = true; - try pp.err(tok, .extra_tokens_directive_end); - } - } -} + var is_std: bool = undefined; + const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { + error.InvalidInclude => return, + else => |e| return e, + }; + try pp.expectClosing(l_paren, .r_paren); -fn getTokenState(pp: *const Preprocessor) TokenState { - return .{ - .tokens_len = pp.tokens.len, - .expansion_entries_len = pp.expansion_entries.len, + const filename = include_str[1 .. include_str.len - 1]; + const include_type: Compilation.IncludeType = switch (include_str[0]) { + '"' => .quotes, + '<' => .angle_brackets, + else => unreachable, }; -} -fn restoreTokenState(pp: *Preprocessor, state: TokenState) void { - pp.tokens.len = state.tokens_len; - pp.expansion_entries.len = state.expansion_entries_len; + const contents = try pp.comp.findEmbed(filename, macro_tok.loc.id, include_type, 1); + const result = hasEmbedValue(contents); + const start = pp.comp.generated_buf.items.len; + try pp.comp.generated_buf.appendSlice(pp.comp.gpa, result); + const pasted_tok = try pp.makeGeneratedToken(start, .pp_num, macro_tok); + return pp.ungetToken(pasted_tok); } -/// Consume all tokens until a newline and parse the result into a boolean. -fn expr(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!bool { - const token_state = pp.getTokenState(); - defer { - for (pp.top_expansion_buf.items) |tok| TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - pp.restoreTokenState(token_state); +// Skip until newline, ignore other tokens. 
+fn skipToNl(pp: *Preprocessor) void { + while (true) { + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) return; } +} - pp.top_expansion_buf.items.len = 0; - const eof = while (true) { - const tok = tokenizer.next(); - switch (tok.id) { - .nl, .eof => break tok, - .whitespace => if (pp.top_expansion_buf.items.len == 0) continue, - else => {}, - } - try pp.top_expansion_buf.append(tokFromRaw(tok)); - } else unreachable; - if (pp.top_expansion_buf.items.len != 0) { - pp.expansion_source_loc = pp.top_expansion_buf.items[0].loc; - pp.hideset.clearRetainingCapacity(); - try pp.expandMacroExhaustive(tokenizer, &pp.top_expansion_buf, 0, pp.top_expansion_buf.items.len, false, .expr); - } - for (pp.top_expansion_buf.items) |tok| { - if (tok.id == .macro_ws) continue; - if (!tok.id.validPreprocessorExprStart()) { - try pp.comp.addDiagnostic(.{ - .tag = .invalid_preproc_expr_start, - .loc = tok.loc, - }, tok.expansionSlice()); - return false; - } - break; - } else { - try pp.err(eof, .expected_value_in_expr); - return false; - } +fn readOneIdentifierArgument(pp: *Preprocessor, macro_tok: PreprocessorToken) !?PreprocessorToken { + const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); + _ = l_paren; + var invalid: ?PreprocessorToken = null; + var identifier: ?PreprocessorToken = null; + while (true) { + var tok = pp.getToken(); + tok.id.simplifyMacroKeywordExtra(true); - // validate the tokens in the expression - try pp.ensureUnusedTokenCapacity(pp.top_expansion_buf.items.len); - var i: usize = 0; - const items = pp.top_expansion_buf.items; - while (i < items.len) : (i += 1) { - var tok = items[i]; switch (tok.id) { - .string_literal, - .string_literal_utf_16, - .string_literal_utf_8, - .string_literal_utf_32, - .string_literal_wide, - => { - try pp.comp.addDiagnostic(.{ - .tag = .string_literal_in_pp_expr, - .loc = tok.loc, - }, tok.expansionSlice()); - return false; - }, - .plus_plus, - .minus_minus, - .plus_equal, - .minus_equal, - .asterisk_equal, - .slash_equal, - .percent_equal, - .angle_bracket_angle_bracket_left_equal, - .angle_bracket_angle_bracket_right_equal, - .ampersand_equal, - .caret_equal, - .pipe_equal, - .l_bracket, - .r_bracket, - .l_brace, - .r_brace, - .ellipsis, - .semicolon, - .hash, - .hash_hash, - .equal, - .arrow, - .period, - => { - try pp.comp.addDiagnostic(.{ - .tag = .invalid_preproc_operator, - .loc = tok.loc, - }, tok.expansionSlice()); - return false; - }, - .macro_ws, .whitespace => continue, - .keyword_false => tok.id = .zero, - .keyword_true => tok.id = .one, - else => if (tok.id.isMacroIdentifier()) { - if (tok.id == .keyword_defined) { - const tokens_consumed = try pp.handleKeywordDefined(&tok, items[i + 1 ..], eof); - i += tokens_consumed; - } else { - try pp.errStr(tok, .undefined_macro, pp.expandedSlice(tok)); - - if (i + 1 < pp.top_expansion_buf.items.len and - pp.top_expansion_buf.items[i + 1].id == .l_paren) - { - try pp.errStr(tok, .fn_macro_undefined, pp.expandedSlice(tok)); - return false; - } - - tok.id = .zero; // undefined macro - } + .r_paren, .eof => break, + else => { + if (identifier) |_| invalid = tok else identifier = tok; }, } - pp.addTokenAssumeCapacity(tok); } - try pp.addToken(.{ - .id = .eof, - .loc = tokFromRaw(eof).loc, - }); - - // Actually parse it. 
- var parser = Parser{ - .pp = pp, - .comp = pp.comp, - .gpa = pp.gpa, - .tok_ids = pp.tokens.items(.id), - .tok_i = @intCast(token_state.tokens_len), - .arena = pp.arena.allocator(), - .in_macro = true, - .strings = std.ArrayListAligned(u8, 4).init(pp.comp.gpa), - - .data = undefined, - .value_map = undefined, - .labels = undefined, - .decl_buf = undefined, - .list_buf = undefined, - .param_buf = undefined, - .enum_buf = undefined, - .record_buf = undefined, - .attr_buf = undefined, - .field_attr_buf = undefined, - .string_ids = undefined, - }; - defer parser.strings.deinit(); - return parser.macroExpr(); -} - -/// Turns macro_tok from .keyword_defined into .zero or .one depending on whether the argument is defined -/// Returns the number of tokens consumed -fn handleKeywordDefined(pp: *Preprocessor, macro_tok: *TokenWithExpansionLocs, tokens: []const TokenWithExpansionLocs, eof: RawToken) !usize { - std.debug.assert(macro_tok.id == .keyword_defined); - var it = TokenIterator.init(tokens); - const first = it.nextNoWS() orelse { - try pp.err(eof, .macro_name_missing); - return it.i; - }; - switch (first.id) { - .l_paren => {}, - else => { - if (!first.id.isMacroIdentifier()) { - try pp.errStr(first, .macro_name_must_be_identifier, pp.expandedSlice(first)); - } - macro_tok.id = if (pp.defines.contains(pp.expandedSlice(first))) .one else .zero; - return it.i; - }, - } - const second = it.nextNoWS() orelse { - try pp.err(eof, .macro_name_missing); - return it.i; - }; - if (!second.id.isMacroIdentifier()) { + if (invalid) |some| { try pp.comp.addDiagnostic(.{ - .tag = .macro_name_must_be_identifier, - .loc = second.loc, - }, second.expansionSlice()); - return it.i; + .tag = .missing_tok_builtin, + .loc = some.loc, + .extra = .{ .tok_id_expected = .r_paren }, + }, &.{}); // TODO: expansion slice + return null; } - macro_tok.id = if (pp.defines.contains(pp.expandedSlice(second))) .one else .zero; - - const last = it.nextNoWS(); - if (last == null or last.?.id != .r_paren) { - const tok = last orelse tokFromRaw(eof); - try pp.comp.addDiagnostic(.{ - .tag = .closing_paren, - .loc = tok.loc, - }, tok.expansionSlice()); - try pp.comp.addDiagnostic(.{ - .tag = .to_match_paren, - .loc = first.loc, - }, first.expansionSlice()); + if (identifier) |ident| { + if (ident.id == .identifier or ident.id == .extended_identifier) return ident; + } else { + const extra: Diagnostics.Message.Extra = .{ .arguments = .{ .expected = 1, .actual = 0 } }; + try pp.comp.addDiagnostic(.{ .tag = .expected_arguments, .loc = macro_tok.loc, .extra = extra }, &.{}); } - - return it.i; + return null; } -/// Skip until #else #elif #endif, return last directive token id. -/// Also skips nested #if ... #endifs. 
-fn skip( - pp: *Preprocessor, - tokenizer: *Tokenizer, - cont: enum { until_else, until_endif, until_endif_seen_else }, -) Error!void { - var ifs_seen: u32 = 0; - var line_start = true; - while (tokenizer.index < tokenizer.buf.len) { - if (line_start) { - const saved_tokenizer = tokenizer.*; - const hash = tokenizer.nextNoWS(); - if (hash.id == .nl) continue; - line_start = false; - if (hash.id != .hash) continue; - const directive = tokenizer.nextNoWS(); - switch (directive.id) { - .keyword_else => { - if (ifs_seen != 0) continue; - if (cont == .until_endif_seen_else) { - try pp.err(directive, .else_after_else); - continue; - } - tokenizer.* = saved_tokenizer; - return; - }, - .keyword_elif => { - if (ifs_seen != 0 or cont == .until_endif) continue; - if (cont == .until_endif_seen_else) { - try pp.err(directive, .elif_after_else); - continue; - } - tokenizer.* = saved_tokenizer; - return; - }, - .keyword_elifdef => { - if (ifs_seen != 0 or cont == .until_endif) continue; - if (cont == .until_endif_seen_else) { - try pp.err(directive, .elifdef_after_else); - continue; - } - tokenizer.* = saved_tokenizer; - return; - }, - .keyword_elifndef => { - if (ifs_seen != 0 or cont == .until_endif) continue; - if (cont == .until_endif_seen_else) { - try pp.err(directive, .elifndef_after_else); - continue; - } - tokenizer.* = saved_tokenizer; - return; - }, - .keyword_endif => { - if (ifs_seen == 0) { - tokenizer.* = saved_tokenizer; - return; - } - ifs_seen -= 1; - }, - .keyword_if, .keyword_ifdef, .keyword_ifndef => ifs_seen += 1, - else => {}, - } - } else if (tokenizer.buf[tokenizer.index] == '\n') { - line_start = true; - tokenizer.index += 1; - tokenizer.line += 1; - if (pp.preserve_whitespace) { - try pp.addToken(.{ .id = .nl, .loc = .{ - .id = tokenizer.source, - .line = tokenizer.line, - } }); - } - } else { - line_start = false; - tokenizer.index += 1; - } +fn handleIsIdentifier(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + if (try pp.readOneIdentifierArgument(macro_tok)) |_| { + return pp.ungetToken(PreprocessorToken.one); } else { - const eof = tokenizer.next(); - return pp.err(eof, .unterminated_conditional_directive); + return pp.ungetToken(PreprocessorToken.zero); } } -// Skip until newline, ignore other tokens. 
-fn skipToNl(tokenizer: *Tokenizer) void { - while (true) { - const tok = tokenizer.next(); - if (tok.id == .nl or tok.id == .eof) return; - } +fn handlePragmaOperator(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + _ = pp; + _ = macro_tok; + // TODO } -const ExpandBuf = std.ArrayList(TokenWithExpansionLocs); -fn removePlacemarkers(buf: *ExpandBuf) void { - var i: usize = buf.items.len -% 1; - while (i < buf.items.len) : (i -%= 1) { - if (buf.items[i].id == .placemarker) { - const placemarker = buf.orderedRemove(i); - TokenWithExpansionLocs.free(placemarker.expansion_locs, buf.allocator); - } - } +pub fn addBuiltinMacros(pp: *Preprocessor) !void { + try pp.addBuiltinMacro("__has_attribute", handleHasAttribute); + try pp.addBuiltinMacro("__has_c_attribute", handleHasCAttribute); + try pp.addBuiltinMacro("__has_declspec_attribute", handleHasDeclSpecAttribute); + try pp.addBuiltinMacro("__has_feature", handleHasFeature); + try pp.addBuiltinMacro("__has_extension", handleHasExtension); + try pp.addBuiltinMacro("__has_builtin", handleHasBuiltin); + try pp.addBuiltinMacro("__has_warning", handleHasWarning); + try pp.addBuiltinMacro("__has_include", handleHasInclude); + try pp.addBuiltinMacro("__has_include_next", handleHasIncludeNext); + try pp.addBuiltinMacro("__has_embed", handleHasEmbed); + + try pp.addBuiltinMacro("__is_identifier", handleIsIdentifier); + + try pp.addBuiltinMacro("__FILE__", handleFileMacro); + try pp.addBuiltinMacro("__LINE__", handleLineMacro); + try pp.addBuiltinMacro("__COUNTER__", handleCounterMacro); + try pp.addBuiltinMacro("_Pragma", handlePragmaOperator); } -const MacroArguments = std.ArrayList([]const TokenWithExpansionLocs); -fn deinitMacroArguments(allocator: Allocator, args: *const MacroArguments) void { - for (args.items) |item| { - for (item) |tok| TokenWithExpansionLocs.free(tok.expansion_locs, allocator); - allocator.free(item); - } - args.deinit(); +/// Initialize Preprocessor with builtin macros. +pub fn initDefault(comp: *Compilation) !Preprocessor { + var pp = init(comp); + errdefer pp.deinit(); + try pp.addBuiltinMacros(); + return pp; } -fn expandObjMacro(pp: *Preprocessor, simple_macro: *const Macro) Error!ExpandBuf { - var buf = ExpandBuf.init(pp.gpa); - errdefer buf.deinit(); - if (simple_macro.tokens.len == 0) { - try buf.append(.{ .id = .placemarker, .loc = .{ .id = .generated } }); - return buf; +pub fn deinit(pp: *Preprocessor) void { + pp.arena.deinit(); + pp.include_guards.deinit(pp.gpa); + pp.tokens.deinit(pp.gpa); + pp.tokenizers.deinit(pp.gpa); + for (pp.expansion_bufs.items) |*toklist| { + toklist.deinit(pp.gpa); } - try buf.ensureTotalCapacity(simple_macro.tokens.len); - - // Add all of the simple_macros tokens to the new buffer handling any concats. 
- var i: usize = 0; - while (i < simple_macro.tokens.len) : (i += 1) { - const raw = simple_macro.tokens[i]; - const tok = tokFromRaw(raw); - switch (raw.id) { - .hash_hash => { - var rhs = tokFromRaw(simple_macro.tokens[i + 1]); - i += 1; - while (true) { - if (rhs.id == .whitespace) { - rhs = tokFromRaw(simple_macro.tokens[i + 1]); - i += 1; - } else if (rhs.id == .comment and !pp.comp.langopts.preserve_comments_in_macros) { - rhs = tokFromRaw(simple_macro.tokens[i + 1]); - i += 1; - } else break; - } - try pp.pasteTokens(&buf, &.{rhs}); - }, - .whitespace => if (pp.preserve_whitespace) buf.appendAssumeCapacity(tok), - .macro_file => { - const start = pp.comp.generated_buf.items.len; - const source = pp.comp.getSource(pp.expansion_source_loc.id); - const w = pp.comp.generated_buf.writer(pp.gpa); - try w.print("\"{s}\"\n", .{source.path}); - - buf.appendAssumeCapacity(try pp.makeGeneratedToken(start, .string_literal, tok)); - }, - .macro_line => { - const start = pp.comp.generated_buf.items.len; - const source = pp.comp.getSource(pp.expansion_source_loc.id); - const w = pp.comp.generated_buf.writer(pp.gpa); - try w.print("{d}\n", .{source.physicalLine(pp.expansion_source_loc)}); + pp.expansion_bufs.deinit(pp.gpa); + pp.defines.deinit(pp.gpa); + pp.char_buf.deinit(pp.gpa); + for (pp.expansion_entries.items(.locs)) |locs| PreprocessorToken.free(locs, pp.gpa); + pp.expansion_entries.deinit(pp.gpa); + pp.guard_stack.deinit(pp.gpa); + pp.macro_arg_tokens.deinit(pp.gpa); + pp.macro_args.deinit(pp.gpa); + pp.safe_strings.deinit(pp.gpa); + pp.treap.deinit(); + pp.poisoned_identifiers.deinit(); +} - buf.appendAssumeCapacity(try pp.makeGeneratedToken(start, .pp_num, tok)); - }, - .macro_counter => { - defer pp.counter += 1; - const start = pp.comp.generated_buf.items.len; - const w = pp.comp.generated_buf.writer(pp.gpa); - try w.print("{d}\n", .{pp.counter}); +/// Preprocess a compilation unit of sources into a parsable list of tokens. +pub fn preprocessSources(pp: *Preprocessor, sources: []const Source) Error!void { + assert(sources.len > 1); + const first = sources[0]; - buf.appendAssumeCapacity(try pp.makeGeneratedToken(start, .pp_num, tok)); - }, - else => buf.appendAssumeCapacity(tok), - } + try pp.addIncludeStart(first); + for (sources[1..]) |header| { + try pp.addIncludeStart(header); + _ = try pp.preprocess(header); } - - return buf; + try pp.addIncludeResume(first.id, 0, 1); + const eof = try pp.preprocess(first); + try pp.addToken(eof); } -/// Join a possibly-parenthesized series of string literal tokens into a single string without -/// leading or trailing quotes. The returned slice is invalidated if pp.char_buf changes. -/// Returns error.ExpectedStringLiteral if parentheses are not balanced, a non-string-literal -/// is encountered, or if no string literals are encountered -/// TODO: destringize (replace all '\\' with a single `\` and all '\"' with a '"') -fn pasteStringsUnsafe(pp: *Preprocessor, toks: []const TokenWithExpansionLocs) ![]const u8 { - const char_top = pp.char_buf.items.len; - defer pp.char_buf.items.len = char_top; - var unwrapped = toks; - if (toks.len >= 2 and toks[0].id == .l_paren and toks[toks.len - 1].id == .r_paren) { - unwrapped = toks[1 .. toks.len - 1]; - } - if (unwrapped.len == 0) return error.ExpectedStringLiteral; - - for (unwrapped) |tok| { - if (tok.id == .macro_ws) continue; - if (tok.id != .string_literal) return error.ExpectedStringLiteral; - const str = pp.expandedSlice(tok); - try pp.char_buf.appendSlice(str[1 .. 
str.len - 1]); +fn propagateSpace(pp: *Preprocessor, tokens: []PreprocessorToken, template: PreprocessorToken) void { + if (tokens.len > 0) { + tokens[0].flags = template.flags; + } else { + pp.injectSpace(); } - return pp.char_buf.items[char_top..]; } -/// Handle the _Pragma operator (implemented as a builtin macro) -fn pragmaOperator(pp: *Preprocessor, arg_tok: TokenWithExpansionLocs, operator_loc: Source.Location) !void { - const arg_slice = pp.expandedSlice(arg_tok); - const content = arg_slice[1 .. arg_slice.len - 1]; - const directive = "#pragma "; - - pp.char_buf.clearRetainingCapacity(); - const total_len = directive.len + content.len + 1; // destringify can never grow the string, + 1 for newline - try pp.char_buf.ensureUnusedCapacity(total_len); - pp.char_buf.appendSliceAssumeCapacity(directive); - pp.destringify(content); - pp.char_buf.appendAssumeCapacity('\n'); +fn ungetAll(pp: *Preprocessor, tokens: []const PreprocessorToken) !void { + if (tokens.len == 0) return; + const start = pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items.len; + try pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].appendSlice(pp.gpa, tokens); + std.mem.reverse(PreprocessorToken, pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items[start..]); +} - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.appendSlice(pp.gpa, pp.char_buf.items); - var tmp_tokenizer = Tokenizer{ - .buf = pp.comp.generated_buf.items, - .langopts = pp.comp.langopts, - .index = @intCast(start), - .source = .generated, - .line = pp.generated_line, - }; - pp.generated_line += 1; - const hash_tok = tmp_tokenizer.next(); - assert(hash_tok.id == .hash); - const pragma_tok = tmp_tokenizer.next(); - assert(pragma_tok.id == .keyword_pragma); - try pp.pragma(&tmp_tokenizer, pragma_tok, operator_loc, arg_tok.expansionSlice()); -} - -/// Inverts the output of the preprocessor stringify (#) operation -/// (except all whitespace is condensed to a single space) -/// writes output to pp.char_buf; assumes capacity is sufficient -/// backslash backslash -> backslash -/// backslash doublequote -> doublequote -/// All other characters remain the same -fn destringify(pp: *Preprocessor, str: []const u8) void { - var state: enum { start, backslash_seen } = .start; - for (str) |c| { - switch (c) { - '\\' => { - if (state == .backslash_seen) pp.char_buf.appendAssumeCapacity(c); - state = if (state == .start) .backslash_seen else .start; - }, - else => { - if (state == .backslash_seen and c != '"') pp.char_buf.appendAssumeCapacity('\\'); - pp.char_buf.appendAssumeCapacity(c); - state = .start; +fn addHideSet(pp: *Preprocessor, toks: []PreprocessorToken, hideset: Treap.Node) !void { + for (toks) |*tok| { + switch (tok.id) { + // non-identifiers are not expanded, so we don't need to track their hidesets. + // Track r_paren hideset since it is used for computing the hideset of function-like macro expansions + .identifier, .extended_identifier, .r_paren => { + tok.hideset = try pp.treap.@"union"(tok.hideset, hideset); }, + else => {}, } } } -/// Stringify `tokens` into pp.char_buf. 
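
`addHideSet` above is part of replacing the old `Hideset` module: each token now carries its own hideset as a treap node (`tok.hideset`), and unioning a macro's hideset into the replacement tokens is what keeps a definition like `#define foo foo + 1` from expanding forever, following the standard macro-expansion algorithm (the `r_paren` hideset is kept because it feeds the hideset computation for function-like expansions, per the comment in the code). The rule itself, shown with a plain hash set rather than the PR's treap:

```zig
const std = @import("std");

test "hideset blocks recursive re-expansion" {
    const gpa = std.testing.allocator;

    // Stand-in for a token's hideset: the set of macro names that may no
    // longer be expanded at this position. A hash set is enough to show the
    // rule; the PR stores this per token as a treap.
    var hideset: std.StringHashMapUnmanaged(void) = .{};
    defer hideset.deinit(gpa);

    // After expanding `#define foo foo + 1`, every token of the replacement
    // gets "foo" added to its hideset...
    try hideset.put(gpa, "foo", {});

    // ...so when the scanner sees the inner `foo` again, it must not expand it.
    const may_expand = !hideset.contains("foo");
    try std.testing.expect(!may_expand);
}
```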
-/// See https://gcc.gnu.org/onlinedocs/gcc-11.2.0/cpp/Stringizing.html#Stringizing -fn stringify(pp: *Preprocessor, tokens: []const TokenWithExpansionLocs) !void { - try pp.char_buf.append('"'); - var ws_state: enum { start, need, not_needed } = .start; - for (tokens) |tok| { - if (tok.id == .macro_ws) { - if (ws_state == .start) continue; - ws_state = .need; - continue; - } - if (ws_state == .need) try pp.char_buf.append(' '); - ws_state = .not_needed; +fn stringize(pp: *Preprocessor, tmpl: PreprocessorToken, args_range: MacroArg) !PreprocessorToken { + const char_buf_top = pp.char_buf.items.len; + defer pp.char_buf.items.len = char_buf_top; + const start = pp.comp.generated_buf.items.len; + try pp.char_buf.append(pp.gpa, '"'); + const args = args_range.slice(pp.macro_arg_tokens.items); + for (args) |tok| { + if (tok.flags.space and pp.char_buf.items.len - 1 > char_buf_top) { + try pp.char_buf.append(pp.gpa, ' '); + } // backslashes not inside strings are not escaped const is_str = switch (tok.id) { .string_literal, @@ -1320,1661 +623,1252 @@ fn stringify(pp: *Preprocessor, tokens: []const TokenWithExpansionLocs) !void { else => false, }; - for (pp.expandedSlice(tok)) |c| { + for (pp.tokSlice(tok)) |c| { if (c == '"') - try pp.char_buf.appendSlice("\\\"") + try pp.char_buf.appendSlice(pp.gpa, "\\\"") else if (c == '\\' and is_str) - try pp.char_buf.appendSlice("\\\\") + try pp.char_buf.appendSlice(pp.gpa, "\\\\") else - try pp.char_buf.append(c); + try pp.char_buf.append(pp.gpa, c); } } - try pp.char_buf.ensureUnusedCapacity(2); + try pp.char_buf.ensureUnusedCapacity(pp.gpa, 2); if (pp.char_buf.items[pp.char_buf.items.len - 1] != '\\') { pp.char_buf.appendSliceAssumeCapacity("\"\n"); - return; - } - pp.char_buf.appendAssumeCapacity('"'); - var tokenizer: Tokenizer = .{ - .buf = pp.char_buf.items, - .index = 0, - .source = .generated, - .langopts = pp.comp.langopts, - .line = 0, - }; - const item = tokenizer.next(); - if (item.id == .unterminated_string_literal) { - const tok = tokens[tokens.len - 1]; - try pp.comp.addDiagnostic(.{ - .tag = .invalid_pp_stringify_escape, - .loc = tok.loc, - }, tok.expansionSlice()); - pp.char_buf.items.len -= 2; // erase unpaired backslash and appended end quote + } else { pp.char_buf.appendAssumeCapacity('"'); + var tokenizer: Tokenizer = .{ + .buf = pp.char_buf.items, + .index = 0, + .source = .generated, + .langopts = pp.comp.langopts, + .line = 0, + }; + const item = tokenizer.next(); + if (item.id == .unterminated_string_literal) { + const tok = args[args.len - 1]; + try pp.comp.addDiagnostic(.{ + .tag = .invalid_pp_stringify_escape, + .loc = tok.loc, + }, tok.expansionSlice()); + pp.char_buf.items.len -= 2; // erase unpaired backslash and appended end quote + pp.char_buf.appendAssumeCapacity('"'); + } + pp.char_buf.appendAssumeCapacity('\n'); } - pp.char_buf.appendAssumeCapacity('\n'); + + try pp.comp.generated_buf.appendSlice(pp.gpa, pp.char_buf.items[char_buf_top..]); + + var tok = tmpl; + tok.id = .string_literal; + tok.loc = .{ + .id = .generated, + .byte_offset = @intCast(start), + .line = pp.generated_line, + }; + pp.generated_line += 1; + return tok; } -fn reconstructIncludeString(pp: *Preprocessor, param_toks: []const TokenWithExpansionLocs, embed_args: ?*[]const TokenWithExpansionLocs, first: TokenWithExpansionLocs) !?[]const u8 { - if (param_toks.len == 0) { - try pp.comp.addDiagnostic(.{ - .tag = .expected_filename, - .loc = first.loc, - }, first.expansionSlice()); - return null; +fn subst(pp: *Preprocessor, macro: *const Macro, macro_tok: 
PreprocessorToken, args: MacroArgList, hideset_arg: Treap.Node) ![]PreprocessorToken { + var hideset = hideset_arg; + var r: TokenList = .{}; + defer r.deinit(pp.gpa); + var i: usize = 0; + while (i < macro.tokens.len) : (i += 1) { + const t0 = macro.tokens[i]; + const t1: ?PreprocessorToken = if (i == macro.tokens.len - 1) null else macro.tokens[i + 1]; + + const t0_param = t0.id == .macro_param; + const t1_param = if (t1) |tok| tok.id == .macro_param else false; + + if (t0.id == .hash and t1_param) { + const arg = args.slice(pp.macro_args.items)[t1.?.argPosition()]; + const stringized = try pp.stringize(t0, arg); + try r.append(pp.gpa, stringized); + i += 1; + continue; + } + if (t0.id == .hash_hash and t1_param) { + const arg = args.slice(pp.macro_args.items)[t1.?.argPosition()]; + if (t1.?.isVarArg() and r.items.len > 0 and r.items[r.items.len - 1].id == .comma) { + if (arg.len() == 0) { + _ = r.pop(); + } else { + try r.appendSlice(pp.gpa, arg.slice(pp.macro_arg_tokens.items)); + } + } else if (arg.len() > 0) { + try pp.pasteAndPush(&r, arg.slice(pp.macro_arg_tokens.items)[0]); + try r.appendSlice(pp.gpa, arg.slice(pp.macro_arg_tokens.items)[1..]); + } + i += 1; + continue; + } + if (t0.id == .hash_hash and t1 != null) { + hideset = t1.?.hideset; + try pp.pasteAndPush(&r, t1.?); + i += 1; + continue; + } + if (t0_param and t1 != null and t1.?.id == .hash_hash) { + hideset = t1.?.hideset; + const arg = args.slice(pp.macro_args.items)[t0.argPosition()]; + if (arg.len() == 0) { + i += 1; + } else { + try r.appendSlice(pp.gpa, arg.slice(pp.macro_arg_tokens.items)); + } + continue; + } + if (t0_param) { + const arg = args.slice(pp.macro_args.items)[t0.argPosition()]; + const expanded = try pp.expandAll(arg.slice(pp.macro_arg_tokens.items), t0); + defer pp.gpa.free(expanded); + try r.appendSlice(pp.gpa, expanded); + continue; + } + try r.append(pp.gpa, t0); + } + try pp.addHideSet(r.items, hideset); + for (r.items) |*tok| { + try tok.addExpansionLocation(pp.gpa, macro_tok.loc); + try tok.addExpansionLocationList(pp.gpa, macro_tok.loc_list); } + return r.toOwnedSlice(pp.gpa); +} - const char_top = pp.char_buf.items.len; - defer pp.char_buf.items.len = char_top; +fn pasteTokens(pp: *Preprocessor, lhs: PreprocessorToken, rhs: PreprocessorToken) !PreprocessorToken { + const start = pp.comp.generated_buf.items.len; + const end = start + pp.tokSlice(lhs).len + pp.tokSlice(rhs).len; + try pp.comp.generated_buf.ensureTotalCapacity(pp.gpa, end + 1); // +1 for a newline - // Trim leading/trailing whitespace - var begin: usize = 0; - var end: usize = param_toks.len; - while (begin < end and param_toks[begin].id == .macro_ws) : (begin += 1) {} - while (end > begin and param_toks[end - 1].id == .macro_ws) : (end -= 1) {} - const params = param_toks[begin..end]; + // We cannot use the same slices here since they might be invalidated by `ensureCapacity` + pp.comp.generated_buf.appendSliceAssumeCapacity(pp.tokSlice(lhs)); + pp.comp.generated_buf.appendSliceAssumeCapacity(pp.tokSlice(rhs)); + pp.comp.generated_buf.appendAssumeCapacity('\n'); - if (params.len == 0) { - try pp.comp.addDiagnostic(.{ - .tag = .expected_filename, - .loc = first.loc, - }, first.expansionSlice()); - return null; - } - // no string pasting - if (embed_args == null and params[0].id == .string_literal and params.len > 1) { - try pp.comp.addDiagnostic(.{ - .tag = .closing_paren, - .loc = params[1].loc, - }, params[1].expansionSlice()); - return null; + // Try to tokenize the result. 
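+    // Descriptive note (added): the two lexemes were appended to generated_buf
+    // above, followed by a newline. If re-lexing that range yields anything
+    // besides a single token before the newline, the paste did not form a
+    // valid preprocessing token and `pasting_formed_invalid` is reported below.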
+ var tmp_tokenizer = Tokenizer{ + .buf = pp.comp.generated_buf.items, + .langopts = pp.comp.langopts, + .index = @intCast(start), + .source = .generated, + }; + const pasted_token = tmp_tokenizer.nextNoWSComments(); + const next_tok = tmp_tokenizer.next(); + if (next_tok.id != .nl) { + try pp.errStr( + lhs, + .pasting_formed_invalid, + try pp.comp.diagnostics.arena.allocator().dupe(u8, pp.comp.generated_buf.items[start..end]), + ); } + return pp.makeGeneratedToken(start, pasted_token.id, lhs); +} - for (params, 0..) |tok, i| { - const str = pp.expandedSliceExtra(tok, .preserve_macro_ws); - try pp.char_buf.appendSlice(str); - if (embed_args) |some| { - if ((i == 0 and tok.id == .string_literal) or tok.id == .angle_bracket_right) { - some.* = params[i + 1 ..]; - break; - } - } +/// Paste `tok` onto the last token in `tokens` +fn pasteAndPush(pp: *Preprocessor, tokens: *TokenList, tok: PreprocessorToken) !void { + const last = tokens.pop(); + const pasted = try pp.pasteTokens(last, tok); + return tokens.append(pp.gpa, pasted); +} + +fn tokenBufferStashReverse(pp: *Preprocessor, tokens: []const PreprocessorToken) !void { + try pp.expansion_bufs.append(pp.gpa, .{}); + try pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].appendSlice(pp.gpa, tokens); + std.mem.reverse(PreprocessorToken, pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items); +} + +fn tokenBufferUnstash(pp: *Preprocessor) void { + var buf = pp.expansion_bufs.pop(); + buf.deinit(pp.gpa); +} + +fn expandAll(pp: *Preprocessor, tokens: []const PreprocessorToken, tmpl: PreprocessorToken) ![]const PreprocessorToken { + try pp.tokenBufferStashReverse(tokens); + defer pp.tokenBufferUnstash(); + var r: TokenList = .{}; + defer r.deinit(pp.gpa); + while (true) { + const tok = try pp.readExpand(); + if (tok.id == .eof) break; + try r.append(pp.gpa, tok); } + pp.propagateSpace(r.items, tmpl); + return r.toOwnedSlice(pp.gpa); +} - const include_str = pp.char_buf.items[char_top..]; - if (include_str.len < 3) { - if (include_str.len == 0) { - try pp.comp.addDiagnostic(.{ - .tag = .expected_filename, - .loc = first.loc, - }, first.expansionSlice()); - return null; - } - try pp.comp.addDiagnostic(.{ - .tag = .empty_filename, - .loc = params[0].loc, - }, params[0].expansionSlice()); - return null; +fn peekToken(pp: *Preprocessor) !PreprocessorToken { + const tok = try pp.readToken(); + try pp.ungetToken(tok); + return tok; +} + +/// Return a string with the same contents as `name` and whose lifetime is the same as the preprocessor's lifetime +/// If `tok` is not from the generated source, this is just `name`. +/// If `tok` is from the generated source, pointers are invalidated when the underlying ArrayList is resized. 
Therefore, +/// duplicate the string and store it (so we aren't repeatedly copying the same string) +fn getSafeString(pp: *Preprocessor, tok: PreprocessorToken, name: []const u8) ![]const u8 { + if (tok.loc.id != .generated) return name; + const gop = try pp.safe_strings.getOrPut(pp.gpa, name); + if (!gop.found_existing) { + const copy = try pp.arena.allocator().dupe(u8, name); + gop.key_ptr.* = copy; } + return gop.key_ptr.*; +} - switch (include_str[0]) { - '<' => { - if (include_str[include_str.len - 1] != '>') { - // Ugly hack to find out where the '>' should go, since we don't have the closing ')' location - const start = params[0].loc; - try pp.comp.addDiagnostic(.{ - .tag = .header_str_closing, - .loc = .{ .id = start.id, .byte_offset = start.byte_offset + @as(u32, @intCast(include_str.len)) + 1, .line = start.line }, - }, params[0].expansionSlice()); - try pp.comp.addDiagnostic(.{ - .tag = .header_str_match, - .loc = params[0].loc, - }, params[0].expansionSlice()); - return null; - } - return include_str; - }, - '"' => return include_str, - else => { - try pp.comp.addDiagnostic(.{ - .tag = .expected_filename, - .loc = params[0].loc, - }, params[0].expansionSlice()); - return null; - }, +fn injectSpace(pp: *Preprocessor) void { + var i = pp.expansion_bufs.items.len; + while (i > 0) : (i -= 1) { + var j = pp.expansion_bufs.items[i - 1].items.len; + while (j > 0) : (j -= 1) { + pp.expansion_bufs.items[i - 1].items[j - 1].flags.space = true; + return; + } } } -fn handleBuiltinMacro(pp: *Preprocessor, builtin: RawToken.Id, param_toks: []const TokenWithExpansionLocs, src_loc: Source.Location) Error!bool { - switch (builtin) { - .macro_param_has_attribute, - .macro_param_has_declspec_attribute, - .macro_param_has_feature, - .macro_param_has_extension, - .macro_param_has_builtin, - => { - var invalid: ?TokenWithExpansionLocs = null; - var identifier: ?TokenWithExpansionLocs = null; - for (param_toks) |tok| { - if (tok.id == .macro_ws) continue; - if (tok.id == .comment) continue; - if (!tok.id.isMacroIdentifier()) { - invalid = tok; - break; - } - if (identifier) |_| invalid = tok else identifier = tok; - } - if (identifier == null and invalid == null) invalid = .{ .id = .eof, .loc = src_loc }; - if (invalid) |some| { - try pp.comp.addDiagnostic( - .{ .tag = .feature_check_requires_identifier, .loc = some.loc }, - some.expansionSlice(), - ); - return false; - } +fn readExpandNewline(pp: *Preprocessor) Error!PreprocessorToken { + const tok = pp.getToken(); + if (!tok.id.isMacroIdentifier()) return tok; + const name = pp.tokSlice(tok); + const macro = pp.defines.getPtr(name) orelse return tok; - const ident_str = pp.expandedSlice(identifier.?); - return switch (builtin) { - .macro_param_has_attribute => Attribute.fromString(.gnu, null, ident_str) != null, - .macro_param_has_declspec_attribute => { - return if (pp.comp.langopts.declspec_attrs) - Attribute.fromString(.declspec, null, ident_str) != null - else - false; - }, - .macro_param_has_feature => features.hasFeature(pp.comp, ident_str), - .macro_param_has_extension => features.hasExtension(pp.comp, ident_str), - .macro_param_has_builtin => pp.comp.hasBuiltin(ident_str), - else => unreachable, - }; - }, - .macro_param_has_warning => { - const actual_param = pp.pasteStringsUnsafe(param_toks) catch |er| switch (er) { - error.ExpectedStringLiteral => { - try pp.errStr(param_toks[0], .expected_str_literal_in, "__has_warning"); - return false; - }, - else => |e| return e, - }; - if (!mem.startsWith(u8, actual_param, "-W")) { - try 
pp.errStr(param_toks[0], .malformed_warning_check, "__has_warning"); - return false; + const macro_hideset = tok.hideset; + if (pp.treap.contains(macro_hideset, name)) return tok; + + switch (macro.kind) { + .object => { + const safe_name = try pp.getSafeString(tok, name); + const new_hideset = try pp.treap.addNodeTo(tok.hideset, safe_name); + + const tokens = try pp.subst(macro, tok, MacroArgList.empty, new_hideset); + for (tokens) |*t| { + try t.addExpansionLocation(pp.gpa, tok.loc); + try t.addExpansionLocationList(pp.gpa, tok.loc_list); } - const warning_name = actual_param[2..]; - return Diagnostics.warningExists(warning_name); + defer pp.gpa.free(tokens); + pp.propagateSpace(tokens, tok); + try pp.ungetAll(tokens); + return pp.readExpand(); }, - .macro_param_is_identifier => { - var invalid: ?TokenWithExpansionLocs = null; - var identifier: ?TokenWithExpansionLocs = null; - for (param_toks) |tok| switch (tok.id) { - .macro_ws => continue, - .comment => continue, - else => { - if (identifier) |_| invalid = tok else identifier = tok; + .func => { + if (!try pp.getMacroLParen()) return tok; + const arg_tokens_start = pp.macro_arg_tokens.items.len; + defer pp.macro_arg_tokens.items.len = arg_tokens_start; + const macro_args_start = pp.macro_args.items.len; + defer pp.macro_args.items.len = macro_args_start; + + const args = pp.readArgs(tok, macro) catch |err| switch (err) { + error.IncorrectArgumentCount => return PreprocessorToken.zero, + error.UnterminatedMacroArgumentList => { + try pp.errTok(tok, .unterminated_macro_arg_list); + return PreprocessorToken.zero; }, + else => |e| return e, }; - if (identifier == null and invalid == null) invalid = .{ .id = .eof, .loc = src_loc }; - if (invalid) |some| { - try pp.comp.addDiagnostic(.{ - .tag = .missing_tok_builtin, - .loc = some.loc, - .extra = .{ .tok_id_expected = .r_paren }, - }, some.expansionSlice()); - return false; - } - - const id = identifier.?.id; - return id == .identifier or id == .extended_identifier; + const r_paren = pp.getToken(); + std.debug.assert(r_paren.id == .r_paren); + const safe_name = try pp.getSafeString(tok, name); + + const intersection = try pp.treap.intersection(macro_hideset, r_paren.hideset); + const hideset = try pp.treap.addNodeTo(intersection, safe_name); + const tokens = try pp.subst(macro, tok, args, hideset); + defer pp.gpa.free(tokens); + pp.propagateSpace(tokens, tok); + try pp.ungetAll(tokens); + return pp.readExpand(); }, - .macro_param_has_include, .macro_param_has_include_next => { - const include_str = (try pp.reconstructIncludeString(param_toks, null, param_toks[0])) orelse return false; - const include_type: Compilation.IncludeType = switch (include_str[0]) { - '"' => .quotes, - '<' => .angle_brackets, - else => unreachable, - }; - const filename = include_str[1 .. 
include_str.len - 1]; - if (builtin == .macro_param_has_include or pp.include_depth == 0) { - if (builtin == .macro_param_has_include_next) { - try pp.comp.addDiagnostic(.{ - .tag = .include_next_outside_header, - .loc = src_loc, - }, &.{}); - } - return pp.comp.hasInclude(filename, src_loc.id, include_type, .first); - } - return pp.comp.hasInclude(filename, src_loc.id, include_type, .next); + .special => |func| { + try func(pp, tok); + return pp.readExpand(); }, - else => unreachable, } } -/// Treat whitespace-only paste arguments as empty -fn getPasteArgs(args: []const TokenWithExpansionLocs) []const TokenWithExpansionLocs { - for (args) |tok| { - if (tok.id != .macro_ws) return args; +fn readMacroArg(pp: *Preprocessor, end: *bool, readall: bool) !MacroArg { + var level: i32 = 0; + const start: u32 = @intCast(pp.macro_arg_tokens.items.len); + while (true) { + var tok = pp.getToken(); + if (tok.id == .eof) { + return error.UnterminatedMacroArgumentList; + } + if (tok.id == .nl) continue; + if (tok.flags.is_bol and tok.id == .hash) { + try pp.readDirective(); + continue; + } + if (level == 0 and tok.id == .r_paren) { + try pp.ungetToken(tok); + end.* = true; + break; + } + if (level == 0 and tok.id == .comma and !readall) { + break; + } + if (tok.id == .l_paren) { + level += 1; + } + if (tok.id == .r_paren) { + level -= 1; + } + if (tok.flags.is_bol) { + tok.flags = .{ .is_bol = false, .space = true }; + } + try pp.macro_arg_tokens.append(pp.gpa, tok); } - return &[1]TokenWithExpansionLocs{.{ - .id = .placemarker, - .loc = .{ .id = .generated, .byte_offset = 0, .line = 0 }, - }}; + return .{ .start = start, .end = @intCast(pp.macro_arg_tokens.items.len) }; } -fn expandFuncMacro( - pp: *Preprocessor, - macro_tok: TokenWithExpansionLocs, - func_macro: *const Macro, - args: *const MacroArguments, - expanded_args: *const MacroArguments, - hideset_arg: Hideset.Index, -) MacroError!ExpandBuf { - var hideset = hideset_arg; - var buf = ExpandBuf.init(pp.gpa); - try buf.ensureTotalCapacity(func_macro.tokens.len); - errdefer buf.deinit(); - - var expanded_variable_arguments = ExpandBuf.init(pp.gpa); - defer expanded_variable_arguments.deinit(); - var variable_arguments = ExpandBuf.init(pp.gpa); - defer variable_arguments.deinit(); - - if (func_macro.var_args) { - var i: usize = func_macro.params.len; - while (i < expanded_args.items.len) : (i += 1) { - try variable_arguments.appendSlice(args.items[i]); - try expanded_variable_arguments.appendSlice(expanded_args.items[i]); - if (i != expanded_args.items.len - 1) { - const comma = TokenWithExpansionLocs{ .id = .comma, .loc = .{ .id = .generated } }; - try variable_arguments.append(comma); - try expanded_variable_arguments.append(comma); - } - } +fn doReadArgs(pp: *Preprocessor, macro: *const Macro) !MacroArgList { + const start: u32 = @intCast(pp.macro_args.items.len); + var end = false; + while (!end) { + const in_ellipsis = macro.var_args and (pp.macro_args.items.len - start) + 1 == macro.nargs; + const arg_range = try pp.readMacroArg(&end, in_ellipsis); + try pp.macro_args.append(pp.gpa, arg_range); + } + if (macro.var_args and (pp.macro_args.items.len - start) + 1 == macro.nargs) { + try pp.macro_args.append(pp.gpa, MacroArg.empty); } + return .{ .start = start, .end = @intCast(pp.macro_args.items.len) }; +} - // token concatenation and expansion phase - var tok_i: usize = 0; - while (tok_i < func_macro.tokens.len) : (tok_i += 1) { - const raw = func_macro.tokens[tok_i]; - switch (raw.id) { - .hash_hash => while (tok_i + 1 < func_macro.tokens.len) 
{ - const raw_next = func_macro.tokens[tok_i + 1]; - tok_i += 1; - - var va_opt_buf = ExpandBuf.init(pp.gpa); - defer va_opt_buf.deinit(); - - const next = switch (raw_next.id) { - .macro_ws => continue, - .hash_hash => continue, - .comment => if (!pp.comp.langopts.preserve_comments_in_macros) - continue - else - &[1]TokenWithExpansionLocs{tokFromRaw(raw_next)}, - .macro_param, .macro_param_no_expand => getPasteArgs(args.items[raw_next.end]), - .keyword_va_args => variable_arguments.items, - .keyword_va_opt => blk: { - try pp.expandVaOpt(&va_opt_buf, raw_next, variable_arguments.items.len != 0); - if (va_opt_buf.items.len == 0) break; - break :blk va_opt_buf.items; - }, - else => &[1]TokenWithExpansionLocs{tokFromRaw(raw_next)}, - }; - try pp.pasteTokens(&buf, next); - if (next.len != 0) break; - }, - .macro_param_no_expand => { - if (tok_i + 1 < func_macro.tokens.len and func_macro.tokens[tok_i + 1].id == .hash_hash) { - hideset = pp.hideset.get(tokFromRaw(func_macro.tokens[tok_i + 1]).loc); - } - const slice = getPasteArgs(args.items[raw.end]); - const raw_loc = Source.Location{ .id = raw.source, .byte_offset = raw.start, .line = raw.line }; - try bufCopyTokens(&buf, slice, &.{raw_loc}); - }, - .macro_param => { - if (tok_i + 1 < func_macro.tokens.len and func_macro.tokens[tok_i + 1].id == .hash_hash) { - hideset = pp.hideset.get(tokFromRaw(func_macro.tokens[tok_i + 1]).loc); - } - const arg = expanded_args.items[raw.end]; - const raw_loc = Source.Location{ .id = raw.source, .byte_offset = raw.start, .line = raw.line }; - try bufCopyTokens(&buf, arg, &.{raw_loc}); - }, - .keyword_va_args => { - const raw_loc = Source.Location{ .id = raw.source, .byte_offset = raw.start, .line = raw.line }; - try bufCopyTokens(&buf, expanded_variable_arguments.items, &.{raw_loc}); - }, - .keyword_va_opt => { - try pp.expandVaOpt(&buf, raw, variable_arguments.items.len != 0); - }, - .stringify_param, .stringify_va_args => { - const arg = if (raw.id == .stringify_va_args) - variable_arguments.items - else - args.items[raw.end]; +fn readArgs(pp: *Preprocessor, ident: PreprocessorToken, macro: *const Macro) !MacroArgList { + if (macro.nargs == 0 and (try pp.peekToken()).id == .r_paren) { + return MacroArgList.empty; + } + const args = try pp.doReadArgs(macro); + if (args.len() != macro.nargs) { + const extra = Diagnostics.Message.Extra{ + .arguments = .{ .expected = @intCast(macro.nargs), .actual = @intCast(args.len()) }, + }; + try pp.comp.addDiagnostic( + .{ .tag = .expected_arguments, .loc = ident.loc, .extra = extra }, + &.{}, // TODO: expansion slice + ); + return error.IncorrectArgumentCount; + } + return args; +} - pp.char_buf.clearRetainingCapacity(); - try pp.stringify(arg); +fn readExpand(pp: *Preprocessor) Error!PreprocessorToken { + while (true) { + const tok = try pp.readExpandNewline(); + if (tok.id != .nl) return tok; + } +} + +/// # number "file" flags +/// TODO: validate that the pp_num token is solely digits +/// if not, emit `GNU line marker directive requires a simple digit sequence` +fn readLinemarker(pp: *Preprocessor) !void { + const name = pp.getToken(); + if (name.id.isDirectiveEnd()) return; + if (name.id != .string_literal) try pp.errTok(name, .line_invalid_filename); + + const flag_1 = pp.getToken(); + if (flag_1.id.isDirectiveEnd()) return; + const flag_2 = pp.getToken(); + if (flag_2.id.isDirectiveEnd()) return; + const flag_3 = pp.getToken(); + if (flag_3.id.isDirectiveEnd()) return; + const flag_4 = pp.getToken(); + if (flag_4.id.isDirectiveEnd()) return; + try 
pp.expectNewline(); +} + +fn readIdent(pp: *Preprocessor) !?PreprocessorToken { + const tok = pp.getToken(); + if (!tok.id.isMacroIdentifier()) { + try pp.errTok(tok, .macro_name_must_be_identifier); + return null; + } + return tok; +} - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.appendSlice(pp.gpa, pp.char_buf.items); +fn ungetToken(pp: *Preprocessor, tok: PreprocessorToken) !void { + if (tok.id == .eof) return; + if (pp.isBufferEmpty()) { + try pp.expansion_bufs.append(pp.gpa, .{}); + } + try pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].append(pp.gpa, tok); +} - try buf.append(try pp.makeGeneratedToken(start, .string_literal, tokFromRaw(raw))); - }, - .macro_param_has_attribute, - .macro_param_has_declspec_attribute, - .macro_param_has_warning, - .macro_param_has_feature, - .macro_param_has_extension, - .macro_param_has_builtin, - .macro_param_has_include, - .macro_param_has_include_next, - .macro_param_is_identifier, - => { - const arg = expanded_args.items[0]; - const result = if (arg.len == 0) blk: { - const extra = Diagnostics.Message.Extra{ .arguments = .{ .expected = 1, .actual = 0 } }; - try pp.comp.addDiagnostic(.{ .tag = .expected_arguments, .loc = macro_tok.loc, .extra = extra }, &.{}); - break :blk false; - } else try pp.handleBuiltinMacro(raw.id, arg, macro_tok.loc); - const start = pp.comp.generated_buf.items.len; - const w = pp.comp.generated_buf.writer(pp.gpa); - try w.print("{}\n", .{@intFromBool(result)}); - try buf.append(try pp.makeGeneratedToken(start, .pp_num, tokFromRaw(raw))); - }, - .macro_param_has_c_attribute => { - const arg = expanded_args.items[0]; - const not_found = "0\n"; - const result = if (arg.len == 0) blk: { - const extra = Diagnostics.Message.Extra{ .arguments = .{ .expected = 1, .actual = 0 } }; - try pp.comp.addDiagnostic(.{ .tag = .expected_arguments, .loc = macro_tok.loc, .extra = extra }, &.{}); - break :blk not_found; - } else res: { - var invalid: ?TokenWithExpansionLocs = null; - var vendor_ident: ?TokenWithExpansionLocs = null; - var colon_colon: ?TokenWithExpansionLocs = null; - var attr_ident: ?TokenWithExpansionLocs = null; - for (arg) |tok| { - if (tok.id == .macro_ws) continue; - if (tok.id == .comment) continue; - if (tok.id == .colon_colon) { - if (colon_colon != null or attr_ident == null) { - invalid = tok; - break; - } - vendor_ident = attr_ident; - attr_ident = null; - colon_colon = tok; - continue; - } - if (!tok.id.isMacroIdentifier()) { - invalid = tok; - break; - } - if (attr_ident) |_| { - invalid = tok; - break; - } else attr_ident = tok; - } - if (vendor_ident != null and attr_ident == null) { - invalid = vendor_ident; - } else if (attr_ident == null and invalid == null) { - invalid = .{ .id = .eof, .loc = macro_tok.loc }; - } - if (invalid) |some| { - try pp.comp.addDiagnostic( - .{ .tag = .feature_check_requires_identifier, .loc = some.loc }, - some.expansionSlice(), - ); - break :res not_found; - } - if (vendor_ident) |some| { - const vendor_str = pp.expandedSlice(some); - const attr_str = pp.expandedSlice(attr_ident.?); - const exists = Attribute.fromString(.gnu, vendor_str, attr_str) != null; - - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.appendSlice(pp.gpa, if (exists) "1\n" else "0\n"); - try buf.append(try pp.makeGeneratedToken(start, .pp_num, tokFromRaw(raw))); - continue; - } - if (!pp.comp.langopts.standard.atLeast(.c23)) break :res not_found; - - const attrs = std.StaticStringMap([]const u8).initComptime(.{ - .{ "deprecated", "201904L\n" 
}, - .{ "fallthrough", "201904L\n" }, - .{ "maybe_unused", "201904L\n" }, - .{ "nodiscard", "202003L\n" }, - .{ "noreturn", "202202L\n" }, - .{ "_Noreturn", "202202L\n" }, - .{ "unsequenced", "202207L\n" }, - .{ "reproducible", "202207L\n" }, - }); - - const attr_str = Attribute.normalize(pp.expandedSlice(attr_ident.?)); - break :res attrs.get(attr_str) orelse not_found; - }; - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.appendSlice(pp.gpa, result); - try buf.append(try pp.makeGeneratedToken(start, .pp_num, tokFromRaw(raw))); - }, - .macro_param_has_embed => { - const arg = expanded_args.items[0]; - const not_found = "0\n"; - const result = if (arg.len == 0) blk: { - const extra = Diagnostics.Message.Extra{ .arguments = .{ .expected = 1, .actual = 0 } }; - try pp.comp.addDiagnostic(.{ .tag = .expected_arguments, .loc = macro_tok.loc, .extra = extra }, &.{}); - break :blk not_found; - } else res: { - var embed_args: []const TokenWithExpansionLocs = &.{}; - const include_str = (try pp.reconstructIncludeString(arg, &embed_args, arg[0])) orelse - break :res not_found; - - var prev = tokFromRaw(raw); - prev.id = .eof; - var it: struct { - i: u32 = 0, - slice: []const TokenWithExpansionLocs, - prev: TokenWithExpansionLocs, - fn next(it: *@This()) TokenWithExpansionLocs { - while (it.i < it.slice.len) switch (it.slice[it.i].id) { - .macro_ws, .whitespace => it.i += 1, - else => break, - } else return it.prev; - defer it.i += 1; - it.prev = it.slice[it.i]; - it.prev.id = .eof; - return it.slice[it.i]; - } - } = .{ .slice = embed_args, .prev = prev }; - - while (true) { - const param_first = it.next(); - if (param_first.id == .eof) break; - if (param_first.id != .identifier) { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_param, .loc = param_first.loc }, - param_first.expansionSlice(), - ); - continue; - } +fn hashHashCheck(pp: *Preprocessor, toks: []const PreprocessorToken) !void { + if (toks.len == 0) return; + if (toks[0].id == .hash_hash) { + return pp.errTok(toks[0], .hash_hash_at_start); + } + if (toks[toks.len - 1].id == .hash_hash) { + return pp.errTok(toks[toks.len - 1], .hash_hash_at_end); + } +} - const char_top = pp.char_buf.items.len; - defer pp.char_buf.items.len = char_top; - - const maybe_colon = it.next(); - const param = switch (maybe_colon.id) { - .colon_colon => blk: { - // vendor::param - const param = it.next(); - if (param.id != .identifier) { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_param, .loc = param.loc }, - param.expansionSlice(), - ); - continue; - } - const l_paren = it.next(); - if (l_paren.id != .l_paren) { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_param, .loc = l_paren.loc }, - l_paren.expansionSlice(), - ); - continue; - } - break :blk "doesn't exist"; - }, - .l_paren => Attribute.normalize(pp.expandedSlice(param_first)), - else => { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_param, .loc = maybe_colon.loc }, - maybe_colon.expansionSlice(), - ); - continue; - }, - }; - - var arg_count: u32 = 0; - var first_arg: TokenWithExpansionLocs = undefined; - while (true) { - const next = it.next(); - if (next.id == .eof) { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_limit, .loc = param_first.loc }, - param_first.expansionSlice(), - ); - break; - } - if (next.id == .r_paren) break; - arg_count += 1; - if (arg_count == 1) first_arg = next; - } +fn readObjMacro(pp: *Preprocessor, name: PreprocessorToken) !void { + var body: TokenList = .{}; + errdefer body.deinit(pp.gpa); - 
if (std.mem.eql(u8, param, "limit")) { - if (arg_count != 1) { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_limit, .loc = param_first.loc }, - param_first.expansionSlice(), - ); - continue; - } - if (first_arg.id != .pp_num) { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_limit, .loc = param_first.loc }, - param_first.expansionSlice(), - ); - continue; - } - _ = std.fmt.parseInt(u32, pp.expandedSlice(first_arg), 10) catch { - break :res not_found; - }; - } else if (!std.mem.eql(u8, param, "prefix") and !std.mem.eql(u8, param, "suffix") and - !std.mem.eql(u8, param, "if_empty")) - { - break :res not_found; - } - } + while (true) { + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) break; + switch (tok.id) { + .unterminated_comment => try pp.errTok(tok, .unterminated_comment), + else => try body.append(pp.gpa, tok), + } + } + try pp.hashHashCheck(body.items); + const macro: Macro = .{ + .tokens = body.items, + .var_args = false, + .loc = name.loc, + .kind = .object, + .nargs = undefined, + }; + try pp.defineMacro(name, macro); +} - const include_type: Compilation.IncludeType = switch (include_str[0]) { - '"' => .quotes, - '<' => .angle_brackets, - else => unreachable, - }; - const filename = include_str[1 .. include_str.len - 1]; - const contents = (try pp.comp.findEmbed(filename, arg[0].loc.id, include_type, 1)) orelse - break :res not_found; - - defer pp.comp.gpa.free(contents); - break :res if (contents.len != 0) "1\n" else "2\n"; - }; - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.appendSlice(pp.comp.gpa, result); - try buf.append(try pp.makeGeneratedToken(start, .pp_num, tokFromRaw(raw))); - }, - .macro_param_pragma_operator => { - const param_toks = expanded_args.items[0]; - // Clang and GCC require exactly one token (so, no parentheses or string pasting) - // even though their error messages indicate otherwise. Ours is slightly more - // descriptive. 
- var invalid: ?TokenWithExpansionLocs = null; - var string: ?TokenWithExpansionLocs = null; - for (param_toks) |tok| switch (tok.id) { - .string_literal => { - if (string) |_| invalid = tok else string = tok; - }, - .macro_ws => continue, - .comment => continue, - else => { - invalid = tok; - break; - }, - }; - if (string == null and invalid == null) invalid = .{ .loc = macro_tok.loc, .id = .eof }; - if (invalid) |some| try pp.comp.addDiagnostic( - .{ .tag = .pragma_operator_string_literal, .loc = some.loc }, - some.expansionSlice(), - ) else try pp.pragmaOperator(string.?, macro_tok.loc); - }, - .comma => { - if (tok_i + 2 < func_macro.tokens.len and func_macro.tokens[tok_i + 1].id == .hash_hash) { - const hash_hash = func_macro.tokens[tok_i + 1]; - var maybe_va_args = func_macro.tokens[tok_i + 2]; - var consumed: usize = 2; - if (maybe_va_args.id == .macro_ws and tok_i + 3 < func_macro.tokens.len) { - consumed = 3; - maybe_va_args = func_macro.tokens[tok_i + 3]; - } - if (maybe_va_args.id == .keyword_va_args) { - // GNU extension: `, ##__VA_ARGS__` deletes the comma if __VA_ARGS__ is empty - tok_i += consumed; - if (func_macro.params.len == expanded_args.items.len) { - // Empty __VA_ARGS__, drop the comma - try pp.err(hash_hash, .comma_deletion_va_args); - } else if (func_macro.params.len == 0 and expanded_args.items.len == 1 and expanded_args.items[0].len == 0) { - // Ambiguous whether this is "empty __VA_ARGS__" or "__VA_ARGS__ omitted" - if (pp.comp.langopts.standard.isGNU()) { - // GNU standard, drop the comma - try pp.err(hash_hash, .comma_deletion_va_args); - } else { - // C standard, retain the comma - try buf.append(tokFromRaw(raw)); - } - } else { - try buf.append(tokFromRaw(raw)); - if (expanded_variable_arguments.items.len > 0 or variable_arguments.items.len == func_macro.params.len) { - try pp.err(hash_hash, .comma_deletion_va_args); - } - const raw_loc = Source.Location{ - .id = maybe_va_args.source, - .byte_offset = maybe_va_args.start, - .line = maybe_va_args.line, - }; - try bufCopyTokens(&buf, expanded_variable_arguments.items, &.{raw_loc}); - } - continue; - } - } - // Regular comma, no token pasting with __VA_ARGS__ - try buf.append(tokFromRaw(raw)); - }, - else => try buf.append(tokFromRaw(raw)), +/// Defines a new macro and warns if it is a duplicate +fn defineMacro(pp: *Preprocessor, name_tok: PreprocessorToken, macro: Macro) Error!void { + const name_str = pp.tokSlice(name_tok); + const gop = try pp.defines.getOrPut(pp.gpa, name_str); + if (gop.found_existing and !gop.value_ptr.eql(macro, pp)) { + const tag: Diagnostics.Tag = if (gop.value_ptr.kind == .special) .builtin_macro_redefined else .macro_redefined; + const start = pp.comp.diagnostics.list.items.len; + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = name_tok.loc, + .extra = .{ .str = name_str }, + }, &.{}); + if (gop.value_ptr.kind != .special and pp.comp.diagnostics.list.items.len != start) { + try pp.comp.addDiagnostic(.{ + .tag = .previous_definition, + .loc = gop.value_ptr.loc, + }, &.{}); } } - removePlacemarkers(&buf); + gop.value_ptr.* = macro; +} - const macro_expansion_locs = macro_tok.expansionSlice(); - for (buf.items) |*tok| { - try tok.addExpansionLocation(pp.gpa, &.{macro_tok.loc}); - try tok.addExpansionLocation(pp.gpa, macro_expansion_locs); - const tok_hidelist = pp.hideset.get(tok.loc); - const new_hidelist = try pp.hideset.@"union"(tok_hidelist, hideset); - try pp.hideset.put(tok.loc, new_hidelist); +/// Get raw token source string. 
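+/// Tokens with a fixed lexeme (keywords and punctuators) are returned directly;
+/// all other tokens are re-lexed from their source location.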
+/// Returned slice is invalidated when comp.generated_buf is updated. +pub fn tokSlice(pp: *Preprocessor, token: anytype) []const u8 { + if (token.id.lexeme()) |some| return some; + const source = pp.comp.getSource(token.loc.id); + var tmp_tokenizer = Tokenizer{ + .buf = source.buf, + .langopts = pp.comp.langopts, + .index = token.loc.byte_offset, + .source = .generated, + }; + const tok = tmp_tokenizer.next(); + return tmp_tokenizer.buf[tok.start..tok.end]; +} + +fn expect(pp: *Preprocessor, expected: Tokenizer.Token.Id, tag: Diagnostics.Tag) !PreprocessorToken { + const tok = pp.getToken(); + if (tok.id != expected) { + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = tok.loc, + .extra = .{ .none = {} }, + }, &.{}); // todo expansion slice + try pp.errTok(tok, tag); } + return tok; +} - return buf; +fn expectLParen(pp: *Preprocessor, tok: PreprocessorToken) !PreprocessorToken { + const l_paren = pp.getToken(); + if (l_paren.id != .l_paren) { + try pp.comp.addDiagnostic(.{ + .tag = .missing_lparen_after_builtin, + .loc = tok.loc, + .extra = .{ .str = pp.tokSlice(tok) }, + }, &.{}); // todo expansion slice + } + return l_paren; } -fn expandVaOpt( - pp: *Preprocessor, - buf: *ExpandBuf, - raw: RawToken, - should_expand: bool, -) !void { - if (!should_expand) return; +fn makeMacroToken(position: usize, is_vararg: bool) PreprocessorToken { + return .{ + .id = .macro_param, + .hideset = null, + .loc = .{ + .id = .unused, + .byte_offset = @intCast(position), + .line = @intFromBool(is_vararg), + }, + }; +} + +pub fn addIncludeStart(pp: *Preprocessor, source: Source) !void { + if (pp.linemarkers == .none) return; + try pp.addToken(.{ .id = .include_start, .loc = .{ + .id = source.id, + .byte_offset = std.math.maxInt(u32), + .line = 1, + } }); +} - const source = pp.comp.getSource(raw.source); - var tokenizer: Tokenizer = .{ - .buf = source.buf, - .index = raw.start, - .source = raw.source, - .langopts = pp.comp.langopts, - .line = raw.line, - }; - while (tokenizer.index < raw.end) { - const tok = tokenizer.next(); - try buf.append(tokFromRaw(tok)); - } +pub fn addIncludeResume(pp: *Preprocessor, source: Source.Id, offset: u32, line: u32) !void { + if (pp.linemarkers == .none) return; + try pp.addToken(.{ .id = .include_resume, .loc = .{ + .id = source, + .byte_offset = offset, + .line = line, + } }); } -fn bufCopyTokens(buf: *ExpandBuf, tokens: []const TokenWithExpansionLocs, src: []const Source.Location) !void { - try buf.ensureUnusedCapacity(tokens.len); - for (tokens) |tok| { - var copy = try tok.dupe(buf.allocator); - errdefer TokenWithExpansionLocs.free(copy.expansion_locs, buf.allocator); - try copy.addExpansionLocation(buf.allocator, src); - buf.appendAssumeCapacity(copy); - } +fn next(pp: *Preprocessor, id: Tokenizer.Token.Id) !bool { + const tok = pp.getToken(); + if (tok.id == id) return true; + try pp.ungetToken(tok); + return false; } -fn nextBufToken( - pp: *Preprocessor, - tokenizer: *Tokenizer, - buf: *ExpandBuf, - start_idx: *usize, - end_idx: *usize, - extend_buf: bool, -) Error!TokenWithExpansionLocs { - start_idx.* += 1; - if (start_idx.* == buf.items.len and start_idx.* >= end_idx.*) { - if (extend_buf) { - const raw_tok = tokenizer.next(); - if (raw_tok.id.isMacroIdentifier() and - pp.poisoned_identifiers.get(pp.tokSlice(raw_tok)) != null) - try pp.err(raw_tok, .poisoned_identifier); - - if (raw_tok.id == .nl) pp.add_expansion_nl += 1; - - const new_tok = tokFromRaw(raw_tok); - end_idx.* += 1; - try buf.append(new_tok); - return new_tok; - } else { - return 
TokenWithExpansionLocs{ .id = .eof, .loc = .{ .id = .generated } }; - } - } else { - return buf.items[start_idx.*]; +fn getMacroLParen(pp: *Preprocessor) !bool { + while (true) { + const tok = pp.getToken(); + if (tok.id == .nl) continue; + + if (tok.id == .l_paren) return true; + try pp.ungetToken(tok); + return false; } } -fn collectMacroFuncArguments( - pp: *Preprocessor, - tokenizer: *Tokenizer, - buf: *ExpandBuf, - start_idx: *usize, - end_idx: *usize, - extend_buf: bool, - is_builtin: bool, - r_paren: *TokenWithExpansionLocs, -) !MacroArguments { - const name_tok = buf.items[start_idx.*]; - const saved_tokenizer = tokenizer.*; - const old_end = end_idx.*; - +/// Returns true for vararg function-like macro, false otherwise +fn readFunclikeMacroParams(pp: *Preprocessor, name: PreprocessorToken, l_paren: PreprocessorToken, params: *ParamMap) !bool { + _ = name; + var pos: usize = 0; while (true) { - const tok = try nextBufToken(pp, tokenizer, buf, start_idx, end_idx, extend_buf); + var tok = pp.getToken(); switch (tok.id) { - .nl, .whitespace, .macro_ws => {}, - .l_paren => break, - else => { - if (is_builtin) { - try pp.errStr(name_tok, .missing_lparen_after_builtin, pp.expandedSlice(name_tok)); - } - // Not a macro function call, go over normal identifier, rewind - tokenizer.* = saved_tokenizer; - end_idx.* = old_end; - return error.MissingLParen; + .r_paren => return false, + .unterminated_comment => { + try pp.errTok(tok, .unterminated_comment); + return false; }, + else => {}, + } + if (pos != 0) { + if (tok.id != .comma) { + switch (tok.id) { + .nl, .eof => {}, + else => pp.skipToNl(), + } + try pp.errTok(tok, .expected_comma_param_list); + return error.InvalidMacroDef; + } + tok = pp.getToken(); + } + if (tok.id.isDirectiveEnd()) { + try pp.errTok(tok, .missing_paren_param_list); + return false; + } + if (tok.id == .ellipsis) { + try params.put(pp.gpa, "__VA_ARGS__", makeMacroToken(pos, true)); + pos += 1; + const r_paren = pp.getToken(); + if (r_paren.id != .r_paren) { + try pp.errTok(r_paren, .missing_paren_param_list); + try pp.errTok(l_paren, .to_match_paren); + return error.InvalidMacroDef; + } + return true; + } + if (!tok.id.isMacroIdentifier()) { + try pp.errTok(tok, .invalid_token_param_list); + return error.InvalidMacroDef; } + const arg = pp.tokSlice(tok); + if (try pp.next(.ellipsis)) { + const r_paren = pp.getToken(); + if (r_paren.id != .r_paren) { + try pp.errTok(r_paren, .missing_paren_param_list); + try pp.errTok(l_paren, .to_match_paren); + pp.skipToNl(); + } + try params.put(pp.gpa, arg, makeMacroToken(pos, true)); + pos += 1; + return true; + } + try params.put(pp.gpa, arg, makeMacroToken(pos, false)); + pos += 1; } +} - // collect the arguments. 
- var parens: u32 = 0; - var args = MacroArguments.init(pp.gpa); - errdefer deinitMacroArguments(pp.gpa, &args); - var curArgument = std.ArrayList(TokenWithExpansionLocs).init(pp.gpa); - defer curArgument.deinit(); +fn readFunclikeMacroBody(pp: *Preprocessor, params: *const ParamMap) ![]const PreprocessorToken { + var tokens: TokenList = .{}; + errdefer tokens.deinit(pp.gpa); while (true) { - var tok = try nextBufToken(pp, tokenizer, buf, start_idx, end_idx, extend_buf); - tok.flags.is_macro_arg = true; - switch (tok.id) { - .comma => { - if (parens == 0) { - const owned = try curArgument.toOwnedSlice(); - errdefer pp.gpa.free(owned); - try args.append(owned); - } else { - const duped = try tok.dupe(pp.gpa); - errdefer TokenWithExpansionLocs.free(duped.expansion_locs, pp.gpa); - try curArgument.append(duped); - } - }, - .l_paren => { - const duped = try tok.dupe(pp.gpa); - errdefer TokenWithExpansionLocs.free(duped.expansion_locs, pp.gpa); - try curArgument.append(duped); - parens += 1; - }, - .r_paren => { - if (parens == 0) { - const owned = try curArgument.toOwnedSlice(); - errdefer pp.gpa.free(owned); - try args.append(owned); - r_paren.* = tok; - break; - } else { - const duped = try tok.dupe(pp.gpa); - errdefer TokenWithExpansionLocs.free(duped.expansion_locs, pp.gpa); - try curArgument.append(duped); - parens -= 1; - } - }, - .eof => { - { - const owned = try curArgument.toOwnedSlice(); - errdefer pp.gpa.free(owned); - try args.append(owned); - } - tokenizer.* = saved_tokenizer; - try pp.comp.addDiagnostic( - .{ .tag = .unterminated_macro_arg_list, .loc = name_tok.loc }, - name_tok.expansionSlice(), - ); - return error.Unterminated; - }, - .nl, .whitespace => { - try curArgument.append(.{ .id = .macro_ws, .loc = tok.loc }); - }, - else => { - const duped = try tok.dupe(pp.gpa); - errdefer TokenWithExpansionLocs.free(duped.expansion_locs, pp.gpa); - try curArgument.append(duped); - }, + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) { + return tokens.toOwnedSlice(pp.gpa); + } + if (tok.id.isMacroIdentifier()) { + // const subst = params. + if (params.get(pp.tokSlice(tok))) |sub| { + var copy = sub; + copy.flags.space = tok.flags.space; + try tokens.append(pp.gpa, copy); + continue; + } } + try tokens.append(pp.gpa, tok); } - - return args; } -fn removeExpandedTokens(pp: *Preprocessor, buf: *ExpandBuf, start: usize, len: usize, moving_end_idx: *usize) !void { - for (buf.items[start .. start + len]) |tok| TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - try buf.replaceRange(start, len, &.{}); - moving_end_idx.* -|= len; +fn readFuncLikeMacro(pp: *Preprocessor, name: PreprocessorToken, l_paren: PreprocessorToken) Error!void { + var params: ParamMap = .{}; + defer params.deinit(pp.gpa); + const is_vararg = pp.readFunclikeMacroParams(name, l_paren, ¶ms) catch |err| switch (err) { + error.InvalidMacroDef => blk: { + pp.skipToNl(); + break :blk false; + }, + else => |e| return e, + }; + const body = try pp.readFunclikeMacroBody(¶ms); + errdefer pp.gpa.free(body); + try pp.hashHashCheck(body); + const macro: Macro = .{ + .tokens = body, + .var_args = is_vararg, + .loc = name.loc, + .kind = .func, + .nargs = params.count(), + }; + try pp.defineMacro(name, macro); } -/// The behavior of `defined` depends on whether we are in a preprocessor -/// expression context (#if or #elif) or not. -/// In a non-expression context it's just an identifier. Within a preprocessor -/// expression it is a unary operator or one-argument function. 
-const EvalContext = enum { - expr, - non_expr, -}; - -/// Helper for safely iterating over a slice of tokens while skipping whitespace -const TokenIterator = struct { - toks: []const TokenWithExpansionLocs, - i: usize, - - fn init(toks: []const TokenWithExpansionLocs) TokenIterator { - return .{ .toks = toks, .i = 0 }; +fn readDefine(pp: *Preprocessor) !void { + const name = try pp.readIdent() orelse { + pp.skipToNl(); + return; + }; + const next_tok = pp.getToken(); + if (next_tok.id == .l_paren and !next_tok.flags.space) { + try pp.readFuncLikeMacro(name, next_tok); + return; } + try pp.ungetToken(next_tok); + try pp.readObjMacro(name); +} - fn nextNoWS(self: *TokenIterator) ?TokenWithExpansionLocs { - while (self.i < self.toks.len) : (self.i += 1) { - const tok = self.toks[self.i]; - if (tok.id == .whitespace or tok.id == .macro_ws) continue; - - self.i += 1; - return tok; - } - return null; +fn doSkipSpace(pp: *Preprocessor) bool { + const saved_tokenizer = pp.tokenizers.items[pp.tokenizers.items.len - 1]; + const tok = pp.tokenizers.items[pp.tokenizers.items.len - 1].next(); + switch (tok.id) { + .eof => return false, + .whitespace, .comment => return true, + else => { + pp.tokenizers.items[pp.tokenizers.items.len - 1] = saved_tokenizer; + return false; + }, } -}; - -fn expandMacroExhaustive( - pp: *Preprocessor, - tokenizer: *Tokenizer, - buf: *ExpandBuf, - start_idx: usize, - end_idx: usize, - extend_buf: bool, - eval_ctx: EvalContext, -) MacroError!void { - var moving_end_idx = end_idx; - var advance_index: usize = 0; - // rescan loop - var do_rescan = true; - while (do_rescan) { - do_rescan = false; - // expansion loop - var idx: usize = start_idx + advance_index; - while (idx < moving_end_idx) { - const macro_tok = buf.items[idx]; - if (macro_tok.id == .keyword_defined and eval_ctx == .expr) { - idx += 1; - var it = TokenIterator.init(buf.items[idx..moving_end_idx]); - if (it.nextNoWS()) |tok| { - switch (tok.id) { - .l_paren => { - _ = it.nextNoWS(); // eat (what should be) identifier - _ = it.nextNoWS(); // eat (what should be) r paren - }, - .identifier, .extended_identifier => {}, - else => {}, - } - } - idx += it.i; - continue; - } - if (!macro_tok.id.isMacroIdentifier() or macro_tok.flags.expansion_disabled) { - idx += 1; - continue; - } - const expanded = pp.expandedSlice(macro_tok); - const macro = pp.defines.getPtr(expanded) orelse { - idx += 1; - continue; - }; - const macro_hidelist = pp.hideset.get(macro_tok.loc); - if (pp.hideset.contains(macro_hidelist, expanded)) { - idx += 1; - continue; - } - - macro_handler: { - if (macro.is_func) { - var r_paren: TokenWithExpansionLocs = undefined; - var macro_scan_idx = idx; - // to be saved in case this doesn't turn out to be a call - const args = pp.collectMacroFuncArguments( - tokenizer, - buf, - ¯o_scan_idx, - &moving_end_idx, - extend_buf, - macro.is_builtin, - &r_paren, - ) catch |er| switch (er) { - error.MissingLParen => { - if (!buf.items[idx].flags.is_macro_arg) buf.items[idx].flags.expansion_disabled = true; - idx += 1; - break :macro_handler; - }, - error.Unterminated => { - if (pp.comp.langopts.emulate == .gcc) idx += 1; - try pp.removeExpandedTokens(buf, idx, macro_scan_idx - idx, &moving_end_idx); - break :macro_handler; - }, - else => |e| return e, - }; - assert(r_paren.id == .r_paren); - var free_arg_expansion_locs = false; - defer { - for (args.items) |item| { - if (free_arg_expansion_locs) for (item) |tok| TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - pp.gpa.free(item); - } - args.deinit(); - } 
- const r_paren_hidelist = pp.hideset.get(r_paren.loc); - var hs = try pp.hideset.intersection(macro_hidelist, r_paren_hidelist); - hs = try pp.hideset.prepend(macro_tok.loc, hs); - - var args_count: u32 = @intCast(args.items.len); - // if the macro has zero arguments g() args_count is still 1 - // an empty token list g() and a whitespace-only token list g( ) - // counts as zero arguments for the purposes of argument-count validation - if (args_count == 1 and macro.params.len == 0) { - for (args.items[0]) |tok| { - if (tok.id != .macro_ws) break; - } else { - args_count = 0; - } - } - - // Validate argument count. - const extra = Diagnostics.Message.Extra{ - .arguments = .{ .expected = @intCast(macro.params.len), .actual = args_count }, - }; - if (macro.var_args and args_count < macro.params.len) { - free_arg_expansion_locs = true; - try pp.comp.addDiagnostic( - .{ .tag = .expected_at_least_arguments, .loc = buf.items[idx].loc, .extra = extra }, - buf.items[idx].expansionSlice(), - ); - idx += 1; - try pp.removeExpandedTokens(buf, idx, macro_scan_idx - idx + 1, &moving_end_idx); - continue; - } - if (!macro.var_args and args_count != macro.params.len) { - free_arg_expansion_locs = true; - try pp.comp.addDiagnostic( - .{ .tag = .expected_arguments, .loc = buf.items[idx].loc, .extra = extra }, - buf.items[idx].expansionSlice(), - ); - idx += 1; - try pp.removeExpandedTokens(buf, idx, macro_scan_idx - idx + 1, &moving_end_idx); - continue; - } - var expanded_args = MacroArguments.init(pp.gpa); - defer deinitMacroArguments(pp.gpa, &expanded_args); - try expanded_args.ensureTotalCapacity(args.items.len); - for (args.items) |arg| { - var expand_buf = ExpandBuf.init(pp.gpa); - errdefer expand_buf.deinit(); - try expand_buf.appendSlice(arg); - - try pp.expandMacroExhaustive(tokenizer, &expand_buf, 0, expand_buf.items.len, false, eval_ctx); - - expanded_args.appendAssumeCapacity(try expand_buf.toOwnedSlice()); - } - - var res = try pp.expandFuncMacro(macro_tok, macro, &args, &expanded_args, hs); - defer res.deinit(); - const tokens_added = res.items.len; - const tokens_removed = macro_scan_idx - idx + 1; - for (buf.items[idx .. idx + tokens_removed]) |tok| TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - try buf.replaceRange(idx, tokens_removed, res.items); - - moving_end_idx += tokens_added; - // Overflow here means that we encountered an unterminated argument list - // while expanding the body of this macro. - moving_end_idx -|= tokens_removed; - idx += tokens_added; - do_rescan = true; - } else { - const res = try pp.expandObjMacro(macro); - defer res.deinit(); - - const hs = try pp.hideset.prepend(macro_tok.loc, macro_hidelist); - - const macro_expansion_locs = macro_tok.expansionSlice(); - var increment_idx_by = res.items.len; - for (res.items, 0..) 
|*tok, i| { - tok.flags.is_macro_arg = macro_tok.flags.is_macro_arg; - try tok.addExpansionLocation(pp.gpa, &.{macro_tok.loc}); - try tok.addExpansionLocation(pp.gpa, macro_expansion_locs); - - const tok_hidelist = pp.hideset.get(tok.loc); - const new_hidelist = try pp.hideset.@"union"(tok_hidelist, hs); - try pp.hideset.put(tok.loc, new_hidelist); - - if (tok.id == .keyword_defined and eval_ctx == .expr) { - try pp.comp.addDiagnostic(.{ - .tag = .expansion_to_defined, - .loc = tok.loc, - }, tok.expansionSlice()); - } - - if (i < increment_idx_by and (tok.id == .keyword_defined or pp.defines.contains(pp.expandedSlice(tok.*)))) { - increment_idx_by = i; - } - } +} - TokenWithExpansionLocs.free(buf.items[idx].expansion_locs, pp.gpa); - try buf.replaceRange(idx, 1, res.items); - idx += increment_idx_by; - moving_end_idx = moving_end_idx + res.items.len - 1; - do_rescan = true; - } - } - if (idx - start_idx == advance_index + 1 and !do_rescan) { - advance_index += 1; - } - } // end of replacement phase +/// Skips spaces including comments. +/// Returns true if at least one space is skipped. +fn skipSpace(pp: *Preprocessor) bool { + if (!pp.doSkipSpace()) { + return false; } - // end of scanning phase + while (pp.doSkipSpace()) {} + return true; +} - // trim excess buffer - for (buf.items[moving_end_idx..]) |item| { - TokenWithExpansionLocs.free(item.expansion_locs, pp.gpa); +/// Read the next raw token from the tokenizer stack +fn lexToken(pp: *Preprocessor) PreprocessorToken { + if (pp.skipSpace()) { + return .{ .id = .whitespace, .loc = undefined }; } - buf.items.len = moving_end_idx; + const tok = pp.tokenizers.items[pp.tokenizers.items.len - 1].next(); + return .{ + .id = tok.id, + .flags = .{ + .is_bol = tok.bol, + }, + .loc = .{ + .id = tok.source, + .byte_offset = tok.start, + .line = tok.line, + }, + }; } -/// Try to expand a macro after a possible candidate has been read from the `tokenizer` -/// into the `raw` token passed as argument -fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroError!void { - var source_tok = tokFromRaw(raw); - if (!raw.id.isMacroIdentifier()) { - source_tok.id.simplifyMacroKeyword(); - return pp.addToken(source_tok); +/// Read the next token without expanding it +fn getToken(pp: *Preprocessor) PreprocessorToken { + if (!pp.isBufferEmpty() and pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items.len > 0) { + return pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].pop(); + } + if (pp.expansion_bufs.items.len > 1) { + return .{ .id = .eof, .loc = undefined }; } - pp.top_expansion_buf.items.len = 0; - try pp.top_expansion_buf.append(source_tok); - pp.expansion_source_loc = source_tok.loc; + const bol = pp.tokenizers.items[pp.tokenizers.items.len - 1].bol; + var tok = pp.lexToken(); + while (tok.id == .whitespace) { + tok = pp.lexToken(); + tok.flags.space = true; + } + tok.flags.is_bol = bol; + return tok; +} - pp.hideset.clearRetainingCapacity(); - try pp.expandMacroExhaustive(tokenizer, &pp.top_expansion_buf, 0, 1, true, .non_expr); - try pp.ensureUnusedTokenCapacity(pp.top_expansion_buf.items.len); - for (pp.top_expansion_buf.items) |*tok| { - if (tok.id == .macro_ws and !pp.preserve_whitespace) { - TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - continue; - } - if (tok.id == .comment and !pp.comp.langopts.preserve_comments_in_macros) { - TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - continue; - } - if (tok.id == .placemarker) { - TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - 
continue; - } - tok.id.simplifyMacroKeywordExtra(true); - pp.addTokenAssumeCapacity(tok.*); - } - if (pp.preserve_whitespace) { - try pp.ensureUnusedTokenCapacity(pp.add_expansion_nl); - while (pp.add_expansion_nl > 0) : (pp.add_expansion_nl -= 1) { - pp.addTokenAssumeCapacity(.{ .id = .nl, .loc = .{ - .id = tokenizer.source, - .line = tokenizer.line, - } }); +fn readDefinedOp(pp: *Preprocessor) !PreprocessorToken { + var tok = pp.getToken(); + if (tok.id == .l_paren) { + tok = pp.getToken(); + const r_paren = pp.getToken(); + if (r_paren.id != .r_paren) { + try pp.errStr(r_paren, .closing_paren_after, "defined"); } } + if (!tok.id.isMacroIdentifier()) { + try pp.errTok(tok, .macro_name_must_be_identifier); + } + const slice = pp.tokSlice(tok); + if (pp.defines.contains(slice)) { + return PreprocessorToken.one; + } + return PreprocessorToken.zero; } -fn expandedSliceExtra(pp: *const Preprocessor, tok: anytype, macro_ws_handling: enum { single_macro_ws, preserve_macro_ws }) []const u8 { - if (tok.id.lexeme()) |some| { - if (!tok.id.allowsDigraphs(pp.comp.langopts) and !(tok.id == .macro_ws and macro_ws_handling == .preserve_macro_ws)) return some; - } - var tmp_tokenizer = Tokenizer{ - .buf = pp.comp.getSource(tok.loc.id).buf, - .langopts = pp.comp.langopts, - .index = tok.loc.byte_offset, - .source = .generated, - }; - if (tok.id == .macro_string) { - while (true) : (tmp_tokenizer.index += 1) { - if (tmp_tokenizer.buf[tmp_tokenizer.index] == '>') break; +fn readIntExprLine(pp: *Preprocessor) !void { + while (true) { + const tok = try pp.readExpandNewline(); + if (tok.id.isDirectiveEnd()) break; + if (tok.id == .keyword_defined) { + const result = try pp.readDefinedOp(); + try pp.addToken(result); + } else if (tok.id.isMacroIdentifier()) { + try pp.addToken(PreprocessorToken.zero); + } else { + try pp.addToken(tok); } - return tmp_tokenizer.buf[tok.loc.byte_offset .. tmp_tokenizer.index + 1]; } - const res = tmp_tokenizer.next(); - return tmp_tokenizer.buf[res.start..res.end]; + try pp.addToken(.{ .id = .eof, .loc = .{} }); } -/// Get expanded token source string. 
-pub fn expandedSlice(pp: *const Preprocessor, tok: anytype) []const u8 { - return pp.expandedSliceExtra(tok, .single_macro_ws); -} +fn readConstexpr(pp: *Preprocessor) !bool { + const start = pp.tokens.len; + defer pp.tokens.len = start; + try pp.readIntExprLine(); -/// Concat two tokens and add the result to pp.generated -fn pasteTokens(pp: *Preprocessor, lhs_toks: *ExpandBuf, rhs_toks: []const TokenWithExpansionLocs) Error!void { - const lhs = while (lhs_toks.popOrNull()) |lhs| { - if ((pp.comp.langopts.preserve_comments_in_macros and lhs.id == .comment) or - (lhs.id != .macro_ws and lhs.id != .comment)) - break lhs; + var parser = Parser{ + .pp = pp, + .comp = pp.comp, + .gpa = pp.gpa, + .tok_ids = pp.tokens.items(.id), + .tok_i = @intCast(start), + .arena = undefined, + .in_macro = true, + .strings = std.ArrayListAligned(u8, 4).init(pp.comp.gpa), - TokenWithExpansionLocs.free(lhs.expansion_locs, pp.gpa); - } else { - return bufCopyTokens(lhs_toks, rhs_toks, &.{}); + .data = undefined, + .value_map = undefined, + .labels = undefined, + .decl_buf = undefined, + .list_buf = undefined, + .param_buf = undefined, + .enum_buf = undefined, + .record_buf = undefined, + .attr_buf = undefined, + .field_attr_buf = undefined, + .string_ids = undefined, }; + defer parser.strings.deinit(); + return parser.macroExpr(); +} - var rhs_rest: u32 = 1; - const rhs = for (rhs_toks) |rhs| { - if ((pp.comp.langopts.preserve_comments_in_macros and rhs.id == .comment) or - (rhs.id != .macro_ws and rhs.id != .comment)) - break rhs; +/// #line number "file" +/// TODO: validate that the pp_num token is solely digits +fn readLine(pp: *Preprocessor) Error!void { + const digits = pp.getToken(); + if (digits.id != .pp_num) try pp.errTok(digits, .line_simple_digit); - rhs_rest += 1; - } else { - return lhs_toks.appendAssumeCapacity(lhs); - }; - defer TokenWithExpansionLocs.free(lhs.expansion_locs, pp.gpa); + if (digits.id.isDirectiveEnd()) return; + const name = pp.getToken(); + if (name.id.isDirectiveEnd()) return; + if (name.id != .string_literal) try pp.errTok(name, .line_invalid_filename); + try pp.expectNewline(); +} - const start = pp.comp.generated_buf.items.len; - const end = start + pp.expandedSlice(lhs).len + pp.expandedSlice(rhs).len; - try pp.comp.generated_buf.ensureTotalCapacity(pp.gpa, end + 1); // +1 for a newline - // We cannot use the same slices here since they might be invalidated by `ensureCapacity` - pp.comp.generated_buf.appendSliceAssumeCapacity(pp.expandedSlice(lhs)); - pp.comp.generated_buf.appendSliceAssumeCapacity(pp.expandedSlice(rhs)); - pp.comp.generated_buf.appendAssumeCapacity('\n'); +fn readPragma(pp: *Preprocessor, pragma_tok: PreprocessorToken) Error!void { + const name_tok = pp.getToken(); + if (name_tok.id == .nl or name_tok.id == .eof) return; - // Try to tokenize the result. 
- var tmp_tokenizer = Tokenizer{ - .buf = pp.comp.generated_buf.items, - .langopts = pp.comp.langopts, - .index = @intCast(start), - .source = .generated, - }; - const pasted_token = tmp_tokenizer.nextNoWSComments(); - const next = tmp_tokenizer.nextNoWSComments(); - const pasted_id = if (lhs.id == .placemarker and rhs.id == .placemarker) - .placemarker - else - pasted_token.id; - try lhs_toks.append(try pp.makeGeneratedToken(start, pasted_id, lhs)); - - if (next.id != .nl and next.id != .eof) { - try pp.errStr( - lhs, - .pasting_formed_invalid, - try pp.comp.diagnostics.arena.allocator().dupe(u8, pp.comp.generated_buf.items[start..end]), - ); - try lhs_toks.append(tokFromRaw(next)); + try pp.addToken(pragma_tok); + + const pragma_start: u32 = @intCast(pp.tokens.len); + try pp.addToken(name_tok); + + while (true) { + const next_tok = pp.getToken(); + if (next_tok.id == .eof) { + try pp.addToken(.{ + .id = .nl, + .loc = .{ .id = .generated }, + }); + break; + } + try pp.addToken(next_tok); + if (next_tok.id == .nl) break; + } + const name = pp.tokSlice(name_tok); + if (pp.comp.getPragma(name)) |prag| unknown: { + return prag.preprocessorCB(pp, pragma_start) catch |er| switch (er) { + error.UnknownPragma => break :unknown, + error.StopPreprocessing => { + _ = pp.tokenizers.pop(); + return; + }, + else => |e| return e, + }; } + return pp.errTok(name_tok, .unknown_pragma); +} - try bufCopyTokens(lhs_toks, rhs_toks[rhs_rest..], &.{}); +fn readUndef(pp: *Preprocessor) Error!void { + const name = try pp.readIdent() orelse { + pp.skipToNl(); + return; + }; + try pp.expectNewline(); + _ = pp.defines.remove(pp.tokSlice(name)); } -fn makeGeneratedToken(pp: *Preprocessor, start: usize, id: Token.Id, source: TokenWithExpansionLocs) !TokenWithExpansionLocs { - var pasted_token = TokenWithExpansionLocs{ .id = id, .loc = .{ - .id = .generated, - .byte_offset = @intCast(start), - .line = pp.generated_line, - } }; - pp.generated_line += 1; - try pasted_token.addExpansionLocation(pp.gpa, &.{source.loc}); - try pasted_token.addExpansionLocation(pp.gpa, source.expansionSlice()); - return pasted_token; +/// Skip until after a newline, error if extra tokens before it. +fn expectNewline(pp: *Preprocessor) !void { + var sent_err = false; + while (true) { + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) return; + if (tok.id == .whitespace or tok.id == .comment) continue; + if (!sent_err) { + sent_err = true; + try pp.errTok(tok, .extra_tokens_directive_end); + } + } } -/// Defines a new macro and warns if it is a duplicate -fn defineMacro(pp: *Preprocessor, name_tok: RawToken, macro: Macro) Error!void { - const name_str = pp.tokSlice(name_tok); - const gop = try pp.defines.getOrPut(pp.gpa, name_str); - if (gop.found_existing and !gop.value_ptr.eql(macro, pp)) { - const tag: Diagnostics.Tag = if (gop.value_ptr.is_builtin) .builtin_macro_redefined else .macro_redefined; - const start = pp.comp.diagnostics.list.items.len; +/// TODO: pragma once +fn readIncludeExtra(pp: *Preprocessor, include_token: PreprocessorToken, which: Compilation.WhichInclude) Error!void { + var is_std: bool = undefined; + const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { + error.InvalidInclude => return, + else => |e| return e, + }; + try pp.expectNewline(); + + const filename = include_str[1 .. 
include_str.len - 1]; + const include_type: Compilation.IncludeType = switch (include_str[0]) { + '"' => .quotes, + '<' => .angle_brackets, + else => unreachable, + }; + const tok: RawToken = .{ .id = include_token.id, .source = include_token.loc.id, .start = include_token.loc.byte_offset, .line = include_token.loc.line }; + const source = (try pp.comp.findInclude(filename, tok, include_type, which)) orelse return pp.fatalNotFound(include_token, filename); + if (pp.include_guards.get(source.id)) |guard| { + if (pp.defines.contains(guard)) return; + } + const guard = pp.findIncludeGuard(source); + try pp.guard_stack.append(pp.gpa, guard); + + if (pp.tokenizers.items.len > max_include_depth) { try pp.comp.addDiagnostic(.{ - .tag = tag, - .loc = .{ .id = name_tok.source, .byte_offset = name_tok.start, .line = name_tok.line }, - .extra = .{ .str = name_str }, + .tag = .too_many_includes, + .loc = include_token.loc, }, &.{}); - if (!gop.value_ptr.is_builtin and pp.comp.diagnostics.list.items.len != start) { - try pp.comp.addDiagnostic(.{ - .tag = .previous_definition, - .loc = gop.value_ptr.loc, - }, &.{}); - } + return error.FatalError; } - if (pp.verbose) { - pp.verboseLog(name_tok, "macro {s} defined", .{name_str}); - } - gop.value_ptr.* = macro; + pp.preprocess_count += 1; + try pp.addIncludeStart(source); + try pp.tokenizers.append(pp.gpa, .{ + .buf = source.buf, + .langopts = pp.comp.langopts, + .index = 0, + .source = source.id, + }); } -/// Handle a #define directive. -fn define(pp: *Preprocessor, tokenizer: *Tokenizer) Error!void { - // Get macro name and validate it. - const macro_name = tokenizer.nextNoWS(); - if (macro_name.id == .keyword_defined) { - try pp.err(macro_name, .defined_as_macro_name); - return skipToNl(tokenizer); +/// Read a header name delimited by quotes or angle brackets +fn readHeaderFileName(pp: *Preprocessor, is_std: *bool) !?[]const u8 { + if (!pp.isBufferEmpty()) return null; + _ = pp.skipSpace(); + + var close: u8 = undefined; + var tokenizer = pp.tokenizers.items[pp.tokenizers.items.len - 1]; + defer pp.tokenizers.items[pp.tokenizers.items.len - 1] = tokenizer; + + if (tokenizer.buf[tokenizer.index..].len < 2) { + return null; } - if (!macro_name.id.isMacroIdentifier()) { - try pp.err(macro_name, .macro_name_must_be_identifier); - return skipToNl(tokenizer); - } - var macro_name_token_id = macro_name.id; - macro_name_token_id.simplifyMacroKeyword(); - switch (macro_name_token_id) { - .identifier, .extended_identifier => {}, - else => if (macro_name_token_id.isMacroIdentifier()) { - try pp.err(macro_name, .keyword_macro); + const start = tokenizer.index; + switch (tokenizer.buf[tokenizer.index..][0]) { + '"' => { + is_std.* = false; + close = '"'; }, + '<' => { + is_std.* = true; + close = '>'; + }, + else => return null, } + tokenizer.index += 1; + while (tokenizer.index < tokenizer.buf.len and tokenizer.buf[tokenizer.index] != close and tokenizer.buf[tokenizer.index] != '\n') : (tokenizer.index += 1) {} - // Check for function macros and empty defines. 
- var first = tokenizer.next(); - switch (first.id) { - .nl, .eof => return pp.defineMacro(macro_name, .{ - .params = &.{}, - .tokens = &.{}, - .var_args = false, - .loc = tokFromRaw(macro_name).loc, - .is_func = false, - }), - .whitespace => first = tokenizer.next(), - .l_paren => return pp.defineFn(tokenizer, macro_name, first), - else => try pp.err(first, .whitespace_after_macro_name), - } - if (first.id == .hash_hash) { - try pp.err(first, .hash_hash_at_start); - return skipToNl(tokenizer); + if (tokenizer.index == tokenizer.buf.len or tokenizer.buf[tokenizer.index] != close) { + try pp.errTok(.{ .id = undefined, .loc = .{ .id = tokenizer.source, .byte_offset = tokenizer.index, .line = tokenizer.line } }, .header_str_closing); + try pp.errTok(.{ .id = undefined, .loc = .{ .id = tokenizer.source, .byte_offset = start, .line = tokenizer.line } }, .header_str_match); + return error.InvalidInclude; } - first.id.simplifyMacroKeyword(); - pp.token_buf.items.len = 0; // Safe to use since we can only be in one directive at a time. + tokenizer.index += 1; - var need_ws = false; - // Collect the token body and validate any ## found. - var tok = first; - while (true) { - tok.id.simplifyMacroKeyword(); - switch (tok.id) { - .hash_hash => { - const next = tokenizer.nextNoWSComments(); - switch (next.id) { - .nl, .eof => { - try pp.err(tok, .hash_hash_at_end); - return; - }, - .hash_hash => { - try pp.err(next, .hash_hash_at_end); - return; - }, - else => {}, - } - try pp.token_buf.append(tok); - try pp.token_buf.append(next); - }, - .nl, .eof => break, - .comment => if (pp.comp.langopts.preserve_comments_in_macros) { - if (need_ws) { - need_ws = false; - try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); - } - try pp.token_buf.append(tok); - }, - .whitespace => need_ws = true, - .unterminated_string_literal, .unterminated_char_literal, .empty_char_literal => |tag| { - try pp.err(tok, invalidTokenDiagnostic(tag)); - try pp.token_buf.append(tok); - }, - .unterminated_comment => try pp.err(tok, .unterminated_comment), - else => { - if (tok.id != .whitespace and need_ws) { - need_ws = false; - try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); - } - try pp.token_buf.append(tok); - }, - } - tok = tokenizer.next(); + const buf = tokenizer.buf[start..tokenizer.index]; + if (buf.len == 2) { + try pp.errTok(.{ .id = .nl, .loc = .{ .id = tokenizer.source, .byte_offset = start, .line = tokenizer.line } }, .empty_filename); + return error.InvalidInclude; } + return buf; +} - const list = try pp.arena.allocator().dupe(RawToken, pp.token_buf.items); - try pp.defineMacro(macro_name, .{ - .loc = tokFromRaw(macro_name).loc, - .tokens = list, - .params = undefined, - .is_func = false, - .var_args = false, - }); +fn isBufferEmpty(pp: *const Preprocessor) bool { + return pp.expansion_bufs.items.len == 0; } -/// Handle a function like #define directive. -fn defineFn(pp: *Preprocessor, tokenizer: *Tokenizer, macro_name: RawToken, l_paren: RawToken) Error!void { - assert(macro_name.id.isMacroIdentifier()); - var params = std.ArrayList([]const u8).init(pp.gpa); - defer params.deinit(); +/// Read a delimited header name, or a macro expanded one +fn readHeaderName(pp: *Preprocessor, is_std: *bool) ![]const u8 { + if (try pp.readHeaderFileName(is_std)) |path| return path; - // Parse the parameter list. - var gnu_var_args: []const u8 = ""; - var var_args = false; + // If a token following #include does not start with < nor ", + // try to read the token as a regular token. 
Macro-expanded + // form may be a valid header file path. + const tok = try pp.readExpandNewline(); + if (tok.id.isDirectiveEnd()) { + try pp.errTok(tok, .expected_filename); + return error.InvalidInclude; + } + if (tok.id == .string_literal) { + is_std.* = false; + return pp.tokSlice(tok); + } + if (tok.id != .angle_bracket_left) { + try pp.errStr(tok, .expected_left_angle_bracket, pp.tokSlice(tok)); + return error.InvalidInclude; + } + const start = pp.char_buf.items.len; + try pp.char_buf.append(pp.gpa, '<'); + defer pp.char_buf.items.len = start; + const writer = pp.char_buf.writer(pp.gpa); while (true) { - var tok = tokenizer.nextNoWS(); - if (tok.id == .r_paren) break; - if (tok.id == .eof) return pp.err(tok, .unterminated_macro_param_list); - if (tok.id == .ellipsis) { - var_args = true; - const r_paren = tokenizer.nextNoWS(); - if (r_paren.id != .r_paren) { - try pp.err(r_paren, .missing_paren_param_list); - try pp.err(l_paren, .to_match_paren); - return skipToNl(tokenizer); - } - break; + const path_tok = try pp.readExpandNewline(); + if (path_tok.id == .nl) { + try pp.errTok(path_tok, .header_str_closing); + try pp.errTok(tok, .header_str_match); + return error.InvalidInclude; } - if (!tok.id.isMacroIdentifier()) { - try pp.err(tok, .invalid_token_param_list); - return skipToNl(tokenizer); + if (path_tok.id == .angle_bracket_right) { + break; } + try pp.prettyPrintToken(writer, path_tok.toTreeToken()); + } + is_std.* = true; + try pp.char_buf.append(pp.gpa, '>'); + return pp.gpa.dupe(u8, pp.char_buf.items[start..]); +} - try params.append(pp.tokSlice(tok)); +fn readInclude(pp: *Preprocessor, include_token: PreprocessorToken) Error!void { + return pp.readIncludeExtra(include_token, .first); +} - tok = tokenizer.nextNoWS(); - if (tok.id == .ellipsis) { - try pp.err(tok, .gnu_va_macro); - gnu_var_args = params.pop(); - const r_paren = tokenizer.nextNoWS(); - if (r_paren.id != .r_paren) { - try pp.err(r_paren, .missing_paren_param_list); - try pp.err(l_paren, .to_match_paren); - return skipToNl(tokenizer); - } - break; - } else if (tok.id == .r_paren) { - break; - } else if (tok.id != .comma) { - try pp.err(tok, .expected_comma_param_list); - return skipToNl(tokenizer); +fn readIncludeNext(pp: *Preprocessor, include_token: PreprocessorToken) Error!void { + return pp.readIncludeExtra(include_token, .next); +} + +fn readErrorMessage(pp: *Preprocessor, directive_tok: PreprocessorToken, tag: Diagnostics.Tag) !void { + const char_top = pp.char_buf.items.len; + defer pp.char_buf.items.len = char_top; + var i: usize = 0; + while (true) : (i += 1) { + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) break; + const slice = pp.tokSlice(tok); + if (slice.len > 0 and tok.flags.space and i != 0) { + try pp.char_buf.append(pp.gpa, ' '); } + try pp.char_buf.appendSlice(pp.gpa, slice); + } + const slice = pp.char_buf.items[char_top..]; + const duped = try pp.comp.diagnostics.arena.allocator().dupe(u8, slice); + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = directive_tok.loc, + .extra = .{ .str = duped }, + }, &.{}); +} + +fn clearGuard(pp: *Preprocessor) void { + pp.guard_stack.items[pp.guard_stack.items.len - 1] = null; +} + +fn readDirective(pp: *Preprocessor) Error!void { + const directive = pp.getToken(); + if (directive.id.isDirectiveEnd()) return; + if (directive.id == .pp_num) { + return pp.readLinemarker(); } - var need_ws = false; - // Collect the body tokens and validate # and ##'s found. 
- pp.token_buf.items.len = 0; // Safe to use since we can only be in one directive at a time. - tok_loop: while (true) { - var tok = tokenizer.next(); - switch (tok.id) { - .nl, .eof => break, - .whitespace => need_ws = pp.token_buf.items.len != 0, - .comment => if (!pp.comp.langopts.preserve_comments_in_macros) continue else { - if (need_ws) { - need_ws = false; - try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); - } - try pp.token_buf.append(tok); - }, - .hash => { - if (tok.id != .whitespace and need_ws) { - need_ws = false; - try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); - } - const param = tokenizer.nextNoWS(); - blk: { - if (var_args and param.id == .keyword_va_args) { - tok.id = .stringify_va_args; - try pp.token_buf.append(tok); - continue :tok_loop; + const until_else = 0; + const until_endif = 1; + const until_endif_seen_else = 2; + + switch (directive.id) { + .keyword_define => try pp.readDefine(), + .keyword_elif => { + if (pp.if_level == 0) { + try pp.errTok(directive, .elif_without_if); + pp.if_level += 1; + pp.if_kind.set(pp.if_level, until_else); + } else if (pp.if_level == 1) { + pp.clearGuard(); + } + switch (pp.if_kind.get(pp.if_level)) { + until_else => if (try pp.readConstexpr()) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #elif", .{}); } - if (!param.id.isMacroIdentifier()) break :blk; - const s = pp.tokSlice(param); - if (mem.eql(u8, s, gnu_var_args)) { - tok.id = .stringify_va_args; - try pp.token_buf.append(tok); - continue :tok_loop; + } else { + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elif", .{}); } - for (params.items, 0..) |p, i| { - if (mem.eql(u8, p, s)) { - tok.id = .stringify_param; - tok.end = @intCast(i); - try pp.token_buf.append(tok); - continue :tok_loop; - } + }, + until_endif => try pp.skip(.until_endif), + until_endif_seen_else => { + try pp.errTok(directive, .elif_after_else); + pp.skipToNl(); + }, + else => unreachable, + } + }, + .keyword_else => { + try pp.expectNewline(); + if (pp.if_level == 0) { + try pp.errTok(directive, .else_without_if); + return; + } else if (pp.if_level == 1) { + pp.clearGuard(); + } + switch (pp.if_kind.get(pp.if_level)) { + until_else => { + pp.if_kind.set(pp.if_level, until_endif_seen_else); + if (pp.verbose) { + pp.verboseLog(directive, "#else branch here", .{}); } - } - try pp.err(param, .hash_not_followed_param); - return skipToNl(tokenizer); - }, - .hash_hash => { - need_ws = false; - // if ## appears at the beginning, the token buf is still empty - // in this case, error out - if (pp.token_buf.items.len == 0) { - try pp.err(tok, .hash_hash_at_start); - return skipToNl(tokenizer); - } + }, + until_endif => try pp.skip(.until_endif_seen_else), + until_endif_seen_else => { + try pp.errTok(directive, .else_after_else); + pp.skipToNl(); + }, + else => unreachable, + } + }, + .keyword_endif => { + try pp.expectNewline(); + if (pp.if_level == 0) { + pp.clearGuard(); + try pp.errTok(directive, .endif_without_if); + return; + } else if (pp.if_level == 1) { + var tokenizer = &pp.tokenizers.items[pp.tokenizers.items.len - 1]; const saved_tokenizer = tokenizer.*; - const next = tokenizer.nextNoWSComments(); - if (next.id == .nl or next.id == .eof) { - try pp.err(tok, .hash_hash_at_end); - return; + defer tokenizer.* = saved_tokenizer; + + var next_tok = tokenizer.nextNoWS(); + while (next_tok.id == .nl) : (next_tok = tokenizer.nextNoWS()) {} + if 
(next_tok.id != .eof) pp.clearGuard(); + } + pp.if_level -= 1; + }, + .keyword_error => try pp.readErrorMessage(directive, .error_directive), + .keyword_if => { + const sum, const overflowed = @addWithOverflow(pp.if_level, 1); + if (overflowed != 0) + return pp.fatal(directive, "too many #if nestings", .{}); + pp.if_level = sum; + + if (try pp.readConstexpr()) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #if", .{}); } - tokenizer.* = saved_tokenizer; - // convert the previous token to .macro_param_no_expand if it was .macro_param - if (pp.token_buf.items[pp.token_buf.items.len - 1].id == .macro_param) { - pp.token_buf.items[pp.token_buf.items.len - 1].id = .macro_param_no_expand; + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #if", .{}); } - try pp.token_buf.append(tok); - }, - .unterminated_string_literal, .unterminated_char_literal, .empty_char_literal => |tag| { - try pp.err(tok, invalidTokenDiagnostic(tag)); - try pp.token_buf.append(tok); - }, - .unterminated_comment => try pp.err(tok, .unterminated_comment), - else => { - if (tok.id != .whitespace and need_ws) { - need_ws = false; - try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); + } + }, + .keyword_ifdef => { + const sum, const overflowed = @addWithOverflow(pp.if_level, 1); + if (overflowed != 0) + return pp.fatal(directive, "too many #if nestings", .{}); + pp.if_level = sum; + + const macro_name = (try pp.expectMacroName()) orelse return; + try pp.expectNewline(); + if (pp.defines.get(macro_name) != null) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #ifdef", .{}); } - if (var_args and tok.id == .keyword_va_args) { - // do nothing - } else if (var_args and tok.id == .keyword_va_opt) { - const opt_l_paren = tokenizer.next(); - if (opt_l_paren.id != .l_paren) { - try pp.err(opt_l_paren, .va_opt_lparen); - return skipToNl(tokenizer); - } - tok.start = opt_l_paren.end; - - var parens: u32 = 0; - while (true) { - const opt_tok = tokenizer.next(); - switch (opt_tok.id) { - .l_paren => parens += 1, - .r_paren => if (parens == 0) { - break; - } else { - parens -= 1; - }, - .nl, .eof => { - try pp.err(opt_tok, .va_opt_rparen); - try pp.err(opt_l_paren, .to_match_paren); - return skipToNl(tokenizer); - }, - .whitespace => {}, - else => tok.end = opt_tok.end, + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #ifdef", .{}); + } + } + }, + .keyword_ifndef => { + const sum, const overflowed = @addWithOverflow(pp.if_level, 1); + if (overflowed != 0) + return pp.fatal(directive, "too many #if nestings", .{}); + pp.if_level = sum; + + const macro_name = (try pp.expectMacroName()) orelse return; + try pp.expectNewline(); + if (pp.defines.get(macro_name) == null) { + pp.if_kind.set(pp.if_level, until_endif); + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + } + }, + .keyword_elifdef => { + if (pp.if_level == 0) { + try pp.errTok(directive, .elifdef_without_if); + pp.if_level += 1; + pp.if_kind.set(pp.if_level, until_else); + } else if (pp.if_level == 1) { + pp.clearGuard(); + } + switch (pp.if_kind.get(pp.if_level)) { + until_else => { + const macro_name = try pp.expectMacroName(); + if (macro_name == null) { + pp.if_kind.set(pp.if_level, 
until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifdef", .{}); + } + } else { + try pp.expectNewline(); + if (pp.defines.get(macro_name.?) != null) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #elifdef", .{}); + } + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifdef", .{}); + } } } - } else if (tok.id.isMacroIdentifier()) { - tok.id.simplifyMacroKeyword(); - const s = pp.tokSlice(tok); - if (mem.eql(u8, gnu_var_args, s)) { - tok.id = .keyword_va_args; - } else for (params.items, 0..) |param, i| { - if (mem.eql(u8, param, s)) { - // NOTE: it doesn't matter to assign .macro_param_no_expand - // here in case a ## was the previous token, because - // ## processing will eat this token with the same semantics - tok.id = .macro_param; - tok.end = @intCast(i); - break; + }, + until_endif => try pp.skip(.until_endif), + until_endif_seen_else => { + try pp.errTok(directive, .elifdef_after_else); + pp.skipToNl(); + }, + else => unreachable, + } + }, + .keyword_elifndef => { + if (pp.if_level == 0) { + try pp.errTok(directive, .elifdef_without_if); + pp.if_level += 1; + pp.if_kind.set(pp.if_level, until_else); + } else if (pp.if_level == 1) { + pp.clearGuard(); + } + switch (pp.if_kind.get(pp.if_level)) { + until_else => { + const macro_name = try pp.expectMacroName(); + if (macro_name == null) { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifndef", .{}); + } + } else { + try pp.expectNewline(); + if (pp.defines.get(macro_name.?) == null) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #elifndef", .{}); + } + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifndef", .{}); + } } } - } - try pp.token_buf.append(tok); - }, - } + }, + until_endif => try pp.skip(.until_endif), + until_endif_seen_else => { + try pp.errTok(directive, .elifdef_after_else); + pp.skipToNl(); + }, + else => unreachable, + } + }, + .keyword_include => try pp.readInclude(directive), + .keyword_include_next => try pp.readIncludeNext(directive), + .keyword_line => try pp.readLine(), + .keyword_pragma => try pp.readPragma(directive), + .keyword_undef => try pp.readUndef(), + .keyword_warning => try pp.readErrorMessage(directive, .warning_directive), + .keyword_embed => try pp.readEmbed(directive), + else => try pp.errTok(directive, .invalid_preprocessing_directive), } - - const param_list = try pp.arena.allocator().dupe([]const u8, params.items); - const token_list = try pp.arena.allocator().dupe(RawToken, pp.token_buf.items); - try pp.defineMacro(macro_name, .{ - .is_func = true, - .params = param_list, - .var_args = var_args or gnu_var_args.len != 0, - .tokens = token_list, - .loc = tokFromRaw(macro_name).loc, - }); } -/// Handle an #embed directive -/// embedDirective : ("FILENAME" | ) embedParam* -/// embedParam : IDENTIFIER (:: IDENTIFIER)? 
'(' ')' -fn embed(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!void { - const first = tokenizer.nextNoWS(); - const filename_tok = pp.findIncludeFilenameToken(first, tokenizer, .ignore_trailing_tokens) catch |er| switch (er) { +/// TODO: handle limit/prefix/suffix/etc +fn readEmbed(pp: *Preprocessor, directive_tok: PreprocessorToken) Error!void { + var is_std: bool = undefined; + const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { error.InvalidInclude => return, else => |e| return e, }; - defer TokenWithExpansionLocs.free(filename_tok.expansion_locs, pp.gpa); - // Check for empty filename. - const tok_slice = pp.expandedSliceExtra(filename_tok, .single_macro_ws); - if (tok_slice.len < 3) { - try pp.err(first, .empty_filename); - return; - } - const filename = tok_slice[1 .. tok_slice.len - 1]; - const include_type: Compilation.IncludeType = switch (filename_tok.id) { - .string_literal => .quotes, - .macro_string => .angle_brackets, + const filename = include_str[1 .. include_str.len - 1]; + const include_type: Compilation.IncludeType = switch (include_str[0]) { + '"' => .quotes, + '<' => .angle_brackets, else => unreachable, }; - // Index into `token_buf` - const Range = struct { - start: u32, - end: u32, - - fn expand(opt_range: ?@This(), pp_: *Preprocessor, tokenizer_: *Tokenizer) !void { - const range = opt_range orelse return; - const slice = pp_.token_buf.items[range.start..range.end]; - for (slice) |tok| { - try pp_.expandMacro(tokenizer_, tok); - } - } - }; - pp.token_buf.items.len = 0; - - var limit: ?u32 = null; - var prefix: ?Range = null; - var suffix: ?Range = null; - var if_empty: ?Range = null; - while (true) { - const param_first = tokenizer.nextNoWS(); - switch (param_first.id) { - .nl, .eof => break, - .identifier => {}, - else => { - try pp.err(param_first, .malformed_embed_param); - continue; - }, - } - - const char_top = pp.char_buf.items.len; - defer pp.char_buf.items.len = char_top; - - const maybe_colon = tokenizer.colonColon(); - const param = switch (maybe_colon.id) { - .colon_colon => blk: { - // vendor::param - const param = tokenizer.nextNoWS(); - if (param.id != .identifier) { - try pp.err(param, .malformed_embed_param); - continue; - } - const l_paren = tokenizer.nextNoWS(); - if (l_paren.id != .l_paren) { - try pp.err(l_paren, .malformed_embed_param); - continue; - } - try pp.char_buf.appendSlice(Attribute.normalize(pp.tokSlice(param_first))); - try pp.char_buf.appendSlice("::"); - try pp.char_buf.appendSlice(Attribute.normalize(pp.tokSlice(param))); - break :blk pp.char_buf.items; - }, - .l_paren => Attribute.normalize(pp.tokSlice(param_first)), - else => { - try pp.err(maybe_colon, .malformed_embed_param); - continue; - }, - }; - - const start: u32 = @intCast(pp.token_buf.items.len); - while (true) { - const next = tokenizer.nextNoWS(); - if (next.id == .r_paren) break; - if (next.id == .eof) { - try pp.err(maybe_colon, .malformed_embed_param); - break; - } - try pp.token_buf.append(next); - } - const end: u32 = @intCast(pp.token_buf.items.len); - - if (std.mem.eql(u8, param, "limit")) { - if (limit != null) { - try pp.errStr(tokFromRaw(param_first), .duplicate_embed_param, "limit"); - continue; - } - if (start + 1 != end) { - try pp.err(param_first, .malformed_embed_limit); - continue; - } - const limit_tok = pp.token_buf.items[start]; - if (limit_tok.id != .pp_num) { - try pp.err(param_first, .malformed_embed_limit); - continue; - } - limit = std.fmt.parseInt(u32, pp.tokSlice(limit_tok), 10) catch { - try pp.err(limit_tok, 
.malformed_embed_limit); - continue; - }; - pp.token_buf.items.len = start; - } else if (std.mem.eql(u8, param, "prefix")) { - if (prefix != null) { - try pp.errStr(tokFromRaw(param_first), .duplicate_embed_param, "prefix"); - continue; - } - prefix = .{ .start = start, .end = end }; - } else if (std.mem.eql(u8, param, "suffix")) { - if (suffix != null) { - try pp.errStr(tokFromRaw(param_first), .duplicate_embed_param, "suffix"); - continue; - } - suffix = .{ .start = start, .end = end }; - } else if (std.mem.eql(u8, param, "if_empty")) { - if (if_empty != null) { - try pp.errStr(tokFromRaw(param_first), .duplicate_embed_param, "if_empty"); - continue; - } - if_empty = .{ .start = start, .end = end }; - } else { - try pp.errStr( - tokFromRaw(param_first), - .unsupported_embed_param, - try pp.comp.diagnostics.arena.allocator().dupe(u8, param), - ); - pp.token_buf.items.len = start; - } - } - - const embed_bytes = (try pp.comp.findEmbed(filename, first.source, include_type, limit)) orelse - return pp.fatalNotFound(filename_tok, filename); + const limit = std.math.maxInt(u32); + const embed_bytes = (try pp.comp.findEmbed(filename, directive_tok.loc.id, include_type, limit)) orelse + return pp.fatalNotFound(directive_tok, filename); defer pp.comp.gpa.free(embed_bytes); - try Range.expand(prefix, pp, tokenizer); - - if (embed_bytes.len == 0) { - try Range.expand(if_empty, pp, tokenizer); - try Range.expand(suffix, pp, tokenizer); - return; - } - - try pp.ensureUnusedTokenCapacity(2 * embed_bytes.len - 1); // N bytes and N-1 commas + if (embed_bytes.len == 0) return; // TODO: We currently only support systems with CHAR_BIT == 8 // If the target's CHAR_BIT is not 8, we need to write out correctly-sized embed_bytes @@ -2985,290 +1879,71 @@ fn embed(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!void { const byte = embed_bytes[0]; const start = pp.comp.generated_buf.items.len; try writer.print("{d}", .{byte}); - pp.addTokenAssumeCapacity(try pp.makeGeneratedToken(start, .embed_byte, filename_tok)); + var generated = try pp.makeGeneratedToken(start, .embed_byte, directive_tok); + generated.flags.is_bol = true; + try pp.addToken(generated); } for (embed_bytes[1..]) |byte| { const start = pp.comp.generated_buf.items.len; try writer.print(",{d}", .{byte}); - pp.addTokenAssumeCapacity(.{ .id = .comma, .loc = .{ .id = .generated, .byte_offset = @intCast(start) } }); - pp.addTokenAssumeCapacity(try pp.makeGeneratedToken(start + 1, .embed_byte, filename_tok)); + try pp.addToken(.{ .id = .comma, .loc = .{ .id = .generated, .byte_offset = @intCast(start) } }); + try pp.addToken(try pp.makeGeneratedToken(start + 1, .embed_byte, directive_tok)); } try pp.comp.generated_buf.append(pp.gpa, '\n'); - - try Range.expand(suffix, pp, tokenizer); -} - -// Handle a #include directive. 
-fn include(pp: *Preprocessor, tokenizer: *Tokenizer, which: Compilation.WhichInclude) MacroError!void { - const first = tokenizer.nextNoWS(); - const new_source = findIncludeSource(pp, tokenizer, first, which) catch |er| switch (er) { - error.InvalidInclude => return, - else => |e| return e, - }; - - // Prevent stack overflow - pp.include_depth += 1; - defer pp.include_depth -= 1; - if (pp.include_depth > max_include_depth) { - try pp.comp.addDiagnostic(.{ - .tag = .too_many_includes, - .loc = .{ .id = first.source, .byte_offset = first.start, .line = first.line }, - }, &.{}); - return error.StopPreprocessing; - } - - if (pp.include_guards.get(new_source.id)) |guard| { - if (pp.defines.contains(guard)) return; - } - - if (pp.verbose) { - pp.verboseLog(first, "include file {s}", .{new_source.path}); - } - - const token_state = pp.getTokenState(); - try pp.addIncludeStart(new_source); - const eof = pp.preprocessExtra(new_source) catch |er| switch (er) { - error.StopPreprocessing => { - for (pp.expansion_entries.items(.locs)[token_state.expansion_entries_len..]) |loc| TokenWithExpansionLocs.free(loc, pp.gpa); - pp.restoreTokenState(token_state); - return; - }, - else => |e| return e, - }; - try eof.checkMsEof(new_source, pp.comp); - if (pp.preserve_whitespace and pp.tokens.items(.id)[pp.tokens.len - 1] != .nl) { - try pp.addToken(.{ .id = .nl, .loc = .{ - .id = tokenizer.source, - .line = tokenizer.line, - } }); - } - if (pp.linemarkers == .none) return; - var next = first; - while (true) { - var tmp = tokenizer.*; - next = tmp.nextNoWS(); - if (next.id != .nl) break; - tokenizer.* = tmp; - } - try pp.addIncludeResume(next.source, next.end, next.line); -} - -/// tokens that are part of a pragma directive can happen in 3 ways: -/// 1. directly in the text via `#pragma ...` -/// 2. Via a string literal argument to `_Pragma` -/// 3. Via a stringified macro argument which is used as an argument to `_Pragma` -/// operator_loc: Location of `_Pragma`; null if this is from #pragma -/// arg_locs: expansion locations of the argument to _Pragma. 
empty if #pragma or a raw string literal was used -fn makePragmaToken(pp: *Preprocessor, raw: RawToken, operator_loc: ?Source.Location, arg_locs: []const Source.Location) !TokenWithExpansionLocs { - var tok = tokFromRaw(raw); - if (operator_loc) |loc| { - try tok.addExpansionLocation(pp.gpa, &.{loc}); - } - try tok.addExpansionLocation(pp.gpa, arg_locs); - return tok; -} - -pub fn addToken(pp: *Preprocessor, tok: TokenWithExpansionLocs) !void { - if (tok.expansion_locs) |expansion_locs| { - try pp.expansion_entries.append(pp.gpa, .{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs }); - } - try pp.tokens.append(pp.gpa, .{ .id = tok.id, .loc = tok.loc }); -} - -pub fn addTokenAssumeCapacity(pp: *Preprocessor, tok: TokenWithExpansionLocs) void { - if (tok.expansion_locs) |expansion_locs| { - pp.expansion_entries.appendAssumeCapacity(.{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs }); - } - pp.tokens.appendAssumeCapacity(.{ .id = tok.id, .loc = tok.loc }); -} - -pub fn ensureTotalTokenCapacity(pp: *Preprocessor, capacity: usize) !void { - try pp.tokens.ensureTotalCapacity(pp.gpa, capacity); - try pp.expansion_entries.ensureTotalCapacity(pp.gpa, capacity); -} - -pub fn ensureUnusedTokenCapacity(pp: *Preprocessor, capacity: usize) !void { - try pp.tokens.ensureUnusedCapacity(pp.gpa, capacity); - try pp.expansion_entries.ensureUnusedCapacity(pp.gpa, capacity); } -/// Handle a pragma directive -fn pragma(pp: *Preprocessor, tokenizer: *Tokenizer, pragma_tok: RawToken, operator_loc: ?Source.Location, arg_locs: []const Source.Location) !void { - const name_tok = tokenizer.nextNoWS(); - if (name_tok.id == .nl or name_tok.id == .eof) return; - - const name = pp.tokSlice(name_tok); - try pp.addToken(try pp.makePragmaToken(pragma_tok, operator_loc, arg_locs)); - const pragma_start: u32 = @intCast(pp.tokens.len); - - const pragma_name_tok = try pp.makePragmaToken(name_tok, operator_loc, arg_locs); - try pp.addToken(pragma_name_tok); +fn readToken(pp: *Preprocessor) Error!PreprocessorToken { while (true) { - const next_tok = tokenizer.next(); - if (next_tok.id == .whitespace) continue; - if (next_tok.id == .eof) { - try pp.addToken(.{ - .id = .nl, - .loc = .{ .id = .generated }, - }); - break; - } - try pp.addToken(try pp.makePragmaToken(next_tok, operator_loc, arg_locs)); - if (next_tok.id == .nl) break; - } - if (pp.comp.getPragma(name)) |prag| unknown: { - return prag.preprocessorCB(pp, pragma_start) catch |er| switch (er) { - error.UnknownPragma => break :unknown, - else => |e| return e, - }; - } - return pp.comp.addDiagnostic(.{ - .tag = .unknown_pragma, - .loc = pragma_name_tok.loc, - }, pragma_name_tok.expansionSlice()); -} - -fn findIncludeFilenameToken( - pp: *Preprocessor, - first_token: RawToken, - tokenizer: *Tokenizer, - trailing_token_behavior: enum { ignore_trailing_tokens, expect_nl_eof }, -) !TokenWithExpansionLocs { - var first = first_token; - - if (first.id == .angle_bracket_left) to_end: { - // The tokenizer does not handle include strings so do it here. 
- while (tokenizer.index < tokenizer.buf.len) : (tokenizer.index += 1) { - switch (tokenizer.buf[tokenizer.index]) { - '>' => { - tokenizer.index += 1; - first.end = tokenizer.index; - first.id = .macro_string; - break :to_end; - }, - '\n' => break, - else => {}, - } + const tok = try pp.readExpand(); + if (tok.flags.is_bol and tok.id == .hash and tok.hideset == null) { + try pp.readDirective(); + continue; } - try pp.comp.addDiagnostic(.{ - .tag = .header_str_closing, - .loc = .{ .id = first.source, .byte_offset = tokenizer.index, .line = first.line }, - }, &.{}); - try pp.err(first, .header_str_match); - } - - const source_tok = tokFromRaw(first); - const filename_tok, const expanded_trailing = switch (source_tok.id) { - .string_literal, .macro_string => .{ source_tok, false }, - else => expanded: { - // Try to expand if the argument is a macro. - pp.top_expansion_buf.items.len = 0; - defer for (pp.top_expansion_buf.items) |tok| TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - try pp.top_expansion_buf.append(source_tok); - pp.expansion_source_loc = source_tok.loc; - - try pp.expandMacroExhaustive(tokenizer, &pp.top_expansion_buf, 0, 1, true, .non_expr); - var trailing_toks: []const TokenWithExpansionLocs = &.{}; - const include_str = (try pp.reconstructIncludeString(pp.top_expansion_buf.items, &trailing_toks, tokFromRaw(first))) orelse { - try pp.expectNl(tokenizer); - return error.InvalidInclude; - }; - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.appendSlice(pp.gpa, include_str); - - break :expanded .{ try pp.makeGeneratedToken(start, switch (include_str[0]) { - '"' => .string_literal, - '<' => .macro_string, - else => unreachable, - }, pp.top_expansion_buf.items[0]), trailing_toks.len != 0 }; - }, - }; - - switch (trailing_token_behavior) { - .expect_nl_eof => { - // Error on extra tokens. - const nl = tokenizer.nextNoWS(); - if ((nl.id != .nl and nl.id != .eof) or expanded_trailing) { - skipToNl(tokenizer); - try pp.comp.diagnostics.addExtra(pp.comp.langopts, .{ - .tag = .extra_tokens_directive_end, - .loc = filename_tok.loc, - }, filename_tok.expansionSlice(), false); - } - }, - .ignore_trailing_tokens => if (expanded_trailing) { - try pp.comp.diagnostics.addExtra(pp.comp.langopts, .{ - .tag = .extra_tokens_directive_end, - .loc = filename_tok.loc, - }, filename_tok.expansionSlice(), false); - }, + return tok; } - return filename_tok; } -fn findIncludeSource(pp: *Preprocessor, tokenizer: *Tokenizer, first: RawToken, which: Compilation.WhichInclude) !Source { - const filename_tok = try pp.findIncludeFilenameToken(first, tokenizer, .expect_nl_eof); - defer TokenWithExpansionLocs.free(filename_tok.expansion_locs, pp.gpa); - - // Check for empty filename. - const tok_slice = pp.expandedSliceExtra(filename_tok, .single_macro_ws); - if (tok_slice.len < 3) { - try pp.err(first, .empty_filename); - return error.InvalidInclude; - } +pub fn preprocess(pp: *Preprocessor, source: Source) !PreprocessorToken { + const guard = pp.findIncludeGuard(source); + try pp.guard_stack.append(pp.gpa, guard); - // Find the file. - const filename = tok_slice[1 .. 
tok_slice.len - 1];
-    const include_type: Compilation.IncludeType = switch (filename_tok.id) {
-        .string_literal => .quotes,
-        .macro_string => .angle_brackets,
-        else => unreachable,
-    };
+    pp.preprocess_count += 1;
+    try pp.tokenizers.append(pp.gpa, .{
+        .buf = source.buf,
+        .langopts = pp.comp.langopts,
+        .index = 0,
+        .source = source.id,
+    });
+    while (true) {
+        const tok = try pp.readToken();
+        if (tok.id == .eof) {
+            const tokenizer = pp.tokenizers.pop();
-    return (try pp.comp.findInclude(filename, first, include_type, which)) orelse
-        return pp.fatalNotFound(filename_tok, filename);
-}
+            if (pp.tokenizers.items.len > 0) {
+                var next_tok: RawToken = undefined;
+                var tmp = pp.tokenizers.items[pp.tokenizers.items.len - 1];
+                while (true) {
+                    next_tok = tmp.nextNoWS();
+                    if (next_tok.id != .nl) break;
+                }
+                try pp.addIncludeResume(next_tok.source, next_tok.end, next_tok.line);
+            }
-fn printLinemarker(
-    pp: *Preprocessor,
-    w: anytype,
-    line_no: u32,
-    source: Source,
-    start_resume: enum(u8) { start, @"resume", none },
-) !void {
-    try w.writeByte('#');
-    if (pp.linemarkers == .line_directives) try w.writeAll("line");
-    try w.print(" {d} \"", .{line_no});
-    for (source.path) |byte| switch (byte) {
-        '\n' => try w.writeAll("\\n"),
-        '\r' => try w.writeAll("\\r"),
-        '\t' => try w.writeAll("\\t"),
-        '\\' => try w.writeAll("\\\\"),
-        '"' => try w.writeAll("\\\""),
-        ' ', '!', '#'...'&', '('...'[', ']'...'~' => try w.writeByte(byte),
-        // Use hex escapes for any non-ASCII/unprintable characters.
-        // This ensures that the parsed version of this string will end up
-        // containing the same bytes as the input regardless of encoding.
-        else => {
-            try w.writeAll("\\x");
-            try std.fmt.formatInt(byte, 16, .lower, .{ .width = 2, .fill = '0' }, w);
-        },
-    };
-    try w.writeByte('"');
-    if (pp.linemarkers == .numeric_directives) {
-        switch (start_resume) {
-            .none => {},
-            .start => try w.writeAll(" 1"),
-            .@"resume" => try w.writeAll(" 2"),
-        }
-        switch (source.kind) {
-            .user => {},
-            .system => try w.writeAll(" 3"),
-            .extern_c_system => try w.writeAll(" 3 4"),
+            const guard_name = pp.guard_stack.pop();
+            if (guard_name) |name| {
+                try pp.include_guards.put(pp.gpa, tokenizer.source, name);
+            }
+            if (pp.tokenizers.items.len == 0) {
+                return tok;
+            }
+        } else {
+            switch (tok.id) {
+                .unterminated_comment => try pp.errTok(tok, .unterminated_comment),
+                else => try pp.addToken(tok),
+            }
         }
     }
-    try w.writeByte('\n');
 }

 // After how many empty lines are needed to replace them with linemarkers.
@@ -3277,16 +1952,12 @@ const collapse_newlines = 8;

 /// Pretty print tokens and try to preserve whitespace.
 pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void {
     const tok_ids = pp.tokens.items(.id);
-    var i: u32 = 0;
-    var last_nl = true;
-    outer: while (true) : (i += 1) {
+    var i: u32 = 0;
+    var last_nl = false;
+    outer: while (i < pp.tokens.len) : (i += 1) {
         var cur: Token = pp.tokens.get(i);
         switch (cur.id) {
-            .eof => {
-                if (!last_nl) try w.writeByte('\n');
-                return;
-            },
+            .eof => break,
             .nl => {
                 var newlines: u32 = 0;
                 for (tok_ids[i..], i..) 
|id, j| { @@ -3305,9 +1976,9 @@ pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void { i = @intCast((j - 1) - @intFromBool(tok_ids[j - 1] == .whitespace)); if (!last_nl) try w.writeAll("\n"); if (pp.linemarkers != .none) { - const next = pp.tokens.get(i); - const source = pp.comp.getSource(next.loc.id); - const line_col = source.lineCol(next.loc); + const next_tok = pp.tokens.get(i); + const source = pp.comp.getSource(next_tok.loc.id); + const line_col = source.lineCol(next_tok.loc); try pp.printLinemarker(w, line_col.line_no, source, .none); last_nl = true; } @@ -3318,7 +1989,7 @@ pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void { try w.writeAll("\n"); }, .keyword_pragma => { - const pragma_name = pp.expandedSlice(pp.tokens.get(i + 1)); + const pragma_name = pp.tokSlice(pp.tokens.get(i + 1)); const end_idx = mem.indexOfScalarPos(Token.Id, tok_ids, i, .nl) orelse i + 1; const pragma_len = @as(u32, @intCast(end_idx)) - i; @@ -3340,19 +2011,10 @@ pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void { break; } try w.writeByte(' '); - const slice = pp.expandedSlice(cur); + const slice = pp.tokSlice(cur); try w.writeAll(slice); } }, - .whitespace => { - var slice = pp.expandedSlice(cur); - while (mem.indexOfScalar(u8, slice, '\n')) |some| { - if (pp.linemarkers != .none) try w.writeByte('\n'); - slice = slice[some + 1 ..]; - } - for (slice) |_| try w.writeByte(' '); - last_nl = false; - }, .include_start => { const source = pp.comp.getSource(cur.loc.id); @@ -3367,192 +2029,249 @@ pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void { try pp.printLinemarker(w, line_col.line_no, source, .@"resume"); last_nl = true; }, - else => { - const slice = pp.expandedSlice(cur); - try w.writeAll(slice); - last_nl = false; - }, + else => try pp.prettyPrintToken(w, cur), } } + try w.writeByte('\n'); } -test "Preserve pragma tokens sometimes" { - const allocator = std.testing.allocator; - const Test = struct { - fn runPreprocessor(source_text: []const u8) ![]const u8 { - var buf = std.ArrayList(u8).init(allocator); - defer buf.deinit(); +fn printLinemarker( + pp: *Preprocessor, + w: anytype, + line_no: u32, + source: Source, + start_resume: enum(u8) { start, @"resume", none }, +) !void { + try w.writeByte('#'); + if (pp.linemarkers == .line_directives) try w.writeAll("line"); + try w.print(" {d} \"", .{line_no}); + for (source.path) |byte| switch (byte) { + '\n' => try w.writeAll("\\n"), + '\r' => try w.writeAll("\\r"), + '\t' => try w.writeAll("\\t"), + '\\' => try w.writeAll("\\\\"), + '"' => try w.writeAll("\\\""), + ' ', '!', '#'...'&', '('...'[', ']'...'~' => try w.writeByte(byte), + // Use hex escapes for any non-ASCII/unprintable characters. + // This ensures that the parsed version of this string will end up + // containing the same bytes as the input regardless of encoding. 
+ else => { + try w.writeAll("\\x"); + try std.fmt.formatInt(byte, 16, .lower, .{ .width = 2, .fill = '0' }, w); + }, + }; + try w.writeByte('"'); + if (pp.linemarkers == .numeric_directives) { + switch (start_resume) { + .none => {}, + .start => try w.writeAll(" 1"), + .@"resume" => try w.writeAll(" 2"), + } + switch (source.kind) { + .user => {}, + .system => try w.writeAll(" 3"), + .extern_c_system => try w.writeAll(" 3 4"), + } + } + try w.writeByte('\n'); +} - var comp = Compilation.init(allocator); - defer comp.deinit(); +fn prettyPrintToken(pp: *Preprocessor, w: anytype, tok: Token) !void { + if (tok.flags.is_bol) { + try w.writeByte('\n'); + } + if (tok.flags.space) { + try w.writeByte(' '); + } + if (tok.id.lexeme()) |some| { + try w.writeAll(some); + } else { + try w.writeAll(pp.tokSlice(tok)); + } +} - try comp.addDefaultPragmaHandlers(); +pub fn expansionSlice(pp: *Preprocessor, tok: Tree.TokenIndex) []Source.Location { + const S = struct { + fn order_token_index(context: void, lhs: Tree.TokenIndex, rhs: Tree.TokenIndex) std.math.Order { + _ = context; + return std.math.order(lhs, rhs); + } + }; - var pp = Preprocessor.init(&comp); - defer pp.deinit(); + const indices = pp.expansion_entries.items(.idx); + const idx = std.sort.binarySearch(Tree.TokenIndex, tok, indices, {}, S.order_token_index) orelse return &.{}; + const locs = pp.expansion_entries.items(.locs)[idx]; + var i: usize = 0; + while (locs[i].id != .unused) : (i += 1) {} + return locs[0..i]; +} - pp.preserve_whitespace = true; - assert(pp.linemarkers == .none); +pub fn addToken(pp: *Preprocessor, tok: PreprocessorToken) !void { + var r: std.ArrayListUnmanaged(Source.Location) = .{}; + defer r.deinit(pp.gpa); - const test_runner_macros = try comp.addSourceFromBuffer("", source_text); - const eof = try pp.preprocess(test_runner_macros); - try pp.addToken(eof); - try pp.prettyPrintTokens(buf.writer()); - return allocator.dupe(u8, buf.items); - } + var it = tok.loc_list.first; + while (it) |node| : (it = node.next) { + try r.append(pp.gpa, node.data); + } + if (r.items.len > 0) { + // std.debug.print("gottem\n", .{}); + try r.append(pp.gpa, .{ .id = .unused, .byte_offset = 1 }); + try pp.expansion_entries.ensureUnusedCapacity(pp.gpa, 1); - fn check(source_text: []const u8, expected: []const u8) !void { - const output = try runPreprocessor(source_text); - defer allocator.free(output); + const items = try r.toOwnedSlice(pp.gpa); // TODO: reverse? 
+ pp.expansion_entries.appendAssumeCapacity(.{ .idx = @intCast(pp.tokens.len), .locs = items.ptr }); + } - try std.testing.expectEqualStrings(expected, output); - } - }; - const preserve_gcc_diagnostic = - \\#pragma GCC diagnostic error "-Wnewline-eof" - \\#pragma GCC warning error "-Wnewline-eof" - \\int x; - \\#pragma GCC ignored error "-Wnewline-eof" - \\ - ; - try Test.check(preserve_gcc_diagnostic, preserve_gcc_diagnostic); - - const omit_once = - \\#pragma once - \\int x; - \\#pragma once - \\ - ; - // TODO should only be one newline afterwards when emulating clang - try Test.check(omit_once, "\nint x;\n\n"); - - const omit_poison = - \\#pragma GCC poison foobar - \\ - ; - try Test.check(omit_poison, "\n"); -} - -test "destringify" { - const allocator = std.testing.allocator; - const Test = struct { - fn testDestringify(pp: *Preprocessor, stringified: []const u8, destringified: []const u8) !void { - pp.char_buf.clearRetainingCapacity(); - try pp.char_buf.ensureUnusedCapacity(stringified.len); - pp.destringify(stringified); - try std.testing.expectEqualStrings(destringified, pp.char_buf.items); - } - }; - var comp = Compilation.init(allocator); - defer comp.deinit(); - var pp = Preprocessor.init(&comp); - defer pp.deinit(); - - try Test.testDestringify(&pp, "hello\tworld\n", "hello\tworld\n"); - try Test.testDestringify(&pp, - \\ \"FOO BAR BAZ\" - , - \\ "FOO BAR BAZ" - ); - try Test.testDestringify(&pp, - \\ \\t\\n - \\ - , - \\ \t\n - \\ - ); -} - -test "Include guards" { - const Test = struct { - /// This is here so that when #elifdef / #elifndef are added we don't forget - /// to test that they don't accidentally break include guard detection - fn pairsWithIfndef(tok_id: RawToken.Id) bool { - return switch (tok_id) { - .keyword_elif, - .keyword_elifdef, - .keyword_elifndef, - .keyword_else, - => true, - - .keyword_include, - .keyword_include_next, - .keyword_embed, - .keyword_define, - .keyword_defined, - .keyword_undef, - .keyword_ifdef, - .keyword_ifndef, - .keyword_error, - .keyword_warning, - .keyword_pragma, - .keyword_line, - .keyword_endif, - => false, - else => unreachable, - }; - } + try pp.tokens.append(pp.gpa, tok.toTreeToken()); +} - fn skippable(tok_id: RawToken.Id) bool { - return switch (tok_id) { - .keyword_defined, .keyword_va_args, .keyword_va_opt, .keyword_endif => true, - else => false, - }; - } +fn skip( + pp: *Preprocessor, + cont: enum { until_else, until_endif, until_endif_seen_else }, +) Error!void { + var ifs_seen: u32 = 0; + var line_start = true; + var tokenizer = &pp.tokenizers.items[pp.tokenizers.items.len - 1]; - fn testIncludeGuard(allocator: std.mem.Allocator, comptime template: []const u8, tok_id: RawToken.Id, expected_guards: u32) !void { - var comp = Compilation.init(allocator); - defer comp.deinit(); - var pp = Preprocessor.init(&comp); - defer pp.deinit(); - - const path = try std.fs.path.join(allocator, &.{ ".", "bar.h" }); - defer allocator.free(path); - - _ = try comp.addSourceFromBuffer(path, "int bar = 5;\n"); - - var buf = std.ArrayList(u8).init(allocator); - defer buf.deinit(); - - var writer = buf.writer(); - switch (tok_id) { - .keyword_include, .keyword_include_next => try writer.print(template, .{ tok_id.lexeme().?, " \"bar.h\"" }), - .keyword_define, .keyword_undef => try writer.print(template, .{ tok_id.lexeme().?, " BAR" }), - .keyword_ifndef, - .keyword_ifdef, - .keyword_elifdef, - .keyword_elifndef, - => try writer.print(template, .{ tok_id.lexeme().?, " BAR\n#endif" }), - else => try writer.print(template, .{ 
tok_id.lexeme().?, "" }), + while (tokenizer.index < tokenizer.buf.len) { + if (line_start) { + const saved_tokenizer = tokenizer.*; + const hash = tokenizer.nextNoWS(); + if (hash.id == .nl) continue; + line_start = false; + if (hash.id != .hash) continue; + const directive = tokenizer.nextNoWS(); + switch (directive.id) { + .keyword_else => { + if (ifs_seen != 0) continue; + if (cont == .until_endif_seen_else) { + // try pp.err(directive, .else_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_elif => { + if (ifs_seen != 0 or cont == .until_endif) continue; + if (cont == .until_endif_seen_else) { + // try pp.err(directive, .elif_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_elifdef => { + if (ifs_seen != 0 or cont == .until_endif) continue; + if (cont == .until_endif_seen_else) { + // try pp.err(directive, .elifdef_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_elifndef => { + if (ifs_seen != 0 or cont == .until_endif) continue; + if (cont == .until_endif_seen_else) { + // try pp.err(directive, .elifndef_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_endif => { + if (ifs_seen == 0) { + tokenizer.* = saved_tokenizer; + return; + } + ifs_seen -= 1; + }, + .keyword_if, .keyword_ifdef, .keyword_ifndef => ifs_seen += 1, + else => {}, } - const source = try comp.addSourceFromBuffer("test.h", buf.items); - _ = try pp.preprocess(source); - - try std.testing.expectEqual(expected_guards, pp.include_guards.count()); - } - }; - const tags = std.meta.tags(RawToken.Id); - for (tags) |tag| { - if (Test.skippable(tag)) continue; - var copy = tag; - copy.simplifyMacroKeyword(); - if (copy != tag or tag == .keyword_else) { - const inside_ifndef_template = - \\//Leading comment (should be ignored) - \\ - \\#ifndef FOO - \\#{s}{s} - \\#endif - ; - const expected_guards: u32 = if (Test.pairsWithIfndef(tag)) 0 else 1; - try Test.testIncludeGuard(std.testing.allocator, inside_ifndef_template, tag, expected_guards); - - const outside_ifndef_template = - \\#ifndef FOO - \\#endif - \\#{s}{s} - ; - try Test.testIncludeGuard(std.testing.allocator, outside_ifndef_template, tag, 0); + } else if (tokenizer.buf[tokenizer.index] == '\n') { + line_start = true; + tokenizer.index += 1; + tokenizer.line += 1; + tokenizer.bol = true; + if (pp.preserve_whitespace) { + try pp.addToken(.{ .id = .nl, .loc = .{ + .id = tokenizer.source, + .line = tokenizer.line, + } }); + } + } else { + line_start = false; + tokenizer.index += 1; } + } else { + return pp.errTok(.{ .id = .eof, .loc = .{ .id = tokenizer.source, .byte_offset = tokenizer.index, .line = tokenizer.line } }, .unterminated_conditional_directive); + } +} + +fn verboseLog(pp: *Preprocessor, tok: PreprocessorToken, comptime fmt: []const u8, args: anytype) void { + const source = pp.comp.getSource(tok.loc.id); + const line_col = source.lineCol(tok.loc); + + const stderr = std.io.getStdErr().writer(); + var buf_writer = std.io.bufferedWriter(stderr); + const writer = buf_writer.writer(); + defer buf_writer.flush() catch {}; + writer.print("{s}:{d}:{d}: ", .{ source.path, line_col.line_no, line_col.col }) catch return; + writer.print(fmt, args) catch return; + writer.writeByte('\n') catch return; + writer.writeAll(line_col.line) catch return; + writer.writeByte('\n') catch return; +} + +fn fatal(pp: *Preprocessor, tok: PreprocessorToken, comptime fmt: []const u8, args: anytype) Compilation.Error { + try 
pp.comp.diagnostics.list.append(pp.gpa, .{ + .tag = .cli_error, + .kind = .@"fatal error", + .extra = .{ .str = try std.fmt.allocPrint(pp.comp.diagnostics.arena.allocator(), fmt, args) }, + .loc = tok.loc, + }); + return error.FatalError; +} + +fn fatalNotFound(pp: *Preprocessor, tok: PreprocessorToken, filename: []const u8) Compilation.Error { + const old = pp.comp.diagnostics.fatal_errors; + pp.comp.diagnostics.fatal_errors = true; + defer pp.comp.diagnostics.fatal_errors = old; + + try pp.comp.diagnostics.addExtra(pp.comp.langopts, .{ .tag = .cli_error, .loc = tok.loc, .extra = .{ + .str = try std.fmt.allocPrint(pp.comp.diagnostics.arena.allocator(), "'{s}' not found", .{filename}), + } }, tok.expansionSlice(), false); + unreachable; // addExtra should've returned FatalError +} + +/// Consume next token, error if it is not an identifier. +fn expectMacroName(pp: *Preprocessor) Error!?[]const u8 { + const macro_name = pp.getToken(); + if (!macro_name.id.isMacroIdentifier()) { + try pp.errTok(macro_name, .macro_name_missing); + pp.skipToNl(); + return null; } + return pp.tokSlice(macro_name); +} + +/// Return the name of the #ifndef guard macro that starts a source, if any. +/// If a source starts with `#ifndef IDENTIFIER`, return `IDENTIFIER` +/// This function does not validate that the entire source is guarded by the +/// initial ifndef, if any +fn findIncludeGuard(pp: *Preprocessor, source: Source) ?[]const u8 { + var tokenizer = Tokenizer{ + .buf = source.buf, + .langopts = pp.comp.langopts, + .source = source.id, + }; + var hash = tokenizer.nextNoWS(); + while (hash.id == .nl) hash = tokenizer.nextNoWS(); + if (hash.id != .hash) return null; + const ifndef = tokenizer.nextNoWS(); + if (ifndef.id != .keyword_ifndef) return null; + const guard = tokenizer.nextNoWS(); + if (guard.id != .identifier) return null; + return pp.tokSlice(.{ .id = guard.id, .loc = .{ .id = guard.source, .byte_offset = guard.start, .line = guard.line } }); } diff --git a/src/aro/Tokenizer.zig b/src/aro/Tokenizer.zig index 8ee38126..33f76429 100644 --- a/src/aro/Tokenizer.zig +++ b/src/aro/Tokenizer.zig @@ -10,6 +10,7 @@ pub const Token = struct { start: u32 = 0, end: u32 = 0, line: u32 = 0, + bol: bool = false, pub const Id = enum(u8) { invalid, @@ -323,6 +324,10 @@ pub const Token = struct { /// A comment token if asked to preserve comments. comment, + pub fn isDirectiveEnd(id: Id) bool { + return id == .nl or id == .eof; + } + /// Return true if token is identifier or keyword. 
pub fn isMacroIdentifier(id: Id) bool { switch (id) { @@ -1030,6 +1035,7 @@ index: u32 = 0, source: Source.Id, langopts: LangOpts, line: u32 = 1, +bol: bool = true, pub fn next(self: *Tokenizer) Token { var state: enum { @@ -1077,6 +1083,8 @@ pub fn next(self: *Tokenizer) Token { var start = self.index; var id: Token.Id = .eof; + const bol = self.bol; + self.bol = false; while (self.index < self.buf.len) : (self.index += 1) { const c = self.buf[self.index]; @@ -1086,6 +1094,7 @@ pub fn next(self: *Tokenizer) Token { id = .nl; self.index += 1; self.line += 1; + self.bol = true; break; }, '"' => { @@ -1265,6 +1274,7 @@ pub fn next(self: *Tokenizer) Token { }, '\n' => { id = .unterminated_string_literal; + self.bol = true; break; }, '\r' => unreachable, @@ -1281,6 +1291,7 @@ pub fn next(self: *Tokenizer) Token { }, '\n' => { id = .unterminated_char_literal; + self.bol = true; break; }, else => { @@ -1297,6 +1308,7 @@ pub fn next(self: *Tokenizer) Token { }, '\n' => { id = .unterminated_char_literal; + self.bol = true; break; }, else => {}, @@ -1304,6 +1316,7 @@ pub fn next(self: *Tokenizer) Token { .char_escape_sequence => switch (c) { '\r', '\n' => { id = .unterminated_char_literal; + self.bol = true; break; }, else => state = .char_literal, @@ -1311,6 +1324,7 @@ pub fn next(self: *Tokenizer) Token { .string_escape_sequence => switch (c) { '\r', '\n' => { id = .unterminated_string_literal; + self.bol = true; break; }, else => state = .string_literal, @@ -1624,6 +1638,7 @@ pub fn next(self: *Tokenizer) Token { }, .line_comment => switch (c) { '\n' => { + self.bol = true; if (self.langopts.preserve_comments) { id = .comment; break; @@ -1656,6 +1671,7 @@ pub fn next(self: *Tokenizer) Token { }, .multi_line_comment_done => switch (c) { '\n' => { + self.bol = true; start = self.index; id = .nl; self.index += 1; @@ -1782,6 +1798,7 @@ pub fn next(self: *Tokenizer) Token { .end = self.index, .line = self.line, .source = self.source, + .bol = bol, }; } diff --git a/src/aro/Treap.zig b/src/aro/Treap.zig new file mode 100644 index 00000000..74d3005d --- /dev/null +++ b/src/aro/Treap.zig @@ -0,0 +1,165 @@ +/// Persistent treap data structure. Nodes are immutable and set operations do not invalidate +/// existing nodes.
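+/// Usage sketch (assuming a `treap` created with `Treap.init`): +/// const a = try treap.addNodeTo(null, "FOO"); +/// const b = try treap.addNodeTo(a, "BAR"); +/// Here `a` still contains only "FOO" while `b` contains both keys, since existing nodes are never mutated.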
+/// Adapted from https://arxiv.org/pdf/1301.3388 +const std = @import("std"); + +const Treap = @This(); +const Key = []const u8; + +pub const Node = ?*const Item; + +const Item = struct { + key: Key, + left: Node, + right: Node, + + fn priority(node: *const Item) u64 { + return std.hash.Wyhash.hash(0, node.key); + } + + const HashContext = struct { + pub fn hash(self: @This(), s: *const Item) u64 { + _ = self; + return std.hash.Wyhash.hash(0, std.mem.asBytes(s)); + } + pub fn eql(self: @This(), a: *const Item, b: *const Item) bool { + _ = self; + return a.left == b.left and a.right == b.right and std.mem.eql(u8, a.key, b.key); + } + }; +}; + +allocator: std.mem.Allocator, +node_arena: std.heap.ArenaAllocator, +/// nodes are hash-consed so structural equality can be determined by comparing pointers +nodes: std.HashMapUnmanaged(*const Item, void, Item.HashContext, std.hash_map.default_max_load_percentage) = .{}, + +pub fn init(allocator: std.mem.Allocator) Treap { + return .{ .allocator = allocator, .node_arena = std.heap.ArenaAllocator.init(allocator) }; +} + +pub fn deinit(self: *Treap) void { + self.nodes.deinit(self.allocator); + self.node_arena.deinit(); +} + +fn makeNode(self: *Treap, key: Key, left: Node, right: Node) !Node { + const node: Item = .{ .key = key, .left = left, .right = right }; + const gop = try self.nodes.getOrPut(self.allocator, &node); + if (gop.found_existing) return gop.key_ptr.*; + + const new_node = try self.node_arena.allocator().create(Item); + new_node.* = .{ + .key = key, + .left = left, + .right = right, + }; + gop.key_ptr.* = new_node; + return new_node; +} + +fn join(self: *Treap, t1_arg: Node, t2_arg: Node) !Node { + const t1 = t1_arg orelse return t2_arg; + const t2 = t2_arg orelse return t1_arg; + if (t1.priority() < t2.priority()) { + return self.makeNode(t2.key, try self.join(t1, t2.left), t2.right); + } else { + return self.makeNode(t1.key, t1.left, try self.join(t1.right, t2)); + } +} + +fn split(self: *Treap, t_arg: Node, key: Key) !struct { Node, Node } { + const t = t_arg orelse return .{ null, null }; + switch (std.mem.order(u8, key, t.key)) { + .lt => { + const l1, const l2 = try self.split(t.left, key); + return .{ l1, try self.makeNode(t.key, l2, t.right) }; + }, + .eq, .gt => { + const r1, const r2 = try self.split(t.right, key); + return .{ try self.makeNode(t.key, t.left, r1), r2 }; + }, + } +} + +fn add(self: *Treap, t1_arg: Node, t2_arg: Node) !Node { + const t1 = t1_arg orelse return t2_arg; + const t2 = t2_arg orelse return t1_arg; + std.debug.assert(!std.mem.eql(u8, t1.key, t2.key)); + if (t1.priority() < t2.priority()) { + const l1, const r1 = try self.split(t1, t2.key); + return self.makeNode(t2.key, try self.add(l1, t2.left), try self.add(r1, t2.right)); + } else { + const l2, const r2 = try self.split(t2, t1.key); + return self.makeNode(t1.key, try self.add(t1.left, l2), try self.add(t1.right, r2)); + } +} + +pub fn addNodeTo(self: *Treap, t1: Node, key: Key) !Node { + std.debug.assert(!self.contains(t1, key)); + const node = try self.makeNode(key, null, null); + return self.add(t1, node); +} + +pub fn @"union"(self: *Treap, t1_arg: Node, t2_arg: Node) !Node { + if (t1_arg == t2_arg) return t1_arg; + const t1 = t1_arg orelse return t2_arg; + const t2 = t2_arg orelse return t1_arg; + + if (std.mem.eql(u8, t1.key, t2.key)) { + return self.makeNode(t1.key, try self.@"union"(t1.left, t2.left), try self.@"union"(t1.right, t2.right)); + } else if (t1.priority() < t2.priority()) { + const l1, const r1 = try self.split(t1, t2.key); + 
return self.makeNode(t2.key, try self.@"union"(l1, t2.left), try self.@"union"(r1, t2.right)); + } else { + const l2, const r2 = try self.split(t2, t1.key); + return self.makeNode(t1.key, try self.@"union"(t1.left, l2), try self.@"union"(t1.right, r2)); + } +} + +pub fn intersection(self: *Treap, t1_arg: Node, t2_arg: Node) !Node { + if (t1_arg == t2_arg) return t1_arg; + const t1 = t1_arg orelse return null; + const t2 = t2_arg orelse return null; + + if (std.mem.eql(u8, t1.key, t2.key)) { + return self.makeNode(t1.key, try self.intersection(t1.left, t2.left), try self.intersection(t1.right, t2.right)); + } else if (t1.priority() < t2.priority()) { + const l1, const r1 = try self.split(t1, t2.key); + return self.join(try self.intersection(l1, t2.left), try self.intersection(r1, t2.right)); + } else { + const l2, const r2 = try self.split(t2, t1.key); + return self.join(try self.intersection(t1.left, l2), try self.intersection(t1.right, r2)); + } +} + +pub fn contains(self: *Treap, t_arg: Node, key: Key) bool { + const t = t_arg orelse return false; + return switch (std.mem.order(u8, key, t.key)) { + .eq => true, + .lt => self.contains(t.left, key), + .gt => self.contains(t.right, key), + }; +} + +test add { + var treap = Treap.init(std.testing.allocator); + defer treap.deinit(); + + const tree1 = try treap.addNodeTo(null, "1"); + const tree2 = try treap.addNodeTo(tree1, "2"); + const tree3 = try treap.addNodeTo(tree2, "3"); + const tree4 = try treap.addNodeTo(tree3, "4"); + + try std.testing.expect(treap.contains(tree1, "1")); + try std.testing.expect(!treap.contains(tree1, "2")); + try std.testing.expect(treap.contains(tree2, "1")); + try std.testing.expect(treap.contains(tree2, "2")); + try std.testing.expect(!treap.contains(tree2, "3")); + try std.testing.expect(treap.contains(tree3, "1")); + try std.testing.expect(treap.contains(tree3, "2")); + try std.testing.expect(treap.contains(tree3, "3")); + try std.testing.expect(!treap.contains(tree3, "4")); + + try std.testing.expect(treap.contains(tree4, "4")); +} diff --git a/src/aro/Tree.zig b/src/aro/Tree.zig index f176930a..5756d37d 100644 --- a/src/aro/Tree.zig +++ b/src/aro/Tree.zig @@ -6,11 +6,19 @@ const Compilation = @import("Compilation.zig"); const number_affixes = @import("Tree/number_affixes.zig"); const Source = @import("Source.zig"); const Tokenizer = @import("Tokenizer.zig"); +const Treap = @import("Treap.zig"); const Type = @import("Type.zig"); const Value = @import("Value.zig"); const StringInterner = @import("StringInterner.zig"); +const Flags = packed struct(u8) { + is_bol: bool = false, + space: bool = false, + _: u6 = undefined, +}; + pub const Token = struct { + flags: Flags, id: Id, loc: Source.Location, @@ -20,56 +28,68 @@ pub const Token = struct { pub const NumberSuffix = number_affixes.Suffix; }; +const LocList = std.SinglyLinkedList(Source.Location); + pub const TokenWithExpansionLocs = struct { - id: Token.Id, - flags: packed struct { - expansion_disabled: bool = false, - is_macro_arg: bool = false, - } = .{}, - /// This location contains the actual token slice which might be generated. - /// If it is generated then there is guaranteed to be at least one - /// expansion location. 
+ const Self = @This(); + + flags: Flags = .{}, + id: Tokenizer.Token.Id, + hideset: Treap.Node = null, loc: Source.Location, expansion_locs: ?[*]Source.Location = null, + loc_list: LocList = .{}, + + pub fn toTreeToken(self: Self) Token { + return .{ .flags = self.flags, .id = self.id, .loc = self.loc }; + } + + pub fn argPosition(self: Self) u32 { + std.debug.assert(self.id == .macro_param); + return self.loc.byte_offset; + } + + pub fn isVarArg(self: Self) bool { + std.debug.assert(self.id == .macro_param); + return self.loc.line != 0; + } - pub fn expansionSlice(tok: TokenWithExpansionLocs) []const Source.Location { + pub fn expansionSlice(tok: Self) []const Source.Location { const locs = tok.expansion_locs orelse return &[0]Source.Location{}; var i: usize = 0; while (locs[i].id != .unused) : (i += 1) {} return locs[0..i]; } - pub fn addExpansionLocation(tok: *TokenWithExpansionLocs, gpa: std.mem.Allocator, new: []const Source.Location) !void { - if (new.len == 0 or tok.id == .whitespace or tok.id == .macro_ws or tok.id == .placemarker) return; - var list = std.ArrayList(Source.Location).init(gpa); - defer { - @memset(list.items.ptr[list.items.len..list.capacity], .{}); - // Add a sentinel to indicate the end of the list since - // the ArrayList's capacity isn't guaranteed to be exactly - // what we ask for. - if (list.capacity > 0) { - list.items.ptr[list.capacity - 1].byte_offset = 1; - } - tok.expansion_locs = list.items.ptr; - } - - if (tok.expansion_locs) |locs| { - var i: usize = 0; - while (locs[i].id != .unused) : (i += 1) {} - list.items = locs[0..i]; - while (locs[i].byte_offset != 1) : (i += 1) {} - list.capacity = i + 1; - } - - const min_len = @max(list.items.len + new.len + 1, 4); - const wanted_len = std.math.ceilPowerOfTwo(usize, min_len) catch - return error.OutOfMemory; - try list.ensureTotalCapacity(wanted_len); + /// Expansion-location tracking is not wired up yet in this rewrite; currently a no-op. + pub fn addExpansionLocationList(tok: *Self, gpa: std.mem.Allocator, list: LocList) !void { + _ = tok; + _ = gpa; + _ = list; + } - for (new) |new_loc| { - if (new_loc.id == .generated) continue; - list.appendAssumeCapacity(new_loc); - } + /// See addExpansionLocationList; also currently a no-op. + pub fn addExpansionLocation(tok: *Self, gpa: std.mem.Allocator, loc: Source.Location) !void { + _ = tok; + _ = gpa; + _ = loc; } pub fn free(expansion_locs: ?[*]Source.Location, gpa: std.mem.Allocator) void { @@ -80,14 +100,14 @@ pub const TokenWithExpansionLocs = struct { gpa.free(locs[0 .. 
i + 1]); } - pub fn dupe(tok: TokenWithExpansionLocs, gpa: std.mem.Allocator) !TokenWithExpansionLocs { + pub fn dupe(tok: Self, gpa: std.mem.Allocator) !Self { var copy = tok; copy.expansion_locs = null; try copy.addExpansionLocation(gpa, tok.expansionSlice()); return copy; } - pub fn checkMsEof(tok: TokenWithExpansionLocs, source: Source, comp: *Compilation) !void { + pub fn checkMsEof(tok: Self, source: Source, comp: *Compilation) !void { std.debug.assert(tok.id == .eof); if (source.buf.len > tok.loc.byte_offset and source.buf[tok.loc.byte_offset] == 0x1A) { try comp.addDiagnostic(.{ @@ -100,6 +120,9 @@ pub const TokenWithExpansionLocs = struct { }, &.{}); } } + + pub const one: Self = .{ .id = .one, .loc = .{} }; + pub const zero: Self = .{ .id = .zero, .loc = .{} }; }; pub const TokenIndex = u32; diff --git a/src/aro/pragmas/gcc.zig b/src/aro/pragmas/gcc.zig index 91ab750b..8887b632 100644 --- a/src/aro/pragmas/gcc.zig +++ b/src/aro/pragmas/gcc.zig @@ -69,7 +69,7 @@ fn diagnosticHandler(self: *GCC, pp: *Preprocessor, start_idx: TokenIndex) Pragm const diagnostic_tok = pp.tokens.get(start_idx); if (diagnostic_tok.id == .nl) return; - const diagnostic = std.meta.stringToEnum(Directive.Diagnostics, pp.expandedSlice(diagnostic_tok)) orelse + const diagnostic = std.meta.stringToEnum(Directive.Diagnostics, pp.tokSlice(diagnostic_tok)) orelse return error.UnknownPragma; switch (diagnostic) { @@ -112,7 +112,7 @@ fn preprocessorHandler(pragma: *Pragma, pp: *Preprocessor, start_idx: TokenIndex const directive_tok = pp.tokens.get(start_idx + 1); if (directive_tok.id == .nl) return; - const gcc_pragma = std.meta.stringToEnum(Directive, pp.expandedSlice(directive_tok)) orelse + const gcc_pragma = std.meta.stringToEnum(Directive, pp.tokSlice(directive_tok)) orelse return pp.comp.addDiagnostic(.{ .tag = .unknown_gcc_pragma, .loc = directive_tok.loc, @@ -159,7 +159,7 @@ fn preprocessorHandler(pragma: *Pragma, pp: *Preprocessor, start_idx: TokenIndex .loc = tok.loc, }, pp.expansionSlice(start_idx + i)); } - const str = pp.expandedSlice(tok); + const str = pp.tokSlice(tok); if (pp.defines.get(str) != null) { try pp.comp.addDiagnostic(.{ .tag = .pragma_poison_macro, @@ -177,7 +177,7 @@ fn parserHandler(pragma: *Pragma, p: *Parser, start_idx: TokenIndex) Compilation var self: *GCC = @fieldParentPtr("pragma", pragma); const directive_tok = p.pp.tokens.get(start_idx + 1); if (directive_tok.id == .nl) return; - const name = p.pp.expandedSlice(directive_tok); + const name = p.pp.tokSlice(directive_tok); if (mem.eql(u8, name, "diagnostic")) { return self.diagnosticHandler(p.pp, start_idx + 2) catch |err| switch (err) { error.UnknownPragma => {}, // handled during preprocessing @@ -190,7 +190,7 @@ fn parserHandler(pragma: *Pragma, p: *Parser, start_idx: TokenIndex) Compilation fn preserveTokens(_: *Pragma, pp: *Preprocessor, start_idx: TokenIndex) bool { const next = pp.tokens.get(start_idx + 1); if (next.id != .nl) { - const name = pp.expandedSlice(next); + const name = pp.tokSlice(next); if (mem.eql(u8, name, "poison")) { return false; } diff --git a/test/cases/expanded/standard-redefinition-reexamination-example.c b/test/cases/expanded/standard-redefinition-reexamination-example.c index 5e5688ce..a9673c54 100644 --- a/test/cases/expanded/standard-redefinition-reexamination-example.c +++ b/test/cases/expanded/standard-redefinition-reexamination-example.c @@ -1,5 +1,4 @@ f(2 * (y+1)) + f(2 * (f(2 * (z[0])))) % f(2 * (0)) + t(1); -f(2 * (2+(3,4)-0,1)) | f(2 * (~ 5)) & f(2 * (0,1)) -^m(0,1); -int i[] = { 1, 
23, 4, 5, }; +f(2 * (2+(3,4)-0,1)) | f(2 * (~ 5)) & f(2 * (0,1))^m(0,1); +int i[] = { 1, 23, 4, 5, }; char c[2][6] = { "hello", "" }; diff --git a/test/runner.zig b/test/runner.zig index a1d6cbf7..8e8a0fff 100644 --- a/test/runner.zig +++ b/test/runner.zig @@ -239,7 +239,7 @@ pub fn main() !void { try pp.addToken(eof); if (pp.defines.get("TESTS_SKIPPED")) |macro| { - if (macro.is_func or macro.tokens.len != 1 or macro.tokens[0].id != .pp_num) { + if (macro.kind == .func or macro.tokens.len != 1 or macro.tokens[0].id != .pp_num) { fail_count += 1; std.debug.print("invalid TESTS_SKIPPED, definition should contain exactly one integer literal {}\n", .{macro}); continue; } @@ -380,7 +380,7 @@ pub fn main() !void { if (pp.defines.get("EXPECTED_OUTPUT")) |macro| blk: { if (comp.diagnostics.errors != 0) break :blk; - if (macro.is_func) { + if (macro.kind == .func) { fail_count += 1; std.debug.print("invalid EXPECTED_OUTPUT {}\n", .{macro}); continue; @@ -470,7 +470,7 @@ fn checkExpectedErrors(pp: *aro.Preprocessor, buf: *std.ArrayList(u8)) !?bool { defer m.deinit(); aro.Diagnostics.renderMessages(pp.comp, &m); - if (macro.is_func) { + if (macro.kind == .func) { std.debug.print("invalid EXPECTED_ERRORS {}\n", .{macro}); return false; }