From 7703d8738636ba7cac62f207edb132f606e10f48 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 18 Jul 2024 14:31:43 -0700 Subject: [PATCH 01/10] zig update: Update signature for BuildStep makeFn --- build/GenerateDef.zig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/GenerateDef.zig b/build/GenerateDef.zig index 9c6a4d6c..508a629f 100644 --- a/build/GenerateDef.zig +++ b/build/GenerateDef.zig @@ -50,8 +50,8 @@ pub fn create(owner: *std.Build, options: Options) std.Build.Module.Import { }; } -fn make(step: *Step, prog_node: std.Progress.Node) !void { - _ = prog_node; +fn make(step: *Step, options: std.Build.Step.MakeOptions) !void { + _ = options; const b = step.owner; const self: *GenerateDef = @fieldParentPtr("step", step); const arena = b.allocator; From c22f743b0a778c9ebbfe1c179c7c2fec5ce1021a Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Tue, 14 May 2024 11:41:39 -0700 Subject: [PATCH 02/10] wip new preprocessor --- src/aro.zig | 1 + src/aro/Diagnostics/messages.def | 10 + src/aro/Driver.zig | 45 + src/aro/NewPreprocessor.zig | 2097 ++++++++++++++++++++++++++++++ src/aro/Tokenizer.zig | 17 + src/aro/Treap.zig | 165 +++ 6 files changed, 2335 insertions(+) create mode 100644 src/aro/NewPreprocessor.zig create mode 100644 src/aro/Treap.zig diff --git a/src/aro.zig b/src/aro.zig index a9b76e0e..45e90154 100644 --- a/src/aro.zig +++ b/src/aro.zig @@ -35,5 +35,6 @@ test { _ = @import("aro/target.zig"); _ = @import("aro/Tokenizer.zig"); _ = @import("aro/toolchains/Linux.zig"); + _ = @import("aro/treap.zig"); _ = @import("aro/Value.zig"); } diff --git a/src/aro/Diagnostics/messages.def b/src/aro/Diagnostics/messages.def index 119e72a2..7700e1b8 100644 --- a/src/aro/Diagnostics/messages.def +++ b/src/aro/Diagnostics/messages.def @@ -2509,3 +2509,13 @@ auto_type_self_initialized .msg = "variable '{s}' declared with deduced type '__auto_type' cannot appear in its own initializer" .extra = .str .kind = .@"error" + +expected_left_angle_bracket + .msg = "expected '<' but got '{s}'" + .extra = .str + .kind = .@"error" + +closing_paren_after + .msg = "expected '(' after '{s}'" + .extra = .str + .kind = .@"error" diff --git a/src/aro/Driver.zig b/src/aro/Driver.zig index db0b1155..2d383bf0 100644 --- a/src/aro/Driver.zig +++ b/src/aro/Driver.zig @@ -9,6 +9,7 @@ const Compilation = @import("Compilation.zig"); const Diagnostics = @import("Diagnostics.zig"); const LangOpts = @import("LangOpts.zig"); const Preprocessor = @import("Preprocessor.zig"); +const NewPreprocessor = @import("NewPreprocessor.zig"); const Source = @import("Source.zig"); const Toolchain = @import("Toolchain.zig"); const target_util = @import("target.zig"); @@ -36,6 +37,7 @@ line_commands: bool = true, /// If true, use `#line ` instead of `# ` for line directives use_line_directives: bool = false, only_preprocess: bool = false, +new_preprocessor: bool = false, only_syntax: bool = false, only_compile: bool = false, only_preprocess_and_compile: bool = false, @@ -236,6 +238,8 @@ pub fn parseArgs( d.only_compile = true; } else if (mem.eql(u8, arg, "-E")) { d.only_preprocess = true; + } else if (mem.eql(u8, arg, "-fnew-preprocessor")) { + d.new_preprocessor = true; } else if (mem.eql(u8, arg, "-P") or mem.eql(u8, arg, "--no-line-commands")) { d.line_commands = false; } else if (mem.eql(u8, arg, "-fuse-line-directives")) { @@ -630,6 +634,47 @@ fn processSource( comptime fast_exit: bool, ) !void { d.comp.generated_buf.items.len = 0; + if (d.new_preprocessor) { + var pp = try 
NewPreprocessor.initDefault(d.comp); + defer pp.deinit(); + if (d.comp.langopts.ms_extensions) { + d.comp.ms_cwd_source_id = source.id; + } + + if (d.verbose_pp) pp.verbose = true; + if (d.only_preprocess) { + pp.preserve_whitespace = true; + if (d.line_commands) { + pp.linemarkers = if (d.use_line_directives) .line_directives else .numeric_directives; + } + } + + try pp.preprocessSources(&.{ source, builtin, user_macros }); + + d.renderErrors(); + + if (d.comp.diagnostics.errors != 0) { + if (fast_exit) std.process.exit(1); // Not linking, no need for cleanup. + return; + } + + const file = if (d.output_name) |some| + std.fs.cwd().createFile(some, .{}) catch |er| + return d.fatal("unable to create output file '{s}': {s}", .{ some, errorDescription(er) }) + else + std.io.getStdOut(); + defer if (d.output_name != null) file.close(); + + var buf_w = std.io.bufferedWriter(file.writer()); + pp.prettyPrintTokens(buf_w.writer()) catch |er| + return d.fatal("unable to write result: {s}", .{errorDescription(er)}); + + buf_w.flush() catch |er| + return d.fatal("unable to write result: {s}", .{errorDescription(er)}); + + std.process.exit(0); // Not linking, no need for cleanup. + return; + } var pp = try Preprocessor.initDefault(d.comp); defer pp.deinit(); diff --git a/src/aro/NewPreprocessor.zig b/src/aro/NewPreprocessor.zig new file mode 100644 index 00000000..77442d26 --- /dev/null +++ b/src/aro/NewPreprocessor.zig @@ -0,0 +1,2097 @@ +const std = @import("std"); +const mem = std.mem; +const Allocator = mem.Allocator; +const assert = std.debug.assert; +const Compilation = @import("Compilation.zig"); +const Error = Compilation.Error; +const Source = @import("Source.zig"); +const Tokenizer = @import("Tokenizer.zig"); +const RawToken = Tokenizer.Token; +const Parser = @import("Parser.zig"); +const Diagnostics = @import("Diagnostics.zig"); +const Tree = @import("Tree.zig"); +const Token = Tree.Token; +const TokenWithExpansionLocs = Tree.TokenWithExpansionLocs; +const Attribute = @import("Attribute.zig"); +const features = @import("features.zig"); +const OldPreprocessor = @import("Preprocessor.zig"); +const Treap = @import("treap.zig"); + +const ParamMap = std.StringHashMapUnmanaged(PreprocessorToken); +const DefineMap = std.StringHashMapUnmanaged(Macro); + +const TokenList = std.ArrayListUnmanaged(PreprocessorToken); +const max_include_depth = 200; + +/// Errors that can be returned when expanding a macro. 
+/// error.UnknownPragma can occur within Preprocessor.pragma() but +/// it is handled there and doesn't escape that function +const MacroError = Error || error{StopPreprocessing}; + +const PreprocessingError = Error || error{PreprocessingFailed}; + +const SpecialMacroFn = fn (*Preprocessor, PreprocessorToken) Error!void; + +fn Range(comptime T: type) type { + return struct { + const Self = @This(); + const Item = T; + + start: u32, + end: u32, + const empty: Self = .{ .start = 0, .end = 0 }; + + fn len(self: Self) u32 { + return self.end - self.start; + } + + fn slice(self: Self, items: []const Item) []const Item { + return items[self.start..self.end]; + } + }; +} + +/// Each macro argument is a list of tokens (represented as a range of Preprocessor.macro_arg_tokens) +const MacroArg = Range(PreprocessorToken); + +/// List of MacroArg's for a macro invocation (represented as a range of Preprocessor.macro_args) +const MacroArgList = Range(MacroArg); + +const PreprocessorToken = struct { + flags: packed struct(u8) { + is_bol: bool = false, + space: bool = false, + _: u6 = undefined, + } = .{}, + id: Tokenizer.Token.Id, + hideset: Treap.Node = null, + loc: Source.Location, + expansion_locs: ?[*]Source.Location = null, + + fn argPosition(self: PreprocessorToken) u32 { + std.debug.assert(self.id == .macro_param); + return self.loc.byte_offset; + } + + fn isVarArg(self: PreprocessorToken) bool { + std.debug.assert(self.id == .macro_param); + return self.loc.line != 0; + } + + pub fn expansionSlice(tok: PreprocessorToken) []const Source.Location { + const locs = tok.expansion_locs orelse return &[0]Source.Location{}; + var i: usize = 0; + while (locs[i].id != .unused) : (i += 1) {} + return locs[0..i]; + } + + pub fn addExpansionLocation(tok: *PreprocessorToken, gpa: std.mem.Allocator, new: []const Source.Location) !void { + if (new.len == 0 or tok.id == .whitespace or tok.id == .macro_ws or tok.id == .placemarker) return; + var list = std.ArrayList(Source.Location).init(gpa); + defer { + @memset(list.items.ptr[list.items.len..list.capacity], .{}); + // Add a sentinel to indicate the end of the list since + // the ArrayList's capacity isn't guaranteed to be exactly + // what we ask for. + if (list.capacity > 0) { + list.items.ptr[list.capacity - 1].byte_offset = 1; + } + tok.expansion_locs = list.items.ptr; + } + + if (tok.expansion_locs) |locs| { + var i: usize = 0; + while (locs[i].id != .unused) : (i += 1) {} + list.items = locs[0..i]; + while (locs[i].byte_offset != 1) : (i += 1) {} + list.capacity = i + 1; + } + + const min_len = @max(list.items.len + new.len + 1, 4); + const wanted_len = std.math.ceilPowerOfTwo(usize, min_len) catch + return error.OutOfMemory; + try list.ensureTotalCapacity(wanted_len); + + for (new) |new_loc| { + if (new_loc.id == .generated) continue; + list.appendAssumeCapacity(new_loc); + } + } + + pub fn free(expansion_locs: ?[*]Source.Location, gpa: std.mem.Allocator) void { + const locs = expansion_locs orelse return; + var i: usize = 0; + while (locs[i].id != .unused) : (i += 1) {} + while (locs[i].byte_offset != 1) : (i += 1) {} + gpa.free(locs[0 .. 
i + 1]); + } + + pub fn dupe(tok: PreprocessorToken, gpa: std.mem.Allocator) !PreprocessorToken { + var copy = tok; + copy.expansion_locs = null; + try copy.addExpansionLocation(gpa, tok.expansionSlice()); + return copy; + } + + pub fn checkMsEof(tok: PreprocessorToken, source: Source, comp: *Compilation) !void { + std.debug.assert(tok.id == .eof); + if (source.buf.len > tok.loc.byte_offset and source.buf[tok.loc.byte_offset] == 0x1A) { + try comp.addDiagnostic(.{ + .tag = .ctrl_z_eof, + .loc = .{ + .id = source.id, + .byte_offset = tok.loc.byte_offset, + .line = tok.loc.line, + }, + }, &.{}); + } + } + + const one: PreprocessorToken = .{ .id = .one, .loc = .{} }; + const zero: PreprocessorToken = .{ .id = .zero, .loc = .{} }; +}; + +const Macro = struct { + /// Tokens constituting the macro body + tokens: []const PreprocessorToken, + + /// Number of arguments for function-like macros + nargs: usize, + + /// If the function type macro has variable number of arguments + var_args: bool, + + /// Location of macro in the source + loc: Source.Location, + + kind: Kind, + + const Kind = union(enum) { + object, + func, + special: *const SpecialMacroFn, + }; + + fn eql(a: Macro, b: Macro, pp: *Preprocessor) bool { + if ((a.kind == .object and b.kind != .object) or (a.kind == .func and b.kind != .func)) return false; + if (!std.meta.eql(a.kind, b.kind)) return false; + if (a.tokens.len != b.tokens.len) return false; + for (a.tokens, b.tokens) |a_tok, b_tok| if (!tokEql(pp, a_tok, b_tok)) return false; + + if (a.kind == .func) { + if (a.var_args != b.var_args) return false; + } + + return true; + } + + fn tokEql(pp: *Preprocessor, a: PreprocessorToken, b: PreprocessorToken) bool { + return mem.eql(u8, pp.tokSlice(a), pp.tokSlice(b)); + } +}; + +const Preprocessor = @This(); + +const ExpansionEntry = struct { + idx: Tree.TokenIndex, + locs: [*]Source.Location, +}; + +const TokenState = struct { + tokens_len: usize, + expansion_entries_len: usize, +}; + +comp: *Compilation, +gpa: mem.Allocator, +arena: std.heap.ArenaAllocator, + +tokens: std.MultiArrayList(PreprocessorToken) = .{}, +/// Do not directly mutate this; must be kept in sync with `tokens` +expansion_entries: std.MultiArrayList(ExpansionEntry) = .{}, + +/// Map from Source.Id to macro name in the `#ifndef` condition which guards the source, if any +include_guards: std.AutoHashMapUnmanaged(Source.Id, []const u8) = .{}, + +char_buf: std.ArrayListUnmanaged(u8) = .{}, + +/// Dump current state to stderr. +verbose: bool = false, +preserve_whitespace: bool = false, + +/// linemarker tokens. Must be .none unless in -E mode (parser does not handle linemarkers) +linemarkers: Linemarkers = .none, + +tokenizers: std.ArrayListUnmanaged(Tokenizer) = .{}, + +expansion_bufs: std.ArrayListUnmanaged(TokenList) = .{}, + +defines: DefineMap = .{}, + +generated_line: u32 = 1, + +counter: u32 = 0, + +if_level: u8 = 0, + +if_kind: std.PackedIntArray(u2, 256) = blk: { + @setEvalBranchQuota(2000); + break :blk std.PackedIntArray(u2, 256).initAllTo(0); +}, + +guard_stack: std.ArrayListUnmanaged(?[]const u8) = .{}, + +macro_arg_tokens: std.ArrayListUnmanaged(MacroArg.Item) = .{}, +macro_args: std.ArrayListUnmanaged(MacroArgList.Item) = .{}, + +safe_strings: std.StringHashMapUnmanaged(void) = .{}, + +treap: Treap, + +pub const parse = Parser.parse; + +pub const Linemarkers = enum { + /// No linemarker tokens. 
Required setting if parser will run + none, + /// #line "filename" + line_directives, + /// # "filename" flags + numeric_directives, +}; + +pub fn init(comp: *Compilation) Preprocessor { + const pp = Preprocessor{ + .comp = comp, + .gpa = comp.gpa, + .arena = std.heap.ArenaAllocator.init(comp.gpa), + .treap = Treap.init(comp.gpa), + }; + comp.pragmaEvent(.before_preprocess); + return pp; +} + +fn addBuiltinMacro(pp: *Preprocessor, name: []const u8, func: *const SpecialMacroFn) !void { + try pp.defines.putNoClobber(pp.gpa, name, .{ + .tokens = &.{}, + .var_args = false, + .loc = .{ .id = .generated }, + .kind = .{ .special = func }, + .nargs = 0, + }); +} + +fn handleLineMacro(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + const start = pp.comp.generated_buf.items.len; + const source = pp.comp.getSource(tok.loc.id); + const w = pp.comp.generated_buf.writer(pp.gpa); + try w.print("{d}\n", .{source.physicalLine(tok.loc)}); + const pasted_tok = try pp.makeGeneratedToken(start, .pp_num, tok); + return pp.ungetToken(pasted_tok); +} + +fn handleFileMacro(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + const start = pp.comp.generated_buf.items.len; + const source = pp.comp.getSource(tok.loc.id); + const w = pp.comp.generated_buf.writer(pp.gpa); + try w.print("\"{s}\"\n", .{source.path}); + const pasted_tok = try pp.makeGeneratedToken(start, .string_literal, tok); + return pp.ungetToken(pasted_tok); +} + +fn handleCounterMacro(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + defer pp.counter += 1; + const start = pp.comp.generated_buf.items.len; + const w = pp.comp.generated_buf.writer(pp.gpa); + try w.print("{d}\n", .{pp.counter}); + const pasted_tok = try pp.makeGeneratedToken(start, .pp_num, tok); + return pp.ungetToken(pasted_tok); +} + +fn makeGeneratedToken(pp: *Preprocessor, start: usize, id: Token.Id, source: PreprocessorToken) !PreprocessorToken { + const pasted_token = PreprocessorToken{ .id = id, .flags = source.flags, .loc = .{ + .id = .generated, + .byte_offset = @intCast(start), + .line = pp.generated_line, + } }; + pp.generated_line += 1; + // try pasted_token.addExpansionLocation(pp.gpa, &.{source.loc}); + // try pasted_token.addExpansionLocation(pp.gpa, source.expansionSlice()); + return pasted_token; +} + +fn errStr(pp: *Preprocessor, tok: PreprocessorToken, tag: Diagnostics.Tag, str: []const u8) !void { + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = tok.loc, + .extra = .{ .str = str }, + }, &.{}); // todo expansion slice +} + +fn errTok(pp: *Preprocessor, tok: PreprocessorToken, tag: Diagnostics.Tag) !void { + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = tok.loc, + .extra = .{ .none = {} }, + }, &.{}); // todo expansion slice +} + +fn expectClosing(pp: *Preprocessor, opening: PreprocessorToken, id: Token.Id) !void { + // todo: fix expect + const item = try pp.expect(id, .closing_paren); + if (item.id != id) { + try pp.errTok(opening, .to_match_paren); + } +} + +fn tokFromBool(b: bool) PreprocessorToken { + return if (b) PreprocessorToken.one else PreprocessorToken.zero; +} + +fn handleHasAttribute(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + _ = tok; + const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); + + const has_attr = Attribute.fromString(.gnu, null, pp.tokSlice(attr_name)) != null; + return pp.ungetToken(tokFromBool(has_attr)); +} + +fn handleHasCAttribute(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void 
{ + _ = macro_tok; + const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); + var r: TokenList = .{}; + defer r.deinit(pp.gpa); + + var tok: PreprocessorToken = undefined; + while (true) { + tok = try pp.readToken(); + if (tok.id == .comment) continue; + if (tok.id.isDirectiveEnd() or tok.id == .r_paren) break; + try r.append(pp.gpa, tok); + } + try pp.expectClosing(l_paren, .r_paren); +} + +fn handleHasDeclSpecAttribute(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + _ = tok; + const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); + + const ident_str = pp.tokSlice(attr_name); + const has_attr = if (pp.comp.langopts.declspec_attrs) Attribute.fromString(.declspec, null, ident_str) != null else false; + return pp.ungetToken(tokFromBool(has_attr)); +} + +fn handleHasFeature(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + _ = tok; + const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); + + const ident_str = pp.tokSlice(attr_name); + const has_feature = features.hasFeature(pp.comp, ident_str); + return pp.ungetToken(tokFromBool(has_feature)); +} + +fn handleHasExtension(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + _ = tok; + const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); + + const ident_str = pp.tokSlice(attr_name); + const has_extension = features.hasExtension(pp.comp, ident_str); + return pp.ungetToken(tokFromBool(has_extension)); +} + +fn handleHasBuiltin(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + _ = tok; + const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); + + const ident_str = pp.tokSlice(attr_name); + const has_builtin = pp.comp.hasBuiltin(ident_str); + return pp.ungetToken(tokFromBool(has_builtin)); +} + +fn handleHasWarning(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); + const start = pp.char_buf.items.len; + defer pp.char_buf.items.len = start; + + while (true) { + const tok = try pp.readExpandNewline(); + switch (tok.id) { + .nl, .eof => { + try pp.errTok(tok, .unterminated_macro_arg_list); + return pp.ungetToken(PreprocessorToken.zero); + }, + .r_paren => break, + .string_literal => { + const string = pp.tokSlice(tok); + try pp.char_buf.appendSlice(pp.gpa, string[1 .. 
string.len - 1]); + }, + else => { + pp.skipToNl(); + try pp.errTok(tok, .missing_paren_param_list); + try pp.errTok(l_paren, .to_match_paren); + return pp.ungetToken(PreprocessorToken.zero); + }, + } + } + const actual_param = pp.char_buf.items[start..]; + if (actual_param.len == 0) { + try pp.comp.addDiagnostic(.{ + .tag = .expected_arguments, + .loc = macro_tok.loc, + .extra = .{ .arguments = .{ .expected = 1, .actual = 0 } }, + }, &.{}); // todo expansion slice + return pp.ungetToken(PreprocessorToken.zero); + } + if (!mem.startsWith(u8, actual_param, "-W")) { + try pp.errStr(l_paren, .malformed_warning_check, "__has_warning"); + return pp.ungetToken(PreprocessorToken.zero); + } + const warning_name = actual_param[2..]; + const exists = Diagnostics.warningExists(warning_name); + return pp.ungetToken(tokFromBool(exists)); +} + +fn handleHasInclude(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + return pp.handleHasIncludeExtra(macro_tok, .first); +} + +fn handleHasIncludeNext(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + return pp.handleHasIncludeExtra(macro_tok, .next); +} + +fn handleHasIncludeExtra(pp: *Preprocessor, macro_tok: PreprocessorToken, which: Compilation.WhichInclude) Error!void { + const l_paren = pp.getToken(); + if (l_paren.id != .l_paren) { + pp.skipToNl(); + return; + } + + var is_std: bool = undefined; + const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { + error.InvalidInclude => return pp.ungetToken(PreprocessorToken.zero), + else => |e| return e, + }; + try pp.expectClosing(l_paren, .r_paren); + + const filename = include_str[1 .. include_str.len - 1]; + const include_type: Compilation.IncludeType = switch (include_str[0]) { + '"' => .quotes, + '<' => .angle_brackets, + else => unreachable, + }; + + if (which == .first or pp.includeDepth() == 0) { + if (which == .next) { + try pp.comp.addDiagnostic(.{ + .tag = .include_next_outside_header, + .loc = macro_tok.loc, + }, &.{}); + } + const has = try pp.comp.hasInclude(filename, macro_tok.loc.id, include_type, .first); + return pp.ungetToken(tokFromBool(has)); + } + const has = try pp.comp.hasInclude(filename, macro_tok.loc.id, include_type, .next); + return pp.ungetToken(tokFromBool(has)); +} + +fn includeDepth(pp: *Preprocessor) usize { + return pp.tokenizers.items.len - 1; +} + +fn hasEmbedValue(contents_arg: ?[]const u8) []const u8 { + const contents = contents_arg orelse return "0\n"; + if (contents.len == 0) return "2\n"; + return "1\n"; +} + +/// TODO: handle limit/prefix/suffix/etc +fn handleHasEmbed(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + const l_paren = pp.getToken(); + if (l_paren.id != .l_paren) { + pp.skipToNl(); + return; + } + + var is_std: bool = undefined; + const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { + error.InvalidInclude => return, + else => |e| return e, + }; + try pp.expectClosing(l_paren, .r_paren); + + const filename = include_str[1 .. 
include_str.len - 1]; + const include_type: Compilation.IncludeType = switch (include_str[0]) { + '"' => .quotes, + '<' => .angle_brackets, + else => unreachable, + }; + + const contents = try pp.comp.findEmbed(filename, macro_tok.loc.id, include_type, 1); + const result = hasEmbedValue(contents); + const start = pp.comp.generated_buf.items.len; + try pp.comp.generated_buf.appendSlice(pp.comp.gpa, result); + const pasted_tok = try pp.makeGeneratedToken(start, .pp_num, macro_tok); + return pp.ungetToken(pasted_tok); +} + +// Skip until newline, ignore other tokens. +fn skipToNl(pp: *Preprocessor) void { + while (true) { + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) return; + } +} + +fn readOneIdentifierArgument(pp: *Preprocessor, macro_tok: PreprocessorToken) !?PreprocessorToken { + const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); + _ = l_paren; + var invalid: ?PreprocessorToken = null; + var identifier: ?PreprocessorToken = null; + while (true) { + var tok = pp.getToken(); + tok.id.simplifyMacroKeywordExtra(true); + + switch (tok.id) { + .r_paren, .eof => break, + else => { + if (identifier) |_| invalid = tok else identifier = tok; + }, + } + } + if (invalid) |some| { + try pp.comp.addDiagnostic(.{ + .tag = .missing_tok_builtin, + .loc = some.loc, + .extra = .{ .tok_id_expected = .r_paren }, + }, &.{}); // TODO: expansion slice + return null; + } + if (identifier) |ident| { + if (ident.id == .identifier or ident.id == .extended_identifier) return ident; + } else { + const extra: Diagnostics.Message.Extra = .{ .arguments = .{ .expected = 1, .actual = 0 } }; + try pp.comp.addDiagnostic(.{ .tag = .expected_arguments, .loc = macro_tok.loc, .extra = extra }, &.{}); + } + return null; +} + +fn handleIsIdentifier(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + if (try pp.readOneIdentifierArgument(macro_tok)) |_| { + return pp.ungetToken(PreprocessorToken.one); + } else { + return pp.ungetToken(PreprocessorToken.zero); + } +} + +fn handlePragmaOperator(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + _ = pp; + _ = macro_tok; + // TODO +} + +fn addBuiltinMacros(pp: *Preprocessor) !void { + try pp.addBuiltinMacro("__has_attribute", handleHasAttribute); + try pp.addBuiltinMacro("__has_c_attribute", handleHasCAttribute); + try pp.addBuiltinMacro("__has_declspec_attribute", handleHasDeclSpecAttribute); + try pp.addBuiltinMacro("__has_feature", handleHasFeature); + try pp.addBuiltinMacro("__has_extension", handleHasExtension); + try pp.addBuiltinMacro("__has_builtin", handleHasBuiltin); + try pp.addBuiltinMacro("__has_warning", handleHasWarning); + try pp.addBuiltinMacro("__has_include", handleHasInclude); + try pp.addBuiltinMacro("__has_include_next", handleHasIncludeNext); + try pp.addBuiltinMacro("__has_embed", handleHasEmbed); + + try pp.addBuiltinMacro("__is_identifier", handleIsIdentifier); + + try pp.addBuiltinMacro("__FILE__", handleFileMacro); + try pp.addBuiltinMacro("__LINE__", handleLineMacro); + try pp.addBuiltinMacro("__COUNTER__", handleCounterMacro); + try pp.addBuiltinMacro("_Pragma", handlePragmaOperator); +} + +/// Initialize Preprocessor with builtin macros. 
+pub fn initDefault(comp: *Compilation) !Preprocessor { + var pp = init(comp); + errdefer pp.deinit(); + try pp.addBuiltinMacros(); + return pp; +} + +pub fn deinit(pp: *Preprocessor) void { + pp.arena.deinit(); + pp.include_guards.deinit(pp.gpa); + pp.tokens.deinit(pp.gpa); + pp.tokenizers.deinit(pp.gpa); + for (pp.expansion_bufs.items) |*toklist| { + toklist.deinit(pp.gpa); + } + pp.expansion_bufs.deinit(pp.gpa); + pp.defines.deinit(pp.gpa); + pp.char_buf.deinit(pp.gpa); + for (pp.expansion_entries.items(.locs)) |locs| PreprocessorToken.free(locs, pp.gpa); + pp.expansion_entries.deinit(pp.gpa); + pp.guard_stack.deinit(pp.gpa); + pp.macro_arg_tokens.deinit(pp.gpa); + pp.macro_args.deinit(pp.gpa); + pp.safe_strings.deinit(pp.gpa); + pp.treap.deinit(); +} + +/// Preprocess a compilation unit of sources into a parsable list of tokens. +pub fn preprocessSources(pp: *Preprocessor, sources: []const Source) Error!void { + assert(sources.len > 1); + const first = sources[0]; + + for (sources[1..]) |header| { + _ = try pp.preprocess(header); + } + const eof = try pp.preprocess(first); + try pp.addToken(eof); +} + +fn propagateSpace(pp: *Preprocessor, tokens: []PreprocessorToken, template: PreprocessorToken) void { + if (tokens.len > 0) { + tokens[0].flags = template.flags; + } else { + pp.injectSpace(); + } +} + +fn ungetAll(pp: *Preprocessor, tokens: []const PreprocessorToken) !void { + if (tokens.len == 0) return; + const start = pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items.len; + try pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].appendSlice(pp.gpa, tokens); + std.mem.reverse(PreprocessorToken, pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items[start..]); +} + +fn addHideSet(pp: *Preprocessor, toks: []PreprocessorToken, hideset: Treap.Node) !void { + for (toks) |*tok| { + switch (tok.id) { + // non-identifiers are not expanded, so we don't need to track their hidesets. + // Track r_paren hideset since it is used for computing the hideset of function-like macro expansions + .identifier, .extended_identifier, .r_paren => { + tok.hideset = try pp.treap.@"union"(tok.hideset, hideset); + }, + else => {}, + } + } +} + +fn stringize(pp: *Preprocessor, tmpl: PreprocessorToken, args_range: MacroArg) !PreprocessorToken { + const start = pp.comp.generated_buf.items.len; + try pp.comp.generated_buf.append(pp.gpa, '"'); + const args = args_range.slice(pp.macro_arg_tokens.items); + for (args, 0..) 
|tok, i| { + const slice = pp.tokSlice(tok); + if (slice.len > 0 and tok.flags.space and i != 0) { + try pp.comp.generated_buf.append(pp.gpa, ' '); + } + try pp.comp.generated_buf.appendSlice(pp.gpa, slice); + } + try pp.comp.generated_buf.append(pp.gpa, '"'); + var tok = tmpl; + tok.id = .string_literal; + tok.loc = .{ + .id = .generated, + .byte_offset = @intCast(start), + .line = pp.generated_line, + }; + pp.generated_line += 1; + return tok; +} + +fn subst(pp: *Preprocessor, macro: *const Macro, macro_tok: PreprocessorToken, args: MacroArgList, hideset_arg: Treap.Node) ![]PreprocessorToken { + _ = macro_tok; + var hideset = hideset_arg; + var r: TokenList = .{}; + defer r.deinit(pp.gpa); + var i: usize = 0; + while (i < macro.tokens.len) : (i += 1) { + const t0 = macro.tokens[i]; + const t1: ?PreprocessorToken = if (i == macro.tokens.len - 1) null else macro.tokens[i + 1]; + + const t0_param = t0.id == .macro_param; + const t1_param = if (t1) |tok| tok.id == .macro_param else false; + + if (t0.id == .hash and t1_param) { + const arg = args.slice(pp.macro_args.items)[t1.?.argPosition()]; + const stringized = try pp.stringize(t0, arg); + try r.append(pp.gpa, stringized); + i += 1; + continue; + } + if (t0.id == .hash_hash and t1_param) { + const arg = args.slice(pp.macro_args.items)[t1.?.argPosition()]; + if (t1.?.isVarArg() and r.items.len > 0 and r.items[r.items.len - 1].id == .comma) { + if (arg.len() == 0) { + _ = r.pop(); + } else { + try r.appendSlice(pp.gpa, arg.slice(pp.macro_arg_tokens.items)); + } + } else if (arg.len() > 0) { + try pp.pasteAndPush(&r, arg.slice(pp.macro_arg_tokens.items)[0]); + try r.appendSlice(pp.gpa, arg.slice(pp.macro_arg_tokens.items)[1..]); + } + i += 1; + continue; + } + if (t0.id == .hash_hash and t1 != null) { + hideset = t1.?.hideset; + try pp.pasteAndPush(&r, t1.?); + i += 1; + continue; + } + if (t0_param and t1 != null and t1.?.id == .hash_hash) { + hideset = t1.?.hideset; + const arg = args.slice(pp.macro_args.items)[t0.argPosition()]; + if (arg.len() == 0) { + i += 1; + } else { + try r.appendSlice(pp.gpa, arg.slice(pp.macro_arg_tokens.items)); + } + continue; + } + if (t0_param) { + const arg = args.slice(pp.macro_args.items)[t0.argPosition()]; + const expanded = try pp.expandAll(arg.slice(pp.macro_arg_tokens.items), t0); + defer pp.gpa.free(expanded); + try r.appendSlice(pp.gpa, expanded); + continue; + } + try r.append(pp.gpa, t0); + } + try pp.addHideSet(r.items, hideset); + return r.toOwnedSlice(pp.gpa); +} + +fn pasteTokens(pp: *Preprocessor, lhs: PreprocessorToken, rhs: PreprocessorToken) !PreprocessorToken { + const start = pp.comp.generated_buf.items.len; + const end = start + pp.tokSlice(lhs).len + pp.tokSlice(rhs).len; + try pp.comp.generated_buf.ensureTotalCapacity(pp.gpa, end + 1); // +1 for a newline + + // We cannot use the same slices here since they might be invalidated by `ensureCapacity` + pp.comp.generated_buf.appendSliceAssumeCapacity(pp.tokSlice(lhs)); + pp.comp.generated_buf.appendSliceAssumeCapacity(pp.tokSlice(rhs)); + pp.comp.generated_buf.appendAssumeCapacity('\n'); + + // Try to tokenize the result. 
+ var tmp_tokenizer = Tokenizer{ + .buf = pp.comp.generated_buf.items, + .langopts = pp.comp.langopts, + .index = @intCast(start), + .source = .generated, + }; + const pasted_token = tmp_tokenizer.nextNoWSComments(); + const next_tok = tmp_tokenizer.next(); + if (next_tok.id != .nl) { + try pp.errStr( + lhs, + .pasting_formed_invalid, + try pp.comp.diagnostics.arena.allocator().dupe(u8, pp.comp.generated_buf.items[start..end]), + ); + } + return pp.makeGeneratedToken(start, pasted_token.id, lhs); +} + +/// Paste `tok` onto the last token in `tokens` +fn pasteAndPush(pp: *Preprocessor, tokens: *TokenList, tok: PreprocessorToken) !void { + const last = tokens.pop(); + const pasted = try pp.pasteTokens(last, tok); + return tokens.append(pp.gpa, pasted); +} + +fn tokenBufferStashReverse(pp: *Preprocessor, tokens: []const PreprocessorToken) !void { + try pp.expansion_bufs.append(pp.gpa, .{}); + try pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].appendSlice(pp.gpa, tokens); + std.mem.reverse(PreprocessorToken, pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items); +} + +fn tokenBufferUnstash(pp: *Preprocessor) void { + var buf = pp.expansion_bufs.pop(); + buf.deinit(pp.gpa); +} + +fn expandAll(pp: *Preprocessor, tokens: []const PreprocessorToken, tmpl: PreprocessorToken) ![]const PreprocessorToken { + try pp.tokenBufferStashReverse(tokens); + defer pp.tokenBufferUnstash(); + var r: TokenList = .{}; + defer r.deinit(pp.gpa); + while (true) { + const tok = try pp.readExpand(); + if (tok.id == .eof) break; + try r.append(pp.gpa, tok); + } + pp.propagateSpace(r.items, tmpl); + return r.toOwnedSlice(pp.gpa); +} + +fn peekToken(pp: *Preprocessor) !PreprocessorToken { + const tok = try pp.readToken(); + try pp.ungetToken(tok); + return tok; +} + +/// Return a string with the same contents as `name` and whose lifetime is the same as the preprocessor's lifetime +/// If `tok` is not from the generated source, this is just `name`. +/// If `tok` is from the generated source, pointers are invalidated when the underlying ArrayList is resized. 
Therefore, +/// duplicate the string and store it (so we aren't repeatedly copying the same string) +fn getSafeString(pp: *Preprocessor, tok: PreprocessorToken, name: []const u8) ![]const u8 { + if (tok.loc.id != .generated) return name; + const gop = try pp.safe_strings.getOrPut(pp.gpa, name); + if (!gop.found_existing) { + const copy = try pp.arena.allocator().dupe(u8, name); + gop.key_ptr.* = copy; + } + return gop.key_ptr.*; +} + +fn injectSpace(pp: *Preprocessor) void { + var i = pp.expansion_bufs.items.len; + while (i > 0) : (i -= 1) { + var j = pp.expansion_bufs.items[i - 1].items.len; + while (j > 0) : (j -= 1) { + pp.expansion_bufs.items[i - 1].items[j - 1].flags.space = true; + return; + } + } +} + +fn readExpandNewline(pp: *Preprocessor) Error!PreprocessorToken { + const tok = pp.getToken(); + if (!tok.id.isMacroIdentifier()) return tok; + const name = pp.tokSlice(tok); + const macro = pp.defines.getPtr(name) orelse return tok; + + const macro_hideset = tok.hideset; + if (pp.treap.contains(macro_hideset, name)) return tok; + + switch (macro.kind) { + .object => { + const safe_name = try pp.getSafeString(tok, name); + const new_hideset = try pp.treap.addNodeTo(tok.hideset, safe_name); + + const tokens = try pp.subst(macro, tok, MacroArgList.empty, new_hideset); + defer pp.gpa.free(tokens); + pp.propagateSpace(tokens, tok); + try pp.ungetAll(tokens); + return pp.readExpand(); + }, + .func => { + if (!try pp.next(.l_paren)) return tok; + const arg_tokens_start = pp.macro_arg_tokens.items.len; + defer pp.macro_arg_tokens.items.len = arg_tokens_start; + const macro_args_start = pp.macro_args.items.len; + defer pp.macro_args.items.len = macro_args_start; + + const args = pp.readArgs(tok, macro) catch |err| switch (err) { + error.IncorrectArgumentCount => return PreprocessorToken.zero, + error.UnterminatedMacroArgumentList => { + try pp.errTok(tok, .unterminated_macro_arg_list); + return PreprocessorToken.zero; + }, + else => |e| return e, + }; + const r_paren = pp.getToken(); + std.debug.assert(r_paren.id == .r_paren); + const safe_name = try pp.getSafeString(tok, name); + + const intersection = try pp.treap.intersection(macro_hideset, r_paren.hideset); + const hideset = try pp.treap.addNodeTo(intersection, safe_name); + const tokens = try pp.subst(macro, tok, args, hideset); + defer pp.gpa.free(tokens); + pp.propagateSpace(tokens, tok); + try pp.ungetAll(tokens); + return pp.readExpand(); + }, + .special => |func| { + try func(pp, tok); + return pp.readExpand(); + }, + } +} + +fn readMacroArg(pp: *Preprocessor, end: *bool, readall: bool) !MacroArg { + var level: i32 = 0; + const start: u32 = @intCast(pp.macro_arg_tokens.items.len); + while (true) { + var tok = pp.getToken(); + if (tok.id == .eof) { + return error.UnterminatedMacroArgumentList; + } + if (tok.id == .nl) continue; + if (tok.flags.is_bol and tok.id == .hash) { + try pp.readDirective(); + continue; + } + if (level == 0 and tok.id == .r_paren) { + try pp.ungetToken(tok); + end.* = true; + break; + } + if (level == 0 and tok.id == .comma and !readall) { + break; + } + if (tok.id == .l_paren) { + level += 1; + } + if (tok.id == .r_paren) { + level -= 1; + } + if (tok.flags.is_bol) { + tok.flags = .{ .is_bol = false, .space = true }; + } + try pp.macro_arg_tokens.append(pp.gpa, tok); + } + return .{ .start = start, .end = @intCast(pp.macro_arg_tokens.items.len) }; +} + +fn doReadArgs(pp: *Preprocessor, macro: *const Macro) !MacroArgList { + const start: u32 = @intCast(pp.macro_args.items.len); + var end = false; + while 
(!end) { + const in_ellipsis = macro.var_args and (pp.macro_args.items.len - start) + 1 == macro.nargs; + const arg_range = try pp.readMacroArg(&end, in_ellipsis); + try pp.macro_args.append(pp.gpa, arg_range); + } + if (macro.var_args and (pp.macro_args.items.len - start) + 1 == macro.nargs) { + try pp.macro_args.append(pp.gpa, MacroArg.empty); + } + return .{ .start = start, .end = @intCast(pp.macro_args.items.len) }; +} + +fn readArgs(pp: *Preprocessor, ident: PreprocessorToken, macro: *const Macro) !MacroArgList { + if (macro.nargs == 0 and (try pp.peekToken()).id == .r_paren) { + return MacroArgList.empty; + } + const args = try pp.doReadArgs(macro); + if (args.len() != macro.nargs) { + const extra = Diagnostics.Message.Extra{ + .arguments = .{ .expected = @intCast(macro.nargs), .actual = @intCast(args.len()) }, + }; + try pp.comp.addDiagnostic( + .{ .tag = .expected_arguments, .loc = ident.loc, .extra = extra }, + &.{}, // TODO: expansion slice + ); + return error.IncorrectArgumentCount; + } + return args; +} + +fn readExpand(pp: *Preprocessor) Error!PreprocessorToken { + while (true) { + const tok = try pp.readExpandNewline(); + if (tok.id != .nl) return tok; + } +} + +/// # number "file" flags +/// TODO: validate that the pp_num token is solely digits +/// if not, emit `GNU line marker directive requires a simple digit sequence` +fn readLinemarker(pp: *Preprocessor) !void { + const name = pp.getToken(); + if (name.id.isDirectiveEnd()) return; + if (name.id != .string_literal) try pp.errTok(name, .line_invalid_filename); + + const flag_1 = pp.getToken(); + if (flag_1.id.isDirectiveEnd()) return; + const flag_2 = pp.getToken(); + if (flag_2.id.isDirectiveEnd()) return; + const flag_3 = pp.getToken(); + if (flag_3.id.isDirectiveEnd()) return; + const flag_4 = pp.getToken(); + if (flag_4.id.isDirectiveEnd()) return; + try pp.expectNewline(); +} + +fn readIdent(pp: *Preprocessor) !?PreprocessorToken { + const tok = pp.getToken(); + if (!tok.id.isMacroIdentifier()) { + try pp.errTok(tok, .macro_name_must_be_identifier); + return null; + } + return tok; +} + +fn ungetToken(pp: *Preprocessor, tok: PreprocessorToken) !void { + if (tok.id == .eof) return; + if (pp.isBufferEmpty()) { + try pp.expansion_bufs.append(pp.gpa, .{}); + } + try pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].append(pp.gpa, tok); +} + +fn hashHashCheck(pp: *Preprocessor, toks: []const PreprocessorToken) !void { + if (toks.len == 0) return; + if (toks[0].id == .hash_hash) { + return pp.errTok(toks[0], .hash_hash_at_start); + } + if (toks[toks.len - 1].id == .hash_hash) { + return pp.errTok(toks[toks.len - 1], .hash_hash_at_end); + } +} + +fn readObjMacro(pp: *Preprocessor, name: PreprocessorToken) !void { + var body: TokenList = .{}; + errdefer body.deinit(pp.gpa); + + while (true) { + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) break; + + try body.append(pp.gpa, tok); + } + try pp.hashHashCheck(body.items); + const macro: Macro = .{ + .tokens = body.items, + .var_args = false, + .loc = undefined, + .kind = .object, + .nargs = undefined, + }; + try pp.defineMacro(name, macro); +} + +/// Defines a new macro and warns if it is a duplicate +fn defineMacro(pp: *Preprocessor, name_tok: PreprocessorToken, macro: Macro) Error!void { + const name_str = pp.tokSlice(name_tok); + const gop = try pp.defines.getOrPut(pp.gpa, name_str); + if (gop.found_existing and !gop.value_ptr.eql(macro, pp)) { + const tag: Diagnostics.Tag = if (gop.value_ptr.kind == .special) .builtin_macro_redefined else 
.macro_redefined; + const start = pp.comp.diagnostics.list.items.len; + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = name_tok.loc, + .extra = .{ .str = name_str }, + }, &.{}); + if (gop.value_ptr.kind != .special and pp.comp.diagnostics.list.items.len != start) { + try pp.comp.addDiagnostic(.{ + .tag = .previous_definition, + .loc = gop.value_ptr.loc, + }, &.{}); + } + } + gop.value_ptr.* = macro; +} + +/// Get raw token source string. +/// Returned slice is invalidated when comp.generated_buf is updated. +pub fn tokSlice(pp: *Preprocessor, token: PreprocessorToken) []const u8 { + if (token.id.lexeme()) |some| return some; + const source = pp.comp.getSource(token.loc.id); + var tmp_tokenizer = Tokenizer{ + .buf = source.buf, + .langopts = pp.comp.langopts, + .index = token.loc.byte_offset, + .source = .generated, + }; + const tok = tmp_tokenizer.next(); + return tmp_tokenizer.buf[tok.start..tok.end]; +} + +fn expect(pp: *Preprocessor, expected: Tokenizer.Token.Id, tag: Diagnostics.Tag) !PreprocessorToken { + const tok = pp.getToken(); + if (tok.id != expected) { + try pp.errTok(tok, tag); + } + return tok; +} + +fn makeMacroToken(position: usize, is_vararg: bool) PreprocessorToken { + return .{ + .id = .macro_param, + .hideset = null, + .loc = .{ + .id = .unused, + .byte_offset = @intCast(position), + .line = @intFromBool(is_vararg), + }, + }; +} + +fn next(pp: *Preprocessor, id: Tokenizer.Token.Id) !bool { + const tok = pp.getToken(); + if (tok.id == id) return true; + try pp.ungetToken(tok); + return false; +} + +/// Returns true for vararg function-like macro, false otherwise +fn readFunclikeMacroParams(pp: *Preprocessor, name: PreprocessorToken, l_paren: PreprocessorToken, params: *ParamMap) !bool { + _ = name; + var pos: usize = 0; + while (true) { + var tok = pp.getToken(); + if (tok.id == .r_paren) return false; + if (pos != 0) { + if (tok.id != .comma) { + switch (tok.id) { + .nl, .eof => {}, + else => pp.skipToNl(), + } + try pp.errTok(tok, .expected_comma_param_list); + return error.InvalidMacroDef; + } + tok = pp.getToken(); + } + if (tok.id.isDirectiveEnd()) { + try pp.errTok(tok, .missing_paren_param_list); + return false; + } + if (tok.id == .ellipsis) { + try params.put(pp.gpa, "__VA_ARGS__", makeMacroToken(pos, true)); + pos += 1; + const r_paren = pp.getToken(); + if (r_paren.id != .r_paren) { + try pp.errTok(r_paren, .missing_paren_param_list); + try pp.errTok(l_paren, .to_match_paren); + return error.InvalidMacroDef; + } + return true; + } + if (!tok.id.isMacroIdentifier()) { + try pp.errTok(tok, .invalid_token_param_list); + return error.InvalidMacroDef; + } + const arg = pp.tokSlice(tok); + if (try pp.next(.ellipsis)) { + const r_paren = pp.getToken(); + if (r_paren.id != .r_paren) { + try pp.errTok(r_paren, .missing_paren_param_list); + try pp.errTok(l_paren, .to_match_paren); + pp.skipToNl(); + } + try params.put(pp.gpa, arg, makeMacroToken(pos, true)); + pos += 1; + return true; + } + try params.put(pp.gpa, arg, makeMacroToken(pos, false)); + pos += 1; + } +} + +fn readFunclikeMacroBody(pp: *Preprocessor, params: *const ParamMap) ![]const PreprocessorToken { + var tokens: TokenList = .{}; + errdefer tokens.deinit(pp.gpa); + while (true) { + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) { + return tokens.toOwnedSlice(pp.gpa); + } + if (tok.id.isMacroIdentifier()) { + // const subst = params. 
+ if (params.get(pp.tokSlice(tok))) |sub| { + var copy = sub; + copy.flags.space = tok.flags.space; + try tokens.append(pp.gpa, copy); + continue; + } + } + try tokens.append(pp.gpa, tok); + } +} + +fn readFuncLikeMacro(pp: *Preprocessor, name: PreprocessorToken, l_paren: PreprocessorToken) Error!void { + var params: ParamMap = .{}; + defer params.deinit(pp.gpa); + const is_vararg = pp.readFunclikeMacroParams(name, l_paren, ¶ms) catch |err| switch (err) { + error.InvalidMacroDef => blk: { + pp.skipToNl(); + break :blk false; + }, + else => |e| return e, + }; + const body = try pp.readFunclikeMacroBody(¶ms); + errdefer pp.gpa.free(body); + try pp.hashHashCheck(body); + const macro: Macro = .{ + .tokens = body, + .var_args = is_vararg, + .loc = undefined, + .kind = .func, + .nargs = params.count(), + }; + try pp.defineMacro(name, macro); +} + +fn readDefine(pp: *Preprocessor) !void { + const name = try pp.readIdent() orelse { + pp.skipToNl(); + return; + }; + const next_tok = pp.getToken(); + if (next_tok.id == .l_paren and !next_tok.flags.space) { + try pp.readFuncLikeMacro(name, next_tok); + return; + } + try pp.ungetToken(next_tok); + try pp.readObjMacro(name); +} + +fn doSkipSpace(pp: *Preprocessor) bool { + const saved_tokenizer = pp.tokenizers.items[pp.tokenizers.items.len - 1]; + const tok = pp.tokenizers.items[pp.tokenizers.items.len - 1].next(); + switch (tok.id) { + .eof => return false, + .whitespace, .comment => return true, + else => { + pp.tokenizers.items[pp.tokenizers.items.len - 1] = saved_tokenizer; + return false; + }, + } +} + +/// Skips spaces including comments. +/// Returns true if at least one space is skipped. +fn skipSpace(pp: *Preprocessor) bool { + if (!pp.doSkipSpace()) { + return false; + } + while (pp.doSkipSpace()) {} + return true; +} + +/// Read the next raw token from the tokenizer stack +fn lexToken(pp: *Preprocessor) PreprocessorToken { + if (pp.skipSpace()) { + return .{ .id = .whitespace, .loc = undefined }; + } + const tok = pp.tokenizers.items[pp.tokenizers.items.len - 1].next(); + return .{ + .id = tok.id, + .flags = .{ + .is_bol = tok.bol, + }, + .loc = .{ + .id = tok.source, + .byte_offset = tok.start, + .line = tok.line, + }, + }; +} + +/// Read the next token without expanding it +fn getToken(pp: *Preprocessor) PreprocessorToken { + if (!pp.isBufferEmpty() and pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items.len > 0) { + return pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].pop(); + } + if (pp.expansion_bufs.items.len > 1) { + return .{ .id = .eof, .loc = undefined }; + } + const bol = pp.tokenizers.items[pp.tokenizers.items.len - 1].bol; + var tok = pp.lexToken(); + while (tok.id == .whitespace) { + tok = pp.lexToken(); + tok.flags.space = true; + } + tok.flags.is_bol = bol; + return tok; +} + +fn readDefinedOp(pp: *Preprocessor) !PreprocessorToken { + var tok = pp.getToken(); + if (tok.id == .l_paren) { + tok = pp.getToken(); + const r_paren = pp.getToken(); + if (r_paren.id != .r_paren) { + try pp.errStr(r_paren, .closing_paren_after, "defined"); + } + } + if (!tok.id.isMacroIdentifier()) { + try pp.errTok(tok, .macro_name_must_be_identifier); + } + const slice = pp.tokSlice(tok); + if (pp.defines.contains(slice)) { + return PreprocessorToken.one; + } + return PreprocessorToken.zero; +} + +fn readIntExprLine(pp: *Preprocessor) !void { + while (true) { + const tok = try pp.readExpandNewline(); + if (tok.id.isDirectiveEnd()) break; + if (tok.id == .keyword_defined) { + const result = try pp.readDefinedOp(); + try 
pp.addToken(result); + } else if (tok.id.isMacroIdentifier()) { + try pp.addToken(PreprocessorToken.zero); + } else { + try pp.addToken(tok); + } + } + try pp.addToken(.{ .id = .eof, .loc = .{} }); +} + +fn readConstexpr(pp: *Preprocessor) !bool { + const start = pp.tokens.len; + defer pp.tokens.len = start; + try pp.readIntExprLine(); + + var oldpp = try OldPreprocessor.initDefault(pp.comp); + defer oldpp.deinit(); + + var i: usize = start; + while (i < pp.tokens.len) : (i += 1) { + const tok = pp.tokens.get(i); + try oldpp.tokens.append(pp.gpa, .{ .id = tok.id, .loc = tok.loc }); + } + + var parser = Parser{ + .pp = &oldpp, + .comp = pp.comp, + .gpa = pp.gpa, + .tok_ids = pp.tokens.items(.id)[start..], + .tok_i = 0, + .arena = undefined, + .in_macro = true, + .strings = std.ArrayListAligned(u8, 4).init(pp.comp.gpa), + + .data = undefined, + .value_map = undefined, + .labels = undefined, + .decl_buf = undefined, + .list_buf = undefined, + .param_buf = undefined, + .enum_buf = undefined, + .record_buf = undefined, + .attr_buf = undefined, + .field_attr_buf = undefined, + .string_ids = undefined, + }; + defer parser.strings.deinit(); + return parser.macroExpr(); +} + +/// #line number "file" +/// TODO: validate that the pp_num token is solely digits +fn readLine(pp: *Preprocessor) Error!void { + const digits = pp.getToken(); + if (digits.id != .pp_num) try pp.errTok(digits, .line_simple_digit); + + if (digits.id.isDirectiveEnd()) return; + const name = pp.getToken(); + if (name.id.isDirectiveEnd()) return; + if (name.id != .string_literal) try pp.errTok(name, .line_invalid_filename); + try pp.expectNewline(); +} + +fn readPragma(pp: *Preprocessor) Error!void { + _ = pp; + // TODO +} + +fn readUndef(pp: *Preprocessor) Error!void { + const name = try pp.readIdent() orelse { + pp.skipToNl(); + return; + }; + try pp.expectNewline(); + _ = pp.defines.remove(pp.tokSlice(name)); +} + +/// Skip until after a newline, error if extra tokens before it. +fn expectNewline(pp: *Preprocessor) !void { + var sent_err = false; + while (true) { + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) return; + if (tok.id == .whitespace or tok.id == .comment) continue; + if (!sent_err) { + sent_err = true; + try pp.errTok(tok, .extra_tokens_directive_end); + } + } +} + +/// TODO: pragma once +fn readIncludeExtra(pp: *Preprocessor, include_token: PreprocessorToken, which: Compilation.WhichInclude) Error!void { + var is_std: bool = undefined; + const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { + error.InvalidInclude => return, + else => |e| return e, + }; + try pp.expectNewline(); + + const filename = include_str[1 .. 
include_str.len - 1]; + const include_type: Compilation.IncludeType = switch (include_str[0]) { + '"' => .quotes, + '<' => .angle_brackets, + else => unreachable, + }; + const tok: RawToken = .{ .id = include_token.id, .source = include_token.loc.id, .start = include_token.loc.byte_offset, .line = include_token.loc.line }; + const source = (try pp.comp.findInclude(filename, tok, include_type, which)) orelse return pp.fatalNotFound(include_token, filename); + if (pp.include_guards.get(source.id)) |guard| { + if (pp.defines.contains(guard)) return; + } + const guard = pp.findIncludeGuard(source); + try pp.guard_stack.append(pp.gpa, guard); + + try pp.tokenizers.append(pp.gpa, .{ + .buf = source.buf, + .langopts = pp.comp.langopts, + .index = 0, + .source = source.id, + }); +} + +/// Read a header name delimited by quotes or angle brackets +fn readHeaderFileName(pp: *Preprocessor, is_std: *bool) !?[]const u8 { + if (!pp.isBufferEmpty()) return null; + _ = pp.skipSpace(); + + var close: u8 = undefined; + var tokenizer = pp.tokenizers.items[pp.tokenizers.items.len - 1]; + defer pp.tokenizers.items[pp.tokenizers.items.len - 1] = tokenizer; + + if (tokenizer.buf[tokenizer.index..].len < 2) { + return null; + } + const start = tokenizer.index; + switch (tokenizer.buf[tokenizer.index..][0]) { + '"' => { + is_std.* = false; + close = '"'; + }, + '<' => { + is_std.* = true; + close = '>'; + }, + else => return null, + } + tokenizer.index += 1; + while (tokenizer.index < tokenizer.buf.len and tokenizer.buf[tokenizer.index] != close and tokenizer.buf[tokenizer.index] != '\n') : (tokenizer.index += 1) {} + + if (tokenizer.index == tokenizer.buf.len or tokenizer.buf[tokenizer.index] != close) { + try pp.errTok(.{ .id = undefined, .loc = .{ .id = tokenizer.source, .byte_offset = tokenizer.index, .line = tokenizer.line } }, .header_str_closing); + try pp.errTok(.{ .id = undefined, .loc = .{ .id = tokenizer.source, .byte_offset = start, .line = tokenizer.line } }, .header_str_match); + return error.InvalidInclude; + } + + tokenizer.index += 1; + + const buf = tokenizer.buf[start..tokenizer.index]; + if (buf.len == 2) { + try pp.errTok(.{ .id = .nl, .loc = .{ .id = tokenizer.source, .byte_offset = start, .line = tokenizer.line } }, .empty_filename); + return error.InvalidInclude; + } + return buf; +} + +fn isBufferEmpty(pp: *const Preprocessor) bool { + return pp.expansion_bufs.items.len == 0; +} + +/// Read a delimited header name, or a macro expanded one +fn readHeaderName(pp: *Preprocessor, is_std: *bool) ![]const u8 { + if (try pp.readHeaderFileName(is_std)) |path| return path; + + // If a token following #include does not start with < nor ", + // try to read the token as a regular token. Macro-expanded + // form may be a valid header file path. 
+ const tok = try pp.readExpandNewline(); + if (tok.id.isDirectiveEnd()) { + try pp.errTok(tok, .expected_filename); + return error.InvalidInclude; + } + if (tok.id == .string_literal) { + is_std.* = false; + return pp.tokSlice(tok); + } + if (tok.id != .angle_bracket_left) { + try pp.errStr(tok, .expected_left_angle_bracket, pp.tokSlice(tok)); + return error.InvalidInclude; + } + const start = pp.char_buf.items.len; + try pp.char_buf.append(pp.gpa, '<'); + defer pp.char_buf.items.len = start; + const writer = pp.char_buf.writer(pp.gpa); + while (true) { + const path_tok = try pp.readExpandNewline(); + if (path_tok.id == .nl) { + try pp.errTok(path_tok, .header_str_closing); + try pp.errTok(tok, .header_str_match); + return error.InvalidInclude; + } + if (path_tok.id == .angle_bracket_right) { + break; + } + try pp.prettyPrintToken(writer, path_tok); + } + is_std.* = true; + try pp.char_buf.append(pp.gpa, '>'); + return pp.gpa.dupe(u8, pp.char_buf.items[start..]); +} + +fn readInclude(pp: *Preprocessor, include_token: PreprocessorToken) Error!void { + return pp.readIncludeExtra(include_token, .first); +} + +fn readIncludeNext(pp: *Preprocessor, include_token: PreprocessorToken) Error!void { + return pp.readIncludeExtra(include_token, .next); +} + +fn readErrorMessage(pp: *Preprocessor, directive_tok: PreprocessorToken, tag: Diagnostics.Tag) !void { + const char_top = pp.char_buf.items.len; + defer pp.char_buf.items.len = char_top; + var i: usize = 0; + while (true) : (i += 1) { + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) break; + const slice = pp.tokSlice(tok); + if (slice.len > 0 and tok.flags.space and i != 0) { + try pp.char_buf.append(pp.gpa, ' '); + } + try pp.char_buf.appendSlice(pp.gpa, slice); + } + const slice = pp.char_buf.items[char_top..]; + const duped = try pp.comp.diagnostics.arena.allocator().dupe(u8, slice); + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = directive_tok.loc, + .extra = .{ .str = duped }, + }, &.{}); +} + +fn clearGuard(pp: *Preprocessor) void { + pp.guard_stack.items[pp.guard_stack.items.len - 1] = null; +} + +fn readDirective(pp: *Preprocessor) Error!void { + const directive = pp.getToken(); + if (directive.id.isDirectiveEnd()) return; + if (directive.id == .pp_num) { + return pp.readLinemarker(); + } + + const until_else = 0; + const until_endif = 1; + const until_endif_seen_else = 2; + + switch (directive.id) { + .keyword_define => try pp.readDefine(), + .keyword_elif => { + if (pp.if_level == 0) { + try pp.errTok(directive, .elif_without_if); + pp.if_level += 1; + pp.if_kind.set(pp.if_level, until_else); + } else if (pp.if_level == 1) { + pp.clearGuard(); + } + switch (pp.if_kind.get(pp.if_level)) { + until_else => if (try pp.readConstexpr()) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #elif", .{}); + } + } else { + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elif", .{}); + } + }, + until_endif => try pp.skip(.until_endif), + until_endif_seen_else => { + try pp.errTok(directive, .elif_after_else); + pp.skipToNl(); + }, + else => unreachable, + } + }, + .keyword_else => { + try pp.expectNewline(); + if (pp.if_level == 0) { + try pp.errTok(directive, .else_without_if); + return; + } else if (pp.if_level == 1) { + pp.clearGuard(); + } + switch (pp.if_kind.get(pp.if_level)) { + until_else => { + pp.if_kind.set(pp.if_level, until_endif_seen_else); + if (pp.verbose) { + pp.verboseLog(directive, "#else branch 
here", .{}); + } + }, + until_endif => try pp.skip(.until_endif_seen_else), + until_endif_seen_else => { + try pp.errTok(directive, .else_after_else); + pp.skipToNl(); + }, + else => unreachable, + } + }, + .keyword_endif => { + try pp.expectNewline(); + if (pp.if_level == 0) { + pp.clearGuard(); + try pp.errTok(directive, .endif_without_if); + return; + } else if (pp.if_level == 1) { + var tokenizer = &pp.tokenizers.items[pp.tokenizers.items.len - 1]; + const saved_tokenizer = tokenizer.*; + defer tokenizer.* = saved_tokenizer; + + var next_tok = tokenizer.nextNoWS(); + while (next_tok.id == .nl) : (next_tok = tokenizer.nextNoWS()) {} + if (next_tok.id != .eof) pp.clearGuard(); + } + pp.if_level -= 1; + }, + .keyword_error => try pp.readErrorMessage(directive, .error_directive), + .keyword_if => { + const sum, const overflowed = @addWithOverflow(pp.if_level, 1); + if (overflowed != 0) + return pp.fatal(directive, "too many #if nestings", .{}); + pp.if_level = sum; + + if (try pp.readConstexpr()) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #if", .{}); + } + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #if", .{}); + } + } + }, + .keyword_ifdef => { + const sum, const overflowed = @addWithOverflow(pp.if_level, 1); + if (overflowed != 0) + return pp.fatal(directive, "too many #if nestings", .{}); + pp.if_level = sum; + + const macro_name = (try pp.expectMacroName()) orelse return; + try pp.expectNewline(); + if (pp.defines.get(macro_name) != null) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #ifdef", .{}); + } + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #ifdef", .{}); + } + } + }, + .keyword_ifndef => { + const sum, const overflowed = @addWithOverflow(pp.if_level, 1); + if (overflowed != 0) + return pp.fatal(directive, "too many #if nestings", .{}); + pp.if_level = sum; + + const macro_name = (try pp.expectMacroName()) orelse return; + try pp.expectNewline(); + if (pp.defines.get(macro_name) == null) { + pp.if_kind.set(pp.if_level, until_endif); + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + } + }, + .keyword_elifdef => { + if (pp.if_level == 0) { + try pp.errTok(directive, .elifdef_without_if); + pp.if_level += 1; + pp.if_kind.set(pp.if_level, until_else); + } else if (pp.if_level == 1) { + pp.clearGuard(); + } + switch (pp.if_kind.get(pp.if_level)) { + until_else => { + const macro_name = try pp.expectMacroName(); + if (macro_name == null) { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifdef", .{}); + } + } else { + try pp.expectNewline(); + if (pp.defines.get(macro_name.?) 
!= null) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #elifdef", .{}); + } + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifdef", .{}); + } + } + } + }, + until_endif => try pp.skip(.until_endif), + until_endif_seen_else => { + try pp.errTok(directive, .elifdef_after_else); + pp.skipToNl(); + }, + else => unreachable, + } + }, + .keyword_elifndef => { + if (pp.if_level == 0) { + try pp.errTok(directive, .elifdef_without_if); + pp.if_level += 1; + pp.if_kind.set(pp.if_level, until_else); + } else if (pp.if_level == 1) { + pp.clearGuard(); + } + switch (pp.if_kind.get(pp.if_level)) { + until_else => { + const macro_name = try pp.expectMacroName(); + if (macro_name == null) { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifndef", .{}); + } + } else { + try pp.expectNewline(); + if (pp.defines.get(macro_name.?) == null) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #elifndef", .{}); + } + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifndef", .{}); + } + } + } + }, + until_endif => try pp.skip(.until_endif), + until_endif_seen_else => { + try pp.errTok(directive, .elifdef_after_else); + pp.skipToNl(); + }, + else => unreachable, + } + }, + .keyword_include => try pp.readInclude(directive), + .keyword_include_next => try pp.readIncludeNext(directive), + .keyword_line => try pp.readLine(), + .keyword_pragma => try pp.readPragma(), + .keyword_undef => try pp.readUndef(), + .keyword_warning => try pp.readErrorMessage(directive, .warning_directive), + .keyword_embed => try pp.readEmbed(directive), + else => try pp.errTok(directive, .invalid_preprocessing_directive), + } +} + +/// TODO: handle limit/prefix/suffix/etc +fn readEmbed(pp: *Preprocessor, directive_tok: PreprocessorToken) Error!void { + var is_std: bool = undefined; + const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { + error.InvalidInclude => return, + else => |e| return e, + }; + + const filename = include_str[1 .. 
include_str.len - 1]; + const include_type: Compilation.IncludeType = switch (include_str[0]) { + '"' => .quotes, + '<' => .angle_brackets, + else => unreachable, + }; + + const limit = std.math.maxInt(u32); + const embed_bytes = (try pp.comp.findEmbed(filename, directive_tok.loc.id, include_type, limit)) orelse + return pp.fatalNotFound(directive_tok, filename); + defer pp.comp.gpa.free(embed_bytes); + + try pp.ensureUnusedTokenCapacity(2 * embed_bytes.len - 1); // N bytes and N-1 commas + + // TODO: We currently only support systems with CHAR_BIT == 8 + // If the target's CHAR_BIT is not 8, we need to write out correctly-sized embed_bytes + // and correctly account for the target's endianness + const writer = pp.comp.generated_buf.writer(pp.gpa); + + { + const byte = embed_bytes[0]; + const start = pp.comp.generated_buf.items.len; + try writer.print("{d}", .{byte}); + var generated = try pp.makeGeneratedToken(start, .embed_byte, directive_tok); + generated.flags.is_bol = true; + pp.addTokenAssumeCapacity(generated); + } + + for (embed_bytes[1..]) |byte| { + const start = pp.comp.generated_buf.items.len; + try writer.print(",{d}", .{byte}); + pp.addTokenAssumeCapacity(.{ .id = .comma, .loc = .{ .id = .generated, .byte_offset = @intCast(start) } }); + pp.addTokenAssumeCapacity(try pp.makeGeneratedToken(start + 1, .embed_byte, directive_tok)); + } + try pp.comp.generated_buf.append(pp.gpa, '\n'); +} + +fn readToken(pp: *Preprocessor) Error!PreprocessorToken { + while (true) { + const tok = try pp.readExpand(); + if (tok.flags.is_bol and tok.id == .hash and tok.hideset == null) { + try pp.readDirective(); + continue; + } + return tok; + } +} + +pub fn preprocess(pp: *Preprocessor, source: Source) !PreprocessorToken { + const guard = pp.findIncludeGuard(source); + try pp.guard_stack.append(pp.gpa, guard); + + try pp.tokenizers.append(pp.gpa, .{ + .buf = source.buf, + .langopts = pp.comp.langopts, + .index = 0, + .source = source.id, + }); + while (true) { + const tok = try pp.readToken(); + if (tok.id == .eof) { + const tokenizer = pp.tokenizers.pop(); + const guard_name = pp.guard_stack.pop(); + if (guard_name) |name| { + try pp.include_guards.put(pp.gpa, tokenizer.source, name); + } + if (pp.tokenizers.items.len == 0) { + return tok; + } + } else { + try pp.addToken(tok); + } + } +} + +// After how many empty lines are needed to replace them with linemarkers. +const collapse_newlines = 8; + +/// Pretty print tokens and try to preserve whitespace. 
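When a source's tokenizer is popped in `preprocess` above, the `#ifndef` guard found by `findIncludeGuard` is recorded in `include_guards`. The consumer of that map is not shown in this hunk; the following is only a minimal sketch, assuming the usual include-guard optimization, of how such an entry could be consulted before re-reading an already-included file. The helper name is hypothetical and not part of this patch.

    // Hypothetical helper (not in this patch): a source whose recorded include
    // guard macro is already defined does not need to be tokenized again.
    fn alreadyGuarded(pp: *Preprocessor, source: Source) bool {
        const guard = pp.include_guards.get(source.id) orelse return false;
        return pp.defines.contains(guard);
    }
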
+pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void { + var i: usize = 0; + while (i < pp.tokens.len) : (i += 1) { + const tok = pp.tokens.get(i); + if (tok.id == .eof) break; + try pp.prettyPrintToken(w, tok); + } + try w.writeByte('\n'); +} + +fn prettyPrintToken(pp: *Preprocessor, w: anytype, tok: PreprocessorToken) !void { + if (tok.flags.is_bol) { + try w.writeByte('\n'); + } + if (tok.flags.space) { + try w.writeByte(' '); + } + if (tok.id.lexeme()) |some| { + try w.writeAll(some); + } else { + try w.writeAll(pp.tokSlice(tok)); + } +} + +pub fn expansionSlice(pp: *Preprocessor, tok: Tree.TokenIndex) []Source.Location { + const S = struct { + fn order_token_index(context: void, lhs: Tree.TokenIndex, rhs: Tree.TokenIndex) std.math.Order { + _ = context; + return std.math.order(lhs, rhs); + } + }; + + const indices = pp.expansion_entries.items(.idx); + const idx = std.sort.binarySearch(Tree.TokenIndex, tok, indices, {}, S.order_token_index) orelse return &.{}; + const locs = pp.expansion_entries.items(.locs)[idx]; + var i: usize = 0; + while (locs[i].id != .unused) : (i += 1) {} + return locs[0..i]; +} + +pub fn addToken(pp: *Preprocessor, tok: PreprocessorToken) !void { + if (tok.expansion_locs) |expansion_locs| { + try pp.expansion_entries.append(pp.gpa, .{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs }); + } + try pp.tokens.append(pp.gpa, tok); +} + +pub fn addTokenAssumeCapacity(pp: *Preprocessor, tok: PreprocessorToken) void { + if (tok.expansion_locs) |expansion_locs| { + pp.expansion_entries.appendAssumeCapacity(.{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs }); + } + pp.tokens.appendAssumeCapacity(tok); +} + +pub fn ensureTotalTokenCapacity(pp: *Preprocessor, capacity: usize) !void { + try pp.tokens.ensureTotalCapacity(pp.gpa, capacity); + try pp.expansion_entries.ensureTotalCapacity(pp.gpa, capacity); +} + +pub fn ensureUnusedTokenCapacity(pp: *Preprocessor, capacity: usize) !void { + try pp.tokens.ensureUnusedCapacity(pp.gpa, capacity); + try pp.expansion_entries.ensureUnusedCapacity(pp.gpa, capacity); +} + +fn skip( + pp: *Preprocessor, + cont: enum { until_else, until_endif, until_endif_seen_else }, +) Error!void { + var ifs_seen: u32 = 0; + var line_start = true; + var tokenizer = &pp.tokenizers.items[pp.tokenizers.items.len - 1]; + + while (tokenizer.index < tokenizer.buf.len) { + if (line_start) { + const saved_tokenizer = tokenizer.*; + const hash = tokenizer.nextNoWS(); + if (hash.id == .nl) continue; + line_start = false; + if (hash.id != .hash) continue; + const directive = tokenizer.nextNoWS(); + switch (directive.id) { + .keyword_else => { + if (ifs_seen != 0) continue; + if (cont == .until_endif_seen_else) { + // try pp.err(directive, .else_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_elif => { + if (ifs_seen != 0 or cont == .until_endif) continue; + if (cont == .until_endif_seen_else) { + // try pp.err(directive, .elif_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_elifdef => { + if (ifs_seen != 0 or cont == .until_endif) continue; + if (cont == .until_endif_seen_else) { + // try pp.err(directive, .elifdef_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_elifndef => { + if (ifs_seen != 0 or cont == .until_endif) continue; + if (cont == .until_endif_seen_else) { + // try pp.err(directive, .elifndef_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_endif => { + if (ifs_seen == 
0) { + tokenizer.* = saved_tokenizer; + return; + } + ifs_seen -= 1; + }, + .keyword_if, .keyword_ifdef, .keyword_ifndef => ifs_seen += 1, + else => {}, + } + } else if (tokenizer.buf[tokenizer.index] == '\n') { + line_start = true; + tokenizer.index += 1; + tokenizer.line += 1; + tokenizer.bol = true; + if (pp.preserve_whitespace) { + try pp.addToken(.{ .id = .nl, .loc = .{ + .id = tokenizer.source, + .line = tokenizer.line, + } }); + } + } else { + line_start = false; + tokenizer.index += 1; + } + } else { + return pp.errTok(.{ .id = .eof, .loc = .{ .id = tokenizer.source, .byte_offset = tokenizer.index, .line = tokenizer.line } }, .unterminated_conditional_directive); + } +} + +fn verboseLog(pp: *Preprocessor, tok: PreprocessorToken, comptime fmt: []const u8, args: anytype) void { + const source = pp.comp.getSource(tok.loc.id); + const line_col = source.lineCol(tok.loc); + + const stderr = std.io.getStdErr().writer(); + var buf_writer = std.io.bufferedWriter(stderr); + const writer = buf_writer.writer(); + defer buf_writer.flush() catch {}; + writer.print("{s}:{d}:{d}: ", .{ source.path, line_col.line_no, line_col.col }) catch return; + writer.print(fmt, args) catch return; + writer.writeByte('\n') catch return; + writer.writeAll(line_col.line) catch return; + writer.writeByte('\n') catch return; +} + +fn fatal(pp: *Preprocessor, tok: PreprocessorToken, comptime fmt: []const u8, args: anytype) Compilation.Error { + try pp.comp.diagnostics.list.append(pp.gpa, .{ + .tag = .cli_error, + .kind = .@"fatal error", + .extra = .{ .str = try std.fmt.allocPrint(pp.comp.diagnostics.arena.allocator(), fmt, args) }, + .loc = tok.loc, + }); + return error.FatalError; +} + +fn fatalNotFound(pp: *Preprocessor, tok: PreprocessorToken, filename: []const u8) Compilation.Error { + const old = pp.comp.diagnostics.fatal_errors; + pp.comp.diagnostics.fatal_errors = true; + defer pp.comp.diagnostics.fatal_errors = old; + + try pp.comp.diagnostics.addExtra(pp.comp.langopts, .{ .tag = .cli_error, .loc = tok.loc, .extra = .{ + .str = try std.fmt.allocPrint(pp.comp.diagnostics.arena.allocator(), "'{s}' not found", .{filename}), + } }, tok.expansionSlice(), false); + unreachable; // addExtra should've returned FatalError +} + +/// Consume next token, error if it is not an identifier. +fn expectMacroName(pp: *Preprocessor) Error!?[]const u8 { + const macro_name = pp.getToken(); + if (!macro_name.id.isMacroIdentifier()) { + try pp.errTok(macro_name, .macro_name_missing); + pp.skipToNl(); + return null; + } + return pp.tokSlice(macro_name); +} + +/// Return the name of the #ifndef guard macro that starts a source, if any. 
+/// If a source starts with `#ifndef IDENTIFIER`, return `IDENTIFIER` +/// This function does not validate that the entire source is guarded by the +/// initial ifndef, if any +fn findIncludeGuard(pp: *Preprocessor, source: Source) ?[]const u8 { + var tokenizer = Tokenizer{ + .buf = source.buf, + .langopts = pp.comp.langopts, + .source = source.id, + }; + var hash = tokenizer.nextNoWS(); + while (hash.id == .nl) hash = tokenizer.nextNoWS(); + if (hash.id != .hash) return null; + const ifndef = tokenizer.nextNoWS(); + if (ifndef.id != .keyword_ifndef) return null; + const guard = tokenizer.nextNoWS(); + if (guard.id != .identifier) return null; + return pp.tokSlice(.{ .id = guard.id, .loc = .{ .id = guard.source, .byte_offset = guard.start, .line = guard.line } }); +} diff --git a/src/aro/Tokenizer.zig b/src/aro/Tokenizer.zig index 8ee38126..33f76429 100644 --- a/src/aro/Tokenizer.zig +++ b/src/aro/Tokenizer.zig @@ -10,6 +10,7 @@ pub const Token = struct { start: u32 = 0, end: u32 = 0, line: u32 = 0, + bol: bool = false, pub const Id = enum(u8) { invalid, @@ -323,6 +324,10 @@ pub const Token = struct { /// A comment token if asked to preserve comments. comment, + pub fn isDirectiveEnd(id: Id) bool { + return id == .nl or id == .eof; + } + /// Return true if token is identifier or keyword. pub fn isMacroIdentifier(id: Id) bool { switch (id) { @@ -1030,6 +1035,7 @@ index: u32 = 0, source: Source.Id, langopts: LangOpts, line: u32 = 1, +bol: bool = true, pub fn next(self: *Tokenizer) Token { var state: enum { @@ -1077,6 +1083,8 @@ pub fn next(self: *Tokenizer) Token { var start = self.index; var id: Token.Id = .eof; + const bol = self.bol; + self.bol = false; while (self.index < self.buf.len) : (self.index += 1) { const c = self.buf[self.index]; @@ -1086,6 +1094,7 @@ pub fn next(self: *Tokenizer) Token { id = .nl; self.index += 1; self.line += 1; + self.bol = true; break; }, '"' => { @@ -1265,6 +1274,7 @@ pub fn next(self: *Tokenizer) Token { }, '\n' => { id = .unterminated_string_literal; + self.bol = true; break; }, '\r' => unreachable, @@ -1281,6 +1291,7 @@ pub fn next(self: *Tokenizer) Token { }, '\n' => { id = .unterminated_char_literal; + self.bol = true; break; }, else => { @@ -1297,6 +1308,7 @@ pub fn next(self: *Tokenizer) Token { }, '\n' => { id = .unterminated_char_literal; + self.bol = true; break; }, else => {}, @@ -1304,6 +1316,7 @@ pub fn next(self: *Tokenizer) Token { .char_escape_sequence => switch (c) { '\r', '\n' => { id = .unterminated_char_literal; + self.bol = true; break; }, else => state = .char_literal, @@ -1311,6 +1324,7 @@ pub fn next(self: *Tokenizer) Token { .string_escape_sequence => switch (c) { '\r', '\n' => { id = .unterminated_string_literal; + self.bol = true; break; }, else => state = .string_literal, @@ -1624,6 +1638,7 @@ pub fn next(self: *Tokenizer) Token { }, .line_comment => switch (c) { '\n' => { + self.bol = true; if (self.langopts.preserve_comments) { id = .comment; break; @@ -1656,6 +1671,7 @@ pub fn next(self: *Tokenizer) Token { }, .multi_line_comment_done => switch (c) { '\n' => { + self.bol = true; start = self.index; id = .nl; self.index += 1; @@ -1782,6 +1798,7 @@ pub fn next(self: *Tokenizer) Token { .end = self.index, .line = self.line, .source = self.source, + .bol = bol, }; } diff --git a/src/aro/Treap.zig b/src/aro/Treap.zig new file mode 100644 index 00000000..74d3005d --- /dev/null +++ b/src/aro/Treap.zig @@ -0,0 +1,165 @@ +/// Persistent treap data structure. Nodes are immutable and set operations do not invalidate +/// existing nodes. 
+/// Adapted from https://arxiv.org/pdf/1301.3388 +const std = @import("std"); + +const Treap = @This(); +const Key = []const u8; + +pub const Node = ?*const Item; + +const Item = struct { + key: Key, + left: Node, + right: Node, + + fn priority(node: *const Item) u64 { + return std.hash.Wyhash.hash(0, node.key); + } + + const HashContext = struct { + pub fn hash(self: @This(), s: *const Item) u64 { + _ = self; + return std.hash.Wyhash.hash(0, std.mem.asBytes(s)); + } + pub fn eql(self: @This(), a: *const Item, b: *const Item) bool { + _ = self; + return a.left == b.left and a.right == b.right and std.mem.eql(u8, a.key, b.key); + } + }; +}; + +allocator: std.mem.Allocator, +node_arena: std.heap.ArenaAllocator, +/// nodes are hash-consed so structural equality can be determined by comparing pointers +nodes: std.HashMapUnmanaged(*const Item, void, Item.HashContext, std.hash_map.default_max_load_percentage) = .{}, + +pub fn init(allocator: std.mem.Allocator) Treap { + return .{ .allocator = allocator, .node_arena = std.heap.ArenaAllocator.init(allocator) }; +} + +pub fn deinit(self: *Treap) void { + self.nodes.deinit(self.allocator); + self.node_arena.deinit(); +} + +fn makeNode(self: *Treap, key: Key, left: Node, right: Node) !Node { + const node: Item = .{ .key = key, .left = left, .right = right }; + const gop = try self.nodes.getOrPut(self.allocator, &node); + if (gop.found_existing) return gop.key_ptr.*; + + const new_node = try self.node_arena.allocator().create(Item); + new_node.* = .{ + .key = key, + .left = left, + .right = right, + }; + gop.key_ptr.* = new_node; + return new_node; +} + +fn join(self: *Treap, t1_arg: Node, t2_arg: Node) !Node { + const t1 = t1_arg orelse return t2_arg; + const t2 = t2_arg orelse return t1_arg; + if (t1.priority() < t2.priority()) { + return self.makeNode(t2.key, try self.join(t1, t2.left), t2.right); + } else { + return self.makeNode(t1.key, t1.left, try self.join(t1.right, t2)); + } +} + +fn split(self: *Treap, t_arg: Node, key: Key) !struct { Node, Node } { + const t = t_arg orelse return .{ null, null }; + switch (std.mem.order(u8, key, t.key)) { + .lt => { + const l1, const l2 = try self.split(t.left, key); + return .{ l1, try self.makeNode(t.key, l2, t.right) }; + }, + .eq, .gt => { + const r1, const r2 = try self.split(t.right, key); + return .{ try self.makeNode(t.key, t.left, r1), r2 }; + }, + } +} + +fn add(self: *Treap, t1_arg: Node, t2_arg: Node) !Node { + const t1 = t1_arg orelse return t2_arg; + const t2 = t2_arg orelse return t1_arg; + std.debug.assert(!std.mem.eql(u8, t1.key, t2.key)); + if (t1.priority() < t2.priority()) { + const l1, const r1 = try self.split(t1, t2.key); + return self.makeNode(t2.key, try self.add(l1, t2.left), try self.add(r1, t2.right)); + } else { + const l2, const r2 = try self.split(t2, t1.key); + return self.makeNode(t1.key, try self.add(t1.left, l2), try self.add(t1.right, r2)); + } +} + +pub fn addNodeTo(self: *Treap, t1: Node, key: Key) !Node { + std.debug.assert(!self.contains(t1, key)); + const node = try self.makeNode(key, null, null); + return self.add(t1, node); +} + +pub fn @"union"(self: *Treap, t1_arg: Node, t2_arg: Node) !Node { + if (t1_arg == t2_arg) return t1_arg; + const t1 = t1_arg orelse return t2_arg; + const t2 = t2_arg orelse return t1_arg; + + if (std.mem.eql(u8, t1.key, t2.key)) { + return self.makeNode(t1.key, try self.@"union"(t1.left, t2.left), try self.@"union"(t1.right, t2.right)); + } else if (t1.priority() < t2.priority()) { + const l1, const r1 = try self.split(t1, t2.key); + 
return self.makeNode(t2.key, try self.@"union"(l1, t2.left), try self.@"union"(r1, t2.right)); + } else { + const l2, const r2 = try self.split(t2, t1.key); + return self.makeNode(t1.key, try self.@"union"(t1.left, l2), try self.@"union"(t1.right, r2)); + } +} + +pub fn intersection(self: *Treap, t1_arg: Node, t2_arg: Node) !Node { + if (t1_arg == t2_arg) return t1_arg; + const t1 = t1_arg orelse return null; + const t2 = t2_arg orelse return null; + + if (std.mem.eql(u8, t1.key, t2.key)) { + return self.makeNode(t1.key, try self.intersection(t1.left, t2.left), try self.intersection(t1.right, t2.right)); + } else if (t1.priority() < t2.priority()) { + const l1, const r1 = try self.split(t1, t2.key); + return self.join(try self.intersection(l1, t2.left), try self.intersection(r1, t2.right)); + } else { + const l2, const r2 = try self.split(t2, t1.key); + return self.join(try self.intersection(t1.left, l2), try self.intersection(t1.right, r2)); + } +} + +pub fn contains(self: *Treap, t_arg: Node, key: Key) bool { + const t = t_arg orelse return false; + return switch (std.mem.order(u8, key, t.key)) { + .eq => true, + .lt => self.contains(t.left, key), + .gt => self.contains(t.right, key), + }; +} + +test add { + var treap = Treap.init(std.testing.allocator); + defer treap.deinit(); + + const tree1 = try treap.addNodeTo(null, "1"); + const tree2 = try treap.addNodeTo(tree1, "2"); + const tree3 = try treap.addNodeTo(tree2, "3"); + const tree4 = try treap.addNodeTo(tree3, "4"); + + try std.testing.expect(treap.contains(tree1, "1")); + try std.testing.expect(!treap.contains(tree1, "2")); + try std.testing.expect(treap.contains(tree2, "1")); + try std.testing.expect(treap.contains(tree2, "2")); + try std.testing.expect(!treap.contains(tree2, "3")); + try std.testing.expect(treap.contains(tree3, "1")); + try std.testing.expect(treap.contains(tree3, "2")); + try std.testing.expect(treap.contains(tree3, "3")); + try std.testing.expect(!treap.contains(tree3, "4")); + + try std.testing.expect(treap.contains(tree4, "4")); +} From d96db70e6d32747d78097d1ef8aaf8a74215bc88 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Wed, 17 Jul 2024 13:32:15 -0700 Subject: [PATCH 03/10] Preprocessor: connect new preprocessor to parser --- src/aro/Driver.zig | 45 - src/aro/NewPreprocessor.zig | 2097 --------------- src/aro/Parser.zig | 33 + src/aro/Pragma.zig | 4 +- src/aro/Preprocessor.zig | 4756 ++++++++++++----------------------- src/aro/Tree.zig | 46 +- src/aro/pragmas/gcc.zig | 10 +- test/runner.zig | 6 +- 8 files changed, 1693 insertions(+), 5304 deletions(-) delete mode 100644 src/aro/NewPreprocessor.zig diff --git a/src/aro/Driver.zig b/src/aro/Driver.zig index 2d383bf0..db0b1155 100644 --- a/src/aro/Driver.zig +++ b/src/aro/Driver.zig @@ -9,7 +9,6 @@ const Compilation = @import("Compilation.zig"); const Diagnostics = @import("Diagnostics.zig"); const LangOpts = @import("LangOpts.zig"); const Preprocessor = @import("Preprocessor.zig"); -const NewPreprocessor = @import("NewPreprocessor.zig"); const Source = @import("Source.zig"); const Toolchain = @import("Toolchain.zig"); const target_util = @import("target.zig"); @@ -37,7 +36,6 @@ line_commands: bool = true, /// If true, use `#line ` instead of `# ` for line directives use_line_directives: bool = false, only_preprocess: bool = false, -new_preprocessor: bool = false, only_syntax: bool = false, only_compile: bool = false, only_preprocess_and_compile: bool = false, @@ -238,8 +236,6 @@ pub fn parseArgs( d.only_compile = true; } else if (mem.eql(u8, arg, "-E")) { 
d.only_preprocess = true; - } else if (mem.eql(u8, arg, "-fnew-preprocessor")) { - d.new_preprocessor = true; } else if (mem.eql(u8, arg, "-P") or mem.eql(u8, arg, "--no-line-commands")) { d.line_commands = false; } else if (mem.eql(u8, arg, "-fuse-line-directives")) { @@ -634,47 +630,6 @@ fn processSource( comptime fast_exit: bool, ) !void { d.comp.generated_buf.items.len = 0; - if (d.new_preprocessor) { - var pp = try NewPreprocessor.initDefault(d.comp); - defer pp.deinit(); - if (d.comp.langopts.ms_extensions) { - d.comp.ms_cwd_source_id = source.id; - } - - if (d.verbose_pp) pp.verbose = true; - if (d.only_preprocess) { - pp.preserve_whitespace = true; - if (d.line_commands) { - pp.linemarkers = if (d.use_line_directives) .line_directives else .numeric_directives; - } - } - - try pp.preprocessSources(&.{ source, builtin, user_macros }); - - d.renderErrors(); - - if (d.comp.diagnostics.errors != 0) { - if (fast_exit) std.process.exit(1); // Not linking, no need for cleanup. - return; - } - - const file = if (d.output_name) |some| - std.fs.cwd().createFile(some, .{}) catch |er| - return d.fatal("unable to create output file '{s}': {s}", .{ some, errorDescription(er) }) - else - std.io.getStdOut(); - defer if (d.output_name != null) file.close(); - - var buf_w = std.io.bufferedWriter(file.writer()); - pp.prettyPrintTokens(buf_w.writer()) catch |er| - return d.fatal("unable to write result: {s}", .{errorDescription(er)}); - - buf_w.flush() catch |er| - return d.fatal("unable to write result: {s}", .{errorDescription(er)}); - - std.process.exit(0); // Not linking, no need for cleanup. - return; - } var pp = try Preprocessor.initDefault(d.comp); defer pp.deinit(); diff --git a/src/aro/NewPreprocessor.zig b/src/aro/NewPreprocessor.zig deleted file mode 100644 index 77442d26..00000000 --- a/src/aro/NewPreprocessor.zig +++ /dev/null @@ -1,2097 +0,0 @@ -const std = @import("std"); -const mem = std.mem; -const Allocator = mem.Allocator; -const assert = std.debug.assert; -const Compilation = @import("Compilation.zig"); -const Error = Compilation.Error; -const Source = @import("Source.zig"); -const Tokenizer = @import("Tokenizer.zig"); -const RawToken = Tokenizer.Token; -const Parser = @import("Parser.zig"); -const Diagnostics = @import("Diagnostics.zig"); -const Tree = @import("Tree.zig"); -const Token = Tree.Token; -const TokenWithExpansionLocs = Tree.TokenWithExpansionLocs; -const Attribute = @import("Attribute.zig"); -const features = @import("features.zig"); -const OldPreprocessor = @import("Preprocessor.zig"); -const Treap = @import("treap.zig"); - -const ParamMap = std.StringHashMapUnmanaged(PreprocessorToken); -const DefineMap = std.StringHashMapUnmanaged(Macro); - -const TokenList = std.ArrayListUnmanaged(PreprocessorToken); -const max_include_depth = 200; - -/// Errors that can be returned when expanding a macro. 
-/// error.UnknownPragma can occur within Preprocessor.pragma() but -/// it is handled there and doesn't escape that function -const MacroError = Error || error{StopPreprocessing}; - -const PreprocessingError = Error || error{PreprocessingFailed}; - -const SpecialMacroFn = fn (*Preprocessor, PreprocessorToken) Error!void; - -fn Range(comptime T: type) type { - return struct { - const Self = @This(); - const Item = T; - - start: u32, - end: u32, - const empty: Self = .{ .start = 0, .end = 0 }; - - fn len(self: Self) u32 { - return self.end - self.start; - } - - fn slice(self: Self, items: []const Item) []const Item { - return items[self.start..self.end]; - } - }; -} - -/// Each macro argument is a list of tokens (represented as a range of Preprocessor.macro_arg_tokens) -const MacroArg = Range(PreprocessorToken); - -/// List of MacroArg's for a macro invocation (represented as a range of Preprocessor.macro_args) -const MacroArgList = Range(MacroArg); - -const PreprocessorToken = struct { - flags: packed struct(u8) { - is_bol: bool = false, - space: bool = false, - _: u6 = undefined, - } = .{}, - id: Tokenizer.Token.Id, - hideset: Treap.Node = null, - loc: Source.Location, - expansion_locs: ?[*]Source.Location = null, - - fn argPosition(self: PreprocessorToken) u32 { - std.debug.assert(self.id == .macro_param); - return self.loc.byte_offset; - } - - fn isVarArg(self: PreprocessorToken) bool { - std.debug.assert(self.id == .macro_param); - return self.loc.line != 0; - } - - pub fn expansionSlice(tok: PreprocessorToken) []const Source.Location { - const locs = tok.expansion_locs orelse return &[0]Source.Location{}; - var i: usize = 0; - while (locs[i].id != .unused) : (i += 1) {} - return locs[0..i]; - } - - pub fn addExpansionLocation(tok: *PreprocessorToken, gpa: std.mem.Allocator, new: []const Source.Location) !void { - if (new.len == 0 or tok.id == .whitespace or tok.id == .macro_ws or tok.id == .placemarker) return; - var list = std.ArrayList(Source.Location).init(gpa); - defer { - @memset(list.items.ptr[list.items.len..list.capacity], .{}); - // Add a sentinel to indicate the end of the list since - // the ArrayList's capacity isn't guaranteed to be exactly - // what we ask for. - if (list.capacity > 0) { - list.items.ptr[list.capacity - 1].byte_offset = 1; - } - tok.expansion_locs = list.items.ptr; - } - - if (tok.expansion_locs) |locs| { - var i: usize = 0; - while (locs[i].id != .unused) : (i += 1) {} - list.items = locs[0..i]; - while (locs[i].byte_offset != 1) : (i += 1) {} - list.capacity = i + 1; - } - - const min_len = @max(list.items.len + new.len + 1, 4); - const wanted_len = std.math.ceilPowerOfTwo(usize, min_len) catch - return error.OutOfMemory; - try list.ensureTotalCapacity(wanted_len); - - for (new) |new_loc| { - if (new_loc.id == .generated) continue; - list.appendAssumeCapacity(new_loc); - } - } - - pub fn free(expansion_locs: ?[*]Source.Location, gpa: std.mem.Allocator) void { - const locs = expansion_locs orelse return; - var i: usize = 0; - while (locs[i].id != .unused) : (i += 1) {} - while (locs[i].byte_offset != 1) : (i += 1) {} - gpa.free(locs[0 .. 
i + 1]); - } - - pub fn dupe(tok: PreprocessorToken, gpa: std.mem.Allocator) !PreprocessorToken { - var copy = tok; - copy.expansion_locs = null; - try copy.addExpansionLocation(gpa, tok.expansionSlice()); - return copy; - } - - pub fn checkMsEof(tok: PreprocessorToken, source: Source, comp: *Compilation) !void { - std.debug.assert(tok.id == .eof); - if (source.buf.len > tok.loc.byte_offset and source.buf[tok.loc.byte_offset] == 0x1A) { - try comp.addDiagnostic(.{ - .tag = .ctrl_z_eof, - .loc = .{ - .id = source.id, - .byte_offset = tok.loc.byte_offset, - .line = tok.loc.line, - }, - }, &.{}); - } - } - - const one: PreprocessorToken = .{ .id = .one, .loc = .{} }; - const zero: PreprocessorToken = .{ .id = .zero, .loc = .{} }; -}; - -const Macro = struct { - /// Tokens constituting the macro body - tokens: []const PreprocessorToken, - - /// Number of arguments for function-like macros - nargs: usize, - - /// If the function type macro has variable number of arguments - var_args: bool, - - /// Location of macro in the source - loc: Source.Location, - - kind: Kind, - - const Kind = union(enum) { - object, - func, - special: *const SpecialMacroFn, - }; - - fn eql(a: Macro, b: Macro, pp: *Preprocessor) bool { - if ((a.kind == .object and b.kind != .object) or (a.kind == .func and b.kind != .func)) return false; - if (!std.meta.eql(a.kind, b.kind)) return false; - if (a.tokens.len != b.tokens.len) return false; - for (a.tokens, b.tokens) |a_tok, b_tok| if (!tokEql(pp, a_tok, b_tok)) return false; - - if (a.kind == .func) { - if (a.var_args != b.var_args) return false; - } - - return true; - } - - fn tokEql(pp: *Preprocessor, a: PreprocessorToken, b: PreprocessorToken) bool { - return mem.eql(u8, pp.tokSlice(a), pp.tokSlice(b)); - } -}; - -const Preprocessor = @This(); - -const ExpansionEntry = struct { - idx: Tree.TokenIndex, - locs: [*]Source.Location, -}; - -const TokenState = struct { - tokens_len: usize, - expansion_entries_len: usize, -}; - -comp: *Compilation, -gpa: mem.Allocator, -arena: std.heap.ArenaAllocator, - -tokens: std.MultiArrayList(PreprocessorToken) = .{}, -/// Do not directly mutate this; must be kept in sync with `tokens` -expansion_entries: std.MultiArrayList(ExpansionEntry) = .{}, - -/// Map from Source.Id to macro name in the `#ifndef` condition which guards the source, if any -include_guards: std.AutoHashMapUnmanaged(Source.Id, []const u8) = .{}, - -char_buf: std.ArrayListUnmanaged(u8) = .{}, - -/// Dump current state to stderr. -verbose: bool = false, -preserve_whitespace: bool = false, - -/// linemarker tokens. Must be .none unless in -E mode (parser does not handle linemarkers) -linemarkers: Linemarkers = .none, - -tokenizers: std.ArrayListUnmanaged(Tokenizer) = .{}, - -expansion_bufs: std.ArrayListUnmanaged(TokenList) = .{}, - -defines: DefineMap = .{}, - -generated_line: u32 = 1, - -counter: u32 = 0, - -if_level: u8 = 0, - -if_kind: std.PackedIntArray(u2, 256) = blk: { - @setEvalBranchQuota(2000); - break :blk std.PackedIntArray(u2, 256).initAllTo(0); -}, - -guard_stack: std.ArrayListUnmanaged(?[]const u8) = .{}, - -macro_arg_tokens: std.ArrayListUnmanaged(MacroArg.Item) = .{}, -macro_args: std.ArrayListUnmanaged(MacroArgList.Item) = .{}, - -safe_strings: std.StringHashMapUnmanaged(void) = .{}, - -treap: Treap, - -pub const parse = Parser.parse; - -pub const Linemarkers = enum { - /// No linemarker tokens. 
Required setting if parser will run - none, - /// #line "filename" - line_directives, - /// # "filename" flags - numeric_directives, -}; - -pub fn init(comp: *Compilation) Preprocessor { - const pp = Preprocessor{ - .comp = comp, - .gpa = comp.gpa, - .arena = std.heap.ArenaAllocator.init(comp.gpa), - .treap = Treap.init(comp.gpa), - }; - comp.pragmaEvent(.before_preprocess); - return pp; -} - -fn addBuiltinMacro(pp: *Preprocessor, name: []const u8, func: *const SpecialMacroFn) !void { - try pp.defines.putNoClobber(pp.gpa, name, .{ - .tokens = &.{}, - .var_args = false, - .loc = .{ .id = .generated }, - .kind = .{ .special = func }, - .nargs = 0, - }); -} - -fn handleLineMacro(pp: *Preprocessor, tok: PreprocessorToken) Error!void { - const start = pp.comp.generated_buf.items.len; - const source = pp.comp.getSource(tok.loc.id); - const w = pp.comp.generated_buf.writer(pp.gpa); - try w.print("{d}\n", .{source.physicalLine(tok.loc)}); - const pasted_tok = try pp.makeGeneratedToken(start, .pp_num, tok); - return pp.ungetToken(pasted_tok); -} - -fn handleFileMacro(pp: *Preprocessor, tok: PreprocessorToken) Error!void { - const start = pp.comp.generated_buf.items.len; - const source = pp.comp.getSource(tok.loc.id); - const w = pp.comp.generated_buf.writer(pp.gpa); - try w.print("\"{s}\"\n", .{source.path}); - const pasted_tok = try pp.makeGeneratedToken(start, .string_literal, tok); - return pp.ungetToken(pasted_tok); -} - -fn handleCounterMacro(pp: *Preprocessor, tok: PreprocessorToken) Error!void { - defer pp.counter += 1; - const start = pp.comp.generated_buf.items.len; - const w = pp.comp.generated_buf.writer(pp.gpa); - try w.print("{d}\n", .{pp.counter}); - const pasted_tok = try pp.makeGeneratedToken(start, .pp_num, tok); - return pp.ungetToken(pasted_tok); -} - -fn makeGeneratedToken(pp: *Preprocessor, start: usize, id: Token.Id, source: PreprocessorToken) !PreprocessorToken { - const pasted_token = PreprocessorToken{ .id = id, .flags = source.flags, .loc = .{ - .id = .generated, - .byte_offset = @intCast(start), - .line = pp.generated_line, - } }; - pp.generated_line += 1; - // try pasted_token.addExpansionLocation(pp.gpa, &.{source.loc}); - // try pasted_token.addExpansionLocation(pp.gpa, source.expansionSlice()); - return pasted_token; -} - -fn errStr(pp: *Preprocessor, tok: PreprocessorToken, tag: Diagnostics.Tag, str: []const u8) !void { - try pp.comp.addDiagnostic(.{ - .tag = tag, - .loc = tok.loc, - .extra = .{ .str = str }, - }, &.{}); // todo expansion slice -} - -fn errTok(pp: *Preprocessor, tok: PreprocessorToken, tag: Diagnostics.Tag) !void { - try pp.comp.addDiagnostic(.{ - .tag = tag, - .loc = tok.loc, - .extra = .{ .none = {} }, - }, &.{}); // todo expansion slice -} - -fn expectClosing(pp: *Preprocessor, opening: PreprocessorToken, id: Token.Id) !void { - // todo: fix expect - const item = try pp.expect(id, .closing_paren); - if (item.id != id) { - try pp.errTok(opening, .to_match_paren); - } -} - -fn tokFromBool(b: bool) PreprocessorToken { - return if (b) PreprocessorToken.one else PreprocessorToken.zero; -} - -fn handleHasAttribute(pp: *Preprocessor, tok: PreprocessorToken) Error!void { - _ = tok; - const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); - const attr_name = try pp.readToken(); - try pp.expectClosing(l_paren, .r_paren); - - const has_attr = Attribute.fromString(.gnu, null, pp.tokSlice(attr_name)) != null; - return pp.ungetToken(tokFromBool(has_attr)); -} - -fn handleHasCAttribute(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void 
{ - _ = macro_tok; - const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); - var r: TokenList = .{}; - defer r.deinit(pp.gpa); - - var tok: PreprocessorToken = undefined; - while (true) { - tok = try pp.readToken(); - if (tok.id == .comment) continue; - if (tok.id.isDirectiveEnd() or tok.id == .r_paren) break; - try r.append(pp.gpa, tok); - } - try pp.expectClosing(l_paren, .r_paren); -} - -fn handleHasDeclSpecAttribute(pp: *Preprocessor, tok: PreprocessorToken) Error!void { - _ = tok; - const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); - const attr_name = try pp.readToken(); - try pp.expectClosing(l_paren, .r_paren); - - const ident_str = pp.tokSlice(attr_name); - const has_attr = if (pp.comp.langopts.declspec_attrs) Attribute.fromString(.declspec, null, ident_str) != null else false; - return pp.ungetToken(tokFromBool(has_attr)); -} - -fn handleHasFeature(pp: *Preprocessor, tok: PreprocessorToken) Error!void { - _ = tok; - const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); - const attr_name = try pp.readToken(); - try pp.expectClosing(l_paren, .r_paren); - - const ident_str = pp.tokSlice(attr_name); - const has_feature = features.hasFeature(pp.comp, ident_str); - return pp.ungetToken(tokFromBool(has_feature)); -} - -fn handleHasExtension(pp: *Preprocessor, tok: PreprocessorToken) Error!void { - _ = tok; - const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); - const attr_name = try pp.readToken(); - try pp.expectClosing(l_paren, .r_paren); - - const ident_str = pp.tokSlice(attr_name); - const has_extension = features.hasExtension(pp.comp, ident_str); - return pp.ungetToken(tokFromBool(has_extension)); -} - -fn handleHasBuiltin(pp: *Preprocessor, tok: PreprocessorToken) Error!void { - _ = tok; - const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); - const attr_name = try pp.readToken(); - try pp.expectClosing(l_paren, .r_paren); - - const ident_str = pp.tokSlice(attr_name); - const has_builtin = pp.comp.hasBuiltin(ident_str); - return pp.ungetToken(tokFromBool(has_builtin)); -} - -fn handleHasWarning(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { - const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); - const start = pp.char_buf.items.len; - defer pp.char_buf.items.len = start; - - while (true) { - const tok = try pp.readExpandNewline(); - switch (tok.id) { - .nl, .eof => { - try pp.errTok(tok, .unterminated_macro_arg_list); - return pp.ungetToken(PreprocessorToken.zero); - }, - .r_paren => break, - .string_literal => { - const string = pp.tokSlice(tok); - try pp.char_buf.appendSlice(pp.gpa, string[1 .. 
string.len - 1]); - }, - else => { - pp.skipToNl(); - try pp.errTok(tok, .missing_paren_param_list); - try pp.errTok(l_paren, .to_match_paren); - return pp.ungetToken(PreprocessorToken.zero); - }, - } - } - const actual_param = pp.char_buf.items[start..]; - if (actual_param.len == 0) { - try pp.comp.addDiagnostic(.{ - .tag = .expected_arguments, - .loc = macro_tok.loc, - .extra = .{ .arguments = .{ .expected = 1, .actual = 0 } }, - }, &.{}); // todo expansion slice - return pp.ungetToken(PreprocessorToken.zero); - } - if (!mem.startsWith(u8, actual_param, "-W")) { - try pp.errStr(l_paren, .malformed_warning_check, "__has_warning"); - return pp.ungetToken(PreprocessorToken.zero); - } - const warning_name = actual_param[2..]; - const exists = Diagnostics.warningExists(warning_name); - return pp.ungetToken(tokFromBool(exists)); -} - -fn handleHasInclude(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { - return pp.handleHasIncludeExtra(macro_tok, .first); -} - -fn handleHasIncludeNext(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { - return pp.handleHasIncludeExtra(macro_tok, .next); -} - -fn handleHasIncludeExtra(pp: *Preprocessor, macro_tok: PreprocessorToken, which: Compilation.WhichInclude) Error!void { - const l_paren = pp.getToken(); - if (l_paren.id != .l_paren) { - pp.skipToNl(); - return; - } - - var is_std: bool = undefined; - const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { - error.InvalidInclude => return pp.ungetToken(PreprocessorToken.zero), - else => |e| return e, - }; - try pp.expectClosing(l_paren, .r_paren); - - const filename = include_str[1 .. include_str.len - 1]; - const include_type: Compilation.IncludeType = switch (include_str[0]) { - '"' => .quotes, - '<' => .angle_brackets, - else => unreachable, - }; - - if (which == .first or pp.includeDepth() == 0) { - if (which == .next) { - try pp.comp.addDiagnostic(.{ - .tag = .include_next_outside_header, - .loc = macro_tok.loc, - }, &.{}); - } - const has = try pp.comp.hasInclude(filename, macro_tok.loc.id, include_type, .first); - return pp.ungetToken(tokFromBool(has)); - } - const has = try pp.comp.hasInclude(filename, macro_tok.loc.id, include_type, .next); - return pp.ungetToken(tokFromBool(has)); -} - -fn includeDepth(pp: *Preprocessor) usize { - return pp.tokenizers.items.len - 1; -} - -fn hasEmbedValue(contents_arg: ?[]const u8) []const u8 { - const contents = contents_arg orelse return "0\n"; - if (contents.len == 0) return "2\n"; - return "1\n"; -} - -/// TODO: handle limit/prefix/suffix/etc -fn handleHasEmbed(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { - const l_paren = pp.getToken(); - if (l_paren.id != .l_paren) { - pp.skipToNl(); - return; - } - - var is_std: bool = undefined; - const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { - error.InvalidInclude => return, - else => |e| return e, - }; - try pp.expectClosing(l_paren, .r_paren); - - const filename = include_str[1 .. 
include_str.len - 1]; - const include_type: Compilation.IncludeType = switch (include_str[0]) { - '"' => .quotes, - '<' => .angle_brackets, - else => unreachable, - }; - - const contents = try pp.comp.findEmbed(filename, macro_tok.loc.id, include_type, 1); - const result = hasEmbedValue(contents); - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.appendSlice(pp.comp.gpa, result); - const pasted_tok = try pp.makeGeneratedToken(start, .pp_num, macro_tok); - return pp.ungetToken(pasted_tok); -} - -// Skip until newline, ignore other tokens. -fn skipToNl(pp: *Preprocessor) void { - while (true) { - const tok = pp.getToken(); - if (tok.id.isDirectiveEnd()) return; - } -} - -fn readOneIdentifierArgument(pp: *Preprocessor, macro_tok: PreprocessorToken) !?PreprocessorToken { - const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); - _ = l_paren; - var invalid: ?PreprocessorToken = null; - var identifier: ?PreprocessorToken = null; - while (true) { - var tok = pp.getToken(); - tok.id.simplifyMacroKeywordExtra(true); - - switch (tok.id) { - .r_paren, .eof => break, - else => { - if (identifier) |_| invalid = tok else identifier = tok; - }, - } - } - if (invalid) |some| { - try pp.comp.addDiagnostic(.{ - .tag = .missing_tok_builtin, - .loc = some.loc, - .extra = .{ .tok_id_expected = .r_paren }, - }, &.{}); // TODO: expansion slice - return null; - } - if (identifier) |ident| { - if (ident.id == .identifier or ident.id == .extended_identifier) return ident; - } else { - const extra: Diagnostics.Message.Extra = .{ .arguments = .{ .expected = 1, .actual = 0 } }; - try pp.comp.addDiagnostic(.{ .tag = .expected_arguments, .loc = macro_tok.loc, .extra = extra }, &.{}); - } - return null; -} - -fn handleIsIdentifier(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { - if (try pp.readOneIdentifierArgument(macro_tok)) |_| { - return pp.ungetToken(PreprocessorToken.one); - } else { - return pp.ungetToken(PreprocessorToken.zero); - } -} - -fn handlePragmaOperator(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { - _ = pp; - _ = macro_tok; - // TODO -} - -fn addBuiltinMacros(pp: *Preprocessor) !void { - try pp.addBuiltinMacro("__has_attribute", handleHasAttribute); - try pp.addBuiltinMacro("__has_c_attribute", handleHasCAttribute); - try pp.addBuiltinMacro("__has_declspec_attribute", handleHasDeclSpecAttribute); - try pp.addBuiltinMacro("__has_feature", handleHasFeature); - try pp.addBuiltinMacro("__has_extension", handleHasExtension); - try pp.addBuiltinMacro("__has_builtin", handleHasBuiltin); - try pp.addBuiltinMacro("__has_warning", handleHasWarning); - try pp.addBuiltinMacro("__has_include", handleHasInclude); - try pp.addBuiltinMacro("__has_include_next", handleHasIncludeNext); - try pp.addBuiltinMacro("__has_embed", handleHasEmbed); - - try pp.addBuiltinMacro("__is_identifier", handleIsIdentifier); - - try pp.addBuiltinMacro("__FILE__", handleFileMacro); - try pp.addBuiltinMacro("__LINE__", handleLineMacro); - try pp.addBuiltinMacro("__COUNTER__", handleCounterMacro); - try pp.addBuiltinMacro("_Pragma", handlePragmaOperator); -} - -/// Initialize Preprocessor with builtin macros. 
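Each handler registered in `addBuiltinMacros` above follows the same lazy pattern: it computes its result, turns it into a single token (via `makeGeneratedToken`, or the prebuilt `one`/`zero` tokens), and pushes that token back with `ungetToken`, so the enclosing `readExpand` loop picks it up as the macro's replacement. For example, since `handleCounterMacro` prints `pp.counter` and only then increments it, three successive uses of `__COUNTER__` in a translation unit expand to the pp-numbers 0, 1, and 2.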
-pub fn initDefault(comp: *Compilation) !Preprocessor { - var pp = init(comp); - errdefer pp.deinit(); - try pp.addBuiltinMacros(); - return pp; -} - -pub fn deinit(pp: *Preprocessor) void { - pp.arena.deinit(); - pp.include_guards.deinit(pp.gpa); - pp.tokens.deinit(pp.gpa); - pp.tokenizers.deinit(pp.gpa); - for (pp.expansion_bufs.items) |*toklist| { - toklist.deinit(pp.gpa); - } - pp.expansion_bufs.deinit(pp.gpa); - pp.defines.deinit(pp.gpa); - pp.char_buf.deinit(pp.gpa); - for (pp.expansion_entries.items(.locs)) |locs| PreprocessorToken.free(locs, pp.gpa); - pp.expansion_entries.deinit(pp.gpa); - pp.guard_stack.deinit(pp.gpa); - pp.macro_arg_tokens.deinit(pp.gpa); - pp.macro_args.deinit(pp.gpa); - pp.safe_strings.deinit(pp.gpa); - pp.treap.deinit(); -} - -/// Preprocess a compilation unit of sources into a parsable list of tokens. -pub fn preprocessSources(pp: *Preprocessor, sources: []const Source) Error!void { - assert(sources.len > 1); - const first = sources[0]; - - for (sources[1..]) |header| { - _ = try pp.preprocess(header); - } - const eof = try pp.preprocess(first); - try pp.addToken(eof); -} - -fn propagateSpace(pp: *Preprocessor, tokens: []PreprocessorToken, template: PreprocessorToken) void { - if (tokens.len > 0) { - tokens[0].flags = template.flags; - } else { - pp.injectSpace(); - } -} - -fn ungetAll(pp: *Preprocessor, tokens: []const PreprocessorToken) !void { - if (tokens.len == 0) return; - const start = pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items.len; - try pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].appendSlice(pp.gpa, tokens); - std.mem.reverse(PreprocessorToken, pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items[start..]); -} - -fn addHideSet(pp: *Preprocessor, toks: []PreprocessorToken, hideset: Treap.Node) !void { - for (toks) |*tok| { - switch (tok.id) { - // non-identifiers are not expanded, so we don't need to track their hidesets. - // Track r_paren hideset since it is used for computing the hideset of function-like macro expansions - .identifier, .extended_identifier, .r_paren => { - tok.hideset = try pp.treap.@"union"(tok.hideset, hideset); - }, - else => {}, - } - } -} - -fn stringize(pp: *Preprocessor, tmpl: PreprocessorToken, args_range: MacroArg) !PreprocessorToken { - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.append(pp.gpa, '"'); - const args = args_range.slice(pp.macro_arg_tokens.items); - for (args, 0..) 
|tok, i| { - const slice = pp.tokSlice(tok); - if (slice.len > 0 and tok.flags.space and i != 0) { - try pp.comp.generated_buf.append(pp.gpa, ' '); - } - try pp.comp.generated_buf.appendSlice(pp.gpa, slice); - } - try pp.comp.generated_buf.append(pp.gpa, '"'); - var tok = tmpl; - tok.id = .string_literal; - tok.loc = .{ - .id = .generated, - .byte_offset = @intCast(start), - .line = pp.generated_line, - }; - pp.generated_line += 1; - return tok; -} - -fn subst(pp: *Preprocessor, macro: *const Macro, macro_tok: PreprocessorToken, args: MacroArgList, hideset_arg: Treap.Node) ![]PreprocessorToken { - _ = macro_tok; - var hideset = hideset_arg; - var r: TokenList = .{}; - defer r.deinit(pp.gpa); - var i: usize = 0; - while (i < macro.tokens.len) : (i += 1) { - const t0 = macro.tokens[i]; - const t1: ?PreprocessorToken = if (i == macro.tokens.len - 1) null else macro.tokens[i + 1]; - - const t0_param = t0.id == .macro_param; - const t1_param = if (t1) |tok| tok.id == .macro_param else false; - - if (t0.id == .hash and t1_param) { - const arg = args.slice(pp.macro_args.items)[t1.?.argPosition()]; - const stringized = try pp.stringize(t0, arg); - try r.append(pp.gpa, stringized); - i += 1; - continue; - } - if (t0.id == .hash_hash and t1_param) { - const arg = args.slice(pp.macro_args.items)[t1.?.argPosition()]; - if (t1.?.isVarArg() and r.items.len > 0 and r.items[r.items.len - 1].id == .comma) { - if (arg.len() == 0) { - _ = r.pop(); - } else { - try r.appendSlice(pp.gpa, arg.slice(pp.macro_arg_tokens.items)); - } - } else if (arg.len() > 0) { - try pp.pasteAndPush(&r, arg.slice(pp.macro_arg_tokens.items)[0]); - try r.appendSlice(pp.gpa, arg.slice(pp.macro_arg_tokens.items)[1..]); - } - i += 1; - continue; - } - if (t0.id == .hash_hash and t1 != null) { - hideset = t1.?.hideset; - try pp.pasteAndPush(&r, t1.?); - i += 1; - continue; - } - if (t0_param and t1 != null and t1.?.id == .hash_hash) { - hideset = t1.?.hideset; - const arg = args.slice(pp.macro_args.items)[t0.argPosition()]; - if (arg.len() == 0) { - i += 1; - } else { - try r.appendSlice(pp.gpa, arg.slice(pp.macro_arg_tokens.items)); - } - continue; - } - if (t0_param) { - const arg = args.slice(pp.macro_args.items)[t0.argPosition()]; - const expanded = try pp.expandAll(arg.slice(pp.macro_arg_tokens.items), t0); - defer pp.gpa.free(expanded); - try r.appendSlice(pp.gpa, expanded); - continue; - } - try r.append(pp.gpa, t0); - } - try pp.addHideSet(r.items, hideset); - return r.toOwnedSlice(pp.gpa); -} - -fn pasteTokens(pp: *Preprocessor, lhs: PreprocessorToken, rhs: PreprocessorToken) !PreprocessorToken { - const start = pp.comp.generated_buf.items.len; - const end = start + pp.tokSlice(lhs).len + pp.tokSlice(rhs).len; - try pp.comp.generated_buf.ensureTotalCapacity(pp.gpa, end + 1); // +1 for a newline - - // We cannot use the same slices here since they might be invalidated by `ensureCapacity` - pp.comp.generated_buf.appendSliceAssumeCapacity(pp.tokSlice(lhs)); - pp.comp.generated_buf.appendSliceAssumeCapacity(pp.tokSlice(rhs)); - pp.comp.generated_buf.appendAssumeCapacity('\n'); - - // Try to tokenize the result. 
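In other words, the spellings of the two operands are appended to the generated buffer and re-lexed below; if the re-lex does not consume the whole pasted spelling as one token (the follow-up token is not the trailing newline), the paste was invalid. For example, pasting `x` onto `1` yields the single identifier `x1`, while pasting `+` onto `-` leaves the `-` unconsumed and raises the `pasting_formed_invalid` diagnostic.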
- var tmp_tokenizer = Tokenizer{ - .buf = pp.comp.generated_buf.items, - .langopts = pp.comp.langopts, - .index = @intCast(start), - .source = .generated, - }; - const pasted_token = tmp_tokenizer.nextNoWSComments(); - const next_tok = tmp_tokenizer.next(); - if (next_tok.id != .nl) { - try pp.errStr( - lhs, - .pasting_formed_invalid, - try pp.comp.diagnostics.arena.allocator().dupe(u8, pp.comp.generated_buf.items[start..end]), - ); - } - return pp.makeGeneratedToken(start, pasted_token.id, lhs); -} - -/// Paste `tok` onto the last token in `tokens` -fn pasteAndPush(pp: *Preprocessor, tokens: *TokenList, tok: PreprocessorToken) !void { - const last = tokens.pop(); - const pasted = try pp.pasteTokens(last, tok); - return tokens.append(pp.gpa, pasted); -} - -fn tokenBufferStashReverse(pp: *Preprocessor, tokens: []const PreprocessorToken) !void { - try pp.expansion_bufs.append(pp.gpa, .{}); - try pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].appendSlice(pp.gpa, tokens); - std.mem.reverse(PreprocessorToken, pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items); -} - -fn tokenBufferUnstash(pp: *Preprocessor) void { - var buf = pp.expansion_bufs.pop(); - buf.deinit(pp.gpa); -} - -fn expandAll(pp: *Preprocessor, tokens: []const PreprocessorToken, tmpl: PreprocessorToken) ![]const PreprocessorToken { - try pp.tokenBufferStashReverse(tokens); - defer pp.tokenBufferUnstash(); - var r: TokenList = .{}; - defer r.deinit(pp.gpa); - while (true) { - const tok = try pp.readExpand(); - if (tok.id == .eof) break; - try r.append(pp.gpa, tok); - } - pp.propagateSpace(r.items, tmpl); - return r.toOwnedSlice(pp.gpa); -} - -fn peekToken(pp: *Preprocessor) !PreprocessorToken { - const tok = try pp.readToken(); - try pp.ungetToken(tok); - return tok; -} - -/// Return a string with the same contents as `name` and whose lifetime is the same as the preprocessor's lifetime -/// If `tok` is not from the generated source, this is just `name`. -/// If `tok` is from the generated source, pointers are invalidated when the underlying ArrayList is resized. 
Therefore, -/// duplicate the string and store it (so we aren't repeatedly copying the same string) -fn getSafeString(pp: *Preprocessor, tok: PreprocessorToken, name: []const u8) ![]const u8 { - if (tok.loc.id != .generated) return name; - const gop = try pp.safe_strings.getOrPut(pp.gpa, name); - if (!gop.found_existing) { - const copy = try pp.arena.allocator().dupe(u8, name); - gop.key_ptr.* = copy; - } - return gop.key_ptr.*; -} - -fn injectSpace(pp: *Preprocessor) void { - var i = pp.expansion_bufs.items.len; - while (i > 0) : (i -= 1) { - var j = pp.expansion_bufs.items[i - 1].items.len; - while (j > 0) : (j -= 1) { - pp.expansion_bufs.items[i - 1].items[j - 1].flags.space = true; - return; - } - } -} - -fn readExpandNewline(pp: *Preprocessor) Error!PreprocessorToken { - const tok = pp.getToken(); - if (!tok.id.isMacroIdentifier()) return tok; - const name = pp.tokSlice(tok); - const macro = pp.defines.getPtr(name) orelse return tok; - - const macro_hideset = tok.hideset; - if (pp.treap.contains(macro_hideset, name)) return tok; - - switch (macro.kind) { - .object => { - const safe_name = try pp.getSafeString(tok, name); - const new_hideset = try pp.treap.addNodeTo(tok.hideset, safe_name); - - const tokens = try pp.subst(macro, tok, MacroArgList.empty, new_hideset); - defer pp.gpa.free(tokens); - pp.propagateSpace(tokens, tok); - try pp.ungetAll(tokens); - return pp.readExpand(); - }, - .func => { - if (!try pp.next(.l_paren)) return tok; - const arg_tokens_start = pp.macro_arg_tokens.items.len; - defer pp.macro_arg_tokens.items.len = arg_tokens_start; - const macro_args_start = pp.macro_args.items.len; - defer pp.macro_args.items.len = macro_args_start; - - const args = pp.readArgs(tok, macro) catch |err| switch (err) { - error.IncorrectArgumentCount => return PreprocessorToken.zero, - error.UnterminatedMacroArgumentList => { - try pp.errTok(tok, .unterminated_macro_arg_list); - return PreprocessorToken.zero; - }, - else => |e| return e, - }; - const r_paren = pp.getToken(); - std.debug.assert(r_paren.id == .r_paren); - const safe_name = try pp.getSafeString(tok, name); - - const intersection = try pp.treap.intersection(macro_hideset, r_paren.hideset); - const hideset = try pp.treap.addNodeTo(intersection, safe_name); - const tokens = try pp.subst(macro, tok, args, hideset); - defer pp.gpa.free(tokens); - pp.propagateSpace(tokens, tok); - try pp.ungetAll(tokens); - return pp.readExpand(); - }, - .special => |func| { - try func(pp, tok); - return pp.readExpand(); - }, - } -} - -fn readMacroArg(pp: *Preprocessor, end: *bool, readall: bool) !MacroArg { - var level: i32 = 0; - const start: u32 = @intCast(pp.macro_arg_tokens.items.len); - while (true) { - var tok = pp.getToken(); - if (tok.id == .eof) { - return error.UnterminatedMacroArgumentList; - } - if (tok.id == .nl) continue; - if (tok.flags.is_bol and tok.id == .hash) { - try pp.readDirective(); - continue; - } - if (level == 0 and tok.id == .r_paren) { - try pp.ungetToken(tok); - end.* = true; - break; - } - if (level == 0 and tok.id == .comma and !readall) { - break; - } - if (tok.id == .l_paren) { - level += 1; - } - if (tok.id == .r_paren) { - level -= 1; - } - if (tok.flags.is_bol) { - tok.flags = .{ .is_bol = false, .space = true }; - } - try pp.macro_arg_tokens.append(pp.gpa, tok); - } - return .{ .start = start, .end = @intCast(pp.macro_arg_tokens.items.len) }; -} - -fn doReadArgs(pp: *Preprocessor, macro: *const Macro) !MacroArgList { - const start: u32 = @intCast(pp.macro_args.items.len); - var end = false; - while 
(!end) { - const in_ellipsis = macro.var_args and (pp.macro_args.items.len - start) + 1 == macro.nargs; - const arg_range = try pp.readMacroArg(&end, in_ellipsis); - try pp.macro_args.append(pp.gpa, arg_range); - } - if (macro.var_args and (pp.macro_args.items.len - start) + 1 == macro.nargs) { - try pp.macro_args.append(pp.gpa, MacroArg.empty); - } - return .{ .start = start, .end = @intCast(pp.macro_args.items.len) }; -} - -fn readArgs(pp: *Preprocessor, ident: PreprocessorToken, macro: *const Macro) !MacroArgList { - if (macro.nargs == 0 and (try pp.peekToken()).id == .r_paren) { - return MacroArgList.empty; - } - const args = try pp.doReadArgs(macro); - if (args.len() != macro.nargs) { - const extra = Diagnostics.Message.Extra{ - .arguments = .{ .expected = @intCast(macro.nargs), .actual = @intCast(args.len()) }, - }; - try pp.comp.addDiagnostic( - .{ .tag = .expected_arguments, .loc = ident.loc, .extra = extra }, - &.{}, // TODO: expansion slice - ); - return error.IncorrectArgumentCount; - } - return args; -} - -fn readExpand(pp: *Preprocessor) Error!PreprocessorToken { - while (true) { - const tok = try pp.readExpandNewline(); - if (tok.id != .nl) return tok; - } -} - -/// # number "file" flags -/// TODO: validate that the pp_num token is solely digits -/// if not, emit `GNU line marker directive requires a simple digit sequence` -fn readLinemarker(pp: *Preprocessor) !void { - const name = pp.getToken(); - if (name.id.isDirectiveEnd()) return; - if (name.id != .string_literal) try pp.errTok(name, .line_invalid_filename); - - const flag_1 = pp.getToken(); - if (flag_1.id.isDirectiveEnd()) return; - const flag_2 = pp.getToken(); - if (flag_2.id.isDirectiveEnd()) return; - const flag_3 = pp.getToken(); - if (flag_3.id.isDirectiveEnd()) return; - const flag_4 = pp.getToken(); - if (flag_4.id.isDirectiveEnd()) return; - try pp.expectNewline(); -} - -fn readIdent(pp: *Preprocessor) !?PreprocessorToken { - const tok = pp.getToken(); - if (!tok.id.isMacroIdentifier()) { - try pp.errTok(tok, .macro_name_must_be_identifier); - return null; - } - return tok; -} - -fn ungetToken(pp: *Preprocessor, tok: PreprocessorToken) !void { - if (tok.id == .eof) return; - if (pp.isBufferEmpty()) { - try pp.expansion_bufs.append(pp.gpa, .{}); - } - try pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].append(pp.gpa, tok); -} - -fn hashHashCheck(pp: *Preprocessor, toks: []const PreprocessorToken) !void { - if (toks.len == 0) return; - if (toks[0].id == .hash_hash) { - return pp.errTok(toks[0], .hash_hash_at_start); - } - if (toks[toks.len - 1].id == .hash_hash) { - return pp.errTok(toks[toks.len - 1], .hash_hash_at_end); - } -} - -fn readObjMacro(pp: *Preprocessor, name: PreprocessorToken) !void { - var body: TokenList = .{}; - errdefer body.deinit(pp.gpa); - - while (true) { - const tok = pp.getToken(); - if (tok.id.isDirectiveEnd()) break; - - try body.append(pp.gpa, tok); - } - try pp.hashHashCheck(body.items); - const macro: Macro = .{ - .tokens = body.items, - .var_args = false, - .loc = undefined, - .kind = .object, - .nargs = undefined, - }; - try pp.defineMacro(name, macro); -} - -/// Defines a new macro and warns if it is a duplicate -fn defineMacro(pp: *Preprocessor, name_tok: PreprocessorToken, macro: Macro) Error!void { - const name_str = pp.tokSlice(name_tok); - const gop = try pp.defines.getOrPut(pp.gpa, name_str); - if (gop.found_existing and !gop.value_ptr.eql(macro, pp)) { - const tag: Diagnostics.Tag = if (gop.value_ptr.kind == .special) .builtin_macro_redefined else 
.macro_redefined; - const start = pp.comp.diagnostics.list.items.len; - try pp.comp.addDiagnostic(.{ - .tag = tag, - .loc = name_tok.loc, - .extra = .{ .str = name_str }, - }, &.{}); - if (gop.value_ptr.kind != .special and pp.comp.diagnostics.list.items.len != start) { - try pp.comp.addDiagnostic(.{ - .tag = .previous_definition, - .loc = gop.value_ptr.loc, - }, &.{}); - } - } - gop.value_ptr.* = macro; -} - -/// Get raw token source string. -/// Returned slice is invalidated when comp.generated_buf is updated. -pub fn tokSlice(pp: *Preprocessor, token: PreprocessorToken) []const u8 { - if (token.id.lexeme()) |some| return some; - const source = pp.comp.getSource(token.loc.id); - var tmp_tokenizer = Tokenizer{ - .buf = source.buf, - .langopts = pp.comp.langopts, - .index = token.loc.byte_offset, - .source = .generated, - }; - const tok = tmp_tokenizer.next(); - return tmp_tokenizer.buf[tok.start..tok.end]; -} - -fn expect(pp: *Preprocessor, expected: Tokenizer.Token.Id, tag: Diagnostics.Tag) !PreprocessorToken { - const tok = pp.getToken(); - if (tok.id != expected) { - try pp.errTok(tok, tag); - } - return tok; -} - -fn makeMacroToken(position: usize, is_vararg: bool) PreprocessorToken { - return .{ - .id = .macro_param, - .hideset = null, - .loc = .{ - .id = .unused, - .byte_offset = @intCast(position), - .line = @intFromBool(is_vararg), - }, - }; -} - -fn next(pp: *Preprocessor, id: Tokenizer.Token.Id) !bool { - const tok = pp.getToken(); - if (tok.id == id) return true; - try pp.ungetToken(tok); - return false; -} - -/// Returns true for vararg function-like macro, false otherwise -fn readFunclikeMacroParams(pp: *Preprocessor, name: PreprocessorToken, l_paren: PreprocessorToken, params: *ParamMap) !bool { - _ = name; - var pos: usize = 0; - while (true) { - var tok = pp.getToken(); - if (tok.id == .r_paren) return false; - if (pos != 0) { - if (tok.id != .comma) { - switch (tok.id) { - .nl, .eof => {}, - else => pp.skipToNl(), - } - try pp.errTok(tok, .expected_comma_param_list); - return error.InvalidMacroDef; - } - tok = pp.getToken(); - } - if (tok.id.isDirectiveEnd()) { - try pp.errTok(tok, .missing_paren_param_list); - return false; - } - if (tok.id == .ellipsis) { - try params.put(pp.gpa, "__VA_ARGS__", makeMacroToken(pos, true)); - pos += 1; - const r_paren = pp.getToken(); - if (r_paren.id != .r_paren) { - try pp.errTok(r_paren, .missing_paren_param_list); - try pp.errTok(l_paren, .to_match_paren); - return error.InvalidMacroDef; - } - return true; - } - if (!tok.id.isMacroIdentifier()) { - try pp.errTok(tok, .invalid_token_param_list); - return error.InvalidMacroDef; - } - const arg = pp.tokSlice(tok); - if (try pp.next(.ellipsis)) { - const r_paren = pp.getToken(); - if (r_paren.id != .r_paren) { - try pp.errTok(r_paren, .missing_paren_param_list); - try pp.errTok(l_paren, .to_match_paren); - pp.skipToNl(); - } - try params.put(pp.gpa, arg, makeMacroToken(pos, true)); - pos += 1; - return true; - } - try params.put(pp.gpa, arg, makeMacroToken(pos, false)); - pos += 1; - } -} - -fn readFunclikeMacroBody(pp: *Preprocessor, params: *const ParamMap) ![]const PreprocessorToken { - var tokens: TokenList = .{}; - errdefer tokens.deinit(pp.gpa); - while (true) { - const tok = pp.getToken(); - if (tok.id.isDirectiveEnd()) { - return tokens.toOwnedSlice(pp.gpa); - } - if (tok.id.isMacroIdentifier()) { - // const subst = params. 
- if (params.get(pp.tokSlice(tok))) |sub| { - var copy = sub; - copy.flags.space = tok.flags.space; - try tokens.append(pp.gpa, copy); - continue; - } - } - try tokens.append(pp.gpa, tok); - } -} - -fn readFuncLikeMacro(pp: *Preprocessor, name: PreprocessorToken, l_paren: PreprocessorToken) Error!void { - var params: ParamMap = .{}; - defer params.deinit(pp.gpa); - const is_vararg = pp.readFunclikeMacroParams(name, l_paren, ¶ms) catch |err| switch (err) { - error.InvalidMacroDef => blk: { - pp.skipToNl(); - break :blk false; - }, - else => |e| return e, - }; - const body = try pp.readFunclikeMacroBody(¶ms); - errdefer pp.gpa.free(body); - try pp.hashHashCheck(body); - const macro: Macro = .{ - .tokens = body, - .var_args = is_vararg, - .loc = undefined, - .kind = .func, - .nargs = params.count(), - }; - try pp.defineMacro(name, macro); -} - -fn readDefine(pp: *Preprocessor) !void { - const name = try pp.readIdent() orelse { - pp.skipToNl(); - return; - }; - const next_tok = pp.getToken(); - if (next_tok.id == .l_paren and !next_tok.flags.space) { - try pp.readFuncLikeMacro(name, next_tok); - return; - } - try pp.ungetToken(next_tok); - try pp.readObjMacro(name); -} - -fn doSkipSpace(pp: *Preprocessor) bool { - const saved_tokenizer = pp.tokenizers.items[pp.tokenizers.items.len - 1]; - const tok = pp.tokenizers.items[pp.tokenizers.items.len - 1].next(); - switch (tok.id) { - .eof => return false, - .whitespace, .comment => return true, - else => { - pp.tokenizers.items[pp.tokenizers.items.len - 1] = saved_tokenizer; - return false; - }, - } -} - -/// Skips spaces including comments. -/// Returns true if at least one space is skipped. -fn skipSpace(pp: *Preprocessor) bool { - if (!pp.doSkipSpace()) { - return false; - } - while (pp.doSkipSpace()) {} - return true; -} - -/// Read the next raw token from the tokenizer stack -fn lexToken(pp: *Preprocessor) PreprocessorToken { - if (pp.skipSpace()) { - return .{ .id = .whitespace, .loc = undefined }; - } - const tok = pp.tokenizers.items[pp.tokenizers.items.len - 1].next(); - return .{ - .id = tok.id, - .flags = .{ - .is_bol = tok.bol, - }, - .loc = .{ - .id = tok.source, - .byte_offset = tok.start, - .line = tok.line, - }, - }; -} - -/// Read the next token without expanding it -fn getToken(pp: *Preprocessor) PreprocessorToken { - if (!pp.isBufferEmpty() and pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items.len > 0) { - return pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].pop(); - } - if (pp.expansion_bufs.items.len > 1) { - return .{ .id = .eof, .loc = undefined }; - } - const bol = pp.tokenizers.items[pp.tokenizers.items.len - 1].bol; - var tok = pp.lexToken(); - while (tok.id == .whitespace) { - tok = pp.lexToken(); - tok.flags.space = true; - } - tok.flags.is_bol = bol; - return tok; -} - -fn readDefinedOp(pp: *Preprocessor) !PreprocessorToken { - var tok = pp.getToken(); - if (tok.id == .l_paren) { - tok = pp.getToken(); - const r_paren = pp.getToken(); - if (r_paren.id != .r_paren) { - try pp.errStr(r_paren, .closing_paren_after, "defined"); - } - } - if (!tok.id.isMacroIdentifier()) { - try pp.errTok(tok, .macro_name_must_be_identifier); - } - const slice = pp.tokSlice(tok); - if (pp.defines.contains(slice)) { - return PreprocessorToken.one; - } - return PreprocessorToken.zero; -} - -fn readIntExprLine(pp: *Preprocessor) !void { - while (true) { - const tok = try pp.readExpandNewline(); - if (tok.id.isDirectiveEnd()) break; - if (tok.id == .keyword_defined) { - const result = try pp.readDefinedOp(); - try 
pp.addToken(result); - } else if (tok.id.isMacroIdentifier()) { - try pp.addToken(PreprocessorToken.zero); - } else { - try pp.addToken(tok); - } - } - try pp.addToken(.{ .id = .eof, .loc = .{} }); -} - -fn readConstexpr(pp: *Preprocessor) !bool { - const start = pp.tokens.len; - defer pp.tokens.len = start; - try pp.readIntExprLine(); - - var oldpp = try OldPreprocessor.initDefault(pp.comp); - defer oldpp.deinit(); - - var i: usize = start; - while (i < pp.tokens.len) : (i += 1) { - const tok = pp.tokens.get(i); - try oldpp.tokens.append(pp.gpa, .{ .id = tok.id, .loc = tok.loc }); - } - - var parser = Parser{ - .pp = &oldpp, - .comp = pp.comp, - .gpa = pp.gpa, - .tok_ids = pp.tokens.items(.id)[start..], - .tok_i = 0, - .arena = undefined, - .in_macro = true, - .strings = std.ArrayListAligned(u8, 4).init(pp.comp.gpa), - - .data = undefined, - .value_map = undefined, - .labels = undefined, - .decl_buf = undefined, - .list_buf = undefined, - .param_buf = undefined, - .enum_buf = undefined, - .record_buf = undefined, - .attr_buf = undefined, - .field_attr_buf = undefined, - .string_ids = undefined, - }; - defer parser.strings.deinit(); - return parser.macroExpr(); -} - -/// #line number "file" -/// TODO: validate that the pp_num token is solely digits -fn readLine(pp: *Preprocessor) Error!void { - const digits = pp.getToken(); - if (digits.id != .pp_num) try pp.errTok(digits, .line_simple_digit); - - if (digits.id.isDirectiveEnd()) return; - const name = pp.getToken(); - if (name.id.isDirectiveEnd()) return; - if (name.id != .string_literal) try pp.errTok(name, .line_invalid_filename); - try pp.expectNewline(); -} - -fn readPragma(pp: *Preprocessor) Error!void { - _ = pp; - // TODO -} - -fn readUndef(pp: *Preprocessor) Error!void { - const name = try pp.readIdent() orelse { - pp.skipToNl(); - return; - }; - try pp.expectNewline(); - _ = pp.defines.remove(pp.tokSlice(name)); -} - -/// Skip until after a newline, error if extra tokens before it. -fn expectNewline(pp: *Preprocessor) !void { - var sent_err = false; - while (true) { - const tok = pp.getToken(); - if (tok.id.isDirectiveEnd()) return; - if (tok.id == .whitespace or tok.id == .comment) continue; - if (!sent_err) { - sent_err = true; - try pp.errTok(tok, .extra_tokens_directive_end); - } - } -} - -/// TODO: pragma once -fn readIncludeExtra(pp: *Preprocessor, include_token: PreprocessorToken, which: Compilation.WhichInclude) Error!void { - var is_std: bool = undefined; - const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { - error.InvalidInclude => return, - else => |e| return e, - }; - try pp.expectNewline(); - - const filename = include_str[1 .. 
include_str.len - 1]; - const include_type: Compilation.IncludeType = switch (include_str[0]) { - '"' => .quotes, - '<' => .angle_brackets, - else => unreachable, - }; - const tok: RawToken = .{ .id = include_token.id, .source = include_token.loc.id, .start = include_token.loc.byte_offset, .line = include_token.loc.line }; - const source = (try pp.comp.findInclude(filename, tok, include_type, which)) orelse return pp.fatalNotFound(include_token, filename); - if (pp.include_guards.get(source.id)) |guard| { - if (pp.defines.contains(guard)) return; - } - const guard = pp.findIncludeGuard(source); - try pp.guard_stack.append(pp.gpa, guard); - - try pp.tokenizers.append(pp.gpa, .{ - .buf = source.buf, - .langopts = pp.comp.langopts, - .index = 0, - .source = source.id, - }); -} - -/// Read a header name delimited by quotes or angle brackets -fn readHeaderFileName(pp: *Preprocessor, is_std: *bool) !?[]const u8 { - if (!pp.isBufferEmpty()) return null; - _ = pp.skipSpace(); - - var close: u8 = undefined; - var tokenizer = pp.tokenizers.items[pp.tokenizers.items.len - 1]; - defer pp.tokenizers.items[pp.tokenizers.items.len - 1] = tokenizer; - - if (tokenizer.buf[tokenizer.index..].len < 2) { - return null; - } - const start = tokenizer.index; - switch (tokenizer.buf[tokenizer.index..][0]) { - '"' => { - is_std.* = false; - close = '"'; - }, - '<' => { - is_std.* = true; - close = '>'; - }, - else => return null, - } - tokenizer.index += 1; - while (tokenizer.index < tokenizer.buf.len and tokenizer.buf[tokenizer.index] != close and tokenizer.buf[tokenizer.index] != '\n') : (tokenizer.index += 1) {} - - if (tokenizer.index == tokenizer.buf.len or tokenizer.buf[tokenizer.index] != close) { - try pp.errTok(.{ .id = undefined, .loc = .{ .id = tokenizer.source, .byte_offset = tokenizer.index, .line = tokenizer.line } }, .header_str_closing); - try pp.errTok(.{ .id = undefined, .loc = .{ .id = tokenizer.source, .byte_offset = start, .line = tokenizer.line } }, .header_str_match); - return error.InvalidInclude; - } - - tokenizer.index += 1; - - const buf = tokenizer.buf[start..tokenizer.index]; - if (buf.len == 2) { - try pp.errTok(.{ .id = .nl, .loc = .{ .id = tokenizer.source, .byte_offset = start, .line = tokenizer.line } }, .empty_filename); - return error.InvalidInclude; - } - return buf; -} - -fn isBufferEmpty(pp: *const Preprocessor) bool { - return pp.expansion_bufs.items.len == 0; -} - -/// Read a delimited header name, or a macro expanded one -fn readHeaderName(pp: *Preprocessor, is_std: *bool) ![]const u8 { - if (try pp.readHeaderFileName(is_std)) |path| return path; - - // If a token following #include does not start with < nor ", - // try to read the token as a regular token. Macro-expanded - // form may be a valid header file path. 
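- // For example, a computed include such as
- //   #define HDR "config.h"
- //   #include HDR
- // takes this path: `HDR` is macro-expanded and the resulting string literal
- // (or `<` ... `>` token sequence) is used as the header path. The name
- // "config.h" here is purely illustrative.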
- const tok = try pp.readExpandNewline(); - if (tok.id.isDirectiveEnd()) { - try pp.errTok(tok, .expected_filename); - return error.InvalidInclude; - } - if (tok.id == .string_literal) { - is_std.* = false; - return pp.tokSlice(tok); - } - if (tok.id != .angle_bracket_left) { - try pp.errStr(tok, .expected_left_angle_bracket, pp.tokSlice(tok)); - return error.InvalidInclude; - } - const start = pp.char_buf.items.len; - try pp.char_buf.append(pp.gpa, '<'); - defer pp.char_buf.items.len = start; - const writer = pp.char_buf.writer(pp.gpa); - while (true) { - const path_tok = try pp.readExpandNewline(); - if (path_tok.id == .nl) { - try pp.errTok(path_tok, .header_str_closing); - try pp.errTok(tok, .header_str_match); - return error.InvalidInclude; - } - if (path_tok.id == .angle_bracket_right) { - break; - } - try pp.prettyPrintToken(writer, path_tok); - } - is_std.* = true; - try pp.char_buf.append(pp.gpa, '>'); - return pp.gpa.dupe(u8, pp.char_buf.items[start..]); -} - -fn readInclude(pp: *Preprocessor, include_token: PreprocessorToken) Error!void { - return pp.readIncludeExtra(include_token, .first); -} - -fn readIncludeNext(pp: *Preprocessor, include_token: PreprocessorToken) Error!void { - return pp.readIncludeExtra(include_token, .next); -} - -fn readErrorMessage(pp: *Preprocessor, directive_tok: PreprocessorToken, tag: Diagnostics.Tag) !void { - const char_top = pp.char_buf.items.len; - defer pp.char_buf.items.len = char_top; - var i: usize = 0; - while (true) : (i += 1) { - const tok = pp.getToken(); - if (tok.id.isDirectiveEnd()) break; - const slice = pp.tokSlice(tok); - if (slice.len > 0 and tok.flags.space and i != 0) { - try pp.char_buf.append(pp.gpa, ' '); - } - try pp.char_buf.appendSlice(pp.gpa, slice); - } - const slice = pp.char_buf.items[char_top..]; - const duped = try pp.comp.diagnostics.arena.allocator().dupe(u8, slice); - try pp.comp.addDiagnostic(.{ - .tag = tag, - .loc = directive_tok.loc, - .extra = .{ .str = duped }, - }, &.{}); -} - -fn clearGuard(pp: *Preprocessor) void { - pp.guard_stack.items[pp.guard_stack.items.len - 1] = null; -} - -fn readDirective(pp: *Preprocessor) Error!void { - const directive = pp.getToken(); - if (directive.id.isDirectiveEnd()) return; - if (directive.id == .pp_num) { - return pp.readLinemarker(); - } - - const until_else = 0; - const until_endif = 1; - const until_endif_seen_else = 2; - - switch (directive.id) { - .keyword_define => try pp.readDefine(), - .keyword_elif => { - if (pp.if_level == 0) { - try pp.errTok(directive, .elif_without_if); - pp.if_level += 1; - pp.if_kind.set(pp.if_level, until_else); - } else if (pp.if_level == 1) { - pp.clearGuard(); - } - switch (pp.if_kind.get(pp.if_level)) { - until_else => if (try pp.readConstexpr()) { - pp.if_kind.set(pp.if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #elif", .{}); - } - } else { - try pp.skip(.until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elif", .{}); - } - }, - until_endif => try pp.skip(.until_endif), - until_endif_seen_else => { - try pp.errTok(directive, .elif_after_else); - pp.skipToNl(); - }, - else => unreachable, - } - }, - .keyword_else => { - try pp.expectNewline(); - if (pp.if_level == 0) { - try pp.errTok(directive, .else_without_if); - return; - } else if (pp.if_level == 1) { - pp.clearGuard(); - } - switch (pp.if_kind.get(pp.if_level)) { - until_else => { - pp.if_kind.set(pp.if_level, until_endif_seen_else); - if (pp.verbose) { - pp.verboseLog(directive, "#else branch 
here", .{}); - } - }, - until_endif => try pp.skip(.until_endif_seen_else), - until_endif_seen_else => { - try pp.errTok(directive, .else_after_else); - pp.skipToNl(); - }, - else => unreachable, - } - }, - .keyword_endif => { - try pp.expectNewline(); - if (pp.if_level == 0) { - pp.clearGuard(); - try pp.errTok(directive, .endif_without_if); - return; - } else if (pp.if_level == 1) { - var tokenizer = &pp.tokenizers.items[pp.tokenizers.items.len - 1]; - const saved_tokenizer = tokenizer.*; - defer tokenizer.* = saved_tokenizer; - - var next_tok = tokenizer.nextNoWS(); - while (next_tok.id == .nl) : (next_tok = tokenizer.nextNoWS()) {} - if (next_tok.id != .eof) pp.clearGuard(); - } - pp.if_level -= 1; - }, - .keyword_error => try pp.readErrorMessage(directive, .error_directive), - .keyword_if => { - const sum, const overflowed = @addWithOverflow(pp.if_level, 1); - if (overflowed != 0) - return pp.fatal(directive, "too many #if nestings", .{}); - pp.if_level = sum; - - if (try pp.readConstexpr()) { - pp.if_kind.set(pp.if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #if", .{}); - } - } else { - pp.if_kind.set(pp.if_level, until_else); - try pp.skip(.until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #if", .{}); - } - } - }, - .keyword_ifdef => { - const sum, const overflowed = @addWithOverflow(pp.if_level, 1); - if (overflowed != 0) - return pp.fatal(directive, "too many #if nestings", .{}); - pp.if_level = sum; - - const macro_name = (try pp.expectMacroName()) orelse return; - try pp.expectNewline(); - if (pp.defines.get(macro_name) != null) { - pp.if_kind.set(pp.if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #ifdef", .{}); - } - } else { - pp.if_kind.set(pp.if_level, until_else); - try pp.skip(.until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #ifdef", .{}); - } - } - }, - .keyword_ifndef => { - const sum, const overflowed = @addWithOverflow(pp.if_level, 1); - if (overflowed != 0) - return pp.fatal(directive, "too many #if nestings", .{}); - pp.if_level = sum; - - const macro_name = (try pp.expectMacroName()) orelse return; - try pp.expectNewline(); - if (pp.defines.get(macro_name) == null) { - pp.if_kind.set(pp.if_level, until_endif); - } else { - pp.if_kind.set(pp.if_level, until_else); - try pp.skip(.until_else); - } - }, - .keyword_elifdef => { - if (pp.if_level == 0) { - try pp.errTok(directive, .elifdef_without_if); - pp.if_level += 1; - pp.if_kind.set(pp.if_level, until_else); - } else if (pp.if_level == 1) { - pp.clearGuard(); - } - switch (pp.if_kind.get(pp.if_level)) { - until_else => { - const macro_name = try pp.expectMacroName(); - if (macro_name == null) { - pp.if_kind.set(pp.if_level, until_else); - try pp.skip(.until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elifdef", .{}); - } - } else { - try pp.expectNewline(); - if (pp.defines.get(macro_name.?) 
!= null) { - pp.if_kind.set(pp.if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #elifdef", .{}); - } - } else { - pp.if_kind.set(pp.if_level, until_else); - try pp.skip(.until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elifdef", .{}); - } - } - } - }, - until_endif => try pp.skip(.until_endif), - until_endif_seen_else => { - try pp.errTok(directive, .elifdef_after_else); - pp.skipToNl(); - }, - else => unreachable, - } - }, - .keyword_elifndef => { - if (pp.if_level == 0) { - try pp.errTok(directive, .elifdef_without_if); - pp.if_level += 1; - pp.if_kind.set(pp.if_level, until_else); - } else if (pp.if_level == 1) { - pp.clearGuard(); - } - switch (pp.if_kind.get(pp.if_level)) { - until_else => { - const macro_name = try pp.expectMacroName(); - if (macro_name == null) { - pp.if_kind.set(pp.if_level, until_else); - try pp.skip(.until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elifndef", .{}); - } - } else { - try pp.expectNewline(); - if (pp.defines.get(macro_name.?) == null) { - pp.if_kind.set(pp.if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #elifndef", .{}); - } - } else { - pp.if_kind.set(pp.if_level, until_else); - try pp.skip(.until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elifndef", .{}); - } - } - } - }, - until_endif => try pp.skip(.until_endif), - until_endif_seen_else => { - try pp.errTok(directive, .elifdef_after_else); - pp.skipToNl(); - }, - else => unreachable, - } - }, - .keyword_include => try pp.readInclude(directive), - .keyword_include_next => try pp.readIncludeNext(directive), - .keyword_line => try pp.readLine(), - .keyword_pragma => try pp.readPragma(), - .keyword_undef => try pp.readUndef(), - .keyword_warning => try pp.readErrorMessage(directive, .warning_directive), - .keyword_embed => try pp.readEmbed(directive), - else => try pp.errTok(directive, .invalid_preprocessing_directive), - } -} - -/// TODO: handle limit/prefix/suffix/etc -fn readEmbed(pp: *Preprocessor, directive_tok: PreprocessorToken) Error!void { - var is_std: bool = undefined; - const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { - error.InvalidInclude => return, - else => |e| return e, - }; - - const filename = include_str[1 .. 
include_str.len - 1]; - const include_type: Compilation.IncludeType = switch (include_str[0]) { - '"' => .quotes, - '<' => .angle_brackets, - else => unreachable, - }; - - const limit = std.math.maxInt(u32); - const embed_bytes = (try pp.comp.findEmbed(filename, directive_tok.loc.id, include_type, limit)) orelse - return pp.fatalNotFound(directive_tok, filename); - defer pp.comp.gpa.free(embed_bytes); - - try pp.ensureUnusedTokenCapacity(2 * embed_bytes.len - 1); // N bytes and N-1 commas - - // TODO: We currently only support systems with CHAR_BIT == 8 - // If the target's CHAR_BIT is not 8, we need to write out correctly-sized embed_bytes - // and correctly account for the target's endianness - const writer = pp.comp.generated_buf.writer(pp.gpa); - - { - const byte = embed_bytes[0]; - const start = pp.comp.generated_buf.items.len; - try writer.print("{d}", .{byte}); - var generated = try pp.makeGeneratedToken(start, .embed_byte, directive_tok); - generated.flags.is_bol = true; - pp.addTokenAssumeCapacity(generated); - } - - for (embed_bytes[1..]) |byte| { - const start = pp.comp.generated_buf.items.len; - try writer.print(",{d}", .{byte}); - pp.addTokenAssumeCapacity(.{ .id = .comma, .loc = .{ .id = .generated, .byte_offset = @intCast(start) } }); - pp.addTokenAssumeCapacity(try pp.makeGeneratedToken(start + 1, .embed_byte, directive_tok)); - } - try pp.comp.generated_buf.append(pp.gpa, '\n'); -} - -fn readToken(pp: *Preprocessor) Error!PreprocessorToken { - while (true) { - const tok = try pp.readExpand(); - if (tok.flags.is_bol and tok.id == .hash and tok.hideset == null) { - try pp.readDirective(); - continue; - } - return tok; - } -} - -pub fn preprocess(pp: *Preprocessor, source: Source) !PreprocessorToken { - const guard = pp.findIncludeGuard(source); - try pp.guard_stack.append(pp.gpa, guard); - - try pp.tokenizers.append(pp.gpa, .{ - .buf = source.buf, - .langopts = pp.comp.langopts, - .index = 0, - .source = source.id, - }); - while (true) { - const tok = try pp.readToken(); - if (tok.id == .eof) { - const tokenizer = pp.tokenizers.pop(); - const guard_name = pp.guard_stack.pop(); - if (guard_name) |name| { - try pp.include_guards.put(pp.gpa, tokenizer.source, name); - } - if (pp.tokenizers.items.len == 0) { - return tok; - } - } else { - try pp.addToken(tok); - } - } -} - -// After how many empty lines are needed to replace them with linemarkers. -const collapse_newlines = 8; - -/// Pretty print tokens and try to preserve whitespace. 
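-/// A newline is emitted before any token flagged as starting a line, and a
-/// single space before any token flagged as having leading whitespace, so the
-/// output roughly follows the layout of the original source.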
-pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void { - var i: usize = 0; - while (i < pp.tokens.len) : (i += 1) { - const tok = pp.tokens.get(i); - if (tok.id == .eof) break; - try pp.prettyPrintToken(w, tok); - } - try w.writeByte('\n'); -} - -fn prettyPrintToken(pp: *Preprocessor, w: anytype, tok: PreprocessorToken) !void { - if (tok.flags.is_bol) { - try w.writeByte('\n'); - } - if (tok.flags.space) { - try w.writeByte(' '); - } - if (tok.id.lexeme()) |some| { - try w.writeAll(some); - } else { - try w.writeAll(pp.tokSlice(tok)); - } -} - -pub fn expansionSlice(pp: *Preprocessor, tok: Tree.TokenIndex) []Source.Location { - const S = struct { - fn order_token_index(context: void, lhs: Tree.TokenIndex, rhs: Tree.TokenIndex) std.math.Order { - _ = context; - return std.math.order(lhs, rhs); - } - }; - - const indices = pp.expansion_entries.items(.idx); - const idx = std.sort.binarySearch(Tree.TokenIndex, tok, indices, {}, S.order_token_index) orelse return &.{}; - const locs = pp.expansion_entries.items(.locs)[idx]; - var i: usize = 0; - while (locs[i].id != .unused) : (i += 1) {} - return locs[0..i]; -} - -pub fn addToken(pp: *Preprocessor, tok: PreprocessorToken) !void { - if (tok.expansion_locs) |expansion_locs| { - try pp.expansion_entries.append(pp.gpa, .{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs }); - } - try pp.tokens.append(pp.gpa, tok); -} - -pub fn addTokenAssumeCapacity(pp: *Preprocessor, tok: PreprocessorToken) void { - if (tok.expansion_locs) |expansion_locs| { - pp.expansion_entries.appendAssumeCapacity(.{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs }); - } - pp.tokens.appendAssumeCapacity(tok); -} - -pub fn ensureTotalTokenCapacity(pp: *Preprocessor, capacity: usize) !void { - try pp.tokens.ensureTotalCapacity(pp.gpa, capacity); - try pp.expansion_entries.ensureTotalCapacity(pp.gpa, capacity); -} - -pub fn ensureUnusedTokenCapacity(pp: *Preprocessor, capacity: usize) !void { - try pp.tokens.ensureUnusedCapacity(pp.gpa, capacity); - try pp.expansion_entries.ensureUnusedCapacity(pp.gpa, capacity); -} - -fn skip( - pp: *Preprocessor, - cont: enum { until_else, until_endif, until_endif_seen_else }, -) Error!void { - var ifs_seen: u32 = 0; - var line_start = true; - var tokenizer = &pp.tokenizers.items[pp.tokenizers.items.len - 1]; - - while (tokenizer.index < tokenizer.buf.len) { - if (line_start) { - const saved_tokenizer = tokenizer.*; - const hash = tokenizer.nextNoWS(); - if (hash.id == .nl) continue; - line_start = false; - if (hash.id != .hash) continue; - const directive = tokenizer.nextNoWS(); - switch (directive.id) { - .keyword_else => { - if (ifs_seen != 0) continue; - if (cont == .until_endif_seen_else) { - // try pp.err(directive, .else_after_else); - continue; - } - tokenizer.* = saved_tokenizer; - return; - }, - .keyword_elif => { - if (ifs_seen != 0 or cont == .until_endif) continue; - if (cont == .until_endif_seen_else) { - // try pp.err(directive, .elif_after_else); - continue; - } - tokenizer.* = saved_tokenizer; - return; - }, - .keyword_elifdef => { - if (ifs_seen != 0 or cont == .until_endif) continue; - if (cont == .until_endif_seen_else) { - // try pp.err(directive, .elifdef_after_else); - continue; - } - tokenizer.* = saved_tokenizer; - return; - }, - .keyword_elifndef => { - if (ifs_seen != 0 or cont == .until_endif) continue; - if (cont == .until_endif_seen_else) { - // try pp.err(directive, .elifndef_after_else); - continue; - } - tokenizer.* = saved_tokenizer; - return; - }, - .keyword_endif => { - if (ifs_seen == 
0) { - tokenizer.* = saved_tokenizer; - return; - } - ifs_seen -= 1; - }, - .keyword_if, .keyword_ifdef, .keyword_ifndef => ifs_seen += 1, - else => {}, - } - } else if (tokenizer.buf[tokenizer.index] == '\n') { - line_start = true; - tokenizer.index += 1; - tokenizer.line += 1; - tokenizer.bol = true; - if (pp.preserve_whitespace) { - try pp.addToken(.{ .id = .nl, .loc = .{ - .id = tokenizer.source, - .line = tokenizer.line, - } }); - } - } else { - line_start = false; - tokenizer.index += 1; - } - } else { - return pp.errTok(.{ .id = .eof, .loc = .{ .id = tokenizer.source, .byte_offset = tokenizer.index, .line = tokenizer.line } }, .unterminated_conditional_directive); - } -} - -fn verboseLog(pp: *Preprocessor, tok: PreprocessorToken, comptime fmt: []const u8, args: anytype) void { - const source = pp.comp.getSource(tok.loc.id); - const line_col = source.lineCol(tok.loc); - - const stderr = std.io.getStdErr().writer(); - var buf_writer = std.io.bufferedWriter(stderr); - const writer = buf_writer.writer(); - defer buf_writer.flush() catch {}; - writer.print("{s}:{d}:{d}: ", .{ source.path, line_col.line_no, line_col.col }) catch return; - writer.print(fmt, args) catch return; - writer.writeByte('\n') catch return; - writer.writeAll(line_col.line) catch return; - writer.writeByte('\n') catch return; -} - -fn fatal(pp: *Preprocessor, tok: PreprocessorToken, comptime fmt: []const u8, args: anytype) Compilation.Error { - try pp.comp.diagnostics.list.append(pp.gpa, .{ - .tag = .cli_error, - .kind = .@"fatal error", - .extra = .{ .str = try std.fmt.allocPrint(pp.comp.diagnostics.arena.allocator(), fmt, args) }, - .loc = tok.loc, - }); - return error.FatalError; -} - -fn fatalNotFound(pp: *Preprocessor, tok: PreprocessorToken, filename: []const u8) Compilation.Error { - const old = pp.comp.diagnostics.fatal_errors; - pp.comp.diagnostics.fatal_errors = true; - defer pp.comp.diagnostics.fatal_errors = old; - - try pp.comp.diagnostics.addExtra(pp.comp.langopts, .{ .tag = .cli_error, .loc = tok.loc, .extra = .{ - .str = try std.fmt.allocPrint(pp.comp.diagnostics.arena.allocator(), "'{s}' not found", .{filename}), - } }, tok.expansionSlice(), false); - unreachable; // addExtra should've returned FatalError -} - -/// Consume next token, error if it is not an identifier. -fn expectMacroName(pp: *Preprocessor) Error!?[]const u8 { - const macro_name = pp.getToken(); - if (!macro_name.id.isMacroIdentifier()) { - try pp.errTok(macro_name, .macro_name_missing); - pp.skipToNl(); - return null; - } - return pp.tokSlice(macro_name); -} - -/// Return the name of the #ifndef guard macro that starts a source, if any. 
-/// If a source starts with `#ifndef IDENTIFIER`, return `IDENTIFIER` -/// This function does not validate that the entire source is guarded by the -/// initial ifndef, if any -fn findIncludeGuard(pp: *Preprocessor, source: Source) ?[]const u8 { - var tokenizer = Tokenizer{ - .buf = source.buf, - .langopts = pp.comp.langopts, - .source = source.id, - }; - var hash = tokenizer.nextNoWS(); - while (hash.id == .nl) hash = tokenizer.nextNoWS(); - if (hash.id != .hash) return null; - const ifndef = tokenizer.nextNoWS(); - if (ifndef.id != .keyword_ifndef) return null; - const guard = tokenizer.nextNoWS(); - if (guard.id != .identifier) return null; - return pp.tokSlice(.{ .id = guard.id, .loc = .{ .id = guard.source, .byte_offset = guard.start, .line = guard.line } }); -} diff --git a/src/aro/Parser.zig b/src/aro/Parser.zig index 270d0a33..17c43e25 100644 --- a/src/aro/Parser.zig +++ b/src/aro/Parser.zig @@ -7097,6 +7097,10 @@ fn unExpr(p: *Parser) Error!Result { return operand; }, .plus_plus => { + if (p.in_macro) { + try p.err(.invalid_preproc_operator); + return error.ParsingFailed; + } p.tok_i += 1; var operand = try p.castExpr(); @@ -7123,6 +7127,10 @@ fn unExpr(p: *Parser) Error!Result { return operand; }, .minus_minus => { + if (p.in_macro) { + try p.err(.invalid_preproc_operator); + return error.ParsingFailed; + } p.tok_i += 1; var operand = try p.castExpr(); @@ -7423,6 +7431,10 @@ fn suffixExpr(p: *Parser, lhs: Result) Error!Result { switch (p.tok_ids[p.tok_i]) { .l_paren => return p.callExpr(lhs), .plus_plus => { + if (p.in_macro) { + try p.err(.invalid_preproc_operator); + return error.ParsingFailed; + } defer p.tok_i += 1; var operand = lhs; @@ -7441,6 +7453,10 @@ fn suffixExpr(p: *Parser, lhs: Result) Error!Result { return operand; }, .minus_minus => { + if (p.in_macro) { + try p.err(.invalid_preproc_operator); + return error.ParsingFailed; + } defer p.tok_i += 1; var operand = lhs; @@ -7459,6 +7475,10 @@ fn suffixExpr(p: *Parser, lhs: Result) Error!Result { return operand; }, .l_bracket => { + if (p.in_macro) { + try p.err(.invalid_preproc_operator); + return error.ParsingFailed; + } const l_bracket = p.tok_i; p.tok_i += 1; var index = try p.expr(); @@ -7495,11 +7515,19 @@ fn suffixExpr(p: *Parser, lhs: Result) Error!Result { return ptr; }, .period => { + if (p.in_macro) { + try p.err(.invalid_preproc_operator); + return error.ParsingFailed; + } p.tok_i += 1; const name = try p.expectIdentifier(); return p.fieldAccess(lhs, name, false); }, .arrow => { + if (p.in_macro) { + try p.err(.invalid_preproc_operator); + return error.ParsingFailed; + } p.tok_i += 1; const name = try p.expectIdentifier(); if (lhs.ty.isArray()) { @@ -8039,6 +8067,11 @@ fn makePredefinedIdentifier(p: *Parser, strings_top: usize) !Result { } fn stringLiteral(p: *Parser) Error!Result { + if (p.in_macro) { + try p.err(.invalid_preproc_expr_start); + return error.ParsingFailed; + } + var string_end = p.tok_i; var string_kind: text_literal.Kind = .char; while (text_literal.Kind.classify(p.tok_ids[string_end], .string_literal)) |next| : (string_end += 1) { diff --git a/src/aro/Pragma.zig b/src/aro/Pragma.zig index 279ac5f0..3f698c31 100644 --- a/src/aro/Pragma.zig +++ b/src/aro/Pragma.zig @@ -57,8 +57,8 @@ pub fn pasteTokens(pp: *Preprocessor, start_idx: TokenIndex) ![]const u8 { .r_paren => rparen_count += 1, .string_literal => { if (rparen_count != 0) return error.ExpectedStringLiteral; - const str = pp.expandedSlice(tok); - try pp.char_buf.appendSlice(str[1 .. 
str.len - 1]); + const str = pp.tokSlice(tok); + try pp.char_buf.appendSlice(pp.gpa, str[1 .. str.len - 1]); }, else => return error.ExpectedStringLiteral, } diff --git a/src/aro/Preprocessor.zig b/src/aro/Preprocessor.zig index 3fd98882..33df74c8 100644 --- a/src/aro/Preprocessor.zig +++ b/src/aro/Preprocessor.zig @@ -14,10 +14,13 @@ const Token = Tree.Token; const TokenWithExpansionLocs = Tree.TokenWithExpansionLocs; const Attribute = @import("Attribute.zig"); const features = @import("features.zig"); -const Hideset = @import("Hideset.zig"); +const OldPreprocessor = @import("Preprocessor.zig"); +const Treap = @import("Treap.zig"); +const ParamMap = std.StringHashMapUnmanaged(PreprocessorToken); const DefineMap = std.StringHashMapUnmanaged(Macro); -const RawTokenList = std.ArrayList(RawToken); + +const TokenList = std.ArrayListUnmanaged(PreprocessorToken); const max_include_depth = 200; /// Errors that can be returned when expanding a macro. @@ -25,40 +28,72 @@ const max_include_depth = 200; /// it is handled there and doesn't escape that function const MacroError = Error || error{StopPreprocessing}; +const PreprocessingError = Error || error{PreprocessingFailed}; + +const SpecialMacroFn = fn (*Preprocessor, PreprocessorToken) Error!void; + +fn Range(comptime T: type) type { + return struct { + const Self = @This(); + const Item = T; + + start: u32, + end: u32, + const empty: Self = .{ .start = 0, .end = 0 }; + + fn len(self: Self) u32 { + return self.end - self.start; + } + + fn slice(self: Self, items: []const Item) []const Item { + return items[self.start..self.end]; + } + }; +} + +/// Each macro argument is a list of tokens (represented as a range of Preprocessor.macro_arg_tokens) +const MacroArg = Range(PreprocessorToken); + +/// List of MacroArg's for a macro invocation (represented as a range of Preprocessor.macro_args) +const MacroArgList = Range(MacroArg); + +const PreprocessorToken = TokenWithExpansionLocs; + const Macro = struct { - /// Parameters of the function type macro - params: []const []const u8, + /// Tokens constituting the macro body + tokens: []const PreprocessorToken, - /// Token constituting the macro body - tokens: []const RawToken, + /// Number of arguments for function-like macros + nargs: usize, /// If the function type macro has variable number of arguments var_args: bool, - /// Is a function type macro - is_func: bool, - - /// Is a predefined macro - is_builtin: bool = false, - /// Location of macro in the source loc: Source.Location, + kind: Kind, + + const Kind = union(enum) { + object, + func, + special: *const SpecialMacroFn, + }; + fn eql(a: Macro, b: Macro, pp: *Preprocessor) bool { + if ((a.kind == .object and b.kind != .object) or (a.kind == .func and b.kind != .func)) return false; + if (!std.meta.eql(a.kind, b.kind)) return false; if (a.tokens.len != b.tokens.len) return false; - if (a.is_builtin != b.is_builtin) return false; for (a.tokens, b.tokens) |a_tok, b_tok| if (!tokEql(pp, a_tok, b_tok)) return false; - if (a.is_func and b.is_func) { + if (a.kind == .func) { if (a.var_args != b.var_args) return false; - if (a.params.len != b.params.len) return false; - for (a.params, b.params) |a_param, b_param| if (!mem.eql(u8, a_param, b_param)) return false; } return true; } - fn tokEql(pp: *Preprocessor, a: RawToken, b: RawToken) bool { + fn tokEql(pp: *Preprocessor, a: PreprocessorToken, b: PreprocessorToken) bool { return mem.eql(u8, pp.tokSlice(a), pp.tokSlice(b)); } }; @@ -78,27 +113,15 @@ const TokenState = struct { comp: *Compilation, gpa: 
mem.Allocator, arena: std.heap.ArenaAllocator, -defines: DefineMap = .{}, -/// Do not directly mutate this; use addToken / addTokenAssumeCapacity / ensureTotalTokenCapacity / ensureUnusedTokenCapacity -tokens: Token.List = .{}, + +tokens: std.MultiArrayList(Token) = .{}, /// Do not directly mutate this; must be kept in sync with `tokens` expansion_entries: std.MultiArrayList(ExpansionEntry) = .{}, -token_buf: RawTokenList, -char_buf: std.ArrayList(u8), -/// Counter that is incremented each time preprocess() is called -/// Can be used to distinguish multiple preprocessings of the same file -preprocess_count: u32 = 0, -generated_line: u32 = 1, -add_expansion_nl: u32 = 0, -include_depth: u8 = 0, -counter: u32 = 0, -expansion_source_loc: Source.Location = undefined, -poisoned_identifiers: std.StringHashMap(void), + /// Map from Source.Id to macro name in the `#ifndef` condition which guards the source, if any include_guards: std.AutoHashMapUnmanaged(Source.Id, []const u8) = .{}, -/// Memory is retained to avoid allocation on every single token. -top_expansion_buf: ExpandBuf, +char_buf: std.ArrayListUnmanaged(u8) = .{}, /// Dump current state to stderr. verbose: bool = false, @@ -107,7 +130,35 @@ preserve_whitespace: bool = false, /// linemarker tokens. Must be .none unless in -E mode (parser does not handle linemarkers) linemarkers: Linemarkers = .none, -hideset: Hideset, +tokenizers: std.ArrayListUnmanaged(Tokenizer) = .{}, + +expansion_bufs: std.ArrayListUnmanaged(TokenList) = .{}, + +defines: DefineMap = .{}, + +generated_line: u32 = 1, + +counter: u32 = 0, + +if_level: u8 = 0, + +preprocess_count: u32 = 0, + +poisoned_identifiers: std.StringHashMap(void), + +if_kind: std.PackedIntArray(u2, 256) = blk: { + @setEvalBranchQuota(2000); + break :blk std.PackedIntArray(u2, 256).initAllTo(0); +}, + +guard_stack: std.ArrayListUnmanaged(?[]const u8) = .{}, + +macro_arg_tokens: std.ArrayListUnmanaged(MacroArg.Item) = .{}, +macro_args: std.ArrayListUnmanaged(MacroArgList.Item) = .{}, + +safe_strings: std.StringHashMapUnmanaged(void) = .{}, + +treap: Treap, pub const parse = Parser.parse; @@ -125,2854 +176,1578 @@ pub fn init(comp: *Compilation) Preprocessor { .comp = comp, .gpa = comp.gpa, .arena = std.heap.ArenaAllocator.init(comp.gpa), - .token_buf = RawTokenList.init(comp.gpa), - .char_buf = std.ArrayList(u8).init(comp.gpa), .poisoned_identifiers = std.StringHashMap(void).init(comp.gpa), - .top_expansion_buf = ExpandBuf.init(comp.gpa), - .hideset = .{ .comp = comp }, + .treap = Treap.init(comp.gpa), }; comp.pragmaEvent(.before_preprocess); return pp; } -/// Initialize Preprocessor with builtin macros. 
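-/// Callers are expected to pair this with `deinit`, e.g.
-///     var pp = try Preprocessor.initDefault(comp);
-///     defer pp.deinit();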
-pub fn initDefault(comp: *Compilation) !Preprocessor { - var pp = init(comp); - errdefer pp.deinit(); - try pp.addBuiltinMacros(); - return pp; -} - -const builtin_macros = struct { - const args = [1][]const u8{"X"}; - - const has_attribute = [1]RawToken{.{ - .id = .macro_param_has_attribute, - .source = .generated, - }}; - const has_c_attribute = [1]RawToken{.{ - .id = .macro_param_has_c_attribute, - .source = .generated, - }}; - const has_declspec_attribute = [1]RawToken{.{ - .id = .macro_param_has_declspec_attribute, - .source = .generated, - }}; - const has_warning = [1]RawToken{.{ - .id = .macro_param_has_warning, - .source = .generated, - }}; - const has_feature = [1]RawToken{.{ - .id = .macro_param_has_feature, - .source = .generated, - }}; - const has_extension = [1]RawToken{.{ - .id = .macro_param_has_extension, - .source = .generated, - }}; - const has_builtin = [1]RawToken{.{ - .id = .macro_param_has_builtin, - .source = .generated, - }}; - const has_include = [1]RawToken{.{ - .id = .macro_param_has_include, - .source = .generated, - }}; - const has_include_next = [1]RawToken{.{ - .id = .macro_param_has_include_next, - .source = .generated, - }}; - const has_embed = [1]RawToken{.{ - .id = .macro_param_has_embed, - .source = .generated, - }}; - - const is_identifier = [1]RawToken{.{ - .id = .macro_param_is_identifier, - .source = .generated, - }}; - - const pragma_operator = [1]RawToken{.{ - .id = .macro_param_pragma_operator, - .source = .generated, - }}; - - const file = [1]RawToken{.{ - .id = .macro_file, - .source = .generated, - }}; - const line = [1]RawToken{.{ - .id = .macro_line, - .source = .generated, - }}; - const counter = [1]RawToken{.{ - .id = .macro_counter, - .source = .generated, - }}; -}; - -fn addBuiltinMacro(pp: *Preprocessor, name: []const u8, is_func: bool, tokens: []const RawToken) !void { +fn addBuiltinMacro(pp: *Preprocessor, name: []const u8, func: *const SpecialMacroFn) !void { try pp.defines.putNoClobber(pp.gpa, name, .{ - .params = &builtin_macros.args, - .tokens = tokens, + .tokens = &.{}, .var_args = false, - .is_func = is_func, .loc = .{ .id = .generated }, - .is_builtin = true, + .kind = .{ .special = func }, + .nargs = 0, }); } -pub fn addBuiltinMacros(pp: *Preprocessor) !void { - try pp.addBuiltinMacro("__has_attribute", true, &builtin_macros.has_attribute); - try pp.addBuiltinMacro("__has_c_attribute", true, &builtin_macros.has_c_attribute); - try pp.addBuiltinMacro("__has_declspec_attribute", true, &builtin_macros.has_declspec_attribute); - try pp.addBuiltinMacro("__has_warning", true, &builtin_macros.has_warning); - try pp.addBuiltinMacro("__has_feature", true, &builtin_macros.has_feature); - try pp.addBuiltinMacro("__has_extension", true, &builtin_macros.has_extension); - try pp.addBuiltinMacro("__has_builtin", true, &builtin_macros.has_builtin); - try pp.addBuiltinMacro("__has_include", true, &builtin_macros.has_include); - try pp.addBuiltinMacro("__has_include_next", true, &builtin_macros.has_include_next); - try pp.addBuiltinMacro("__has_embed", true, &builtin_macros.has_embed); - try pp.addBuiltinMacro("__is_identifier", true, &builtin_macros.is_identifier); - try pp.addBuiltinMacro("_Pragma", true, &builtin_macros.pragma_operator); - - try pp.addBuiltinMacro("__FILE__", false, &builtin_macros.file); - try pp.addBuiltinMacro("__LINE__", false, &builtin_macros.line); - try pp.addBuiltinMacro("__COUNTER__", false, &builtin_macros.counter); +fn handleLineMacro(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + const start = 
pp.comp.generated_buf.items.len; + const source = pp.comp.getSource(tok.loc.id); + const w = pp.comp.generated_buf.writer(pp.gpa); + try w.print("{d}\n", .{source.physicalLine(tok.loc)}); + const pasted_tok = try pp.makeGeneratedToken(start, .pp_num, tok); + return pp.ungetToken(pasted_tok); } -pub fn deinit(pp: *Preprocessor) void { - pp.defines.deinit(pp.gpa); - pp.tokens.deinit(pp.gpa); - pp.arena.deinit(); - pp.token_buf.deinit(); - pp.char_buf.deinit(); - pp.poisoned_identifiers.deinit(); - pp.include_guards.deinit(pp.gpa); - pp.top_expansion_buf.deinit(); - pp.hideset.deinit(); - for (pp.expansion_entries.items(.locs)) |locs| TokenWithExpansionLocs.free(locs, pp.gpa); - pp.expansion_entries.deinit(pp.gpa); +fn handleFileMacro(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + const start = pp.comp.generated_buf.items.len; + const source = pp.comp.getSource(tok.loc.id); + const w = pp.comp.generated_buf.writer(pp.gpa); + try w.print("\"{s}\"\n", .{source.path}); + const pasted_tok = try pp.makeGeneratedToken(start, .string_literal, tok); + return pp.ungetToken(pasted_tok); } -/// Free buffers that are not needed after preprocessing -fn clearBuffers(pp: *Preprocessor) void { - pp.token_buf.clearAndFree(); - pp.char_buf.clearAndFree(); - pp.top_expansion_buf.clearAndFree(); - pp.hideset.clearAndFree(); +fn handleCounterMacro(pp: *Preprocessor, tok: PreprocessorToken) Error!void { + defer pp.counter += 1; + const start = pp.comp.generated_buf.items.len; + const w = pp.comp.generated_buf.writer(pp.gpa); + try w.print("{d}\n", .{pp.counter}); + const pasted_tok = try pp.makeGeneratedToken(start, .pp_num, tok); + return pp.ungetToken(pasted_tok); } -pub fn expansionSlice(pp: *Preprocessor, tok: Tree.TokenIndex) []Source.Location { - const S = struct { - fn order_token_index(context: void, lhs: Tree.TokenIndex, rhs: Tree.TokenIndex) std.math.Order { - _ = context; - return std.math.order(lhs, rhs); - } - }; +fn makeGeneratedToken(pp: *Preprocessor, start: usize, id: Token.Id, source: PreprocessorToken) !PreprocessorToken { + const pasted_token = PreprocessorToken{ .id = id, .flags = source.flags, .loc = .{ + .id = .generated, + .byte_offset = @intCast(start), + .line = pp.generated_line, + } }; + pp.generated_line += 1; + // try pasted_token.addExpansionLocation(pp.gpa, &.{source.loc}); + // try pasted_token.addExpansionLocation(pp.gpa, source.expansionSlice()); + return pasted_token; +} - const indices = pp.expansion_entries.items(.idx); - const idx = std.sort.binarySearch(Tree.TokenIndex, tok, indices, {}, S.order_token_index) orelse return &.{}; - const locs = pp.expansion_entries.items(.locs)[idx]; - var i: usize = 0; - while (locs[i].id != .unused) : (i += 1) {} - return locs[0..i]; +fn errStr(pp: *Preprocessor, tok: PreprocessorToken, tag: Diagnostics.Tag, str: []const u8) !void { + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = tok.loc, + .extra = .{ .str = str }, + }, &.{}); // todo expansion slice } -/// Preprocess a compilation unit of sources into a parsable list of tokens. 
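-/// `sources` must contain at least two entries: the main source first, followed
-/// by the remaining sources (e.g. buffers of builtin and user-defined macros),
-/// which are preprocessed before the main source.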
-pub fn preprocessSources(pp: *Preprocessor, sources: []const Source) Error!void { - assert(sources.len > 1); - const first = sources[0]; - try pp.addIncludeStart(first); - for (sources[1..]) |header| { - try pp.addIncludeStart(header); - _ = try pp.preprocess(header); +fn errTok(pp: *Preprocessor, tok: PreprocessorToken, tag: Diagnostics.Tag) !void { + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = tok.loc, + .extra = .{ .none = {} }, + }, &.{}); // todo expansion slice +} + +fn expectClosing(pp: *Preprocessor, opening: PreprocessorToken, id: Token.Id) !void { + // todo: fix expect + const item = try pp.expect(id, .closing_paren); + if (item.id != id) { + try pp.errTok(opening, .to_match_paren); } - try pp.addIncludeResume(first.id, 0, 1); - const eof = try pp.preprocess(first); - try pp.addToken(eof); - pp.clearBuffers(); } -/// Preprocess a source file, returns eof token. -pub fn preprocess(pp: *Preprocessor, source: Source) Error!TokenWithExpansionLocs { - const eof = pp.preprocessExtra(source) catch |er| switch (er) { - // This cannot occur in the main file and is handled in `include`. - error.StopPreprocessing => unreachable, - else => |e| return e, - }; - try eof.checkMsEof(source, pp.comp); - return eof; +fn tokFromBool(b: bool) PreprocessorToken { + return if (b) PreprocessorToken.one else PreprocessorToken.zero; } -/// Tokenize a file without any preprocessing, returns eof token. -pub fn tokenize(pp: *Preprocessor, source: Source) Error!Token { - assert(pp.linemarkers == .none); - assert(pp.preserve_whitespace == false); - var tokenizer = Tokenizer{ - .buf = source.buf, - .comp = pp.comp, - .source = source.id, - }; +fn handleHasAttribute(pp: *Preprocessor, ident_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(ident_tok); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); + + const has_attr = Attribute.fromString(.gnu, null, pp.tokSlice(attr_name)) != null; + return pp.ungetToken(tokFromBool(has_attr)); +} - // Estimate how many new tokens this source will contain. 
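- // (heuristic: roughly one token per 8 bytes of source text)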
- const estimated_token_count = source.buf.len / 8; - try pp.ensureTotalTokenCapacity(pp.tokens.len + estimated_token_count); +fn handleHasCAttribute(pp: *Preprocessor, ident_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(ident_tok); + var r: TokenList = .{}; + defer r.deinit(pp.gpa); + var tok: PreprocessorToken = undefined; while (true) { - const tok = tokenizer.next(); - if (tok.id == .eof) return tokFromRaw(tok); - try pp.addToken(tokFromRaw(tok)); + tok = try pp.readToken(); + if (tok.id == .comment) continue; + if (tok.id.isDirectiveEnd() or tok.id == .r_paren) break; + try r.append(pp.gpa, tok); } + try pp.expectClosing(l_paren, .r_paren); } -pub fn addIncludeStart(pp: *Preprocessor, source: Source) !void { - if (pp.linemarkers == .none) return; - try pp.addToken(.{ .id = .include_start, .loc = .{ - .id = source.id, - .byte_offset = std.math.maxInt(u32), - .line = 1, - } }); -} +fn handleHasDeclSpecAttribute(pp: *Preprocessor, ident_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(ident_tok); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); -pub fn addIncludeResume(pp: *Preprocessor, source: Source.Id, offset: u32, line: u32) !void { - if (pp.linemarkers == .none) return; - try pp.addToken(.{ .id = .include_resume, .loc = .{ - .id = source, - .byte_offset = offset, - .line = line, - } }); + const ident_str = pp.tokSlice(attr_name); + const has_attr = if (pp.comp.langopts.declspec_attrs) Attribute.fromString(.declspec, null, ident_str) != null else false; + return pp.ungetToken(tokFromBool(has_attr)); } -fn invalidTokenDiagnostic(tok_id: Token.Id) Diagnostics.Tag { - return switch (tok_id) { - .unterminated_string_literal => .unterminated_string_literal_warning, - .empty_char_literal => .empty_char_literal_warning, - .unterminated_char_literal => .unterminated_char_literal_warning, - else => unreachable, - }; -} +fn handleHasFeature(pp: *Preprocessor, ident_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(ident_tok); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); -/// Return the name of the #ifndef guard macro that starts a source, if any. 
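-/// The result is used to populate `include_guards`, allowing a later #include
-/// of the same header to be skipped once its guard macro is defined.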
-fn findIncludeGuard(pp: *Preprocessor, source: Source) ?[]const u8 { - var tokenizer = Tokenizer{ - .buf = source.buf, - .langopts = pp.comp.langopts, - .source = source.id, - }; - var hash = tokenizer.nextNoWS(); - while (hash.id == .nl) hash = tokenizer.nextNoWS(); - if (hash.id != .hash) return null; - const ifndef = tokenizer.nextNoWS(); - if (ifndef.id != .keyword_ifndef) return null; - const guard = tokenizer.nextNoWS(); - if (guard.id != .identifier) return null; - return pp.tokSlice(guard); + const ident_str = pp.tokSlice(attr_name); + const has_feature = features.hasFeature(pp.comp, ident_str); + return pp.ungetToken(tokFromBool(has_feature)); } -fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!TokenWithExpansionLocs { - var guard_name = pp.findIncludeGuard(source); +fn handleHasExtension(pp: *Preprocessor, ident_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(ident_tok); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); - pp.preprocess_count += 1; - var tokenizer = Tokenizer{ - .buf = source.buf, - .langopts = pp.comp.langopts, - .source = source.id, - }; + const ident_str = pp.tokSlice(attr_name); + const has_extension = features.hasExtension(pp.comp, ident_str); + return pp.ungetToken(tokFromBool(has_extension)); +} - // Estimate how many new tokens this source will contain. - const estimated_token_count = source.buf.len / 8; - try pp.ensureTotalTokenCapacity(pp.tokens.len + estimated_token_count); +fn handleHasBuiltin(pp: *Preprocessor, ident_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(ident_tok); + const attr_name = try pp.readToken(); + try pp.expectClosing(l_paren, .r_paren); - var if_level: u8 = 0; - var if_kind = std.PackedIntArray(u2, 256).init([1]u2{0} ** 256); - const until_else = 0; - const until_endif = 1; - const until_endif_seen_else = 2; + const ident_str = pp.tokSlice(attr_name); + const has_builtin = pp.comp.hasBuiltin(ident_str); + return pp.ungetToken(tokFromBool(has_builtin)); +} + +fn handleHasWarning(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + const l_paren = try pp.expectLParen(macro_tok); + const start = pp.char_buf.items.len; + defer pp.char_buf.items.len = start; - var start_of_line = true; while (true) { - var tok = tokenizer.next(); + const tok = try pp.readExpandNewline(); switch (tok.id) { - .hash => if (!start_of_line) try pp.addToken(tokFromRaw(tok)) else { - const directive = tokenizer.nextNoWS(); - switch (directive.id) { - .keyword_error, .keyword_warning => { - // #error tokens.. - pp.top_expansion_buf.items.len = 0; - const char_top = pp.char_buf.items.len; - defer pp.char_buf.items.len = char_top; - - while (true) { - tok = tokenizer.next(); - if (tok.id == .nl or tok.id == .eof) break; - if (tok.id == .whitespace) tok.id = .macro_ws; - try pp.top_expansion_buf.append(tokFromRaw(tok)); - } - try pp.stringify(pp.top_expansion_buf.items); - const slice = pp.char_buf.items[char_top + 1 .. 
pp.char_buf.items.len - 2]; - const duped = try pp.comp.diagnostics.arena.allocator().dupe(u8, slice); - - try pp.comp.addDiagnostic(.{ - .tag = if (directive.id == .keyword_error) .error_directive else .warning_directive, - .loc = .{ .id = tok.source, .byte_offset = directive.start, .line = directive.line }, - .extra = .{ .str = duped }, - }, &.{}); - }, - .keyword_if => { - const sum, const overflowed = @addWithOverflow(if_level, 1); - if (overflowed != 0) - return pp.fatal(directive, "too many #if nestings", .{}); - if_level = sum; - - if (try pp.expr(&tokenizer)) { - if_kind.set(if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #if", .{}); - } - } else { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #if", .{}); - } - } - }, - .keyword_ifdef => { - const sum, const overflowed = @addWithOverflow(if_level, 1); - if (overflowed != 0) - return pp.fatal(directive, "too many #if nestings", .{}); - if_level = sum; - - const macro_name = (try pp.expectMacroName(&tokenizer)) orelse continue; - try pp.expectNl(&tokenizer); - if (pp.defines.get(macro_name) != null) { - if_kind.set(if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #ifdef", .{}); - } - } else { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #ifdef", .{}); - } - } - }, - .keyword_ifndef => { - const sum, const overflowed = @addWithOverflow(if_level, 1); - if (overflowed != 0) - return pp.fatal(directive, "too many #if nestings", .{}); - if_level = sum; - - const macro_name = (try pp.expectMacroName(&tokenizer)) orelse continue; - try pp.expectNl(&tokenizer); - if (pp.defines.get(macro_name) == null) { - if_kind.set(if_level, until_endif); - } else { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - } - }, - .keyword_elif => { - if (if_level == 0) { - try pp.err(directive, .elif_without_if); - if_level += 1; - if_kind.set(if_level, until_else); - } else if (if_level == 1) { - guard_name = null; - } - switch (if_kind.get(if_level)) { - until_else => if (try pp.expr(&tokenizer)) { - if_kind.set(if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #elif", .{}); - } - } else { - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elif", .{}); - } - }, - until_endif => try pp.skip(&tokenizer, .until_endif), - until_endif_seen_else => { - try pp.err(directive, .elif_after_else); - skipToNl(&tokenizer); - }, - else => unreachable, - } - }, - .keyword_elifdef => { - if (if_level == 0) { - try pp.err(directive, .elifdef_without_if); - if_level += 1; - if_kind.set(if_level, until_else); - } else if (if_level == 1) { - guard_name = null; - } - switch (if_kind.get(if_level)) { - until_else => { - const macro_name = try pp.expectMacroName(&tokenizer); - if (macro_name == null) { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elifdef", .{}); - } - } else { - try pp.expectNl(&tokenizer); - if (pp.defines.get(macro_name.?) 
!= null) { - if_kind.set(if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #elifdef", .{}); - } - } else { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elifdef", .{}); - } - } - } - }, - until_endif => try pp.skip(&tokenizer, .until_endif), - until_endif_seen_else => { - try pp.err(directive, .elifdef_after_else); - skipToNl(&tokenizer); - }, - else => unreachable, - } - }, - .keyword_elifndef => { - if (if_level == 0) { - try pp.err(directive, .elifdef_without_if); - if_level += 1; - if_kind.set(if_level, until_else); - } else if (if_level == 1) { - guard_name = null; - } - switch (if_kind.get(if_level)) { - until_else => { - const macro_name = try pp.expectMacroName(&tokenizer); - if (macro_name == null) { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elifndef", .{}); - } - } else { - try pp.expectNl(&tokenizer); - if (pp.defines.get(macro_name.?) == null) { - if_kind.set(if_level, until_endif); - if (pp.verbose) { - pp.verboseLog(directive, "entering then branch of #elifndef", .{}); - } - } else { - if_kind.set(if_level, until_else); - try pp.skip(&tokenizer, .until_else); - if (pp.verbose) { - pp.verboseLog(directive, "entering else branch of #elifndef", .{}); - } - } - } - }, - until_endif => try pp.skip(&tokenizer, .until_endif), - until_endif_seen_else => { - try pp.err(directive, .elifdef_after_else); - skipToNl(&tokenizer); - }, - else => unreachable, - } - }, - .keyword_else => { - try pp.expectNl(&tokenizer); - if (if_level == 0) { - try pp.err(directive, .else_without_if); - continue; - } else if (if_level == 1) { - guard_name = null; - } - switch (if_kind.get(if_level)) { - until_else => { - if_kind.set(if_level, until_endif_seen_else); - if (pp.verbose) { - pp.verboseLog(directive, "#else branch here", .{}); - } - }, - until_endif => try pp.skip(&tokenizer, .until_endif_seen_else), - until_endif_seen_else => { - try pp.err(directive, .else_after_else); - skipToNl(&tokenizer); - }, - else => unreachable, - } - }, - .keyword_endif => { - try pp.expectNl(&tokenizer); - if (if_level == 0) { - guard_name = null; - try pp.err(directive, .endif_without_if); - continue; - } else if (if_level == 1) { - const saved_tokenizer = tokenizer; - defer tokenizer = saved_tokenizer; - - var next = tokenizer.nextNoWS(); - while (next.id == .nl) : (next = tokenizer.nextNoWS()) {} - if (next.id != .eof) guard_name = null; - } - if_level -= 1; - }, - .keyword_define => try pp.define(&tokenizer), - .keyword_undef => { - const macro_name = (try pp.expectMacroName(&tokenizer)) orelse continue; - - _ = pp.defines.remove(macro_name); - try pp.expectNl(&tokenizer); - }, - .keyword_include => { - try pp.include(&tokenizer, .first); - continue; - }, - .keyword_include_next => { - try pp.comp.addDiagnostic(.{ - .tag = .include_next, - .loc = .{ .id = tok.source, .byte_offset = directive.start, .line = directive.line }, - }, &.{}); - if (pp.include_depth == 0) { - try pp.comp.addDiagnostic(.{ - .tag = .include_next_outside_header, - .loc = .{ .id = tok.source, .byte_offset = directive.start, .line = directive.line }, - }, &.{}); - try pp.include(&tokenizer, .first); - } else { - try pp.include(&tokenizer, .next); - } - }, - .keyword_embed => try pp.embed(&tokenizer), - .keyword_pragma => { - try pp.pragma(&tokenizer, directive, null, &.{}); - continue; - }, 
- .keyword_line => { - // #line number "file" - const digits = tokenizer.nextNoWS(); - if (digits.id != .pp_num) try pp.err(digits, .line_simple_digit); - // TODO: validate that the pp_num token is solely digits - - if (digits.id == .eof or digits.id == .nl) continue; - const name = tokenizer.nextNoWS(); - if (name.id == .eof or name.id == .nl) continue; - if (name.id != .string_literal) try pp.err(name, .line_invalid_filename); - try pp.expectNl(&tokenizer); - }, - .pp_num => { - // # number "file" flags - // TODO: validate that the pp_num token is solely digits - // if not, emit `GNU line marker directive requires a simple digit sequence` - const name = tokenizer.nextNoWS(); - if (name.id == .eof or name.id == .nl) continue; - if (name.id != .string_literal) try pp.err(name, .line_invalid_filename); - - const flag_1 = tokenizer.nextNoWS(); - if (flag_1.id == .eof or flag_1.id == .nl) continue; - const flag_2 = tokenizer.nextNoWS(); - if (flag_2.id == .eof or flag_2.id == .nl) continue; - const flag_3 = tokenizer.nextNoWS(); - if (flag_3.id == .eof or flag_3.id == .nl) continue; - const flag_4 = tokenizer.nextNoWS(); - if (flag_4.id == .eof or flag_4.id == .nl) continue; - try pp.expectNl(&tokenizer); - }, - .nl => {}, - .eof => { - if (if_level != 0) try pp.err(tok, .unterminated_conditional_directive); - return tokFromRaw(directive); - }, - else => { - try pp.err(tok, .invalid_preprocessing_directive); - skipToNl(&tokenizer); - }, - } - if (pp.preserve_whitespace) { - tok.id = .nl; - try pp.addToken(tokFromRaw(tok)); - } - }, - .whitespace => if (pp.preserve_whitespace) try pp.addToken(tokFromRaw(tok)), - .nl => { - start_of_line = true; - if (pp.preserve_whitespace) try pp.addToken(tokFromRaw(tok)); - }, - .eof => { - if (if_level != 0) try pp.err(tok, .unterminated_conditional_directive); - // The following check needs to occur here and not at the top of the function - // because a pragma may change the level during preprocessing - if (source.buf.len > 0 and source.buf[source.buf.len - 1] != '\n') { - try pp.err(tok, .newline_eof); - } - if (guard_name) |name| { - if (try pp.include_guards.fetchPut(pp.gpa, source.id, name)) |prev| { - assert(mem.eql(u8, name, prev.value)); - } - } - return tokFromRaw(tok); + .nl, .eof => { + try pp.errTok(tok, .unterminated_macro_arg_list); + return pp.ungetToken(PreprocessorToken.zero); }, - .unterminated_string_literal, .unterminated_char_literal, .empty_char_literal => |tag| { - start_of_line = false; - try pp.err(tok, invalidTokenDiagnostic(tag)); - try pp.expandMacro(&tokenizer, tok); + .r_paren => break, + .string_literal => { + const string = pp.tokSlice(tok); + try pp.char_buf.appendSlice(pp.gpa, string[1 .. string.len - 1]); }, - .unterminated_comment => try pp.err(tok, .unterminated_comment), else => { - if (tok.id.isMacroIdentifier() and pp.poisoned_identifiers.get(pp.tokSlice(tok)) != null) { - try pp.err(tok, .poisoned_identifier); - } - // Add the token to the buffer doing any necessary expansions. 
- start_of_line = false; - try pp.expandMacro(&tokenizer, tok); + pp.skipToNl(); + try pp.errTok(tok, .missing_paren_param_list); + try pp.errTok(l_paren, .to_match_paren); + return pp.ungetToken(PreprocessorToken.zero); }, } } + const actual_param = pp.char_buf.items[start..]; + if (actual_param.len == 0) { + try pp.comp.addDiagnostic(.{ + .tag = .expected_arguments, + .loc = macro_tok.loc, + .extra = .{ .arguments = .{ .expected = 1, .actual = 0 } }, + }, &.{}); // todo expansion slice + return pp.ungetToken(PreprocessorToken.zero); + } + if (!mem.startsWith(u8, actual_param, "-W")) { + try pp.errStr(l_paren, .malformed_warning_check, "__has_warning"); + return pp.ungetToken(PreprocessorToken.zero); + } + const warning_name = actual_param[2..]; + const exists = Diagnostics.warningExists(warning_name); + return pp.ungetToken(tokFromBool(exists)); } -/// Get raw token source string. -/// Returned slice is invalidated when comp.generated_buf is updated. -pub fn tokSlice(pp: *Preprocessor, token: anytype) []const u8 { - if (token.id.lexeme()) |some| return some; - const source = pp.comp.getSource(token.source); - return source.buf[token.start..token.end]; +fn handleHasInclude(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + return pp.handleHasIncludeExtra(macro_tok, .first); } -/// Convert a token from the Tokenizer into a token used by the parser. -fn tokFromRaw(raw: RawToken) TokenWithExpansionLocs { - return .{ - .id = raw.id, - .loc = .{ - .id = raw.source, - .byte_offset = raw.start, - .line = raw.line, - }, - }; +fn handleHasIncludeNext(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + return pp.handleHasIncludeExtra(macro_tok, .next); } -fn err(pp: *Preprocessor, raw: RawToken, tag: Diagnostics.Tag) !void { - try pp.comp.addDiagnostic(.{ - .tag = tag, - .loc = .{ - .id = raw.source, - .byte_offset = raw.start, - .line = raw.line, - }, - }, &.{}); +fn handleHasIncludeExtra(pp: *Preprocessor, macro_tok: PreprocessorToken, which: Compilation.WhichInclude) Error!void { + const l_paren = pp.getToken(); + if (l_paren.id != .l_paren) { + pp.skipToNl(); + return; + } + + var is_std: bool = undefined; + const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { + error.InvalidInclude => return pp.ungetToken(PreprocessorToken.zero), + else => |e| return e, + }; + try pp.expectClosing(l_paren, .r_paren); + + const filename = include_str[1 .. 
include_str.len - 1]; + const include_type: Compilation.IncludeType = switch (include_str[0]) { + '"' => .quotes, + '<' => .angle_brackets, + else => unreachable, + }; + + if (which == .first or pp.includeDepth() == 0) { + if (which == .next) { + try pp.comp.addDiagnostic(.{ + .tag = .include_next_outside_header, + .loc = macro_tok.loc, + }, &.{}); + } + const has = try pp.comp.hasInclude(filename, macro_tok.loc.id, include_type, .first); + return pp.ungetToken(tokFromBool(has)); + } + const has = try pp.comp.hasInclude(filename, macro_tok.loc.id, include_type, .next); + return pp.ungetToken(tokFromBool(has)); } -fn errStr(pp: *Preprocessor, tok: TokenWithExpansionLocs, tag: Diagnostics.Tag, str: []const u8) !void { - try pp.comp.addDiagnostic(.{ - .tag = tag, - .loc = tok.loc, - .extra = .{ .str = str }, - }, tok.expansionSlice()); +fn includeDepth(pp: *Preprocessor) usize { + return pp.tokenizers.items.len - 1; } -fn fatal(pp: *Preprocessor, raw: RawToken, comptime fmt: []const u8, args: anytype) Compilation.Error { - try pp.comp.diagnostics.list.append(pp.gpa, .{ - .tag = .cli_error, - .kind = .@"fatal error", - .extra = .{ .str = try std.fmt.allocPrint(pp.comp.diagnostics.arena.allocator(), fmt, args) }, - .loc = .{ - .id = raw.source, - .byte_offset = raw.start, - .line = raw.line, - }, - }); - return error.FatalError; +fn hasEmbedValue(contents_arg: ?[]const u8) []const u8 { + const contents = contents_arg orelse return "0\n"; + if (contents.len == 0) return "2\n"; + return "1\n"; } -fn fatalNotFound(pp: *Preprocessor, tok: TokenWithExpansionLocs, filename: []const u8) Compilation.Error { - const old = pp.comp.diagnostics.fatal_errors; - pp.comp.diagnostics.fatal_errors = true; - defer pp.comp.diagnostics.fatal_errors = old; +/// TODO: handle limit/prefix/suffix/etc +fn handleHasEmbed(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + const l_paren = pp.getToken(); + if (l_paren.id != .l_paren) { + pp.skipToNl(); + return; + } - try pp.comp.diagnostics.addExtra(pp.comp.langopts, .{ .tag = .cli_error, .loc = tok.loc, .extra = .{ - .str = try std.fmt.allocPrint(pp.comp.diagnostics.arena.allocator(), "'{s}' not found", .{filename}), - } }, tok.expansionSlice(), false); - unreachable; // addExtra should've returned FatalError -} + var is_std: bool = undefined; + const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { + error.InvalidInclude => return, + else => |e| return e, + }; + try pp.expectClosing(l_paren, .r_paren); -fn verboseLog(pp: *Preprocessor, raw: RawToken, comptime fmt: []const u8, args: anytype) void { - const source = pp.comp.getSource(raw.source); - const line_col = source.lineCol(.{ .id = raw.source, .line = raw.line, .byte_offset = raw.start }); + const filename = include_str[1 .. 
include_str.len - 1]; + const include_type: Compilation.IncludeType = switch (include_str[0]) { + '"' => .quotes, + '<' => .angle_brackets, + else => unreachable, + }; - const stderr = std.io.getStdErr().writer(); - var buf_writer = std.io.bufferedWriter(stderr); - const writer = buf_writer.writer(); - defer buf_writer.flush() catch {}; - writer.print("{s}:{d}:{d}: ", .{ source.path, line_col.line_no, line_col.col }) catch return; - writer.print(fmt, args) catch return; - writer.writeByte('\n') catch return; - writer.writeAll(line_col.line) catch return; - writer.writeByte('\n') catch return; + const contents = try pp.comp.findEmbed(filename, macro_tok.loc.id, include_type, 1); + const result = hasEmbedValue(contents); + const start = pp.comp.generated_buf.items.len; + try pp.comp.generated_buf.appendSlice(pp.comp.gpa, result); + const pasted_tok = try pp.makeGeneratedToken(start, .pp_num, macro_tok); + return pp.ungetToken(pasted_tok); } -/// Consume next token, error if it is not an identifier. -fn expectMacroName(pp: *Preprocessor, tokenizer: *Tokenizer) Error!?[]const u8 { - const macro_name = tokenizer.nextNoWS(); - if (!macro_name.id.isMacroIdentifier()) { - try pp.err(macro_name, .macro_name_missing); - skipToNl(tokenizer); - return null; +// Skip until newline, ignore other tokens. +fn skipToNl(pp: *Preprocessor) void { + while (true) { + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) return; } - return pp.tokSlice(macro_name); } -/// Skip until after a newline, error if extra tokens before it. -fn expectNl(pp: *Preprocessor, tokenizer: *Tokenizer) Error!void { - var sent_err = false; +fn readOneIdentifierArgument(pp: *Preprocessor, macro_tok: PreprocessorToken) !?PreprocessorToken { + const l_paren = try pp.expect(.l_paren, .missing_lparen_after_builtin); + _ = l_paren; + var invalid: ?PreprocessorToken = null; + var identifier: ?PreprocessorToken = null; while (true) { - const tok = tokenizer.next(); - if (tok.id == .nl or tok.id == .eof) return; - if (tok.id == .whitespace or tok.id == .comment) continue; - if (!sent_err) { - sent_err = true; - try pp.err(tok, .extra_tokens_directive_end); + var tok = pp.getToken(); + tok.id.simplifyMacroKeywordExtra(true); + + switch (tok.id) { + .r_paren, .eof => break, + else => { + if (identifier) |_| invalid = tok else identifier = tok; + }, } } + if (invalid) |some| { + try pp.comp.addDiagnostic(.{ + .tag = .missing_tok_builtin, + .loc = some.loc, + .extra = .{ .tok_id_expected = .r_paren }, + }, &.{}); // TODO: expansion slice + return null; + } + if (identifier) |ident| { + if (ident.id == .identifier or ident.id == .extended_identifier) return ident; + } else { + const extra: Diagnostics.Message.Extra = .{ .arguments = .{ .expected = 1, .actual = 0 } }; + try pp.comp.addDiagnostic(.{ .tag = .expected_arguments, .loc = macro_tok.loc, .extra = extra }, &.{}); + } + return null; } -fn getTokenState(pp: *const Preprocessor) TokenState { - return .{ - .tokens_len = pp.tokens.len, - .expansion_entries_len = pp.expansion_entries.len, - }; +fn handleIsIdentifier(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + if (try pp.readOneIdentifierArgument(macro_tok)) |_| { + return pp.ungetToken(PreprocessorToken.one); + } else { + return pp.ungetToken(PreprocessorToken.zero); + } } -fn restoreTokenState(pp: *Preprocessor, state: TokenState) void { - pp.tokens.len = state.tokens_len; - pp.expansion_entries.len = state.expansion_entries_len; +fn handlePragmaOperator(pp: *Preprocessor, macro_tok: PreprocessorToken) Error!void { + 
_ = pp; + _ = macro_tok; + // TODO } -/// Consume all tokens until a newline and parse the result into a boolean. -fn expr(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!bool { - const token_state = pp.getTokenState(); - defer { - for (pp.top_expansion_buf.items) |tok| TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - pp.restoreTokenState(token_state); - } +pub fn addBuiltinMacros(pp: *Preprocessor) !void { + try pp.addBuiltinMacro("__has_attribute", handleHasAttribute); + try pp.addBuiltinMacro("__has_c_attribute", handleHasCAttribute); + try pp.addBuiltinMacro("__has_declspec_attribute", handleHasDeclSpecAttribute); + try pp.addBuiltinMacro("__has_feature", handleHasFeature); + try pp.addBuiltinMacro("__has_extension", handleHasExtension); + try pp.addBuiltinMacro("__has_builtin", handleHasBuiltin); + try pp.addBuiltinMacro("__has_warning", handleHasWarning); + try pp.addBuiltinMacro("__has_include", handleHasInclude); + try pp.addBuiltinMacro("__has_include_next", handleHasIncludeNext); + try pp.addBuiltinMacro("__has_embed", handleHasEmbed); - pp.top_expansion_buf.items.len = 0; - const eof = while (true) { - const tok = tokenizer.next(); - switch (tok.id) { - .nl, .eof => break tok, - .whitespace => if (pp.top_expansion_buf.items.len == 0) continue, - else => {}, - } - try pp.top_expansion_buf.append(tokFromRaw(tok)); - } else unreachable; - if (pp.top_expansion_buf.items.len != 0) { - pp.expansion_source_loc = pp.top_expansion_buf.items[0].loc; - pp.hideset.clearRetainingCapacity(); - try pp.expandMacroExhaustive(tokenizer, &pp.top_expansion_buf, 0, pp.top_expansion_buf.items.len, false, .expr); - } - for (pp.top_expansion_buf.items) |tok| { - if (tok.id == .macro_ws) continue; - if (!tok.id.validPreprocessorExprStart()) { - try pp.comp.addDiagnostic(.{ - .tag = .invalid_preproc_expr_start, - .loc = tok.loc, - }, tok.expansionSlice()); - return false; - } - break; - } else { - try pp.err(eof, .expected_value_in_expr); - return false; - } + try pp.addBuiltinMacro("__is_identifier", handleIsIdentifier); - // validate the tokens in the expression - try pp.ensureUnusedTokenCapacity(pp.top_expansion_buf.items.len); - var i: usize = 0; - const items = pp.top_expansion_buf.items; - while (i < items.len) : (i += 1) { - var tok = items[i]; - switch (tok.id) { - .string_literal, - .string_literal_utf_16, - .string_literal_utf_8, - .string_literal_utf_32, - .string_literal_wide, - => { - try pp.comp.addDiagnostic(.{ - .tag = .string_literal_in_pp_expr, - .loc = tok.loc, - }, tok.expansionSlice()); - return false; - }, - .plus_plus, - .minus_minus, - .plus_equal, - .minus_equal, - .asterisk_equal, - .slash_equal, - .percent_equal, - .angle_bracket_angle_bracket_left_equal, - .angle_bracket_angle_bracket_right_equal, - .ampersand_equal, - .caret_equal, - .pipe_equal, - .l_bracket, - .r_bracket, - .l_brace, - .r_brace, - .ellipsis, - .semicolon, - .hash, - .hash_hash, - .equal, - .arrow, - .period, - => { - try pp.comp.addDiagnostic(.{ - .tag = .invalid_preproc_operator, - .loc = tok.loc, - }, tok.expansionSlice()); - return false; - }, - .macro_ws, .whitespace => continue, - .keyword_false => tok.id = .zero, - .keyword_true => tok.id = .one, - else => if (tok.id.isMacroIdentifier()) { - if (tok.id == .keyword_defined) { - const tokens_consumed = try pp.handleKeywordDefined(&tok, items[i + 1 ..], eof); - i += tokens_consumed; - } else { - try pp.errStr(tok, .undefined_macro, pp.expandedSlice(tok)); + try pp.addBuiltinMacro("__FILE__", handleFileMacro); + try 
pp.addBuiltinMacro("__LINE__", handleLineMacro); + try pp.addBuiltinMacro("__COUNTER__", handleCounterMacro); + try pp.addBuiltinMacro("_Pragma", handlePragmaOperator); +} - if (i + 1 < pp.top_expansion_buf.items.len and - pp.top_expansion_buf.items[i + 1].id == .l_paren) - { - try pp.errStr(tok, .fn_macro_undefined, pp.expandedSlice(tok)); - return false; - } +/// Initialize Preprocessor with builtin macros. +pub fn initDefault(comp: *Compilation) !Preprocessor { + var pp = init(comp); + errdefer pp.deinit(); + try pp.addBuiltinMacros(); + return pp; +} - tok.id = .zero; // undefined macro - } - }, - } - pp.addTokenAssumeCapacity(tok); +pub fn deinit(pp: *Preprocessor) void { + pp.arena.deinit(); + pp.include_guards.deinit(pp.gpa); + pp.tokens.deinit(pp.gpa); + pp.tokenizers.deinit(pp.gpa); + for (pp.expansion_bufs.items) |*toklist| { + toklist.deinit(pp.gpa); } - try pp.addToken(.{ - .id = .eof, - .loc = tokFromRaw(eof).loc, - }); - - // Actually parse it. - var parser = Parser{ - .pp = pp, - .comp = pp.comp, - .gpa = pp.gpa, - .tok_ids = pp.tokens.items(.id), - .tok_i = @intCast(token_state.tokens_len), - .arena = pp.arena.allocator(), - .in_macro = true, - .strings = std.ArrayListAligned(u8, 4).init(pp.comp.gpa), - - .data = undefined, - .value_map = undefined, - .labels = undefined, - .decl_buf = undefined, - .list_buf = undefined, - .param_buf = undefined, - .enum_buf = undefined, - .record_buf = undefined, - .attr_buf = undefined, - .field_attr_buf = undefined, - .string_ids = undefined, - }; - defer parser.strings.deinit(); - return parser.macroExpr(); + pp.expansion_bufs.deinit(pp.gpa); + pp.defines.deinit(pp.gpa); + pp.char_buf.deinit(pp.gpa); + for (pp.expansion_entries.items(.locs)) |locs| PreprocessorToken.free(locs, pp.gpa); + pp.expansion_entries.deinit(pp.gpa); + pp.guard_stack.deinit(pp.gpa); + pp.macro_arg_tokens.deinit(pp.gpa); + pp.macro_args.deinit(pp.gpa); + pp.safe_strings.deinit(pp.gpa); + pp.treap.deinit(); + pp.poisoned_identifiers.deinit(); } -/// Turns macro_tok from .keyword_defined into .zero or .one depending on whether the argument is defined -/// Returns the number of tokens consumed -fn handleKeywordDefined(pp: *Preprocessor, macro_tok: *TokenWithExpansionLocs, tokens: []const TokenWithExpansionLocs, eof: RawToken) !usize { - std.debug.assert(macro_tok.id == .keyword_defined); - var it = TokenIterator.init(tokens); - const first = it.nextNoWS() orelse { - try pp.err(eof, .macro_name_missing); - return it.i; - }; - switch (first.id) { - .l_paren => {}, - else => { - if (!first.id.isMacroIdentifier()) { - try pp.errStr(first, .macro_name_must_be_identifier, pp.expandedSlice(first)); - } - macro_tok.id = if (pp.defines.contains(pp.expandedSlice(first))) .one else .zero; - return it.i; - }, - } - const second = it.nextNoWS() orelse { - try pp.err(eof, .macro_name_missing); - return it.i; - }; - if (!second.id.isMacroIdentifier()) { - try pp.comp.addDiagnostic(.{ - .tag = .macro_name_must_be_identifier, - .loc = second.loc, - }, second.expansionSlice()); - return it.i; - } - macro_tok.id = if (pp.defines.contains(pp.expandedSlice(second))) .one else .zero; +/// Preprocess a compilation unit of sources into a parsable list of tokens. 
+pub fn preprocessSources(pp: *Preprocessor, sources: []const Source) Error!void { + assert(sources.len > 1); + const first = sources[0]; - const last = it.nextNoWS(); - if (last == null or last.?.id != .r_paren) { - const tok = last orelse tokFromRaw(eof); - try pp.comp.addDiagnostic(.{ - .tag = .closing_paren, - .loc = tok.loc, - }, tok.expansionSlice()); - try pp.comp.addDiagnostic(.{ - .tag = .to_match_paren, - .loc = first.loc, - }, first.expansionSlice()); + for (sources[1..]) |header| { + _ = try pp.preprocess(header); } - - return it.i; + const eof = try pp.preprocess(first); + try pp.addToken(eof); } -/// Skip until #else #elif #endif, return last directive token id. -/// Also skips nested #if ... #endifs. -fn skip( - pp: *Preprocessor, - tokenizer: *Tokenizer, - cont: enum { until_else, until_endif, until_endif_seen_else }, -) Error!void { - var ifs_seen: u32 = 0; - var line_start = true; - while (tokenizer.index < tokenizer.buf.len) { - if (line_start) { - const saved_tokenizer = tokenizer.*; - const hash = tokenizer.nextNoWS(); - if (hash.id == .nl) continue; - line_start = false; - if (hash.id != .hash) continue; - const directive = tokenizer.nextNoWS(); - switch (directive.id) { - .keyword_else => { - if (ifs_seen != 0) continue; - if (cont == .until_endif_seen_else) { - try pp.err(directive, .else_after_else); - continue; - } - tokenizer.* = saved_tokenizer; - return; - }, - .keyword_elif => { - if (ifs_seen != 0 or cont == .until_endif) continue; - if (cont == .until_endif_seen_else) { - try pp.err(directive, .elif_after_else); - continue; - } - tokenizer.* = saved_tokenizer; - return; - }, - .keyword_elifdef => { - if (ifs_seen != 0 or cont == .until_endif) continue; - if (cont == .until_endif_seen_else) { - try pp.err(directive, .elifdef_after_else); - continue; - } - tokenizer.* = saved_tokenizer; - return; - }, - .keyword_elifndef => { - if (ifs_seen != 0 or cont == .until_endif) continue; - if (cont == .until_endif_seen_else) { - try pp.err(directive, .elifndef_after_else); - continue; - } - tokenizer.* = saved_tokenizer; - return; - }, - .keyword_endif => { - if (ifs_seen == 0) { - tokenizer.* = saved_tokenizer; - return; - } - ifs_seen -= 1; - }, - .keyword_if, .keyword_ifdef, .keyword_ifndef => ifs_seen += 1, - else => {}, - } - } else if (tokenizer.buf[tokenizer.index] == '\n') { - line_start = true; - tokenizer.index += 1; - tokenizer.line += 1; - if (pp.preserve_whitespace) { - try pp.addToken(.{ .id = .nl, .loc = .{ - .id = tokenizer.source, - .line = tokenizer.line, - } }); - } - } else { - line_start = false; - tokenizer.index += 1; - } +fn propagateSpace(pp: *Preprocessor, tokens: []PreprocessorToken, template: PreprocessorToken) void { + if (tokens.len > 0) { + tokens[0].flags = template.flags; } else { - const eof = tokenizer.next(); - return pp.err(eof, .unterminated_conditional_directive); + pp.injectSpace(); } } -// Skip until newline, ignore other tokens. 
-fn skipToNl(tokenizer: *Tokenizer) void { - while (true) { - const tok = tokenizer.next(); - if (tok.id == .nl or tok.id == .eof) return; - } +fn ungetAll(pp: *Preprocessor, tokens: []const PreprocessorToken) !void { + if (tokens.len == 0) return; + const start = pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items.len; + try pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].appendSlice(pp.gpa, tokens); + std.mem.reverse(PreprocessorToken, pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items[start..]); } -const ExpandBuf = std.ArrayList(TokenWithExpansionLocs); -fn removePlacemarkers(buf: *ExpandBuf) void { - var i: usize = buf.items.len -% 1; - while (i < buf.items.len) : (i -%= 1) { - if (buf.items[i].id == .placemarker) { - const placemarker = buf.orderedRemove(i); - TokenWithExpansionLocs.free(placemarker.expansion_locs, buf.allocator); +fn addHideSet(pp: *Preprocessor, toks: []PreprocessorToken, hideset: Treap.Node) !void { + for (toks) |*tok| { + switch (tok.id) { + // non-identifiers are not expanded, so we don't need to track their hidesets. + // Track r_paren hideset since it is used for computing the hideset of function-like macro expansions + .identifier, .extended_identifier, .r_paren => { + tok.hideset = try pp.treap.@"union"(tok.hideset, hideset); + }, + else => {}, } } } -const MacroArguments = std.ArrayList([]const TokenWithExpansionLocs); -fn deinitMacroArguments(allocator: Allocator, args: *const MacroArguments) void { - for (args.items) |item| { - for (item) |tok| TokenWithExpansionLocs.free(tok.expansion_locs, allocator); - allocator.free(item); +fn stringize(pp: *Preprocessor, tmpl: PreprocessorToken, args_range: MacroArg) !PreprocessorToken { + const start = pp.comp.generated_buf.items.len; + try pp.comp.generated_buf.append(pp.gpa, '"'); + const args = args_range.slice(pp.macro_arg_tokens.items); + for (args, 0..) |tok, i| { + const slice = pp.tokSlice(tok); + if (slice.len > 0 and tok.flags.space and i != 0) { + try pp.comp.generated_buf.append(pp.gpa, ' '); + } + try pp.comp.generated_buf.appendSlice(pp.gpa, slice); } - args.deinit(); + try pp.comp.generated_buf.append(pp.gpa, '"'); + var tok = tmpl; + tok.id = .string_literal; + tok.loc = .{ + .id = .generated, + .byte_offset = @intCast(start), + .line = pp.generated_line, + }; + pp.generated_line += 1; + return tok; } -fn expandObjMacro(pp: *Preprocessor, simple_macro: *const Macro) Error!ExpandBuf { - var buf = ExpandBuf.init(pp.gpa); - errdefer buf.deinit(); - if (simple_macro.tokens.len == 0) { - try buf.append(.{ .id = .placemarker, .loc = .{ .id = .generated } }); - return buf; - } - try buf.ensureTotalCapacity(simple_macro.tokens.len); - - // Add all of the simple_macros tokens to the new buffer handling any concats. 
+fn subst(pp: *Preprocessor, macro: *const Macro, macro_tok: PreprocessorToken, args: MacroArgList, hideset_arg: Treap.Node) ![]PreprocessorToken { + _ = macro_tok; + var hideset = hideset_arg; + var r: TokenList = .{}; + defer r.deinit(pp.gpa); var i: usize = 0; - while (i < simple_macro.tokens.len) : (i += 1) { - const raw = simple_macro.tokens[i]; - const tok = tokFromRaw(raw); - switch (raw.id) { - .hash_hash => { - var rhs = tokFromRaw(simple_macro.tokens[i + 1]); - i += 1; - while (true) { - if (rhs.id == .whitespace) { - rhs = tokFromRaw(simple_macro.tokens[i + 1]); - i += 1; - } else if (rhs.id == .comment and !pp.comp.langopts.preserve_comments_in_macros) { - rhs = tokFromRaw(simple_macro.tokens[i + 1]); - i += 1; - } else break; + while (i < macro.tokens.len) : (i += 1) { + const t0 = macro.tokens[i]; + const t1: ?PreprocessorToken = if (i == macro.tokens.len - 1) null else macro.tokens[i + 1]; + + const t0_param = t0.id == .macro_param; + const t1_param = if (t1) |tok| tok.id == .macro_param else false; + + if (t0.id == .hash and t1_param) { + const arg = args.slice(pp.macro_args.items)[t1.?.argPosition()]; + const stringized = try pp.stringize(t0, arg); + try r.append(pp.gpa, stringized); + i += 1; + continue; + } + if (t0.id == .hash_hash and t1_param) { + const arg = args.slice(pp.macro_args.items)[t1.?.argPosition()]; + if (t1.?.isVarArg() and r.items.len > 0 and r.items[r.items.len - 1].id == .comma) { + if (arg.len() == 0) { + _ = r.pop(); + } else { + try r.appendSlice(pp.gpa, arg.slice(pp.macro_arg_tokens.items)); } - try pp.pasteTokens(&buf, &.{rhs}); - }, - .whitespace => if (pp.preserve_whitespace) buf.appendAssumeCapacity(tok), - .macro_file => { - const start = pp.comp.generated_buf.items.len; - const source = pp.comp.getSource(pp.expansion_source_loc.id); - const w = pp.comp.generated_buf.writer(pp.gpa); - try w.print("\"{s}\"\n", .{source.path}); - - buf.appendAssumeCapacity(try pp.makeGeneratedToken(start, .string_literal, tok)); - }, - .macro_line => { - const start = pp.comp.generated_buf.items.len; - const source = pp.comp.getSource(pp.expansion_source_loc.id); - const w = pp.comp.generated_buf.writer(pp.gpa); - try w.print("{d}\n", .{source.physicalLine(pp.expansion_source_loc)}); - - buf.appendAssumeCapacity(try pp.makeGeneratedToken(start, .pp_num, tok)); - }, - .macro_counter => { - defer pp.counter += 1; - const start = pp.comp.generated_buf.items.len; - const w = pp.comp.generated_buf.writer(pp.gpa); - try w.print("{d}\n", .{pp.counter}); - - buf.appendAssumeCapacity(try pp.makeGeneratedToken(start, .pp_num, tok)); - }, - else => buf.appendAssumeCapacity(tok), + } else if (arg.len() > 0) { + try pp.pasteAndPush(&r, arg.slice(pp.macro_arg_tokens.items)[0]); + try r.appendSlice(pp.gpa, arg.slice(pp.macro_arg_tokens.items)[1..]); + } + i += 1; + continue; } + if (t0.id == .hash_hash and t1 != null) { + hideset = t1.?.hideset; + try pp.pasteAndPush(&r, t1.?); + i += 1; + continue; + } + if (t0_param and t1 != null and t1.?.id == .hash_hash) { + hideset = t1.?.hideset; + const arg = args.slice(pp.macro_args.items)[t0.argPosition()]; + if (arg.len() == 0) { + i += 1; + } else { + try r.appendSlice(pp.gpa, arg.slice(pp.macro_arg_tokens.items)); + } + continue; + } + if (t0_param) { + const arg = args.slice(pp.macro_args.items)[t0.argPosition()]; + const expanded = try pp.expandAll(arg.slice(pp.macro_arg_tokens.items), t0); + defer pp.gpa.free(expanded); + try r.appendSlice(pp.gpa, expanded); + continue; + } + try r.append(pp.gpa, t0); } - - return buf; -} - 
-/// Join a possibly-parenthesized series of string literal tokens into a single string without -/// leading or trailing quotes. The returned slice is invalidated if pp.char_buf changes. -/// Returns error.ExpectedStringLiteral if parentheses are not balanced, a non-string-literal -/// is encountered, or if no string literals are encountered -/// TODO: destringize (replace all '\\' with a single `\` and all '\"' with a '"') -fn pasteStringsUnsafe(pp: *Preprocessor, toks: []const TokenWithExpansionLocs) ![]const u8 { - const char_top = pp.char_buf.items.len; - defer pp.char_buf.items.len = char_top; - var unwrapped = toks; - if (toks.len >= 2 and toks[0].id == .l_paren and toks[toks.len - 1].id == .r_paren) { - unwrapped = toks[1 .. toks.len - 1]; - } - if (unwrapped.len == 0) return error.ExpectedStringLiteral; - - for (unwrapped) |tok| { - if (tok.id == .macro_ws) continue; - if (tok.id != .string_literal) return error.ExpectedStringLiteral; - const str = pp.expandedSlice(tok); - try pp.char_buf.appendSlice(str[1 .. str.len - 1]); - } - return pp.char_buf.items[char_top..]; + try pp.addHideSet(r.items, hideset); + return r.toOwnedSlice(pp.gpa); } -/// Handle the _Pragma operator (implemented as a builtin macro) -fn pragmaOperator(pp: *Preprocessor, arg_tok: TokenWithExpansionLocs, operator_loc: Source.Location) !void { - const arg_slice = pp.expandedSlice(arg_tok); - const content = arg_slice[1 .. arg_slice.len - 1]; - const directive = "#pragma "; +fn pasteTokens(pp: *Preprocessor, lhs: PreprocessorToken, rhs: PreprocessorToken) !PreprocessorToken { + const start = pp.comp.generated_buf.items.len; + const end = start + pp.tokSlice(lhs).len + pp.tokSlice(rhs).len; + try pp.comp.generated_buf.ensureTotalCapacity(pp.gpa, end + 1); // +1 for a newline - pp.char_buf.clearRetainingCapacity(); - const total_len = directive.len + content.len + 1; // destringify can never grow the string, + 1 for newline - try pp.char_buf.ensureUnusedCapacity(total_len); - pp.char_buf.appendSliceAssumeCapacity(directive); - pp.destringify(content); - pp.char_buf.appendAssumeCapacity('\n'); + // We cannot use the same slices here since they might be invalidated by `ensureCapacity` + pp.comp.generated_buf.appendSliceAssumeCapacity(pp.tokSlice(lhs)); + pp.comp.generated_buf.appendSliceAssumeCapacity(pp.tokSlice(rhs)); + pp.comp.generated_buf.appendAssumeCapacity('\n'); - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.appendSlice(pp.gpa, pp.char_buf.items); + // Try to tokenize the result. 
var tmp_tokenizer = Tokenizer{ .buf = pp.comp.generated_buf.items, .langopts = pp.comp.langopts, .index = @intCast(start), .source = .generated, - .line = pp.generated_line, }; - pp.generated_line += 1; - const hash_tok = tmp_tokenizer.next(); - assert(hash_tok.id == .hash); - const pragma_tok = tmp_tokenizer.next(); - assert(pragma_tok.id == .keyword_pragma); - try pp.pragma(&tmp_tokenizer, pragma_tok, operator_loc, arg_tok.expansionSlice()); -} - -/// Inverts the output of the preprocessor stringify (#) operation -/// (except all whitespace is condensed to a single space) -/// writes output to pp.char_buf; assumes capacity is sufficient -/// backslash backslash -> backslash -/// backslash doublequote -> doublequote -/// All other characters remain the same -fn destringify(pp: *Preprocessor, str: []const u8) void { - var state: enum { start, backslash_seen } = .start; - for (str) |c| { - switch (c) { - '\\' => { - if (state == .backslash_seen) pp.char_buf.appendAssumeCapacity(c); - state = if (state == .start) .backslash_seen else .start; - }, - else => { - if (state == .backslash_seen and c != '"') pp.char_buf.appendAssumeCapacity('\\'); - pp.char_buf.appendAssumeCapacity(c); - state = .start; - }, - } + const pasted_token = tmp_tokenizer.nextNoWSComments(); + const next_tok = tmp_tokenizer.next(); + if (next_tok.id != .nl) { + try pp.errStr( + lhs, + .pasting_formed_invalid, + try pp.comp.diagnostics.arena.allocator().dupe(u8, pp.comp.generated_buf.items[start..end]), + ); } + return pp.makeGeneratedToken(start, pasted_token.id, lhs); } -/// Stringify `tokens` into pp.char_buf. -/// See https://gcc.gnu.org/onlinedocs/gcc-11.2.0/cpp/Stringizing.html#Stringizing -fn stringify(pp: *Preprocessor, tokens: []const TokenWithExpansionLocs) !void { - try pp.char_buf.append('"'); - var ws_state: enum { start, need, not_needed } = .start; - for (tokens) |tok| { - if (tok.id == .macro_ws) { - if (ws_state == .start) continue; - ws_state = .need; - continue; - } - if (ws_state == .need) try pp.char_buf.append(' '); - ws_state = .not_needed; - - // backslashes not inside strings are not escaped - const is_str = switch (tok.id) { - .string_literal, - .string_literal_utf_16, - .string_literal_utf_8, - .string_literal_utf_32, - .string_literal_wide, - .char_literal, - .char_literal_utf_16, - .char_literal_utf_32, - .char_literal_wide, - => true, - else => false, - }; - - for (pp.expandedSlice(tok)) |c| { - if (c == '"') - try pp.char_buf.appendSlice("\\\"") - else if (c == '\\' and is_str) - try pp.char_buf.appendSlice("\\\\") - else - try pp.char_buf.append(c); - } - } - try pp.char_buf.ensureUnusedCapacity(2); - if (pp.char_buf.items[pp.char_buf.items.len - 1] != '\\') { - pp.char_buf.appendSliceAssumeCapacity("\"\n"); - return; - } - pp.char_buf.appendAssumeCapacity('"'); - var tokenizer: Tokenizer = .{ - .buf = pp.char_buf.items, - .index = 0, - .source = .generated, - .langopts = pp.comp.langopts, - .line = 0, - }; - const item = tokenizer.next(); - if (item.id == .unterminated_string_literal) { - const tok = tokens[tokens.len - 1]; - try pp.comp.addDiagnostic(.{ - .tag = .invalid_pp_stringify_escape, - .loc = tok.loc, - }, tok.expansionSlice()); - pp.char_buf.items.len -= 2; // erase unpaired backslash and appended end quote - pp.char_buf.appendAssumeCapacity('"'); - } - pp.char_buf.appendAssumeCapacity('\n'); +/// Paste `tok` onto the last token in `tokens` +fn pasteAndPush(pp: *Preprocessor, tokens: *TokenList, tok: PreprocessorToken) !void { + const last = tokens.pop(); + const pasted = try 
pp.pasteTokens(last, tok); + return tokens.append(pp.gpa, pasted); } -fn reconstructIncludeString(pp: *Preprocessor, param_toks: []const TokenWithExpansionLocs, embed_args: ?*[]const TokenWithExpansionLocs, first: TokenWithExpansionLocs) !?[]const u8 { - if (param_toks.len == 0) { - try pp.comp.addDiagnostic(.{ - .tag = .expected_filename, - .loc = first.loc, - }, first.expansionSlice()); - return null; - } - - const char_top = pp.char_buf.items.len; - defer pp.char_buf.items.len = char_top; - - // Trim leading/trailing whitespace - var begin: usize = 0; - var end: usize = param_toks.len; - while (begin < end and param_toks[begin].id == .macro_ws) : (begin += 1) {} - while (end > begin and param_toks[end - 1].id == .macro_ws) : (end -= 1) {} - const params = param_toks[begin..end]; - - if (params.len == 0) { - try pp.comp.addDiagnostic(.{ - .tag = .expected_filename, - .loc = first.loc, - }, first.expansionSlice()); - return null; - } - // no string pasting - if (embed_args == null and params[0].id == .string_literal and params.len > 1) { - try pp.comp.addDiagnostic(.{ - .tag = .closing_paren, - .loc = params[1].loc, - }, params[1].expansionSlice()); - return null; - } - - for (params, 0..) |tok, i| { - const str = pp.expandedSliceExtra(tok, .preserve_macro_ws); - try pp.char_buf.appendSlice(str); - if (embed_args) |some| { - if ((i == 0 and tok.id == .string_literal) or tok.id == .angle_bracket_right) { - some.* = params[i + 1 ..]; - break; - } - } - } - - const include_str = pp.char_buf.items[char_top..]; - if (include_str.len < 3) { - if (include_str.len == 0) { - try pp.comp.addDiagnostic(.{ - .tag = .expected_filename, - .loc = first.loc, - }, first.expansionSlice()); - return null; - } - try pp.comp.addDiagnostic(.{ - .tag = .empty_filename, - .loc = params[0].loc, - }, params[0].expansionSlice()); - return null; - } - - switch (include_str[0]) { - '<' => { - if (include_str[include_str.len - 1] != '>') { - // Ugly hack to find out where the '>' should go, since we don't have the closing ')' location - const start = params[0].loc; - try pp.comp.addDiagnostic(.{ - .tag = .header_str_closing, - .loc = .{ .id = start.id, .byte_offset = start.byte_offset + @as(u32, @intCast(include_str.len)) + 1, .line = start.line }, - }, params[0].expansionSlice()); - try pp.comp.addDiagnostic(.{ - .tag = .header_str_match, - .loc = params[0].loc, - }, params[0].expansionSlice()); - return null; - } - return include_str; - }, - '"' => return include_str, - else => { - try pp.comp.addDiagnostic(.{ - .tag = .expected_filename, - .loc = params[0].loc, - }, params[0].expansionSlice()); - return null; - }, - } +fn tokenBufferStashReverse(pp: *Preprocessor, tokens: []const PreprocessorToken) !void { + try pp.expansion_bufs.append(pp.gpa, .{}); + try pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].appendSlice(pp.gpa, tokens); + std.mem.reverse(PreprocessorToken, pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items); } -fn handleBuiltinMacro(pp: *Preprocessor, builtin: RawToken.Id, param_toks: []const TokenWithExpansionLocs, src_loc: Source.Location) Error!bool { - switch (builtin) { - .macro_param_has_attribute, - .macro_param_has_declspec_attribute, - .macro_param_has_feature, - .macro_param_has_extension, - .macro_param_has_builtin, - => { - var invalid: ?TokenWithExpansionLocs = null; - var identifier: ?TokenWithExpansionLocs = null; - for (param_toks) |tok| { - if (tok.id == .macro_ws) continue; - if (tok.id == .comment) continue; - if (!tok.id.isMacroIdentifier()) { - invalid = tok; - 
break; - } - if (identifier) |_| invalid = tok else identifier = tok; - } - if (identifier == null and invalid == null) invalid = .{ .id = .eof, .loc = src_loc }; - if (invalid) |some| { - try pp.comp.addDiagnostic( - .{ .tag = .feature_check_requires_identifier, .loc = some.loc }, - some.expansionSlice(), - ); - return false; - } - - const ident_str = pp.expandedSlice(identifier.?); - return switch (builtin) { - .macro_param_has_attribute => Attribute.fromString(.gnu, null, ident_str) != null, - .macro_param_has_declspec_attribute => { - return if (pp.comp.langopts.declspec_attrs) - Attribute.fromString(.declspec, null, ident_str) != null - else - false; - }, - .macro_param_has_feature => features.hasFeature(pp.comp, ident_str), - .macro_param_has_extension => features.hasExtension(pp.comp, ident_str), - .macro_param_has_builtin => pp.comp.hasBuiltin(ident_str), - else => unreachable, - }; - }, - .macro_param_has_warning => { - const actual_param = pp.pasteStringsUnsafe(param_toks) catch |er| switch (er) { - error.ExpectedStringLiteral => { - try pp.errStr(param_toks[0], .expected_str_literal_in, "__has_warning"); - return false; - }, - else => |e| return e, - }; - if (!mem.startsWith(u8, actual_param, "-W")) { - try pp.errStr(param_toks[0], .malformed_warning_check, "__has_warning"); - return false; - } - const warning_name = actual_param[2..]; - return Diagnostics.warningExists(warning_name); - }, - .macro_param_is_identifier => { - var invalid: ?TokenWithExpansionLocs = null; - var identifier: ?TokenWithExpansionLocs = null; - for (param_toks) |tok| switch (tok.id) { - .macro_ws => continue, - .comment => continue, - else => { - if (identifier) |_| invalid = tok else identifier = tok; - }, - }; - if (identifier == null and invalid == null) invalid = .{ .id = .eof, .loc = src_loc }; - if (invalid) |some| { - try pp.comp.addDiagnostic(.{ - .tag = .missing_tok_builtin, - .loc = some.loc, - .extra = .{ .tok_id_expected = .r_paren }, - }, some.expansionSlice()); - return false; - } +fn tokenBufferUnstash(pp: *Preprocessor) void { + var buf = pp.expansion_bufs.pop(); + buf.deinit(pp.gpa); +} - const id = identifier.?.id; - return id == .identifier or id == .extended_identifier; - }, - .macro_param_has_include, .macro_param_has_include_next => { - const include_str = (try pp.reconstructIncludeString(param_toks, null, param_toks[0])) orelse return false; - const include_type: Compilation.IncludeType = switch (include_str[0]) { - '"' => .quotes, - '<' => .angle_brackets, - else => unreachable, - }; - const filename = include_str[1 .. 
include_str.len - 1]; - if (builtin == .macro_param_has_include or pp.include_depth == 0) { - if (builtin == .macro_param_has_include_next) { - try pp.comp.addDiagnostic(.{ - .tag = .include_next_outside_header, - .loc = src_loc, - }, &.{}); - } - return pp.comp.hasInclude(filename, src_loc.id, include_type, .first); - } - return pp.comp.hasInclude(filename, src_loc.id, include_type, .next); - }, - else => unreachable, +fn expandAll(pp: *Preprocessor, tokens: []const PreprocessorToken, tmpl: PreprocessorToken) ![]const PreprocessorToken { + try pp.tokenBufferStashReverse(tokens); + defer pp.tokenBufferUnstash(); + var r: TokenList = .{}; + defer r.deinit(pp.gpa); + while (true) { + const tok = try pp.readExpand(); + if (tok.id == .eof) break; + try r.append(pp.gpa, tok); } + pp.propagateSpace(r.items, tmpl); + return r.toOwnedSlice(pp.gpa); +} + +fn peekToken(pp: *Preprocessor) !PreprocessorToken { + const tok = try pp.readToken(); + try pp.ungetToken(tok); + return tok; } -/// Treat whitespace-only paste arguments as empty -fn getPasteArgs(args: []const TokenWithExpansionLocs) []const TokenWithExpansionLocs { - for (args) |tok| { - if (tok.id != .macro_ws) return args; +/// Return a string with the same contents as `name` and whose lifetime is the same as the preprocessor's lifetime +/// If `tok` is not from the generated source, this is just `name`. +/// If `tok` is from the generated source, pointers are invalidated when the underlying ArrayList is resized. Therefore, +/// duplicate the string and store it (so we aren't repeatedly copying the same string) +fn getSafeString(pp: *Preprocessor, tok: PreprocessorToken, name: []const u8) ![]const u8 { + if (tok.loc.id != .generated) return name; + const gop = try pp.safe_strings.getOrPut(pp.gpa, name); + if (!gop.found_existing) { + const copy = try pp.arena.allocator().dupe(u8, name); + gop.key_ptr.* = copy; } - return &[1]TokenWithExpansionLocs{.{ - .id = .placemarker, - .loc = .{ .id = .generated, .byte_offset = 0, .line = 0 }, - }}; + return gop.key_ptr.*; } -fn expandFuncMacro( - pp: *Preprocessor, - macro_tok: TokenWithExpansionLocs, - func_macro: *const Macro, - args: *const MacroArguments, - expanded_args: *const MacroArguments, - hideset_arg: Hideset.Index, -) MacroError!ExpandBuf { - var hideset = hideset_arg; - var buf = ExpandBuf.init(pp.gpa); - try buf.ensureTotalCapacity(func_macro.tokens.len); - errdefer buf.deinit(); - - var expanded_variable_arguments = ExpandBuf.init(pp.gpa); - defer expanded_variable_arguments.deinit(); - var variable_arguments = ExpandBuf.init(pp.gpa); - defer variable_arguments.deinit(); - - if (func_macro.var_args) { - var i: usize = func_macro.params.len; - while (i < expanded_args.items.len) : (i += 1) { - try variable_arguments.appendSlice(args.items[i]); - try expanded_variable_arguments.appendSlice(expanded_args.items[i]); - if (i != expanded_args.items.len - 1) { - const comma = TokenWithExpansionLocs{ .id = .comma, .loc = .{ .id = .generated } }; - try variable_arguments.append(comma); - try expanded_variable_arguments.append(comma); - } +fn injectSpace(pp: *Preprocessor) void { + var i = pp.expansion_bufs.items.len; + while (i > 0) : (i -= 1) { + var j = pp.expansion_bufs.items[i - 1].items.len; + while (j > 0) : (j -= 1) { + pp.expansion_bufs.items[i - 1].items[j - 1].flags.space = true; + return; } } +} - // token concatenation and expansion phase - var tok_i: usize = 0; - while (tok_i < func_macro.tokens.len) : (tok_i += 1) { - const raw = func_macro.tokens[tok_i]; - switch (raw.id) { - 
.hash_hash => while (tok_i + 1 < func_macro.tokens.len) { - const raw_next = func_macro.tokens[tok_i + 1]; - tok_i += 1; - - var va_opt_buf = ExpandBuf.init(pp.gpa); - defer va_opt_buf.deinit(); - - const next = switch (raw_next.id) { - .macro_ws => continue, - .hash_hash => continue, - .comment => if (!pp.comp.langopts.preserve_comments_in_macros) - continue - else - &[1]TokenWithExpansionLocs{tokFromRaw(raw_next)}, - .macro_param, .macro_param_no_expand => getPasteArgs(args.items[raw_next.end]), - .keyword_va_args => variable_arguments.items, - .keyword_va_opt => blk: { - try pp.expandVaOpt(&va_opt_buf, raw_next, variable_arguments.items.len != 0); - if (va_opt_buf.items.len == 0) break; - break :blk va_opt_buf.items; - }, - else => &[1]TokenWithExpansionLocs{tokFromRaw(raw_next)}, - }; - try pp.pasteTokens(&buf, next); - if (next.len != 0) break; - }, - .macro_param_no_expand => { - if (tok_i + 1 < func_macro.tokens.len and func_macro.tokens[tok_i + 1].id == .hash_hash) { - hideset = pp.hideset.get(tokFromRaw(func_macro.tokens[tok_i + 1]).loc); - } - const slice = getPasteArgs(args.items[raw.end]); - const raw_loc = Source.Location{ .id = raw.source, .byte_offset = raw.start, .line = raw.line }; - try bufCopyTokens(&buf, slice, &.{raw_loc}); - }, - .macro_param => { - if (tok_i + 1 < func_macro.tokens.len and func_macro.tokens[tok_i + 1].id == .hash_hash) { - hideset = pp.hideset.get(tokFromRaw(func_macro.tokens[tok_i + 1]).loc); - } - const arg = expanded_args.items[raw.end]; - const raw_loc = Source.Location{ .id = raw.source, .byte_offset = raw.start, .line = raw.line }; - try bufCopyTokens(&buf, arg, &.{raw_loc}); - }, - .keyword_va_args => { - const raw_loc = Source.Location{ .id = raw.source, .byte_offset = raw.start, .line = raw.line }; - try bufCopyTokens(&buf, expanded_variable_arguments.items, &.{raw_loc}); - }, - .keyword_va_opt => { - try pp.expandVaOpt(&buf, raw, variable_arguments.items.len != 0); - }, - .stringify_param, .stringify_va_args => { - const arg = if (raw.id == .stringify_va_args) - variable_arguments.items - else - args.items[raw.end]; - - pp.char_buf.clearRetainingCapacity(); - try pp.stringify(arg); - - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.appendSlice(pp.gpa, pp.char_buf.items); +fn readExpandNewline(pp: *Preprocessor) Error!PreprocessorToken { + const tok = pp.getToken(); + if (!tok.id.isMacroIdentifier()) return tok; + const name = pp.tokSlice(tok); + const macro = pp.defines.getPtr(name) orelse return tok; - try buf.append(try pp.makeGeneratedToken(start, .string_literal, tokFromRaw(raw))); - }, - .macro_param_has_attribute, - .macro_param_has_declspec_attribute, - .macro_param_has_warning, - .macro_param_has_feature, - .macro_param_has_extension, - .macro_param_has_builtin, - .macro_param_has_include, - .macro_param_has_include_next, - .macro_param_is_identifier, - => { - const arg = expanded_args.items[0]; - const result = if (arg.len == 0) blk: { - const extra = Diagnostics.Message.Extra{ .arguments = .{ .expected = 1, .actual = 0 } }; - try pp.comp.addDiagnostic(.{ .tag = .expected_arguments, .loc = macro_tok.loc, .extra = extra }, &.{}); - break :blk false; - } else try pp.handleBuiltinMacro(raw.id, arg, macro_tok.loc); - const start = pp.comp.generated_buf.items.len; - const w = pp.comp.generated_buf.writer(pp.gpa); - try w.print("{}\n", .{@intFromBool(result)}); - try buf.append(try pp.makeGeneratedToken(start, .pp_num, tokFromRaw(raw))); - }, - .macro_param_has_c_attribute => { - const arg = 
expanded_args.items[0]; - const not_found = "0\n"; - const result = if (arg.len == 0) blk: { - const extra = Diagnostics.Message.Extra{ .arguments = .{ .expected = 1, .actual = 0 } }; - try pp.comp.addDiagnostic(.{ .tag = .expected_arguments, .loc = macro_tok.loc, .extra = extra }, &.{}); - break :blk not_found; - } else res: { - var invalid: ?TokenWithExpansionLocs = null; - var vendor_ident: ?TokenWithExpansionLocs = null; - var colon_colon: ?TokenWithExpansionLocs = null; - var attr_ident: ?TokenWithExpansionLocs = null; - for (arg) |tok| { - if (tok.id == .macro_ws) continue; - if (tok.id == .comment) continue; - if (tok.id == .colon_colon) { - if (colon_colon != null or attr_ident == null) { - invalid = tok; - break; - } - vendor_ident = attr_ident; - attr_ident = null; - colon_colon = tok; - continue; - } - if (!tok.id.isMacroIdentifier()) { - invalid = tok; - break; - } - if (attr_ident) |_| { - invalid = tok; - break; - } else attr_ident = tok; - } - if (vendor_ident != null and attr_ident == null) { - invalid = vendor_ident; - } else if (attr_ident == null and invalid == null) { - invalid = .{ .id = .eof, .loc = macro_tok.loc }; - } - if (invalid) |some| { - try pp.comp.addDiagnostic( - .{ .tag = .feature_check_requires_identifier, .loc = some.loc }, - some.expansionSlice(), - ); - break :res not_found; - } - if (vendor_ident) |some| { - const vendor_str = pp.expandedSlice(some); - const attr_str = pp.expandedSlice(attr_ident.?); - const exists = Attribute.fromString(.gnu, vendor_str, attr_str) != null; - - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.appendSlice(pp.gpa, if (exists) "1\n" else "0\n"); - try buf.append(try pp.makeGeneratedToken(start, .pp_num, tokFromRaw(raw))); - continue; - } - if (!pp.comp.langopts.standard.atLeast(.c23)) break :res not_found; - - const attrs = std.StaticStringMap([]const u8).initComptime(.{ - .{ "deprecated", "201904L\n" }, - .{ "fallthrough", "201904L\n" }, - .{ "maybe_unused", "201904L\n" }, - .{ "nodiscard", "202003L\n" }, - .{ "noreturn", "202202L\n" }, - .{ "_Noreturn", "202202L\n" }, - .{ "unsequenced", "202207L\n" }, - .{ "reproducible", "202207L\n" }, - }); - - const attr_str = Attribute.normalize(pp.expandedSlice(attr_ident.?)); - break :res attrs.get(attr_str) orelse not_found; - }; - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.appendSlice(pp.gpa, result); - try buf.append(try pp.makeGeneratedToken(start, .pp_num, tokFromRaw(raw))); - }, - .macro_param_has_embed => { - const arg = expanded_args.items[0]; - const not_found = "0\n"; - const result = if (arg.len == 0) blk: { - const extra = Diagnostics.Message.Extra{ .arguments = .{ .expected = 1, .actual = 0 } }; - try pp.comp.addDiagnostic(.{ .tag = .expected_arguments, .loc = macro_tok.loc, .extra = extra }, &.{}); - break :blk not_found; - } else res: { - var embed_args: []const TokenWithExpansionLocs = &.{}; - const include_str = (try pp.reconstructIncludeString(arg, &embed_args, arg[0])) orelse - break :res not_found; - - var prev = tokFromRaw(raw); - prev.id = .eof; - var it: struct { - i: u32 = 0, - slice: []const TokenWithExpansionLocs, - prev: TokenWithExpansionLocs, - fn next(it: *@This()) TokenWithExpansionLocs { - while (it.i < it.slice.len) switch (it.slice[it.i].id) { - .macro_ws, .whitespace => it.i += 1, - else => break, - } else return it.prev; - defer it.i += 1; - it.prev = it.slice[it.i]; - it.prev.id = .eof; - return it.slice[it.i]; - } - } = .{ .slice = embed_args, .prev = prev }; - - while (true) { - 
const param_first = it.next(); - if (param_first.id == .eof) break; - if (param_first.id != .identifier) { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_param, .loc = param_first.loc }, - param_first.expansionSlice(), - ); - continue; - } + const macro_hideset = tok.hideset; + if (pp.treap.contains(macro_hideset, name)) return tok; - const char_top = pp.char_buf.items.len; - defer pp.char_buf.items.len = char_top; - - const maybe_colon = it.next(); - const param = switch (maybe_colon.id) { - .colon_colon => blk: { - // vendor::param - const param = it.next(); - if (param.id != .identifier) { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_param, .loc = param.loc }, - param.expansionSlice(), - ); - continue; - } - const l_paren = it.next(); - if (l_paren.id != .l_paren) { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_param, .loc = l_paren.loc }, - l_paren.expansionSlice(), - ); - continue; - } - break :blk "doesn't exist"; - }, - .l_paren => Attribute.normalize(pp.expandedSlice(param_first)), - else => { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_param, .loc = maybe_colon.loc }, - maybe_colon.expansionSlice(), - ); - continue; - }, - }; - - var arg_count: u32 = 0; - var first_arg: TokenWithExpansionLocs = undefined; - while (true) { - const next = it.next(); - if (next.id == .eof) { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_limit, .loc = param_first.loc }, - param_first.expansionSlice(), - ); - break; - } - if (next.id == .r_paren) break; - arg_count += 1; - if (arg_count == 1) first_arg = next; - } + switch (macro.kind) { + .object => { + const safe_name = try pp.getSafeString(tok, name); + const new_hideset = try pp.treap.addNodeTo(tok.hideset, safe_name); - if (std.mem.eql(u8, param, "limit")) { - if (arg_count != 1) { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_limit, .loc = param_first.loc }, - param_first.expansionSlice(), - ); - continue; - } - if (first_arg.id != .pp_num) { - try pp.comp.addDiagnostic( - .{ .tag = .malformed_embed_limit, .loc = param_first.loc }, - param_first.expansionSlice(), - ); - continue; - } - _ = std.fmt.parseInt(u32, pp.expandedSlice(first_arg), 10) catch { - break :res not_found; - }; - } else if (!std.mem.eql(u8, param, "prefix") and !std.mem.eql(u8, param, "suffix") and - !std.mem.eql(u8, param, "if_empty")) - { - break :res not_found; - } - } + const tokens = try pp.subst(macro, tok, MacroArgList.empty, new_hideset); + defer pp.gpa.free(tokens); + pp.propagateSpace(tokens, tok); + try pp.ungetAll(tokens); + return pp.readExpand(); + }, + .func => { + if (!try pp.next(.l_paren)) return tok; + const arg_tokens_start = pp.macro_arg_tokens.items.len; + defer pp.macro_arg_tokens.items.len = arg_tokens_start; + const macro_args_start = pp.macro_args.items.len; + defer pp.macro_args.items.len = macro_args_start; + + const args = pp.readArgs(tok, macro) catch |err| switch (err) { + error.IncorrectArgumentCount => return PreprocessorToken.zero, + error.UnterminatedMacroArgumentList => { + try pp.errTok(tok, .unterminated_macro_arg_list); + return PreprocessorToken.zero; + }, + else => |e| return e, + }; + const r_paren = pp.getToken(); + std.debug.assert(r_paren.id == .r_paren); + const safe_name = try pp.getSafeString(tok, name); + + const intersection = try pp.treap.intersection(macro_hideset, r_paren.hideset); + const hideset = try pp.treap.addNodeTo(intersection, safe_name); + const tokens = try pp.subst(macro, tok, args, hideset); + defer pp.gpa.free(tokens); + 
pp.propagateSpace(tokens, tok); + try pp.ungetAll(tokens); + return pp.readExpand(); + }, + .special => |func| { + try func(pp, tok); + return pp.readExpand(); + }, + } +} - const include_type: Compilation.IncludeType = switch (include_str[0]) { - '"' => .quotes, - '<' => .angle_brackets, - else => unreachable, - }; - const filename = include_str[1 .. include_str.len - 1]; - const contents = (try pp.comp.findEmbed(filename, arg[0].loc.id, include_type, 1)) orelse - break :res not_found; - - defer pp.comp.gpa.free(contents); - break :res if (contents.len != 0) "1\n" else "2\n"; - }; - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.appendSlice(pp.comp.gpa, result); - try buf.append(try pp.makeGeneratedToken(start, .pp_num, tokFromRaw(raw))); - }, - .macro_param_pragma_operator => { - const param_toks = expanded_args.items[0]; - // Clang and GCC require exactly one token (so, no parentheses or string pasting) - // even though their error messages indicate otherwise. Ours is slightly more - // descriptive. - var invalid: ?TokenWithExpansionLocs = null; - var string: ?TokenWithExpansionLocs = null; - for (param_toks) |tok| switch (tok.id) { - .string_literal => { - if (string) |_| invalid = tok else string = tok; - }, - .macro_ws => continue, - .comment => continue, - else => { - invalid = tok; - break; - }, - }; - if (string == null and invalid == null) invalid = .{ .loc = macro_tok.loc, .id = .eof }; - if (invalid) |some| try pp.comp.addDiagnostic( - .{ .tag = .pragma_operator_string_literal, .loc = some.loc }, - some.expansionSlice(), - ) else try pp.pragmaOperator(string.?, macro_tok.loc); - }, - .comma => { - if (tok_i + 2 < func_macro.tokens.len and func_macro.tokens[tok_i + 1].id == .hash_hash) { - const hash_hash = func_macro.tokens[tok_i + 1]; - var maybe_va_args = func_macro.tokens[tok_i + 2]; - var consumed: usize = 2; - if (maybe_va_args.id == .macro_ws and tok_i + 3 < func_macro.tokens.len) { - consumed = 3; - maybe_va_args = func_macro.tokens[tok_i + 3]; - } - if (maybe_va_args.id == .keyword_va_args) { - // GNU extension: `, ##__VA_ARGS__` deletes the comma if __VA_ARGS__ is empty - tok_i += consumed; - if (func_macro.params.len == expanded_args.items.len) { - // Empty __VA_ARGS__, drop the comma - try pp.err(hash_hash, .comma_deletion_va_args); - } else if (func_macro.params.len == 0 and expanded_args.items.len == 1 and expanded_args.items[0].len == 0) { - // Ambiguous whether this is "empty __VA_ARGS__" or "__VA_ARGS__ omitted" - if (pp.comp.langopts.standard.isGNU()) { - // GNU standard, drop the comma - try pp.err(hash_hash, .comma_deletion_va_args); - } else { - // C standard, retain the comma - try buf.append(tokFromRaw(raw)); - } - } else { - try buf.append(tokFromRaw(raw)); - if (expanded_variable_arguments.items.len > 0 or variable_arguments.items.len == func_macro.params.len) { - try pp.err(hash_hash, .comma_deletion_va_args); - } - const raw_loc = Source.Location{ - .id = maybe_va_args.source, - .byte_offset = maybe_va_args.start, - .line = maybe_va_args.line, - }; - try bufCopyTokens(&buf, expanded_variable_arguments.items, &.{raw_loc}); - } - continue; - } - } - // Regular comma, no token pasting with __VA_ARGS__ - try buf.append(tokFromRaw(raw)); - }, - else => try buf.append(tokFromRaw(raw)), +fn readMacroArg(pp: *Preprocessor, end: *bool, readall: bool) !MacroArg { + var level: i32 = 0; + const start: u32 = @intCast(pp.macro_arg_tokens.items.len); + while (true) { + var tok = pp.getToken(); + if (tok.id == .eof) { + return 
error.UnterminatedMacroArgumentList; + } + if (tok.id == .nl) continue; + if (tok.flags.is_bol and tok.id == .hash) { + try pp.readDirective(); + continue; + } + if (level == 0 and tok.id == .r_paren) { + try pp.ungetToken(tok); + end.* = true; + break; + } + if (level == 0 and tok.id == .comma and !readall) { + break; + } + if (tok.id == .l_paren) { + level += 1; + } + if (tok.id == .r_paren) { + level -= 1; + } + if (tok.flags.is_bol) { + tok.flags = .{ .is_bol = false, .space = true }; } + try pp.macro_arg_tokens.append(pp.gpa, tok); } - removePlacemarkers(&buf); + return .{ .start = start, .end = @intCast(pp.macro_arg_tokens.items.len) }; +} - const macro_expansion_locs = macro_tok.expansionSlice(); - for (buf.items) |*tok| { - try tok.addExpansionLocation(pp.gpa, &.{macro_tok.loc}); - try tok.addExpansionLocation(pp.gpa, macro_expansion_locs); - const tok_hidelist = pp.hideset.get(tok.loc); - const new_hidelist = try pp.hideset.@"union"(tok_hidelist, hideset); - try pp.hideset.put(tok.loc, new_hidelist); +fn doReadArgs(pp: *Preprocessor, macro: *const Macro) !MacroArgList { + const start: u32 = @intCast(pp.macro_args.items.len); + var end = false; + while (!end) { + const in_ellipsis = macro.var_args and (pp.macro_args.items.len - start) + 1 == macro.nargs; + const arg_range = try pp.readMacroArg(&end, in_ellipsis); + try pp.macro_args.append(pp.gpa, arg_range); + } + if (macro.var_args and (pp.macro_args.items.len - start) + 1 == macro.nargs) { + try pp.macro_args.append(pp.gpa, MacroArg.empty); } + return .{ .start = start, .end = @intCast(pp.macro_args.items.len) }; +} - return buf; +fn readArgs(pp: *Preprocessor, ident: PreprocessorToken, macro: *const Macro) !MacroArgList { + if (macro.nargs == 0 and (try pp.peekToken()).id == .r_paren) { + return MacroArgList.empty; + } + const args = try pp.doReadArgs(macro); + if (args.len() != macro.nargs) { + const extra = Diagnostics.Message.Extra{ + .arguments = .{ .expected = @intCast(macro.nargs), .actual = @intCast(args.len()) }, + }; + try pp.comp.addDiagnostic( + .{ .tag = .expected_arguments, .loc = ident.loc, .extra = extra }, + &.{}, // TODO: expansion slice + ); + return error.IncorrectArgumentCount; + } + return args; } -fn expandVaOpt( - pp: *Preprocessor, - buf: *ExpandBuf, - raw: RawToken, - should_expand: bool, -) !void { - if (!should_expand) return; - - const source = pp.comp.getSource(raw.source); - var tokenizer: Tokenizer = .{ - .buf = source.buf, - .index = raw.start, - .source = raw.source, - .langopts = pp.comp.langopts, - .line = raw.line, - }; - while (tokenizer.index < raw.end) { - const tok = tokenizer.next(); - try buf.append(tokFromRaw(tok)); +fn readExpand(pp: *Preprocessor) Error!PreprocessorToken { + while (true) { + const tok = try pp.readExpandNewline(); + if (tok.id != .nl) return tok; + } +} + +/// # number "file" flags +/// TODO: validate that the pp_num token is solely digits +/// if not, emit `GNU line marker directive requires a simple digit sequence` +fn readLinemarker(pp: *Preprocessor) !void { + const name = pp.getToken(); + if (name.id.isDirectiveEnd()) return; + if (name.id != .string_literal) try pp.errTok(name, .line_invalid_filename); + + const flag_1 = pp.getToken(); + if (flag_1.id.isDirectiveEnd()) return; + const flag_2 = pp.getToken(); + if (flag_2.id.isDirectiveEnd()) return; + const flag_3 = pp.getToken(); + if (flag_3.id.isDirectiveEnd()) return; + const flag_4 = pp.getToken(); + if (flag_4.id.isDirectiveEnd()) return; + try pp.expectNewline(); +} + +fn readIdent(pp: *Preprocessor) 
!?PreprocessorToken { + const tok = pp.getToken(); + if (!tok.id.isMacroIdentifier()) { + try pp.errTok(tok, .macro_name_must_be_identifier); + return null; } + return tok; } -fn bufCopyTokens(buf: *ExpandBuf, tokens: []const TokenWithExpansionLocs, src: []const Source.Location) !void { - try buf.ensureUnusedCapacity(tokens.len); - for (tokens) |tok| { - var copy = try tok.dupe(buf.allocator); - errdefer TokenWithExpansionLocs.free(copy.expansion_locs, buf.allocator); - try copy.addExpansionLocation(buf.allocator, src); - buf.appendAssumeCapacity(copy); +fn ungetToken(pp: *Preprocessor, tok: PreprocessorToken) !void { + if (tok.id == .eof) return; + if (pp.isBufferEmpty()) { + try pp.expansion_bufs.append(pp.gpa, .{}); } + try pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].append(pp.gpa, tok); } -fn nextBufToken( - pp: *Preprocessor, - tokenizer: *Tokenizer, - buf: *ExpandBuf, - start_idx: *usize, - end_idx: *usize, - extend_buf: bool, -) Error!TokenWithExpansionLocs { - start_idx.* += 1; - if (start_idx.* == buf.items.len and start_idx.* >= end_idx.*) { - if (extend_buf) { - const raw_tok = tokenizer.next(); - if (raw_tok.id.isMacroIdentifier() and - pp.poisoned_identifiers.get(pp.tokSlice(raw_tok)) != null) - try pp.err(raw_tok, .poisoned_identifier); - - if (raw_tok.id == .nl) pp.add_expansion_nl += 1; - - const new_tok = tokFromRaw(raw_tok); - end_idx.* += 1; - try buf.append(new_tok); - return new_tok; - } else { - return TokenWithExpansionLocs{ .id = .eof, .loc = .{ .id = .generated } }; - } - } else { - return buf.items[start_idx.*]; +fn hashHashCheck(pp: *Preprocessor, toks: []const PreprocessorToken) !void { + if (toks.len == 0) return; + if (toks[0].id == .hash_hash) { + return pp.errTok(toks[0], .hash_hash_at_start); + } + if (toks[toks.len - 1].id == .hash_hash) { + return pp.errTok(toks[toks.len - 1], .hash_hash_at_end); } } -fn collectMacroFuncArguments( - pp: *Preprocessor, - tokenizer: *Tokenizer, - buf: *ExpandBuf, - start_idx: *usize, - end_idx: *usize, - extend_buf: bool, - is_builtin: bool, - r_paren: *TokenWithExpansionLocs, -) !MacroArguments { - const name_tok = buf.items[start_idx.*]; - const saved_tokenizer = tokenizer.*; - const old_end = end_idx.*; +fn readObjMacro(pp: *Preprocessor, name: PreprocessorToken) !void { + var body: TokenList = .{}; + errdefer body.deinit(pp.gpa); while (true) { - const tok = try nextBufToken(pp, tokenizer, buf, start_idx, end_idx, extend_buf); + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) break; switch (tok.id) { - .nl, .whitespace, .macro_ws => {}, - .l_paren => break, - else => { - if (is_builtin) { - try pp.errStr(name_tok, .missing_lparen_after_builtin, pp.expandedSlice(name_tok)); - } - // Not a macro function call, go over normal identifier, rewind - tokenizer.* = saved_tokenizer; - end_idx.* = old_end; - return error.MissingLParen; - }, + .unterminated_comment => try pp.errTok(tok, .unterminated_comment), + else => try body.append(pp.gpa, tok), } } + try pp.hashHashCheck(body.items); + const macro: Macro = .{ + .tokens = body.items, + .var_args = false, + .loc = name.loc, + .kind = .object, + .nargs = undefined, + }; + try pp.defineMacro(name, macro); +} - // collect the arguments. 
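// Illustrative sketch, not part of the patch: the old argument-collection loop
// that begins below is replaced by readMacroArg/readArgs earlier in this file.
// The key rule is the same in both: a comma only terminates an argument at
// parenthesis depth zero, so `M(a, (b, c))` passes two arguments, not three.
// The depth-tracking idea over a plain string (countArgs is illustrative only):
const std = @import("std");

fn countArgs(call: []const u8) usize {
    var depth: usize = 0;
    var args: usize = 1;
    for (call) |c| switch (c) {
        '(' => depth += 1,
        ')' => depth -= 1,
        ',' => {
            if (depth == 0) args += 1;
        },
        else => {},
    };
    return args;
}

test "commas inside parentheses do not split arguments" {
    try std.testing.expectEqual(@as(usize, 2), countArgs("a, (b, c)"));
    try std.testing.expectEqual(@as(usize, 3), countArgs("a, b, c"));
}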
- var parens: u32 = 0; - var args = MacroArguments.init(pp.gpa); - errdefer deinitMacroArguments(pp.gpa, &args); - var curArgument = std.ArrayList(TokenWithExpansionLocs).init(pp.gpa); - defer curArgument.deinit(); - while (true) { - var tok = try nextBufToken(pp, tokenizer, buf, start_idx, end_idx, extend_buf); - tok.flags.is_macro_arg = true; - switch (tok.id) { - .comma => { - if (parens == 0) { - const owned = try curArgument.toOwnedSlice(); - errdefer pp.gpa.free(owned); - try args.append(owned); - } else { - const duped = try tok.dupe(pp.gpa); - errdefer TokenWithExpansionLocs.free(duped.expansion_locs, pp.gpa); - try curArgument.append(duped); - } - }, - .l_paren => { - const duped = try tok.dupe(pp.gpa); - errdefer TokenWithExpansionLocs.free(duped.expansion_locs, pp.gpa); - try curArgument.append(duped); - parens += 1; - }, - .r_paren => { - if (parens == 0) { - const owned = try curArgument.toOwnedSlice(); - errdefer pp.gpa.free(owned); - try args.append(owned); - r_paren.* = tok; - break; - } else { - const duped = try tok.dupe(pp.gpa); - errdefer TokenWithExpansionLocs.free(duped.expansion_locs, pp.gpa); - try curArgument.append(duped); - parens -= 1; - } - }, - .eof => { - { - const owned = try curArgument.toOwnedSlice(); - errdefer pp.gpa.free(owned); - try args.append(owned); - } - tokenizer.* = saved_tokenizer; - try pp.comp.addDiagnostic( - .{ .tag = .unterminated_macro_arg_list, .loc = name_tok.loc }, - name_tok.expansionSlice(), - ); - return error.Unterminated; - }, - .nl, .whitespace => { - try curArgument.append(.{ .id = .macro_ws, .loc = tok.loc }); - }, - else => { - const duped = try tok.dupe(pp.gpa); - errdefer TokenWithExpansionLocs.free(duped.expansion_locs, pp.gpa); - try curArgument.append(duped); - }, +/// Defines a new macro and warns if it is a duplicate +fn defineMacro(pp: *Preprocessor, name_tok: PreprocessorToken, macro: Macro) Error!void { + const name_str = pp.tokSlice(name_tok); + const gop = try pp.defines.getOrPut(pp.gpa, name_str); + if (gop.found_existing and !gop.value_ptr.eql(macro, pp)) { + const tag: Diagnostics.Tag = if (gop.value_ptr.kind == .special) .builtin_macro_redefined else .macro_redefined; + const start = pp.comp.diagnostics.list.items.len; + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = name_tok.loc, + .extra = .{ .str = name_str }, + }, &.{}); + if (gop.value_ptr.kind != .special and pp.comp.diagnostics.list.items.len != start) { + try pp.comp.addDiagnostic(.{ + .tag = .previous_definition, + .loc = gop.value_ptr.loc, + }, &.{}); } } - - return args; + gop.value_ptr.* = macro; } -fn removeExpandedTokens(pp: *Preprocessor, buf: *ExpandBuf, start: usize, len: usize, moving_end_idx: *usize) !void { - for (buf.items[start .. start + len]) |tok| TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - try buf.replaceRange(start, len, &.{}); - moving_end_idx.* -|= len; +/// Get raw token source string. +/// Returned slice is invalidated when comp.generated_buf is updated. +pub fn tokSlice(pp: *Preprocessor, token: anytype) []const u8 { + if (token.id.lexeme()) |some| return some; + const source = pp.comp.getSource(token.loc.id); + var tmp_tokenizer = Tokenizer{ + .buf = source.buf, + .langopts = pp.comp.langopts, + .index = token.loc.byte_offset, + .source = .generated, + }; + const tok = tmp_tokenizer.next(); + return tmp_tokenizer.buf[tok.start..tok.end]; } -/// The behavior of `defined` depends on whether we are in a preprocessor -/// expression context (#if or #elif) or not. 
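// Illustrative sketch, not part of the patch: defineMacro above only diagnoses a
// redefinition when the new macro is not token-for-token equal to the old one,
// and then also points at the previous definition:
//
//     #define N 1
//     #define N 1   // accepted, definitions compare equal
//     #define N 2   // warning: 'N' macro redefined; note: previous definition
//
// The getOrPut-then-overwrite shape it uses, reduced to integers:
const std = @import("std");

fn defineSketch(gpa: std.mem.Allocator, defines: *std.StringHashMapUnmanaged(u32), name: []const u8, value: u32) !bool {
    const gop = try defines.getOrPut(gpa, name);
    const redefined = gop.found_existing and gop.value_ptr.* != value;
    gop.value_ptr.* = value; // the newest definition always wins
    return redefined; // caller would emit the macro_redefined warning
}

test "redefinition detection" {
    const gpa = std.testing.allocator;
    var defines: std.StringHashMapUnmanaged(u32) = .{};
    defer defines.deinit(gpa);
    try std.testing.expect(!try defineSketch(gpa, &defines, "N", 1));
    try std.testing.expect(!try defineSketch(gpa, &defines, "N", 1));
    try std.testing.expect(try defineSketch(gpa, &defines, "N", 2));
}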
-/// In a non-expression context it's just an identifier. Within a preprocessor -/// expression it is a unary operator or one-argument function. -const EvalContext = enum { - expr, - non_expr, -}; - -/// Helper for safely iterating over a slice of tokens while skipping whitespace -const TokenIterator = struct { - toks: []const TokenWithExpansionLocs, - i: usize, +fn expect(pp: *Preprocessor, expected: Tokenizer.Token.Id, tag: Diagnostics.Tag) !PreprocessorToken { + const tok = pp.getToken(); + if (tok.id != expected) { + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = tok.loc, + .extra = .{ .none = {} }, + }, &.{}); // todo expansion slice + try pp.errTok(tok, tag); + } + return tok; +} - fn init(toks: []const TokenWithExpansionLocs) TokenIterator { - return .{ .toks = toks, .i = 0 }; +fn expectLParen(pp: *Preprocessor, tok: PreprocessorToken) !PreprocessorToken { + const l_paren = pp.getToken(); + if (l_paren.id != .l_paren) { + try pp.comp.addDiagnostic(.{ + .tag = .missing_lparen_after_builtin, + .loc = tok.loc, + .extra = .{ .str = pp.tokSlice(tok) }, + }, &.{}); // todo expansion slice } + return l_paren; +} + +fn makeMacroToken(position: usize, is_vararg: bool) PreprocessorToken { + return .{ + .id = .macro_param, + .hideset = null, + .loc = .{ + .id = .unused, + .byte_offset = @intCast(position), + .line = @intFromBool(is_vararg), + }, + }; +} - fn nextNoWS(self: *TokenIterator) ?TokenWithExpansionLocs { - while (self.i < self.toks.len) : (self.i += 1) { - const tok = self.toks[self.i]; - if (tok.id == .whitespace or tok.id == .macro_ws) continue; +fn next(pp: *Preprocessor, id: Tokenizer.Token.Id) !bool { + const tok = pp.getToken(); + if (tok.id == id) return true; + try pp.ungetToken(tok); + return false; +} - self.i += 1; - return tok; +/// Returns true for vararg function-like macro, false otherwise +fn readFunclikeMacroParams(pp: *Preprocessor, name: PreprocessorToken, l_paren: PreprocessorToken, params: *ParamMap) !bool { + _ = name; + var pos: usize = 0; + while (true) { + var tok = pp.getToken(); + switch (tok.id) { + .r_paren => return false, + .unterminated_comment => { + try pp.errTok(tok, .unterminated_comment); + return false; + }, + else => {}, } - return null; - } -}; - -fn expandMacroExhaustive( - pp: *Preprocessor, - tokenizer: *Tokenizer, - buf: *ExpandBuf, - start_idx: usize, - end_idx: usize, - extend_buf: bool, - eval_ctx: EvalContext, -) MacroError!void { - var moving_end_idx = end_idx; - var advance_index: usize = 0; - // rescan loop - var do_rescan = true; - while (do_rescan) { - do_rescan = false; - // expansion loop - var idx: usize = start_idx + advance_index; - while (idx < moving_end_idx) { - const macro_tok = buf.items[idx]; - if (macro_tok.id == .keyword_defined and eval_ctx == .expr) { - idx += 1; - var it = TokenIterator.init(buf.items[idx..moving_end_idx]); - if (it.nextNoWS()) |tok| { - switch (tok.id) { - .l_paren => { - _ = it.nextNoWS(); // eat (what should be) identifier - _ = it.nextNoWS(); // eat (what should be) r paren - }, - .identifier, .extended_identifier => {}, - else => {}, - } + if (pos != 0) { + if (tok.id != .comma) { + switch (tok.id) { + .nl, .eof => {}, + else => pp.skipToNl(), } - idx += it.i; - continue; + try pp.errTok(tok, .expected_comma_param_list); + return error.InvalidMacroDef; } - if (!macro_tok.id.isMacroIdentifier() or macro_tok.flags.expansion_disabled) { - idx += 1; - continue; + tok = pp.getToken(); + } + if (tok.id.isDirectiveEnd()) { + try pp.errTok(tok, .missing_paren_param_list); + return false; + } + if 
(tok.id == .ellipsis) { + try params.put(pp.gpa, "__VA_ARGS__", makeMacroToken(pos, true)); + pos += 1; + const r_paren = pp.getToken(); + if (r_paren.id != .r_paren) { + try pp.errTok(r_paren, .missing_paren_param_list); + try pp.errTok(l_paren, .to_match_paren); + return error.InvalidMacroDef; } - const expanded = pp.expandedSlice(macro_tok); - const macro = pp.defines.getPtr(expanded) orelse { - idx += 1; - continue; - }; - const macro_hidelist = pp.hideset.get(macro_tok.loc); - if (pp.hideset.contains(macro_hidelist, expanded)) { - idx += 1; - continue; + return true; + } + if (!tok.id.isMacroIdentifier()) { + try pp.errTok(tok, .invalid_token_param_list); + return error.InvalidMacroDef; + } + const arg = pp.tokSlice(tok); + if (try pp.next(.ellipsis)) { + const r_paren = pp.getToken(); + if (r_paren.id != .r_paren) { + try pp.errTok(r_paren, .missing_paren_param_list); + try pp.errTok(l_paren, .to_match_paren); + pp.skipToNl(); } + try params.put(pp.gpa, arg, makeMacroToken(pos, true)); + pos += 1; + return true; + } + try params.put(pp.gpa, arg, makeMacroToken(pos, false)); + pos += 1; + } +} - macro_handler: { - if (macro.is_func) { - var r_paren: TokenWithExpansionLocs = undefined; - var macro_scan_idx = idx; - // to be saved in case this doesn't turn out to be a call - const args = pp.collectMacroFuncArguments( - tokenizer, - buf, - ¯o_scan_idx, - &moving_end_idx, - extend_buf, - macro.is_builtin, - &r_paren, - ) catch |er| switch (er) { - error.MissingLParen => { - if (!buf.items[idx].flags.is_macro_arg) buf.items[idx].flags.expansion_disabled = true; - idx += 1; - break :macro_handler; - }, - error.Unterminated => { - if (pp.comp.langopts.emulate == .gcc) idx += 1; - try pp.removeExpandedTokens(buf, idx, macro_scan_idx - idx, &moving_end_idx); - break :macro_handler; - }, - else => |e| return e, - }; - assert(r_paren.id == .r_paren); - var free_arg_expansion_locs = false; - defer { - for (args.items) |item| { - if (free_arg_expansion_locs) for (item) |tok| TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - pp.gpa.free(item); - } - args.deinit(); - } - const r_paren_hidelist = pp.hideset.get(r_paren.loc); - var hs = try pp.hideset.intersection(macro_hidelist, r_paren_hidelist); - hs = try pp.hideset.prepend(macro_tok.loc, hs); - - var args_count: u32 = @intCast(args.items.len); - // if the macro has zero arguments g() args_count is still 1 - // an empty token list g() and a whitespace-only token list g( ) - // counts as zero arguments for the purposes of argument-count validation - if (args_count == 1 and macro.params.len == 0) { - for (args.items[0]) |tok| { - if (tok.id != .macro_ws) break; - } else { - args_count = 0; - } - } - - // Validate argument count. 
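// Illustrative sketch, not part of the patch: readFunclikeMacroParams records
// each parameter as a .macro_param token built by makeMacroToken, which reuses
// the location fields as storage: byte_offset holds the parameter index and
// line holds the is-vararg flag, so `#define F(a, b, ...)` maps a -> 0, b -> 1
// and __VA_ARGS__ -> 2 (vararg). Decoding that packing (names illustrative):
const std = @import("std");

const Param = struct { index: u32, is_vararg: bool };

fn decodeParam(byte_offset: u32, line: u32) Param {
    return .{ .index = byte_offset, .is_vararg = line != 0 };
}

test "parameter encoding round-trip" {
    const p = decodeParam(2, 1); // third parameter, declared via `...`
    try std.testing.expectEqual(@as(u32, 2), p.index);
    try std.testing.expect(p.is_vararg);
}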
- const extra = Diagnostics.Message.Extra{ - .arguments = .{ .expected = @intCast(macro.params.len), .actual = args_count }, - }; - if (macro.var_args and args_count < macro.params.len) { - free_arg_expansion_locs = true; - try pp.comp.addDiagnostic( - .{ .tag = .expected_at_least_arguments, .loc = buf.items[idx].loc, .extra = extra }, - buf.items[idx].expansionSlice(), - ); - idx += 1; - try pp.removeExpandedTokens(buf, idx, macro_scan_idx - idx + 1, &moving_end_idx); - continue; - } - if (!macro.var_args and args_count != macro.params.len) { - free_arg_expansion_locs = true; - try pp.comp.addDiagnostic( - .{ .tag = .expected_arguments, .loc = buf.items[idx].loc, .extra = extra }, - buf.items[idx].expansionSlice(), - ); - idx += 1; - try pp.removeExpandedTokens(buf, idx, macro_scan_idx - idx + 1, &moving_end_idx); - continue; - } - var expanded_args = MacroArguments.init(pp.gpa); - defer deinitMacroArguments(pp.gpa, &expanded_args); - try expanded_args.ensureTotalCapacity(args.items.len); - for (args.items) |arg| { - var expand_buf = ExpandBuf.init(pp.gpa); - errdefer expand_buf.deinit(); - try expand_buf.appendSlice(arg); - - try pp.expandMacroExhaustive(tokenizer, &expand_buf, 0, expand_buf.items.len, false, eval_ctx); +fn readFunclikeMacroBody(pp: *Preprocessor, params: *const ParamMap) ![]const PreprocessorToken { + var tokens: TokenList = .{}; + errdefer tokens.deinit(pp.gpa); + while (true) { + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) { + return tokens.toOwnedSlice(pp.gpa); + } + if (tok.id.isMacroIdentifier()) { + // const subst = params. + if (params.get(pp.tokSlice(tok))) |sub| { + var copy = sub; + copy.flags.space = tok.flags.space; + try tokens.append(pp.gpa, copy); + continue; + } + } + try tokens.append(pp.gpa, tok); + } +} - expanded_args.appendAssumeCapacity(try expand_buf.toOwnedSlice()); - } +fn readFuncLikeMacro(pp: *Preprocessor, name: PreprocessorToken, l_paren: PreprocessorToken) Error!void { + var params: ParamMap = .{}; + defer params.deinit(pp.gpa); + const is_vararg = pp.readFunclikeMacroParams(name, l_paren, ¶ms) catch |err| switch (err) { + error.InvalidMacroDef => blk: { + pp.skipToNl(); + break :blk false; + }, + else => |e| return e, + }; + const body = try pp.readFunclikeMacroBody(¶ms); + errdefer pp.gpa.free(body); + try pp.hashHashCheck(body); + const macro: Macro = .{ + .tokens = body, + .var_args = is_vararg, + .loc = name.loc, + .kind = .func, + .nargs = params.count(), + }; + try pp.defineMacro(name, macro); +} - var res = try pp.expandFuncMacro(macro_tok, macro, &args, &expanded_args, hs); - defer res.deinit(); - const tokens_added = res.items.len; - const tokens_removed = macro_scan_idx - idx + 1; - for (buf.items[idx .. idx + tokens_removed]) |tok| TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - try buf.replaceRange(idx, tokens_removed, res.items); - - moving_end_idx += tokens_added; - // Overflow here means that we encountered an unterminated argument list - // while expanding the body of this macro. - moving_end_idx -|= tokens_removed; - idx += tokens_added; - do_rescan = true; - } else { - const res = try pp.expandObjMacro(macro); - defer res.deinit(); - - const hs = try pp.hideset.prepend(macro_tok.loc, macro_hidelist); - - const macro_expansion_locs = macro_tok.expansionSlice(); - var increment_idx_by = res.items.len; - for (res.items, 0..) 
|*tok, i| { - tok.flags.is_macro_arg = macro_tok.flags.is_macro_arg; - try tok.addExpansionLocation(pp.gpa, &.{macro_tok.loc}); - try tok.addExpansionLocation(pp.gpa, macro_expansion_locs); - - const tok_hidelist = pp.hideset.get(tok.loc); - const new_hidelist = try pp.hideset.@"union"(tok_hidelist, hs); - try pp.hideset.put(tok.loc, new_hidelist); - - if (tok.id == .keyword_defined and eval_ctx == .expr) { - try pp.comp.addDiagnostic(.{ - .tag = .expansion_to_defined, - .loc = tok.loc, - }, tok.expansionSlice()); - } +fn readDefine(pp: *Preprocessor) !void { + const name = try pp.readIdent() orelse { + pp.skipToNl(); + return; + }; + const next_tok = pp.getToken(); + if (next_tok.id == .l_paren and !next_tok.flags.space) { + try pp.readFuncLikeMacro(name, next_tok); + return; + } + try pp.ungetToken(next_tok); + try pp.readObjMacro(name); +} - if (i < increment_idx_by and (tok.id == .keyword_defined or pp.defines.contains(pp.expandedSlice(tok.*)))) { - increment_idx_by = i; - } - } +fn doSkipSpace(pp: *Preprocessor) bool { + const saved_tokenizer = pp.tokenizers.items[pp.tokenizers.items.len - 1]; + const tok = pp.tokenizers.items[pp.tokenizers.items.len - 1].next(); + switch (tok.id) { + .eof => return false, + .whitespace, .comment => return true, + else => { + pp.tokenizers.items[pp.tokenizers.items.len - 1] = saved_tokenizer; + return false; + }, + } +} - TokenWithExpansionLocs.free(buf.items[idx].expansion_locs, pp.gpa); - try buf.replaceRange(idx, 1, res.items); - idx += increment_idx_by; - moving_end_idx = moving_end_idx + res.items.len - 1; - do_rescan = true; - } - } - if (idx - start_idx == advance_index + 1 and !do_rescan) { - advance_index += 1; - } - } // end of replacement phase +/// Skips spaces including comments. +/// Returns true if at least one space is skipped. 
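// Illustrative sketch, not part of the patch: readDefine above decides between
// an object-like and a function-like macro purely by whether the `(` follows
// the name with no intervening whitespace, as the standard requires:
//
//     #define F(x) x     // function-like
//     #define F (x) x    // object-like, expands to `(x) x`
//
// The same check over the two flags involved (names illustrative):
const std = @import("std");

fn isFunctionLike(next_is_l_paren: bool, space_before_paren: bool) bool {
    return next_is_l_paren and !space_before_paren;
}

test "whitespace before ( makes the macro object-like" {
    try std.testing.expect(isFunctionLike(true, false));
    try std.testing.expect(!isFunctionLike(true, true));
    try std.testing.expect(!isFunctionLike(false, false));
}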
+fn skipSpace(pp: *Preprocessor) bool { + if (!pp.doSkipSpace()) { + return false; } - // end of scanning phase + while (pp.doSkipSpace()) {} + return true; +} - // trim excess buffer - for (buf.items[moving_end_idx..]) |item| { - TokenWithExpansionLocs.free(item.expansion_locs, pp.gpa); +/// Read the next raw token from the tokenizer stack +fn lexToken(pp: *Preprocessor) PreprocessorToken { + if (pp.skipSpace()) { + return .{ .id = .whitespace, .loc = undefined }; } - buf.items.len = moving_end_idx; + const tok = pp.tokenizers.items[pp.tokenizers.items.len - 1].next(); + return .{ + .id = tok.id, + .flags = .{ + .is_bol = tok.bol, + }, + .loc = .{ + .id = tok.source, + .byte_offset = tok.start, + .line = tok.line, + }, + }; } -/// Try to expand a macro after a possible candidate has been read from the `tokenizer` -/// into the `raw` token passed as argument -fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroError!void { - var source_tok = tokFromRaw(raw); - if (!raw.id.isMacroIdentifier()) { - source_tok.id.simplifyMacroKeyword(); - return pp.addToken(source_tok); +/// Read the next token without expanding it +fn getToken(pp: *Preprocessor) PreprocessorToken { + if (!pp.isBufferEmpty() and pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].items.len > 0) { + return pp.expansion_bufs.items[pp.expansion_bufs.items.len - 1].pop(); + } + if (pp.expansion_bufs.items.len > 1) { + return .{ .id = .eof, .loc = undefined }; } - pp.top_expansion_buf.items.len = 0; - try pp.top_expansion_buf.append(source_tok); - pp.expansion_source_loc = source_tok.loc; + const bol = pp.tokenizers.items[pp.tokenizers.items.len - 1].bol; + var tok = pp.lexToken(); + while (tok.id == .whitespace) { + tok = pp.lexToken(); + tok.flags.space = true; + } + tok.flags.is_bol = bol; + return tok; +} - pp.hideset.clearRetainingCapacity(); - try pp.expandMacroExhaustive(tokenizer, &pp.top_expansion_buf, 0, 1, true, .non_expr); - try pp.ensureUnusedTokenCapacity(pp.top_expansion_buf.items.len); - for (pp.top_expansion_buf.items) |*tok| { - if (tok.id == .macro_ws and !pp.preserve_whitespace) { - TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - continue; - } - if (tok.id == .comment and !pp.comp.langopts.preserve_comments_in_macros) { - TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - continue; - } - if (tok.id == .placemarker) { - TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - continue; - } - tok.id.simplifyMacroKeywordExtra(true); - pp.addTokenAssumeCapacity(tok.*); - } - if (pp.preserve_whitespace) { - try pp.ensureUnusedTokenCapacity(pp.add_expansion_nl); - while (pp.add_expansion_nl > 0) : (pp.add_expansion_nl -= 1) { - pp.addTokenAssumeCapacity(.{ .id = .nl, .loc = .{ - .id = tokenizer.source, - .line = tokenizer.line, - } }); +fn readDefinedOp(pp: *Preprocessor) !PreprocessorToken { + var tok = pp.getToken(); + if (tok.id == .l_paren) { + tok = pp.getToken(); + const r_paren = pp.getToken(); + if (r_paren.id != .r_paren) { + try pp.errStr(r_paren, .closing_paren_after, "defined"); } } + if (!tok.id.isMacroIdentifier()) { + try pp.errTok(tok, .macro_name_must_be_identifier); + } + const slice = pp.tokSlice(tok); + if (pp.defines.contains(slice)) { + return PreprocessorToken.one; + } + return PreprocessorToken.zero; } -fn expandedSliceExtra(pp: *const Preprocessor, tok: anytype, macro_ws_handling: enum { single_macro_ws, preserve_macro_ws }) []const u8 { - if (tok.id.lexeme()) |some| { - if (!tok.id.allowsDigraphs(pp.comp.langopts) and !(tok.id == .macro_ws 
and macro_ws_handling == .preserve_macro_ws)) return some; +fn readIntExprLine(pp: *Preprocessor) !void { + while (true) { + const tok = try pp.readExpandNewline(); + if (tok.id.isDirectiveEnd()) break; + if (tok.id == .keyword_defined) { + const result = try pp.readDefinedOp(); + try pp.addToken(result); + } else if (tok.id.isMacroIdentifier()) { + try pp.addToken(PreprocessorToken.zero); + } else { + try pp.addToken(tok); + } } - var tmp_tokenizer = Tokenizer{ - .buf = pp.comp.getSource(tok.loc.id).buf, - .langopts = pp.comp.langopts, - .index = tok.loc.byte_offset, - .source = .generated, + try pp.addToken(.{ .id = .eof, .loc = .{} }); +} + +fn readConstexpr(pp: *Preprocessor) !bool { + const start = pp.tokens.len; + defer pp.tokens.len = start; + try pp.readIntExprLine(); + + var parser = Parser{ + .pp = pp, + .comp = pp.comp, + .gpa = pp.gpa, + .tok_ids = pp.tokens.items(.id), + .tok_i = @intCast(start), + .arena = undefined, + .in_macro = true, + .strings = std.ArrayListAligned(u8, 4).init(pp.comp.gpa), + + .data = undefined, + .value_map = undefined, + .labels = undefined, + .decl_buf = undefined, + .list_buf = undefined, + .param_buf = undefined, + .enum_buf = undefined, + .record_buf = undefined, + .attr_buf = undefined, + .field_attr_buf = undefined, + .string_ids = undefined, }; - if (tok.id == .macro_string) { - while (true) : (tmp_tokenizer.index += 1) { - if (tmp_tokenizer.buf[tmp_tokenizer.index] == '>') break; - } - return tmp_tokenizer.buf[tok.loc.byte_offset .. tmp_tokenizer.index + 1]; - } - const res = tmp_tokenizer.next(); - return tmp_tokenizer.buf[res.start..res.end]; + defer parser.strings.deinit(); + return parser.macroExpr(); } -/// Get expanded token source string. -pub fn expandedSlice(pp: *const Preprocessor, tok: anytype) []const u8 { - return pp.expandedSliceExtra(tok, .single_macro_ws); +/// #line number "file" +/// TODO: validate that the pp_num token is solely digits +fn readLine(pp: *Preprocessor) Error!void { + const digits = pp.getToken(); + if (digits.id != .pp_num) try pp.errTok(digits, .line_simple_digit); + + if (digits.id.isDirectiveEnd()) return; + const name = pp.getToken(); + if (name.id.isDirectiveEnd()) return; + if (name.id != .string_literal) try pp.errTok(name, .line_invalid_filename); + try pp.expectNewline(); } -/// Concat two tokens and add the result to pp.generated -fn pasteTokens(pp: *Preprocessor, lhs_toks: *ExpandBuf, rhs_toks: []const TokenWithExpansionLocs) Error!void { - const lhs = while (lhs_toks.popOrNull()) |lhs| { - if ((pp.comp.langopts.preserve_comments_in_macros and lhs.id == .comment) or - (lhs.id != .macro_ws and lhs.id != .comment)) - break lhs; +fn readPragma(pp: *Preprocessor) Error!void { + // TODO + pp.skipToNl(); +} - TokenWithExpansionLocs.free(lhs.expansion_locs, pp.gpa); - } else { - return bufCopyTokens(lhs_toks, rhs_toks, &.{}); +fn readUndef(pp: *Preprocessor) Error!void { + const name = try pp.readIdent() orelse { + pp.skipToNl(); + return; }; + try pp.expectNewline(); + _ = pp.defines.remove(pp.tokSlice(name)); +} - var rhs_rest: u32 = 1; - const rhs = for (rhs_toks) |rhs| { - if ((pp.comp.langopts.preserve_comments_in_macros and rhs.id == .comment) or - (rhs.id != .macro_ws and rhs.id != .comment)) - break rhs; +/// Skip until after a newline, error if extra tokens before it. 
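// Illustrative sketch, not part of the patch: readIntExprLine and readConstexpr
// above turn a #if/#elif line into tokens for Parser.macroExpr. `defined X` and
// `defined(X)` are folded to 1 or 0 up front, and any identifier that survives
// macro expansion evaluates to 0, so with FOO undefined
//
//     #if defined(FOO) || FOO + 1 > 2    // becomes 0 || 0 + 1 > 2  -> false
//
// The token-to-value rule in isolation (ifValue is illustrative only):
const std = @import("std");

fn ifValue(is_defined_op: bool, is_identifier: bool, is_defined: bool, literal: i64) i64 {
    if (is_defined_op) return @intFromBool(is_defined);
    if (is_identifier) return 0;
    return literal;
}

test "#if with an undefined identifier" {
    const defined_foo = ifValue(true, false, false, 0); // defined(FOO) -> 0
    const bare_foo = ifValue(false, true, false, 0); // FOO -> 0
    try std.testing.expect(!((defined_foo != 0) or (bare_foo + 1 > 2)));
}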
+fn expectNewline(pp: *Preprocessor) !void { + var sent_err = false; + while (true) { + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) return; + if (tok.id == .whitespace or tok.id == .comment) continue; + if (!sent_err) { + sent_err = true; + try pp.errTok(tok, .extra_tokens_directive_end); + } + } +} - rhs_rest += 1; - } else { - return lhs_toks.appendAssumeCapacity(lhs); +/// TODO: pragma once +fn readIncludeExtra(pp: *Preprocessor, include_token: PreprocessorToken, which: Compilation.WhichInclude) Error!void { + var is_std: bool = undefined; + const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { + error.InvalidInclude => return, + else => |e| return e, }; - defer TokenWithExpansionLocs.free(lhs.expansion_locs, pp.gpa); + try pp.expectNewline(); - const start = pp.comp.generated_buf.items.len; - const end = start + pp.expandedSlice(lhs).len + pp.expandedSlice(rhs).len; - try pp.comp.generated_buf.ensureTotalCapacity(pp.gpa, end + 1); // +1 for a newline - // We cannot use the same slices here since they might be invalidated by `ensureCapacity` - pp.comp.generated_buf.appendSliceAssumeCapacity(pp.expandedSlice(lhs)); - pp.comp.generated_buf.appendSliceAssumeCapacity(pp.expandedSlice(rhs)); - pp.comp.generated_buf.appendAssumeCapacity('\n'); - - // Try to tokenize the result. - var tmp_tokenizer = Tokenizer{ - .buf = pp.comp.generated_buf.items, - .langopts = pp.comp.langopts, - .index = @intCast(start), - .source = .generated, + const filename = include_str[1 .. include_str.len - 1]; + const include_type: Compilation.IncludeType = switch (include_str[0]) { + '"' => .quotes, + '<' => .angle_brackets, + else => unreachable, }; - const pasted_token = tmp_tokenizer.nextNoWSComments(); - const next = tmp_tokenizer.nextNoWSComments(); - const pasted_id = if (lhs.id == .placemarker and rhs.id == .placemarker) - .placemarker - else - pasted_token.id; - try lhs_toks.append(try pp.makeGeneratedToken(start, pasted_id, lhs)); - - if (next.id != .nl and next.id != .eof) { - try pp.errStr( - lhs, - .pasting_formed_invalid, - try pp.comp.diagnostics.arena.allocator().dupe(u8, pp.comp.generated_buf.items[start..end]), - ); - try lhs_toks.append(tokFromRaw(next)); + const tok: RawToken = .{ .id = include_token.id, .source = include_token.loc.id, .start = include_token.loc.byte_offset, .line = include_token.loc.line }; + const source = (try pp.comp.findInclude(filename, tok, include_type, which)) orelse return pp.fatalNotFound(include_token, filename); + if (pp.include_guards.get(source.id)) |guard| { + if (pp.defines.contains(guard)) return; } + const guard = pp.findIncludeGuard(source); + try pp.guard_stack.append(pp.gpa, guard); - try bufCopyTokens(lhs_toks, rhs_toks[rhs_rest..], &.{}); -} - -fn makeGeneratedToken(pp: *Preprocessor, start: usize, id: Token.Id, source: TokenWithExpansionLocs) !TokenWithExpansionLocs { - var pasted_token = TokenWithExpansionLocs{ .id = id, .loc = .{ - .id = .generated, - .byte_offset = @intCast(start), - .line = pp.generated_line, - } }; - pp.generated_line += 1; - try pasted_token.addExpansionLocation(pp.gpa, &.{source.loc}); - try pasted_token.addExpansionLocation(pp.gpa, source.expansionSlice()); - return pasted_token; -} - -/// Defines a new macro and warns if it is a duplicate -fn defineMacro(pp: *Preprocessor, name_tok: RawToken, macro: Macro) Error!void { - const name_str = pp.tokSlice(name_tok); - const gop = try pp.defines.getOrPut(pp.gpa, name_str); - if (gop.found_existing and !gop.value_ptr.eql(macro, pp)) { - const tag: 
Diagnostics.Tag = if (gop.value_ptr.is_builtin) .builtin_macro_redefined else .macro_redefined; - const start = pp.comp.diagnostics.list.items.len; + if (pp.tokenizers.items.len > max_include_depth) { try pp.comp.addDiagnostic(.{ - .tag = tag, - .loc = .{ .id = name_tok.source, .byte_offset = name_tok.start, .line = name_tok.line }, - .extra = .{ .str = name_str }, + .tag = .too_many_includes, + .loc = include_token.loc, }, &.{}); - if (!gop.value_ptr.is_builtin and pp.comp.diagnostics.list.items.len != start) { - try pp.comp.addDiagnostic(.{ - .tag = .previous_definition, - .loc = gop.value_ptr.loc, - }, &.{}); - } + return error.FatalError; } - if (pp.verbose) { - pp.verboseLog(name_tok, "macro {s} defined", .{name_str}); - } - gop.value_ptr.* = macro; + try pp.tokenizers.append(pp.gpa, .{ + .buf = source.buf, + .langopts = pp.comp.langopts, + .index = 0, + .source = source.id, + }); } -/// Handle a #define directive. -fn define(pp: *Preprocessor, tokenizer: *Tokenizer) Error!void { - // Get macro name and validate it. - const macro_name = tokenizer.nextNoWS(); - if (macro_name.id == .keyword_defined) { - try pp.err(macro_name, .defined_as_macro_name); - return skipToNl(tokenizer); +/// Read a header name delimited by quotes or angle brackets +fn readHeaderFileName(pp: *Preprocessor, is_std: *bool) !?[]const u8 { + if (!pp.isBufferEmpty()) return null; + _ = pp.skipSpace(); + + var close: u8 = undefined; + var tokenizer = pp.tokenizers.items[pp.tokenizers.items.len - 1]; + defer pp.tokenizers.items[pp.tokenizers.items.len - 1] = tokenizer; + + if (tokenizer.buf[tokenizer.index..].len < 2) { + return null; } - if (!macro_name.id.isMacroIdentifier()) { - try pp.err(macro_name, .macro_name_must_be_identifier); - return skipToNl(tokenizer); - } - var macro_name_token_id = macro_name.id; - macro_name_token_id.simplifyMacroKeyword(); - switch (macro_name_token_id) { - .identifier, .extended_identifier => {}, - else => if (macro_name_token_id.isMacroIdentifier()) { - try pp.err(macro_name, .keyword_macro); + const start = tokenizer.index; + switch (tokenizer.buf[tokenizer.index..][0]) { + '"' => { + is_std.* = false; + close = '"'; + }, + '<' => { + is_std.* = true; + close = '>'; }, + else => return null, } + tokenizer.index += 1; + while (tokenizer.index < tokenizer.buf.len and tokenizer.buf[tokenizer.index] != close and tokenizer.buf[tokenizer.index] != '\n') : (tokenizer.index += 1) {} - // Check for function macros and empty defines. - var first = tokenizer.next(); - switch (first.id) { - .nl, .eof => return pp.defineMacro(macro_name, .{ - .params = &.{}, - .tokens = &.{}, - .var_args = false, - .loc = tokFromRaw(macro_name).loc, - .is_func = false, - }), - .whitespace => first = tokenizer.next(), - .l_paren => return pp.defineFn(tokenizer, macro_name, first), - else => try pp.err(first, .whitespace_after_macro_name), - } - if (first.id == .hash_hash) { - try pp.err(first, .hash_hash_at_start); - return skipToNl(tokenizer); + if (tokenizer.index == tokenizer.buf.len or tokenizer.buf[tokenizer.index] != close) { + try pp.errTok(.{ .id = undefined, .loc = .{ .id = tokenizer.source, .byte_offset = tokenizer.index, .line = tokenizer.line } }, .header_str_closing); + try pp.errTok(.{ .id = undefined, .loc = .{ .id = tokenizer.source, .byte_offset = start, .line = tokenizer.line } }, .header_str_match); + return error.InvalidInclude; } - first.id.simplifyMacroKeyword(); - pp.token_buf.items.len = 0; // Safe to use since we can only be in one directive at a time. 
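// Illustrative sketch, not part of the patch: readIncludeExtra above does not
// recurse into the included file. It pushes a fresh Tokenizer for the new source
// onto pp.tokenizers and lets getToken keep reading from the top of that stack,
// reporting too_many_includes once the stack grows past max_include_depth.
// The stack-of-sources shape with a tiny illustrative depth limit:
const std = @import("std");

fn pushInclude(gpa: std.mem.Allocator, stack: *std.ArrayListUnmanaged(u32), id: u32, max_depth: usize) !void {
    if (stack.items.len > max_depth) return error.TooManyIncludes;
    try stack.append(gpa, id);
}

test "include depth limit" {
    const gpa = std.testing.allocator;
    var stack: std.ArrayListUnmanaged(u32) = .{};
    defer stack.deinit(gpa);
    try pushInclude(gpa, &stack, 0, 1);
    try pushInclude(gpa, &stack, 1, 1);
    try std.testing.expectError(error.TooManyIncludes, pushInclude(gpa, &stack, 2, 1));
}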
+ tokenizer.index += 1; - var need_ws = false; - // Collect the token body and validate any ## found. - var tok = first; - while (true) { - tok.id.simplifyMacroKeyword(); - switch (tok.id) { - .hash_hash => { - const next = tokenizer.nextNoWSComments(); - switch (next.id) { - .nl, .eof => { - try pp.err(tok, .hash_hash_at_end); - return; - }, - .hash_hash => { - try pp.err(next, .hash_hash_at_end); - return; - }, - else => {}, - } - try pp.token_buf.append(tok); - try pp.token_buf.append(next); - }, - .nl, .eof => break, - .comment => if (pp.comp.langopts.preserve_comments_in_macros) { - if (need_ws) { - need_ws = false; - try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); - } - try pp.token_buf.append(tok); - }, - .whitespace => need_ws = true, - .unterminated_string_literal, .unterminated_char_literal, .empty_char_literal => |tag| { - try pp.err(tok, invalidTokenDiagnostic(tag)); - try pp.token_buf.append(tok); - }, - .unterminated_comment => try pp.err(tok, .unterminated_comment), - else => { - if (tok.id != .whitespace and need_ws) { - need_ws = false; - try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); - } - try pp.token_buf.append(tok); - }, - } - tok = tokenizer.next(); + const buf = tokenizer.buf[start..tokenizer.index]; + if (buf.len == 2) { + try pp.errTok(.{ .id = .nl, .loc = .{ .id = tokenizer.source, .byte_offset = start, .line = tokenizer.line } }, .empty_filename); + return error.InvalidInclude; } + return buf; +} - const list = try pp.arena.allocator().dupe(RawToken, pp.token_buf.items); - try pp.defineMacro(macro_name, .{ - .loc = tokFromRaw(macro_name).loc, - .tokens = list, - .params = undefined, - .is_func = false, - .var_args = false, - }); +fn isBufferEmpty(pp: *const Preprocessor) bool { + return pp.expansion_bufs.items.len == 0; } -/// Handle a function like #define directive. -fn defineFn(pp: *Preprocessor, tokenizer: *Tokenizer, macro_name: RawToken, l_paren: RawToken) Error!void { - assert(macro_name.id.isMacroIdentifier()); - var params = std.ArrayList([]const u8).init(pp.gpa); - defer params.deinit(); +/// Read a delimited header name, or a macro expanded one +fn readHeaderName(pp: *Preprocessor, is_std: *bool) ![]const u8 { + if (try pp.readHeaderFileName(is_std)) |path| return path; - // Parse the parameter list. - var gnu_var_args: []const u8 = ""; - var var_args = false; + // If a token following #include does not start with < nor ", + // try to read the token as a regular token. Macro-expanded + // form may be a valid header file path. 
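// Illustrative sketch, not part of the patch: this fallback is what makes a
// macro-expanded include operand work, e.g.
//
//     #define INCFILE "config.h"
//     #include INCFILE
//
// A string literal result is used directly; a `<` instead starts re-assembling
// the path from the expanded tokens until `>` is read, as the loop below does
// into pp.char_buf. The re-assembly idea over plain strings (joinHeaderName is
// illustrative only):
const std = @import("std");

fn joinHeaderName(gpa: std.mem.Allocator, pieces: []const []const u8) ![]u8 {
    var buf: std.ArrayListUnmanaged(u8) = .{};
    errdefer buf.deinit(gpa);
    try buf.append(gpa, '<');
    for (pieces) |piece| try buf.appendSlice(gpa, piece);
    try buf.append(gpa, '>');
    return buf.toOwnedSlice(gpa);
}

test "reassemble <stdio.h> from expanded tokens" {
    const gpa = std.testing.allocator;
    const name = try joinHeaderName(gpa, &.{ "stdio", ".", "h" });
    defer gpa.free(name);
    try std.testing.expectEqualStrings("<stdio.h>", name);
}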
+ const tok = try pp.readExpandNewline(); + if (tok.id.isDirectiveEnd()) { + try pp.errTok(tok, .expected_filename); + return error.InvalidInclude; + } + if (tok.id == .string_literal) { + is_std.* = false; + return pp.tokSlice(tok); + } + if (tok.id != .angle_bracket_left) { + try pp.errStr(tok, .expected_left_angle_bracket, pp.tokSlice(tok)); + return error.InvalidInclude; + } + const start = pp.char_buf.items.len; + try pp.char_buf.append(pp.gpa, '<'); + defer pp.char_buf.items.len = start; + const writer = pp.char_buf.writer(pp.gpa); while (true) { - var tok = tokenizer.nextNoWS(); - if (tok.id == .r_paren) break; - if (tok.id == .eof) return pp.err(tok, .unterminated_macro_param_list); - if (tok.id == .ellipsis) { - var_args = true; - const r_paren = tokenizer.nextNoWS(); - if (r_paren.id != .r_paren) { - try pp.err(r_paren, .missing_paren_param_list); - try pp.err(l_paren, .to_match_paren); - return skipToNl(tokenizer); - } - break; + const path_tok = try pp.readExpandNewline(); + if (path_tok.id == .nl) { + try pp.errTok(path_tok, .header_str_closing); + try pp.errTok(tok, .header_str_match); + return error.InvalidInclude; } - if (!tok.id.isMacroIdentifier()) { - try pp.err(tok, .invalid_token_param_list); - return skipToNl(tokenizer); - } - - try params.append(pp.tokSlice(tok)); - - tok = tokenizer.nextNoWS(); - if (tok.id == .ellipsis) { - try pp.err(tok, .gnu_va_macro); - gnu_var_args = params.pop(); - const r_paren = tokenizer.nextNoWS(); - if (r_paren.id != .r_paren) { - try pp.err(r_paren, .missing_paren_param_list); - try pp.err(l_paren, .to_match_paren); - return skipToNl(tokenizer); - } - break; - } else if (tok.id == .r_paren) { + if (path_tok.id == .angle_bracket_right) { break; - } else if (tok.id != .comma) { - try pp.err(tok, .expected_comma_param_list); - return skipToNl(tokenizer); } + try pp.prettyPrintToken(writer, path_tok.toTreeToken()); } + is_std.* = true; + try pp.char_buf.append(pp.gpa, '>'); + return pp.gpa.dupe(u8, pp.char_buf.items[start..]); +} - var need_ws = false; - // Collect the body tokens and validate # and ##'s found. - pp.token_buf.items.len = 0; // Safe to use since we can only be in one directive at a time. - tok_loop: while (true) { - var tok = tokenizer.next(); - switch (tok.id) { - .nl, .eof => break, - .whitespace => need_ws = pp.token_buf.items.len != 0, - .comment => if (!pp.comp.langopts.preserve_comments_in_macros) continue else { - if (need_ws) { - need_ws = false; - try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); - } - try pp.token_buf.append(tok); - }, - .hash => { - if (tok.id != .whitespace and need_ws) { - need_ws = false; - try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); - } - const param = tokenizer.nextNoWS(); - blk: { - if (var_args and param.id == .keyword_va_args) { - tok.id = .stringify_va_args; - try pp.token_buf.append(tok); - continue :tok_loop; - } - if (!param.id.isMacroIdentifier()) break :blk; - const s = pp.tokSlice(param); - if (mem.eql(u8, s, gnu_var_args)) { - tok.id = .stringify_va_args; - try pp.token_buf.append(tok); - continue :tok_loop; - } - for (params.items, 0..) 
|p, i| { - if (mem.eql(u8, p, s)) { - tok.id = .stringify_param; - tok.end = @intCast(i); - try pp.token_buf.append(tok); - continue :tok_loop; - } - } - } - try pp.err(param, .hash_not_followed_param); - return skipToNl(tokenizer); - }, - .hash_hash => { - need_ws = false; - // if ## appears at the beginning, the token buf is still empty - // in this case, error out - if (pp.token_buf.items.len == 0) { - try pp.err(tok, .hash_hash_at_start); - return skipToNl(tokenizer); - } - const saved_tokenizer = tokenizer.*; - const next = tokenizer.nextNoWSComments(); - if (next.id == .nl or next.id == .eof) { - try pp.err(tok, .hash_hash_at_end); - return; - } - tokenizer.* = saved_tokenizer; - // convert the previous token to .macro_param_no_expand if it was .macro_param - if (pp.token_buf.items[pp.token_buf.items.len - 1].id == .macro_param) { - pp.token_buf.items[pp.token_buf.items.len - 1].id = .macro_param_no_expand; - } - try pp.token_buf.append(tok); - }, - .unterminated_string_literal, .unterminated_char_literal, .empty_char_literal => |tag| { - try pp.err(tok, invalidTokenDiagnostic(tag)); - try pp.token_buf.append(tok); - }, - .unterminated_comment => try pp.err(tok, .unterminated_comment), - else => { - if (tok.id != .whitespace and need_ws) { - need_ws = false; - try pp.token_buf.append(.{ .id = .macro_ws, .source = .generated }); - } - if (var_args and tok.id == .keyword_va_args) { - // do nothing - } else if (var_args and tok.id == .keyword_va_opt) { - const opt_l_paren = tokenizer.next(); - if (opt_l_paren.id != .l_paren) { - try pp.err(opt_l_paren, .va_opt_lparen); - return skipToNl(tokenizer); - } - tok.start = opt_l_paren.end; - - var parens: u32 = 0; - while (true) { - const opt_tok = tokenizer.next(); - switch (opt_tok.id) { - .l_paren => parens += 1, - .r_paren => if (parens == 0) { - break; - } else { - parens -= 1; - }, - .nl, .eof => { - try pp.err(opt_tok, .va_opt_rparen); - try pp.err(opt_l_paren, .to_match_paren); - return skipToNl(tokenizer); - }, - .whitespace => {}, - else => tok.end = opt_tok.end, - } - } - } else if (tok.id.isMacroIdentifier()) { - tok.id.simplifyMacroKeyword(); - const s = pp.tokSlice(tok); - if (mem.eql(u8, gnu_var_args, s)) { - tok.id = .keyword_va_args; - } else for (params.items, 0..) 
|param, i| { - if (mem.eql(u8, param, s)) { - // NOTE: it doesn't matter to assign .macro_param_no_expand - // here in case a ## was the previous token, because - // ## processing will eat this token with the same semantics - tok.id = .macro_param; - tok.end = @intCast(i); - break; - } - } - } - try pp.token_buf.append(tok); - }, +fn readInclude(pp: *Preprocessor, include_token: PreprocessorToken) Error!void { + return pp.readIncludeExtra(include_token, .first); +} + +fn readIncludeNext(pp: *Preprocessor, include_token: PreprocessorToken) Error!void { + return pp.readIncludeExtra(include_token, .next); +} + +fn readErrorMessage(pp: *Preprocessor, directive_tok: PreprocessorToken, tag: Diagnostics.Tag) !void { + const char_top = pp.char_buf.items.len; + defer pp.char_buf.items.len = char_top; + var i: usize = 0; + while (true) : (i += 1) { + const tok = pp.getToken(); + if (tok.id.isDirectiveEnd()) break; + const slice = pp.tokSlice(tok); + if (slice.len > 0 and tok.flags.space and i != 0) { + try pp.char_buf.append(pp.gpa, ' '); } + try pp.char_buf.appendSlice(pp.gpa, slice); } - - const param_list = try pp.arena.allocator().dupe([]const u8, params.items); - const token_list = try pp.arena.allocator().dupe(RawToken, pp.token_buf.items); - try pp.defineMacro(macro_name, .{ - .is_func = true, - .params = param_list, - .var_args = var_args or gnu_var_args.len != 0, - .tokens = token_list, - .loc = tokFromRaw(macro_name).loc, - }); + const slice = pp.char_buf.items[char_top..]; + const duped = try pp.comp.diagnostics.arena.allocator().dupe(u8, slice); + try pp.comp.addDiagnostic(.{ + .tag = tag, + .loc = directive_tok.loc, + .extra = .{ .str = duped }, + }, &.{}); } -/// Handle an #embed directive -/// embedDirective : ("FILENAME" | ) embedParam* -/// embedParam : IDENTIFIER (:: IDENTIFIER)? '(' ')' -fn embed(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!void { - const first = tokenizer.nextNoWS(); - const filename_tok = pp.findIncludeFilenameToken(first, tokenizer, .ignore_trailing_tokens) catch |er| switch (er) { - error.InvalidInclude => return, - else => |e| return e, - }; - defer TokenWithExpansionLocs.free(filename_tok.expansion_locs, pp.gpa); +fn clearGuard(pp: *Preprocessor) void { + pp.guard_stack.items[pp.guard_stack.items.len - 1] = null; +} - // Check for empty filename. - const tok_slice = pp.expandedSliceExtra(filename_tok, .single_macro_ws); - if (tok_slice.len < 3) { - try pp.err(first, .empty_filename); - return; +fn readDirective(pp: *Preprocessor) Error!void { + const directive = pp.getToken(); + if (directive.id.isDirectiveEnd()) return; + if (directive.id == .pp_num) { + return pp.readLinemarker(); } - const filename = tok_slice[1 .. 
tok_slice.len - 1]; - const include_type: Compilation.IncludeType = switch (filename_tok.id) { - .string_literal => .quotes, - .macro_string => .angle_brackets, - else => unreachable, - }; - // Index into `token_buf` - const Range = struct { - start: u32, - end: u32, + const until_else = 0; + const until_endif = 1; + const until_endif_seen_else = 2; - fn expand(opt_range: ?@This(), pp_: *Preprocessor, tokenizer_: *Tokenizer) !void { - const range = opt_range orelse return; - const slice = pp_.token_buf.items[range.start..range.end]; - for (slice) |tok| { - try pp_.expandMacro(tokenizer_, tok); + switch (directive.id) { + .keyword_define => try pp.readDefine(), + .keyword_elif => { + if (pp.if_level == 0) { + try pp.errTok(directive, .elif_without_if); + pp.if_level += 1; + pp.if_kind.set(pp.if_level, until_else); + } else if (pp.if_level == 1) { + pp.clearGuard(); } - } - }; - pp.token_buf.items.len = 0; - - var limit: ?u32 = null; - var prefix: ?Range = null; - var suffix: ?Range = null; - var if_empty: ?Range = null; - while (true) { - const param_first = tokenizer.nextNoWS(); - switch (param_first.id) { - .nl, .eof => break, - .identifier => {}, - else => { - try pp.err(param_first, .malformed_embed_param); - continue; - }, - } + switch (pp.if_kind.get(pp.if_level)) { + until_else => if (try pp.readConstexpr()) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #elif", .{}); + } + } else { + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elif", .{}); + } + }, + until_endif => try pp.skip(.until_endif), + until_endif_seen_else => { + try pp.errTok(directive, .elif_after_else); + pp.skipToNl(); + }, + else => unreachable, + } + }, + .keyword_else => { + try pp.expectNewline(); + if (pp.if_level == 0) { + try pp.errTok(directive, .else_without_if); + return; + } else if (pp.if_level == 1) { + pp.clearGuard(); + } + switch (pp.if_kind.get(pp.if_level)) { + until_else => { + pp.if_kind.set(pp.if_level, until_endif_seen_else); + if (pp.verbose) { + pp.verboseLog(directive, "#else branch here", .{}); + } + }, + until_endif => try pp.skip(.until_endif_seen_else), + until_endif_seen_else => { + try pp.errTok(directive, .else_after_else); + pp.skipToNl(); + }, + else => unreachable, + } + }, + .keyword_endif => { + try pp.expectNewline(); + if (pp.if_level == 0) { + pp.clearGuard(); + try pp.errTok(directive, .endif_without_if); + return; + } else if (pp.if_level == 1) { + var tokenizer = &pp.tokenizers.items[pp.tokenizers.items.len - 1]; + const saved_tokenizer = tokenizer.*; + defer tokenizer.* = saved_tokenizer; - const char_top = pp.char_buf.items.len; - defer pp.char_buf.items.len = char_top; - - const maybe_colon = tokenizer.colonColon(); - const param = switch (maybe_colon.id) { - .colon_colon => blk: { - // vendor::param - const param = tokenizer.nextNoWS(); - if (param.id != .identifier) { - try pp.err(param, .malformed_embed_param); - continue; + var next_tok = tokenizer.nextNoWS(); + while (next_tok.id == .nl) : (next_tok = tokenizer.nextNoWS()) {} + if (next_tok.id != .eof) pp.clearGuard(); + } + pp.if_level -= 1; + }, + .keyword_error => try pp.readErrorMessage(directive, .error_directive), + .keyword_if => { + const sum, const overflowed = @addWithOverflow(pp.if_level, 1); + if (overflowed != 0) + return pp.fatal(directive, "too many #if nestings", .{}); + pp.if_level = sum; + + if (try pp.readConstexpr()) { + pp.if_kind.set(pp.if_level, until_endif); + if 
(pp.verbose) { + pp.verboseLog(directive, "entering then branch of #if", .{}); } - const l_paren = tokenizer.nextNoWS(); - if (l_paren.id != .l_paren) { - try pp.err(l_paren, .malformed_embed_param); - continue; + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #if", .{}); } - try pp.char_buf.appendSlice(Attribute.normalize(pp.tokSlice(param_first))); - try pp.char_buf.appendSlice("::"); - try pp.char_buf.appendSlice(Attribute.normalize(pp.tokSlice(param))); - break :blk pp.char_buf.items; - }, - .l_paren => Attribute.normalize(pp.tokSlice(param_first)), - else => { - try pp.err(maybe_colon, .malformed_embed_param); - continue; - }, - }; - - const start: u32 = @intCast(pp.token_buf.items.len); - while (true) { - const next = tokenizer.nextNoWS(); - if (next.id == .r_paren) break; - if (next.id == .eof) { - try pp.err(maybe_colon, .malformed_embed_param); - break; } - try pp.token_buf.append(next); - } - const end: u32 = @intCast(pp.token_buf.items.len); - - if (std.mem.eql(u8, param, "limit")) { - if (limit != null) { - try pp.errStr(tokFromRaw(param_first), .duplicate_embed_param, "limit"); - continue; + }, + .keyword_ifdef => { + const sum, const overflowed = @addWithOverflow(pp.if_level, 1); + if (overflowed != 0) + return pp.fatal(directive, "too many #if nestings", .{}); + pp.if_level = sum; + + const macro_name = (try pp.expectMacroName()) orelse return; + try pp.expectNewline(); + if (pp.defines.get(macro_name) != null) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #ifdef", .{}); + } + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #ifdef", .{}); + } } - if (start + 1 != end) { - try pp.err(param_first, .malformed_embed_limit); - continue; + }, + .keyword_ifndef => { + const sum, const overflowed = @addWithOverflow(pp.if_level, 1); + if (overflowed != 0) + return pp.fatal(directive, "too many #if nestings", .{}); + pp.if_level = sum; + + const macro_name = (try pp.expectMacroName()) orelse return; + try pp.expectNewline(); + if (pp.defines.get(macro_name) == null) { + pp.if_kind.set(pp.if_level, until_endif); + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); } - const limit_tok = pp.token_buf.items[start]; - if (limit_tok.id != .pp_num) { - try pp.err(param_first, .malformed_embed_limit); - continue; + }, + .keyword_elifdef => { + if (pp.if_level == 0) { + try pp.errTok(directive, .elifdef_without_if); + pp.if_level += 1; + pp.if_kind.set(pp.if_level, until_else); + } else if (pp.if_level == 1) { + pp.clearGuard(); } - limit = std.fmt.parseInt(u32, pp.tokSlice(limit_tok), 10) catch { - try pp.err(limit_tok, .malformed_embed_limit); - continue; - }; - pp.token_buf.items.len = start; - } else if (std.mem.eql(u8, param, "prefix")) { - if (prefix != null) { - try pp.errStr(tokFromRaw(param_first), .duplicate_embed_param, "prefix"); - continue; + switch (pp.if_kind.get(pp.if_level)) { + until_else => { + const macro_name = try pp.expectMacroName(); + if (macro_name == null) { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifdef", .{}); + } + } else { + try pp.expectNewline(); + if (pp.defines.get(macro_name.?) 
!= null) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #elifdef", .{}); + } + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifdef", .{}); + } + } + } + }, + until_endif => try pp.skip(.until_endif), + until_endif_seen_else => { + try pp.errTok(directive, .elifdef_after_else); + pp.skipToNl(); + }, + else => unreachable, } - prefix = .{ .start = start, .end = end }; - } else if (std.mem.eql(u8, param, "suffix")) { - if (suffix != null) { - try pp.errStr(tokFromRaw(param_first), .duplicate_embed_param, "suffix"); - continue; + }, + .keyword_elifndef => { + if (pp.if_level == 0) { + try pp.errTok(directive, .elifdef_without_if); + pp.if_level += 1; + pp.if_kind.set(pp.if_level, until_else); + } else if (pp.if_level == 1) { + pp.clearGuard(); } - suffix = .{ .start = start, .end = end }; - } else if (std.mem.eql(u8, param, "if_empty")) { - if (if_empty != null) { - try pp.errStr(tokFromRaw(param_first), .duplicate_embed_param, "if_empty"); - continue; + switch (pp.if_kind.get(pp.if_level)) { + until_else => { + const macro_name = try pp.expectMacroName(); + if (macro_name == null) { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifndef", .{}); + } + } else { + try pp.expectNewline(); + if (pp.defines.get(macro_name.?) == null) { + pp.if_kind.set(pp.if_level, until_endif); + if (pp.verbose) { + pp.verboseLog(directive, "entering then branch of #elifndef", .{}); + } + } else { + pp.if_kind.set(pp.if_level, until_else); + try pp.skip(.until_else); + if (pp.verbose) { + pp.verboseLog(directive, "entering else branch of #elifndef", .{}); + } + } + } + }, + until_endif => try pp.skip(.until_endif), + until_endif_seen_else => { + try pp.errTok(directive, .elifdef_after_else); + pp.skipToNl(); + }, + else => unreachable, } - if_empty = .{ .start = start, .end = end }; - } else { - try pp.errStr( - tokFromRaw(param_first), - .unsupported_embed_param, - try pp.comp.diagnostics.arena.allocator().dupe(u8, param), - ); - pp.token_buf.items.len = start; - } + }, + .keyword_include => try pp.readInclude(directive), + .keyword_include_next => try pp.readIncludeNext(directive), + .keyword_line => try pp.readLine(), + .keyword_pragma => try pp.readPragma(), + .keyword_undef => try pp.readUndef(), + .keyword_warning => try pp.readErrorMessage(directive, .warning_directive), + .keyword_embed => try pp.readEmbed(directive), + else => try pp.errTok(directive, .invalid_preprocessing_directive), } +} + +/// TODO: handle limit/prefix/suffix/etc +fn readEmbed(pp: *Preprocessor, directive_tok: PreprocessorToken) Error!void { + var is_std: bool = undefined; + const include_str = pp.readHeaderName(&is_std) catch |err| switch (err) { + error.InvalidInclude => return, + else => |e| return e, + }; + + const filename = include_str[1 .. 
include_str.len - 1]; + const include_type: Compilation.IncludeType = switch (include_str[0]) { + '"' => .quotes, + '<' => .angle_brackets, + else => unreachable, + }; - const embed_bytes = (try pp.comp.findEmbed(filename, first.source, include_type, limit)) orelse - return pp.fatalNotFound(filename_tok, filename); + const limit = std.math.maxInt(u32); + const embed_bytes = (try pp.comp.findEmbed(filename, directive_tok.loc.id, include_type, limit)) orelse + return pp.fatalNotFound(directive_tok, filename); defer pp.comp.gpa.free(embed_bytes); - try Range.expand(prefix, pp, tokenizer); - - if (embed_bytes.len == 0) { - try Range.expand(if_empty, pp, tokenizer); - try Range.expand(suffix, pp, tokenizer); - return; - } + if (embed_bytes.len == 0) return; try pp.ensureUnusedTokenCapacity(2 * embed_bytes.len - 1); // N bytes and N-1 commas @@ -2985,102 +1760,117 @@ fn embed(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!void { const byte = embed_bytes[0]; const start = pp.comp.generated_buf.items.len; try writer.print("{d}", .{byte}); - pp.addTokenAssumeCapacity(try pp.makeGeneratedToken(start, .embed_byte, filename_tok)); + var generated = try pp.makeGeneratedToken(start, .embed_byte, directive_tok); + generated.flags.is_bol = true; + pp.addTokenAssumeCapacity(generated); } for (embed_bytes[1..]) |byte| { const start = pp.comp.generated_buf.items.len; try writer.print(",{d}", .{byte}); pp.addTokenAssumeCapacity(.{ .id = .comma, .loc = .{ .id = .generated, .byte_offset = @intCast(start) } }); - pp.addTokenAssumeCapacity(try pp.makeGeneratedToken(start + 1, .embed_byte, filename_tok)); + pp.addTokenAssumeCapacity(try pp.makeGeneratedToken(start + 1, .embed_byte, directive_tok)); } try pp.comp.generated_buf.append(pp.gpa, '\n'); +} - try Range.expand(suffix, pp, tokenizer); +fn readToken(pp: *Preprocessor) Error!PreprocessorToken { + while (true) { + const tok = try pp.readExpand(); + if (tok.flags.is_bol and tok.id == .hash and tok.hideset == null) { + try pp.readDirective(); + continue; + } + return tok; + } } -// Handle a #include directive. 
-fn include(pp: *Preprocessor, tokenizer: *Tokenizer, which: Compilation.WhichInclude) MacroError!void { - const first = tokenizer.nextNoWS(); - const new_source = findIncludeSource(pp, tokenizer, first, which) catch |er| switch (er) { - error.InvalidInclude => return, - else => |e| return e, - }; +pub fn preprocess(pp: *Preprocessor, source: Source) !PreprocessorToken { + const guard = pp.findIncludeGuard(source); + try pp.guard_stack.append(pp.gpa, guard); - // Prevent stack overflow - pp.include_depth += 1; - defer pp.include_depth -= 1; - if (pp.include_depth > max_include_depth) { - try pp.comp.addDiagnostic(.{ - .tag = .too_many_includes, - .loc = .{ .id = first.source, .byte_offset = first.start, .line = first.line }, - }, &.{}); - return error.StopPreprocessing; + try pp.tokenizers.append(pp.gpa, .{ + .buf = source.buf, + .langopts = pp.comp.langopts, + .index = 0, + .source = source.id, + }); + while (true) { + const tok = try pp.readToken(); + if (tok.id == .eof) { + const tokenizer = pp.tokenizers.pop(); + const guard_name = pp.guard_stack.pop(); + if (guard_name) |name| { + try pp.include_guards.put(pp.gpa, tokenizer.source, name); + } + if (pp.tokenizers.items.len == 0) { + return tok; + } + } else { + switch (tok.id) { + .unterminated_comment => try pp.errTok(tok, .unterminated_comment), + else => try pp.addToken(tok), + } + } } +} - if (pp.include_guards.get(new_source.id)) |guard| { - if (pp.defines.contains(guard)) return; +// After how many empty lines are needed to replace them with linemarkers. +const collapse_newlines = 8; + +/// Pretty print tokens and try to preserve whitespace. +pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void { + var i: usize = 0; + while (i < pp.tokens.len) : (i += 1) { + const tok = pp.tokens.get(i); + if (tok.id == .eof) break; + try pp.prettyPrintToken(w, tok); } + try w.writeByte('\n'); +} - if (pp.verbose) { - pp.verboseLog(first, "include file {s}", .{new_source.path}); +fn prettyPrintToken(pp: *Preprocessor, w: anytype, tok: Token) !void { + if (tok.flags.is_bol) { + try w.writeByte('\n'); + } + if (tok.flags.space) { + try w.writeByte(' '); + } + if (tok.id.lexeme()) |some| { + try w.writeAll(some); + } else { + try w.writeAll(pp.tokSlice(tok)); } +} - const token_state = pp.getTokenState(); - try pp.addIncludeStart(new_source); - const eof = pp.preprocessExtra(new_source) catch |er| switch (er) { - error.StopPreprocessing => { - for (pp.expansion_entries.items(.locs)[token_state.expansion_entries_len..]) |loc| TokenWithExpansionLocs.free(loc, pp.gpa); - pp.restoreTokenState(token_state); - return; - }, - else => |e| return e, +pub fn expansionSlice(pp: *Preprocessor, tok: Tree.TokenIndex) []Source.Location { + const S = struct { + fn order_token_index(context: void, lhs: Tree.TokenIndex, rhs: Tree.TokenIndex) std.math.Order { + _ = context; + return std.math.order(lhs, rhs); + } }; - try eof.checkMsEof(new_source, pp.comp); - if (pp.preserve_whitespace and pp.tokens.items(.id)[pp.tokens.len - 1] != .nl) { - try pp.addToken(.{ .id = .nl, .loc = .{ - .id = tokenizer.source, - .line = tokenizer.line, - } }); - } - if (pp.linemarkers == .none) return; - var next = first; - while (true) { - var tmp = tokenizer.*; - next = tmp.nextNoWS(); - if (next.id != .nl) break; - tokenizer.* = tmp; - } - try pp.addIncludeResume(next.source, next.end, next.line); -} - -/// tokens that are part of a pragma directive can happen in 3 ways: -/// 1. directly in the text via `#pragma ...` -/// 2. Via a string literal argument to `_Pragma` -/// 3. 
Via a stringified macro argument which is used as an argument to `_Pragma` -/// operator_loc: Location of `_Pragma`; null if this is from #pragma -/// arg_locs: expansion locations of the argument to _Pragma. empty if #pragma or a raw string literal was used -fn makePragmaToken(pp: *Preprocessor, raw: RawToken, operator_loc: ?Source.Location, arg_locs: []const Source.Location) !TokenWithExpansionLocs { - var tok = tokFromRaw(raw); - if (operator_loc) |loc| { - try tok.addExpansionLocation(pp.gpa, &.{loc}); - } - try tok.addExpansionLocation(pp.gpa, arg_locs); - return tok; + + const indices = pp.expansion_entries.items(.idx); + const idx = std.sort.binarySearch(Tree.TokenIndex, tok, indices, {}, S.order_token_index) orelse return &.{}; + const locs = pp.expansion_entries.items(.locs)[idx]; + var i: usize = 0; + while (locs[i].id != .unused) : (i += 1) {} + return locs[0..i]; } -pub fn addToken(pp: *Preprocessor, tok: TokenWithExpansionLocs) !void { +pub fn addToken(pp: *Preprocessor, tok: PreprocessorToken) !void { if (tok.expansion_locs) |expansion_locs| { try pp.expansion_entries.append(pp.gpa, .{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs }); } - try pp.tokens.append(pp.gpa, .{ .id = tok.id, .loc = tok.loc }); + try pp.tokens.append(pp.gpa, tok.toTreeToken()); } -pub fn addTokenAssumeCapacity(pp: *Preprocessor, tok: TokenWithExpansionLocs) void { +pub fn addTokenAssumeCapacity(pp: *Preprocessor, tok: PreprocessorToken) void { if (tok.expansion_locs) |expansion_locs| { pp.expansion_entries.appendAssumeCapacity(.{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs }); } - pp.tokens.appendAssumeCapacity(.{ .id = tok.id, .loc = tok.loc }); + pp.tokens.appendAssumeCapacity(tok.toTreeToken()); } pub fn ensureTotalTokenCapacity(pp: *Preprocessor, capacity: usize) !void { @@ -3093,466 +1883,152 @@ pub fn ensureUnusedTokenCapacity(pp: *Preprocessor, capacity: usize) !void { try pp.expansion_entries.ensureUnusedCapacity(pp.gpa, capacity); } -/// Handle a pragma directive -fn pragma(pp: *Preprocessor, tokenizer: *Tokenizer, pragma_tok: RawToken, operator_loc: ?Source.Location, arg_locs: []const Source.Location) !void { - const name_tok = tokenizer.nextNoWS(); - if (name_tok.id == .nl or name_tok.id == .eof) return; - - const name = pp.tokSlice(name_tok); - try pp.addToken(try pp.makePragmaToken(pragma_tok, operator_loc, arg_locs)); - const pragma_start: u32 = @intCast(pp.tokens.len); - - const pragma_name_tok = try pp.makePragmaToken(name_tok, operator_loc, arg_locs); - try pp.addToken(pragma_name_tok); - while (true) { - const next_tok = tokenizer.next(); - if (next_tok.id == .whitespace) continue; - if (next_tok.id == .eof) { - try pp.addToken(.{ - .id = .nl, - .loc = .{ .id = .generated }, - }); - break; - } - try pp.addToken(try pp.makePragmaToken(next_tok, operator_loc, arg_locs)); - if (next_tok.id == .nl) break; - } - if (pp.comp.getPragma(name)) |prag| unknown: { - return prag.preprocessorCB(pp, pragma_start) catch |er| switch (er) { - error.UnknownPragma => break :unknown, - else => |e| return e, - }; - } - return pp.comp.addDiagnostic(.{ - .tag = .unknown_pragma, - .loc = pragma_name_tok.loc, - }, pragma_name_tok.expansionSlice()); -} - -fn findIncludeFilenameToken( +fn skip( pp: *Preprocessor, - first_token: RawToken, - tokenizer: *Tokenizer, - trailing_token_behavior: enum { ignore_trailing_tokens, expect_nl_eof }, -) !TokenWithExpansionLocs { - var first = first_token; - - if (first.id == .angle_bracket_left) to_end: { - // The tokenizer does not handle include strings 
so do it here. - while (tokenizer.index < tokenizer.buf.len) : (tokenizer.index += 1) { - switch (tokenizer.buf[tokenizer.index]) { - '>' => { - tokenizer.index += 1; - first.end = tokenizer.index; - first.id = .macro_string; - break :to_end; + cont: enum { until_else, until_endif, until_endif_seen_else }, +) Error!void { + var ifs_seen: u32 = 0; + var line_start = true; + var tokenizer = &pp.tokenizers.items[pp.tokenizers.items.len - 1]; + + while (tokenizer.index < tokenizer.buf.len) { + if (line_start) { + const saved_tokenizer = tokenizer.*; + const hash = tokenizer.nextNoWS(); + if (hash.id == .nl) continue; + line_start = false; + if (hash.id != .hash) continue; + const directive = tokenizer.nextNoWS(); + switch (directive.id) { + .keyword_else => { + if (ifs_seen != 0) continue; + if (cont == .until_endif_seen_else) { + // try pp.err(directive, .else_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_elif => { + if (ifs_seen != 0 or cont == .until_endif) continue; + if (cont == .until_endif_seen_else) { + // try pp.err(directive, .elif_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_elifdef => { + if (ifs_seen != 0 or cont == .until_endif) continue; + if (cont == .until_endif_seen_else) { + // try pp.err(directive, .elifdef_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_elifndef => { + if (ifs_seen != 0 or cont == .until_endif) continue; + if (cont == .until_endif_seen_else) { + // try pp.err(directive, .elifndef_after_else); + continue; + } + tokenizer.* = saved_tokenizer; + return; + }, + .keyword_endif => { + if (ifs_seen == 0) { + tokenizer.* = saved_tokenizer; + return; + } + ifs_seen -= 1; }, - '\n' => break, + .keyword_if, .keyword_ifdef, .keyword_ifndef => ifs_seen += 1, else => {}, } - } - try pp.comp.addDiagnostic(.{ - .tag = .header_str_closing, - .loc = .{ .id = first.source, .byte_offset = tokenizer.index, .line = first.line }, - }, &.{}); - try pp.err(first, .header_str_match); - } - - const source_tok = tokFromRaw(first); - const filename_tok, const expanded_trailing = switch (source_tok.id) { - .string_literal, .macro_string => .{ source_tok, false }, - else => expanded: { - // Try to expand if the argument is a macro. - pp.top_expansion_buf.items.len = 0; - defer for (pp.top_expansion_buf.items) |tok| TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa); - try pp.top_expansion_buf.append(source_tok); - pp.expansion_source_loc = source_tok.loc; - - try pp.expandMacroExhaustive(tokenizer, &pp.top_expansion_buf, 0, 1, true, .non_expr); - var trailing_toks: []const TokenWithExpansionLocs = &.{}; - const include_str = (try pp.reconstructIncludeString(pp.top_expansion_buf.items, &trailing_toks, tokFromRaw(first))) orelse { - try pp.expectNl(tokenizer); - return error.InvalidInclude; - }; - const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.appendSlice(pp.gpa, include_str); - - break :expanded .{ try pp.makeGeneratedToken(start, switch (include_str[0]) { - '"' => .string_literal, - '<' => .macro_string, - else => unreachable, - }, pp.top_expansion_buf.items[0]), trailing_toks.len != 0 }; - }, - }; - - switch (trailing_token_behavior) { - .expect_nl_eof => { - // Error on extra tokens. 
- const nl = tokenizer.nextNoWS(); - if ((nl.id != .nl and nl.id != .eof) or expanded_trailing) { - skipToNl(tokenizer); - try pp.comp.diagnostics.addExtra(pp.comp.langopts, .{ - .tag = .extra_tokens_directive_end, - .loc = filename_tok.loc, - }, filename_tok.expansionSlice(), false); + } else if (tokenizer.buf[tokenizer.index] == '\n') { + line_start = true; + tokenizer.index += 1; + tokenizer.line += 1; + tokenizer.bol = true; + if (pp.preserve_whitespace) { + try pp.addToken(.{ .id = .nl, .loc = .{ + .id = tokenizer.source, + .line = tokenizer.line, + } }); } - }, - .ignore_trailing_tokens => if (expanded_trailing) { - try pp.comp.diagnostics.addExtra(pp.comp.langopts, .{ - .tag = .extra_tokens_directive_end, - .loc = filename_tok.loc, - }, filename_tok.expansionSlice(), false); - }, + } else { + line_start = false; + tokenizer.index += 1; + } + } else { + return pp.errTok(.{ .id = .eof, .loc = .{ .id = tokenizer.source, .byte_offset = tokenizer.index, .line = tokenizer.line } }, .unterminated_conditional_directive); } - return filename_tok; } -fn findIncludeSource(pp: *Preprocessor, tokenizer: *Tokenizer, first: RawToken, which: Compilation.WhichInclude) !Source { - const filename_tok = try pp.findIncludeFilenameToken(first, tokenizer, .expect_nl_eof); - defer TokenWithExpansionLocs.free(filename_tok.expansion_locs, pp.gpa); - - // Check for empty filename. - const tok_slice = pp.expandedSliceExtra(filename_tok, .single_macro_ws); - if (tok_slice.len < 3) { - try pp.err(first, .empty_filename); - return error.InvalidInclude; - } - - // Find the file. - const filename = tok_slice[1 .. tok_slice.len - 1]; - const include_type: Compilation.IncludeType = switch (filename_tok.id) { - .string_literal => .quotes, - .macro_string => .angle_brackets, - else => unreachable, - }; +fn verboseLog(pp: *Preprocessor, tok: PreprocessorToken, comptime fmt: []const u8, args: anytype) void { + const source = pp.comp.getSource(tok.loc.id); + const line_col = source.lineCol(tok.loc); - return (try pp.comp.findInclude(filename, first, include_type, which)) orelse - return pp.fatalNotFound(filename_tok, filename); + const stderr = std.io.getStdErr().writer(); + var buf_writer = std.io.bufferedWriter(stderr); + const writer = buf_writer.writer(); + defer buf_writer.flush() catch {}; + writer.print("{s}:{d}:{d}: ", .{ source.path, line_col.line_no, line_col.col }) catch return; + writer.print(fmt, args) catch return; + writer.writeByte('\n') catch return; + writer.writeAll(line_col.line) catch return; + writer.writeByte('\n') catch return; } -fn printLinemarker( - pp: *Preprocessor, - w: anytype, - line_no: u32, - source: Source, - start_resume: enum(u8) { start, @"resume", none }, -) !void { - try w.writeByte('#'); - if (pp.linemarkers == .line_directives) try w.writeAll("line"); - try w.print(" {d} \"", .{line_no}); - for (source.path) |byte| switch (byte) { - '\n' => try w.writeAll("\\n"), - '\r' => try w.writeAll("\\r"), - '\t' => try w.writeAll("\\t"), - '\\' => try w.writeAll("\\\\"), - '"' => try w.writeAll("\\\""), - ' ', '!', '#'...'&', '('...'[', ']'...'~' => try w.writeByte(byte), - // Use hex escapes for any non-ASCII/unprintable characters. - // This ensures that the parsed version of this string will end up - // containing the same bytes as the input regardless of encoding. 
- else => { - try w.writeAll("\\x"); - try std.fmt.formatInt(byte, 16, .lower, .{ .width = 2, .fill = '0' }, w); - }, - }; - try w.writeByte('"'); - if (pp.linemarkers == .numeric_directives) { - switch (start_resume) { - .none => {}, - .start => try w.writeAll(" 1"), - .@"resume" => try w.writeAll(" 2"), - } - switch (source.kind) { - .user => {}, - .system => try w.writeAll(" 3"), - .extern_c_system => try w.writeAll(" 3 4"), - } - } - try w.writeByte('\n'); +fn fatal(pp: *Preprocessor, tok: PreprocessorToken, comptime fmt: []const u8, args: anytype) Compilation.Error { + try pp.comp.diagnostics.list.append(pp.gpa, .{ + .tag = .cli_error, + .kind = .@"fatal error", + .extra = .{ .str = try std.fmt.allocPrint(pp.comp.diagnostics.arena.allocator(), fmt, args) }, + .loc = tok.loc, + }); + return error.FatalError; } -// After how many empty lines are needed to replace them with linemarkers. -const collapse_newlines = 8; - -/// Pretty print tokens and try to preserve whitespace. -pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void { - const tok_ids = pp.tokens.items(.id); - - var i: u32 = 0; - var last_nl = true; - outer: while (true) : (i += 1) { - var cur: Token = pp.tokens.get(i); - switch (cur.id) { - .eof => { - if (!last_nl) try w.writeByte('\n'); - return; - }, - .nl => { - var newlines: u32 = 0; - for (tok_ids[i..], i..) |id, j| { - if (id == .nl) { - newlines += 1; - } else if (id == .eof) { - if (!last_nl) try w.writeByte('\n'); - return; - } else if (id != .whitespace) { - if (pp.linemarkers == .none) { - if (newlines < 2) break; - } else if (newlines < collapse_newlines) { - break; - } - - i = @intCast((j - 1) - @intFromBool(tok_ids[j - 1] == .whitespace)); - if (!last_nl) try w.writeAll("\n"); - if (pp.linemarkers != .none) { - const next = pp.tokens.get(i); - const source = pp.comp.getSource(next.loc.id); - const line_col = source.lineCol(next.loc); - try pp.printLinemarker(w, line_col.line_no, source, .none); - last_nl = true; - } - continue :outer; - } - } - last_nl = true; - try w.writeAll("\n"); - }, - .keyword_pragma => { - const pragma_name = pp.expandedSlice(pp.tokens.get(i + 1)); - const end_idx = mem.indexOfScalarPos(Token.Id, tok_ids, i, .nl) orelse i + 1; - const pragma_len = @as(u32, @intCast(end_idx)) - i; - - if (pp.comp.getPragma(pragma_name)) |prag| { - if (!prag.shouldPreserveTokens(pp, i + 1)) { - try w.writeByte('\n'); - i += pragma_len; - cur = pp.tokens.get(i); - continue; - } - } - try w.writeAll("#pragma"); - i += 1; - while (true) : (i += 1) { - cur = pp.tokens.get(i); - if (cur.id == .nl) { - try w.writeByte('\n'); - last_nl = true; - break; - } - try w.writeByte(' '); - const slice = pp.expandedSlice(cur); - try w.writeAll(slice); - } - }, - .whitespace => { - var slice = pp.expandedSlice(cur); - while (mem.indexOfScalar(u8, slice, '\n')) |some| { - if (pp.linemarkers != .none) try w.writeByte('\n'); - slice = slice[some + 1 ..]; - } - for (slice) |_| try w.writeByte(' '); - last_nl = false; - }, - .include_start => { - const source = pp.comp.getSource(cur.loc.id); +fn fatalNotFound(pp: *Preprocessor, tok: PreprocessorToken, filename: []const u8) Compilation.Error { + const old = pp.comp.diagnostics.fatal_errors; + pp.comp.diagnostics.fatal_errors = true; + defer pp.comp.diagnostics.fatal_errors = old; - try pp.printLinemarker(w, 1, source, .start); - last_nl = true; - }, - .include_resume => { - const source = pp.comp.getSource(cur.loc.id); - const line_col = source.lineCol(cur.loc); - if (!last_nl) try w.writeAll("\n"); + try 
pp.comp.diagnostics.addExtra(pp.comp.langopts, .{ .tag = .cli_error, .loc = tok.loc, .extra = .{ + .str = try std.fmt.allocPrint(pp.comp.diagnostics.arena.allocator(), "'{s}' not found", .{filename}), + } }, tok.expansionSlice(), false); + unreachable; // addExtra should've returned FatalError +} - try pp.printLinemarker(w, line_col.line_no, source, .@"resume"); - last_nl = true; - }, - else => { - const slice = pp.expandedSlice(cur); - try w.writeAll(slice); - last_nl = false; - }, - } +/// Consume next token, error if it is not an identifier. +fn expectMacroName(pp: *Preprocessor) Error!?[]const u8 { + const macro_name = pp.getToken(); + if (!macro_name.id.isMacroIdentifier()) { + try pp.errTok(macro_name, .macro_name_missing); + pp.skipToNl(); + return null; } + return pp.tokSlice(macro_name); } -test "Preserve pragma tokens sometimes" { - const allocator = std.testing.allocator; - const Test = struct { - fn runPreprocessor(source_text: []const u8) ![]const u8 { - var buf = std.ArrayList(u8).init(allocator); - defer buf.deinit(); - - var comp = Compilation.init(allocator); - defer comp.deinit(); - - try comp.addDefaultPragmaHandlers(); - - var pp = Preprocessor.init(&comp); - defer pp.deinit(); - - pp.preserve_whitespace = true; - assert(pp.linemarkers == .none); - - const test_runner_macros = try comp.addSourceFromBuffer("", source_text); - const eof = try pp.preprocess(test_runner_macros); - try pp.addToken(eof); - try pp.prettyPrintTokens(buf.writer()); - return allocator.dupe(u8, buf.items); - } - - fn check(source_text: []const u8, expected: []const u8) !void { - const output = try runPreprocessor(source_text); - defer allocator.free(output); - - try std.testing.expectEqualStrings(expected, output); - } - }; - const preserve_gcc_diagnostic = - \\#pragma GCC diagnostic error "-Wnewline-eof" - \\#pragma GCC warning error "-Wnewline-eof" - \\int x; - \\#pragma GCC ignored error "-Wnewline-eof" - \\ - ; - try Test.check(preserve_gcc_diagnostic, preserve_gcc_diagnostic); - - const omit_once = - \\#pragma once - \\int x; - \\#pragma once - \\ - ; - // TODO should only be one newline afterwards when emulating clang - try Test.check(omit_once, "\nint x;\n\n"); - - const omit_poison = - \\#pragma GCC poison foobar - \\ - ; - try Test.check(omit_poison, "\n"); -} - -test "destringify" { - const allocator = std.testing.allocator; - const Test = struct { - fn testDestringify(pp: *Preprocessor, stringified: []const u8, destringified: []const u8) !void { - pp.char_buf.clearRetainingCapacity(); - try pp.char_buf.ensureUnusedCapacity(stringified.len); - pp.destringify(stringified); - try std.testing.expectEqualStrings(destringified, pp.char_buf.items); - } - }; - var comp = Compilation.init(allocator); - defer comp.deinit(); - var pp = Preprocessor.init(&comp); - defer pp.deinit(); - - try Test.testDestringify(&pp, "hello\tworld\n", "hello\tworld\n"); - try Test.testDestringify(&pp, - \\ \"FOO BAR BAZ\" - , - \\ "FOO BAR BAZ" - ); - try Test.testDestringify(&pp, - \\ \\t\\n - \\ - , - \\ \t\n - \\ - ); -} - -test "Include guards" { - const Test = struct { - /// This is here so that when #elifdef / #elifndef are added we don't forget - /// to test that they don't accidentally break include guard detection - fn pairsWithIfndef(tok_id: RawToken.Id) bool { - return switch (tok_id) { - .keyword_elif, - .keyword_elifdef, - .keyword_elifndef, - .keyword_else, - => true, - - .keyword_include, - .keyword_include_next, - .keyword_embed, - .keyword_define, - .keyword_defined, - .keyword_undef, - 
.keyword_ifdef, - .keyword_ifndef, - .keyword_error, - .keyword_warning, - .keyword_pragma, - .keyword_line, - .keyword_endif, - => false, - else => unreachable, - }; - } - - fn skippable(tok_id: RawToken.Id) bool { - return switch (tok_id) { - .keyword_defined, .keyword_va_args, .keyword_va_opt, .keyword_endif => true, - else => false, - }; - } - - fn testIncludeGuard(allocator: std.mem.Allocator, comptime template: []const u8, tok_id: RawToken.Id, expected_guards: u32) !void { - var comp = Compilation.init(allocator); - defer comp.deinit(); - var pp = Preprocessor.init(&comp); - defer pp.deinit(); - - const path = try std.fs.path.join(allocator, &.{ ".", "bar.h" }); - defer allocator.free(path); - - _ = try comp.addSourceFromBuffer(path, "int bar = 5;\n"); - - var buf = std.ArrayList(u8).init(allocator); - defer buf.deinit(); - - var writer = buf.writer(); - switch (tok_id) { - .keyword_include, .keyword_include_next => try writer.print(template, .{ tok_id.lexeme().?, " \"bar.h\"" }), - .keyword_define, .keyword_undef => try writer.print(template, .{ tok_id.lexeme().?, " BAR" }), - .keyword_ifndef, - .keyword_ifdef, - .keyword_elifdef, - .keyword_elifndef, - => try writer.print(template, .{ tok_id.lexeme().?, " BAR\n#endif" }), - else => try writer.print(template, .{ tok_id.lexeme().?, "" }), - } - const source = try comp.addSourceFromBuffer("test.h", buf.items); - _ = try pp.preprocess(source); - - try std.testing.expectEqual(expected_guards, pp.include_guards.count()); - } +/// Return the name of the #ifndef guard macro that starts a source, if any. +/// If a source starts with `#ifndef IDENTIFIER`, return `IDENTIFIER` +/// This function does not validate that the entire source is guarded by the +/// initial ifndef, if any +fn findIncludeGuard(pp: *Preprocessor, source: Source) ?[]const u8 { + var tokenizer = Tokenizer{ + .buf = source.buf, + .langopts = pp.comp.langopts, + .source = source.id, }; - const tags = std.meta.tags(RawToken.Id); - for (tags) |tag| { - if (Test.skippable(tag)) continue; - var copy = tag; - copy.simplifyMacroKeyword(); - if (copy != tag or tag == .keyword_else) { - const inside_ifndef_template = - \\//Leading comment (should be ignored) - \\ - \\#ifndef FOO - \\#{s}{s} - \\#endif - ; - const expected_guards: u32 = if (Test.pairsWithIfndef(tag)) 0 else 1; - try Test.testIncludeGuard(std.testing.allocator, inside_ifndef_template, tag, expected_guards); - - const outside_ifndef_template = - \\#ifndef FOO - \\#endif - \\#{s}{s} - ; - try Test.testIncludeGuard(std.testing.allocator, outside_ifndef_template, tag, 0); - } - } + var hash = tokenizer.nextNoWS(); + while (hash.id == .nl) hash = tokenizer.nextNoWS(); + if (hash.id != .hash) return null; + const ifndef = tokenizer.nextNoWS(); + if (ifndef.id != .keyword_ifndef) return null; + const guard = tokenizer.nextNoWS(); + if (guard.id != .identifier) return null; + return pp.tokSlice(.{ .id = guard.id, .loc = .{ .id = guard.source, .byte_offset = guard.start, .line = guard.line } }); } diff --git a/src/aro/Tree.zig b/src/aro/Tree.zig index f176930a..a58f42ea 100644 --- a/src/aro/Tree.zig +++ b/src/aro/Tree.zig @@ -6,11 +6,19 @@ const Compilation = @import("Compilation.zig"); const number_affixes = @import("Tree/number_affixes.zig"); const Source = @import("Source.zig"); const Tokenizer = @import("Tokenizer.zig"); +const Treap = @import("Treap.zig"); const Type = @import("Type.zig"); const Value = @import("Value.zig"); const StringInterner = @import("StringInterner.zig"); +const Flags = packed struct(u8) { + 
is_bol: bool = false, + space: bool = false, + _: u6 = undefined, +}; + pub const Token = struct { + flags: Flags, id: Id, loc: Source.Location, @@ -21,25 +29,36 @@ pub const Token = struct { }; pub const TokenWithExpansionLocs = struct { - id: Token.Id, - flags: packed struct { - expansion_disabled: bool = false, - is_macro_arg: bool = false, - } = .{}, - /// This location contains the actual token slice which might be generated. - /// If it is generated then there is guaranteed to be at least one - /// expansion location. + const Self = @This(); + + flags: Flags = .{}, + id: Tokenizer.Token.Id, + hideset: Treap.Node = null, loc: Source.Location, expansion_locs: ?[*]Source.Location = null, - pub fn expansionSlice(tok: TokenWithExpansionLocs) []const Source.Location { + pub fn toTreeToken(self: Self) Token { + return .{ .flags = self.flags, .id = self.id, .loc = self.loc }; + } + + pub fn argPosition(self: Self) u32 { + std.debug.assert(self.id == .macro_param); + return self.loc.byte_offset; + } + + pub fn isVarArg(self: Self) bool { + std.debug.assert(self.id == .macro_param); + return self.loc.line != 0; + } + + pub fn expansionSlice(tok: Self) []const Source.Location { const locs = tok.expansion_locs orelse return &[0]Source.Location{}; var i: usize = 0; while (locs[i].id != .unused) : (i += 1) {} return locs[0..i]; } - pub fn addExpansionLocation(tok: *TokenWithExpansionLocs, gpa: std.mem.Allocator, new: []const Source.Location) !void { + pub fn addExpansionLocation(tok: *Self, gpa: std.mem.Allocator, new: []const Source.Location) !void { if (new.len == 0 or tok.id == .whitespace or tok.id == .macro_ws or tok.id == .placemarker) return; var list = std.ArrayList(Source.Location).init(gpa); defer { @@ -80,14 +99,14 @@ pub const TokenWithExpansionLocs = struct { gpa.free(locs[0 .. 
i + 1]); } - pub fn dupe(tok: TokenWithExpansionLocs, gpa: std.mem.Allocator) !TokenWithExpansionLocs { + pub fn dupe(tok: Self, gpa: std.mem.Allocator) !Self { var copy = tok; copy.expansion_locs = null; try copy.addExpansionLocation(gpa, tok.expansionSlice()); return copy; } - pub fn checkMsEof(tok: TokenWithExpansionLocs, source: Source, comp: *Compilation) !void { + pub fn checkMsEof(tok: Self, source: Source, comp: *Compilation) !void { std.debug.assert(tok.id == .eof); if (source.buf.len > tok.loc.byte_offset and source.buf[tok.loc.byte_offset] == 0x1A) { try comp.addDiagnostic(.{ @@ -100,6 +119,9 @@ pub const TokenWithExpansionLocs = struct { }, &.{}); } } + + pub const one: Self = .{ .id = .one, .loc = .{} }; + pub const zero: Self = .{ .id = .zero, .loc = .{} }; }; pub const TokenIndex = u32; diff --git a/src/aro/pragmas/gcc.zig b/src/aro/pragmas/gcc.zig index 91ab750b..8887b632 100644 --- a/src/aro/pragmas/gcc.zig +++ b/src/aro/pragmas/gcc.zig @@ -69,7 +69,7 @@ fn diagnosticHandler(self: *GCC, pp: *Preprocessor, start_idx: TokenIndex) Pragm const diagnostic_tok = pp.tokens.get(start_idx); if (diagnostic_tok.id == .nl) return; - const diagnostic = std.meta.stringToEnum(Directive.Diagnostics, pp.expandedSlice(diagnostic_tok)) orelse + const diagnostic = std.meta.stringToEnum(Directive.Diagnostics, pp.tokSlice(diagnostic_tok)) orelse return error.UnknownPragma; switch (diagnostic) { @@ -112,7 +112,7 @@ fn preprocessorHandler(pragma: *Pragma, pp: *Preprocessor, start_idx: TokenIndex const directive_tok = pp.tokens.get(start_idx + 1); if (directive_tok.id == .nl) return; - const gcc_pragma = std.meta.stringToEnum(Directive, pp.expandedSlice(directive_tok)) orelse + const gcc_pragma = std.meta.stringToEnum(Directive, pp.tokSlice(directive_tok)) orelse return pp.comp.addDiagnostic(.{ .tag = .unknown_gcc_pragma, .loc = directive_tok.loc, @@ -159,7 +159,7 @@ fn preprocessorHandler(pragma: *Pragma, pp: *Preprocessor, start_idx: TokenIndex .loc = tok.loc, }, pp.expansionSlice(start_idx + i)); } - const str = pp.expandedSlice(tok); + const str = pp.tokSlice(tok); if (pp.defines.get(str) != null) { try pp.comp.addDiagnostic(.{ .tag = .pragma_poison_macro, @@ -177,7 +177,7 @@ fn parserHandler(pragma: *Pragma, p: *Parser, start_idx: TokenIndex) Compilation var self: *GCC = @fieldParentPtr("pragma", pragma); const directive_tok = p.pp.tokens.get(start_idx + 1); if (directive_tok.id == .nl) return; - const name = p.pp.expandedSlice(directive_tok); + const name = p.pp.tokSlice(directive_tok); if (mem.eql(u8, name, "diagnostic")) { return self.diagnosticHandler(p.pp, start_idx + 2) catch |err| switch (err) { error.UnknownPragma => {}, // handled during preprocessing @@ -190,7 +190,7 @@ fn parserHandler(pragma: *Pragma, p: *Parser, start_idx: TokenIndex) Compilation fn preserveTokens(_: *Pragma, pp: *Preprocessor, start_idx: TokenIndex) bool { const next = pp.tokens.get(start_idx + 1); if (next.id != .nl) { - const name = pp.expandedSlice(next); + const name = pp.tokSlice(next); if (mem.eql(u8, name, "poison")) { return false; } diff --git a/test/runner.zig b/test/runner.zig index a1d6cbf7..8e8a0fff 100644 --- a/test/runner.zig +++ b/test/runner.zig @@ -239,7 +239,7 @@ pub fn main() !void { try pp.addToken(eof); if (pp.defines.get("TESTS_SKIPPED")) |macro| { - if (macro.is_func or macro.tokens.len != 1 or macro.tokens[0].id != .pp_num) { + if (macro.kind == .func or macro.tokens.len != 1 or macro.tokens[0].id != .pp_num) { fail_count += 1; std.debug.print("invalid TESTS_SKIPPED, definition should 
contain exactly one integer literal {}\n", .{macro}); continue; @@ -380,7 +380,7 @@ pub fn main() !void { if (pp.defines.get("EXPECTED_OUTPUT")) |macro| blk: { if (comp.diagnostics.errors != 0) break :blk; - if (macro.is_func) { + if (macro.kind == .func) { fail_count += 1; std.debug.print("invalid EXPECTED_OUTPUT {}\n", .{macro}); continue; @@ -470,7 +470,7 @@ fn checkExpectedErrors(pp: *aro.Preprocessor, buf: *std.ArrayList(u8)) !?bool { defer m.deinit(); aro.Diagnostics.renderMessages(pp.comp, &m); - if (macro.is_func) { + if (macro.kind == .func) { std.debug.print("invalid EXPECTED_ERRORS {}\n", .{macro}); return false; } From efb35a1a1c777887cb0f223c2627a939f27159f5 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 18 Jul 2024 14:49:57 -0700 Subject: [PATCH 04/10] Preprocessor: basic pragma directive support --- src/aro/Preprocessor.zig | 80 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 8 deletions(-) diff --git a/src/aro/Preprocessor.zig b/src/aro/Preprocessor.zig index 33df74c8..d0eae66c 100644 --- a/src/aro/Preprocessor.zig +++ b/src/aro/Preprocessor.zig @@ -1309,9 +1309,39 @@ fn readLine(pp: *Preprocessor) Error!void { try pp.expectNewline(); } -fn readPragma(pp: *Preprocessor) Error!void { - // TODO - pp.skipToNl(); +fn readPragma(pp: *Preprocessor, pragma_tok: PreprocessorToken) Error!void { + const name_tok = pp.getToken(); + if (name_tok.id == .nl or name_tok.id == .eof) return; + + try pp.addToken(pragma_tok); + + const pragma_start: u32 = @intCast(pp.tokens.len); + try pp.addToken(name_tok); + + while (true) { + const next_tok = pp.getToken(); + if (next_tok.id == .eof) { + try pp.addToken(.{ + .id = .nl, + .loc = .{ .id = .generated }, + }); + break; + } + try pp.addToken(next_tok); + if (next_tok.id == .nl) break; + } + const name = pp.tokSlice(name_tok); + if (pp.comp.getPragma(name)) |prag| unknown: { + return prag.preprocessorCB(pp, pragma_start) catch |er| switch (er) { + error.UnknownPragma => break :unknown, + error.StopPreprocessing => { + _ = pp.tokenizers.pop(); + return; + }, + else => |e| return e, + }; + } + return pp.errTok(name_tok, .unknown_pragma); } fn readUndef(pp: *Preprocessor) Error!void { @@ -1367,6 +1397,7 @@ fn readIncludeExtra(pp: *Preprocessor, include_token: PreprocessorToken, which: }, &.{}); return error.FatalError; } + pp.preprocess_count += 1; try pp.tokenizers.append(pp.gpa, .{ .buf = source.buf, .langopts = pp.comp.langopts, @@ -1719,7 +1750,7 @@ fn readDirective(pp: *Preprocessor) Error!void { .keyword_include => try pp.readInclude(directive), .keyword_include_next => try pp.readIncludeNext(directive), .keyword_line => try pp.readLine(), - .keyword_pragma => try pp.readPragma(), + .keyword_pragma => try pp.readPragma(directive), .keyword_undef => try pp.readUndef(), .keyword_warning => try pp.readErrorMessage(directive, .warning_directive), .keyword_embed => try pp.readEmbed(directive), @@ -1789,6 +1820,7 @@ pub fn preprocess(pp: *Preprocessor, source: Source) !PreprocessorToken { const guard = pp.findIncludeGuard(source); try pp.guard_stack.append(pp.gpa, guard); + pp.preprocess_count += 1; try pp.tokenizers.append(pp.gpa, .{ .buf = source.buf, .langopts = pp.comp.langopts, @@ -1820,11 +1852,43 @@ const collapse_newlines = 8; /// Pretty print tokens and try to preserve whitespace. 
pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void { - var i: usize = 0; + const tok_ids = pp.tokens.items(.id); + var i: u32 = 0; + var last_nl = false; while (i < pp.tokens.len) : (i += 1) { - const tok = pp.tokens.get(i); - if (tok.id == .eof) break; - try pp.prettyPrintToken(w, tok); + var cur: Token = pp.tokens.get(i); + switch (cur.id) { + .eof => break, + .keyword_pragma => { + const pragma_name = pp.tokSlice(pp.tokens.get(i + 1)); + const end_idx = mem.indexOfScalarPos(Token.Id, tok_ids, i, .nl) orelse i + 1; + const pragma_len = @as(u32, @intCast(end_idx)) - i; + + if (pp.comp.getPragma(pragma_name)) |prag| { + if (!prag.shouldPreserveTokens(pp, i + 1)) { + try w.writeByte('\n'); + i += pragma_len; + cur = pp.tokens.get(i); + continue; + } + } + try w.writeAll("#pragma"); + i += 1; + while (true) : (i += 1) { + cur = pp.tokens.get(i); + if (cur.id == .nl) { + try w.writeByte('\n'); + last_nl = true; + break; + } + try w.writeByte(' '); + const slice = pp.tokSlice(cur); + try w.writeAll(slice); + } + + }, + else => try pp.prettyPrintToken(w, cur), + } } try w.writeByte('\n'); } From e8e87cf9649383ed42f76d328c1b7d516ad4e13f Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 18 Jul 2024 16:19:04 -0700 Subject: [PATCH 05/10] Preprocessor: allocate space before appending --- src/aro/Preprocessor.zig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/aro/Preprocessor.zig b/src/aro/Preprocessor.zig index d0eae66c..3bdfd4d9 100644 --- a/src/aro/Preprocessor.zig +++ b/src/aro/Preprocessor.zig @@ -600,10 +600,10 @@ fn stringize(pp: *Preprocessor, tmpl: PreprocessorToken, args_range: MacroArg) ! const args = args_range.slice(pp.macro_arg_tokens.items); for (args, 0..) |tok, i| { const slice = pp.tokSlice(tok); - if (slice.len > 0 and tok.flags.space and i != 0) { - try pp.comp.generated_buf.append(pp.gpa, ' '); - } - try pp.comp.generated_buf.appendSlice(pp.gpa, slice); + const needs_space = slice.len > 0 and tok.flags.space and i != 0; + const bytes_needed = slice.len + @intFromBool(needs_space); + try pp.comp.generated_buf.ensureUnusedCapacity(pp.gpa, bytes_needed); + pp.comp.generated_buf.appendSliceAssumeCapacity(pp.tokSlice(tok)); } try pp.comp.generated_buf.append(pp.gpa, '"'); var tok = tmpl; From a4a66abef8c44d746daad2320903ad72dab1e513 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 18 Jul 2024 21:12:45 -0700 Subject: [PATCH 06/10] Preprocessor: fix stringification --- src/aro/Preprocessor.zig | 69 ++++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 10 deletions(-) diff --git a/src/aro/Preprocessor.zig b/src/aro/Preprocessor.zig index 3bdfd4d9..ce432006 100644 --- a/src/aro/Preprocessor.zig +++ b/src/aro/Preprocessor.zig @@ -595,17 +595,67 @@ fn addHideSet(pp: *Preprocessor, toks: []PreprocessorToken, hideset: Treap.Node) } fn stringize(pp: *Preprocessor, tmpl: PreprocessorToken, args_range: MacroArg) !PreprocessorToken { + const char_buf_top = pp.char_buf.items.len; + defer pp.char_buf.items.len = char_buf_top; + const start = pp.comp.generated_buf.items.len; - try pp.comp.generated_buf.append(pp.gpa, '"'); + try pp.char_buf.append(pp.gpa, '"'); const args = args_range.slice(pp.macro_arg_tokens.items); - for (args, 0..) 
|tok, i| { - const slice = pp.tokSlice(tok); - const needs_space = slice.len > 0 and tok.flags.space and i != 0; - const bytes_needed = slice.len + @intFromBool(needs_space); - try pp.comp.generated_buf.ensureUnusedCapacity(pp.gpa, bytes_needed); - pp.comp.generated_buf.appendSliceAssumeCapacity(pp.tokSlice(tok)); + for (args) |tok| { + if (tok.flags.space and pp.char_buf.items.len - 1 > char_buf_top) { + try pp.char_buf.append(pp.gpa, ' '); + } + // backslashes not inside strings are not escaped + const is_str = switch (tok.id) { + .string_literal, + .string_literal_utf_16, + .string_literal_utf_8, + .string_literal_utf_32, + .string_literal_wide, + .char_literal, + .char_literal_utf_16, + .char_literal_utf_32, + .char_literal_wide, + => true, + else => false, + }; + + for (pp.tokSlice(tok)) |c| { + if (c == '"') + try pp.char_buf.appendSlice(pp.gpa, "\\\"") + else if (c == '\\' and is_str) + try pp.char_buf.appendSlice(pp.gpa, "\\\\") + else + try pp.char_buf.append(pp.gpa, c); + } } - try pp.comp.generated_buf.append(pp.gpa, '"'); + try pp.char_buf.ensureUnusedCapacity(pp.gpa, 2); + if (pp.char_buf.items[pp.char_buf.items.len - 1] != '\\') { + pp.char_buf.appendSliceAssumeCapacity("\"\n"); + } else { + pp.char_buf.appendAssumeCapacity('"'); + var tokenizer: Tokenizer = .{ + .buf = pp.char_buf.items, + .index = 0, + .source = .generated, + .langopts = pp.comp.langopts, + .line = 0, + }; + const item = tokenizer.next(); + if (item.id == .unterminated_string_literal) { + const tok = args[args.len - 1]; + try pp.comp.addDiagnostic(.{ + .tag = .invalid_pp_stringify_escape, + .loc = tok.loc, + }, tok.expansionSlice()); + pp.char_buf.items.len -= 2; // erase unpaired backslash and appended end quote + pp.char_buf.appendAssumeCapacity('"'); + } + pp.char_buf.appendAssumeCapacity('\n'); + } + + try pp.comp.generated_buf.appendSlice(pp.gpa, pp.char_buf.items[char_buf_top..]); + var tok = tmpl; tok.id = .string_literal; tok.loc = .{ @@ -1885,9 +1935,8 @@ pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void { const slice = pp.tokSlice(cur); try w.writeAll(slice); } - }, - else => try pp.prettyPrintToken(w, cur), + else => try pp.prettyPrintToken(w, cur), } } try w.writeByte('\n'); From e49dde70aba67c2940f385af588bd61539bc4376 Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Fri, 19 Jul 2024 14:26:00 -0700 Subject: [PATCH 07/10] Preprocessor: start working on expansion locations --- src/aro/Preprocessor.zig | 53 +++++++++++++++++----------------- src/aro/Tree.zig | 61 ++++++++++++++++++++-------------------- 2 files changed, 58 insertions(+), 56 deletions(-) diff --git a/src/aro/Preprocessor.zig b/src/aro/Preprocessor.zig index ce432006..6b0c8beb 100644 --- a/src/aro/Preprocessor.zig +++ b/src/aro/Preprocessor.zig @@ -221,14 +221,14 @@ fn handleCounterMacro(pp: *Preprocessor, tok: PreprocessorToken) Error!void { } fn makeGeneratedToken(pp: *Preprocessor, start: usize, id: Token.Id, source: PreprocessorToken) !PreprocessorToken { - const pasted_token = PreprocessorToken{ .id = id, .flags = source.flags, .loc = .{ + var pasted_token = PreprocessorToken{ .id = id, .flags = source.flags, .loc = .{ .id = .generated, .byte_offset = @intCast(start), .line = pp.generated_line, } }; pp.generated_line += 1; - // try pasted_token.addExpansionLocation(pp.gpa, &.{source.loc}); - // try pasted_token.addExpansionLocation(pp.gpa, source.expansionSlice()); + try pasted_token.addExpansionLocation(pp.gpa, source.loc); + // try pasted_token.addExpansionLocation(pp.gpa, source.expansionSlice()); TODO return 
pasted_token; } @@ -668,7 +668,6 @@ fn stringize(pp: *Preprocessor, tmpl: PreprocessorToken, args_range: MacroArg) ! } fn subst(pp: *Preprocessor, macro: *const Macro, macro_tok: PreprocessorToken, args: MacroArgList, hideset_arg: Treap.Node) ![]PreprocessorToken { - _ = macro_tok; var hideset = hideset_arg; var r: TokenList = .{}; defer r.deinit(pp.gpa); @@ -728,6 +727,10 @@ fn subst(pp: *Preprocessor, macro: *const Macro, macro_tok: PreprocessorToken, a try r.append(pp.gpa, t0); } try pp.addHideSet(r.items, hideset); + for (r.items) |*tok| { + try tok.addExpansionLocation(pp.gpa, macro_tok.loc); + try tok.addExpansionLocationList(pp.gpa, macro_tok.loc_list); + } return r.toOwnedSlice(pp.gpa); } @@ -838,6 +841,10 @@ fn readExpandNewline(pp: *Preprocessor) Error!PreprocessorToken { const new_hideset = try pp.treap.addNodeTo(tok.hideset, safe_name); const tokens = try pp.subst(macro, tok, MacroArgList.empty, new_hideset); + for (tokens) |*t| { + try t.addExpansionLocation(pp.gpa, tok.loc); + try t.addExpansionLocationList(pp.gpa, tok.loc_list); + } defer pp.gpa.free(tokens); pp.propagateSpace(tokens, tok); try pp.ungetAll(tokens); @@ -1830,8 +1837,6 @@ fn readEmbed(pp: *Preprocessor, directive_tok: PreprocessorToken) Error!void { if (embed_bytes.len == 0) return; - try pp.ensureUnusedTokenCapacity(2 * embed_bytes.len - 1); // N bytes and N-1 commas - // TODO: We currently only support systems with CHAR_BIT == 8 // If the target's CHAR_BIT is not 8, we need to write out correctly-sized embed_bytes // and correctly account for the target's endianness @@ -1843,14 +1848,14 @@ fn readEmbed(pp: *Preprocessor, directive_tok: PreprocessorToken) Error!void { try writer.print("{d}", .{byte}); var generated = try pp.makeGeneratedToken(start, .embed_byte, directive_tok); generated.flags.is_bol = true; - pp.addTokenAssumeCapacity(generated); + try pp.addToken(generated); } for (embed_bytes[1..]) |byte| { const start = pp.comp.generated_buf.items.len; try writer.print(",{d}", .{byte}); - pp.addTokenAssumeCapacity(.{ .id = .comma, .loc = .{ .id = .generated, .byte_offset = @intCast(start) } }); - pp.addTokenAssumeCapacity(try pp.makeGeneratedToken(start + 1, .embed_byte, directive_tok)); + try pp.addToken(.{ .id = .comma, .loc = .{ .id = .generated, .byte_offset = @intCast(start) } }); + try pp.addToken(try pp.makeGeneratedToken(start + 1, .embed_byte, directive_tok)); } try pp.comp.generated_buf.append(pp.gpa, '\n'); } @@ -1973,27 +1978,23 @@ pub fn expansionSlice(pp: *Preprocessor, tok: Tree.TokenIndex) []Source.Location } pub fn addToken(pp: *Preprocessor, tok: PreprocessorToken) !void { - if (tok.expansion_locs) |expansion_locs| { - try pp.expansion_entries.append(pp.gpa, .{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs }); - } - try pp.tokens.append(pp.gpa, tok.toTreeToken()); -} + var r: std.ArrayListUnmanaged(Source.Location) = .{}; + defer r.deinit(pp.gpa); -pub fn addTokenAssumeCapacity(pp: *Preprocessor, tok: PreprocessorToken) void { - if (tok.expansion_locs) |expansion_locs| { - pp.expansion_entries.appendAssumeCapacity(.{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs }); + var it = tok.loc_list.first; + while (it) |node| : (it = node.next) { + try r.append(pp.gpa, node.data); } - pp.tokens.appendAssumeCapacity(tok.toTreeToken()); -} + if (r.items.len > 0) { + // std.debug.print("gottem\n", .{}); + try r.append(pp.gpa, .{ .id = .unused, .byte_offset = 1 }); + try pp.expansion_entries.ensureUnusedCapacity(pp.gpa, 1); -pub fn ensureTotalTokenCapacity(pp: *Preprocessor, 
capacity: usize) !void { - try pp.tokens.ensureTotalCapacity(pp.gpa, capacity); - try pp.expansion_entries.ensureTotalCapacity(pp.gpa, capacity); -} + const items = try r.toOwnedSlice(pp.gpa); // TODO: reverse? + pp.expansion_entries.appendAssumeCapacity(.{ .idx = @intCast(pp.tokens.len), .locs = items.ptr }); + } -pub fn ensureUnusedTokenCapacity(pp: *Preprocessor, capacity: usize) !void { - try pp.tokens.ensureUnusedCapacity(pp.gpa, capacity); - try pp.expansion_entries.ensureUnusedCapacity(pp.gpa, capacity); + try pp.tokens.append(pp.gpa, tok.toTreeToken()); } fn skip( diff --git a/src/aro/Tree.zig b/src/aro/Tree.zig index a58f42ea..5756d37d 100644 --- a/src/aro/Tree.zig +++ b/src/aro/Tree.zig @@ -28,6 +28,8 @@ pub const Token = struct { pub const NumberSuffix = number_affixes.Suffix; }; +const LocList = std.SinglyLinkedList(Source.Location); + pub const TokenWithExpansionLocs = struct { const Self = @This(); @@ -36,6 +38,7 @@ pub const TokenWithExpansionLocs = struct { hideset: Treap.Node = null, loc: Source.Location, expansion_locs: ?[*]Source.Location = null, + loc_list: LocList = .{}, pub fn toTreeToken(self: Self) Token { return .{ .flags = self.flags, .id = self.id, .loc = self.loc }; @@ -58,37 +61,35 @@ pub const TokenWithExpansionLocs = struct { return locs[0..i]; } - pub fn addExpansionLocation(tok: *Self, gpa: std.mem.Allocator, new: []const Source.Location) !void { - if (new.len == 0 or tok.id == .whitespace or tok.id == .macro_ws or tok.id == .placemarker) return; - var list = std.ArrayList(Source.Location).init(gpa); - defer { - @memset(list.items.ptr[list.items.len..list.capacity], .{}); - // Add a sentinel to indicate the end of the list since - // the ArrayList's capacity isn't guaranteed to be exactly - // what we ask for. - if (list.capacity > 0) { - list.items.ptr[list.capacity - 1].byte_offset = 1; - } - tok.expansion_locs = list.items.ptr; - } - - if (tok.expansion_locs) |locs| { - var i: usize = 0; - while (locs[i].id != .unused) : (i += 1) {} - list.items = locs[0..i]; - while (locs[i].byte_offset != 1) : (i += 1) {} - list.capacity = i + 1; - } - - const min_len = @max(list.items.len + new.len + 1, 4); - const wanted_len = std.math.ceilPowerOfTwo(usize, min_len) catch - return error.OutOfMemory; - try list.ensureTotalCapacity(wanted_len); + pub fn addExpansionLocationList(tok: *Self, gpa: std.mem.Allocator, list: LocList) !void { + const first = tok.loc_list.first orelse return; + const new_list = list.first orelse return; + const end = first.findLast(); + // end.insertAfter(new_list); + _ = end; + _ = new_list; + // _ = end; + // const last = tok.loc_list.first.?.findLast(); + // _ = list; + // const last = first.findLast(); + // last.insertAfter(first); + _ = gpa; + // _ = last; + // // var it = list.first; + // // while (it) |node| : (it = node.next) { + // // // try r.append(pp.gpa, node.data); + // // } + // _ = tok; + // _ = gpa; + } - for (new) |new_loc| { - if (new_loc.id == .generated) continue; - list.appendAssumeCapacity(new_loc); - } + pub fn addExpansionLocation(tok: *Self, gpa: std.mem.Allocator, loc: Source.Location) !void { + _ = tok; + _ = gpa; + _ = loc; + // const node = try gpa.create(LocList.Node); + // node.* = .{ .data = loc }; + // tok.loc_list.prepend(node); } pub fn free(expansion_locs: ?[*]Source.Location, gpa: std.mem.Allocator) void { From 1f87ef0940d3efb9217414d643e9fa40fdeacaab Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Tue, 23 Jul 2024 23:07:39 -0700 Subject: [PATCH 08/10] Preprocessor: start fixing line markers --- 
src/aro/Preprocessor.zig | 121 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 1 deletion(-) diff --git a/src/aro/Preprocessor.zig b/src/aro/Preprocessor.zig index 6b0c8beb..372b3f3a 100644 --- a/src/aro/Preprocessor.zig +++ b/src/aro/Preprocessor.zig @@ -559,9 +559,12 @@ pub fn preprocessSources(pp: *Preprocessor, sources: []const Source) Error!void assert(sources.len > 1); const first = sources[0]; + try pp.addIncludeStart(first); for (sources[1..]) |header| { + try pp.addIncludeStart(header); _ = try pp.preprocess(header); } + try pp.addIncludeResume(first.id, 0, 1); const eof = try pp.preprocess(first); try pp.addToken(eof); } @@ -1101,6 +1104,24 @@ fn makeMacroToken(position: usize, is_vararg: bool) PreprocessorToken { }; } +pub fn addIncludeStart(pp: *Preprocessor, source: Source) !void { + if (pp.linemarkers == .none) return; + try pp.addToken(.{ .id = .include_start, .loc = .{ + .id = source.id, + .byte_offset = std.math.maxInt(u32), + .line = 1, + } }); +} + +pub fn addIncludeResume(pp: *Preprocessor, source: Source.Id, offset: u32, line: u32) !void { + if (pp.linemarkers == .none) return; + try pp.addToken(.{ .id = .include_resume, .loc = .{ + .id = source, + .byte_offset = offset, + .line = line, + } }); +} + fn next(pp: *Preprocessor, id: Tokenizer.Token.Id) !bool { const tok = pp.getToken(); if (tok.id == id) return true; @@ -1455,6 +1476,7 @@ fn readIncludeExtra(pp: *Preprocessor, include_token: PreprocessorToken, which: return error.FatalError; } pp.preprocess_count += 1; + try pp.addIncludeStart(source); try pp.tokenizers.append(pp.gpa, .{ .buf = source.buf, .langopts = pp.comp.langopts, @@ -1886,6 +1908,17 @@ pub fn preprocess(pp: *Preprocessor, source: Source) !PreprocessorToken { const tok = try pp.readToken(); if (tok.id == .eof) { const tokenizer = pp.tokenizers.pop(); + + if (pp.tokenizers.items.len > 0) { + var next_tok: RawToken = undefined; + var tmp = pp.tokenizers.items[pp.tokenizers.items.len - 1]; + while (true) { + next_tok = tmp.nextNoWS(); + if (next_tok.id != .nl) break; + } + try pp.addIncludeResume(next_tok.source, next_tok.end, next_tok.line); + } + const guard_name = pp.guard_stack.pop(); if (guard_name) |name| { try pp.include_guards.put(pp.gpa, tokenizer.source, name); @@ -1910,10 +1943,40 @@ pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void { const tok_ids = pp.tokens.items(.id); var i: u32 = 0; var last_nl = false; - while (i < pp.tokens.len) : (i += 1) { + outer: while (i < pp.tokens.len) : (i += 1) { var cur: Token = pp.tokens.get(i); switch (cur.id) { .eof => break, + .nl => { + var newlines: u32 = 0; + for (tok_ids[i..], i..) 
|id, j| { + if (id == .nl) { + newlines += 1; + } else if (id == .eof) { + if (!last_nl) try w.writeByte('\n'); + return; + } else if (id != .whitespace) { + if (pp.linemarkers == .none) { + if (newlines < 2) break; + } else if (newlines < collapse_newlines) { + break; + } + + i = @intCast((j - 1) - @intFromBool(tok_ids[j - 1] == .whitespace)); + if (!last_nl) try w.writeAll("\n"); + if (pp.linemarkers != .none) { + const next_tok = pp.tokens.get(i); + const source = pp.comp.getSource(next_tok.loc.id); + const line_col = source.lineCol(next_tok.loc); + try pp.printLinemarker(w, line_col.line_no, source, .none); + last_nl = true; + } + continue :outer; + } + } + last_nl = true; + try w.writeAll("\n"); + }, .keyword_pragma => { const pragma_name = pp.tokSlice(pp.tokens.get(i + 1)); const end_idx = mem.indexOfScalarPos(Token.Id, tok_ids, i, .nl) orelse i + 1; @@ -1941,12 +2004,68 @@ pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void { try w.writeAll(slice); } }, + .include_start => { + const source = pp.comp.getSource(cur.loc.id); + + try pp.printLinemarker(w, 1, source, .start); + last_nl = true; + }, + .include_resume => { + const source = pp.comp.getSource(cur.loc.id); + const line_col = source.lineCol(cur.loc); + if (!last_nl) try w.writeAll("\n"); + + try pp.printLinemarker(w, line_col.line_no, source, .@"resume"); + last_nl = true; + }, else => try pp.prettyPrintToken(w, cur), } } try w.writeByte('\n'); } +fn printLinemarker( + pp: *Preprocessor, + w: anytype, + line_no: u32, + source: Source, + start_resume: enum(u8) { start, @"resume", none }, +) !void { + try w.writeByte('#'); + if (pp.linemarkers == .line_directives) try w.writeAll("line"); + try w.print(" {d} \"", .{line_no}); + for (source.path) |byte| switch (byte) { + '\n' => try w.writeAll("\\n"), + '\r' => try w.writeAll("\\r"), + '\t' => try w.writeAll("\\t"), + '\\' => try w.writeAll("\\\\"), + '"' => try w.writeAll("\\\""), + ' ', '!', '#'...'&', '('...'[', ']'...'~' => try w.writeByte(byte), + // Use hex escapes for any non-ASCII/unprintable characters. + // This ensures that the parsed version of this string will end up + // containing the same bytes as the input regardless of encoding. 
+        else => {
+            try w.writeAll("\\x");
+            try std.fmt.formatInt(byte, 16, .lower, .{ .width = 2, .fill = '0' }, w);
+        },
+    };
+    try w.writeByte('"');
+    if (pp.linemarkers == .numeric_directives) {
+        switch (start_resume) {
+            .none => {},
+            .start => try w.writeAll(" 1"),
+            .@"resume" => try w.writeAll(" 2"),
+        }
+        switch (source.kind) {
+            .user => {},
+            .system => try w.writeAll(" 3"),
+            .extern_c_system => try w.writeAll(" 3 4"),
+        }
+    }
+    try w.writeByte('\n');
+}
+
 fn prettyPrintToken(pp: *Preprocessor, w: anytype, tok: Token) !void {
     if (tok.flags.is_bol) {
         try w.writeByte('\n');

From e6fe13a995eb313fd52fb28128b7280155423d10 Mon Sep 17 00:00:00 2001
From: Evan Haas
Date: Thu, 25 Jul 2024 11:34:19 -0700
Subject: [PATCH 09/10] import: fix spelling

---
 src/aro.zig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aro.zig b/src/aro.zig
index 45e90154..73f105f4 100644
--- a/src/aro.zig
+++ b/src/aro.zig
@@ -35,6 +35,6 @@ test {
     _ = @import("aro/target.zig");
     _ = @import("aro/Tokenizer.zig");
     _ = @import("aro/toolchains/Linux.zig");
-    _ = @import("aro/treap.zig");
+    _ = @import("aro/Treap.zig");
     _ = @import("aro/Value.zig");
 }

From 0d8bfaeec088105645cdc0c09ddec88772991181 Mon Sep 17 00:00:00 2001
From: Evan Haas
Date: Thu, 25 Jul 2024 22:12:43 -0700
Subject: [PATCH 10/10] Preprocessor: ignore newlines before func-like macro left paren

---
 src/aro/Preprocessor.zig | 16 +++++++++++++---
 ...standard-redefinition-reexamination-example.c | 5 ++---
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/aro/Preprocessor.zig b/src/aro/Preprocessor.zig
index 372b3f3a..13008292 100644
--- a/src/aro/Preprocessor.zig
+++ b/src/aro/Preprocessor.zig
@@ -854,7 +854,7 @@ fn readExpandNewline(pp: *Preprocessor) Error!PreprocessorToken {
             return pp.readExpand();
         },
         .func => {
-            if (!try pp.next(.l_paren)) return tok;
+            if (!try pp.getMacroLParen()) return tok;
             const arg_tokens_start = pp.macro_arg_tokens.items.len;
             defer pp.macro_arg_tokens.items.len = arg_tokens_start;
             const macro_args_start = pp.macro_args.items.len;
@@ -1129,6 +1129,17 @@ fn next(pp: *Preprocessor, id: Tokenizer.Token.Id) !bool {
     return false;
 }
 
+fn getMacroLParen(pp: *Preprocessor) !bool {
+    while (true) {
+        const tok = pp.getToken();
+        if (tok.id == .nl) continue;
+
+        if (tok.id == .l_paren) return true;
+        try pp.ungetToken(tok);
+        return false;
+    }
+}
+
 /// Returns true for vararg function-like macro, false otherwise
 fn readFunclikeMacroParams(pp: *Preprocessor, name: PreprocessorToken, l_paren: PreprocessorToken, params: *ParamMap) !bool {
     _ = name;
@@ -2017,7 +2028,7 @@ pub fn prettyPrintTokens(pp: *Preprocessor, w: anytype) !void {
 
             try pp.printLinemarker(w, line_col.line_no, source, .@"resume");
             last_nl = true;
-            },
+        },
         else => try pp.prettyPrintToken(w, cur),
     }
 }
@@ -2065,7 +2076,6 @@ fn printLinemarker(
     try w.writeByte('\n');
 }
 
-
 fn prettyPrintToken(pp: *Preprocessor, w: anytype, tok: Token) !void {
     if (tok.flags.is_bol) {
         try w.writeByte('\n');
diff --git a/test/cases/expanded/standard-redefinition-reexamination-example.c b/test/cases/expanded/standard-redefinition-reexamination-example.c
index 5e5688ce..a9673c54 100644
--- a/test/cases/expanded/standard-redefinition-reexamination-example.c
+++ b/test/cases/expanded/standard-redefinition-reexamination-example.c
@@ -1,5 +1,4 @@
 f(2 * (y+1)) + f(2 * (f(2 * (z[0])))) % f(2 * (0)) + t(1);
-f(2 * (2+(3,4)-0,1)) | f(2 * (~ 5)) & f(2 * (0,1))
-^m(0,1);
-int i[] = { 1, 23, 4, 5, };
+f(2 * (2+(3,4)-0,1)) | f(2 * (~ 5)) & f(2 * (0,1))^m(0,1);
+int i[] = { 1, 23, 4, 5, };
 char c[2][6] = { "hello", "" };
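
Note on PATCH 10: skipping .nl tokens in getMacroLParen means a function-like macro name may be separated from the '(' of its invocation by newlines, which is the usual C preprocessor behavior (only in a macro definition must the '(' immediately follow the name). The change to the expected test output above reflects this. A minimal C illustration of the accepted input, using a hypothetical macro DOUBLE rather than anything from the test suite:

    #define DOUBLE(x) ((x) * 2)

    int a = DOUBLE   /* the '(' of the invocation appears on the next line */
    (21);            /* still recognized as an invocation; expands to ((21) * 2) */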
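
Note on the linemarker changes earlier in the series: the include_start/include_resume tokens are printed by printLinemarker as GNU-cpp-style line markers in -E output when numeric directives are selected. A sketch of the expected shape, with the file name, include path, and line numbers invented purely for illustration:

    # 1 "main.c"
    # 1 "/usr/include/stdio.h" 1 3
    # 4 "main.c" 2

Here flag 1 marks entry into an included file, 2 marks resuming the including file, and 3 (or "3 4") tags system and extern "C" system headers, matching the values written by printLinemarker above; with -fuse-line-directives the same locations are emitted as #line directives without these flags.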