From b3e6f445666bd4fcdce308fd87e5a012d5a2de7f Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Tue, 23 Aug 2022 19:17:11 -0700 Subject: [PATCH] Preprocessor: use hidesets to manage token expansion disabling Based on https://www.spinellis.gr/blog/20060626/cpp.algo.pdf Closes #380 --- src/aro/Hideset.zig | 176 ++++++++++++++++++ src/aro/Preprocessor.zig | 44 ++++- .../recursive call non-expanded parens.c | 1 + test/cases/expanded/unspecified expansion.c | 2 +- .../recursive call non-expanded parens.c | 5 + test/cases/unspecified expansion.c | 2 +- 6 files changed, 224 insertions(+), 6 deletions(-) create mode 100644 src/aro/Hideset.zig create mode 100644 test/cases/expanded/recursive call non-expanded parens.c create mode 100644 test/cases/recursive call non-expanded parens.c diff --git a/src/aro/Hideset.zig b/src/aro/Hideset.zig new file mode 100644 index 00000000..e3e1620c --- /dev/null +++ b/src/aro/Hideset.zig @@ -0,0 +1,176 @@ +const std = @import("std"); +const mem = std.mem; +const Allocator = mem.Allocator; +const Source = @import("Source.zig"); +const Compilation = @import("Compilation.zig"); +const Tokenizer = @import("Tokenizer.zig"); + +pub const Hideset = @This(); + +const HashContext = struct { + pub fn hash(ctx: HashContext, key: Identifier) u64 { + _ = ctx; + return std.hash.Wyhash.hash(0, std.mem.asBytes(&key)); + } + pub fn eql(ctx: HashContext, a: Identifier, b: Identifier) bool { + _ = ctx; + return a.id == b.id and a.byte_offset == b.byte_offset; + } +}; + +const Identifier = packed struct(u64) { + id: Source.Id = .unused, + byte_offset: u32 = 0, + + fn slice(self: Identifier, comp: *const Compilation) []const u8 { + var tmp_tokenizer = Tokenizer{ + .buf = comp.getSource(self.id).buf, + .langopts = comp.langopts, + .index = self.byte_offset, + .source = .generated, + }; + const res = tmp_tokenizer.next(); + return tmp_tokenizer.buf[res.start..res.end]; + } +}; + +const Item = struct { + name: Identifier = .{}, + next: Index = .sentinel, + + const List = std.MultiArrayList(Item); +}; + +const Index = enum(u32) { + sentinel = std.math.maxInt(u32), + _, +}; + +map: std.HashMapUnmanaged(Identifier, Index, HashContext, std.hash_map.default_max_load_percentage) = .{}, +intersection_map: std.StringHashMapUnmanaged(void) = .{}, +linked_list: Item.List = .{}, +next_idx: Index = @enumFromInt(0), +comp: *const Compilation, + +const Iterator = struct { + slice: Item.List.Slice, + i: Index, + + fn next(self: *Iterator) ?Identifier { + if (self.i == .sentinel) return null; + defer self.i = self.slice.items(.next)[@intFromEnum(self.i)]; + return self.slice.items(.name)[@intFromEnum(self.i)]; + } +}; + +pub fn init(comp: *const Compilation) Hideset { + return Hideset{ + .comp = comp, + }; +} + +pub fn deinit(self: *Hideset) void { + self.map.deinit(self.comp.gpa); + self.intersection_map.deinit(self.comp.gpa); + self.linked_list.deinit(self.comp.gpa); +} + +pub fn clearRetainingCapacity(self: *Hideset) void { + self.next_idx = @enumFromInt(0); + self.map.clearRetainingCapacity(); +} + +pub fn iterator(self: *const Hideset, idx: Index) Iterator { + return Iterator{ + .slice = self.linked_list.slice(), + .i = idx, + }; +} + +pub fn get(self: *const Hideset, name: Identifier) Index { + return self.map.get(name) orelse .sentinel; +} + +pub fn put(self: *Hideset, key: Identifier, value: Index) !void { + try self.map.put(self.comp.gpa, key, value); +} + +pub fn ensureTotalCapacity(self: *Hideset, new_size: usize) !void { + try self.linked_list.ensureTotalCapacity(self.comp.gpa, new_size); +} + 
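+// Storage sketch (descriptive comments only, not normative): a hideset is a
+// persistent singly-linked list of Items in `linked_list`; `map` stores the
+// head Index, keyed by a token's source location. A list naming "foo" then
+// "bar" looks like:
+//
+//   map.get(tok_identifier) == i0
+//   linked_list[i0] == .{ .name = <location of "foo">, .next = i1 }
+//   linked_list[i1] == .{ .name = <location of "bar">, .next = .sentinel }
+//
+// Names are stored as source locations and re-tokenized on demand by
+// `Identifier.slice`; `prepend` and `@"union"` allocate fresh nodes rather
+// than mutating existing ones, so tails can be shared between hidesets.
+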
+/// Allocates a new item and returns its index +fn allocate(self: *Hideset, name: Identifier) !Index { + const next: Index = if (@intFromEnum(self.next_idx) < self.linked_list.len) self.next_idx else blk: { + const new_item_idx = try self.linked_list.addOne(self.comp.gpa); + break :blk @enumFromInt(new_item_idx); + }; + self.next_idx = @enumFromInt(@intFromEnum(next) + 1); + self.linked_list.set(@intFromEnum(next), .{ .name = name }); + return next; +} + +/// Create a new list with `name` at the front followed by `tail` +pub fn prepend(self: *Hideset, name: Identifier, tail: Index) !Index { + const new_idx = try self.allocate(name); + self.linked_list.items(.next)[@intFromEnum(new_idx)] = tail; + return new_idx; +} + +/// Copy a, then attach b at the end +pub fn @"union"(self: *Hideset, a: Index, b: Index) !Index { + var cur: Index = .sentinel; + var head: Index = b; + var it = self.iterator(a); + while (it.next()) |name| { + const new_idx = try self.allocate(name); + if (head == b) { + head = new_idx; + } + if (cur != .sentinel) { + self.linked_list.items(.next)[@intFromEnum(cur)] = new_idx; + } + cur = new_idx; + } + if (cur != .sentinel) { + self.linked_list.items(.next)[@intFromEnum(cur)] = b; + } + return head; +} + +pub fn contains(self: *const Hideset, list: Index, name: []const u8) bool { + var it = self.iterator(list); + while (it.next()) |item_name| { + const this = item_name.slice(self.comp); + if (mem.eql(u8, name, this)) return true; + } + return false; +} + +pub fn intersection(self: *Hideset, a: Index, b: Index) !Index { + if (a == .sentinel or b == .sentinel) return .sentinel; + self.intersection_map.clearRetainingCapacity(); + + var cur: Index = .sentinel; + var head: Index = .sentinel; + var it = self.iterator(a); + while (it.next()) |name| { + const str = name.slice(self.comp); + try self.intersection_map.put(self.comp.gpa, str, {}); + } + it = self.iterator(b); + while (it.next()) |name| { + const str = name.slice(self.comp); + if (self.intersection_map.contains(str)) { + const new_idx = try self.allocate(name); + if (head == .sentinel) { + head = new_idx; + } + if (cur != .sentinel) { + self.linked_list.items(.next)[@intFromEnum(cur)] = new_idx; + } + cur = new_idx; + } + } + return head; +} diff --git a/src/aro/Preprocessor.zig b/src/aro/Preprocessor.zig index 58af2099..3ebb8794 100644 --- a/src/aro/Preprocessor.zig +++ b/src/aro/Preprocessor.zig @@ -12,6 +12,7 @@ const Diagnostics = @import("Diagnostics.zig"); const Token = @import("Tree.zig").Token; const Attribute = @import("Attribute.zig"); const features = @import("features.zig"); +const Hideset = @import("Hideset.zig"); const DefineMap = std.StringHashMapUnmanaged(Macro); const RawTokenList = std.ArrayList(RawToken); @@ -93,6 +94,8 @@ preserve_whitespace: bool = false, /// linemarker tokens. 
Must be .none unless in -E mode (parser does not handle linemarkers) linemarkers: Linemarkers = .none, +hideset: Hideset, + pub const parse = Parser.parse; pub const Linemarkers = enum { @@ -113,6 +116,7 @@ pub fn init(comp: *Compilation) Preprocessor { .char_buf = std.ArrayList(u8).init(comp.gpa), .poisoned_identifiers = std.StringHashMap(void).init(comp.gpa), .top_expansion_buf = ExpandBuf.init(comp.gpa), + .hideset = Hideset.init(comp), }; comp.pragmaEvent(.before_preprocess); return pp; @@ -236,6 +240,7 @@ pub fn deinit(pp: *Preprocessor) void { pp.poisoned_identifiers.deinit(); pp.include_guards.deinit(pp.gpa); pp.top_expansion_buf.deinit(); + pp.hideset.deinit(); } /// Preprocess a compilation unit of sources into a parsable list of tokens. @@ -341,6 +346,7 @@ fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!Token { // Estimate how many new tokens this source will contain. const estimated_token_count = source.buf.len / 8; try pp.tokens.ensureTotalCapacity(pp.gpa, pp.tokens.len + estimated_token_count); + try pp.hideset.ensureTotalCapacity(1024); var if_level: u8 = 0; var if_kind = std.PackedIntArray(u2, 256).init([1]u2{0} ** 256); @@ -818,6 +824,7 @@ fn expr(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!bool { } else unreachable; if (pp.top_expansion_buf.items.len != 0) { pp.expansion_source_loc = pp.top_expansion_buf.items[0].loc; + pp.hideset.clearRetainingCapacity(); try pp.expandMacroExhaustive(tokenizer, &pp.top_expansion_buf, 0, pp.top_expansion_buf.items.len, false, .expr); } for (pp.top_expansion_buf.items) |tok| { @@ -1948,6 +1955,7 @@ fn collectMacroFuncArguments( end_idx: *usize, extend_buf: bool, is_builtin: bool, + r_paren: *Token, ) !MacroArguments { const name_tok = buf.items[start_idx.*]; const saved_tokenizer = tokenizer.*; @@ -2002,6 +2010,7 @@ fn collectMacroFuncArguments( const owned = try curArgument.toOwnedSlice(); errdefer pp.gpa.free(owned); try args.append(owned); + r_paren.* = tok; break; } else { const duped = try tok.dupe(pp.gpa); @@ -2108,13 +2117,24 @@ fn expandMacroExhaustive( idx += it.i; continue; } - const macro_entry = pp.defines.getPtr(pp.expandedSlice(macro_tok)); - if (macro_entry == null or !shouldExpand(buf.items[idx], macro_entry.?)) { + if (!macro_tok.id.isMacroIdentifier() or macro_tok.flags.expansion_disabled) { idx += 1; continue; } - if (macro_entry) |macro| macro_handler: { + const expanded = pp.expandedSlice(macro_tok); + const macro = pp.defines.getPtr(expanded) orelse { + idx += 1; + continue; + }; + const macro_hidelist = pp.hideset.get(.{ .id = macro_tok.loc.id, .byte_offset = macro_tok.loc.byte_offset }); + if (pp.hideset.contains(macro_hidelist, expanded)) { + idx += 1; + continue; + } + + macro_handler: { if (macro.is_func) { + var r_paren: Token = undefined; var macro_scan_idx = idx; // to be saved in case this doesn't turn out to be a call const args = pp.collectMacroFuncArguments( @@ -2124,6 +2144,7 @@ fn expandMacroExhaustive( &moving_end_idx, extend_buf, macro.is_builtin, + &r_paren, ) catch |er| switch (er) { error.MissingLParen => { if (!buf.items[idx].flags.is_macro_arg) buf.items[idx].flags.expansion_disabled = true; @@ -2137,12 +2158,16 @@ fn expandMacroExhaustive( }, else => |e| return e, }; + assert(r_paren.id == .r_paren); defer { for (args.items) |item| { pp.gpa.free(item); } args.deinit(); } + const r_paren_hidelist = pp.hideset.get(.{ .id = r_paren.loc.id, .byte_offset = r_paren.loc.byte_offset }); + var hs = try pp.hideset.intersection(macro_hidelist, r_paren_hidelist); + hs = try 
pp.hideset.prepend(.{ .id = macro_tok.loc.id, .byte_offset = macro_tok.loc.byte_offset }, hs); var args_count: u32 = @intCast(args.items.len); // if the macro has zero arguments g() args_count is still 1 @@ -2199,6 +2224,9 @@ fn expandMacroExhaustive( for (res.items) |*tok| { try tok.addExpansionLocation(pp.gpa, &.{macro_tok.loc}); try tok.addExpansionLocation(pp.gpa, macro_expansion_locs); + const tok_hidelist = pp.hideset.get(.{ .id = tok.loc.id, .byte_offset = tok.loc.byte_offset }); + const new_hidelist = try pp.hideset.@"union"(tok_hidelist, hs); + try pp.hideset.put(.{ .id = tok.loc.id, .byte_offset = tok.loc.byte_offset }, new_hidelist); } const tokens_removed = macro_scan_idx - idx + 1; @@ -2215,12 +2243,19 @@ fn expandMacroExhaustive( const res = try pp.expandObjMacro(macro); defer res.deinit(); + const hs = try pp.hideset.prepend(.{ .id = macro_tok.loc.id, .byte_offset = macro_tok.loc.byte_offset }, macro_hidelist); + const macro_expansion_locs = macro_tok.expansionSlice(); var increment_idx_by = res.items.len; for (res.items, 0..) |*tok, i| { tok.flags.is_macro_arg = macro_tok.flags.is_macro_arg; try tok.addExpansionLocation(pp.gpa, &.{macro_tok.loc}); try tok.addExpansionLocation(pp.gpa, macro_expansion_locs); + + const tok_hidelist = pp.hideset.get(.{ .id = tok.loc.id, .byte_offset = tok.loc.byte_offset }); + const new_hidelist = try pp.hideset.@"union"(tok_hidelist, hs); + try pp.hideset.put(.{ .id = tok.loc.id, .byte_offset = tok.loc.byte_offset }, new_hidelist); + if (tok.id == .keyword_defined and eval_ctx == .expr) { try pp.comp.addDiagnostic(.{ .tag = .expansion_to_defined, @@ -2266,6 +2301,7 @@ fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroErr try pp.top_expansion_buf.append(source_tok); pp.expansion_source_loc = source_tok.loc; + pp.hideset.clearRetainingCapacity(); try pp.expandMacroExhaustive(tokenizer, &pp.top_expansion_buf, 0, 1, true, .non_expr); try pp.tokens.ensureUnusedCapacity(pp.gpa, pp.top_expansion_buf.items.len); for (pp.top_expansion_buf.items) |*tok| { @@ -2312,7 +2348,7 @@ fn expandedSliceExtra(pp: *const Preprocessor, tok: Token, macro_ws_handling: en } /// Get expanded token source string. 
-pub fn expandedSlice(pp: *Preprocessor, tok: Token) []const u8 {
+pub fn expandedSlice(pp: *const Preprocessor, tok: Token) []const u8 {
     return pp.expandedSliceExtra(tok, .single_macro_ws);
 }
diff --git a/test/cases/expanded/recursive call non-expanded parens.c b/test/cases/expanded/recursive call non-expanded parens.c
new file mode 100644
index 00000000..33f7ad9a
--- /dev/null
+++ b/test/cases/expanded/recursive call non-expanded parens.c
@@ -0,0 +1 @@
+1 2 1 bar
diff --git a/test/cases/expanded/unspecified expansion.c b/test/cases/expanded/unspecified expansion.c
index 240dc5b2..738d3966 100644
--- a/test/cases/expanded/unspecified expansion.c
+++ b/test/cases/expanded/unspecified expansion.c
@@ -1 +1 @@
-2*f(9)
+2*9*g
diff --git a/test/cases/recursive call non-expanded parens.c b/test/cases/recursive call non-expanded parens.c
new file mode 100644
index 00000000..5dc15bd9
--- /dev/null
+++ b/test/cases/recursive call non-expanded parens.c
@@ -0,0 +1,5 @@
+//aro-args -E -P
+#define foo(X) 1 bar
+#define bar(X) 2 foo
+
+foo(X)(Y)(Z)
diff --git a/test/cases/unspecified expansion.c b/test/cases/unspecified expansion.c
index 55cbe3d3..d828307a 100644
--- a/test/cases/unspecified expansion.c
+++ b/test/cases/unspecified expansion.c
@@ -1,6 +1,6 @@
 //aro-args -E -P
 // This can either expand as 2*f(9) or as 2*9*g (see 6.10.3.4 in the standard)
-// Currently arocc does the former, but gcc and clang do the latter
+// We follow gcc and clang in expanding it to 2*9*g
 #define f(a) a*g
 #define g(a) f(a)
 f(2)(9)
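
For reviewers: the hideset rules applied above follow the Prosser algorithm from the linked paper — a function-like expansion gives its result tokens the hideset (HS(macro token) ∩ HS(r_paren)) ∪ {macro name}, unioned with whatever each token already carried. The standalone sketch below traces the new `recursive call non-expanded parens.c` test through those rules. It is illustrative only: the slice-of-strings representation and the helper names `setUnion`/`setIntersection` are not aro APIs, just a minimal stand-in for `Hideset.@"union"`, `Hideset.intersection`, and `Hideset.contains`.

const std = @import("std");

fn contains(hs: []const []const u8, name: []const u8) bool {
    for (hs) |n| {
        if (std.mem.eql(u8, n, name)) return true;
    }
    return false;
}

/// Union: all of `a`, plus any name in `b` not already in `a`.
fn setUnion(gpa: std.mem.Allocator, a: []const []const u8, b: []const []const u8) ![]const []const u8 {
    var out = std.ArrayList([]const u8).init(gpa);
    errdefer out.deinit();
    try out.appendSlice(a);
    for (b) |n| {
        if (!contains(a, n)) try out.append(n);
    }
    return try out.toOwnedSlice();
}

/// Intersection: names present in both `a` and `b`.
fn setIntersection(gpa: std.mem.Allocator, a: []const []const u8, b: []const []const u8) ![]const []const u8 {
    var out = std.ArrayList([]const u8).init(gpa);
    errdefer out.deinit();
    for (a) |n| {
        if (contains(b, n)) try out.append(n);
    }
    return try out.toOwnedSlice();
}

test "hideset evolution for foo(X)(Y)(Z)" {
    const gpa = std.testing.allocator;

    // foo(X): result tokens `1 bar` get (HS(foo) ∩ HS(rparen)) ∪ {"foo"} = {"foo"}.
    const after_foo = try setUnion(gpa, &.{}, &.{"foo"});
    defer gpa.free(after_foo);

    // `bar` carries {"foo"}, so bar(Y) is still expandable...
    try std.testing.expect(!contains(after_foo, "bar"));

    // ...and its result tokens `2 foo` get ({"foo"} ∩ {}) ∪ {"bar"} = {"bar"}.
    const inter = try setIntersection(gpa, after_foo, &.{});
    defer gpa.free(inter);
    const after_bar = try setUnion(gpa, inter, &.{"bar"});
    defer gpa.free(after_bar);
    try std.testing.expectEqual(@as(usize, 1), after_bar.len);

    // `foo` carries only {"bar"}, so foo(Z) expands once more, yielding `1 2 1 bar`.
    try std.testing.expect(!contains(after_bar, "foo"));
}

Because each r_paren comes from the source (empty hideset), the intersection step keeps recursive calls through fresh parentheses expandable, while a macro name inside its own expansion stays hidden — which is exactly why the output is `1 2 1 bar` and why `f(2)(9)` now yields `2*9*g`.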