From b3e6f445666bd4fcdce308fd87e5a012d5a2de7f Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Tue, 23 Aug 2022 19:17:11 -0700 Subject: [PATCH] Preprocessor: use hidesets to manage token expansion disabling Based on https://www.spinellis.gr/blog/20060626/cpp.algo.pdf Closes #380 --- src/aro/Hideset.zig | 176 ++++++++++++++++++ src/aro/Preprocessor.zig | 44 ++++- .../recursive call non-expanded parens.c | 1 + test/cases/expanded/unspecified expansion.c | 2 +- .../recursive call non-expanded parens.c | 5 + test/cases/unspecified expansion.c | 2 +- 6 files changed, 224 insertions(+), 6 deletions(-) create mode 100644 src/aro/Hideset.zig create mode 100644 test/cases/expanded/recursive call non-expanded parens.c create mode 100644 test/cases/recursive call non-expanded parens.c diff --git a/src/aro/Hideset.zig b/src/aro/Hideset.zig new file mode 100644 index 00000000..e3e1620c --- /dev/null +++ b/src/aro/Hideset.zig @@ -0,0 +1,176 @@ +const std = @import("std"); +const mem = std.mem; +const Allocator = mem.Allocator; +const Source = @import("Source.zig"); +const Compilation = @import("Compilation.zig"); +const Tokenizer = @import("Tokenizer.zig"); + +pub const Hideset = @This(); + +const HashContext = struct { + pub fn hash(ctx: HashContext, key: Identifier) u64 { + _ = ctx; + return std.hash.Wyhash.hash(0, std.mem.asBytes(&key)); + } + pub fn eql(ctx: HashContext, a: Identifier, b: Identifier) bool { + _ = ctx; + return a.id == b.id and a.byte_offset == b.byte_offset; + } +}; + +const Identifier = packed struct(u64) { + id: Source.Id = .unused, + byte_offset: u32 = 0, + + fn slice(self: Identifier, comp: *const Compilation) []const u8 { + var tmp_tokenizer = Tokenizer{ + .buf = comp.getSource(self.id).buf, + .langopts = comp.langopts, + .index = self.byte_offset, + .source = .generated, + }; + const res = tmp_tokenizer.next(); + return tmp_tokenizer.buf[res.start..res.end]; + } +}; + +const Item = struct { + name: Identifier = .{}, + next: Index = .sentinel, + + const List = std.MultiArrayList(Item); +}; + +const Index = enum(u32) { + sentinel = std.math.maxInt(u32), + _, +}; + +map: std.HashMapUnmanaged(Identifier, Index, HashContext, std.hash_map.default_max_load_percentage) = .{}, +intersection_map: std.StringHashMapUnmanaged(void) = .{}, +linked_list: Item.List = .{}, +next_idx: Index = @enumFromInt(0), +comp: *const Compilation, + +const Iterator = struct { + slice: Item.List.Slice, + i: Index, + + fn next(self: *Iterator) ?Identifier { + if (self.i == .sentinel) return null; + defer self.i = self.slice.items(.next)[@intFromEnum(self.i)]; + return self.slice.items(.name)[@intFromEnum(self.i)]; + } +}; + +pub fn init(comp: *const Compilation) Hideset { + return Hideset{ + .comp = comp, + }; +} + +pub fn deinit(self: *Hideset) void { + self.map.deinit(self.comp.gpa); + self.intersection_map.deinit(self.comp.gpa); + self.linked_list.deinit(self.comp.gpa); +} + +pub fn clearRetainingCapacity(self: *Hideset) void { + self.next_idx = @enumFromInt(0); + self.map.clearRetainingCapacity(); +} + +pub fn iterator(self: *const Hideset, idx: Index) Iterator { + return Iterator{ + .slice = self.linked_list.slice(), + .i = idx, + }; +} + +pub fn get(self: *const Hideset, name: Identifier) Index { + return self.map.get(name) orelse .sentinel; +} + +pub fn put(self: *Hideset, key: Identifier, value: Index) !void { + try self.map.put(self.comp.gpa, key, value); +} + +pub fn ensureTotalCapacity(self: *Hideset, new_size: usize) !void { + try self.linked_list.ensureTotalCapacity(self.comp.gpa, new_size); +} + 
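+// Storage sketch (descriptive comments only, not normative): a hideset is a
+// persistent singly-linked list of Items in `linked_list`; `map` stores the
+// head Index, keyed by a token's source location. A list naming "foo" then
+// "bar" looks like:
+//
+//   map.get(tok_identifier) == i0
+//   linked_list[i0] == .{ .name = <location of "foo">, .next = i1 }
+//   linked_list[i1] == .{ .name = <location of "bar">, .next = .sentinel }
+//
+// Names are stored as source locations and re-tokenized on demand by
+// `Identifier.slice`; `prepend` and `@"union"` allocate fresh nodes rather
+// than mutating existing ones, so tails can be shared between hidesets.
+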
+/// Allocates a new item and returns its index +fn allocate(self: *Hideset, name: Identifier) !Index { + const next: Index = if (@intFromEnum(self.next_idx) < self.linked_list.len) self.next_idx else blk: { + const new_item_idx = try self.linked_list.addOne(self.comp.gpa); + break :blk @enumFromInt(new_item_idx); + }; + self.next_idx = @enumFromInt(@intFromEnum(next) + 1); + self.linked_list.set(@intFromEnum(next), .{ .name = name }); + return next; +} + +/// Create a new list with `name` at the front followed by `tail` +pub fn prepend(self: *Hideset, name: Identifier, tail: Index) !Index { + const new_idx = try self.allocate(name); + self.linked_list.items(.next)[@intFromEnum(new_idx)] = tail; + return new_idx; +} + +/// Copy a, then attach b at the end +pub fn @"union"(self: *Hideset, a: Index, b: Index) !Index { + var cur: Index = .sentinel; + var head: Index = b; + var it = self.iterator(a); + while (it.next()) |name| { + const new_idx = try self.allocate(name); + if (head == b) { + head = new_idx; + } + if (cur != .sentinel) { + self.linked_list.items(.next)[@intFromEnum(cur)] = new_idx; + } + cur = new_idx; + } + if (cur != .sentinel) { + self.linked_list.items(.next)[@intFromEnum(cur)] = b; + } + return head; +} + +pub fn contains(self: *const Hideset, list: Index, name: []const u8) bool { + var it = self.iterator(list); + while (it.next()) |item_name| { + const this = item_name.slice(self.comp); + if (mem.eql(u8, name, this)) return true; + } + return false; +} + +pub fn intersection(self: *Hideset, a: Index, b: Index) !Index { + if (a == .sentinel or b == .sentinel) return .sentinel; + self.intersection_map.clearRetainingCapacity(); + + var cur: Index = .sentinel; + var head: Index = .sentinel; + var it = self.iterator(a); + while (it.next()) |name| { + const str = name.slice(self.comp); + try self.intersection_map.put(self.comp.gpa, str, {}); + } + it = self.iterator(b); + while (it.next()) |name| { + const str = name.slice(self.comp); + if (self.intersection_map.contains(str)) { + const new_idx = try self.allocate(name); + if (head == .sentinel) { + head = new_idx; + } + if (cur != .sentinel) { + self.linked_list.items(.next)[@intFromEnum(cur)] = new_idx; + } + cur = new_idx; + } + } + return head; +} diff --git a/src/aro/Preprocessor.zig b/src/aro/Preprocessor.zig index 58af2099..3ebb8794 100644 --- a/src/aro/Preprocessor.zig +++ b/src/aro/Preprocessor.zig @@ -12,6 +12,7 @@ const Diagnostics = @import("Diagnostics.zig"); const Token = @import("Tree.zig").Token; const Attribute = @import("Attribute.zig"); const features = @import("features.zig"); +const Hideset = @import("Hideset.zig"); const DefineMap = std.StringHashMapUnmanaged(Macro); const RawTokenList = std.ArrayList(RawToken); @@ -93,6 +94,8 @@ preserve_whitespace: bool = false, /// linemarker tokens. 
Must be .none unless in -E mode (parser does not handle linemarkers) linemarkers: Linemarkers = .none, +hideset: Hideset, + pub const parse = Parser.parse; pub const Linemarkers = enum { @@ -113,6 +116,7 @@ pub fn init(comp: *Compilation) Preprocessor { .char_buf = std.ArrayList(u8).init(comp.gpa), .poisoned_identifiers = std.StringHashMap(void).init(comp.gpa), .top_expansion_buf = ExpandBuf.init(comp.gpa), + .hideset = Hideset.init(comp), }; comp.pragmaEvent(.before_preprocess); return pp; @@ -236,6 +240,7 @@ pub fn deinit(pp: *Preprocessor) void { pp.poisoned_identifiers.deinit(); pp.include_guards.deinit(pp.gpa); pp.top_expansion_buf.deinit(); + pp.hideset.deinit(); } /// Preprocess a compilation unit of sources into a parsable list of tokens. @@ -341,6 +346,7 @@ fn preprocessExtra(pp: *Preprocessor, source: Source) MacroError!Token { // Estimate how many new tokens this source will contain. const estimated_token_count = source.buf.len / 8; try pp.tokens.ensureTotalCapacity(pp.gpa, pp.tokens.len + estimated_token_count); + try pp.hideset.ensureTotalCapacity(1024); var if_level: u8 = 0; var if_kind = std.PackedIntArray(u2, 256).init([1]u2{0} ** 256); @@ -818,6 +824,7 @@ fn expr(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!bool { } else unreachable; if (pp.top_expansion_buf.items.len != 0) { pp.expansion_source_loc = pp.top_expansion_buf.items[0].loc; + pp.hideset.clearRetainingCapacity(); try pp.expandMacroExhaustive(tokenizer, &pp.top_expansion_buf, 0, pp.top_expansion_buf.items.len, false, .expr); } for (pp.top_expansion_buf.items) |tok| { @@ -1948,6 +1955,7 @@ fn collectMacroFuncArguments( end_idx: *usize, extend_buf: bool, is_builtin: bool, + r_paren: *Token, ) !MacroArguments { const name_tok = buf.items[start_idx.*]; const saved_tokenizer = tokenizer.*; @@ -2002,6 +2010,7 @@ fn collectMacroFuncArguments( const owned = try curArgument.toOwnedSlice(); errdefer pp.gpa.free(owned); try args.append(owned); + r_paren.* = tok; break; } else { const duped = try tok.dupe(pp.gpa); @@ -2108,13 +2117,24 @@ fn expandMacroExhaustive( idx += it.i; continue; } - const macro_entry = pp.defines.getPtr(pp.expandedSlice(macro_tok)); - if (macro_entry == null or !shouldExpand(buf.items[idx], macro_entry.?)) { + if (!macro_tok.id.isMacroIdentifier() or macro_tok.flags.expansion_disabled) { idx += 1; continue; } - if (macro_entry) |macro| macro_handler: { + const expanded = pp.expandedSlice(macro_tok); + const macro = pp.defines.getPtr(expanded) orelse { + idx += 1; + continue; + }; + const macro_hidelist = pp.hideset.get(.{ .id = macro_tok.loc.id, .byte_offset = macro_tok.loc.byte_offset }); + if (pp.hideset.contains(macro_hidelist, expanded)) { + idx += 1; + continue; + } + + macro_handler: { if (macro.is_func) { + var r_paren: Token = undefined; var macro_scan_idx = idx; // to be saved in case this doesn't turn out to be a call const args = pp.collectMacroFuncArguments( @@ -2124,6 +2144,7 @@ fn expandMacroExhaustive( &moving_end_idx, extend_buf, macro.is_builtin, + &r_paren, ) catch |er| switch (er) { error.MissingLParen => { if (!buf.items[idx].flags.is_macro_arg) buf.items[idx].flags.expansion_disabled = true; @@ -2137,12 +2158,16 @@ fn expandMacroExhaustive( }, else => |e| return e, }; + assert(r_paren.id == .r_paren); defer { for (args.items) |item| { pp.gpa.free(item); } args.deinit(); } + const r_paren_hidelist = pp.hideset.get(.{ .id = r_paren.loc.id, .byte_offset = r_paren.loc.byte_offset }); + var hs = try pp.hideset.intersection(macro_hidelist, r_paren_hidelist); + hs = try 
pp.hideset.prepend(.{ .id = macro_tok.loc.id, .byte_offset = macro_tok.loc.byte_offset }, hs); var args_count: u32 = @intCast(args.items.len); // if the macro has zero arguments g() args_count is still 1 @@ -2199,6 +2224,9 @@ fn expandMacroExhaustive( for (res.items) |*tok| { try tok.addExpansionLocation(pp.gpa, &.{macro_tok.loc}); try tok.addExpansionLocation(pp.gpa, macro_expansion_locs); + const tok_hidelist = pp.hideset.get(.{ .id = tok.loc.id, .byte_offset = tok.loc.byte_offset }); + const new_hidelist = try pp.hideset.@"union"(tok_hidelist, hs); + try pp.hideset.put(.{ .id = tok.loc.id, .byte_offset = tok.loc.byte_offset }, new_hidelist); } const tokens_removed = macro_scan_idx - idx + 1; @@ -2215,12 +2243,19 @@ fn expandMacroExhaustive( const res = try pp.expandObjMacro(macro); defer res.deinit(); + const hs = try pp.hideset.prepend(.{ .id = macro_tok.loc.id, .byte_offset = macro_tok.loc.byte_offset }, macro_hidelist); + const macro_expansion_locs = macro_tok.expansionSlice(); var increment_idx_by = res.items.len; for (res.items, 0..) |*tok, i| { tok.flags.is_macro_arg = macro_tok.flags.is_macro_arg; try tok.addExpansionLocation(pp.gpa, &.{macro_tok.loc}); try tok.addExpansionLocation(pp.gpa, macro_expansion_locs); + + const tok_hidelist = pp.hideset.get(.{ .id = tok.loc.id, .byte_offset = tok.loc.byte_offset }); + const new_hidelist = try pp.hideset.@"union"(tok_hidelist, hs); + try pp.hideset.put(.{ .id = tok.loc.id, .byte_offset = tok.loc.byte_offset }, new_hidelist); + if (tok.id == .keyword_defined and eval_ctx == .expr) { try pp.comp.addDiagnostic(.{ .tag = .expansion_to_defined, @@ -2266,6 +2301,7 @@ fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroErr try pp.top_expansion_buf.append(source_tok); pp.expansion_source_loc = source_tok.loc; + pp.hideset.clearRetainingCapacity(); try pp.expandMacroExhaustive(tokenizer, &pp.top_expansion_buf, 0, 1, true, .non_expr); try pp.tokens.ensureUnusedCapacity(pp.gpa, pp.top_expansion_buf.items.len); for (pp.top_expansion_buf.items) |*tok| { @@ -2312,7 +2348,7 @@ fn expandedSliceExtra(pp: *const Preprocessor, tok: Token, macro_ws_handling: en } /// Get expanded token source string. 
-pub fn expandedSlice(pp: *Preprocessor, tok: Token) []const u8 {
+pub fn expandedSlice(pp: *const Preprocessor, tok: Token) []const u8 {
     return pp.expandedSliceExtra(tok, .single_macro_ws);
 }
diff --git a/test/cases/expanded/recursive call non-expanded parens.c b/test/cases/expanded/recursive call non-expanded parens.c
new file mode 100644
index 00000000..33f7ad9a
--- /dev/null
+++ b/test/cases/expanded/recursive call non-expanded parens.c
@@ -0,0 +1 @@
+1 2 1 bar
diff --git a/test/cases/expanded/unspecified expansion.c b/test/cases/expanded/unspecified expansion.c
index 240dc5b2..738d3966 100644
--- a/test/cases/expanded/unspecified expansion.c
+++ b/test/cases/expanded/unspecified expansion.c
@@ -1 +1 @@
-2*f(9)
+2*9*g
diff --git a/test/cases/recursive call non-expanded parens.c b/test/cases/recursive call non-expanded parens.c
new file mode 100644
index 00000000..5dc15bd9
--- /dev/null
+++ b/test/cases/recursive call non-expanded parens.c
@@ -0,0 +1,5 @@
+//aro-args -E -P
+#define foo(X) 1 bar
+#define bar(X) 2 foo
+
+foo(X)(Y)(Z)
diff --git a/test/cases/unspecified expansion.c b/test/cases/unspecified expansion.c
index 55cbe3d3..d828307a 100644
--- a/test/cases/unspecified expansion.c
+++ b/test/cases/unspecified expansion.c
@@ -1,6 +1,6 @@
 //aro-args -E -P
 // This can either expand as 2*f(9) or as 2*9*g (see 6.10.3.4 in the standard)
-// Currently arocc does the former, but gcc and clang do the latter
+// We follow gcc and clang in expanding it to 2*9*g
 #define f(a) a*g
 #define g(a) f(a)
 f(2)(9)
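
For reviewers: the hideset rules applied above follow the Prosser algorithm from the linked paper — a function-like expansion gives its result tokens the hideset (HS(macro token) ∩ HS(r_paren)) ∪ {macro name}, unioned with whatever each token already carried. The standalone sketch below traces the new `recursive call non-expanded parens.c` test through those rules. It is illustrative only: the slice-of-strings representation and the helper names `setUnion`/`setIntersection` are not aro APIs, just a minimal stand-in for `Hideset.@"union"`, `Hideset.intersection`, and `Hideset.contains`.

const std = @import("std");

fn contains(hs: []const []const u8, name: []const u8) bool {
    for (hs) |n| {
        if (std.mem.eql(u8, n, name)) return true;
    }
    return false;
}

/// Union: all of `a`, plus any name in `b` not already in `a`.
fn setUnion(gpa: std.mem.Allocator, a: []const []const u8, b: []const []const u8) ![]const []const u8 {
    var out = std.ArrayList([]const u8).init(gpa);
    errdefer out.deinit();
    try out.appendSlice(a);
    for (b) |n| {
        if (!contains(a, n)) try out.append(n);
    }
    return try out.toOwnedSlice();
}

/// Intersection: names present in both `a` and `b`.
fn setIntersection(gpa: std.mem.Allocator, a: []const []const u8, b: []const []const u8) ![]const []const u8 {
    var out = std.ArrayList([]const u8).init(gpa);
    errdefer out.deinit();
    for (a) |n| {
        if (contains(b, n)) try out.append(n);
    }
    return try out.toOwnedSlice();
}

test "hideset evolution for foo(X)(Y)(Z)" {
    const gpa = std.testing.allocator;

    // foo(X): result tokens `1 bar` get (HS(foo) ∩ HS(rparen)) ∪ {"foo"} = {"foo"}.
    const after_foo = try setUnion(gpa, &.{}, &.{"foo"});
    defer gpa.free(after_foo);

    // `bar` carries {"foo"}, so bar(Y) is still expandable...
    try std.testing.expect(!contains(after_foo, "bar"));

    // ...and its result tokens `2 foo` get ({"foo"} ∩ {}) ∪ {"bar"} = {"bar"}.
    const inter = try setIntersection(gpa, after_foo, &.{});
    defer gpa.free(inter);
    const after_bar = try setUnion(gpa, inter, &.{"bar"});
    defer gpa.free(after_bar);
    try std.testing.expectEqual(@as(usize, 1), after_bar.len);

    // `foo` carries only {"bar"}, so foo(Z) expands once more, yielding `1 2 1 bar`.
    try std.testing.expect(!contains(after_bar, "foo"));
}

Because each r_paren comes from the source (empty hideset), the intersection step keeps recursive calls through fresh parentheses expandable, while a macro name inside its own expansion stays hidden — which is exactly why the output is `1 2 1 bar` and why `f(2)(9)` now yields `2*9*g`.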