From 8462f5c23f715b1e207f3c0120e70a02fbd477a3 Mon Sep 17 00:00:00 2001
From: Nils
Date: Thu, 15 Aug 2024 09:56:22 +0000
Subject: [PATCH] uid mapping

---
 build.zig       |  29 +++++
 build.zig.zon   |   4 +
 src/dockerc.zig |  11 +-
 src/main.zig    | 336 +++++++++++++++++++++++++++++++++++++++---------
 4 files changed, 320 insertions(+), 60 deletions(-)

diff --git a/build.zig b/build.zig
index b302bfc..3f41ec9 100644
--- a/build.zig
+++ b/build.zig
@@ -247,6 +247,35 @@ pub fn build(b: *std.Build) void {
         },
     });
 
+    runtime.addIncludePath(b.dependency("shadow", .{}).path("libsubid"));
+    runtime.addIncludePath(b.dependency("shadow", .{}).path("lib"));
+    runtime.addIncludePath(b.dependency("shadow", .{}).path(""));
+
+    runtime.addCSourceFiles(.{
+        .root = b.dependency("shadow", .{}).path(""),
+        .files = &[_][]const u8{
+            "libsubid/api.c",
+            "lib/shadowlog.c",
+            "lib/subordinateio.c",
+            "lib/commonio.c",
+            "lib/write_full.c",
+            "lib/nss.c",
+            "lib/get_pid.c",
+            "lib/memzero.c",
+            "lib/alloc.c",
+            "lib/atoi/str2i.c",
+            "lib/atoi/a2i.c",
+            "lib/atoi/strtou_noneg.c",
+            "lib/atoi/strtoi.c",
+            "lib/string/sprintf.c",
+        },
+        .flags = &[_][]const u8{
+            "-DENABLE_SUBIDS",
+            // duplicate symbol with crun
+            "-Dxasprintf=shadow_xasprintf",
+        },
+    });
+
     const aarch64_target = b.resolveTargetQuery(.{
         .cpu_arch = .aarch64,
         .abi = .musl,
diff --git a/build.zig.zon b/build.zig.zon
index 6754750..20c25dc 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -19,6 +19,10 @@
             .url = "https://github.com/Hejsil/zig-clap/archive/60cd46aacff4960104703da9ba683077b1e3c76c.tar.gz",
             .hash = "12205eb22c644df2469045083efe9c92189d56494ebe8901b36ef1218c3717e6eb78",
         },
+        .shadow = .{
+            .url = "https://github.com/shadow-maint/shadow/releases/download/4.15.3/shadow-4.15.3.tar.gz",
+            .hash = "12202972bc830245199eb4c5b10f853f1be094eb8c86d431299a69b1c01fa5781a7e",
+        },
     },
     .paths = .{
         // This makes *all* files, recursively, included in this package. It is generally
diff --git a/src/dockerc.zig b/src/dockerc.zig
index 11e7b04..a679ffb 100644
--- a/src/dockerc.zig
+++ b/src/dockerc.zig
@@ -178,13 +178,12 @@ pub fn main() !void {
         "0",
     };
 
-    // in rootfull, do not force uid/gid to 0,0
-    if (res.args.rootfull != 0) {
-        mksquashfs_args[mksquashfs_args.len - 4] = null;
-    }
-
     mksquashfs_main(
-        mksquashfs_args.len,
+        // in rootfull, do not force uid/gid to 0,0
+        if (res.args.rootfull != 0)
+            mksquashfs_args.len - 4
+        else
+            mksquashfs_args.len,
         &mksquashfs_args,
     );
 
diff --git a/src/main.zig b/src/main.zig
index cfd0dac..0b604e8 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -8,6 +8,7 @@ const extract_file = common.extract_file;
 const c = @cImport({
     @cInclude("libcrun/container.h");
     @cInclude("libcrun/custom-handler.h");
+    @cInclude("subid.h");
 });
 
 extern fn squashfuse_main(argc: c_int, argv: [*:null]const ?[*:0]const u8) c_int;
@@ -30,6 +31,153 @@ fn getEnvFull(key: []const u8) ?[:0]const u8 {
     return null;
 }
 
+/// One entry of a user-namespace ID map: `size` consecutive IDs starting at
+/// `containerID` inside the namespace correspond to IDs starting at `hostID`
+/// outside of it.
+const IDMapping = struct {
+    containerID: i64,
+    hostID: i64,
+    size: i64,
+
+    /// Converts the mapping into a JSON object with the keys `containerID`,
+    /// `hostID` and `size`. The object is allocated with `allocator`.
+    fn toValue(self: @This(), allocator: Allocator) !std.json.Value {
+        var object = std.json.ObjectMap.init(allocator);
+        try object.put("containerID", std.json.Value{
+            .integer = self.containerID,
+        });
+        try object.put("hostID", std.json.Value{
+            .integer = self.hostID,
+        });
+        try object.put("size", std.json.Value{
+            .integer = self.size,
+        });
+        return std.json.Value{ .object = object };
+    }
+};
+
+const IDMappings = []IDMapping;
+
+/// Formats `v` in decimal; the caller owns the returned slice.
+fn intToString(allocator: Allocator, v: i64) ![]u8 {
+    return std.fmt.allocPrint(allocator, "{}", .{v});
+}
+
+/// Runs `newgidmap` to install `gid_mappings` for process `pid`.
+fn newgidmap(allocator: Allocator, pid: i64, gid_mappings: IDMappings) !void {
+    return uidgidmap_helper(allocator, "newgidmap", pid, gid_mappings);
+}
+
+/// Runs `newuidmap` to install `uid_mappings` for process `pid`.
+fn newuidmap(allocator: Allocator, pid: i64, uid_mappings: IDMappings) !void {
+    return uidgidmap_helper(allocator, "newuidmap", pid, uid_mappings);
+}
+
+/// Spawns `helper` as `helper pid containerID hostID size ...` and waits for
+/// it. Returns `error.UidGidMapFailed` when the helper does not exit with
+/// status 0, so that callers can fall back to another strategy.
+fn uidgidmap_helper(child_allocator: Allocator, helper: []const u8, pid: i64, uid_mappings: IDMappings) !void {
+    var arena = std.heap.ArenaAllocator.init(child_allocator);
+    const allocator = arena.allocator();
+    defer arena.deinit();
+
+    var argv = try std.ArrayList([]const u8).initCapacity(allocator, 2 + 3 * uid_mappings.len);
+    argv.appendAssumeCapacity(helper);
+    // TODO: specify pid using fd:N to avoid a TOCTTOU, see newuidmap(1)
+    argv.appendAssumeCapacity(try intToString(allocator, pid));
+
+    for (uid_mappings) |uid_mapping| {
+        argv.appendAssumeCapacity(try intToString(allocator, uid_mapping.containerID));
+        argv.appendAssumeCapacity(try intToString(allocator, uid_mapping.hostID));
+        argv.appendAssumeCapacity(try intToString(allocator, uid_mapping.size));
+    }
+
+    var newuidmapProcess = std.process.Child.init(argv.items, allocator);
+    switch (try newuidmapProcess.spawnAndWait()) {
+        .Exited => |status| if (status == 0) {
+            return;
+        } else {
+            std.debug.print("{s} failed with status: {}\n", .{ helper, status });
+        },
+        else => |term| {
+            std.debug.print("{s} terminated abnormally: {}\n", .{ helper, term });
+        },
+    }
+    return error.UidGidMapFailed;
+}
+
+const Allocator = std.mem.Allocator;
+
+/// Converts `id_mappings` into a JSON array of mapping objects.
+fn IDMappingsToValue(allocator: Allocator, id_mappings: IDMappings) !std.json.Value {
+    var array = try std.json.Array.initCapacity(allocator, id_mappings.len);
+    for (id_mappings) |id_mapping| {
+        array.appendAssumeCapacity(try id_mapping.toValue(allocator));
+    }
+    return std.json.Value{ .array = array };
+}
+
+/// Extracts the decimal numbers from the contents of a uid_map/gid_map file.
+const IdMapParser = struct {
+    bytes: []const u8,
+    index: usize = 0,
+
+    /// Skips non-digit bytes and returns the next run of digits, or null at the end.
+    fn nextNumber(self: *IdMapParser) ?i64 {
+        while (self.index < self.bytes.len and (self.bytes[self.index] < '0' or self.bytes[self.index] > '9')) {
+            self.index += 1;
+        }
+
+        if (self.index == self.bytes.len) {
+            return null;
+        }
+
+        const intStart = self.index;
+
+        while (self.bytes[self.index] >= '0' and self.bytes[self.index] <= '9') {
+            self.index += 1;
+
+            if (self.index == self.bytes.len) {
+                break;
+            }
+        }
+
+        return std.fmt.parseInt(i64, self.bytes[intStart..self.index], 10) catch |err| {
+            std.debug.panic("unexpected error parsing uid_map/gid_map: {}\n", .{err});
+        };
+    }
+};
+
+/// Parses `bytes` as triples of numbers (container ID, host ID, size) and
+/// panics on an incomplete triple. The caller owns the returned slice.
+fn parseIdmap(allocator: Allocator, bytes: []const u8) !IDMappings {
+    var idmap_parser = IdMapParser{ .bytes = bytes };
+    var id_mappings = std.ArrayList(IDMapping).init(allocator);
+
+    while (idmap_parser.nextNumber()) |containerID| {
+        try id_mappings.append(IDMapping{
+            .containerID = containerID,
+            .hostID = idmap_parser.nextNumber() orelse std.debug.panic("must have 3 numbers\n", .{}),
+            .size = idmap_parser.nextNumber() orelse std.debug.panic("must have 3 numbers\n", .{}),
+        });
+    }
+
+    return id_mappings.toOwnedSlice();
+}
+
+/// Rewrites mappings read from the current namespace for use in a nested one:
+/// the old container IDs become the host IDs and the new container IDs are
+/// assigned contiguously starting at 0.
+fn updateIdMap(id_mappings: IDMappings) void {
+    var runningId: i64 = 0;
+
+    for (id_mappings) |*id_mapping| {
+        id_mapping.*.hostID = id_mapping.*.containerID;
+        id_mapping.*.containerID = runningId;
+        runningId += id_mapping.*.size;
+    }
+}
+
 fn getContainerFromArgs(file: std.fs.File, rootfs_absolute_path: []const u8, parentAllocator: std.mem.Allocator) ![*c]c.libcrun_container_t {
     var arena = std.heap.ArenaAllocator.init(parentAllocator);
     defer arena.deinit();
@@ -97,36 +245,24 @@ fn getContainerFromArgs(file: std.fs.File, rootfs_absolute_path: []const u8, par
             const linuxVal = object.getPtr("linux") orelse @panic("no linux key");
             switch (linuxVal.*) {
                 .object => |*linux| {
-                    const uidMappingsVal = linux.getPtr("uidMappings") orelse @panic("no uidMappings key");
-                    switch (uidMappingsVal.*) {
-                        .array => |*uidMappings| {
-                            assert(uidMappings.items.len == 1);
-                            const uidMappingVal = uidMappings.getLast();
-
-                            switch (uidMappingVal) {
-                                .object => |*uidMapping| {
-                                    (uidMapping.getPtr("hostID") orelse @panic("no hostID key")).* = std.json.Value{ .integer = std.os.linux.geteuid() };
-                                },
-                                else => return error.InvalidJSON,
-                            }
-                        },
-                        else => return error.InvalidJSON,
+                    // In rootfull containers uidMappings is not set
+                    if (linux.getPtr("uidMappings")) |uidMappingsVal| {
+                        const uid_map = try std.fs.cwd().readFileAlloc(allocator, "/proc/self/uid_map", 1000000);
+                        const uidMappings = try parseIdmap(allocator, uid_map);
+
+                        updateIdMap(uidMappings);
+
+                        uidMappingsVal.* = try IDMappingsToValue(allocator, uidMappings);
                     }
 
-                    const gidMappingsVal = linux.getPtr("gidMappings") orelse @panic("no gidMappings key");
-                    switch (gidMappingsVal.*) {
-                        .array => |*gidMappings| {
-                            assert(gidMappings.items.len == 1);
-                            const gidMappingVal = gidMappings.getLast();
-
-                            switch (gidMappingVal) {
-                                .object => |*gidMapping| {
-                                    (gidMapping.getPtr("hostID") orelse @panic("no hostID key")).* = std.json.Value{ .integer = std.os.linux.getegid() };
-                                },
-                                else => return error.InvalidJSON,
-                            }
-                        },
-                        else => return error.InvalidJSON,
+                    // In rootfull containers gidMappings is not set
+                    if (linux.getPtr("gidMappings")) |gidMappingsVal| {
+                        const gid_map = try std.fs.cwd().readFileAlloc(allocator, "/proc/self/gid_map", 1000000);
+                        const gidMappings = try parseIdmap(allocator, gid_map);
+
+                        updateIdMap(gidMappings);
+
+                        gidMappingsVal.* = try IDMappingsToValue(allocator, gidMappings);
                     }
                 },
                 else => return error.InvalidJSON,
@@ -229,37 +365,131 @@ fn check_unprivileged_userns_permissions() void {
     }
 }
 
-pub fn main() !void {
+pub fn main() !u8 {
     var gpa = std.heap.GeneralPurposeAllocator(.{}){};
     defer _ = gpa.deinit();
     const allocator = gpa.allocator();
 
-    {
+    const euid = std.os.linux.geteuid();
+    const egid = std.os.linux.getegid();
+
+    if (euid != 0) {
         // So that fuse filesystems can be mounted without needing fusermount3
 
-        const euid = std.os.linux.geteuid();
-        const egid = std.os.linux.getegid();
-        const retVal = std.os.linux.unshare(std.os.linux.CLONE.NEWUSER | std.os.linux.CLONE.NEWNS);
-        if (retVal != 0) {
-            std.debug.panic("Failed to unshare namespaces: {}", .{std.posix.errno(retVal)});
+        const username = try allocator.dupeZ(u8, std.mem.span((std.c.getpwuid(euid) orelse @panic("couldn't get username")).pw_name orelse @panic("couldn't get username")));
+        defer allocator.free(username);
+
+        var subuid_ranges: [*]c.subid_range = undefined;
+        var subgid_ranges: [*]c.subid_range = undefined;
+
+        var uid_mappings = std.ArrayList(IDMapping).init(allocator);
+        defer uid_mappings.deinit();
+
+        try uid_mappings.append(IDMapping{
+            .containerID = 0,
+            .hostID = euid,
+            .size = 1,
+        });
+
+        var gid_mappings = std.ArrayList(IDMapping).init(allocator);
+        defer gid_mappings.deinit();
+
+        try gid_mappings.append(IDMapping{
+            .containerID = 0,
+            .hostID = egid,
+            .size = 1,
+        });
+
+        const subuid_ranges_len = c.subid_get_uid_ranges(username, @ptrCast(&subuid_ranges));
+        const subgid_ranges_len = c.subid_get_gid_ranges(username, @ptrCast(&subgid_ranges));
+
+        if (subuid_ranges_len > 0) {
+            for (0..@intCast(subuid_ranges_len)) |i| {
+                try uid_mappings.append(IDMapping{
+                    .containerID = @intCast(subuid_ranges[i].start),
+                    .hostID = @intCast(subuid_ranges[i].start),
+                    .size = @intCast(subuid_ranges[i].count),
+                });
+            }
         }
 
-        const uid_map_path = "/proc/self/uid_map";
-        const uid_map_content = try std.fmt.allocPrint(allocator, "0 {} 1", .{euid});
-        defer allocator.free(uid_map_content);
-        std.fs.cwd().writeFile(.{ .sub_path = uid_map_path, .data = uid_map_content }) catch |err| {
-            if (err == std.posix.WriteError.AccessDenied) {
-                check_unprivileged_userns_permissions();
+        if (subgid_ranges_len > 0) {
+            for (0..@intCast(subgid_ranges_len)) |i| {
+                try gid_mappings.append(IDMapping{
+                    .containerID = @intCast(subgid_ranges[i].start),
+                    .hostID = @intCast(subgid_ranges[i].start),
+                    .size = @intCast(subgid_ranges[i].count),
+                });
             }
-            std.debug.panic("error: {}\n", .{err});
-        };
+        }
+
+        const pipe = try std.posix.pipe();
+        const read_fd = pipe[0];
+        const write_fd = pipe[1];
+
+        const pid: i64 = @bitCast(std.os.linux.clone2(std.os.linux.CLONE.NEWUSER | std.os.linux.CLONE.NEWNS | std.os.linux.SIG.CHLD, 0));
+        if (pid < 0) {
+            std.debug.panic("failed to clone process: {}\n", .{std.os.linux.E.init(@bitCast(pid))});
+        }
+
+        if (pid > 0) {
+            std.posix.close(read_fd);
+            // inside parent process
+
+            const set_groups_file = try std.fmt.allocPrint(allocator, "/proc/{}/setgroups", .{pid});
+            defer allocator.free(set_groups_file);
 
-        try std.fs.cwd().writeFile(.{ .sub_path = "/proc/self/setgroups", .data = "deny" });
+            // setgroups must be "deny" before gid_map can be written without
+            // CAP_SETGID in the parent namespace, see user_namespaces(7)
+            try std.fs.cwd().writeFile(.{ .sub_path = set_groups_file, .data = "deny" });
 
-        const gid_map_path = "/proc/self/gid_map";
-        const gid_map_content = try std.fmt.allocPrint(allocator, "0 {} 1", .{egid});
-        defer allocator.free(gid_map_content);
-        try std.fs.cwd().writeFile(.{ .sub_path = gid_map_path, .data = gid_map_content });
+            newuidmap(allocator, pid, uid_mappings.items) catch {
+                std.debug.print("newuidmap failed, falling back to single user mapping\n", .{});
+                const uid_map_path = try std.fmt.allocPrint(allocator, "/proc/{}/uid_map", .{pid});
+                defer allocator.free(uid_map_path);
+
+                const uid_map_content = try std.fmt.allocPrint(allocator, "0 {} 1", .{euid});
+                defer allocator.free(uid_map_content);
+                std.fs.cwd().writeFile(.{ .sub_path = uid_map_path, .data = uid_map_content }) catch |err| {
+                    if (err == std.posix.WriteError.AccessDenied) {
+                        check_unprivileged_userns_permissions();
+                    }
+                    std.debug.panic("error: {}\n", .{err});
+                };
+            };
+
+            newgidmap(allocator, pid, gid_mappings.items) catch {
+                std.debug.print("newgidmap failed, falling back to single group mapping\n", .{});
+                const gid_map_path = try std.fmt.allocPrint(allocator, "/proc/{}/gid_map", .{pid});
+                defer allocator.free(gid_map_path);
+
+                const gid_map_content = try std.fmt.allocPrint(allocator, "0 {} 1", .{egid});
+                defer allocator.free(gid_map_content);
+                std.fs.cwd().writeFile(.{ .sub_path = gid_map_path, .data = gid_map_content }) catch |err| {
+                    if (err == std.posix.WriteError.AccessDenied) {
+                        check_unprivileged_userns_permissions();
+                    }
+                    std.debug.panic("error: {}\n", .{err});
+                };
+            };
+
+            std.posix.close(write_fd);
+            const wait_result = std.posix.waitpid(@intCast(pid), 0);
+            if (std.os.linux.W.IFEXITED(wait_result.status)) {
+                return std.os.linux.W.EXITSTATUS(wait_result.status);
+            }
+            // The child did not exit normally. Return here so the parent never
+            // falls through into the child-only code below.
+            std.debug.print("did not exit normally status: {}\n", .{wait_result.status});
+            return 1;
+        }
+
+        std.posix.close(write_fd);
+
+        var buf: [1]u8 = undefined;
+        const bytes_read = try std.posix.read(read_fd, &buf);
+        assert(bytes_read == 0);
+        std.posix.close(read_fd);
     }
 
     var args = std.process.args();
@@ -356,12 +586,8 @@ pub fn main() !void {
 
     const ret = c.libcrun_container_run(&crun_context, container, 0, &err);
 
-    if (ret != 0) {
-        if (err != null) {
-            std.debug.panic("failed to run container (status/errno: {}) ({d}): {s}\n", .{ err.*.status, ret, err.*.msg });
-        } else {
-            std.debug.panic("failed to run container ({d})\n", .{ret});
-        }
+    if (err != null) {
+        std.debug.panic("failed to run container (status/errno: {}) ({d}): {s}\n", .{ err.*.status, ret, err.*.msg });
     }
 
     if (std.os.linux.umount(mount_dir_path) != 0) {
@@ -372,4 +598,6 @@ pub fn main() !void {
     }
 
     // TODO: clean up /tmp
+
+    return @intCast(ret);
 }