/// Unmounts the filesystem mounted at `path`, panicking if the kernel
/// reports an error.
///
/// `std.os.linux.umount` returns the raw syscall result as a `usize`; a
/// failure is encoded as a negated errno in that value. The result is
/// decoded with `std.os.linux.E.init` rather than a hand-written
/// `@bitCast` to `i64`, so the helper does not depend on `usize` being
/// 64 bits wide and needs no separate range assertion.
///
/// `path` must be a NUL-terminated path to the mount point.
fn umount(path: [*:0]const u8) void {
    const rc = std.os.linux.umount(path);
    switch (std.os.linux.E.init(rc)) {
        .SUCCESS => {},
        // Cleanup cannot continue with a stale mount, so abort loudly and
        // include the decoded errno in the message.
        else => |errno| std.debug.panic("Failed to unmount {s}. Errno: {}\n", .{ path, errno }),
    }
}
If set to "allow" - // then sometimes /proc/{pid}/gid_map fails to write - try std.fs.cwd().writeFile(.{ .sub_path = set_groups_file, .data = "deny" }); - newuidmap(allocator, pid, uid_mappings.items) catch { std.debug.print("newuidmap failed, falling back to single user mapping\n", .{}); const uid_map_path = try std.fmt.allocPrint(allocator, "/proc/{}/uid_map", .{pid}); @@ -433,6 +441,8 @@ pub fn main() !u8 { defer allocator.free(uid_map_content); std.fs.cwd().writeFile(.{ .sub_path = uid_map_path, .data = uid_map_content }) catch |err| { if (err == std.posix.WriteError.AccessDenied) { + // TODO: when using newuidmap this may not get hit until + // trying to mount file system check_unprivileged_userns_permissions(); } std.debug.panic("error: {}\n", .{err}); @@ -441,6 +451,11 @@ pub fn main() !u8 { newgidmap(allocator, pid, gid_mappings.items) catch { std.debug.print("newgidmap failed, falling back to single group mapping\n", .{}); + + // must be set for writing to gid_map to succeed (see user_namespaces(7)) + // otherwise we want to leave it untouched so that setgroups can be used in the container + try std.fs.cwd().writeFile(.{ .sub_path = set_groups_file, .data = "deny" }); + const gid_map_path = try std.fmt.allocPrint(allocator, "/proc/{}/gid_map", .{pid}); defer allocator.free(gid_map_path); @@ -459,7 +474,7 @@ pub fn main() !u8 { if (std.os.linux.W.IFEXITED(wait_result.status)) { return std.os.linux.W.EXITSTATUS(wait_result.status); } - std.debug.print("did not exit normally status: {}\n", .{wait_result.status}); + std.debug.panic("did not exit normally status: {}\n", .{wait_result.status}); } std.posix.close(write_fd); @@ -520,12 +535,18 @@ pub fn main() !u8 { const overlayfs_args = [_:null]?[*:0]const u8{ "fuse-overlayfs", "-o", overlayfs_options, mount_dir_path }; + // reap the child of fuse-overlayfs so that we can be sure fuse-overlayfs + // has exited before unmounting squashfuse + assert(try std.posix.prctl(std.posix.PR.SET_CHILD_SUBREAPER, .{1}) == 
0); const pid = try std.posix.fork(); if (pid == 0) { - std.process.exit(@intCast(overlayfs_main(overlayfs_args.len, &overlayfs_args))); + _ = overlayfs_main(overlayfs_args.len, &overlayfs_args); + std.debug.panic("unreachable", .{}); } const wait_pid_result = std.posix.waitpid(pid, 0); + assert(try std.posix.prctl(std.posix.PR.SET_CHILD_SUBREAPER, .{0}) == 0); + if (wait_pid_result.status != 0) { std.debug.panic("failed to run overlayfs", .{}); } @@ -562,20 +583,36 @@ pub fn main() !u8 { // fails because most users do not have write permission there assert(c.setenv("XDG_RUNTIME_DIR", "/tmp", 0) == 0); - const ret = c.libcrun_container_run(&crun_context, container, 0, &err); + const pid = try std.posix.fork(); + assert(pid >= 0); + if (pid == 0) { + // Run container in a separate process because crun will try to reap + // every child including the fuse-overlayfs process still running + const ret = c.libcrun_container_run(&crun_context, container, 0, &err); + + if (err != null) { + std.debug.panic("failed to run container (status/errno: {}) ({d}): {s}\n", .{ err.*.status, ret, err.*.msg }); + } - if (err != null) { - std.debug.panic("failed to run container (status/errno: {}) ({d}): {s}\n", .{ err.*.status, ret, err.*.msg }); + return @intCast(ret); } - if (std.os.linux.umount(mount_dir_path) != 0) { - std.debug.print("Failed to unmount {s}\n", .{mount_dir_path}); + const retStatus = std.posix.waitpid(pid, 0); + if (!std.posix.W.IFEXITED(retStatus.status)) { + std.debug.panic("container didn't exist normally : {}\n", .{retStatus.status}); } - if (std.os.linux.umount(filesystem_bundle_dir_null) != 0) { - std.debug.print("Failed to unmount {s}\n", .{filesystem_bundle_dir_null}); + + umount(mount_dir_path); + + // wait for overlayfs process to finish so that device is not busy to unmount squashfuse + const overlayfs_status = std.posix.waitpid(-1, 0); + if (!std.posix.W.IFEXITED(overlayfs_status.status) or std.posix.W.EXITSTATUS(overlayfs_status.status) != 0) { + 
std.debug.panic("overlayfs failed to exit successfully, status: {}\n", .{overlayfs_status.status}); } - // TODO: clean up /tmp + umount(filesystem_bundle_dir_null); + + try std.fs.deleteTreeAbsolute(&temp_dir_path); - return @intCast(ret); + return std.posix.W.EXITSTATUS(retStatus.status); }