From b0c7ce51588cca89fa32772ccdb50397f218332f Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 4 Aug 2023 17:57:35 +1000 Subject: [PATCH 1/7] makefile: quote TESTFLAGS when passing to containerised make Otherwise TESTFLAGS="-run FooBar" will result in TESTFLAGS=-run being executed in the container. Signed-off-by: Aleksa Sarai --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c72dd2f414d..0d48fe8c521 100644 --- a/Makefile +++ b/Makefile @@ -105,7 +105,7 @@ unittest: runcimage -t --privileged --rm \ -v /lib/modules:/lib/modules:ro \ -v $(CURDIR):/go/src/$(PROJECT) \ - $(RUNC_IMAGE) make localunittest TESTFLAGS=$(TESTFLAGS) + $(RUNC_IMAGE) make localunittest TESTFLAGS="$(TESTFLAGS)" localunittest: all $(GO) test -timeout 3m -tags "$(BUILDTAGS)" $(TESTFLAGS) -v ./... @@ -115,7 +115,7 @@ integration: runcimage -t --privileged --rm \ -v /lib/modules:/lib/modules:ro \ -v $(CURDIR):/go/src/$(PROJECT) \ - $(RUNC_IMAGE) make localintegration TESTPATH=$(TESTPATH) + $(RUNC_IMAGE) make localintegration TESTPATH="$(TESTPATH)" localintegration: all bats -t tests/integration$(TESTPATH) From c6e7b1a8ec55d0d3f3d6e72c7ea1d8bd6bca0f04 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 15 Aug 2023 18:22:38 -0700 Subject: [PATCH 2/7] libct: initProcess.start: fix sync logic The code in this function became quite complicated and not entirely correct over time. As a result, if an error is returned from parseSync, it might end up stuck waiting for the child to finish. 1. Let's not wait() for the child twice. We already do it in the defer statement (call p.terminate()) when we are returning an error. 2. Remove sentResume and sentRun since we do not want to check if these were sent or not. Instead, introduce and check seenProcReady, as procReady is always expected from runc init. 3. Eliminate the possibility to wrap nil as an error. 4. Make sure we always call shutdown on the sync socket, and do not let shutdown error shadow the ierr. This fixes the issue of stuck `runc runc` with the optimization patch (sending procSeccompDone earlier) applied. Signed-off-by: Kir Kolyshkin --- libcontainer/process_linux.go | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 48861406dba..41709a4ea21 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -451,11 +451,8 @@ func (p *initProcess) start() (retErr error) { if err := p.sendConfig(); err != nil { return fmt.Errorf("error sending config to init process: %w", err) } - var ( - sentRun bool - sentResume bool - ) + var seenProcReady bool ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { switch sync.Type { case procSeccomp: @@ -494,6 +491,7 @@ func (p *initProcess) start() (retErr error) { return err } case procReady: + seenProcReady = true // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { @@ -555,7 +553,6 @@ func (p *initProcess) start() (retErr error) { if err := writeSync(p.messageSockPair.parent, procRun); err != nil { return err } - sentRun = true case procHooks: // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions. if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil { @@ -587,7 +584,6 @@ func (p *initProcess) start() (retErr error) { if err := writeSync(p.messageSockPair.parent, procResume); err != nil { return err } - sentResume = true default: return errors.New("invalid JSON payload from child") } @@ -595,20 +591,14 @@ func (p *initProcess) start() (retErr error) { return nil }) - if !sentRun { - return fmt.Errorf("error during container init: %w", ierr) - } - if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume { - return errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process") - } - if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil { + if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil && ierr == nil { return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err} } - - // Must be done after Shutdown so the child will exit and we can wait for it. + if !seenProcReady && ierr == nil { + ierr = errors.New("procReady not received") + } if ierr != nil { - _, _ = p.wait() - return ierr + return fmt.Errorf("error during container init: %w", ierr) } return nil } From f81ef1493d35ad708bf5e08f5ec5bb29d9f45294 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 8 Aug 2023 11:33:06 +1000 Subject: [PATCH 3/7] libcontainer: sync: cleanup synchronisation code This includes quite a few cleanups and improvements to the way we do synchronisation. The core behaviour is unchanged, but switching to embedding json.RawMessage into the synchronisation structure will allow us to do more complicated synchronisation operations in future patches. The file descriptor passing through the synchronisation system feature will be used as part of the idmapped-mount and bind-mount-source features when switching that code to use the new mount API outside of nsexec.c. Signed-off-by: Aleksa Sarai Signed-off-by: Kir Kolyshkin --- contrib/cmd/recvtty/recvtty.go | 4 +- libcontainer/criu_linux.go | 2 +- libcontainer/init_linux.go | 59 ++++------ libcontainer/integration/execin_test.go | 4 +- libcontainer/process_linux.go | 83 ++++++++------ libcontainer/rootfs_linux.go | 3 +- libcontainer/setns_init_linux.go | 2 - libcontainer/sync.go | 141 +++++++++++++++--------- libcontainer/utils/cmsg.go | 85 ++++++++------ libcontainer/utils/utils_unix.go | 2 +- tty.go | 2 +- 11 files changed, 224 insertions(+), 163 deletions(-) diff --git a/contrib/cmd/recvtty/recvtty.go b/contrib/cmd/recvtty/recvtty.go index 35c293de642..532a4c33904 100644 --- a/contrib/cmd/recvtty/recvtty.go +++ b/contrib/cmd/recvtty/recvtty.go @@ -98,7 +98,7 @@ func handleSingle(path string, noStdin bool) error { defer socket.Close() // Get the master file descriptor from runC. - master, err := utils.RecvFd(socket) + master, err := utils.RecvFile(socket) if err != nil { return err } @@ -171,7 +171,7 @@ func handleNull(path string) error { defer socket.Close() // Get the master file descriptor from runC. - master, err := utils.RecvFd(socket) + master, err := utils.RecvFile(socket) if err != nil { return } diff --git a/libcontainer/criu_linux.go b/libcontainer/criu_linux.go index 31c8a900eb1..ec6228399d8 100644 --- a/libcontainer/criu_linux.go +++ b/libcontainer/criu_linux.go @@ -1185,7 +1185,7 @@ func (c *Container) criuNotifications(resp *criurpc.CriuResp, process *Process, defer master.Close() // While we can access console.master, using the API is a good idea. - if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil { + if err := utils.SendFile(process.ConsoleSocket, master); err != nil { return err } case "status-ready": diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 2e2d00ff83a..cef1142a2bc 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -5,7 +5,6 @@ import ( "encoding/json" "errors" "fmt" - "io" "net" "os" "runtime" @@ -121,11 +120,8 @@ func startInitialization() (retErr error) { defer func() { // If this defer is ever called, this means initialization has failed. // Send the error back to the parent process in the form of an initError. - if err := writeSync(pipe, procError); err != nil { - fmt.Fprintln(os.Stderr, err) - return - } - if err := utils.WriteJSON(pipe, &initError{Message: retErr.Error()}); err != nil { + ierr := initError{Message: retErr.Error()} + if err := writeSyncArg(pipe, procError, ierr); err != nil { fmt.Fprintln(os.Stderr, err) return } @@ -352,7 +348,6 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { if err != nil { return err } - // After we return from here, we don't need the console anymore. defer pty.Close() @@ -374,9 +369,11 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { } } // While we can access console.master, using the API is a good idea. - if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil { + if err := utils.SendRawFd(socket, pty.Name(), pty.Fd()); err != nil { return err } + runtime.KeepAlive(pty) + // Now, dup over all the things. return dupStdio(slavePath) } @@ -384,12 +381,11 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { // syncParentReady sends to the given pipe a JSON payload which indicates that // the init is ready to Exec the child process. It then waits for the parent to // indicate that it is cleared to Exec. -func syncParentReady(pipe io.ReadWriter) error { +func syncParentReady(pipe *os.File) error { // Tell parent. if err := writeSync(pipe, procReady); err != nil { return err } - // Wait for parent to give the all-clear. return readSync(pipe, procRun) } @@ -397,44 +393,37 @@ func syncParentReady(pipe io.ReadWriter) error { // syncParentHooks sends to the given pipe a JSON payload which indicates that // the parent should execute pre-start hooks. It then waits for the parent to // indicate that it is cleared to resume. -func syncParentHooks(pipe io.ReadWriter) error { +func syncParentHooks(pipe *os.File) error { // Tell parent. if err := writeSync(pipe, procHooks); err != nil { return err } - // Wait for parent to give the all-clear. return readSync(pipe, procResume) } -// syncParentSeccomp sends to the given pipe a JSON payload which -// indicates that the parent should pick up the seccomp fd with pidfd_getfd() -// and send it to the seccomp agent over a unix socket. It then waits for -// the parent to indicate that it is cleared to resume and closes the seccompFd. -// If the seccompFd is -1, there isn't anything to sync with the parent, so it -// returns no error. -func syncParentSeccomp(pipe io.ReadWriter, seccompFd int) error { +// syncParentSeccomp sends the fd associated with the seccomp file descriptor +// to the parent, and wait for the parent to do pidfd_getfd() to grab a copy. +func syncParentSeccomp(pipe *os.File, seccompFd int) error { if seccompFd == -1 { return nil } - - // Tell parent. - if err := writeSyncWithFd(pipe, procSeccomp, seccompFd); err != nil { - unix.Close(seccompFd) + defer unix.Close(seccompFd) + + // Tell parent to grab our fd. + // + // Notably, we do not use writeSyncFile here because a container might have + // an SCMP_ACT_NOTIFY action on sendmsg(2) so we need to use the smallest + // possible number of system calls here because all of those syscalls + // cannot be used with SCMP_ACT_NOTIFY as a result (any syscall we use here + // before the parent gets the file descriptor would deadlock "runc init" if + // we allowed it for SCMP_ACT_NOTIFY). See seccomp.InitSeccomp() for more + // details. + if err := writeSyncArg(pipe, procSeccomp, seccompFd); err != nil { return err } - - // Wait for parent to give the all-clear. - if err := readSync(pipe, procSeccompDone); err != nil { - unix.Close(seccompFd) - return fmt.Errorf("sync parent seccomp: %w", err) - } - - if err := unix.Close(seccompFd); err != nil { - return fmt.Errorf("close seccomp fd: %w", err) - } - - return nil + // Wait for parent to tell us they've grabbed the seccompfd. + return readSync(pipe, procSeccompDone) } // setupUser changes the groups, gid, and uid for the user inside the container diff --git a/libcontainer/integration/execin_test.go b/libcontainer/integration/execin_test.go index 58519a419c0..c5c324130c6 100644 --- a/libcontainer/integration/execin_test.go +++ b/libcontainer/integration/execin_test.go @@ -277,9 +277,9 @@ func TestExecInTTY(t *testing.T) { done := make(chan (error)) go func() { - f, err := utils.RecvFd(parent) + f, err := utils.RecvFile(parent) if err != nil { - done <- fmt.Errorf("RecvFd: %w", err) + done <- fmt.Errorf("RecvFile: %w", err) return } c, err := console.ConsoleFromFile(f) diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 41709a4ea21..7f78f730257 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -9,6 +9,7 @@ import ( "os" "os/exec" "path/filepath" + "runtime" "strconv" "time" @@ -171,14 +172,27 @@ func (p *setnsProcess) start() (retErr error) { panic("unexpected procHooks in setns") case procSeccomp: if p.config.Config.Seccomp.ListenerPath == "" { - return errors.New("listenerPath is not set") + return errors.New("seccomp listenerPath is not set") } - - seccompFd, err := recvSeccompFd(uintptr(p.pid()), uintptr(sync.Fd)) + if sync.Arg == nil { + return fmt.Errorf("sync %q is missing an argument", sync.Type) + } + var srcFd int + if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil { + return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err) + } + seccompFd, err := pidGetFd(p.pid(), srcFd) if err != nil { + return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err) + } + defer seccompFd.Close() + // We have a copy, the child can keep working. We don't need to + // wait for the seccomp notify listener to get the fd before we + // permit the child to continue because the child will happily wait + // for the listener if it hits SCMP_ACT_NOTIFY. + if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { return err } - defer unix.Close(seccompFd) bundle, annotations := utils.Annotations(p.config.Config.Labels) containerProcessState := &specs.ContainerProcessState{ @@ -199,15 +213,10 @@ func (p *setnsProcess) start() (retErr error) { containerProcessState, seccompFd); err != nil { return err } - - // Sync with child. - if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { - return err - } - return nil default: return errors.New("invalid JSON payload from child") } + return nil }) if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil { @@ -457,14 +466,27 @@ func (p *initProcess) start() (retErr error) { switch sync.Type { case procSeccomp: if p.config.Config.Seccomp.ListenerPath == "" { - return errors.New("listenerPath is not set") + return errors.New("seccomp listenerPath is not set") } - - seccompFd, err := recvSeccompFd(uintptr(childPid), uintptr(sync.Fd)) + var srcFd int + if sync.Arg == nil { + return fmt.Errorf("sync %q is missing an argument", sync.Type) + } + if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil { + return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err) + } + seccompFd, err := pidGetFd(p.pid(), srcFd) if err != nil { + return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err) + } + defer seccompFd.Close() + // We have a copy, the child can keep working. We don't need to + // wait for the seccomp notify listener to get the fd before we + // permit the child to continue because the child will happily wait + // for the listener if it hits SCMP_ACT_NOTIFY. + if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { return err } - defer unix.Close(seccompFd) s, err := p.container.currentOCIState() if err != nil { @@ -485,11 +507,6 @@ func (p *initProcess) start() (retErr error) { containerProcessState, seccompFd); err != nil { return err } - - // Sync with child. - if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { - return err - } case procReady: seenProcReady = true // set rlimits, this has to be done here because we lose permissions @@ -587,7 +604,6 @@ func (p *initProcess) start() (retErr error) { default: return errors.New("invalid JSON payload from child") } - return nil }) @@ -674,22 +690,20 @@ func (p *initProcess) forwardChildLogs() chan error { return logs.ForwardLogs(p.logFilePair.parent) } -func recvSeccompFd(childPid, childFd uintptr) (int, error) { - pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, childPid, 0, 0) - if errno != 0 { - return -1, fmt.Errorf("performing SYS_PIDFD_OPEN syscall: %w", errno) +func pidGetFd(pid, srcFd int) (*os.File, error) { + pidFd, err := unix.PidfdOpen(pid, 0) + if err != nil { + return nil, os.NewSyscallError("pidfd_open", err) } - defer unix.Close(int(pidfd)) - - seccompFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, childFd, 0) - if errno != 0 { - return -1, fmt.Errorf("performing SYS_PIDFD_GETFD syscall: %w", errno) + defer unix.Close(pidFd) + fd, err := unix.PidfdGetfd(pidFd, srcFd, 0) + if err != nil { + return nil, os.NewSyscallError("pidfd_getfd", err) } - - return int(seccompFd), nil + return os.NewFile(uintptr(fd), "[pidfd_getfd]"), nil } -func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, fd int) error { +func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, file *os.File) error { conn, err := net.Dial("unix", listenerPath) if err != nil { return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err) @@ -706,11 +720,10 @@ func sendContainerProcessState(listenerPath string, state *specs.ContainerProces return fmt.Errorf("cannot marshall seccomp state: %w", err) } - err = utils.SendFds(socket, b, fd) - if err != nil { + if err := utils.SendRawFd(socket, string(b), file.Fd()); err != nil { return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err) } - + runtime.KeepAlive(file) return nil } diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 54a2ff57a85..be6c73000eb 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -3,7 +3,6 @@ package libcontainer import ( "errors" "fmt" - "io" "os" "path" "path/filepath" @@ -64,7 +63,7 @@ func needsSetupDev(config *configs.Config) bool { // prepareRootfs sets up the devices, mount points, and filesystems for use // inside a new mount namespace. It doesn't set anything as ro. You must call // finalizeRootfs after this function to finish setting up the rootfs. -func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) (err error) { +func prepareRootfs(pipe *os.File, iConfig *initConfig, mountFds mountFds) (err error) { config := iConfig.Config if err := prepareRoot(config); err != nil { return fmt.Errorf("error preparing rootfs: %w", err) diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index ac58190758a..40a47a2e95c 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -75,7 +75,6 @@ func (l *linuxSetnsInit) Init() error { if err != nil { return err } - if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { return err } @@ -94,7 +93,6 @@ func (l *linuxSetnsInit) Init() error { if err != nil { return fmt.Errorf("unable to init seccomp: %w", err) } - if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { return err } diff --git a/libcontainer/sync.go b/libcontainer/sync.go index 25dc2863071..25507e58ad9 100644 --- a/libcontainer/sync.go +++ b/libcontainer/sync.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "os" "github.com/opencontainers/runc/libcontainer/utils" ) @@ -15,15 +16,19 @@ type syncType string // during container setup. They come in pairs (with procError being a generic // response which is followed by an &initError). // -// [ child ] <-> [ parent ] +// [ child ] <-> [ parent ] // -// procHooks --> [run hooks] -// <-- procResume +// procSeccomp --> [forward fd to listenerPath] +// file: seccomp fd +// --- no return synchronisation // -// procReady --> [final setup] -// <-- procRun +// procHooks --> [run hooks] +// <-- procResume // -// procSeccomp --> [pick up seccomp fd with pidfd_getfd()] +// procReady --> [final setup] +// <-- procRun +// +// procSeccomp --> [grab seccomp fd with pidfd_getfd()] // <-- procSeccompDone const ( procError syncType = "procError" @@ -35,9 +40,17 @@ const ( procSeccompDone syncType = "procSeccompDone" ) +type syncFlags int + +const ( + syncFlagHasFd syncFlags = (1 << iota) +) + type syncT struct { - Type syncType `json:"type"` - Fd int `json:"fd"` + Type syncType `json:"type"` + Flags syncFlags `json:"flags"` + Arg *json.RawMessage `json:"arg,omitempty"` + File *os.File `json:"-"` // passed oob through SCM_RIGHTS } // initError is used to wrap errors for passing them via JSON, @@ -50,74 +63,100 @@ func (i initError) Error() string { return i.Message } -// writeSync is used to write to a synchronisation pipe. An error is returned -// if there was a problem writing the payload. -func writeSync(pipe io.Writer, sync syncType) error { - return writeSyncWithFd(pipe, sync, -1) +func doWriteSync(pipe *os.File, sync syncT) error { + sync.Flags &= ^syncFlagHasFd + if sync.File != nil { + sync.Flags |= syncFlagHasFd + } + if err := utils.WriteJSON(pipe, sync); err != nil { + return fmt.Errorf("writing sync %q: %w", sync.Type, err) + } + if sync.Flags&syncFlagHasFd != 0 { + if err := utils.SendFile(pipe, sync.File); err != nil { + return fmt.Errorf("sending file after sync %q: %w", sync.Type, err) + } + } + return nil +} + +func writeSync(pipe *os.File, sync syncType) error { + return doWriteSync(pipe, syncT{Type: sync}) } -// writeSyncWithFd is used to write to a synchronisation pipe. An error is -// returned if there was a problem writing the payload. -func writeSyncWithFd(pipe io.Writer, sync syncType, fd int) error { - if err := utils.WriteJSON(pipe, syncT{sync, fd}); err != nil { - return fmt.Errorf("writing syncT %q: %w", string(sync), err) +func writeSyncArg(pipe *os.File, sync syncType, arg interface{}) error { + argJSON, err := json.Marshal(arg) + if err != nil { + return fmt.Errorf("writing sync %q: marshal argument failed: %w", sync, err) } - return nil + argJSONMsg := json.RawMessage(argJSON) + return doWriteSync(pipe, syncT{Type: sync, Arg: &argJSONMsg}) } -// readSync is used to read from a synchronisation pipe. An error is returned -// if we got an initError, the pipe was closed, or we got an unexpected flag. -func readSync(pipe io.Reader, expected syncType) error { - var procSync syncT - if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { +func doReadSync(pipe *os.File) (syncT, error) { + var sync syncT + if err := json.NewDecoder(pipe).Decode(&sync); err != nil { if errors.Is(err, io.EOF) { - return errors.New("parent closed synchronisation channel") + return sync, err } - return fmt.Errorf("failed reading error from parent: %w", err) + return sync, fmt.Errorf("reading from parent failed: %w", err) } - - if procSync.Type == procError { + if sync.Type == procError { var ierr initError - - if err := json.NewDecoder(pipe).Decode(&ierr); err != nil { - return fmt.Errorf("failed reading error from parent: %w", err) + if sync.Arg == nil { + return sync, errors.New("procError missing error payload") } + if err := json.Unmarshal(*sync.Arg, &ierr); err != nil { + return sync, fmt.Errorf("unmarshal procError failed: %w", err) + } + return sync, &ierr + } + if sync.Flags&syncFlagHasFd != 0 { + file, err := utils.RecvFile(pipe) + if err != nil { + return sync, fmt.Errorf("receiving fd from sync %q failed: %w", sync.Type, err) + } + sync.File = file + } + return sync, nil +} - return &ierr +func readSyncFull(pipe *os.File, expected syncType) (syncT, error) { + sync, err := doReadSync(pipe) + if err != nil { + return sync, err + } + if sync.Type != expected { + return sync, fmt.Errorf("unexpected synchronisation flag: got %q, expected %q", sync.Type, expected) } + return sync, nil +} - if procSync.Type != expected { - return errors.New("invalid synchronisation flag from parent") +func readSync(pipe *os.File, expected syncType) error { + sync, err := readSyncFull(pipe, expected) + if err != nil { + return err + } + if sync.Arg != nil { + return fmt.Errorf("sync %q had unexpected argument passed: %q", expected, string(*sync.Arg)) + } + if sync.File != nil { + _ = sync.File.Close() + return fmt.Errorf("sync %q had unexpected file passed", sync.Type) } return nil } // parseSync runs the given callback function on each syncT received from the // child. It will return once io.EOF is returned from the given pipe. -func parseSync(pipe io.Reader, fn func(*syncT) error) error { - dec := json.NewDecoder(pipe) +func parseSync(pipe *os.File, fn func(*syncT) error) error { for { - var sync syncT - if err := dec.Decode(&sync); err != nil { + sync, err := doReadSync(pipe) + if err != nil { if errors.Is(err, io.EOF) { break } return err } - - // We handle this case outside fn for cleanliness reasons. - var ierr *initError - if sync.Type == procError { - if err := dec.Decode(&ierr); err != nil && !errors.Is(err, io.EOF) { - return fmt.Errorf("error decoding proc error from init: %w", err) - } - if ierr != nil { - return ierr - } - // Programmer error. - panic("No error following JSON procError payload.") - } - if err := fn(&sync); err != nil { return err } diff --git a/libcontainer/utils/cmsg.go b/libcontainer/utils/cmsg.go index fd9326cb59a..2edd1417af3 100644 --- a/libcontainer/utils/cmsg.go +++ b/libcontainer/utils/cmsg.go @@ -19,13 +19,14 @@ package utils import ( "fmt" "os" + "runtime" "golang.org/x/sys/unix" ) -// MaxNameLen is the maximum length of the name of a file descriptor being -// sent using SendFd. The name of the file handle returned by RecvFd will never -// be larger than this value. +// MaxNameLen is the maximum length of the name of a file descriptor being sent +// using SendFile. The name of the file handle returned by RecvFile will never be +// larger than this value. const MaxNameLen = 4096 // oobSpace is the size of the oob slice required to store a single FD. Note @@ -33,26 +34,21 @@ const MaxNameLen = 4096 // so sizeof(fd) = 4. var oobSpace = unix.CmsgSpace(4) -// RecvFd waits for a file descriptor to be sent over the given AF_UNIX +// RecvFile waits for a file descriptor to be sent over the given AF_UNIX // socket. The file name of the remote file descriptor will be recreated // locally (it is sent as non-auxiliary data in the same payload). -func RecvFd(socket *os.File) (*os.File, error) { - // For some reason, unix.Recvmsg uses the length rather than the capacity - // when passing the msg_controllen and other attributes to recvmsg. So we - // have to actually set the length. +func RecvFile(socket *os.File) (_ *os.File, Err error) { name := make([]byte, MaxNameLen) oob := make([]byte, oobSpace) sockfd := socket.Fd() - n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0) + n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC) if err != nil { return nil, err } - if n >= MaxNameLen || oobn != oobSpace { - return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) + return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) } - // Truncate. name = name[:n] oob = oob[:oobn] @@ -61,36 +57,63 @@ func RecvFd(socket *os.File) (*os.File, error) { if err != nil { return nil, err } - if len(scms) != 1 { - return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) + + // We cannot control how many SCM_RIGHTS we receive, and upon receiving + // them all of the descriptors are installed in our fd table, so we need to + // parse all of the SCM_RIGHTS we received in order to close all of the + // descriptors on error. + var fds []int + defer func() { + for i, fd := range fds { + if i == 0 && Err == nil { + // Only close the first one on error. + continue + } + // Always close extra ones. + _ = unix.Close(fd) + } + }() + var lastErr error + for _, scm := range scms { + if scm.Header.Type == unix.SCM_RIGHTS { + scmFds, err := unix.ParseUnixRights(&scm) + if err != nil { + lastErr = err + } else { + fds = append(fds, scmFds...) + } + } + } + if lastErr != nil { + return nil, lastErr } - scm := scms[0] - fds, err := unix.ParseUnixRights(&scm) - if err != nil { - return nil, err + // We do this after collecting the fds to make sure we close them all when + // returning an error here. + if len(scms) != 1 { + return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) } if len(fds) != 1 { return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds)) } - fd := uintptr(fds[0]) - - return os.NewFile(fd, string(name)), nil + return os.NewFile(uintptr(fds[0]), string(name)), nil } -// SendFd sends a file descriptor over the given AF_UNIX socket. In -// addition, the file.Name() of the given file will also be sent as -// non-auxiliary data in the same payload (allowing to send contextual -// information for a file descriptor). -func SendFd(socket *os.File, name string, fd uintptr) error { +// SendFile sends a file over the given AF_UNIX socket. file.Name() is also +// included so that if the other end uses RecvFile, the file will have the same +// name information. +func SendFile(socket *os.File, file *os.File) error { + name := file.Name() if len(name) >= MaxNameLen { return fmt.Errorf("sendfd: filename too long: %s", name) } - return SendFds(socket, []byte(name), int(fd)) + err := SendRawFd(socket, name, file.Fd()) + runtime.KeepAlive(file) + return err } -// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket. -func SendFds(socket *os.File, msg []byte, fds ...int) error { - oob := unix.UnixRights(fds...) - return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0) +// SendRawFd sends a specific file descriptor over the given AF_UNIX socket. +func SendRawFd(socket *os.File, msg string, fd uintptr) error { + oob := unix.UnixRights(int(fd)) + return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0) } diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index 6b9a7be038f..ca520b63b36 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -91,7 +91,7 @@ func CloseExecFrom(minFd int) error { } // NewSockPair returns a new unix socket pair -func NewSockPair(name string) (parent *os.File, child *os.File, err error) { +func NewSockPair(name string) (parent, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) if err != nil { return nil, nil, err diff --git a/tty.go b/tty.go index fba3e025bc0..c101aacb78b 100644 --- a/tty.go +++ b/tty.go @@ -100,7 +100,7 @@ func (t *tty) initHostConsole() error { } func (t *tty) recvtty(socket *os.File) (Err error) { - f, err := utils.RecvFd(socket) + f, err := utils.RecvFile(socket) if err != nil { return err } From 20b95f23ca33cb1751ee0b5318e1d5f3676032c6 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 8 Aug 2023 11:42:24 +1000 Subject: [PATCH 4/7] libcontainer: seccomp: pass around *os.File for notifyfd *os.File is correctly tracked by the garbage collector, and there's no need to use raw file descriptors for this code. Signed-off-by: Aleksa Sarai --- libcontainer/init_linux.go | 8 ++--- libcontainer/seccomp/patchbpf/enosys_linux.go | 13 ++++--- libcontainer/seccomp/seccomp_linux.go | 35 +++++++++---------- libcontainer/seccomp/seccomp_unsupported.go | 7 ++-- 4 files changed, 31 insertions(+), 32 deletions(-) diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index cef1142a2bc..7b6c8113b6c 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -404,11 +404,11 @@ func syncParentHooks(pipe *os.File) error { // syncParentSeccomp sends the fd associated with the seccomp file descriptor // to the parent, and wait for the parent to do pidfd_getfd() to grab a copy. -func syncParentSeccomp(pipe *os.File, seccompFd int) error { - if seccompFd == -1 { +func syncParentSeccomp(pipe *os.File, seccompFd *os.File) error { + if seccompFd == nil { return nil } - defer unix.Close(seccompFd) + defer seccompFd.Close() // Tell parent to grab our fd. // @@ -419,7 +419,7 @@ func syncParentSeccomp(pipe *os.File, seccompFd int) error { // before the parent gets the file descriptor would deadlock "runc init" if // we allowed it for SCMP_ACT_NOTIFY). See seccomp.InitSeccomp() for more // details. - if err := writeSyncArg(pipe, procSeccomp, seccompFd); err != nil { + if err := writeSyncArg(pipe, procSeccomp, seccompFd.Fd()); err != nil { return err } // Wait for parent to tell us they've grabbed the seccompfd. diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go index 7fc9fd662c3..40e51533410 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux.go @@ -690,17 +690,17 @@ func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err erro // patches said filter to handle -ENOSYS in a much nicer manner than the // default libseccomp default action behaviour, and loads the patched filter // into the kernel for the current process. -func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) { +func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (*os.File, error) { // Generate a patched filter. fprog, err := enosysPatchFilter(config, filter) if err != nil { - return -1, fmt.Errorf("error patching filter: %w", err) + return nil, fmt.Errorf("error patching filter: %w", err) } // Get the set of libseccomp flags set. seccompFlags, noNewPrivs, err := filterFlags(config, filter) if err != nil { - return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err) + return nil, fmt.Errorf("unable to fetch seccomp filter flags: %w", err) } // Set no_new_privs if it was requested, though in runc we handle @@ -708,15 +708,14 @@ func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, if noNewPrivs { logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path") if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { - return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err) + return nil, fmt.Errorf("error enabling no_new_privs bit: %w", err) } } // Finally, load the filter. fd, err := sysSeccompSetFilter(seccompFlags, fprog) if err != nil { - return -1, fmt.Errorf("error loading seccomp filter: %w", err) + return nil, fmt.Errorf("error loading seccomp filter: %w", err) } - - return fd, nil + return os.NewFile(uintptr(fd), "[seccomp filter]"), nil } diff --git a/libcontainer/seccomp/seccomp_linux.go b/libcontainer/seccomp/seccomp_linux.go index fed02bcedc4..2c64ebbbadd 100644 --- a/libcontainer/seccomp/seccomp_linux.go +++ b/libcontainer/seccomp/seccomp_linux.go @@ -6,6 +6,7 @@ package seccomp import ( "errors" "fmt" + "os" libseccomp "github.com/seccomp/libseccomp-golang" "github.com/sirupsen/logrus" @@ -27,17 +28,16 @@ const ( ) // InitSeccomp installs the seccomp filters to be used in the container as -// specified in config. -// Returns the seccomp file descriptor if any of the filters include a -// SCMP_ACT_NOTIFY action, otherwise returns -1. -func InitSeccomp(config *configs.Seccomp) (int, error) { +// specified in config. Returns the seccomp file descriptor if any of the +// filters include a SCMP_ACT_NOTIFY action. +func InitSeccomp(config *configs.Seccomp) (*os.File, error) { if config == nil { - return -1, errors.New("cannot initialize Seccomp - nil config passed") + return nil, errors.New("cannot initialize Seccomp - nil config passed") } defaultAction, err := getAction(config.DefaultAction, config.DefaultErrnoRet) if err != nil { - return -1, errors.New("error initializing seccomp - invalid default action") + return nil, errors.New("error initializing seccomp - invalid default action") } // Ignore the error since pre-2.4 libseccomp is treated as API level 0. @@ -45,7 +45,7 @@ func InitSeccomp(config *configs.Seccomp) (int, error) { for _, call := range config.Syscalls { if call.Action == configs.Notify { if apiLevel < 6 { - return -1, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel) + return nil, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel) } // We can't allow the write syscall to notify to the seccomp agent. @@ -61,36 +61,36 @@ func InitSeccomp(config *configs.Seccomp) (int, error) { // agent allows those syscalls to proceed, initialization works just fine and the agent can // handle future read()/close() syscalls as it wanted. if call.Name == "write" { - return -1, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall") + return nil, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall") } } } // See comment on why write is not allowed. The same reason applies, as this can mean handling write too. if defaultAction == libseccomp.ActNotify { - return -1, errors.New("SCMP_ACT_NOTIFY cannot be used as default action") + return nil, errors.New("SCMP_ACT_NOTIFY cannot be used as default action") } filter, err := libseccomp.NewFilter(defaultAction) if err != nil { - return -1, fmt.Errorf("error creating filter: %w", err) + return nil, fmt.Errorf("error creating filter: %w", err) } // Add extra architectures for _, arch := range config.Architectures { scmpArch, err := libseccomp.GetArchFromString(arch) if err != nil { - return -1, fmt.Errorf("error validating Seccomp architecture: %w", err) + return nil, fmt.Errorf("error validating Seccomp architecture: %w", err) } if err := filter.AddArch(scmpArch); err != nil { - return -1, fmt.Errorf("error adding architecture to seccomp filter: %w", err) + return nil, fmt.Errorf("error adding architecture to seccomp filter: %w", err) } } // Add extra flags. for _, flag := range config.Flags { if err := setFlag(filter, flag); err != nil { - return -1, err + return nil, err } } @@ -110,25 +110,24 @@ func InitSeccomp(config *configs.Seccomp) (int, error) { // Unset no new privs bit if err := filter.SetNoNewPrivsBit(false); err != nil { - return -1, fmt.Errorf("error setting no new privileges: %w", err) + return nil, fmt.Errorf("error setting no new privileges: %w", err) } // Add a rule for each syscall for _, call := range config.Syscalls { if call == nil { - return -1, errors.New("encountered nil syscall while initializing Seccomp") + return nil, errors.New("encountered nil syscall while initializing Seccomp") } if err := matchCall(filter, call, defaultAction); err != nil { - return -1, err + return nil, err } } seccompFd, err := patchbpf.PatchAndLoad(config, filter) if err != nil { - return -1, fmt.Errorf("error loading seccomp filter into kernel: %w", err) + return nil, fmt.Errorf("error loading seccomp filter into kernel: %w", err) } - return seccompFd, nil } diff --git a/libcontainer/seccomp/seccomp_unsupported.go b/libcontainer/seccomp/seccomp_unsupported.go index 885529dc7d0..b08a3498687 100644 --- a/libcontainer/seccomp/seccomp_unsupported.go +++ b/libcontainer/seccomp/seccomp_unsupported.go @@ -5,6 +5,7 @@ package seccomp import ( "errors" + "os" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runtime-spec/specs-go" @@ -13,11 +14,11 @@ import ( var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") // InitSeccomp does nothing because seccomp is not supported. -func InitSeccomp(config *configs.Seccomp) (int, error) { +func InitSeccomp(config *configs.Seccomp) (*os.File, error) { if config != nil { - return -1, ErrSeccompNotEnabled + return nil, ErrSeccompNotEnabled } - return -1, nil + return nil, nil } // FlagSupported tells if a provided seccomp flag is supported. From 5c7839b50315a6874fc8e4989960de5ed4f57d14 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 2 Aug 2023 12:33:14 +1000 Subject: [PATCH 5/7] rootfs: use empty src for MS_REMOUNT The kernel ignores these arguments, and passing them can lead to confusing error messages (the old source is irrelevant for MS_REMOUNT), as well as causing issues for a future patch where we switch to move_mount(2). Signed-off-by: Aleksa Sarai --- libcontainer/rootfs_linux.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index be6c73000eb..4212b33a314 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -1105,7 +1105,7 @@ func writeSystemProperty(key, value string) error { func remount(m mountEntry, rootfs string, noMountFallback bool) error { return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { flags := uintptr(m.Flags | unix.MS_REMOUNT) - err := mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, flags, "") + err := mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "") if err == nil { return nil } @@ -1129,7 +1129,7 @@ func remount(m mountEntry, rootfs string, noMountFallback bool) error { } // ... and retry the mount with flags found above. flags |= uintptr(int(s.Flags) & checkflags) - return mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, flags, "") + return mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "") }) } From 8aa97ad3a38833359511c8b5c7320e2b714b53c9 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Aug 2023 20:30:53 +1000 Subject: [PATCH 6/7] nsexec: remove cgroupns special-casing The original implementation of cgroupns had additional synchronisation to "ensure" that the process is in the correct cgroup before unsharing the cgroupns. This behaviour was actually never necessary, and after commit 5110bd2fc026 ("nsenter: remove cgroupns sync mechanism") there is no synchronisation at all, meaning that CLONE_NEWCGROUP should not get any special treatment. Fixes: 5110bd2fc026 ("nsenter: remove cgroupns sync mechanism") Fixes: df3fa115f974 ("Add support for cgroup namespace") Signed-off-by: Aleksa Sarai --- libcontainer/nsenter/nsexec.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 07d9e0df130..17e0468c6af 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -1171,7 +1171,7 @@ void nsexec(void) * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) * was broken, so we'll just do it the long way anyway. */ - try_unshare(config.cloneflags & ~CLONE_NEWCGROUP, "remaining namespaces (except cgroupns)"); + try_unshare(config.cloneflags, "remaining namespaces"); update_timens_offsets(config.timensoffset, config.timensoffset_len); /* Ask our parent to send the mount sources fds. */ @@ -1302,10 +1302,6 @@ void nsexec(void) bail("setgroups failed"); } - if (config.cloneflags & CLONE_NEWCGROUP) { - try_unshare(CLONE_NEWCGROUP, "cgroup namespace"); - } - write_log(DEBUG, "signal completion to stage-0"); s = SYNC_CHILD_FINISH; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) From 1f25724a962c8049d876915085d348fbc2154f7f Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sun, 6 Aug 2023 13:59:06 +1000 Subject: [PATCH 7/7] configs: fix idmapped mounts json field names In the runc state JSON we always use snake_case. This is a no-op change, but it will cause any existing container state files to be incorrectly parsed. Luckily, commit fbf183c6f8c4 ("Add uid and gid mappings to mounts") has never been in a runc release so we can change this before a 1.2.z release. Fixes: fbf183c6f8c4 ("Add uid and gid mappings to mounts") Signed-off-by: Aleksa Sarai --- libcontainer/configs/mount_linux.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcontainer/configs/mount_linux.go b/libcontainer/configs/mount_linux.go index a4275df3a03..6d4106de0c6 100644 --- a/libcontainer/configs/mount_linux.go +++ b/libcontainer/configs/mount_linux.go @@ -34,13 +34,13 @@ type Mount struct { // Note that, the underlying filesystem should support this feature to be // used. // Every mount point could have its own mapping. - UIDMappings []IDMap `json:"uidMappings,omitempty"` + UIDMappings []IDMap `json:"uid_mappings,omitempty"` // GIDMappings is used to changing file group owners w/o calling chown. // Note that, the underlying filesystem should support this feature to be // used. // Every mount point could have its own mapping. - GIDMappings []IDMap `json:"gidMappings,omitempty"` + GIDMappings []IDMap `json:"gid_mappings,omitempty"` } func (m *Mount) IsBind() bool {