From 94505a046ad365eaeb28654a36c273b1356e050c Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Sun, 1 Oct 2023 11:09:49 +0800 Subject: [PATCH] *: introduce pidfd-socket flag The container manager like containerd-shim can't use cgroup.kill feature or freeze all the processes in cgroup to terminate the exec init process. It's unsafe to call kill(2) since the pid can be recycled. It's good to provide the pidfd of init process through the pidfd-socket. It's similar to the console-socket. With the pidfd, the container manager like containerd-shim can send the signal to target process safely. And for the standard init process, we can have polling support to get exit event instead of blocking on wait4. Signed-off-by: Wei Fu --- .gitignore | 1 + Makefile | 7 +- contrib/cmd/pidfd-kill/pidfd-kill.go | 114 +++++++++++++++++++++++++++ create.go | 4 + exec.go | 5 ++ libcontainer/container_linux.go | 7 ++ libcontainer/init_linux.go | 33 +++++++- libcontainer/process.go | 3 + libcontainer/setns_init_linux.go | 6 ++ libcontainer/standard_init_linux.go | 7 ++ run.go | 4 + tests/integration/helpers.bash | 42 ++++++++++ tests/integration/pidfd-socket.bats | 99 +++++++++++++++++++++++ utils_linux.go | 44 +++++++++++ 14 files changed, 371 insertions(+), 5 deletions(-) create mode 100644 contrib/cmd/pidfd-kill/pidfd-kill.go create mode 100644 tests/integration/pidfd-socket.bats diff --git a/.gitignore b/.gitignore index f022ed275cd..52dac0bf421 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ vendor/pkg /contrib/cmd/seccompagent/seccompagent /contrib/cmd/fs-idmap/fs-idmap /contrib/cmd/memfd-bind/memfd-bind +/contrib/cmd/pidfd-kill/pidfd-kill man/man8 release Vagrantfile diff --git a/Makefile b/Makefile index f81879cbc5c..df935e41929 100644 --- a/Makefile +++ b/Makefile @@ -71,10 +71,10 @@ runc-bin: runc-dmz $(GO_BUILD) -o runc . .PHONY: all -all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind +all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind pidfd-kill -.PHONY: recvtty sd-helper seccompagent fs-idmap memfd-bind -recvtty sd-helper seccompagent fs-idmap memfd-bind: +.PHONY: recvtty sd-helper seccompagent fs-idmap memfd-bind pidfd-kill +recvtty sd-helper seccompagent fs-idmap memfd-bind pidfd-kill: $(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@ .PHONY: static @@ -194,6 +194,7 @@ clean: rm -f contrib/cmd/sd-helper/sd-helper rm -f contrib/cmd/seccompagent/seccompagent rm -f contrib/cmd/memfd-bind/memfd-bind + rm -f contrib/cmd/pidfd-kill/pidfd-kill sudo rm -rf release rm -rf man/man8 diff --git a/contrib/cmd/pidfd-kill/pidfd-kill.go b/contrib/cmd/pidfd-kill/pidfd-kill.go new file mode 100644 index 00000000000..b5caa4f208d --- /dev/null +++ b/contrib/cmd/pidfd-kill/pidfd-kill.go @@ -0,0 +1,114 @@ +package main + +import ( + "errors" + "fmt" + "net" + "os" + "os/signal" + + "github.com/urfave/cli" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +const ( + usage = `Open Container Initiative contrib/cmd/pidfd-kill + +pidfd-kill is an implementation of a consumer of runC's --pidfd-socket API. +After received SIGTERM, pidfd-kill sends the given signal to init process by +pidfd received from --pidfd-socket. + +To use pidfd-kill, just specify a socket path at which you want to receive +pidfd: + + $ pidfd-kill [--signal KILL] socket.sock +` +) + +func main() { + app := cli.NewApp() + app.Name = "pidfd-kill" + app.Usage = usage + + app.Flags = []cli.Flag{ + cli.StringFlag{ + Name: "signal", + Value: "SIGKILL", + Usage: "Signal to send to the init process", + }, + cli.StringFlag{ + Name: "pid-file", + Value: "", + Usage: "Path to write the pidfd-kill process ID to", + }, + } + + app.Action = func(ctx *cli.Context) error { + args := ctx.Args() + if len(args) != 1 { + return errors.New("required a single socket path") + } + + socketFile := ctx.Args()[0] + + pidFile := ctx.String("pid-file") + if pidFile != "" { + pid := fmt.Sprintf("%d\n", os.Getpid()) + if err := os.WriteFile(pidFile, []byte(pid), 0o644); err != nil { + return err + } + defer os.Remove(pidFile) + } + + sigStr := ctx.String("signal") + if sigStr == "" { + sigStr = "SIGKILL" + } + sig := unix.SignalNum(sigStr) + + pidfdFile, err := recvPidfd(socketFile) + if err != nil { + return err + } + defer pidfdFile.Close() + + signalCh := make(chan os.Signal, 16) + signal.Notify(signalCh, unix.SIGTERM) + <-signalCh + + return unix.PidfdSendSignal(int(pidfdFile.Fd()), sig, nil, 0) + } + if err := app.Run(os.Args); err != nil { + fmt.Fprintln(os.Stderr, "fatal error:", err) + os.Exit(1) + } +} + +func recvPidfd(socketFile string) (*os.File, error) { + ln, err := net.Listen("unix", socketFile) + if err != nil { + return nil, err + } + defer ln.Close() + + conn, err := ln.Accept() + if err != nil { + return nil, err + } + defer conn.Close() + + unixconn, ok := conn.(*net.UnixConn) + if !ok { + return nil, errors.New("failed to cast to unixconn") + } + + socket, err := unixconn.File() + if err != nil { + return nil, err + } + defer socket.Close() + + return utils.RecvFile(socket) +} diff --git a/create.go b/create.go index 3788a532fce..83b5ba5cf4d 100644 --- a/create.go +++ b/create.go @@ -34,6 +34,10 @@ command(s) that get executed on start, edit the args parameter of the spec. See Value: "", Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", }, + cli.StringFlag{ + Name: "pidfd-socket", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the init process", + }, cli.StringFlag{ Name: "pid-file", Value: "", diff --git a/exec.go b/exec.go index 982520f72b8..675f12fb905 100644 --- a/exec.go +++ b/exec.go @@ -33,6 +33,10 @@ following will output a list of processes running in the container: Name: "console-socket", Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", }, + cli.StringFlag{ + Name: "pidfd-socket", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the exec process", + }, cli.StringFlag{ Name: "cwd", Usage: "current working directory in the container", @@ -181,6 +185,7 @@ func execProcess(context *cli.Context) (int, error) { shouldDestroy: false, container: container, consoleSocket: context.String("console-socket"), + pidfdSocket: context.String("pidfd-socket"), detach: context.Bool("detach"), pidFile: context.String("pid-file"), action: CT_ACT_RUN, diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 35f4f5df390..64e53e50e33 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -586,6 +586,13 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel) } + if p.PidfdSocket != nil { + cmd.ExtraFiles = append(cmd.ExtraFiles, p.PidfdSocket) + cmd.Env = append(cmd.Env, + "_LIBCONTAINER_PIDFD_SOCK="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), + ) + } + if safeExe != nil { // Due to a Go stdlib bug, we need to add safeExe to the set of // ExtraFiles otherwise it is possible for the stdlib to clobber the fd diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index b9affb91c4b..7ffaae32453 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -179,6 +179,16 @@ func startInitialization() (retErr error) { defer consoleSocket.Close() } + var pidfdSocket *os.File + if envSockFd := os.Getenv("_LIBCONTAINER_PIDFD_SOCK"); envSockFd != "" { + sockFd, err := strconv.Atoi(envSockFd) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_PIDFD_SOCK: %w", err) + } + pidfdSocket = os.NewFile(uintptr(sockFd), "pidfd-socket") + defer pidfdSocket.Close() + } + // Get mount files (O_PATH). mountSrcFds, err := parseFdsFromEnv("_LIBCONTAINER_MOUNT_FDS") if err != nil { @@ -222,10 +232,10 @@ func startInitialization() (retErr error) { } // If init succeeds, it will not return, hence none of the defers will be called. - return containerInit(it, &config, syncPipe, consoleSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) + return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) } -func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error { +func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error { if err := populateProcessEnvironment(config.Env); err != nil { return err } @@ -240,6 +250,7 @@ func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSock i := &linuxSetnsInit{ pipe: pipe, consoleSocket: consoleSocket, + pidfdSocket: pidfdSocket, config: config, logFd: logFd, dmzExe: dmzExe, @@ -249,6 +260,7 @@ func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSock i := &linuxStandardInit{ pipe: pipe, consoleSocket: consoleSocket, + pidfdSocket: pidfdSocket, parentPid: unix.Getppid(), config: config, fifoFd: fifoFd, @@ -690,3 +702,20 @@ func signalAllProcesses(m cgroups.Manager, s unix.Signal) error { return nil } + +// setupPidfd opens a process file descriptor of init process, and sends the +// file descriptor back to the socket. +func setupPidfd(socket *os.File, initType string) error { + defer socket.Close() + + pidFd, err := unix.PidfdOpen(os.Getpid(), 0) + if err != nil { + return fmt.Errorf("failed to pidfd_open: %w", err) + } + + if err := utils.SendRawFd(socket, initType, uintptr(pidFd)); err != nil { + unix.Close(pidFd) + return fmt.Errorf("failed to send pidfd on socket: %w", err) + } + return unix.Close(pidFd) +} diff --git a/libcontainer/process.go b/libcontainer/process.go index 08c2396fe02..8181062ae64 100644 --- a/libcontainer/process.go +++ b/libcontainer/process.go @@ -77,6 +77,9 @@ type Process struct { // ConsoleSocket provides the masterfd console. ConsoleSocket *os.File + // PidfdSocket provides process file descriptor of it own. + PidfdSocket *os.File + // Init specifies whether the process is the first process in the container. Init bool diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index f3edb7100c8..46b07620933 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -22,6 +22,7 @@ import ( type linuxSetnsInit struct { pipe *syncSocket consoleSocket *os.File + pidfdSocket *os.File config *initConfig logFd int dmzExe *os.File @@ -56,6 +57,11 @@ func (l *linuxSetnsInit) Init() error { return err } } + if l.pidfdSocket != nil { + if err := setupPidfd(l.pidfdSocket, "setns"); err != nil { + return fmt.Errorf("failed to setup pidfd: %w", err) + } + } if l.config.NoNewPrivileges { if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { return err diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 4fab50c0581..c01c7ed72a8 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -22,6 +22,7 @@ import ( type linuxStandardInit struct { pipe *syncSocket consoleSocket *os.File + pidfdSocket *os.File parentPid int fifoFd int logFd int @@ -114,6 +115,12 @@ func (l *linuxStandardInit) Init() error { } } + if l.pidfdSocket != nil { + if err := setupPidfd(l.pidfdSocket, "standard"); err != nil { + return fmt.Errorf("failed to setup pidfd: %w", err) + } + } + // Finish the rootfs setup. if l.config.Config.Namespaces.Contains(configs.NEWNS) { if err := finalizeRootfs(l.config.Config); err != nil { diff --git a/run.go b/run.go index 8b4f4d1fb23..3ee1060a83d 100644 --- a/run.go +++ b/run.go @@ -35,6 +35,10 @@ command(s) that get executed on start, edit the args parameter of the spec. See Value: "", Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", }, + cli.StringFlag{ + Name: "pidfd-socket", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the init process", + }, cli.BoolFlag{ Name: "detach, d", Usage: "detach from the container's process", diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index 7e6399a47b8..1e601541d9b 100755 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -17,6 +17,7 @@ RECVTTY="${INTEGRATION_ROOT}/../../contrib/cmd/recvtty/recvtty" SD_HELPER="${INTEGRATION_ROOT}/../../contrib/cmd/sd-helper/sd-helper" SECCOMP_AGENT="${INTEGRATION_ROOT}/../../contrib/cmd/seccompagent/seccompagent" FS_IDMAP="${INTEGRATION_ROOT}/../../contrib/cmd/fs-idmap/fs-idmap" +PIDFD_KILL="${INTEGRATION_ROOT}/../../contrib/cmd/pidfd-kill/pidfd-kill" # Some variables may not always be set. Set those to empty value, # if unset, to avoid "unbound variable" error. @@ -697,3 +698,44 @@ function requires_idmap_fs() { esac # If we have another error, the integration test will fail and report it. } + +# setup_pidfd_kill runs pidfd-kill process in background and receives the +# SIGTERM as signal to send the given signal to init process. +function setup_pidfd_kill() { + local signal=$1 + + [ ! -v ROOT ] && return 1 + local dir="${ROOT}/pidfd" + + mkdir "${dir}" + export PIDFD_SOCKET="${dir}/sock" + + ("${PIDFD_KILL}" --pid-file "${dir}/pid" --signal "${signal}" "${PIDFD_SOCKET}" &) & + + # ensure socket is ready + retry 10 1 stat "${PIDFD_SOCKET}" +} + +# teardown_pidfd_kill cleanups all the resources related to pidfd-kill. +function teardown_pidfd_kill() { + [ ! -v ROOT ] && return 0 + + local dir="${ROOT}/pidfd" + + if [ -f "${dir}/pid" ]; then + kill -9 "$(cat "${dir}/pid")" + fi + + rm -rf "${dir}" +} + +# pidfd_kill sends the signal to init process. +function pidfd_kill() { + [ ! -v ROOT ] && return 0 + + local dir="${ROOT}/pidfd" + + if [ -f "${dir}/pid" ]; then + kill "$(cat "${dir}/pid")" + fi +} diff --git a/tests/integration/pidfd-socket.bats b/tests/integration/pidfd-socket.bats new file mode 100644 index 00000000000..9c10f26e1df --- /dev/null +++ b/tests/integration/pidfd-socket.bats @@ -0,0 +1,99 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + requires root + requires_kernel 5.3 + setup_busybox + update_config '.process.args = ["/bin/sleep", "1d"]' +} + +function teardown() { + teardown_pidfd_kill + teardown_bundle +} + +@test "runc create [ --pidfd-socket ] " { + setup_pidfd_kill "SIGTERM" + + runc create --console-socket "$CONSOLE_SOCKET" --pidfd-socket "${PIDFD_SOCKET}" test_pidfd + [ "$status" -eq 0 ] + testcontainer test_pidfd created + + pidfd_kill + wait_for_container 10 1 test_pidfd stopped +} + +@test "runc run [ --pidfd-socket ] " { + setup_pidfd_kill "SIGKILL" + + runc run -d --console-socket "$CONSOLE_SOCKET" --pidfd-socket "${PIDFD_SOCKET}" test_pidfd + [ "$status" -eq 0 ] + testcontainer test_pidfd running + + pidfd_kill + wait_for_container 10 1 test_pidfd stopped +} + +@test "runc exec [ --pidfd-socket ] [cgroups_v1] " { + requires cgroups_v1 + + set_cgroups_path + + runc run -d --console-socket "$CONSOLE_SOCKET" test_pidfd + [ "$status" -eq 0 ] + testcontainer test_pidfd running + + # Use sub-cgroup to ensure that exec process has been killed + test_pidfd_cgroup_path=$(get_cgroup_path "pids") + mkdir "${test_pidfd_cgroup_path}/exec_pidfd" + [ "$status" -eq 0 ] + + setup_pidfd_kill "SIGKILL" + + __runc exec -d --cgroup "pids:exec_pidfd" --pid-file "exec_pid.txt" --pidfd-socket "${PIDFD_SOCKET}" test_pidfd sleep 1d + [ "$status" -eq 0 ] + + exec_pid=$(cat exec_pid.txt) + exec_pid_in_cgroup=$(cat "${test_pidfd_cgroup_path}/exec_pidfd/cgroup.procs") + [ "${exec_pid}" -eq "${exec_pid_in_cgroup}" ] + + pidfd_kill + + # ensure exec process has been reaped + retry 10 1 rmdir "${test_pidfd_cgroup_path}/exec_pidfd" + + testcontainer test_pidfd running +} + +@test "runc exec [ --pidfd-socket ] [cgroups_v2] " { + requires cgroups_v2 + + set_cgroups_path + + runc run -d --console-socket "$CONSOLE_SOCKET" test_pidfd + [ "$status" -eq 0 ] + testcontainer test_pidfd running + + # Use sub-cgroup to ensure that exec process has been killed + test_pidfd_cgroup_path=$(get_cgroup_path "pids") + mkdir "${test_pidfd_cgroup_path}/exec_pidfd" + [ "$status" -eq 0 ] + + setup_pidfd_kill "SIGKILL" + + __runc exec -d --cgroup "exec_pidfd" --pid-file "exec_pid.txt" --pidfd-socket "${PIDFD_SOCKET}" test_pidfd sleep 1d + [ "$status" -eq 0 ] + + exec_pid=$(cat exec_pid.txt) + exec_pid_in_cgroup=$(cat "${test_pidfd_cgroup_path}/exec_pidfd/cgroup.procs") + [ "${exec_pid}" -eq "${exec_pid_in_cgroup}" ] + + pidfd_kill + + # ensure exec process has been reaped + retry 10 1 rmdir "${test_pidfd_cgroup_path}/exec_pidfd" + + testcontainer test_pidfd running +} diff --git a/utils_linux.go b/utils_linux.go index b5c855cdbb9..c7d9794c86e 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -18,6 +18,7 @@ import ( "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/specconv" + "github.com/opencontainers/runc/libcontainer/system/kernelversion" "github.com/opencontainers/runc/libcontainer/utils" ) @@ -199,6 +200,7 @@ type runner struct { preserveFDs int pidFile string consoleSocket string + pidfdSocket string container *libcontainer.Container action CtAct notifySocket *notifySocket @@ -255,6 +257,14 @@ func (r *runner) run(config *specs.Process) (int, error) { } defer tty.Close() + if r.pidfdSocket != "" { + connClose, err := setupPidfdSocket(process, r.pidfdSocket) + if err != nil { + return -1, err + } + defer connClose() + } + switch r.action { case CT_ACT_CREATE: err = r.container.Start(process) @@ -390,6 +400,7 @@ func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.C listenFDs: listenFDs, notifySocket: notifySocket, consoleSocket: context.String("console-socket"), + pidfdSocket: context.String("pidfd-socket"), detach: context.Bool("detach"), pidFile: context.String("pid-file"), preserveFDs: context.Int("preserve-fds"), @@ -399,3 +410,36 @@ func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.C } return r.run(spec.Process) } + +func setupPidfdSocket(process *libcontainer.Process, sockpath string) (_clean func(), _ error) { + linux530 := kernelversion.KernelVersion{Kernel: 5, Major: 3} + ok, err := kernelversion.GreaterEqualThan(linux530) + if err != nil { + return nil, err + } + if !ok { + return nil, fmt.Errorf("--pidfd-socket requires >= v5.3 kernel") + } + + conn, err := net.Dial("unix", sockpath) + if err != nil { + return nil, fmt.Errorf("failed to dail %s: %w", sockpath, err) + } + + uc, ok := conn.(*net.UnixConn) + if !ok { + conn.Close() + return nil, errors.New("failed to cast to UnixConn") + } + + socket, err := uc.File() + if err != nil { + conn.Close() + return nil, fmt.Errorf("failed to dup socket: %w", err) + } + + process.PidfdSocket = socket + return func() { + conn.Close() + }, nil +}