diff --git a/.gitignore b/.gitignore index f022ed275cd..52dac0bf421 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ vendor/pkg /contrib/cmd/seccompagent/seccompagent /contrib/cmd/fs-idmap/fs-idmap /contrib/cmd/memfd-bind/memfd-bind +/contrib/cmd/pidfd-kill/pidfd-kill man/man8 release Vagrantfile diff --git a/Makefile b/Makefile index f81879cbc5c..df935e41929 100644 --- a/Makefile +++ b/Makefile @@ -71,10 +71,10 @@ runc-bin: runc-dmz $(GO_BUILD) -o runc . .PHONY: all -all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind +all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind pidfd-kill -.PHONY: recvtty sd-helper seccompagent fs-idmap memfd-bind -recvtty sd-helper seccompagent fs-idmap memfd-bind: +.PHONY: recvtty sd-helper seccompagent fs-idmap memfd-bind pidfd-kill +recvtty sd-helper seccompagent fs-idmap memfd-bind pidfd-kill: $(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@ .PHONY: static @@ -194,6 +194,7 @@ clean: rm -f contrib/cmd/sd-helper/sd-helper rm -f contrib/cmd/seccompagent/seccompagent rm -f contrib/cmd/memfd-bind/memfd-bind + rm -f contrib/cmd/pidfd-kill/pidfd-kill sudo rm -rf release rm -rf man/man8 diff --git a/contrib/cmd/pidfd-kill/pidfd-kill.go b/contrib/cmd/pidfd-kill/pidfd-kill.go new file mode 100644 index 00000000000..b5caa4f208d --- /dev/null +++ b/contrib/cmd/pidfd-kill/pidfd-kill.go @@ -0,0 +1,114 @@ +package main + +import ( + "errors" + "fmt" + "net" + "os" + "os/signal" + + "github.com/urfave/cli" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +const ( + usage = `Open Container Initiative contrib/cmd/pidfd-kill + +pidfd-kill is an implementation of a consumer of runC's --pidfd-socket API. +After received SIGTERM, pidfd-kill sends the given signal to init process by +pidfd received from --pidfd-socket. + +To use pidfd-kill, just specify a socket path at which you want to receive +pidfd: + + $ pidfd-kill [--signal KILL] socket.sock +` +) + +func main() { + app := cli.NewApp() + app.Name = "pidfd-kill" + app.Usage = usage + + app.Flags = []cli.Flag{ + cli.StringFlag{ + Name: "signal", + Value: "SIGKILL", + Usage: "Signal to send to the init process", + }, + cli.StringFlag{ + Name: "pid-file", + Value: "", + Usage: "Path to write the pidfd-kill process ID to", + }, + } + + app.Action = func(ctx *cli.Context) error { + args := ctx.Args() + if len(args) != 1 { + return errors.New("required a single socket path") + } + + socketFile := ctx.Args()[0] + + pidFile := ctx.String("pid-file") + if pidFile != "" { + pid := fmt.Sprintf("%d\n", os.Getpid()) + if err := os.WriteFile(pidFile, []byte(pid), 0o644); err != nil { + return err + } + defer os.Remove(pidFile) + } + + sigStr := ctx.String("signal") + if sigStr == "" { + sigStr = "SIGKILL" + } + sig := unix.SignalNum(sigStr) + + pidfdFile, err := recvPidfd(socketFile) + if err != nil { + return err + } + defer pidfdFile.Close() + + signalCh := make(chan os.Signal, 16) + signal.Notify(signalCh, unix.SIGTERM) + <-signalCh + + return unix.PidfdSendSignal(int(pidfdFile.Fd()), sig, nil, 0) + } + if err := app.Run(os.Args); err != nil { + fmt.Fprintln(os.Stderr, "fatal error:", err) + os.Exit(1) + } +} + +func recvPidfd(socketFile string) (*os.File, error) { + ln, err := net.Listen("unix", socketFile) + if err != nil { + return nil, err + } + defer ln.Close() + + conn, err := ln.Accept() + if err != nil { + return nil, err + } + defer conn.Close() + + unixconn, ok := conn.(*net.UnixConn) + if !ok { + return nil, errors.New("failed to cast to unixconn") + } + + socket, err := unixconn.File() + if err != nil { + return nil, err + } + defer socket.Close() + + return utils.RecvFile(socket) +} diff --git a/create.go b/create.go index 3788a532fce..83b5ba5cf4d 100644 --- a/create.go +++ b/create.go @@ -34,6 +34,10 @@ command(s) that get executed on start, edit the args parameter of the spec. See Value: "", Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", }, + cli.StringFlag{ + Name: "pidfd-socket", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the init process", + }, cli.StringFlag{ Name: "pid-file", Value: "", diff --git a/exec.go b/exec.go index 982520f72b8..53f5f7dea8d 100644 --- a/exec.go +++ b/exec.go @@ -33,6 +33,10 @@ following will output a list of processes running in the container: Name: "console-socket", Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", }, + cli.StringFlag{ + Name: "pidfd-socket", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the init process", + }, cli.StringFlag{ Name: "cwd", Usage: "current working directory in the container", @@ -181,6 +185,7 @@ func execProcess(context *cli.Context) (int, error) { shouldDestroy: false, container: container, consoleSocket: context.String("console-socket"), + pidfdSocket: context.String("pidfd-socket"), detach: context.Bool("detach"), pidFile: context.String("pid-file"), action: CT_ACT_RUN, diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 35f4f5df390..64e53e50e33 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -586,6 +586,13 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel) } + if p.PidfdSocket != nil { + cmd.ExtraFiles = append(cmd.ExtraFiles, p.PidfdSocket) + cmd.Env = append(cmd.Env, + "_LIBCONTAINER_PIDFD_SOCK="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), + ) + } + if safeExe != nil { // Due to a Go stdlib bug, we need to add safeExe to the set of // ExtraFiles otherwise it is possible for the stdlib to clobber the fd diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index b9affb91c4b..7ffaae32453 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -179,6 +179,16 @@ func startInitialization() (retErr error) { defer consoleSocket.Close() } + var pidfdSocket *os.File + if envSockFd := os.Getenv("_LIBCONTAINER_PIDFD_SOCK"); envSockFd != "" { + sockFd, err := strconv.Atoi(envSockFd) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_PIDFD_SOCK: %w", err) + } + pidfdSocket = os.NewFile(uintptr(sockFd), "pidfd-socket") + defer pidfdSocket.Close() + } + // Get mount files (O_PATH). mountSrcFds, err := parseFdsFromEnv("_LIBCONTAINER_MOUNT_FDS") if err != nil { @@ -222,10 +232,10 @@ func startInitialization() (retErr error) { } // If init succeeds, it will not return, hence none of the defers will be called. - return containerInit(it, &config, syncPipe, consoleSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) + return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) } -func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error { +func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error { if err := populateProcessEnvironment(config.Env); err != nil { return err } @@ -240,6 +250,7 @@ func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSock i := &linuxSetnsInit{ pipe: pipe, consoleSocket: consoleSocket, + pidfdSocket: pidfdSocket, config: config, logFd: logFd, dmzExe: dmzExe, @@ -249,6 +260,7 @@ func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSock i := &linuxStandardInit{ pipe: pipe, consoleSocket: consoleSocket, + pidfdSocket: pidfdSocket, parentPid: unix.Getppid(), config: config, fifoFd: fifoFd, @@ -690,3 +702,20 @@ func signalAllProcesses(m cgroups.Manager, s unix.Signal) error { return nil } + +// setupPidfd opens a process file descriptor of init process, and sends the +// file descriptor back to the socket. +func setupPidfd(socket *os.File, initType string) error { + defer socket.Close() + + pidFd, err := unix.PidfdOpen(os.Getpid(), 0) + if err != nil { + return fmt.Errorf("failed to pidfd_open: %w", err) + } + + if err := utils.SendRawFd(socket, initType, uintptr(pidFd)); err != nil { + unix.Close(pidFd) + return fmt.Errorf("failed to send pidfd on socket: %w", err) + } + return unix.Close(pidFd) +} diff --git a/libcontainer/process.go b/libcontainer/process.go index 08c2396fe02..8181062ae64 100644 --- a/libcontainer/process.go +++ b/libcontainer/process.go @@ -77,6 +77,9 @@ type Process struct { // ConsoleSocket provides the masterfd console. ConsoleSocket *os.File + // PidfdSocket provides process file descriptor of it own. + PidfdSocket *os.File + // Init specifies whether the process is the first process in the container. Init bool diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index f3edb7100c8..46b07620933 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -22,6 +22,7 @@ import ( type linuxSetnsInit struct { pipe *syncSocket consoleSocket *os.File + pidfdSocket *os.File config *initConfig logFd int dmzExe *os.File @@ -56,6 +57,11 @@ func (l *linuxSetnsInit) Init() error { return err } } + if l.pidfdSocket != nil { + if err := setupPidfd(l.pidfdSocket, "setns"); err != nil { + return fmt.Errorf("failed to setup pidfd: %w", err) + } + } if l.config.NoNewPrivileges { if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { return err diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 4fab50c0581..c01c7ed72a8 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -22,6 +22,7 @@ import ( type linuxStandardInit struct { pipe *syncSocket consoleSocket *os.File + pidfdSocket *os.File parentPid int fifoFd int logFd int @@ -114,6 +115,12 @@ func (l *linuxStandardInit) Init() error { } } + if l.pidfdSocket != nil { + if err := setupPidfd(l.pidfdSocket, "standard"); err != nil { + return fmt.Errorf("failed to setup pidfd: %w", err) + } + } + // Finish the rootfs setup. if l.config.Config.Namespaces.Contains(configs.NEWNS) { if err := finalizeRootfs(l.config.Config); err != nil { diff --git a/run.go b/run.go index 8b4f4d1fb23..3ee1060a83d 100644 --- a/run.go +++ b/run.go @@ -35,6 +35,10 @@ command(s) that get executed on start, edit the args parameter of the spec. See Value: "", Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", }, + cli.StringFlag{ + Name: "pidfd-socket", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the init process", + }, cli.BoolFlag{ Name: "detach, d", Usage: "detach from the container's process", diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index 7e6399a47b8..1e601541d9b 100755 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -17,6 +17,7 @@ RECVTTY="${INTEGRATION_ROOT}/../../contrib/cmd/recvtty/recvtty" SD_HELPER="${INTEGRATION_ROOT}/../../contrib/cmd/sd-helper/sd-helper" SECCOMP_AGENT="${INTEGRATION_ROOT}/../../contrib/cmd/seccompagent/seccompagent" FS_IDMAP="${INTEGRATION_ROOT}/../../contrib/cmd/fs-idmap/fs-idmap" +PIDFD_KILL="${INTEGRATION_ROOT}/../../contrib/cmd/pidfd-kill/pidfd-kill" # Some variables may not always be set. Set those to empty value, # if unset, to avoid "unbound variable" error. @@ -697,3 +698,44 @@ function requires_idmap_fs() { esac # If we have another error, the integration test will fail and report it. } + +# setup_pidfd_kill runs pidfd-kill process in background and receives the +# SIGTERM as signal to send the given signal to init process. +function setup_pidfd_kill() { + local signal=$1 + + [ ! -v ROOT ] && return 1 + local dir="${ROOT}/pidfd" + + mkdir "${dir}" + export PIDFD_SOCKET="${dir}/sock" + + ("${PIDFD_KILL}" --pid-file "${dir}/pid" --signal "${signal}" "${PIDFD_SOCKET}" &) & + + # ensure socket is ready + retry 10 1 stat "${PIDFD_SOCKET}" +} + +# teardown_pidfd_kill cleanups all the resources related to pidfd-kill. +function teardown_pidfd_kill() { + [ ! -v ROOT ] && return 0 + + local dir="${ROOT}/pidfd" + + if [ -f "${dir}/pid" ]; then + kill -9 "$(cat "${dir}/pid")" + fi + + rm -rf "${dir}" +} + +# pidfd_kill sends the signal to init process. +function pidfd_kill() { + [ ! -v ROOT ] && return 0 + + local dir="${ROOT}/pidfd" + + if [ -f "${dir}/pid" ]; then + kill "$(cat "${dir}/pid")" + fi +} diff --git a/tests/integration/pidfd-socket.bats b/tests/integration/pidfd-socket.bats new file mode 100644 index 00000000000..9c10f26e1df --- /dev/null +++ b/tests/integration/pidfd-socket.bats @@ -0,0 +1,99 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + requires root + requires_kernel 5.3 + setup_busybox + update_config '.process.args = ["/bin/sleep", "1d"]' +} + +function teardown() { + teardown_pidfd_kill + teardown_bundle +} + +@test "runc create [ --pidfd-socket ] " { + setup_pidfd_kill "SIGTERM" + + runc create --console-socket "$CONSOLE_SOCKET" --pidfd-socket "${PIDFD_SOCKET}" test_pidfd + [ "$status" -eq 0 ] + testcontainer test_pidfd created + + pidfd_kill + wait_for_container 10 1 test_pidfd stopped +} + +@test "runc run [ --pidfd-socket ] " { + setup_pidfd_kill "SIGKILL" + + runc run -d --console-socket "$CONSOLE_SOCKET" --pidfd-socket "${PIDFD_SOCKET}" test_pidfd + [ "$status" -eq 0 ] + testcontainer test_pidfd running + + pidfd_kill + wait_for_container 10 1 test_pidfd stopped +} + +@test "runc exec [ --pidfd-socket ] [cgroups_v1] " { + requires cgroups_v1 + + set_cgroups_path + + runc run -d --console-socket "$CONSOLE_SOCKET" test_pidfd + [ "$status" -eq 0 ] + testcontainer test_pidfd running + + # Use sub-cgroup to ensure that exec process has been killed + test_pidfd_cgroup_path=$(get_cgroup_path "pids") + mkdir "${test_pidfd_cgroup_path}/exec_pidfd" + [ "$status" -eq 0 ] + + setup_pidfd_kill "SIGKILL" + + __runc exec -d --cgroup "pids:exec_pidfd" --pid-file "exec_pid.txt" --pidfd-socket "${PIDFD_SOCKET}" test_pidfd sleep 1d + [ "$status" -eq 0 ] + + exec_pid=$(cat exec_pid.txt) + exec_pid_in_cgroup=$(cat "${test_pidfd_cgroup_path}/exec_pidfd/cgroup.procs") + [ "${exec_pid}" -eq "${exec_pid_in_cgroup}" ] + + pidfd_kill + + # ensure exec process has been reaped + retry 10 1 rmdir "${test_pidfd_cgroup_path}/exec_pidfd" + + testcontainer test_pidfd running +} + +@test "runc exec [ --pidfd-socket ] [cgroups_v2] " { + requires cgroups_v2 + + set_cgroups_path + + runc run -d --console-socket "$CONSOLE_SOCKET" test_pidfd + [ "$status" -eq 0 ] + testcontainer test_pidfd running + + # Use sub-cgroup to ensure that exec process has been killed + test_pidfd_cgroup_path=$(get_cgroup_path "pids") + mkdir "${test_pidfd_cgroup_path}/exec_pidfd" + [ "$status" -eq 0 ] + + setup_pidfd_kill "SIGKILL" + + __runc exec -d --cgroup "exec_pidfd" --pid-file "exec_pid.txt" --pidfd-socket "${PIDFD_SOCKET}" test_pidfd sleep 1d + [ "$status" -eq 0 ] + + exec_pid=$(cat exec_pid.txt) + exec_pid_in_cgroup=$(cat "${test_pidfd_cgroup_path}/exec_pidfd/cgroup.procs") + [ "${exec_pid}" -eq "${exec_pid_in_cgroup}" ] + + pidfd_kill + + # ensure exec process has been reaped + retry 10 1 rmdir "${test_pidfd_cgroup_path}/exec_pidfd" + + testcontainer test_pidfd running +} diff --git a/utils_linux.go b/utils_linux.go index b5c855cdbb9..c7d9794c86e 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -18,6 +18,7 @@ import ( "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/specconv" + "github.com/opencontainers/runc/libcontainer/system/kernelversion" "github.com/opencontainers/runc/libcontainer/utils" ) @@ -199,6 +200,7 @@ type runner struct { preserveFDs int pidFile string consoleSocket string + pidfdSocket string container *libcontainer.Container action CtAct notifySocket *notifySocket @@ -255,6 +257,14 @@ func (r *runner) run(config *specs.Process) (int, error) { } defer tty.Close() + if r.pidfdSocket != "" { + connClose, err := setupPidfdSocket(process, r.pidfdSocket) + if err != nil { + return -1, err + } + defer connClose() + } + switch r.action { case CT_ACT_CREATE: err = r.container.Start(process) @@ -390,6 +400,7 @@ func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.C listenFDs: listenFDs, notifySocket: notifySocket, consoleSocket: context.String("console-socket"), + pidfdSocket: context.String("pidfd-socket"), detach: context.Bool("detach"), pidFile: context.String("pid-file"), preserveFDs: context.Int("preserve-fds"), @@ -399,3 +410,36 @@ func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.C } return r.run(spec.Process) } + +func setupPidfdSocket(process *libcontainer.Process, sockpath string) (_clean func(), _ error) { + linux530 := kernelversion.KernelVersion{Kernel: 5, Major: 3} + ok, err := kernelversion.GreaterEqualThan(linux530) + if err != nil { + return nil, err + } + if !ok { + return nil, fmt.Errorf("--pidfd-socket requires >= v5.3 kernel") + } + + conn, err := net.Dial("unix", sockpath) + if err != nil { + return nil, fmt.Errorf("failed to dail %s: %w", sockpath, err) + } + + uc, ok := conn.(*net.UnixConn) + if !ok { + conn.Close() + return nil, errors.New("failed to cast to UnixConn") + } + + socket, err := uc.File() + if err != nil { + conn.Close() + return nil, fmt.Errorf("failed to dup socket: %w", err) + } + + process.PidfdSocket = socket + return func() { + conn.Close() + }, nil +}