Skip to content

Commit

Permalink
*: introduce pidfd-socket flag
Browse files Browse the repository at this point in the history
A container manager such as containerd-shim can't use the cgroup.kill feature
or freeze all the processes in the cgroup to terminate the exec init process.
It's also unsafe to call kill(2), since the pid can be recycled. Instead, runc
can provide the pidfd of the init process through the pidfd-socket, which works
similarly to the console-socket. With the pidfd, a container manager such as
containerd-shim can send a signal to the target process safely.

And for the standard init process, the pidfd lets us poll for the exit event
instead of blocking on wait4.

Signed-off-by: Wei Fu <[email protected]>
  • Loading branch information
fuweid committed Oct 4, 2023
1 parent d8d576c commit 0117ed9
Show file tree
Hide file tree
Showing 9 changed files with 99 additions and 2 deletions.
4 changes: 4 additions & 0 deletions create.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ command(s) that get executed on start, edit the args parameter of the spec. See
Value: "",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
},
cli.StringFlag{
Name: "pidfd-socket",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the init process",
},
cli.StringFlag{
Name: "pid-file",
Value: "",
Expand Down
5 changes: 5 additions & 0 deletions exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ following will output a list of processes running in the container:
Name: "console-socket",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
},
cli.StringFlag{
Name: "pidfd-socket",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the init process",
},
cli.StringFlag{
Name: "cwd",
Usage: "current working directory in the container",
Expand Down Expand Up @@ -181,6 +185,7 @@ func execProcess(context *cli.Context) (int, error) {
shouldDestroy: false,
container: container,
consoleSocket: context.String("console-socket"),
pidfdSocket: context.String("pidfd-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
action: CT_ACT_RUN,
Expand Down
7 changes: 7 additions & 0 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,13 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel)
}

if p.PidfdSocket != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, p.PidfdSocket)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_PIDFD_SOCK="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)
}

if safeExe != nil {
// Due to a Go stdlib bug, we need to add safeExe to the set of
// ExtraFiles otherwise it is possible for the stdlib to clobber the fd
Expand Down
33 changes: 31 additions & 2 deletions libcontainer/init_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,16 @@ func startInitialization() (retErr error) {
defer consoleSocket.Close()
}

var pidfdSocket *os.File
if envConsole := os.Getenv("_LIBCONTAINER_PIDFD_SOCK"); envConsole != "" {
console, err := strconv.Atoi(envConsole)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_PIDFD_SOCK: %w", err)
}
pidfdSocket = os.NewFile(uintptr(console), "pidfd-socket")
defer pidfdSocket.Close()
}

// Get mount files (O_PATH).
mountSrcFds, err := parseFdsFromEnv("_LIBCONTAINER_MOUNT_FDS")
if err != nil {
Expand Down Expand Up @@ -222,10 +232,10 @@ func startInitialization() (retErr error) {
}

// If init succeeds, it will not return, hence none of the defers will be called.
return containerInit(it, &config, syncPipe, consoleSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
}

func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error {
func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error {
if err := populateProcessEnvironment(config.Env); err != nil {
return err
}
Expand All @@ -240,6 +250,7 @@ func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSock
i := &linuxSetnsInit{
pipe: pipe,
consoleSocket: consoleSocket,
pidfdSocket: pidfdSocket,
config: config,
logFd: logFd,
dmzExe: dmzExe,
Expand All @@ -249,6 +260,7 @@ func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSock
i := &linuxStandardInit{
pipe: pipe,
consoleSocket: consoleSocket,
pidfdSocket: pidfdSocket,
parentPid: unix.Getppid(),
config: config,
fifoFd: fifoFd,
Expand Down Expand Up @@ -676,3 +688,20 @@ func signalAllProcesses(m cgroups.Manager, s unix.Signal) error {

return nil
}

// setupPidfd obtains a pidfd for the calling process (os.Getpid) and hands it
// to utils.SendRawFd together with socket and initType.
//
// The socket is always closed before returning. The locally opened pidfd is
// closed on every path as well; on success, the result of that close is what
// the function returns.
func setupPidfd(socket *os.File, initType string) error {
	defer socket.Close()

	fd, openErr := unix.PidfdOpen(os.Getpid(), 0)
	if openErr != nil {
		return fmt.Errorf("failed to pidfd_open: %w", openErr)
	}

	sendErr := utils.SendRawFd(socket, initType, uintptr(fd))
	if sendErr == nil {
		// Our copy of the descriptor is no longer needed once it has been sent.
		return unix.Close(fd)
	}

	// Best-effort close; the send failure is the error worth reporting.
	unix.Close(fd)
	return fmt.Errorf("failed to send pidfd on socket: %w", sendErr)
}
3 changes: 3 additions & 0 deletions libcontainer/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ type Process struct {
// ConsoleSocket provides the masterfd console.
ConsoleSocket *os.File

// PidfdSocket provides the process file descriptor (pidfd) of the process itself.
PidfdSocket *os.File

// Init specifies whether the process is the first process in the container.
Init bool

Expand Down
6 changes: 6 additions & 0 deletions libcontainer/setns_init_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
type linuxSetnsInit struct {
pipe *syncSocket
consoleSocket *os.File
pidfdSocket *os.File
config *initConfig
logFd int
dmzExe *os.File
Expand Down Expand Up @@ -56,6 +57,11 @@ func (l *linuxSetnsInit) Init() error {
return err
}
}
if l.pidfdSocket != nil {
if err := setupPidfd(l.pidfdSocket, "setns"); err != nil {
return fmt.Errorf("failed to setup pidfd: %w", err)
}
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
Expand Down
7 changes: 7 additions & 0 deletions libcontainer/standard_init_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
type linuxStandardInit struct {
pipe *syncSocket
consoleSocket *os.File
pidfdSocket *os.File
parentPid int
fifoFd int
logFd int
Expand Down Expand Up @@ -114,6 +115,12 @@ func (l *linuxStandardInit) Init() error {
}
}

if l.pidfdSocket != nil {
if err := setupPidfd(l.pidfdSocket, "standard"); err != nil {
return fmt.Errorf("failed to setup pidfd: %w", err)
}
}

// Finish the rootfs setup.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := finalizeRootfs(l.config.Config); err != nil {
Expand Down
4 changes: 4 additions & 0 deletions run.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ command(s) that get executed on start, edit the args parameter of the spec. See
Value: "",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
},
cli.StringFlag{
Name: "pidfd-socket",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the init process",
},
cli.BoolFlag{
Name: "detach, d",
Usage: "detach from the container's process",
Expand Down
32 changes: 32 additions & 0 deletions utils_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ type runner struct {
preserveFDs int
pidFile string
consoleSocket string
pidfdSocket string
container *libcontainer.Container
action CtAct
notifySocket *notifySocket
Expand Down Expand Up @@ -250,6 +251,14 @@ func (r *runner) run(config *specs.Process) (int, error) {
}
defer tty.Close()

if r.pidfdSocket != "" {
connClose, err := setupPidfdSocket(process, r.pidfdSocket)
if err != nil {
return -1, err
}
defer connClose()
}

switch r.action {
case CT_ACT_CREATE:
err = r.container.Start(process)
Expand Down Expand Up @@ -385,6 +394,7 @@ func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.C
listenFDs: listenFDs,
notifySocket: notifySocket,
consoleSocket: context.String("console-socket"),
pidfdSocket: context.String("pidfd-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
preserveFDs: context.Int("preserve-fds"),
Expand All @@ -394,3 +404,25 @@ func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.C
}
return r.run(spec.Process)
}

// setupPidfdSocket dials the AF_UNIX socket at sockpath and stores a
// duplicated *os.File for that connection in process.PidfdSocket.
//
// On success it returns a cleanup function that closes the dialed connection;
// the caller should invoke it when the connection is no longer needed.
// On failure it returns a nil cleanup function and an error, closes any
// connection it already opened, and leaves process unmodified.
func setupPidfdSocket(process *libcontainer.Process, sockpath string) (_clean func(), _ error) {
	conn, err := net.Dial("unix", sockpath)
	if err != nil {
		return nil, fmt.Errorf("failed to dial %s: %w", sockpath, err)
	}

	uc, ok := conn.(*net.UnixConn)
	if !ok {
		// Don't leak the connection when bailing out.
		conn.Close()
		return nil, errors.New("failed to cast to UnixConn")
	}

	// File returns a dup of the underlying descriptor; closing conn later does
	// not close the returned file.
	socket, err := uc.File()
	if err != nil {
		// Don't leak the connection when bailing out.
		conn.Close()
		return nil, fmt.Errorf("failed to dup socket: %w", err)
	}

	// NOTE(review): the duplicated file is not closed by the cleanup function;
	// presumably whoever consumes process.PidfdSocket owns it — confirm.
	process.PidfdSocket = socket
	return func() {
		conn.Close()
	}, nil
}

0 comments on commit 0117ed9

Please sign in to comment.