Skip to content

Commit

Permalink
use /proc/self/exe directly to start init
Browse files Browse the repository at this point in the history
Signed-off-by: lifubang <[email protected]>
  • Loading branch information
lifubang committed Jan 21, 2024
1 parent 0ff15c5 commit 124867d
Show file tree
Hide file tree
Showing 6 changed files with 10 additions and 165 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ require (
github.com/moby/sys/mountinfo v0.7.1
github.com/moby/sys/user v0.1.0
github.com/mrunalp/fileutils v0.5.1
github.com/opencontainers-sec/go-containersec v0.0.1
github.com/opencontainers-sec/go-containersec v0.0.2
github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4
github.com/opencontainers/selinux v1.11.0
github.com/seccomp/libseccomp-golang v0.10.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ github.com/moby/sys/user v0.1.0 h1:WmZ93f5Ux6het5iituh9x2zAG7NFY9Aqi49jjE1PaQg=
github.com/moby/sys/user v0.1.0/go.mod h1:fKJhFOnsCN6xZ5gSfbM6zaHGgDJMrqt9/reuj4T7MmU=
github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q=
github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
github.com/opencontainers-sec/go-containersec v0.0.1 h1:+4ov9mAgONY6w/of5x3eRFLu/5TtfFUh/SXeofqIM8M=
github.com/opencontainers-sec/go-containersec v0.0.1/go.mod h1:8tU3XOqpsj1/WwjTsJa78OkCuJpQ9VrjXkgITmqzeUw=
github.com/opencontainers-sec/go-containersec v0.0.2 h1:E37DR3CH9VWRJhr4+0VZbjdMQTR3371ijJiUGpQVOOM=
github.com/opencontainers-sec/go-containersec v0.0.2/go.mod h1:8tU3XOqpsj1/WwjTsJa78OkCuJpQ9VrjXkgITmqzeUw=
github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 h1:EctkgBjZ1y4q+sibyuuIgiKpa0QSd2elFtSSdNvBVow=
github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/selinux v1.11.0 h1:+5Zbo97w3Lbmb3PeqQtpmTkMwsW5nRI3YaLpt7tQ7oU=
Expand Down
126 changes: 1 addition & 125 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,8 @@ import (

"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/dmz"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/system/kernelversion"
"github.com/opencontainers/runc/libcontainer/utils"
)

Expand Down Expand Up @@ -443,117 +441,13 @@ func (c *Container) includeExecFifo(cmd *exec.Cmd) error {
return nil
}

// slicesContains reports whether needle is equal to at least one element of
// slice. A nil or empty slice never contains anything.
//
// No longer needed in Go 1.21.
func slicesContains[S ~[]E, E comparable](slice S, needle E) bool {
	for idx := 0; idx < len(slice); idx++ {
		if slice[idx] == needle {
			return true
		}
	}
	return false
}

// isDmzBinarySafe reports whether the container configuration c allows
// runc-dmz to be used. It returns false when dmz.WorksWithSELinux rejects the
// configuration, and otherwise returns true only if the container cannot hold
// CAP_SYS_PTRACE or if it runs in a user namespace on Linux 4.10 or later.
func isDmzBinarySafe(c *configs.Config) bool {
	if !dmz.WorksWithSELinux(c) {
		return false
	}

	// Because we set the dumpable flag in nsexec, the only time when it is
	// unsafe to use runc-dmz is when the container process would be able to
	// race against "runc init" and bypass the ptrace_may_access() checks.
	//
	// This is only the case if the container processes could have
	// CAP_SYS_PTRACE somehow (i.e. the capability is present in the bounding,
	// inheritable, or ambient sets). Luckily, most containers do not have this
	// capability.
	caps := c.Capabilities
	if caps == nil {
		return true
	}
	const ptraceCap = "CAP_SYS_PTRACE"
	mayPtrace := slicesContains(caps.Bounding, ptraceCap) ||
		slicesContains(caps.Inheritable, ptraceCap) ||
		slicesContains(caps.Ambient, ptraceCap)
	if !mayPtrace {
		return true
	}

	// Since Linux 4.10 (see bfedb589252c0) user namespaced containers cannot
	// access /proc/$pid/exe of runc after it joins the namespace (until it
	// does an exec), regardless of the capability set. This has been
	// backported to other distribution kernels, but there's no way of checking
	// this cheaply -- better to be safe than sorry here.
	minKernel := kernelversion.KernelVersion{Kernel: 4, Major: 10}
	atLeast410, err := kernelversion.GreaterEqualThan(minKernel)
	if atLeast410 && err == nil && c.Namespaces.Contains(configs.NEWUSER) {
		return true
	}

	// Assume it's unsafe otherwise.
	return false
}

func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
comm, err := newProcessComm()
if err != nil {
return nil, err
}

// Make sure we use a new safe copy of /proc/self/exe or the runc-dmz
// binary each time this is called, to make sure that if a container
// manages to overwrite the file it cannot affect other containers on the
// system. For runc, this code will only ever be called once, but
// libcontainer users might call this more than once.
p.closeClonedExes()
var (
exePath string
// only one of dmzExe or safeExe are used at a time
dmzExe, safeExe *os.File
)
if dmz.IsSelfExeCloned() {
// /proc/self/exe is already a cloned binary -- no need to do anything
logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
// We don't need to use /proc/thread-self here because the exe mm of a
// thread-group is guaranteed to be the same for all threads by
// definition. This lets us avoid having to do runtime.LockOSThread.
exePath = "/proc/self/exe"
} else {
var err error
if isDmzBinarySafe(c.config) {
dmzExe, err = dmz.Binary(c.stateDir)
if err == nil {
// We can use our own executable without cloning if we are
// using runc-dmz. We don't need to use /proc/thread-self here
// because the exe mm of a thread-group is guaranteed to be the
// same for all threads by definition. This lets us avoid
// having to do runtime.LockOSThread.
exePath = "/proc/self/exe"
p.clonedExes = append(p.clonedExes, dmzExe)
logrus.Debug("runc-dmz: using runc-dmz") // used for tests
} else if errors.Is(err, dmz.ErrNoDmzBinary) {
logrus.Debug("runc-dmz binary not embedded in runc binary, falling back to /proc/self/exe clone")
} else if err != nil {
return nil, fmt.Errorf("failed to create runc-dmz binary clone: %w", err)
}
} else {
// If the configuration makes it unsafe to use runc-dmz, pretend we
// don't have it embedded so we do /proc/self/exe cloning.
logrus.Debug("container configuration unsafe for runc-dmz, falling back to /proc/self/exe clone")
err = dmz.ErrNoDmzBinary
}
if errors.Is(err, dmz.ErrNoDmzBinary) {
safeExe, err = dmz.CloneSelfExe(c.stateDir)
if err != nil {
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
}
exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
p.clonedExes = append(p.clonedExes, safeExe)
logrus.Debug("runc-dmz: using /proc/self/exe clone") // used for tests
}
// Just to make sure we don't run without protection.
if dmzExe == nil && safeExe == nil {
// This should never happen.
return nil, fmt.Errorf("[internal error] attempted to spawn a container with no /proc/self/exe protection")
}
}

cmd := exec.Command(exePath, "init")
cmd := exec.Command("/proc/self/exe", "init")
cmd.Args[0] = os.Args[0]
cmd.Stdin = p.Stdin
cmd.Stdout = p.Stdout
Expand All @@ -580,12 +474,6 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
"_LIBCONTAINER_SYNCPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)

if dmzExe != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, dmzExe)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_DMZEXEFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
}

cmd.ExtraFiles = append(cmd.ExtraFiles, comm.logPipeChild)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
Expand All @@ -600,18 +488,6 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
)
}

if safeExe != nil {
// Due to a Go stdlib bug, we need to add safeExe to the set of
// ExtraFiles otherwise it is possible for the stdlib to clobber the fd
// during forkAndExecInChild1 and replace it with some other file that
// might be malicious. This is less than ideal (because the descriptor
// will be non-O_CLOEXEC) however we have protections in "runc init" to
// stop us from leaking extra file descriptors.
//
// See <https://github.com/golang/go/issues/61751>.
cmd.ExtraFiles = append(cmd.ExtraFiles, safeExe)
}

// NOTE: when running a container with no PID namespace and the parent
// process spawning the container is PID1 the pdeathsig is being
// delivered to the container's init process by the kernel for some
Expand Down
34 changes: 0 additions & 34 deletions tests/integration/run.bats
Original file line number Diff line number Diff line change
Expand Up @@ -127,40 +127,6 @@ function teardown() {
[ "${lines[0]}" = "410" ]
}

@test "runc run [runc-dmz]" {
# Run the test_hello container with debug logging enabled so that the
# binary-selection marker is included in the captured output.
runc --debug run test_hello
# The run must exit successfully and print the container's greeting.
[ "$status" -eq 0 ]
[[ "$output" = *"Hello World"* ]]
# We use runc-dmz if we can.
[[ "$output" = *"runc-dmz: using runc-dmz"* ]]
}

@test "runc run [cap_sys_ptrace -> /proc/self/exe clone]" {
# Checks which debug marker is logged when the container configuration
# includes CAP_SYS_PTRACE in its bounding capability set.
#
# Add CAP_SYS_PTRACE to the bounding set, the minimum needed to indicate a
# container process _could_ get CAP_SYS_PTRACE.
update_config '.process.capabilities.bounding += ["CAP_SYS_PTRACE"]'

runc --debug run test_hello
# Regardless of which binary is chosen, the run itself must succeed and
# print the container's greeting.
[ "$status" -eq 0 ]
[[ "$output" = *"Hello World"* ]]
if [ "$EUID" -ne 0 ] && is_kernel_gte 4.10; then
# For Linux 4.10 and later, rootless containers will use runc-dmz
# because they are running in a user namespace. See isDmzBinarySafe().
[[ "$output" = *"runc-dmz: using runc-dmz"* ]]
else
# If the container has CAP_SYS_PTRACE and is not rootless, we use
# /proc/self/exe cloning.
[[ "$output" = *"runc-dmz: using /proc/self/exe clone"* ]]
fi
}

@test "RUNC_DMZ=legacy runc run [/proc/self/exe clone]" {
# Run test_hello with RUNC_DMZ=legacy set in the environment of this single
# runc invocation, with debug logging enabled to capture the marker.
RUNC_DMZ=legacy runc --debug run test_hello
# The run must succeed and print the container's greeting.
[ "$status" -eq 0 ]
[[ "$output" = *"Hello World"* ]]
# With RUNC_DMZ=legacy the debug output is expected to report the
# /proc/self/exe clone rather than runc-dmz.
[[ "$output" = *"runc-dmz: using /proc/self/exe clone"* ]]
}

@test "runc run [joining existing container namespaces]" {
requires timens

Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion vendor/modules.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ github.com/moby/sys/user
# github.com/mrunalp/fileutils v0.5.1
## explicit; go 1.13
github.com/mrunalp/fileutils
# github.com/opencontainers-sec/go-containersec v0.0.1
# github.com/opencontainers-sec/go-containersec v0.0.2
## explicit; go 1.21
github.com/opencontainers-sec/go-containersec/execve
github.com/opencontainers-sec/go-containersec/execve/system
Expand Down

0 comments on commit 124867d

Please sign in to comment.