Skip to content

Commit 284ba30

Browse files
committed
init: close internal fds before execve
If we leak a file descriptor referencing the host filesystem, an attacker could use a /proc/self/fd magic-link as the source for execve to execute a host binary in the container. This would allow the binary itself (or a process inside the container in the 'runc exec' case) to write to a host binary, leading to a container escape. The simple solution is to make sure we close all file descriptors immediately before the execve(2) step. Doing this earlier can lead to very serious issues in Go (as file descriptors can be reused, any (*os.File) reference could start silently operating on a different file) so we have to do it as late as possible. Unfortunately, there are some Go runtime file descriptors that we must not close (otherwise the Go scheduler panics randomly). The only way of being sure which file descriptors cannot be closed is to sneakily go:linkname the runtime internal "internal/poll.IsPollDescriptor" function. This is almost certainly not recommended but there isn't any other way to be absolutely sure, while also closing any other possible files. In addition, we can keep the logrus forwarding logfd open because you cannot execve a pipe and the contents of the pipe are so restricted (JSON-encoded in a format we pick) that it seems unlikely you could even construct shellcode. Closing the logfd causes issues if there is an error returned from execve. In mainline runc, runc-dmz protects us against this attack because the intermediate execve(2) closes all of the O_CLOEXEC internal runc file descriptors and thus runc-dmz cannot access them to attack the host. Fixes: GHSA-xr7r-f8xq-vfvv CVE-2024-21626 Signed-off-by: Aleksa Sarai <[email protected]>
1 parent fbe3eed commit 284ba30

File tree

4 files changed

+111
-8
lines changed

4 files changed

+111
-8
lines changed

libcontainer/logs/logs.go

+9
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,19 @@ import (
44
"bufio"
55
"encoding/json"
66
"io"
7+
"os"
78

89
"github.com/sirupsen/logrus"
910
)
1011

12+
// IsLogrusFd returns whether the provided fd matches the one that logrus is
13+
// currently outputting to. This should only ever be called by UnsafeCloseFrom
14+
// from `runc init`.
15+
func IsLogrusFd(fd uintptr) bool {
16+
file, ok := logrus.StandardLogger().Out.(*os.File)
17+
return ok && file.Fd() == fd
18+
}
19+
1120
func ForwardLogs(logPipe io.ReadCloser) chan error {
1221
done := make(chan error, 1)
1322
s := bufio.NewScanner(logPipe)

libcontainer/setns_init_linux.go

+19
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"github.com/opencontainers/runc/libcontainer/keys"
1616
"github.com/opencontainers/runc/libcontainer/seccomp"
1717
"github.com/opencontainers/runc/libcontainer/system"
18+
"github.com/opencontainers/runc/libcontainer/utils"
1819
)
1920

2021
// linuxSetnsInit performs the container's initialization for running a new process
@@ -117,5 +118,23 @@ func (l *linuxSetnsInit) Init() error {
117118
return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
118119
}
119120

121+
// Close all file descriptors we are not passing to the container. This is
122+
// necessary because the execve target could use internal runc fds as the
123+
// execve path, potentially giving access to binary files from the host
124+
// (which can then be opened by container processes, leading to container
125+
// escapes). Note that because this operation will close any open file
126+
// descriptors that are referenced by (*os.File) handles from underneath
127+
// the Go runtime, we must not do any file operations after this point
128+
// (otherwise the (*os.File) finaliser could close the wrong file). See
129+
// CVE-2024-21626 for more information as to why this protection is
130+
// necessary.
131+
//
132+
// This is not needed for runc-dmz, because the extra execve(2) step means
133+
// that all O_CLOEXEC file descriptors have already been closed and thus
134+
// the second execve(2) from runc-dmz cannot access internal file
135+
// descriptors from runc.
136+
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
137+
return err
138+
}
120139
return system.Exec(name, l.config.Args[0:], os.Environ())
121140
}

libcontainer/standard_init_linux.go

+19
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"github.com/opencontainers/runc/libcontainer/keys"
1818
"github.com/opencontainers/runc/libcontainer/seccomp"
1919
"github.com/opencontainers/runc/libcontainer/system"
20+
"github.com/opencontainers/runc/libcontainer/utils"
2021
)
2122

2223
type linuxStandardInit struct {
@@ -258,5 +259,23 @@ func (l *linuxStandardInit) Init() error {
258259
return err
259260
}
260261

262+
// Close all file descriptors we are not passing to the container. This is
263+
// necessary because the execve target could use internal runc fds as the
264+
// execve path, potentially giving access to binary files from the host
265+
// (which can then be opened by container processes, leading to container
266+
// escapes). Note that because this operation will close any open file
267+
// descriptors that are referenced by (*os.File) handles from underneath
268+
// the Go runtime, we must not do any file operations after this point
269+
// (otherwise the (*os.File) finaliser could close the wrong file). See
270+
// CVE-2024-21626 for more information as to why this protection is
271+
// necessary.
272+
//
273+
// This is not needed for runc-dmz, because the extra execve(2) step means
274+
// that all O_CLOEXEC file descriptors have already been closed and thus
275+
// the second execve(2) from runc-dmz cannot access internal file
276+
// descriptors from runc.
277+
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
278+
return err
279+
}
261280
return system.Exec(name, l.config.Args[0:], os.Environ())
262281
}

libcontainer/utils/utils_unix.go

+64-8
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,11 @@ import (
77
"fmt"
88
"os"
99
"strconv"
10+
_ "unsafe" // for go:linkname
1011

1112
"golang.org/x/sys/unix"
13+
14+
"github.com/opencontainers/runc/libcontainer/logs"
1215
)
1316

1417
// EnsureProcHandle returns whether or not the given file handle is on procfs.
@@ -23,9 +26,11 @@ func EnsureProcHandle(fh *os.File) error {
2326
return nil
2427
}
2528

26-
// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for
27-
// the process (except for those below the given fd value).
28-
func CloseExecFrom(minFd int) error {
29+
type fdFunc func(fd int)
30+
31+
// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
32+
// the current process.
33+
func fdRangeFrom(minFd int, fn fdFunc) error {
2934
fdDir, err := os.Open("/proc/self/fd")
3035
if err != nil {
3136
return err
@@ -50,15 +55,66 @@ func CloseExecFrom(minFd int) error {
5055
if fd < minFd {
5156
continue
5257
}
53-
// Intentionally ignore errors from unix.CloseOnExec -- the cases where
54-
// this might fail are basically file descriptors that have already
55-
// been closed (including and especially the one that was created when
56-
// os.ReadDir did the "opendir" syscall).
57-
unix.CloseOnExec(fd)
58+
// Ignore the file descriptor we used for readdir, as it will be closed
59+
// when we return.
60+
if uintptr(fd) == fdDir.Fd() {
61+
continue
62+
}
63+
// Run the closure.
64+
fn(fd)
5865
}
5966
return nil
6067
}
6168

69+
// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
70+
// equal to minFd in the current process.
71+
func CloseExecFrom(minFd int) error {
72+
return fdRangeFrom(minFd, unix.CloseOnExec)
73+
}
74+
75+
//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
76+
77+
// In order to make sure we do not close the internal epoll descriptors the Go
78+
// runtime uses, we need to ensure that we skip descriptors that match
79+
// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
80+
// unfortunately there's no other way to be sure we're only keeping the file
81+
// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
82+
func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
83+
84+
// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the
85+
// current process, except for those critical to Go's runtime (such as the
86+
// netpoll management descriptors).
87+
//
88+
// NOTE: That this function is incredibly dangerous to use in most Go code, as
89+
// closing file descriptors from underneath *os.File handles can lead to very
90+
// bad behaviour (the closed file descriptor can be re-used and then any
91+
// *os.File operations would apply to the wrong file). This function is only
92+
// intended to be called from the last stage of runc init.
93+
func UnsafeCloseFrom(minFd int) error {
94+
// We must not close some file descriptors.
95+
return fdRangeFrom(minFd, func(fd int) {
96+
if runtime_IsPollDescriptor(uintptr(fd)) {
97+
// These are the Go runtimes internal netpoll file descriptors.
98+
// These file descriptors are operated on deep in the Go scheduler,
99+
// and closing those files from underneath Go can result in panics.
100+
// There is no issue with keeping them because they are not
101+
// executable and are not useful to an attacker anyway. Also we
102+
// don't have any choice.
103+
return
104+
}
105+
if logs.IsLogrusFd(uintptr(fd)) {
106+
// Do not close the logrus output fd. We cannot exec a pipe, and
107+
// the contents are quite limited (very little attacker control,
108+
// JSON-encoded) making shellcode attacks unlikely.
109+
return
110+
}
111+
// There's nothing we can do about errors from close(2), and the
112+
// only likely error to be seen is EBADF which indicates the fd was
113+
// already closed (in which case, we got what we wanted).
114+
_ = unix.Close(fd)
115+
})
116+
}
117+
62118
// NewSockPair returns a new unix socket pair
63119
func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
64120
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)

0 commit comments

Comments
 (0)