Skip to content

Commit b999376

Browse files
committed
nsenter: cloned_binary: remove bindfd logic entirely
While the ro-bind-mount trick did eliminate the memory overhead of copying the runc binary for each "runc init" invocation, on machines with very significant container churn, creating a temporary mount namespace on every container invocation can trigger severe lock contention on namespace_sem that makes containers fail to spawn. The only reason we added bindfd in commit 16612d7 ("nsenter: cloned_binary: try to ro-bind /proc/self/exe before copying") was due to a Kubernetes e2e test failure where they had a ridiculously small memory limit. It seems incredibly unlikely that real workloads are running without 10MB to spare for the very short time that runc is interacting with the container. In addition, since the original cloned_binary implementation, cgroupv2 is now almost universally used on modern systems. Unlike cgroupv1, the cgroupv2 memcg implementation does not migrate memory usage when processes change cgroups (even cgroupv1 only did this if you had memory.move_charge_at_immigrate enabled). In addition, because we do the /proc/self/exe clone before synchronising the bootstrap data read, we are guaranteed to do the clone before "runc init" is moved into the container cgroup -- meaning that the memory used by the /proc/self/exe clone is charged against the root cgroup, and thus container workloads should not be affected at all with memfd cloning. The long-term fix for this problem is to block the /proc/self/exe re-opening attack entirely in-kernel, which is something I'm working on[1]. Though it should also be noted that because the memfd is completely separate to the host binary, even attacks like Dirty COW against the runc binary can be defended against with the memfd approach. Of course, once we have in-kernel protection against the /proc/self/exe re-opening attack, we won't have that protection anymore... [1]: https://lwn.net/Articles/934460/ Signed-off-by: Aleksa Sarai <[email protected]>
1 parent 23e41ef commit b999376

1 file changed

Lines changed: 0 additions & 205 deletions

File tree

libcontainer/nsenter/cloned_binary.c

Lines changed: 0 additions & 205 deletions
Original file line numberDiff line numberDiff line change
@@ -400,199 +400,6 @@ static int seal_execfd(int *fd, int fdtype)
400400
return -1;
401401
}
402402

403-
struct bindfd_child_args {
404-
int sockfd;
405-
const char *mount_target;
406-
};
407-
408-
static int bindfd_in_subprocess(void *arg)
409-
{
410-
/*
411-
* In the interests of efficiency (read: minimizing the syscall count)
412-
* and conciseness, no attempt is made to release resources which would
413-
* be cleaned up automatically on process exit, i.e. when this function
414-
* returns. This includes filesystem mounts, as this function is
415-
* executed in a dedicated mount namespace.
416-
*/
417-
418-
/*
419-
* For obvious reasons this won't work in rootless mode because we
420-
* haven't created a userns -- but getting that to work will be a bit
421-
* complicated and it's only worth doing if someone actually needs it.
422-
*/
423-
if (mount("none", "/", NULL, MS_SLAVE | MS_REC, NULL) < 0)
424-
return errno;
425-
/*
426-
* The kernel resolves the magic symlink /proc/self/exe to the real file
427-
* _in the original mount namespace_. Cross-namespace bind mounts are
428-
* not allowed, so we must locate the file inside the current mount
429-
* namespace to be able to bind-mount it. (The mount(8) command resolves
430-
* symlinks, which is why it appears to work at first glance.)
431-
*/
432-
char linkbuf[PATH_MAX + 1] = { 0 };
433-
ssize_t linkpathlen = readlink("/proc/self/exe", linkbuf, sizeof(linkbuf));
434-
if (linkpathlen < 0)
435-
return errno;
436-
if (linkpathlen == sizeof(linkbuf)) {
437-
/*
438-
* The link path is longer than PATH_MAX, and the contents of
439-
* linkbuf might have been truncated. A truncated path could
440-
* happen to be a valid path to a different file, which could
441-
* allow for local privilege escalation if we were to exec it.
442-
* The mount syscall doesn't accept paths longer than PATH_MAX,
443-
* anyway.
444-
*/
445-
return ENAMETOOLONG;
446-
}
447-
448-
int srcfd = open(linkbuf, O_PATH | O_CLOEXEC);
449-
if (srcfd < 0)
450-
return errno;
451-
/*
452-
* linkbuf holds the path to the binary which the parent process was
453-
* launched from. Someone could have moved a different file to that path
454-
* in the interim, in which case srcfd is not the file we want to
455-
* bind-mount. Guard against this situation by verifying srcfd is the
456-
* same file as /proc/self/exe.
457-
*/
458-
struct stat realexe = { 0 };
459-
if (stat("/proc/self/exe", &realexe) < 0)
460-
return errno;
461-
struct stat resolved = { 0 };
462-
if (fstat(srcfd, &resolved) < 0)
463-
return errno;
464-
if (resolved.st_dev != realexe.st_dev || resolved.st_ino != realexe.st_ino)
465-
return ENOENT;
466-
if (snprintf(linkbuf, sizeof(linkbuf), "/proc/self/fd/%d", srcfd) == sizeof(linkbuf))
467-
return ENAMETOOLONG;
468-
469-
const struct bindfd_child_args *args = arg;
470-
if (mount(linkbuf, args->mount_target, "", MS_BIND, "") < 0)
471-
return errno;
472-
if (mount("", args->mount_target, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
473-
return errno;
474-
475-
int fd = open(args->mount_target, O_PATH | O_CLOEXEC);
476-
if (fd < 0)
477-
return errno;
478-
479-
/*
480-
* Make sure the MNT_DETACH works, otherwise we could get remounted
481-
* read-write and that would be quite bad.
482-
*/
483-
if (umount2(args->mount_target, MNT_DETACH) < 0)
484-
return errno;
485-
486-
if (send_fd(args->sockfd, fd) < 0)
487-
return errno;
488-
return 0;
489-
}
490-
491-
static int spawn_bindfd_child(const struct bindfd_child_args *args) __attribute__((noinline));
492-
static int spawn_bindfd_child(const struct bindfd_child_args *args)
493-
{
494-
/*
495-
* Carve out a chunk of our call stack for the child process to use as
496-
* we can be sure it is correctly mapped for use as stack. (Technically
497-
* only the libc clone() wrapper writes to this buffer. The child
498-
* process operates on a copy of the parent's virtual memory space and
499-
* so can safely overflow into the rest of the stack memory region
500-
* without consequence.)
501-
*/
502-
char stack[4 * 1024] __attribute__((aligned(16)));
503-
int tid = clone(bindfd_in_subprocess,
504-
/*
505-
* Assume stack grows down, as HP-PA, the only Linux
506-
* platform where stack grows up, is obsolete.
507-
*/
508-
stack + sizeof(stack),
509-
/*
510-
* Suspend the parent process until the child has exited to
511-
* save an unnecessary context switch as we'd just be
512-
* waiting for the child process to exit anyway.
513-
*/
514-
CLONE_NEWNS | CLONE_VFORK, (void *)args);
515-
if (tid < 0)
516-
return -errno;
517-
return tid;
518-
}
519-
520-
static int try_bindfd(void)
521-
{
522-
int fd, ret = -1;
523-
char template[PATH_MAX] = { 0 };
524-
char *prefix = getenv("_LIBCONTAINER_STATEDIR");
525-
526-
if (!prefix || *prefix != '/')
527-
prefix = "/tmp";
528-
if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
529-
return ret;
530-
531-
/*
532-
* We need somewhere to mount it, mounting anything over /proc/self is a
533-
* BAD idea on the host -- even if we do it temporarily.
534-
*/
535-
fd = mkstemp(template);
536-
if (fd < 0)
537-
return ret;
538-
close(fd);
539-
540-
/*
541-
* Daemons such as systemd and udisks2 watch /proc/self/mountinfo and
542-
* re-parse it on every change, which gets expensive when the mount table
543-
* is large and/or changes frequently. Perform the mount operations in a
544-
* new, private mount namespace so as not to wake up those processes
545-
* every time we nsexec into a container. We clone a child process into
546-
* a new mount namespace to do the dirty work so the side effects of
547-
* unsharing the mount namespace do not leak into the current process.
548-
*/
549-
int sock[2];
550-
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sock) < 0) {
551-
ret = -errno;
552-
goto cleanup_unlink;
553-
}
554-
555-
struct bindfd_child_args args = {
556-
.sockfd = sock[0],
557-
.mount_target = template,
558-
};
559-
int cpid = spawn_bindfd_child(&args);
560-
close(sock[0]);
561-
if (cpid < 0) {
562-
ret = cpid;
563-
goto cleanup_socketpair;
564-
}
565-
566-
int wstatus = 0;
567-
if (waitpid(cpid, &wstatus, __WCLONE) < 0)
568-
bail("error waiting for bindfd child process to exit");
569-
if (WIFEXITED(wstatus)) {
570-
if (WEXITSTATUS(wstatus)) {
571-
ret = -WEXITSTATUS(wstatus);
572-
goto cleanup_socketpair;
573-
}
574-
} else if (WIFSIGNALED(wstatus)) {
575-
int sig = WTERMSIG(wstatus);
576-
bail("bindfd child process terminated by signal %d (%s)", sig, strsignal(sig));
577-
} else {
578-
/* Should never happen... */
579-
bail("unexpected waitpid() status for bindfd child process: 0x%x", wstatus);
580-
}
581-
582-
ret = receive_fd(sock[1]);
583-
584-
cleanup_socketpair:
585-
close(sock[1]);
586-
587-
cleanup_unlink:
588-
/*
589-
* We don't care about unlink errors, the worst that happens is that
590-
* there's an empty file left around in STATEDIR.
591-
*/
592-
unlink(template);
593-
return ret;
594-
}
595-
596403
static ssize_t fd_to_fd(int outfd, int infd)
597404
{
598405
ssize_t total = 0;
@@ -627,18 +434,6 @@ static int clone_binary(void)
627434
size_t sent = 0;
628435
int fdtype = EFD_NONE;
629436

630-
/*
631-
* Before we resort to copying, let's try creating an ro-binfd in one shot
632-
* by getting a handle for a read-only bind-mount of the execfd.
633-
*/
634-
execfd = try_bindfd();
635-
if (execfd >= 0)
636-
return execfd;
637-
638-
/*
639-
* Dammit, that didn't work -- time to copy the binary to a safe place we
640-
* can seal the contents.
641-
*/
642437
execfd = make_execfd(&fdtype);
643438
if (execfd < 0 || fdtype == EFD_NONE)
644439
return -ENOTRECOVERABLE;

0 commit comments

Comments
 (0)