2727
2828#include <sys/types.h>
2929#include <sys/stat.h>
30+ #include <sys/statfs.h>
3031#include <sys/vfs.h>
3132#include <sys/mman.h>
33+ #include <sys/mount.h>
3234#include <sys/sendfile.h>
3335#include <sys/syscall.h>
3436
@@ -67,6 +69,7 @@ int memfd_create(const char *name, unsigned int flags)
6769# define F_SEAL_WRITE 0x0008 /* prevent writes */
6870#endif
6971
72+ #define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
7073#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
7174#define RUNC_MEMFD_SEALS \
7275 (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
@@ -88,29 +91,56 @@ static void *must_realloc(void *ptr, size_t size)
8891static int is_self_cloned (void )
8992{
9093 int fd , ret , is_cloned = 0 ;
94+ struct stat statbuf = {};
95+ struct statfs fsbuf = {};
9196
9297 fd = open ("/proc/self/exe" , O_RDONLY |O_CLOEXEC );
9398 if (fd < 0 )
9499 return - ENOTRECOVERABLE ;
95100
96- /* First check memfd. */
101+ /*
102+ * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
103+ * this, because you cannot write to a sealed memfd no matter what (so
104+ * sharing it isn't a bad thing -- and an admin could bind-mount a sealed
105+ * memfd to /usr/bin/runc to allow re-use).
106+ */
97107 ret = fcntl (fd , F_GET_SEALS );
98108 if (ret >= 0 ) {
99109 is_cloned = (ret == RUNC_MEMFD_SEALS );
100- } else {
101- /*
102- * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
103- * which appears to have a borked backport of F_GET_SEALS. Either way,
104- * having a file which has no hardlinks indicates that we aren't using
105- * a host-side "runc" binary and this is something that a container
106- * cannot fake (because unlinking requires being able to resolve the
107- * path that you want to unlink).
108- */
109- struct stat statbuf = {};
110- if (fstat (fd , & statbuf ) >= 0 )
111- is_cloned = (statbuf .st_nlink == 0 );
110+ goto out ;
112111 }
113112
113+ /*
114+ * All other forms require CLONED_BINARY_ENV, since they are potentially
115+ * writeable (or we can't tell if they're fully safe) and thus we must
116+ * check the environment as an extra layer of defence.
117+ */
118+ if (!getenv (CLONED_BINARY_ENV )) {
119+ is_cloned = false;
120+ goto out ;
121+ }
122+
123+ /*
124+ * Is the binary on a read-only filesystem? We can't detect bind-mounts in
125+ * particular (in-kernel they are identical to regular mounts) but we can
126+ * at least be sure that it's read-only. In addition, to make sure that
127+ * it's *our* bind-mount we check CLONED_BINARY_ENV.
128+ */
129+ if (fstatfs (fd , & fsbuf ) >= 0 )
130+ is_cloned |= (fsbuf .f_flags & MS_RDONLY );
131+
132+ /*
133+ * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
134+ * which appears to have a borked backport of F_GET_SEALS. Either way,
135+ * having a file which has no hardlinks indicates that we aren't using
136+ * a host-side "runc" binary and this is something that a container
137+ * cannot fake (because unlinking requires being able to resolve the
138+ * path that you want to unlink).
139+ */
140+ if (fstat (fd , & statbuf ) >= 0 )
141+ is_cloned |= (statbuf .st_nlink == 0 );
142+
143+ out :
114144 close (fd );
115145 return is_cloned ;
116146}
@@ -227,15 +257,16 @@ static int make_execfd(int *fdtype)
227257 return -1 ;
228258
229259 /*
230- * Try memfd first, it's much nicer since it's easily detected thanks to
231- * sealing and also doesn't require assumptions like /tmp.
260+ * Now try memfd, it's much nicer than actually creating a file in STATEDIR
261+ * since it's easily detected thanks to sealing and also doesn't require
262+ * assumptions about STATEDIR.
232263 */
233264 * fdtype = EFD_MEMFD ;
234265 fd = memfd_create (RUNC_MEMFD_COMMENT , MFD_CLOEXEC | MFD_ALLOW_SEALING );
235266 if (fd >= 0 )
236267 return fd ;
237- if (errno != ENOSYS )
238- goto err ;
268+ if (errno != ENOSYS && errno != EINVAL )
269+ goto error ;
239270
240271#ifdef O_TMPFILE
241272 /*
@@ -266,7 +297,7 @@ static int make_execfd(int *fdtype)
266297 errno = EISDIR ;
267298 }
268299 if (errno != EISDIR )
269- goto err ;
300+ goto error ;
270301#endif /* defined(O_TMPFILE) */
271302
272303 /*
@@ -281,7 +312,7 @@ static int make_execfd(int *fdtype)
281312 close (fd );
282313 }
283314
284- err :
315+ error :
285316 * fdtype = EFD_NONE ;
286317 return -1 ;
287318}
@@ -316,15 +347,83 @@ static int seal_execfd(int *fd, int fdtype)
316347 return -1 ;
317348}
318349
/*
 * Try to obtain a handle to the running binary through a temporary read-only
 * bind-mount of /proc/self/exe, so that no copy of the binary is needed.
 *
 * The mount point is a scratch file created under the directory named by the
 * _LIBCONTAINER_STATEDIR environment variable (falling back to "/tmp" when it
 * is unset or not an absolute path). The mount is lazily detached and the
 * scratch file unlinked before returning, so only the returned descriptor
 * keeps the read-only mount alive.
 *
 * Returns:
 *   >= 0               an O_PATH|O_CLOEXEC descriptor for the read-only mount,
 *   -1                 if the mount-point path does not fit in PATH_MAX, the
 *                      scratch file cannot be created, or the final open()
 *                      fails,
 *   -EPERM             if either mount() call fails,
 *   -ENOTRECOVERABLE   if the mount could not be detached again.
 */
static int try_bindfd(void)
{
	int fd, len, ret = -1;
	char template[PATH_MAX] = {0};
	const char *prefix = secure_getenv("_LIBCONTAINER_STATEDIR");

	if (!prefix || *prefix != '/')
		prefix = "/tmp";

	/*
	 * snprintf() returns the length it *wanted* to write; anything that does
	 * not fit means the path was truncated and must not be used as a mount
	 * point template.
	 */
	len = snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix);
	if (len < 0 || (size_t)len >= sizeof(template))
		return ret;

	/*
	 * We need somewhere to mount it, mounting anything over /proc/self is a
	 * BAD idea on the host -- even if we do it temporarily.
	 */
	fd = mkstemp(template);
	if (fd < 0)
		return ret;
	close(fd);

	/*
	 * For obvious reasons this won't work in rootless mode because we haven't
	 * created a userns+mntns -- but getting that to work will be a bit
	 * complicated and it's only worth doing if someone actually needs it.
	 */
	ret = -EPERM;
	if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
		goto out;
	if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
		goto out_umount;

	/* Get read-only handle that we're sure can't be made read-write. */
	ret = open(template, O_PATH | O_CLOEXEC);

out_umount:
	/*
	 * Make sure the MNT_DETACH works, otherwise we could get remounted
	 * read-write and that would be quite bad (the fd would be made read-write
	 * too, invalidating the protection).
	 */
	if (umount2(template, MNT_DETACH) < 0) {
		if (ret >= 0)
			close(ret);
		ret = -ENOTRECOVERABLE;
	}

out:
	/*
	 * We don't care about unlink errors, the worst that happens is that
	 * there's an empty file left around in STATEDIR.
	 */
	unlink(template);
	return ret;
}
405+
319406static int clone_binary (void )
320407{
321- int binfd , memfd ;
408+ int binfd , execfd ;
322409 struct stat statbuf = {};
323410 size_t sent = 0 ;
324411 int fdtype = EFD_NONE ;
325412
326- memfd = make_execfd (& fdtype );
327- if (memfd < 0 || fdtype == EFD_NONE )
413+ /*
414+ * Before we resort to copying, let's try creating an ro-binfd in one shot
415+ * by getting a handle for a read-only bind-mount of the execfd.
416+ */
417+ execfd = try_bindfd ();
418+ if (execfd >= 0 )
419+ return execfd ;
420+
421+ /*
422+ * Dammit, that didn't work -- time to copy the binary to a safe place we
423+ * can seal the contents.
424+ */
425+ execfd = make_execfd (& fdtype );
426+ if (execfd < 0 || fdtype == EFD_NONE )
328427 return - ENOTRECOVERABLE ;
329428
330429 binfd = open ("/proc/self/exe" , O_RDONLY | O_CLOEXEC );
@@ -335,7 +434,7 @@ static int clone_binary(void)
335434 goto error_binfd ;
336435
337436 while (sent < statbuf .st_size ) {
338- int n = sendfile (memfd , binfd , NULL , statbuf .st_size - sent );
437+ int n = sendfile (execfd , binfd , NULL , statbuf .st_size - sent );
339438 if (n < 0 )
340439 goto error_binfd ;
341440 sent += n ;
@@ -344,14 +443,15 @@ static int clone_binary(void)
344443 if (sent != statbuf .st_size )
345444 goto error ;
346445
347- if (seal_execfd (& memfd , fdtype ) < 0 )
446+ if (seal_execfd (& execfd , fdtype ) < 0 )
348447 goto error ;
349- return memfd ;
448+
449+ return execfd ;
350450
351451error_binfd :
352452 close (binfd );
353453error :
354- close (memfd );
454+ close (execfd );
355455 return - EIO ;
356456}
357457
@@ -375,6 +475,11 @@ int ensure_cloned_binary(void)
375475 if (execfd < 0 )
376476 return - EIO ;
377477
478+ if (putenv (CLONED_BINARY_ENV "=1" ))
479+ goto error ;
480+
378481 fexecve (execfd , argv , environ );
482+ error :
483+ close (execfd );
379484 return - ENOEXEC ;
380485}
0 commit comments