@@ -304,6 +304,125 @@ func TestPodUserNS(t *testing.T) {
304304 }
305305}
306306
307+ // TestIssue10598 tests a case[1] that init processes in container should be able
308+ // to open /dev/stdout or /dev/stderr if init processes are running in their
309+ // user namespace instead of root user.
310+ //
311+ // The shim server creates pipe for init processes' standard output. By default,
312+ // the owner of pipe is the same to shim server (root user). Let's say, the init
313+ // process is running with uid=1000/gid=1000 user. Init processes inherits the
314+ // pipe created by shim server so that it can just write data into that pipe.
315+ // However, if that init process tries to open /dev/stderr, the kernel will
316+ // return no permission error.
317+ //
318+ // The following output is from retsnoop[2].
319+ //
320+ // → do_open
321+ // → inode_permission
322+ // → generic_permission
323+ // ↔ make_vfsuid [0] 0.500us
324+ // ↔ make_vfsuid [0] 6.501us
325+ // ↔ from_kuid [0xffffffff] 0.700us
326+ // ← generic_permission [-EACCES] 13.501us
327+ //
328+ // Since uid_map/gid_map doesn't cover uid=0/gid=0, the kernel can't convert
329+ // uid=0 into valid uid in that uid_map. So, `from_kuid` returns invalid uid
330+ // value and then `do_open` returns EACCES error.
331+ //
332+ // [1]: https://github.com/containerd/containerd/issues/10598
333+ // [2]: https://github.com/anakryiko/retsnoop
334+ func TestIssue10598 (t * testing.T ) {
335+ if ! supportsUserNS () {
336+ t .Skip ("User namespaces are not supported" )
337+ }
338+ if ! supportsIDMap (defaultRoot ) {
339+ t .Skipf ("ID mappings are not supported on: %v" , defaultRoot )
340+ }
341+ if err := supportsRuncIDMap (); err != nil {
342+ t .Skipf ("OCI runtime doesn't support idmap mounts: %v" , err )
343+ }
344+
345+ testPodLogDir := t .TempDir ()
346+
347+ containerID := uint32 (0 )
348+ hostID := uint32 (65536 )
349+ size := uint32 (65536 )
350+
351+ t .Log ("Create a sandbox with userns" )
352+ sandboxOpts := []PodSandboxOpts {
353+ WithPodUserNs (containerID , hostID , size ),
354+ WithPodLogDirectory (testPodLogDir ),
355+ }
356+ sbConfig := PodSandboxConfig ("issue10598" , "userns" , sandboxOpts ... )
357+ sb , err := runtimeService .RunPodSandbox (sbConfig , * runtimeHandler )
358+ require .NoError (t , err )
359+
360+ // Make sure the sandbox is cleaned up.
361+ defer func () {
362+ assert .NoError (t , runtimeService .StopPodSandbox (sb ))
363+ assert .NoError (t , runtimeService .RemovePodSandbox (sb ))
364+ }()
365+
366+ t .Log ("Create a container for userns" )
367+
368+ containerName := "nginx-userns"
369+ testImage := images .Get (images .Nginx )
370+
371+ EnsureImageExists (t , testImage )
372+
373+ containerOpts := []ContainerOpts {
374+ WithUserNamespace (containerID , hostID , size ),
375+ WithLogPath (containerName ),
376+ // The SELinux policy enforced by container-selinux prevents
377+ // NGINX from opening the /proc/self/fd/2 pipe. This scenario
378+ // is not intended to verify SELinux behavior in the user namespace
379+ // but rather to confirm the ownership of the standard output
380+ // file descriptor. The following option demonstrates how to
381+ // disable the restrictive SELinux rule for the NGINX process.
382+ WithSELinuxOptions (
383+ "unconfined_u" ,
384+ "unconfined_r" ,
385+ "container_runtime_t" ,
386+ "s0" ,
387+ ),
388+ }
389+
390+ cnConfig := ContainerConfig (
391+ containerName ,
392+ testImage ,
393+ containerOpts ... ,
394+ )
395+ cn , err := runtimeService .CreateContainer (sb , cnConfig , sbConfig )
396+ require .NoError (t , err )
397+
398+ t .Log ("Start the container" )
399+ require .NoError (t , runtimeService .StartContainer (cn ))
400+
401+ t .Log ("Wait for container to start" )
402+ require .NoError (t , Eventually (func () (bool , error ) {
403+ content , err := os .ReadFile (filepath .Join (testPodLogDir , containerName ))
404+ if err != nil {
405+ return false , err
406+ }
407+
408+ s , err := runtimeService .ContainerStatus (cn )
409+ if err != nil {
410+ return false , err
411+ }
412+
413+ if state := s .GetState (); state != runtime .ContainerState_CONTAINER_RUNNING {
414+ return false , fmt .Errorf ("%s is not running\n state: %s\n log: %s" ,
415+ containerName , state , string (content ))
416+ }
417+
418+ started := strings .Contains (string (content ), "start worker processes" )
419+ if started {
420+ t .Log (string (content ))
421+ }
422+ return started , nil
423+ }, time .Second , 30 * time .Second ))
424+ }
425+
307426func supportsRuncIDMap () error {
308427 var r runc.Runc
309428 features , err := r .Features (context .Background ())
0 commit comments