@@ -23,6 +23,7 @@ import (
2323 "fmt"
2424 "math"
2525 "path/filepath"
26+ goruntime "runtime"
2627 "strings"
2728 "time"
2829
@@ -244,8 +245,27 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
244245 return nil , fmt .Errorf ("failed to get sandbox container info: %w" , err )
245246 }
246247
248+ userNsEnabled := false
249+ if goruntime .GOOS != "windows" {
250+ usernsOpts := config .GetLinux ().GetSecurityContext ().GetNamespaceOptions ().GetUsernsOptions ()
251+ if usernsOpts != nil && usernsOpts .GetMode () == runtime .NamespaceMode_POD {
252+ userNsEnabled = true
253+ }
254+ }
255+
247256 // Setup the network namespace if host networking wasn't requested.
248- if ! hostNetwork (config ) {
257+ if ! hostNetwork (config ) && ! userNsEnabled {
258+ // XXX: This code is copy-pasted further down for the podNetwork && userNsEnabled case.
259+ // We can't move it into a helper function, because the defer calls it registers must
260+ // still run when this function later returns an error. That would need a larger refactor,
261+ // and the goal was to leave the existing !userNsEnabled path untouched, so refactoring
262+ // here would defeat that purpose.
263+ //
264+ // The difference between the two cases is the use of netns.NewNetNS() vs
265+ // netns.NewNetNSFromPID(); the other case also verifies the task is still in the created state.
266+ //
267+ // To simplify this in the future, we should remove this case (podNetwork &&
268+ // !userNsEnabled) and keep only the other one (podNetwork && userNsEnabled).
249269 netStart := time .Now ()
250270
251271 // If it is not in host network namespace then create a namespace and set the sandbox
@@ -353,6 +373,88 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
353373 return nil , fmt .Errorf ("failed to wait for sandbox container task: %w" , err )
354374 }
355375
376+ if ! hostNetwork (config ) && userNsEnabled {
377+ // If userns is enabled, then the netns was created by the OCI runtime
378+ // when creating "task". The OCI runtime needs to create the netns
379+ // because, if userns is in use, the netns needs to be owned by the
380+ // userns. So, let the OCI runtime just handle this for us.
381+ // If the netns is not owned by the userns several problems will happen.
382+ // For instance, the container will lack permission (even if
383+ // capabilities are present) to modify the netns or, even worse, the OCI
384+ // runtime will fail to mount sysfs:
385+ // https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164
386+ netStart := time .Now ()
387+
388+ // If it is not in host network namespace then create a namespace and set the sandbox
389+ // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
390+ // namespaces. If the pod is in host network namespace then both are empty and should not
391+ // be used.
392+ var netnsMountDir = "/var/run/netns"
393+ if c .config .NetNSMountsUnderStateDir {
394+ netnsMountDir = filepath .Join (c .config .StateDir , "netns" )
395+ }
396+ sandbox .NetNS , err = netns .NewNetNSFromPID (netnsMountDir , task .Pid ())
397+ if err != nil {
398+ return nil , fmt .Errorf ("failed to create network namespace for sandbox %q: %w" , id , err )
399+ }
400+
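401+ // Verify task is still in created state. NOTE(review): the deferred NetNS removal below is registered only after this check, so returning here appears to skip cleanup of the netns set up above — confirm.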
402+ if st , err := task .Status (ctx ); err != nil || st .Status != containerd .Created {
403+ return nil , fmt .Errorf ("failed to create pod sandbox %q: err is %v - status is %q and is expected %q" , id , err , st .Status , containerd .Created )
404+ }
405+ sandbox .NetNSPath = sandbox .NetNS .GetPath ()
406+
407+ defer func () {
408+ // Remove the network namespace only if all the resource cleanup is done.
409+ if retErr != nil && cleanupErr == nil {
410+ if cleanupErr = sandbox .NetNS .Remove (); cleanupErr != nil {
411+ log .G (ctx ).WithError (cleanupErr ).Errorf ("Failed to remove network namespace %s for sandbox %q" , sandbox .NetNSPath , id )
412+ return
413+ }
414+ sandbox .NetNSPath = ""
415+ }
416+ }()
417+
418+ // Update network namespace in the container's spec
419+ c .updateNetNamespacePath (spec , sandbox .NetNSPath )
420+
421+ if err := container .Update (ctx ,
422+ // Update spec of the container
423+ containerd .UpdateContainerOpts (containerd .WithSpec (spec )),
424+ // Update sandbox metadata to include NetNS info
425+ containerd .UpdateContainerOpts (containerd .WithContainerExtension (sandboxMetadataExtension , & sandbox .Metadata ))); err != nil {
426+ return nil , fmt .Errorf ("failed to update the network namespace for the sandbox container %q: %w" , id , err )
427+ }
428+
429+ // Define this defer to teardownPodNetwork prior to the setupPodNetwork function call.
430+ // This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource creation functions.
431+ defer func () {
432+ // Teardown the network only if all the resource cleanup is done.
433+ if retErr != nil && cleanupErr == nil {
434+ deferCtx , deferCancel := ctrdutil .DeferContext ()
435+ defer deferCancel ()
436+ // Teardown network if an error is returned.
437+ if cleanupErr = c .teardownPodNetwork (deferCtx , sandbox ); cleanupErr != nil {
438+ log .G (ctx ).WithError (cleanupErr ).Errorf ("Failed to destroy network for sandbox %q" , id )
439+ }
440+ }
441+ }()
442+
443+ // Setup network for sandbox.
444+ // Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524)
445+ // rely on the assumption that CRI shim will not be querying the network namespace to check the
446+ // network states such as IP.
447+ // In future runtime implementation should avoid relying on CRI shim implementation details.
448+ // In this case however caching the IP will add a subtle performance enhancement by avoiding
449+ // calls to network namespace of the pod to query the IP of the veth interface on every
450+ // SandboxStatus request.
451+ if err := c .setupPodNetwork (ctx , & sandbox ); err != nil {
452+ return nil , fmt .Errorf ("failed to setup network for sandbox %q: %w" , id , err )
453+ }
454+
455+ sandboxCreateNetworkTimer .UpdateSince (netStart )
456+ }
457+
356458 if c .nri .isEnabled () {
357459 err = c .nri .runPodSandbox (ctx , & sandbox )
358460 if err != nil {
0 commit comments