Skip to content

Commit 36f520d

Browse files
committed
Let OCI runtime create netns when userns is used
As explained in the comments, this patch lets the OCI runtime create the netns when userns are in use. This is needed because the netns needs to be owned by the userns (otherwise can't modify the IP, etc.). Before this patch, we are creating the netns and then starting the pod sandbox asking to join this netns. This can't never work with userns, as the userns needs to be created first for the netns ownership to be correct. One option would be to also create the userns in containerd, then create the netns. But this is painful (needs tricks with the go runtime, special care to write the mapping, etc.). So, we just let the OCI runtime create the userns and netns, that creates them with the proper ownership. As requested by Mike Brown, the current code when userns is not used is left unchanged. We can unify the cases (with and without userns) in a future release. Signed-off-by: Rodrigo Campos <[email protected]>
1 parent 3233d5d commit 36f520d

4 files changed

Lines changed: 144 additions & 5 deletions

File tree

pkg/cri/server/sandbox_run.go

Lines changed: 103 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"fmt"
2424
"math"
2525
"path/filepath"
26+
goruntime "runtime"
2627
"strings"
2728
"time"
2829

@@ -244,8 +245,27 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
244245
return nil, fmt.Errorf("failed to get sandbox container info: %w", err)
245246
}
246247

248+
userNsEnabled := false
249+
if goruntime.GOOS != "windows" {
250+
usernsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions()
251+
if usernsOpts != nil && usernsOpts.GetMode() == runtime.NamespaceMode_POD {
252+
userNsEnabled = true
253+
}
254+
}
255+
247256
// Setup the network namespace if host networking wasn't requested.
248-
if !hostNetwork(config) {
257+
if !hostNetwork(config) && !userNsEnabled {
258+
// XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too.
259+
// We can't move this to a function, as the defer calls need to be executed if other
260+
// errors are returned in this function. So, we would need more refactors to move
261+
// this code to a function and the idea was to not change the current code for
262+
// !userNsEnabled case, therefore doing it would defeat the purpose.
263+
//
264+
// The difference between the cases is the use of netns.NewNetNS() vs
265+
// netns.NewNetNSFromPID() and we verify the task is still running in the other case.
266+
//
267+
// To simplify this, in the future, we should just remove this case (podNetwork &&
268+
// !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled).
249269
netStart := time.Now()
250270

251271
// If it is not in host network namespace then create a namespace and set the sandbox
@@ -353,6 +373,88 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
353373
return nil, fmt.Errorf("failed to wait for sandbox container task: %w", err)
354374
}
355375

376+
if !hostNetwork(config) && userNsEnabled {
377+
// If userns is enabled, then the netns was created by the OCI runtime
378+
// when creating "task". The OCI runtime needs to create the netns
379+
// because, if userns is in use, the netns needs to be owned by the
380+
// userns. So, let the OCI runtime just handle this for us.
381+
// If the netns is not owned by the userns several problems will happen.
382+
// For instance, the container will lack permission (even if
383+
// capabilities are present) to modify the netns or, even worse, the OCI
384+
// runtime will fail to mount sysfs:
385+
// https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164
386+
netStart := time.Now()
387+
388+
// If it is not in host network namespace then create a namespace and set the sandbox
389+
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
390+
// namespaces. If the pod is in host network namespace then both are empty and should not
391+
// be used.
392+
var netnsMountDir = "/var/run/netns"
393+
if c.config.NetNSMountsUnderStateDir {
394+
netnsMountDir = filepath.Join(c.config.StateDir, "netns")
395+
}
396+
sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, task.Pid())
397+
if err != nil {
398+
return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err)
399+
}
400+
401+
// Verify task is still in created state.
402+
if st, err := task.Status(ctx); err != nil || st.Status != containerd.Created {
403+
return nil, fmt.Errorf("failed to create pod sandbox %q: err is %v - status is %q and is expected %q", id, err, st.Status, containerd.Created)
404+
}
405+
sandbox.NetNSPath = sandbox.NetNS.GetPath()
406+
407+
defer func() {
408+
// Remove the network namespace only if all the resource cleanup is done.
409+
if retErr != nil && cleanupErr == nil {
410+
if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil {
411+
log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id)
412+
return
413+
}
414+
sandbox.NetNSPath = ""
415+
}
416+
}()
417+
418+
// Update network namespace in the container's spec
419+
c.updateNetNamespacePath(spec, sandbox.NetNSPath)
420+
421+
if err := container.Update(ctx,
422+
// Update spec of the container
423+
containerd.UpdateContainerOpts(containerd.WithSpec(spec)),
424+
// Update sandbox metadata to include NetNS info
425+
containerd.UpdateContainerOpts(containerd.WithContainerExtension(sandboxMetadataExtension, &sandbox.Metadata))); err != nil {
426+
return nil, fmt.Errorf("failed to update the network namespace for the sandbox container %q: %w", id, err)
427+
}
428+
429+
// Define this defer to teardownPodNetwork prior to the setupPodNetwork function call.
430+
// This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource creation functions.
431+
defer func() {
432+
// Teardown the network only if all the resource cleanup is done.
433+
if retErr != nil && cleanupErr == nil {
434+
deferCtx, deferCancel := ctrdutil.DeferContext()
435+
defer deferCancel()
436+
// Teardown network if an error is returned.
437+
if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil {
438+
log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id)
439+
}
440+
}
441+
}()
442+
443+
// Setup network for sandbox.
444+
// Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524)
445+
// rely on the assumption that CRI shim will not be querying the network namespace to check the
446+
// network states such as IP.
447+
// In future runtime implementation should avoid relying on CRI shim implementation details.
448+
// In this case however caching the IP will add a subtle performance enhancement by avoiding
449+
// calls to network namespace of the pod to query the IP of the veth interface on every
450+
// SandboxStatus request.
451+
if err := c.setupPodNetwork(ctx, &sandbox); err != nil {
452+
return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err)
453+
}
454+
455+
sandboxCreateNetworkTimer.UpdateSince(netStart)
456+
}
457+
356458
if c.nri.isEnabled() {
357459
err = c.nri.runPodSandbox(ctx, &sandbox)
358460
if err != nil {

pkg/netns/netns_linux.go

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,9 @@ import (
5050

5151
// newNS creates a new persistent (bind-mounted) network namespace and returns the
5252
// path to the network namespace.
53-
func newNS(baseDir string) (nsPath string, err error) {
53+
// If pid is not 0, returns the netns from that pid persistently mounted. Otherwise,
54+
// a new netns is created.
55+
func newNS(baseDir string, pid uint32) (nsPath string, err error) {
5456
b := make([]byte, 16)
5557

5658
_, err = rand.Read(b)
@@ -81,6 +83,16 @@ func newNS(baseDir string) (nsPath string, err error) {
8183
}
8284
}()
8385

86+
if pid != 0 {
87+
procNsPath := getNetNSPathFromPID(pid)
88+
// bind mount the netns onto the mount point. This causes the namespace
89+
// to persist, even when there are no threads in the ns.
90+
if err = unix.Mount(procNsPath, nsPath, "none", unix.MS_BIND, ""); err != nil {
91+
return "", fmt.Errorf("failed to bind mount ns src: %v at %s: %w", procNsPath, nsPath, err)
92+
}
93+
return nsPath, nil
94+
}
95+
8496
var wg sync.WaitGroup
8597
wg.Add(1)
8698

@@ -155,14 +167,23 @@ func getCurrentThreadNetNSPath() string {
155167
return fmt.Sprintf("/proc/%d/task/%d/ns/net", os.Getpid(), unix.Gettid())
156168
}
157169

170+
func getNetNSPathFromPID(pid uint32) string {
171+
return fmt.Sprintf("/proc/%d/ns/net", pid)
172+
}
173+
158174
// NetNS holds network namespace.
159175
type NetNS struct {
160176
path string
161177
}
162178

163179
// NewNetNS creates a network namespace.
164180
func NewNetNS(baseDir string) (*NetNS, error) {
165-
path, err := newNS(baseDir)
181+
return NewNetNSFromPID(baseDir, 0)
182+
}
183+
184+
// NewNetNS returns the netns from pid or a new netns if pid is 0.
185+
func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) {
186+
path, err := newNS(baseDir, pid)
166187
if err != nil {
167188
return nil, fmt.Errorf("failed to setup netns: %w", err)
168189
}

pkg/netns/netns_other.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ func NewNetNS(baseDir string) (*NetNS, error) {
3535
return nil, errNotImplementedOnUnix
3636
}
3737

38+
// NewNetNS returns the netns from pid or a new netns if pid is 0.
39+
func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) {
40+
return nil, errNotImplementedOnUnix
41+
}
42+
3843
// LoadNetNS loads existing network namespace.
3944
func LoadNetNS(path string) *NetNS {
4045
return &NetNS{path: path}

pkg/netns/netns_windows.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,20 @@
1616

1717
package netns
1818

19-
import "github.com/Microsoft/hcsshim/hcn"
19+
import (
20+
"errors"
21+
22+
"github.com/Microsoft/hcsshim/hcn"
23+
)
24+
25+
var errNotImplementedOnWindows = errors.New("not implemented on windows")
2026

2127
// NetNS holds network namespace for sandbox
2228
type NetNS struct {
2329
path string
2430
}
2531

26-
// NewNetNS creates a network namespace for the sandbox
32+
// NewNetNS creates a network namespace for the sandbox.
2733
func NewNetNS(baseDir string) (*NetNS, error) {
2834
temp := hcn.HostComputeNamespace{}
2935
hcnNamespace, err := temp.Create()
@@ -34,6 +40,11 @@ func NewNetNS(baseDir string) (*NetNS, error) {
3440
return &NetNS{path: hcnNamespace.Id}, nil
3541
}
3642

43+
// NewNetNS returns the netns from pid or a new netns if pid is 0.
44+
func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) {
45+
return nil, errNotImplementedOnWindows
46+
}
47+
3748
// LoadNetNS loads existing network namespace.
3849
func LoadNetNS(path string) *NetNS {
3950
return &NetNS{path: path}

0 commit comments

Comments
 (0)