Skip to content

Commit 9b2d72e

Browse files
chrishenzie authored and k8s-infra-cherrypick-robot committed
Preserve host cgroup mount options for privileged containers
Privileged containers are not given their own cgroup namespace, so they share the host's cgroup namespace. Mounting cgroup2 inside these containers can inadvertently alter the mount options of the host's cgroup2 VFS superblock, because that superblock is shared. To prevent this, update withMounts to read the host's /sys/fs/cgroup mount options and explicitly propagate nsdelegate and memory_recursiveprot into the container's mount spec. Without this, those options are stripped on the host, because they are not in the hardcoded default set.

Signed-off-by: Chris Henzie <[email protected]>
1 parent 5b66cd6 commit 9b2d72e

2 files changed

Lines changed: 101 additions & 1 deletion

File tree

internal/cri/opts/spec_linux_opts.go

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"fmt"
2323
"os"
2424
"path/filepath"
25+
"slices"
2526
"sort"
2627
"strconv"
2728
"strings"
@@ -70,11 +71,35 @@ func withMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*ru
7071
if cgroupWritable {
7172
mode = "rw"
7273
}
74+
75+
cgroupOptions := []string{"nosuid", "noexec", "nodev", "relatime", mode}
76+
77+
hasCgroupNS := false
78+
if s.Linux != nil {
79+
hasCgroupNS = slices.ContainsFunc(s.Linux.Namespaces, func(ns runtimespec.LinuxNamespace) bool {
80+
return ns.Type == runtimespec.CgroupNamespace
81+
})
82+
}
83+
84+
// If a container shares the host's cgroup namespace, mounting cgroup2
85+
// inside the container applies the new mount options to the single shared
86+
// cgroup2 VFS superblock. Therefore, explicitly copy these options from
87+
// the host's /sys/fs/cgroup to avoid being stripped.
88+
if !hasCgroupNS {
89+
if mountInfo, err := osi.LookupMount("/sys/fs/cgroup"); err == nil {
90+
for opt := range strings.SplitSeq(mountInfo.VFSOptions, ",") {
91+
if opt == "nsdelegate" || opt == "memory_recursiveprot" {
92+
cgroupOptions = append(cgroupOptions, opt)
93+
}
94+
}
95+
}
96+
}
97+
7398
s.Mounts = append(s.Mounts, runtimespec.Mount{
7499
Source: "cgroup",
75100
Destination: "/sys/fs/cgroup",
76101
Type: "cgroup",
77-
Options: []string{"nosuid", "noexec", "nodev", "relatime", mode},
102+
Options: cgroupOptions,
78103
})
79104

80105
// Copy all mounts from default mounts, except for

internal/cri/opts/spec_linux_test.go

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,15 @@
1717
package opts
1818

1919
import (
20+
"context"
2021
"testing"
2122

23+
"github.com/containerd/containerd/v2/core/mount"
24+
ostesting "github.com/containerd/containerd/v2/pkg/os/testing"
25+
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
2226
"github.com/stretchr/testify/assert"
2327
"github.com/stretchr/testify/require"
28+
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
2429
)
2530

2631
func TestMergeGids(t *testing.T) {
@@ -45,3 +50,73 @@ func TestRestrictOOMScoreAdj(t *testing.T) {
4550
require.NoError(t, err)
4651
assert.Equal(t, got, current+1)
4752
}
53+
54+
func TestWithMountsCgroupNamespaceOptions(t *testing.T) {
55+
tests := []struct {
56+
name string
57+
hasCgroupNS bool
58+
hostMountOpts string
59+
expectedOpts []string
60+
}{
61+
{
62+
name: "has cgroupns, should use default options",
63+
hasCgroupNS: true,
64+
hostMountOpts: "rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot",
65+
expectedOpts: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
66+
},
67+
{
68+
name: "no cgroupns, with host options present",
69+
hasCgroupNS: false,
70+
hostMountOpts: "rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot",
71+
expectedOpts: []string{"nosuid", "noexec", "nodev", "relatime", "ro", "nsdelegate", "memory_recursiveprot"},
72+
},
73+
{
74+
name: "no cgroupns, with host missing nsdelegate",
75+
hasCgroupNS: false,
76+
hostMountOpts: "rw,nosuid,nodev,noexec,relatime,memory_recursiveprot",
77+
expectedOpts: []string{"nosuid", "noexec", "nodev", "relatime", "ro", "memory_recursiveprot"},
78+
},
79+
{
80+
name: "no cgroupns, with host missing all extra options",
81+
hasCgroupNS: false,
82+
hostMountOpts: "rw,nosuid,nodev,noexec,relatime",
83+
expectedOpts: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
84+
},
85+
}
86+
87+
for _, tt := range tests {
88+
t.Run(tt.name, func(t *testing.T) {
89+
fakeOS := ostesting.NewFakeOS()
90+
fakeOS.LookupMountFn = func(path string) (mount.Info, error) {
91+
if path == "/sys/fs/cgroup" {
92+
return mount.Info{VFSOptions: tt.hostMountOpts}, nil
93+
}
94+
return mount.Info{}, nil
95+
}
96+
97+
config := &runtime.ContainerConfig{
98+
Linux: &runtime.LinuxContainerConfig{},
99+
}
100+
101+
spec := &runtimespec.Spec{}
102+
if tt.hasCgroupNS {
103+
spec.Linux = &runtimespec.Linux{Namespaces: []runtimespec.LinuxNamespace{{Type: runtimespec.CgroupNamespace}}}
104+
}
105+
106+
opt := withMounts(fakeOS, config, nil, "", nil, false)
107+
err := opt(context.Background(), nil, nil, spec)
108+
require.NoError(t, err)
109+
110+
var cgroupMount *runtimespec.Mount
111+
for _, m := range spec.Mounts {
112+
if m.Destination == "/sys/fs/cgroup" {
113+
cgroupMount = &m
114+
break
115+
}
116+
}
117+
118+
require.NotNil(t, cgroupMount)
119+
assert.ElementsMatch(t, tt.expectedOpts, cgroupMount.Options)
120+
})
121+
}
122+
}

0 commit comments

Comments
 (0)