Skip to content

Commit b810da1

Browse files
committed
cgroups: systemd: make use of Device*= properties
It seems we missed that systemd added support for the devices cgroup, as a result systemd would actually *write an allow-all rule each time you did 'runc update'* if you used the systemd cgroup driver. This is obviously ... bad and was a clear security bug. Luckily the commits which introduced this were never in an actual runc release. So we simply generate the cgroupv1-style rules (which is what systemd's DeviceAllow wants) and default to a deny-all ruleset. Unfortunately it turns out that systemd is susceptible to the same spurrious error failure that we were, so that problem is out of our hands for systemd cgroup users. However, systemd has a similar bug to the one fixed in [1]. It will happily write a disruptive deny-all rule when it is not necessary. Unfortunately, we cannot even use devices.Emulator to generate a minimal set of transition rules because the DBus API is limited (you can only clear or append to the DeviceAllow= list -- so we are forced to always clear it). To work around this, we simply freeze the container during SetUnitProperties. [1]: afe8348 ("cgroupv1: devices: use minimal transition rules with devices.Emulator") Fixes: 1d4ccc8 ("fix data inconsistent when runc update in systemd driven cgroup v1") Fixes: 7682a2b ("fix data inconsistent when runc update in systemd driven cgroup v2") Signed-off-by: Aleksa Sarai <[email protected]>
1 parent afe8348 commit b810da1

4 files changed

Lines changed: 283 additions & 1 deletion

File tree

libcontainer/cgroups/devices/devices_emulator.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ func (e *Emulator) IsBlacklist() bool {
5757
return e.defaultAllow
5858
}
5959

60+
func (e *Emulator) IsAllowAll() bool {
61+
return e.IsBlacklist() && len(e.rules) == 0
62+
}
63+
6064
var devicesListRegexp = regexp.MustCompile(`^([abc])\s+(\d+|\*):(\d+|\*)\s+([rwm]+)$`)
6165

6266
func parseLine(line string) (*deviceRule, error) {

libcontainer/cgroups/systemd/common.go

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package systemd
22

33
import (
4+
"bufio"
45
"fmt"
56
"os"
67
"strings"
@@ -9,6 +10,7 @@ import (
910

1011
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
1112
dbus "github.com/godbus/dbus/v5"
13+
"github.com/opencontainers/runc/libcontainer/cgroups/devices"
1214
"github.com/opencontainers/runc/libcontainer/configs"
1315
"github.com/pkg/errors"
1416
"github.com/sirupsen/logrus"
@@ -69,6 +71,209 @@ func ExpandSlice(slice string) (string, error) {
6971
return path, nil
7072
}
7173

74+
func groupPrefix(ruleType configs.DeviceType) (string, error) {
75+
switch ruleType {
76+
case configs.BlockDevice:
77+
return "block-", nil
78+
case configs.CharDevice:
79+
return "char-", nil
80+
default:
81+
return "", errors.Errorf("device type %v has no group prefix", ruleType)
82+
}
83+
}
84+
85+
// findDeviceGroup tries to find the device group name (as listed in
86+
// /proc/devices) with the type prefixed as requried for DeviceAllow, for a
87+
// given (type, major) combination. If more than one device group exists, an
88+
// arbitrary one is chosen.
89+
func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, error) {
90+
fh, err := os.Open("/proc/devices")
91+
if err != nil {
92+
return "", err
93+
}
94+
defer fh.Close()
95+
96+
prefix, err := groupPrefix(ruleType)
97+
if err != nil {
98+
return "", err
99+
}
100+
101+
scanner := bufio.NewScanner(fh)
102+
var currentType configs.DeviceType
103+
for scanner.Scan() {
104+
// We need to strip spaces because the first number is column-aligned.
105+
line := strings.TrimSpace(scanner.Text())
106+
107+
// Handle the "header" lines.
108+
switch line {
109+
case "Block devices:":
110+
currentType = configs.BlockDevice
111+
continue
112+
case "Character devices:":
113+
currentType = configs.CharDevice
114+
continue
115+
case "":
116+
continue
117+
}
118+
119+
// Skip lines unrelated to our type.
120+
if currentType != ruleType {
121+
continue
122+
}
123+
124+
// Parse out the (major, name).
125+
var (
126+
currMajor int64
127+
currName string
128+
)
129+
if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
130+
if err == nil {
131+
err = errors.Errorf("wrong number of fields")
132+
}
133+
return "", errors.Wrapf(err, "scan /proc/devices line %q", line)
134+
}
135+
136+
if currMajor == ruleMajor {
137+
return prefix + currName, nil
138+
}
139+
}
140+
if err := scanner.Err(); err != nil {
141+
return "", errors.Wrap(err, "reading /proc/devices")
142+
}
143+
// Couldn't find the device group.
144+
return "", nil
145+
}
146+
147+
// generateDeviceProperties takes the configured device rules and generates a
148+
// corresponding set of systemd properties to configure the devices correctly.
149+
func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Property, error) {
150+
// DeviceAllow is the type "a(ss)" which means we need a temporary struct
151+
// to represent it in Go.
152+
type deviceAllowEntry struct {
153+
Path string
154+
Perms string
155+
}
156+
157+
properties := []systemdDbus.Property{
158+
// Always run in the strictest white-list mode.
159+
newProp("DevicePolicy", "strict"),
160+
// Empty the DeviceAllow array before filling it.
161+
newProp("DeviceAllow", []deviceAllowEntry{}),
162+
}
163+
164+
// Figure out the set of rules.
165+
configEmu := &devices.Emulator{}
166+
for _, rule := range rules {
167+
if err := configEmu.Apply(*rule); err != nil {
168+
return nil, errors.Wrap(err, "apply rule for systemd")
169+
}
170+
}
171+
// systemd doesn't support blacklists. So we log a warning, and tell
172+
// systemd to act as a deny-all whitelist. This ruleset will be replaced
173+
// with our normal fallback code. This may result in spurrious errors, but
174+
// the only other option is to error out here.
175+
if configEmu.IsBlacklist() {
176+
// However, if we're dealing with an allow-all rule then we can do it.
177+
if configEmu.IsAllowAll() {
178+
return []systemdDbus.Property{
179+
// Run in white-list mode by setting to "auto" and removing all
180+
// DeviceAllow rules.
181+
newProp("DevicePolicy", "auto"),
182+
newProp("DeviceAllow", []deviceAllowEntry{}),
183+
}, nil
184+
}
185+
logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
186+
return properties, nil
187+
}
188+
189+
// Now generate the set of rules we actually need to apply. Unlike the
190+
// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
191+
// whitelist which is the default for devices.Emulator.
192+
baseEmu := &devices.Emulator{}
193+
finalRules, err := baseEmu.Transition(configEmu)
194+
if err != nil {
195+
return nil, errors.Wrap(err, "get simplified rules for systemd")
196+
}
197+
var deviceAllowList []deviceAllowEntry
198+
for _, rule := range finalRules {
199+
if !rule.Allow {
200+
// Should never happen.
201+
return nil, errors.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
202+
}
203+
switch rule.Type {
204+
case configs.BlockDevice, configs.CharDevice:
205+
default:
206+
// Should never happen.
207+
return nil, errors.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
208+
}
209+
210+
entry := deviceAllowEntry{
211+
Perms: string(rule.Permissions),
212+
}
213+
214+
// systemd has a fairly odd (though understandable) syntax here, and
215+
// because of the OCI configuration format we have to do quite a bit of
216+
// trickery to convert things:
217+
//
218+
// * Concrete rules with non-wildcard major/minor numbers have to use
219+
// /dev/{block,char} paths. This is slightly odd because it means
220+
// that we cannot add whitelist rules for devices that don't exist,
221+
// but there's not too much we can do about that.
222+
//
223+
// However, path globbing is not support for path-based rules so we
224+
// need to handle wildcards in some other manner.
225+
//
226+
// * Wildcard-minor rules have to specify a "device group name" (the
227+
// second column in /proc/devices).
228+
//
229+
// * Wildcard (major and minor) rules can just specify a glob with the
230+
// type ("char-*" or "block-*").
231+
//
232+
// The only type of rule we can't handle is wildcard-major rules, and
233+
// so we'll give a warning in that case (note that the fallback code
234+
// will insert any rules systemd couldn't handle). What amazing fun.
235+
236+
if rule.Major == configs.Wildcard {
237+
// "_ *:n _" rules aren't supported by systemd.
238+
if rule.Minor != configs.Wildcard {
239+
logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
240+
continue
241+
}
242+
243+
// "_ *:* _" rules just wildcard everything.
244+
prefix, err := groupPrefix(rule.Type)
245+
if err != nil {
246+
return nil, err
247+
}
248+
entry.Path = prefix + "*"
249+
} else if rule.Minor == configs.Wildcard {
250+
// "_ n:* _" rules require a device group from /proc/devices.
251+
group, err := findDeviceGroup(rule.Type, rule.Major)
252+
if err != nil {
253+
return nil, errors.Wrapf(err, "find device '%v/%d'", rule.Type, rule.Major)
254+
}
255+
if group == "" {
256+
// Couldn't find a group.
257+
logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
258+
continue
259+
}
260+
entry.Path = group
261+
} else {
262+
// "_ n:m _" rules are just a path in /dev/{block,char}/.
263+
switch rule.Type {
264+
case configs.BlockDevice:
265+
entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
266+
case configs.CharDevice:
267+
entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
268+
}
269+
}
270+
deviceAllowList = append(deviceAllowList, entry)
271+
}
272+
273+
properties = append(properties, newProp("DeviceAllow", deviceAllowList))
274+
return properties, nil
275+
}
276+
72277
// getDbusConnection lazy initializes systemd dbus connection
73278
// and returns it
74279
func getDbusConnection(rootless bool) (*systemdDbus.Conn, error) {

libcontainer/cgroups/systemd/v1.go

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"github.com/opencontainers/runc/libcontainer/cgroups"
1616
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
1717
"github.com/opencontainers/runc/libcontainer/configs"
18+
"github.com/sirupsen/logrus"
1819
)
1920

2021
type legacyManager struct {
@@ -70,6 +71,13 @@ var legacySubsystems = subsystemSet{
7071

7172
func genV1ResourcesProperties(c *configs.Cgroup) ([]systemdDbus.Property, error) {
7273
var properties []systemdDbus.Property
74+
75+
deviceProperties, err := generateDeviceProperties(c.Resources.Devices)
76+
if err != nil {
77+
return nil, err
78+
}
79+
properties = append(properties, deviceProperties...)
80+
7381
if c.Resources.Memory != 0 {
7482
properties = append(properties,
7583
newProp("MemoryLimit", uint64(c.Resources.Memory)))
@@ -381,21 +389,47 @@ func (m *legacyManager) Set(container *configs.Config) error {
381389
if err != nil {
382390
return err
383391
}
392+
393+
// Figure out the current freezer state, so we can revert to it after we
394+
// temporarily freeze the container.
395+
targetFreezerState, err := m.GetFreezerState()
396+
if err != nil {
397+
return err
398+
}
399+
if targetFreezerState == configs.Undefined {
400+
targetFreezerState = configs.Thawed
401+
}
402+
403+
// We have to freeze the container while systemd sets the cgroup settings.
404+
// The reason for this is that systemd's application of DeviceAllow rules
405+
// is done disruptively, resulting in spurrious errors to common devices
406+
// (unlike our fs driver, they will happily write deny-all rules to running
407+
// containers). So we freeze the container to avoid them hitting the cgroup
408+
// error. But if the freezer cgroup isn't supported, we just warn about it.
409+
if err := m.Freeze(configs.Frozen); err != nil {
410+
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
411+
}
412+
384413
dbusConnection, err := getDbusConnection(false)
385414
if err != nil {
415+
_ = m.Freeze(targetFreezerState)
386416
return err
387417
}
388418
if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil {
419+
_ = m.Freeze(targetFreezerState)
389420
return err
390421
}
391422

423+
// Reset freezer state before we apply the configuration, to avoid clashing
424+
// with the freezer setting in the configuration.
425+
_ = m.Freeze(targetFreezerState)
426+
392427
for _, sys := range legacySubsystems {
393428
// Get the subsystem path, but don't error out for not found cgroups.
394429
path, err := getSubsystemPath(container.Cgroups, sys.Name())
395430
if err != nil && !cgroups.IsNotFound(err) {
396431
return err
397432
}
398-
399433
if err := sys.Set(path, container.Cgroups); err != nil {
400434
return err
401435
}

libcontainer/cgroups/systemd/v2.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
1717
"github.com/opencontainers/runc/libcontainer/configs"
1818
"github.com/pkg/errors"
19+
"github.com/sirupsen/logrus"
1920
)
2021

2122
type unifiedManager struct {
@@ -37,6 +38,17 @@ func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgrou
3738
func genV2ResourcesProperties(c *configs.Cgroup) ([]systemdDbus.Property, error) {
3839
var properties []systemdDbus.Property
3940

41+
// NOTE: This is of questionable correctness because we insert our own
42+
// devices eBPF program later. Two programs with identical rules
43+
// aren't the end of the world, but it is a bit concerning. However
44+
// it's unclear if systemd removes all eBPF programs attached when
45+
// doing SetUnitProperties...
46+
deviceProperties, err := generateDeviceProperties(c.Resources.Devices)
47+
if err != nil {
48+
return nil, err
49+
}
50+
properties = append(properties, deviceProperties...)
51+
4052
if c.Resources.Memory != 0 {
4153
properties = append(properties,
4254
newProp("MemoryMax", uint64(c.Resources.Memory)))
@@ -295,14 +307,41 @@ func (m *unifiedManager) Set(container *configs.Config) error {
295307
if err != nil {
296308
return err
297309
}
310+
311+
// Figure out the current freezer state, so we can revert to it after we
312+
// temporarily freeze the container.
313+
targetFreezerState, err := m.GetFreezerState()
314+
if err != nil {
315+
return err
316+
}
317+
if targetFreezerState == configs.Undefined {
318+
targetFreezerState = configs.Thawed
319+
}
320+
321+
// We have to freeze the container while systemd sets the cgroup settings.
322+
// The reason for this is that systemd's application of DeviceAllow rules
323+
// is done disruptively, resulting in spurrious errors to common devices
324+
// (unlike our fs driver, they will happily write deny-all rules to running
325+
// containers). So we freeze the container to avoid them hitting the cgroup
326+
// error. But if the freezer cgroup isn't supported, we just warn about it.
327+
if err := m.Freeze(configs.Frozen); err != nil {
328+
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
329+
}
330+
298331
dbusConnection, err := getDbusConnection(m.rootless)
299332
if err != nil {
333+
_ = m.Freeze(targetFreezerState)
300334
return err
301335
}
302336
if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil {
337+
_ = m.Freeze(targetFreezerState)
303338
return errors.Wrap(err, "error while setting unit properties")
304339
}
305340

341+
// Reset freezer state before we apply the configuration, to avoid clashing
342+
// with the freezer setting in the configuration.
343+
_ = m.Freeze(targetFreezerState)
344+
306345
fsMgr, err := m.fsManager()
307346
if err != nil {
308347
return err

0 commit comments

Comments
 (0)