Skip to content

Commit 96bedbe

Browse files
committed
nspawn: replace syscall blacklist by a whitelist
Let's lock things down a bit, and maintain a list of what's permitted rather than a list of what's prohibited in nspawn (also to make things a bit more like Docker and friends). Note that this slightly alters the effect of --system-call-filter=, as the negative list now takes precedence over the positive list. However, given that the option is just a few days old and not included in any released version it should be fine to change it at this point in time. Note that the whitelist is a good chunk more restrictive than the previous blacklist. Specifically: - fanotify is not permitted (given the buffer size issues it's problematic in containers) - nfsservctl is not permitted (NFS server support is not virtualized) - pkey_xyz stuff is not permitted (really new stuff I don't grok) - @cpu-emulation is prohibited (untested legacy stuff mostly, and if people really want to run dosemu in nspawn, they should use --system-call-filter=@cpu-emulation and all should be good)
1 parent cff7bff commit 96bedbe

2 files changed

Lines changed: 159 additions & 40 deletions

File tree

man/systemd-nspawn.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -723,9 +723,9 @@
723723
system calls will be permitted. The list may optionally be prefixed by <literal>~</literal>, in which case all
724724
listed system calls are prohibited. If this command line option is used multiple times the configured lists are
725725
combined. If both a positive and a negative list (that is one system call list without and one with the
726-
<literal>~</literal> prefix) are configured, the positive list takes precedence over the negative list. Note
727-
that <command>systemd-nspawn</command> always implements a system call blacklist (as opposed to a whitelist),
728-
and this command line option hence adds or removes entries from the default blacklist, depending on the
726+
<literal>~</literal> prefix) are configured, the negative list takes precedence over the positive list. Note
727+
that <command>systemd-nspawn</command> always implements a system call whitelist (as opposed to a blacklist),
728+
and this command line option hence adds or removes entries from the default whitelist, depending on the
729729
<literal>~</literal> prefix. Note that the applied system call filter is also altered implicitly if additional
730730
capabilities are passed using the <command>--capabilities=</command>.</para></listitem>
731731
</varlistentry>

src/nspawn/nspawn-seccomp.c

Lines changed: 156 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -47,47 +47,154 @@ static int seccomp_add_default_syscall_filter(
4747
static const struct {
4848
uint64_t capability;
4949
const char* name;
50-
} blacklist[] = {
51-
{ 0, "@obsolete" },
52-
{ 0, "@keyring" }, /* keyring is not namespaced */
53-
{ 0, "bpf" },
54-
{ 0, "kexec_file_load" },
55-
{ 0, "kexec_load" },
56-
{ 0, "lookup_dcookie" },
57-
{ 0, "open_by_handle_at" },
58-
{ 0, "perf_event_open" },
59-
{ 0, "quotactl" },
60-
{ 0, "@swap" },
61-
{ CAP_SYSLOG, "syslog" },
62-
{ CAP_SYS_MODULE, "@module" },
63-
{ CAP_SYS_PACCT, "acct" },
64-
{ CAP_SYS_PTRACE, "process_vm_readv" },
65-
{ CAP_SYS_PTRACE, "process_vm_writev" },
66-
{ CAP_SYS_PTRACE, "ptrace" },
67-
{ CAP_SYS_RAWIO, "@raw-io" },
68-
{ CAP_SYS_TIME, "@clock" },
50+
} whitelist[] = {
51+
/* Let's use set names where we can */
52+
{ 0, "@basic-io" },
53+
{ 0, "@credentials" },
54+
{ 0, "@default" },
55+
{ 0, "@file-system" },
56+
{ 0, "@io-event" },
57+
{ 0, "@ipc" },
58+
{ 0, "@mount" },
59+
{ 0, "@network-io" },
60+
{ 0, "@process" },
61+
{ 0, "@resources" },
62+
{ 0, "@setuid" },
63+
{ 0, "@signal" },
64+
{ 0, "@timer" },
65+
66+
/* The following four are sets we optionally enable, in case the caps have been configured for it */
67+
{ CAP_SYS_TIME, "@clock" },
68+
{ CAP_SYS_MODULE, "@module" },
69+
{ CAP_SYS_RAWIO, "@raw-io" },
70+
{ CAP_IPC_LOCK, "@memlock" },
71+
72+
/* Plus a good set of additional syscalls which are not part of any of the groups above */
73+
{ 0, "brk" },
74+
{ 0, "capset" },
75+
{ 0, "chown" },
76+
{ 0, "chown32" },
77+
{ 0, "copy_file_range" },
78+
{ 0, "fadvise64" },
79+
{ 0, "fadvise64_64" },
80+
{ 0, "fchown" },
81+
{ 0, "fchown32" },
82+
{ 0, "fchownat" },
83+
{ 0, "fdatasync" },
84+
{ 0, "flock" },
85+
{ 0, "fsync" },
86+
{ 0, "get_mempolicy" },
87+
{ 0, "getcpu" },
88+
{ 0, "getpriority" },
89+
{ 0, "getrandom" },
90+
{ 0, "io_cancel" },
91+
{ 0, "io_destroy" },
92+
{ 0, "io_getevents" },
93+
{ 0, "io_setup" },
94+
{ 0, "io_submit" },
95+
{ 0, "ioctl" },
96+
{ 0, "ioprio_get" },
97+
{ 0, "kcmp" },
98+
{ 0, "lchown" },
99+
{ 0, "lchown32" },
100+
{ 0, "madvise" },
101+
{ 0, "mincore" },
102+
{ 0, "mprotect" },
103+
{ 0, "mremap" },
104+
{ 0, "msync" },
105+
{ 0, "name_to_handle_at" },
106+
{ 0, "oldolduname" },
107+
{ 0, "olduname" },
108+
{ 0, "personality" },
109+
{ 0, "preadv2" },
110+
{ 0, "pwritev2" },
111+
{ 0, "readahead" },
112+
{ 0, "readdir" },
113+
{ 0, "remap_file_pages" },
114+
{ 0, "sched_get_priority_max" },
115+
{ 0, "sched_get_priority_min" },
116+
{ 0, "sched_getaffinity" },
117+
{ 0, "sched_getattr" },
118+
{ 0, "sched_getparam" },
119+
{ 0, "sched_getscheduler" },
120+
{ 0, "sched_rr_get_interval" },
121+
{ 0, "sched_yield" },
122+
{ 0, "seccomp" },
123+
{ 0, "sendfile" },
124+
{ 0, "sendfile64" },
125+
{ 0, "setdomainname" },
126+
{ 0, "setfsgid" },
127+
{ 0, "setfsgid32" },
128+
{ 0, "setfsuid" },
129+
{ 0, "setfsuid32" },
130+
{ 0, "sethostname" },
131+
{ 0, "setpgid" },
132+
{ 0, "setsid" },
133+
{ 0, "splice" },
134+
{ 0, "sync" },
135+
{ 0, "sync_file_range" },
136+
{ 0, "syncfs" },
137+
{ 0, "sysinfo" },
138+
{ 0, "tee" },
139+
{ 0, "ugetrlimit" },
140+
{ 0, "umask" },
141+
{ 0, "uname" },
142+
{ 0, "userfaultfd" },
143+
{ 0, "vmsplice" },
144+
145+
/* The following individual syscalls are added depending on specified caps */
146+
{ CAP_SYS_PACCT, "acct" },
147+
{ CAP_SYS_PTRACE, "process_vm_readv" },
148+
{ CAP_SYS_PTRACE, "process_vm_writev" },
149+
{ CAP_SYS_PTRACE, "ptrace" },
150+
{ CAP_SYS_BOOT, "reboot" },
151+
{ CAP_SYSLOG, "syslog" },
152+
{ CAP_SYS_TTY_CONFIG, "vhangup" },
153+
154+
/*
155+
* The following syscalls and groups are knowingly excluded:
156+
*
157+
* @cpu-emulation
158+
* @keyring (NB: keyring is not namespaced!)
159+
* @obsolete
160+
* @swap
161+
*
162+
* bpf (NB: bpffs is not namespaced!)
163+
* fanotify_init
164+
* fanotify_mark
165+
* kexec_file_load
166+
* kexec_load
167+
* lookup_dcookie
168+
* nfsservctl
169+
* open_by_handle_at
170+
* perf_event_open
171+
* pkey_alloc
172+
* pkey_free
173+
* pkey_mprotect
174+
* quotactl
175+
*/
69176
};
70177

71178
int r, c = 0;
72179
size_t i;
73180
char **p;
74181

75-
for (i = 0; i < ELEMENTSOF(blacklist); i++) {
76-
if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
182+
for (i = 0; i < ELEMENTSOF(whitelist); i++) {
183+
if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0)
77184
continue;
78185

79-
r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
186+
r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist);
80187
if (r < 0)
81188
/* If the system call is not known on this architecture, then that's fine, let's ignore it */
82-
log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name);
189+
log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist[i].name, seccomp_arch_to_string(arch));
83190
else
84191
c++;
85192
}
86193

87-
STRV_FOREACH(p, syscall_blacklist) {
88-
r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
194+
STRV_FOREACH(p, syscall_whitelist) {
195+
r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist);
89196
if (r < 0)
90-
log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p);
197+
log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", *p, seccomp_arch_to_string(arch));
91198
else
92199
c++;
93200
}
@@ -106,17 +213,32 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys
106213

107214
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
108215
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
109-
int n;
110216

111-
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
217+
log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch));
112218

113-
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
219+
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM));
114220
if (r < 0)
115221
return log_error_errno(r, "Failed to allocate seccomp object: %m");
116222

117-
n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
118-
if (n < 0)
119-
return n;
223+
r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
224+
if (r < 0)
225+
return r;
226+
227+
r = seccomp_load(seccomp);
228+
if (IN_SET(r, -EPERM, -EACCES))
229+
return log_error_errno(r, "Failed to install seccomp filter: %m");
230+
if (r < 0)
231+
log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
232+
}
233+
234+
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
235+
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
236+
237+
log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch));
238+
239+
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
240+
if (r < 0)
241+
return log_error_errno(r, "Failed to allocate seccomp object: %m");
120242

121243
/*
122244
Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
@@ -133,13 +255,10 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys
133255
2,
134256
SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
135257
SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
136-
if (r < 0)
258+
if (r < 0) {
137259
log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m");
138-
else
139-
n++;
140-
141-
if (n <= 0) /* no rule added? then skip this architecture */
142260
continue;
261+
}
143262

144263
r = seccomp_load(seccomp);
145264
if (IN_SET(r, -EPERM, -EACCES))

0 commit comments

Comments
 (0)