@@ -47,47 +47,154 @@ static int seccomp_add_default_syscall_filter(
4747 static const struct {
4848 uint64_t capability ;
4949 const char * name ;
50- } blacklist [] = {
51- { 0 , "@obsolete" },
52- { 0 , "@keyring" }, /* keyring is not namespaced */
53- { 0 , "bpf" },
54- { 0 , "kexec_file_load" },
55- { 0 , "kexec_load" },
56- { 0 , "lookup_dcookie" },
57- { 0 , "open_by_handle_at" },
58- { 0 , "perf_event_open" },
59- { 0 , "quotactl" },
60- { 0 , "@swap" },
61- { CAP_SYSLOG , "syslog" },
62- { CAP_SYS_MODULE , "@module" },
63- { CAP_SYS_PACCT , "acct" },
64- { CAP_SYS_PTRACE , "process_vm_readv" },
65- { CAP_SYS_PTRACE , "process_vm_writev" },
66- { CAP_SYS_PTRACE , "ptrace" },
67- { CAP_SYS_RAWIO , "@raw-io" },
68- { CAP_SYS_TIME , "@clock" },
50+ } whitelist [] = {
51+ /* Let's use set names where we can */
52+ { 0 , "@basic-io" },
53+ { 0 , "@credentials" },
54+ { 0 , "@default" },
55+ { 0 , "@file-system" },
56+ { 0 , "@io-event" },
57+ { 0 , "@ipc" },
58+ { 0 , "@mount" },
59+ { 0 , "@network-io" },
60+ { 0 , "@process" },
61+ { 0 , "@resources" },
62+ { 0 , "@setuid" },
63+ { 0 , "@signal" },
64+ { 0 , "@timer" },
65+
66+ /* The following four are sets we optionally enable, in case the caps have been configured for it */
67+ { CAP_SYS_TIME , "@clock" },
68+ { CAP_SYS_MODULE , "@module" },
69+ { CAP_SYS_RAWIO , "@raw-io" },
70+ { CAP_IPC_LOCK , "@memlock" },
71+
72+ /* Plus a good set of additional syscalls which are not part of any of the groups above */
73+ { 0 , "brk" },
74+ { 0 , "capset" },
75+ { 0 , "chown" },
76+ { 0 , "chown32" },
77+ { 0 , "copy_file_range" },
78+ { 0 , "fadvise64" },
79+ { 0 , "fadvise64_64" },
80+ { 0 , "fchown" },
81+ { 0 , "fchown32" },
82+ { 0 , "fchownat" },
83+ { 0 , "fdatasync" },
84+ { 0 , "flock" },
85+ { 0 , "fsync" },
86+ { 0 , "get_mempolicy" },
87+ { 0 , "getcpu" },
88+ { 0 , "getpriority" },
89+ { 0 , "getrandom" },
90+ { 0 , "io_cancel" },
91+ { 0 , "io_destroy" },
92+ { 0 , "io_getevents" },
93+ { 0 , "io_setup" },
94+ { 0 , "io_submit" },
95+ { 0 , "ioctl" },
96+ { 0 , "ioprio_get" },
97+ { 0 , "kcmp" },
98+ { 0 , "lchown" },
99+ { 0 , "lchown32" },
100+ { 0 , "madvise" },
101+ { 0 , "mincore" },
102+ { 0 , "mprotect" },
103+ { 0 , "mremap" },
104+ { 0 , "msync" },
105+ { 0 , "name_to_handle_at" },
106+ { 0 , "oldolduname" },
107+ { 0 , "olduname" },
108+ { 0 , "personality" },
109+ { 0 , "preadv2" },
110+ { 0 , "pwritev2" },
111+ { 0 , "readahead" },
112+ { 0 , "readdir" },
113+ { 0 , "remap_file_pages" },
114+ { 0 , "sched_get_priority_max" },
115+ { 0 , "sched_get_priority_min" },
116+ { 0 , "sched_getaffinity" },
117+ { 0 , "sched_getattr" },
118+ { 0 , "sched_getparam" },
119+ { 0 , "sched_getscheduler" },
120+ { 0 , "sched_rr_get_interval" },
121+ { 0 , "sched_yield" },
122+ { 0 , "seccomp" },
123+ { 0 , "sendfile" },
124+ { 0 , "sendfile64" },
125+ { 0 , "setdomainname" },
126+ { 0 , "setfsgid" },
127+ { 0 , "setfsgid32" },
128+ { 0 , "setfsuid" },
129+ { 0 , "setfsuid32" },
130+ { 0 , "sethostname" },
131+ { 0 , "setpgid" },
132+ { 0 , "setsid" },
133+ { 0 , "splice" },
134+ { 0 , "sync" },
135+ { 0 , "sync_file_range" },
136+ { 0 , "syncfs" },
137+ { 0 , "sysinfo" },
138+ { 0 , "tee" },
139+ { 0 , "ugetrlimit" },
140+ { 0 , "umask" },
141+ { 0 , "uname" },
142+ { 0 , "userfaultfd" },
143+ { 0 , "vmsplice" },
144+
145+ /* The following individual syscalls are added depending on specified caps */
146+ { CAP_SYS_PACCT , "acct" },
147+ { CAP_SYS_PTRACE , "process_vm_readv" },
148+ { CAP_SYS_PTRACE , "process_vm_writev" },
149+ { CAP_SYS_PTRACE , "ptrace" },
150+ { CAP_SYS_BOOT , "reboot" },
151+ { CAP_SYSLOG , "syslog" },
152+ { CAP_SYS_TTY_CONFIG , "vhangup" },
153+
154+ /*
155+ * The following syscalls and groups are knowingly excluded:
156+ *
157+ * @cpu-emulation
158+ * @keyring (NB: keyring is not namespaced!)
159+ * @obsolete
160+ * @swap
161+ *
162+ * bpf (NB: bpffs is not namespaced!)
163+ * fanotify_init
164+ * fanotify_mark
165+ * kexec_file_load
166+ * kexec_load
167+ * lookup_dcookie
168+ * nfsservctl
169+ * open_by_handle_at
170+ * perf_event_open
171+ * pkey_alloc
172+ * pkey_free
173+ * pkey_mprotect
174+ * quotactl
175+ */
69176 };
70177
71178 int r , c = 0 ;
72179 size_t i ;
73180 char * * p ;
74181
75- for (i = 0 ; i < ELEMENTSOF (blacklist ); i ++ ) {
76- if (blacklist [i ].capability != 0 && (cap_list_retain & (1ULL << blacklist [i ].capability )))
182+ for (i = 0 ; i < ELEMENTSOF (whitelist ); i ++ ) {
183+ if (whitelist [i ].capability != 0 && (cap_list_retain & (1ULL << whitelist [i ].capability )) == 0 )
77184 continue ;
78185
79- r = seccomp_add_syscall_filter_item (ctx , blacklist [i ].name , SCMP_ACT_ERRNO ( EPERM ), syscall_whitelist );
186+ r = seccomp_add_syscall_filter_item (ctx , whitelist [i ].name , SCMP_ACT_ALLOW , syscall_blacklist );
80187 if (r < 0 )
81188 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
82- log_debug_errno (r , "Failed to add rule for system call %s, ignoring: %m" , blacklist [i ].name );
189+ log_debug_errno (r , "Failed to add rule for system call %s on %s , ignoring: %m" , whitelist [i ].name , seccomp_arch_to_string ( arch ) );
83190 else
84191 c ++ ;
85192 }
86193
87- STRV_FOREACH (p , syscall_blacklist ) {
88- r = seccomp_add_syscall_filter_item (ctx , * p , SCMP_ACT_ERRNO ( EPERM ), syscall_whitelist );
194+ STRV_FOREACH (p , syscall_whitelist ) {
195+ r = seccomp_add_syscall_filter_item (ctx , * p , SCMP_ACT_ALLOW , syscall_blacklist );
89196 if (r < 0 )
90- log_debug_errno (r , "Failed to add rule for system call %s, ignoring: %m" , * p );
197+ log_debug_errno (r , "Failed to add rule for system call %s on %s , ignoring: %m" , * p , seccomp_arch_to_string ( arch ) );
91198 else
92199 c ++ ;
93200 }
@@ -106,17 +213,32 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys
106213
107214 SECCOMP_FOREACH_LOCAL_ARCH (arch ) {
108215 _cleanup_ (seccomp_releasep ) scmp_filter_ctx seccomp = NULL ;
109- int n ;
110216
111- log_debug ("Operating on architecture: %s" , seccomp_arch_to_string (arch ));
217+ log_debug ("Applying whitelist on architecture: %s" , seccomp_arch_to_string (arch ));
112218
113- r = seccomp_init_for_arch (& seccomp , arch , SCMP_ACT_ALLOW );
219+ r = seccomp_init_for_arch (& seccomp , arch , SCMP_ACT_ERRNO ( EPERM ) );
114220 if (r < 0 )
115221 return log_error_errno (r , "Failed to allocate seccomp object: %m" );
116222
117- n = seccomp_add_default_syscall_filter (seccomp , arch , cap_list_retain , syscall_whitelist , syscall_blacklist );
118- if (n < 0 )
119- return n ;
223+ r = seccomp_add_default_syscall_filter (seccomp , arch , cap_list_retain , syscall_whitelist , syscall_blacklist );
224+ if (r < 0 )
225+ return r ;
226+
227+ r = seccomp_load (seccomp );
228+ if (IN_SET (r , - EPERM , - EACCES ))
229+ return log_error_errno (r , "Failed to install seccomp filter: %m" );
230+ if (r < 0 )
231+ log_debug_errno (r , "Failed to install filter set for architecture %s, skipping: %m" , seccomp_arch_to_string (arch ));
232+ }
233+
234+ SECCOMP_FOREACH_LOCAL_ARCH (arch ) {
235+ _cleanup_ (seccomp_releasep ) scmp_filter_ctx seccomp = NULL ;
236+
237+ log_debug ("Applying NETLINK_AUDIT mask on architecture: %s" , seccomp_arch_to_string (arch ));
238+
239+ r = seccomp_init_for_arch (& seccomp , arch , SCMP_ACT_ALLOW );
240+ if (r < 0 )
241+ return log_error_errno (r , "Failed to allocate seccomp object: %m" );
120242
121243 /*
122244 Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
@@ -133,13 +255,10 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys
133255 2 ,
134256 SCMP_A0 (SCMP_CMP_EQ , AF_NETLINK ),
135257 SCMP_A2 (SCMP_CMP_EQ , NETLINK_AUDIT ));
136- if (r < 0 )
258+ if (r < 0 ) {
137259 log_debug_errno (r , "Failed to add audit seccomp rule, ignoring: %m" );
138- else
139- n ++ ;
140-
141- if (n <= 0 ) /* no rule added? then skip this architecture */
142260 continue ;
261+ }
143262
144263 r = seccomp_load (seccomp );
145264 if (IN_SET (r , - EPERM , - EACCES ))
0 commit comments