-
-
Notifications
You must be signed in to change notification settings - Fork 4.3k
nspawn: R/W support for /sysfs, /proc, and /proc/sys/net #4395
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -225,9 +225,10 @@ static int tmpfs_patch_options( | |
| return !!buf; | ||
| } | ||
|
|
||
| int mount_sysfs(const char *dest) { | ||
| int mount_sysfs(const char *dest, MountSettingsMask mount_settings) { | ||
| const char *full, *top, *x; | ||
| int r; | ||
| unsigned long extra_flags = 0; | ||
|
|
||
| top = prefix_roota(dest, "/sys"); | ||
| r = path_check_fstype(top, SYSFS_MAGIC); | ||
|
|
@@ -244,8 +245,11 @@ int mount_sysfs(const char *dest) { | |
|
|
||
| (void) mkdir(full, 0755); | ||
|
|
||
| if (mount_settings & MOUNT_APPLY_APIVFS_RO) | ||
| extra_flags |= MS_RDONLY; | ||
|
|
||
| r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs", | ||
| MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); | ||
| MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL); | ||
| if (r < 0) | ||
| return r; | ||
|
|
||
|
|
@@ -267,7 +271,7 @@ int mount_sysfs(const char *dest) { | |
| return r; | ||
|
|
||
| r = mount_verbose(LOG_ERR, NULL, to, NULL, | ||
| MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL); | ||
| MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL); | ||
| if (r < 0) | ||
| return r; | ||
| } | ||
|
|
@@ -291,7 +295,7 @@ int mount_sysfs(const char *dest) { | |
| } | ||
|
|
||
| return mount_verbose(LOG_ERR, NULL, top, NULL, | ||
| MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL); | ||
| MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL); | ||
| } | ||
|
|
||
| static int mkdir_userns(const char *path, mode_t mode, bool in_userns, uid_t uid_shift) { | ||
|
|
@@ -348,8 +352,7 @@ static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, boo | |
| } | ||
|
|
||
| int mount_all(const char *dest, | ||
| bool use_userns, bool in_userns, | ||
| bool use_netns, | ||
| MountSettingsMask mount_settings, | ||
| uid_t uid_shift, uid_t uid_range, | ||
| const char *selinux_apifs_context) { | ||
|
|
||
|
|
@@ -359,41 +362,52 @@ int mount_all(const char *dest, | |
| const char *type; | ||
| const char *options; | ||
| unsigned long flags; | ||
| bool fatal; | ||
| bool in_userns; | ||
| bool use_netns; | ||
| MountSettingsMask mount_settings; | ||
| } MountPoint; | ||
|
|
||
| static const MountPoint mount_table[] = { | ||
| { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false }, | ||
| { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first ...*/ | ||
| { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, true, true, true }, /* (except for this) */ | ||
| { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* ... then, make it r/o */ | ||
| { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, false, true, false }, /* Bind mount first ...*/ | ||
| { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, true, false }, /* ... then, make it r/o */ | ||
| { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true }, | ||
| { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false }, | ||
| { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false }, | ||
| { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false }, | ||
| { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false }, | ||
| { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, true, false }, | ||
| /* inner child mounts */ | ||
| { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS }, | ||
| { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ...*/ | ||
| { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */ | ||
| { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */ | ||
| { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ...*/ | ||
| { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */ | ||
| { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, MOUNT_FATAL|MOUNT_IN_USERNS }, | ||
|
|
||
| /* outer child mounts */ | ||
| { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS }, | ||
| { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */ | ||
| { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */ | ||
|
|
||
| { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL }, | ||
| { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL }, | ||
| { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL }, | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This mount table looked, and looks pretty complicated, but I didn't have a better idea on how to refactor this to be more readable. |
||
| #ifdef HAVE_SELINUX | ||
| { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */ | ||
| { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */ | ||
| { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */ | ||
| { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */ | ||
| #endif | ||
| }; | ||
|
|
||
| unsigned k; | ||
| int r; | ||
| bool use_userns = (mount_settings & MOUNT_USE_USERNS); | ||
| bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS); | ||
| bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO); | ||
| bool in_userns = (mount_settings & MOUNT_IN_USERNS); | ||
|
|
||
| for (k = 0; k < ELEMENTSOF(mount_table); k++) { | ||
| _cleanup_free_ char *where = NULL, *options = NULL; | ||
| const char *o; | ||
| bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL); | ||
|
|
||
| if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS)) | ||
| continue; | ||
|
|
||
| if (in_userns != mount_table[k].in_userns) | ||
| if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS)) | ||
| continue; | ||
|
|
||
| if (!use_netns && mount_table[k].use_netns) | ||
| if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO)) | ||
| continue; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A helper variable would make the intent clear: |
||
|
|
||
| where = prefix_root(dest, mount_table[k].where); | ||
|
|
@@ -410,7 +424,7 @@ int mount_all(const char *dest, | |
|
|
||
| r = mkdir_userns_p(dest, where, 0755, in_userns, uid_shift); | ||
| if (r < 0 && r != -EEXIST) { | ||
| if (mount_table[k].fatal) | ||
| if (fatal) | ||
| return log_error_errno(r, "Failed to create directory %s: %m", where); | ||
|
|
||
| log_debug_errno(r, "Failed to create directory %s: %m", where); | ||
|
|
@@ -429,13 +443,13 @@ int mount_all(const char *dest, | |
| o = options; | ||
| } | ||
|
|
||
| r = mount_verbose(mount_table[k].fatal ? LOG_ERR : LOG_DEBUG, | ||
| r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG, | ||
| mount_table[k].what, | ||
| where, | ||
| mount_table[k].type, | ||
| mount_table[k].flags, | ||
| o); | ||
| if (r < 0 && mount_table[k].fatal) | ||
| if (r < 0 && fatal) | ||
| return r; | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -195,6 +195,7 @@ static const char *arg_container_service_name = "systemd-nspawn"; | |
| static bool arg_notify_ready = false; | ||
| static bool arg_use_cgns = true; | ||
| static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS; | ||
| static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO; | ||
|
|
||
| static void help(void) { | ||
| printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n" | ||
|
|
@@ -378,6 +379,31 @@ static void parse_share_ns_env(const char *name, unsigned long ns_flag) { | |
| arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag); | ||
| } | ||
|
|
||
| static void parse_mount_settings_env(void) { | ||
| int r; | ||
| const char *e; | ||
|
|
||
| e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE"); | ||
| if (!e) | ||
| return; | ||
|
|
||
| if (streq(e, "network")) { | ||
| arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS; | ||
| return; | ||
| } | ||
|
|
||
| r = parse_boolean(e); | ||
| if (r < 0) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not that it matters spurious newline... we try to keep functions and their error handling together... |
||
| log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring."); | ||
| return; | ||
| } else if (r > 0) | ||
| arg_mount_settings &= ~MOUNT_APPLY_APIVFS_RO; | ||
| else | ||
| arg_mount_settings |= MOUNT_APPLY_APIVFS_RO; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this should turn off MOUNT_APPLY_APIVFS_NETNS here, no? (also, no {} around single-line if blocks, see CODING_STYLE)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually in this case no. Even if the user specifies
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why that? why shouldn't the env var setting override the implicit automatic logic? I'd claim if people use --private-network in conjunction with SYSTEMD_NSPAWN_API_VFS_WRITABLE=false then that's their own fault, no?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This breaks the backward compatibility. No?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, both of you have a point ;-)
After rethinking I am leaning towards 1. because we then have the control of the target read-write settings at the expense of a possible user surprise.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @s-urbaniak , I prefer to not break the backward compatibility (for no reason).
@s-urbaniak , @poettering , what do you think?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
As an user I don't really know what is undocumented env var
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's leave the final word to @poettering :-) I am fine with the current (backwards compatible) implementation, and also with the one suggested by @poettering. Looking forward to land this! :-)
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure I follow the discussion now. I think the code should be backwards compat when the env var is not set, but do exactly what the env var says when it is set. That means: if the env var is not set then we mount /proc/sys and /sys read-only by default, except for /proc/sys/net — but the latter only if private networking is on. However, if the env var is set to yes, no or network, then the private networking setting should not have any effect anymore, and what the user asked for should take effect. That means: if set to "yes", then all of /proc/sys and /sys should be writable. If set to "no", then nothing of it should be writable, and if set to "network", then /proc/sys + /sys should be read-only, but /proc/sys/net should be writable. Or in other words: I think the env var should turn off any kind of automatism, and permit the caller to force the invocation into a certain mode. And thus I think the MOUNT_APPLY_APIVFS_NETNS flag should be turned off in this branch... Or am I missing something?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @poettering your assertion is correct, I updated the PR according to your spec, and the commit message replacing /sysfs with /sys (doh). |
||
|
|
||
| arg_mount_settings &= ~MOUNT_APPLY_APIVFS_NETNS; | ||
| } | ||
|
|
||
| static int parse_argv(int argc, char *argv[]) { | ||
|
|
||
| enum { | ||
|
|
@@ -1070,6 +1096,14 @@ static int parse_argv(int argc, char *argv[]) { | |
| parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS); | ||
| parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS); | ||
|
|
||
| if (arg_userns_mode != USER_NAMESPACE_NO) | ||
| arg_mount_settings |= MOUNT_USE_USERNS; | ||
|
|
||
| if (arg_private_network) | ||
| arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS; | ||
|
|
||
| parse_mount_settings_env(); | ||
|
|
||
| if (!(arg_clone_ns_flags & CLONE_NEWPID) || | ||
| !(arg_clone_ns_flags & CLONE_NEWUTS)) { | ||
| arg_register = false; | ||
|
|
@@ -1164,6 +1198,15 @@ static int parse_argv(int argc, char *argv[]) { | |
| } | ||
|
|
||
| static int verify_arguments(void) { | ||
| if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) { | ||
| log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network."); | ||
| return -EINVAL; | ||
| } | ||
|
|
||
| if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) { | ||
| log_error("Cannot combine --private-users with read-write mounts."); | ||
| return -EINVAL; | ||
| } | ||
|
|
||
| if (arg_volatile_mode != VOLATILE_NO && arg_read_only) { | ||
| log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy."); | ||
|
|
@@ -2700,17 +2743,15 @@ static int inner_child( | |
| return log_error_errno(r, "Couldn't become new root: %m"); | ||
|
|
||
| r = mount_all(NULL, | ||
| arg_userns_mode != USER_NAMESPACE_NO, | ||
| true, | ||
| arg_private_network, | ||
| arg_mount_settings | MOUNT_IN_USERNS, | ||
| arg_uid_shift, | ||
| arg_uid_range, | ||
| arg_selinux_apifs_context); | ||
|
|
||
| if (r < 0) | ||
| return r; | ||
|
|
||
| r = mount_sysfs(NULL); | ||
| r = mount_sysfs(NULL, arg_mount_settings); | ||
| if (r < 0) | ||
| return r; | ||
|
|
||
|
|
@@ -3077,9 +3118,7 @@ static int outer_child( | |
| } | ||
|
|
||
| r = mount_all(directory, | ||
| arg_userns_mode != USER_NAMESPACE_NO, | ||
| false, | ||
| arg_private_network, | ||
| arg_mount_settings, | ||
| arg_uid_shift, | ||
| arg_uid_range, | ||
| arg_selinux_apifs_context); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, this table is finally legible.