Skip to content

Commit 8f502d5

Browse files
committed
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull user namespace mount fixes from Eric Biederman: "Way back in October Andrey Vagin reported that umount(MNT_DETACH) could be used to defeat MNT_LOCKED. As I worked to fix this I discovered that combined with mount propagation and an appropriate selection of shared subtrees a reference to a directory on an unmounted filesystem is not necessary. That MNT_DETACH is allowed in user namespace in a form that can break MNT_LOCKED comes from my early misunderstanding what MNT_DETACH does. To avoid breaking existing userspace the conflict between MNT_DETACH and MNT_LOCKED is fixed by leaving mounts that are locked to their parents in the mount hash table until the last reference goes away. While investigating this issue I also found an issue with __detach_mounts. The code was unnecessarily and incorrectly triggering mount propagation. Resulting in too many mounts going away when a directory is deleted, and too many cpu cycles are burned while doing that. Looking some more I realized that __detach_mounts by only keeping mounts connected that were MNT_LOCKED it had the potential to still leak information so I tweaked the code to keep everything locked together that possibly could be. This code was almost ready last cycle but Al invented fs_pin which slightly simplifies this code but required rewrites and retesting, and I have not been in top form for a while so it took me a while to get all of that done. Similarly this pull request is late because I have been feeling absolutely miserable all week. The issue of being able to escape a bind mount has not yet been addressed, as the fixes are not yet mature" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: mnt: Update detach_mounts to leave mounts connected mnt: Fix the error check in __detach_mounts mnt: Honor MNT_LOCKED when detaching mounts fs_pin: Allow for the possibility that m_list or s_list go unused. 
mnt: Factor umount_mnt from umount_tree mnt: Factor out unhash_mnt from detach_mnt and umount_tree mnt: Fail collect_mounts when applied to unmounted mounts mnt: Don't propagate unmounts to locked mounts mnt: On an unmount propagate clearing of MNT_LOCKED mnt: Delay removal from the mount hash. mnt: Add MNT_UMOUNT flag mnt: In umount_tree reuse mnt_list instead of mnt_hash mnt: Don't propagate umounts in __detach_mounts mnt: Improve the umount_tree flags mnt: Use hlist_move_list in namespace_unlock
2 parents 06a60de + e0c9c0a commit 8f502d5

6 files changed

Lines changed: 159 additions & 57 deletions

File tree

fs/fs_pin.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ static DEFINE_SPINLOCK(pin_lock);
99
void pin_remove(struct fs_pin *pin)
1010
{
1111
spin_lock(&pin_lock);
12-
hlist_del(&pin->m_list);
13-
hlist_del(&pin->s_list);
12+
hlist_del_init(&pin->m_list);
13+
hlist_del_init(&pin->s_list);
1414
spin_unlock(&pin_lock);
1515
spin_lock_irq(&pin->wait.lock);
1616
pin->done = 1;

fs/namespace.c

Lines changed: 96 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -632,14 +632,17 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
632632
*/
633633
struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
634634
{
635-
struct mount *p, *res;
636-
res = p = __lookup_mnt(mnt, dentry);
635+
struct mount *p, *res = NULL;
636+
p = __lookup_mnt(mnt, dentry);
637637
if (!p)
638638
goto out;
639+
if (!(p->mnt.mnt_flags & MNT_UMOUNT))
640+
res = p;
639641
hlist_for_each_entry_continue(p, mnt_hash) {
640642
if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
641643
break;
642-
res = p;
644+
if (!(p->mnt.mnt_flags & MNT_UMOUNT))
645+
res = p;
643646
}
644647
out:
645648
return res;
@@ -795,10 +798,8 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
795798
/*
796799
* vfsmount lock must be held for write
797800
*/
798-
static void detach_mnt(struct mount *mnt, struct path *old_path)
801+
static void unhash_mnt(struct mount *mnt)
799802
{
800-
old_path->dentry = mnt->mnt_mountpoint;
801-
old_path->mnt = &mnt->mnt_parent->mnt;
802803
mnt->mnt_parent = mnt;
803804
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
804805
list_del_init(&mnt->mnt_child);
@@ -808,6 +809,26 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
808809
mnt->mnt_mp = NULL;
809810
}
810811

812+
/*
813+
* vfsmount lock must be held for write
814+
*/
815+
static void detach_mnt(struct mount *mnt, struct path *old_path)
816+
{
817+
old_path->dentry = mnt->mnt_mountpoint;
818+
old_path->mnt = &mnt->mnt_parent->mnt;
819+
unhash_mnt(mnt);
820+
}
821+
822+
/*
823+
* vfsmount lock must be held for write
824+
*/
825+
static void umount_mnt(struct mount *mnt)
826+
{
827+
/* old mountpoint will be dropped when we can do that */
828+
mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
829+
unhash_mnt(mnt);
830+
}
831+
811832
/*
812833
* vfsmount lock must be held for write
813834
*/
@@ -1078,6 +1099,13 @@ static void mntput_no_expire(struct mount *mnt)
10781099
rcu_read_unlock();
10791100

10801101
list_del(&mnt->mnt_instance);
1102+
1103+
if (unlikely(!list_empty(&mnt->mnt_mounts))) {
1104+
struct mount *p, *tmp;
1105+
list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1106+
umount_mnt(p);
1107+
}
1108+
}
10811109
unlock_mount_hash();
10821110

10831111
if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
@@ -1298,17 +1326,15 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
12981326

12991327
static void namespace_unlock(void)
13001328
{
1301-
struct hlist_head head = unmounted;
1329+
struct hlist_head head;
13021330

1303-
if (likely(hlist_empty(&head))) {
1304-
up_write(&namespace_sem);
1305-
return;
1306-
}
1331+
hlist_move_list(&unmounted, &head);
13071332

1308-
head.first->pprev = &head.first;
1309-
INIT_HLIST_HEAD(&unmounted);
13101333
up_write(&namespace_sem);
13111334

1335+
if (likely(hlist_empty(&head)))
1336+
return;
1337+
13121338
synchronize_rcu();
13131339

13141340
group_pin_kill(&head);
@@ -1319,49 +1345,63 @@ static inline void namespace_lock(void)
13191345
down_write(&namespace_sem);
13201346
}
13211347

1348+
enum umount_tree_flags {
1349+
UMOUNT_SYNC = 1,
1350+
UMOUNT_PROPAGATE = 2,
1351+
UMOUNT_CONNECTED = 4,
1352+
};
13221353
/*
13231354
* mount_lock must be held
13241355
* namespace_sem must be held for write
1325-
* how = 0 => just this tree, don't propagate
1326-
* how = 1 => propagate; we know that nobody else has reference to any victims
1327-
* how = 2 => lazy umount
13281356
*/
1329-
void umount_tree(struct mount *mnt, int how)
1357+
static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
13301358
{
1331-
HLIST_HEAD(tmp_list);
1359+
LIST_HEAD(tmp_list);
13321360
struct mount *p;
13331361

1362+
if (how & UMOUNT_PROPAGATE)
1363+
propagate_mount_unlock(mnt);
1364+
1365+
/* Gather the mounts to umount */
13341366
for (p = mnt; p; p = next_mnt(p, mnt)) {
1335-
hlist_del_init_rcu(&p->mnt_hash);
1336-
hlist_add_head(&p->mnt_hash, &tmp_list);
1367+
p->mnt.mnt_flags |= MNT_UMOUNT;
1368+
list_move(&p->mnt_list, &tmp_list);
13371369
}
13381370

1339-
hlist_for_each_entry(p, &tmp_list, mnt_hash)
1371+
/* Hide the mounts from mnt_mounts */
1372+
list_for_each_entry(p, &tmp_list, mnt_list) {
13401373
list_del_init(&p->mnt_child);
1374+
}
13411375

1342-
if (how)
1376+
/* Add propogated mounts to the tmp_list */
1377+
if (how & UMOUNT_PROPAGATE)
13431378
propagate_umount(&tmp_list);
13441379

1345-
while (!hlist_empty(&tmp_list)) {
1346-
p = hlist_entry(tmp_list.first, struct mount, mnt_hash);
1347-
hlist_del_init_rcu(&p->mnt_hash);
1380+
while (!list_empty(&tmp_list)) {
1381+
bool disconnect;
1382+
p = list_first_entry(&tmp_list, struct mount, mnt_list);
13481383
list_del_init(&p->mnt_expire);
13491384
list_del_init(&p->mnt_list);
13501385
__touch_mnt_namespace(p->mnt_ns);
13511386
p->mnt_ns = NULL;
1352-
if (how < 2)
1387+
if (how & UMOUNT_SYNC)
13531388
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
13541389

1355-
pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted);
1390+
disconnect = !(((how & UMOUNT_CONNECTED) &&
1391+
mnt_has_parent(p) &&
1392+
(p->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) ||
1393+
IS_MNT_LOCKED_AND_LAZY(p));
1394+
1395+
pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
1396+
disconnect ? &unmounted : NULL);
13561397
if (mnt_has_parent(p)) {
1357-
hlist_del_init(&p->mnt_mp_list);
1358-
put_mountpoint(p->mnt_mp);
13591398
mnt_add_count(p->mnt_parent, -1);
1360-
/* old mountpoint will be dropped when we can do that */
1361-
p->mnt_ex_mountpoint = p->mnt_mountpoint;
1362-
p->mnt_mountpoint = p->mnt.mnt_root;
1363-
p->mnt_parent = p;
1364-
p->mnt_mp = NULL;
1399+
if (!disconnect) {
1400+
/* Don't forget about p */
1401+
list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
1402+
} else {
1403+
umount_mnt(p);
1404+
}
13651405
}
13661406
change_mnt_propagation(p, MS_PRIVATE);
13671407
}
@@ -1447,14 +1487,14 @@ static int do_umount(struct mount *mnt, int flags)
14471487

14481488
if (flags & MNT_DETACH) {
14491489
if (!list_empty(&mnt->mnt_list))
1450-
umount_tree(mnt, 2);
1490+
umount_tree(mnt, UMOUNT_PROPAGATE);
14511491
retval = 0;
14521492
} else {
14531493
shrink_submounts(mnt);
14541494
retval = -EBUSY;
14551495
if (!propagate_mount_busy(mnt, 2)) {
14561496
if (!list_empty(&mnt->mnt_list))
1457-
umount_tree(mnt, 1);
1497+
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
14581498
retval = 0;
14591499
}
14601500
}
@@ -1480,13 +1520,20 @@ void __detach_mounts(struct dentry *dentry)
14801520

14811521
namespace_lock();
14821522
mp = lookup_mountpoint(dentry);
1483-
if (!mp)
1523+
if (IS_ERR_OR_NULL(mp))
14841524
goto out_unlock;
14851525

14861526
lock_mount_hash();
14871527
while (!hlist_empty(&mp->m_list)) {
14881528
mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1489-
umount_tree(mnt, 2);
1529+
if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
1530+
struct mount *p, *tmp;
1531+
list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1532+
hlist_add_head(&p->mnt_umount.s_list, &unmounted);
1533+
umount_mnt(p);
1534+
}
1535+
}
1536+
else umount_tree(mnt, UMOUNT_CONNECTED);
14901537
}
14911538
unlock_mount_hash();
14921539
put_mountpoint(mp);
@@ -1648,7 +1695,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
16481695
out:
16491696
if (res) {
16501697
lock_mount_hash();
1651-
umount_tree(res, 0);
1698+
umount_tree(res, UMOUNT_SYNC);
16521699
unlock_mount_hash();
16531700
}
16541701
return q;
@@ -1660,8 +1707,11 @@ struct vfsmount *collect_mounts(struct path *path)
16601707
{
16611708
struct mount *tree;
16621709
namespace_lock();
1663-
tree = copy_tree(real_mount(path->mnt), path->dentry,
1664-
CL_COPY_ALL | CL_PRIVATE);
1710+
if (!check_mnt(real_mount(path->mnt)))
1711+
tree = ERR_PTR(-EINVAL);
1712+
else
1713+
tree = copy_tree(real_mount(path->mnt), path->dentry,
1714+
CL_COPY_ALL | CL_PRIVATE);
16651715
namespace_unlock();
16661716
if (IS_ERR(tree))
16671717
return ERR_CAST(tree);
@@ -1672,7 +1722,7 @@ void drop_collected_mounts(struct vfsmount *mnt)
16721722
{
16731723
namespace_lock();
16741724
lock_mount_hash();
1675-
umount_tree(real_mount(mnt), 0);
1725+
umount_tree(real_mount(mnt), UMOUNT_SYNC);
16761726
unlock_mount_hash();
16771727
namespace_unlock();
16781728
}
@@ -1855,7 +1905,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
18551905
out_cleanup_ids:
18561906
while (!hlist_empty(&tree_list)) {
18571907
child = hlist_entry(tree_list.first, struct mount, mnt_hash);
1858-
umount_tree(child, 0);
1908+
umount_tree(child, UMOUNT_SYNC);
18591909
}
18601910
unlock_mount_hash();
18611911
cleanup_group_ids(source_mnt, NULL);
@@ -2035,7 +2085,7 @@ static int do_loopback(struct path *path, const char *old_name,
20352085
err = graft_tree(mnt, parent, mp);
20362086
if (err) {
20372087
lock_mount_hash();
2038-
umount_tree(mnt, 0);
2088+
umount_tree(mnt, UMOUNT_SYNC);
20392089
unlock_mount_hash();
20402090
}
20412091
out2:
@@ -2406,7 +2456,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
24062456
while (!list_empty(&graveyard)) {
24072457
mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
24082458
touch_mnt_namespace(mnt->mnt_ns);
2409-
umount_tree(mnt, 1);
2459+
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
24102460
}
24112461
unlock_mount_hash();
24122462
namespace_unlock();
@@ -2477,7 +2527,7 @@ static void shrink_submounts(struct mount *mnt)
24772527
m = list_first_entry(&graveyard, struct mount,
24782528
mnt_expire);
24792529
touch_mnt_namespace(m->mnt_ns);
2480-
umount_tree(m, 1);
2530+
umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
24812531
}
24822532
}
24832533
}

fs/pnode.c

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,46 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
361361
return ret;
362362
}
363363

364+
/*
365+
* Clear MNT_LOCKED when it can be shown to be safe.
366+
*
367+
* mount_lock lock must be held for write
368+
*/
369+
void propagate_mount_unlock(struct mount *mnt)
370+
{
371+
struct mount *parent = mnt->mnt_parent;
372+
struct mount *m, *child;
373+
374+
BUG_ON(parent == mnt);
375+
376+
for (m = propagation_next(parent, parent); m;
377+
m = propagation_next(m, parent)) {
378+
child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
379+
if (child)
380+
child->mnt.mnt_flags &= ~MNT_LOCKED;
381+
}
382+
}
383+
384+
/*
385+
* Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
386+
*/
387+
static void mark_umount_candidates(struct mount *mnt)
388+
{
389+
struct mount *parent = mnt->mnt_parent;
390+
struct mount *m;
391+
392+
BUG_ON(parent == mnt);
393+
394+
for (m = propagation_next(parent, parent); m;
395+
m = propagation_next(m, parent)) {
396+
struct mount *child = __lookup_mnt_last(&m->mnt,
397+
mnt->mnt_mountpoint);
398+
if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
399+
SET_MNT_MARK(child);
400+
}
401+
}
402+
}
403+
364404
/*
365405
* NOTE: unmounting 'mnt' naturally propagates to all other mounts its
366406
* parent propagates to.
@@ -378,13 +418,16 @@ static void __propagate_umount(struct mount *mnt)
378418
struct mount *child = __lookup_mnt_last(&m->mnt,
379419
mnt->mnt_mountpoint);
380420
/*
381-
* umount the child only if the child has no
382-
* other children
421+
* umount the child only if the child has no children
422+
* and the child is marked safe to unmount.
383423
*/
384-
if (child && list_empty(&child->mnt_mounts)) {
424+
if (!child || !IS_MNT_MARKED(child))
425+
continue;
426+
CLEAR_MNT_MARK(child);
427+
if (list_empty(&child->mnt_mounts)) {
385428
list_del_init(&child->mnt_child);
386-
hlist_del_init_rcu(&child->mnt_hash);
387-
hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
429+
child->mnt.mnt_flags |= MNT_UMOUNT;
430+
list_move_tail(&child->mnt_list, &mnt->mnt_list);
388431
}
389432
}
390433
}
@@ -396,11 +439,14 @@ static void __propagate_umount(struct mount *mnt)
396439
*
397440
* vfsmount lock must be held for write
398441
*/
399-
int propagate_umount(struct hlist_head *list)
442+
int propagate_umount(struct list_head *list)
400443
{
401444
struct mount *mnt;
402445

403-
hlist_for_each_entry(mnt, list, mnt_hash)
446+
list_for_each_entry_reverse(mnt, list, mnt_list)
447+
mark_umount_candidates(mnt);
448+
449+
list_for_each_entry(mnt, list, mnt_list)
404450
__propagate_umount(mnt);
405451
return 0;
406452
}

0 commit comments

Comments
 (0)