namespace.c 67.9 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11
/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *	Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
A
Al Viro 已提交
12
#include <linux/export.h>
13
#include <linux/capability.h>
14
#include <linux/mnt_namespace.h>
15
#include <linux/user_namespace.h>
L
Linus Torvalds 已提交
16 17
#include <linux/namei.h>
#include <linux/security.h>
18
#include <linux/idr.h>
A
Al Viro 已提交
19 20 21 22 23
#include <linux/acct.h>		/* acct_auto_close_mnt */
#include <linux/ramfs.h>	/* init_rootfs */
#include <linux/fs_struct.h>	/* get_fs_root et.al. */
#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
#include <linux/uaccess.h>
24
#include <linux/proc_fs.h>
25
#include "pnode.h"
26
#include "internal.h"
L
Linus Torvalds 已提交
27

E
Eric Dumazet 已提交
28 29 30
/*
 * HASH_SHIFT is log2 of the number of struct list_head entries that fit in
 * one page (ilog2 rounds down); HASH_SIZE is the resulting power-of-two
 * bucket count, which hash() uses as a mask via (HASH_SIZE - 1).
 */
#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
#define HASH_SIZE (1UL << HASH_SHIFT)

A
Al Viro 已提交
31
/* Global counter: touch_mnt_namespace() increments it and stores it in ns->event. */
static int event;
32
/* ID allocator from which mnt_alloc_id() fills in mount->mnt_id. */
static DEFINE_IDA(mnt_id_ida);
33
/* ID allocator from which mnt_alloc_group_id() fills in mount->mnt_group_id. */
static DEFINE_IDA(mnt_group_ida);
N
Nick Piggin 已提交
34
/* Held around mnt_id_ida updates in mnt_alloc_id() and mnt_free_id(). */
static DEFINE_SPINLOCK(mnt_id_lock);
35 36
/*
 * Lower bounds handed to ida_get_new_above(): each is moved just past the
 * most recently allocated ID and pulled back down when a lower ID is freed.
 */
static int mnt_id_start = 0;
static int mnt_group_start = 1;
L
Linus Torvalds 已提交
37

38
/* Array of list heads indexed by hash(); mounts are chained through mnt_hash. */
static struct list_head *mount_hashtable __read_mostly;
39
/* Slab cache that alloc_vfsmnt()/free_vfsmnt() use for struct mount objects. */
static struct kmem_cache *mnt_cache __read_mostly;
R
Ram Pai 已提交
40
/* Taken for read by the mounts seq_file iterator (m_start()/m_stop()). */
static struct rw_semaphore namespace_sem;
L
Linus Torvalds 已提交
41

M
Miklos Szeredi 已提交
42
/* /sys/fs */
43 44
/* Global kobject pointer, exported to GPL modules; it is not assigned in this part of the file. */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);
M
Miklos Szeredi 已提交
45

N
Nick Piggin 已提交
46 47 48 49 50 51 52 53 54 55
/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
DEFINE_BRLOCK(vfsmount_lock);	/* taken with br_read_lock()/br_write_lock() throughout this file */

L
Linus Torvalds 已提交
56 57
static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
R
Ram Pai 已提交
58 59
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
E
Eric Dumazet 已提交
60 61
	tmp = tmp + (tmp >> HASH_SHIFT);
	return tmp & (HASH_SIZE - 1);
L
Linus Torvalds 已提交
62 63
}

64 65
/* Fully parenthesized so the negative constant stays one operand wherever it expands. */
#define MNT_WRITER_UNDERFLOW_LIMIT (-(1 << 16))

N
Nick Piggin 已提交
66 67 68 69
/*
 * allocation is serialized by namespace_sem, but we need the spinlock to
 * serialize with freeing.
 */
70
static int mnt_alloc_id(struct mount *mnt)
71 72 73 74 75
{
	int res;

retry:
	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
N
Nick Piggin 已提交
76
	spin_lock(&mnt_id_lock);
A
Al Viro 已提交
77
	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
78
	if (!res)
A
Al Viro 已提交
79
		mnt_id_start = mnt->mnt_id + 1;
N
Nick Piggin 已提交
80
	spin_unlock(&mnt_id_lock);
81 82 83 84 85 86
	if (res == -EAGAIN)
		goto retry;

	return res;
}

87
static void mnt_free_id(struct mount *mnt)
88
{
A
Al Viro 已提交
89
	int id = mnt->mnt_id;
N
Nick Piggin 已提交
90
	spin_lock(&mnt_id_lock);
91 92 93
	ida_remove(&mnt_id_ida, id);
	if (mnt_id_start > id)
		mnt_id_start = id;
N
Nick Piggin 已提交
94
	spin_unlock(&mnt_id_lock);
95 96
}

97 98 99 100 101
/*
 * Allocate a new peer group ID
 *
 * mnt_group_ida is protected by namespace_sem
 */
102
static int mnt_alloc_group_id(struct mount *mnt)
103
{
104 105
	int res;

106 107 108
	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
		return -ENOMEM;

109 110
	res = ida_get_new_above(&mnt_group_ida,
				mnt_group_start,
A
Al Viro 已提交
111
				&mnt->mnt_group_id);
112
	if (!res)
A
Al Viro 已提交
113
		mnt_group_start = mnt->mnt_group_id + 1;
114 115

	return res;
116 117 118 119 120
}

/*
 * Release a peer group ID
 */
121
void mnt_release_group_id(struct mount *mnt)
122
{
A
Al Viro 已提交
123
	int id = mnt->mnt_group_id;
124 125 126
	ida_remove(&mnt_group_ida, id);
	if (mnt_group_start > id)
		mnt_group_start = id;
A
Al Viro 已提交
127
	mnt->mnt_group_id = 0;
128 129
}

N
Nick Piggin 已提交
130 131 132
/*
 * vfsmount lock must be held for read
 */
133
static inline void mnt_add_count(struct mount *mnt, int n)
N
Nick Piggin 已提交
134 135
{
#ifdef CONFIG_SMP
136
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
N
Nick Piggin 已提交
137 138
#else
	preempt_disable();
139
	mnt->mnt_count += n;
N
Nick Piggin 已提交
140 141 142 143 144 145 146
	preempt_enable();
#endif
}

/*
 * vfsmount lock must be held for write
 */
147
unsigned int mnt_get_count(struct mount *mnt)
N
Nick Piggin 已提交
148 149
{
#ifdef CONFIG_SMP
A
Al Viro 已提交
150
	unsigned int count = 0;
N
Nick Piggin 已提交
151 152 153
	int cpu;

	for_each_possible_cpu(cpu) {
154
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
N
Nick Piggin 已提交
155 156 157 158
	}

	return count;
#else
159
	return mnt->mnt_count;
N
Nick Piggin 已提交
160 161 162
#endif
}

163
static struct mount *alloc_vfsmnt(const char *name)
L
Linus Torvalds 已提交
164
{
165 166
	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
167 168
		int err;

169
		err = mnt_alloc_id(mnt);
170 171 172 173
		if (err)
			goto out_free_cache;

		if (name) {
174 175
			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
			if (!mnt->mnt_devname)
176
				goto out_free_id;
177 178
		}

N
Nick Piggin 已提交
179
#ifdef CONFIG_SMP
180 181
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
N
Nick Piggin 已提交
182 183
			goto out_free_devname;

184
		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
N
Nick Piggin 已提交
185
#else
186 187
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
N
Nick Piggin 已提交
188 189
#endif

190 191 192 193 194 195 196 197
		INIT_LIST_HEAD(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
198 199
#ifdef CONFIG_FSNOTIFY
		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
N
npiggin@suse.de 已提交
200
#endif
L
Linus Torvalds 已提交
201
	}
202
	return mnt;
203

N
npiggin@suse.de 已提交
204 205
#ifdef CONFIG_SMP
out_free_devname:
206
	kfree(mnt->mnt_devname);
N
npiggin@suse.de 已提交
207
#endif
208
out_free_id:
209
	mnt_free_id(mnt);
210
out_free_cache:
211
	kmem_cache_free(mnt_cache, mnt);
212
	return NULL;
L
Linus Torvalds 已提交
213 214
}

215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235
/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
int __mnt_is_readonly(struct vfsmount *mnt)
{
236 237 238 239 240
	if (mnt->mnt_flags & MNT_READONLY)
		return 1;
	if (mnt->mnt_sb->s_flags & MS_RDONLY)
		return 1;
	return 0;
241 242 243
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

244
static inline void mnt_inc_writers(struct mount *mnt)
N
npiggin@suse.de 已提交
245 246
{
#ifdef CONFIG_SMP
247
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
N
npiggin@suse.de 已提交
248
#else
249
	mnt->mnt_writers++;
N
npiggin@suse.de 已提交
250 251
#endif
}
252

253
static inline void mnt_dec_writers(struct mount *mnt)
254
{
N
npiggin@suse.de 已提交
255
#ifdef CONFIG_SMP
256
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
N
npiggin@suse.de 已提交
257
#else
258
	mnt->mnt_writers--;
N
npiggin@suse.de 已提交
259
#endif
260 261
}

262
static unsigned int mnt_get_writers(struct mount *mnt)
263
{
N
npiggin@suse.de 已提交
264 265
#ifdef CONFIG_SMP
	unsigned int count = 0;
266 267 268
	int cpu;

	for_each_possible_cpu(cpu) {
269
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
270 271
	}

N
npiggin@suse.de 已提交
272 273 274 275
	return count;
#else
	return mnt->mnt_writers;
#endif
276 277
}

278 279 280 281 282 283 284 285 286
/*
 * Like __mnt_is_readonly(), but also reports read-only while the
 * superblock's s_readonly_remount flag is set.
 */
static int mnt_is_readonly(struct vfsmount *mnt)
{
	if (!mnt->mnt_sb->s_readonly_remount) {
		/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
		smp_rmb();
		return __mnt_is_readonly(mnt);
	}
	return 1;
}

287
/*
288 289 290 291
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink().  We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
292 293
 */
/**
294
 * __mnt_want_write - get write access to a mount without freeze protection
295
 * @m: the mount on which to take a write
296
 *
297 298 299 300 301
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write) before
 * returning success. This operation does not protect against filesystem being
 * frozen. When the write operation is finished, __mnt_drop_write() must be
 * called. This is effectively a refcount.
302
 */
303
int __mnt_want_write(struct vfsmount *m)
304
{
305
	struct mount *mnt = real_mount(m);
306 307
	int ret = 0;

N
npiggin@suse.de 已提交
308
	preempt_disable();
309
	mnt_inc_writers(mnt);
N
npiggin@suse.de 已提交
310
	/*
311
	 * The store to mnt_inc_writers must be visible before we pass
N
npiggin@suse.de 已提交
312 313 314 315
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
316
	while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
N
npiggin@suse.de 已提交
317 318 319 320 321 322 323
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
324
	if (mnt_is_readonly(m)) {
325
		mnt_dec_writers(mnt);
326 327
		ret = -EROFS;
	}
N
npiggin@suse.de 已提交
328
	preempt_enable();
329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349

	return ret;
}

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success.  When the write operation is
 * finished, mnt_drop_write() must be called.  This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
	int err;

	sb_start_write(m->mnt_sb);
	err = __mnt_want_write(m);
	if (!err)
		return 0;

	/* Write access was refused: undo sb_start_write(). */
	sb_end_write(m->mnt_sb);
	return err;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

N
npiggin@suse.de 已提交
354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371
/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * After finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	struct mount *real;

	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;

	real = real_mount(mnt);
	preempt_disable();
	mnt_inc_writers(real);
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
379
 * __mnt_want_write_file - get write access to a file's mount
N
npiggin@suse.de 已提交
380 381
 * @file: the file on whose mount to take a write
 *
382
 * This is like __mnt_want_write, but it takes a file and can
N
npiggin@suse.de 已提交
383 384
 * do some optimisations if the file is open for write already
 */
385
int __mnt_want_write_file(struct file *file)
N
npiggin@suse.de 已提交
386
{
387
	struct inode *inode = file->f_dentry->d_inode;
388

389
	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
390
		return __mnt_want_write(file->f_path.mnt);
N
npiggin@suse.de 已提交
391 392 393
	else
		return mnt_clone_write(file->f_path.mnt);
}
394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file on whose mount to take a write
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int mnt_want_write_file(struct file *file)
{
	int err;

	sb_start_write(file->f_path.mnt->mnt_sb);
	err = __mnt_want_write_file(file);
	if (!err)
		return 0;

	/* Write access was refused: undo sb_start_write(). */
	sb_end_write(file->f_path.mnt->mnt_sb);
	return err;
}
N
npiggin@suse.de 已提交
412 413
EXPORT_SYMBOL_GPL(mnt_want_write_file);

414
/**
415
 * __mnt_drop_write - give up write access to a mount
416 417 418 419
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
420
 * __mnt_want_write() call above.
421
 */
422
void __mnt_drop_write(struct vfsmount *mnt)
{
	struct mount *real = real_mount(mnt);

	/* The writer count is adjusted with preemption disabled. */
	preempt_disable();
	mnt_dec_writers(real);
	preempt_enable();
}
428 429 430 431 432 433 434 435 436 437 438 439 440 441

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again.  Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	/* Drop the writer count first, then end the superblock write section. */
	__mnt_drop_write(mnt);
	sb_end_write(mnt->mnt_sb);
}
442 443
EXPORT_SYMBOL_GPL(mnt_drop_write);

444 445 446 447 448
/* Drop the writer count on the mount that @file was opened on. */
void __mnt_drop_write_file(struct file *file)
{
	struct vfsmount *mnt = file->f_path.mnt;

	__mnt_drop_write(mnt);
}

A
Al Viro 已提交
449 450 451 452 453 454
/* mnt_drop_write() on the mount that @file was opened on. */
void mnt_drop_write_file(struct file *file)
{
	struct vfsmount *mnt = file->f_path.mnt;

	mnt_drop_write(mnt);
}
EXPORT_SYMBOL(mnt_drop_write_file);

455
static int mnt_make_readonly(struct mount *mnt)
456
{
457 458
	int ret = 0;

A
Andi Kleen 已提交
459
	br_write_lock(&vfsmount_lock);
460
	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
461
	/*
N
npiggin@suse.de 已提交
462 463
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
464
	 */
N
npiggin@suse.de 已提交
465 466
	smp_mb();

467
	/*
N
npiggin@suse.de 已提交
468 469 470 471 472 473 474 475 476 477 478 479 480 481
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
482
	 */
483
	if (mnt_get_writers(mnt) > 0)
N
npiggin@suse.de 已提交
484 485
		ret = -EBUSY;
	else
486
		mnt->mnt.mnt_flags |= MNT_READONLY;
N
npiggin@suse.de 已提交
487 488 489 490 491
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
492
	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
A
Andi Kleen 已提交
493
	br_write_unlock(&vfsmount_lock);
494
	return ret;
495 496
}

497
static void __mnt_unmake_readonly(struct mount *mnt)
498
{
A
Andi Kleen 已提交
499
	br_write_lock(&vfsmount_lock);
500
	mnt->mnt.mnt_flags &= ~MNT_READONLY;
A
Andi Kleen 已提交
501
	br_write_unlock(&vfsmount_lock);
502 503
}

504 505 506 507 508
int sb_prepare_remount_readonly(struct super_block *sb)
{
	struct mount *mnt;
	int err = 0;

509 510 511 512
	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

A
Andi Kleen 已提交
513
	br_write_lock(&vfsmount_lock);
514 515 516 517 518 519 520 521 522 523
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
			smp_mb();
			if (mnt_get_writers(mnt) > 0) {
				err = -EBUSY;
				break;
			}
		}
	}
524 525 526
	if (!err && atomic_long_read(&sb->s_remove_count))
		err = -EBUSY;

527 528 529 530 531 532 533 534
	if (!err) {
		sb->s_readonly_remount = 1;
		smp_wmb();
	}
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	}
A
Andi Kleen 已提交
535
	br_write_unlock(&vfsmount_lock);
536 537 538 539

	return err;
}

540
static void free_vfsmnt(struct mount *mnt)
L
Linus Torvalds 已提交
541
{
A
Al Viro 已提交
542
	kfree(mnt->mnt_devname);
543
	mnt_free_id(mnt);
N
npiggin@suse.de 已提交
544
#ifdef CONFIG_SMP
545
	free_percpu(mnt->mnt_pcp);
N
npiggin@suse.de 已提交
546
#endif
547
	kmem_cache_free(mnt_cache, mnt);
L
Linus Torvalds 已提交
548 549 550
}

/*
R
Ram Pai 已提交
551 552
 * find the first or last mount at @dentry on vfsmount @mnt depending on
 * @dir. If @dir is set return the first mount else return the last mount.
N
Nick Piggin 已提交
553
 * vfsmount_lock must be held for read or write.
L
Linus Torvalds 已提交
554
 */
555
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
R
Ram Pai 已提交
556
			      int dir)
L
Linus Torvalds 已提交
557
{
R
Ram Pai 已提交
558 559
	struct list_head *head = mount_hashtable + hash(mnt, dentry);
	struct list_head *tmp = head;
560
	struct mount *p, *found = NULL;
L
Linus Torvalds 已提交
561 562

	for (;;) {
R
Ram Pai 已提交
563
		tmp = dir ? tmp->next : tmp->prev;
L
Linus Torvalds 已提交
564 565 566
		p = NULL;
		if (tmp == head)
			break;
A
Al Viro 已提交
567
		p = list_entry(tmp, struct mount, mnt_hash);
568
		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) {
R
Ram Pai 已提交
569
			found = p;
L
Linus Torvalds 已提交
570 571 572 573 574 575
			break;
		}
	}
	return found;
}

R
Ram Pai 已提交
576
/*
577 578 579 580 581 582 583 584 585 586 587 588 589 590
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
R
Ram Pai 已提交
591
 */
A
Al Viro 已提交
592
struct vfsmount *lookup_mnt(struct path *path)
R
Ram Pai 已提交
593
{
594
	struct mount *child_mnt;
N
Nick Piggin 已提交
595

A
Andi Kleen 已提交
596
	br_read_lock(&vfsmount_lock);
597 598 599
	child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
	if (child_mnt) {
		mnt_add_count(child_mnt, 1);
A
Andi Kleen 已提交
600
		br_read_unlock(&vfsmount_lock);
601 602
		return &child_mnt->mnt;
	} else {
A
Andi Kleen 已提交
603
		br_read_unlock(&vfsmount_lock);
604 605
		return NULL;
	}
R
Ram Pai 已提交
606 607
}

A
Al Viro 已提交
608
static inline int check_mnt(struct mount *mnt)
L
Linus Torvalds 已提交
609
{
610
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
L
Linus Torvalds 已提交
611 612
}

N
Nick Piggin 已提交
613 614 615
/*
 * vfsmount lock must be held for write
 */
616
static void touch_mnt_namespace(struct mnt_namespace *ns)
A
Al Viro 已提交
617 618 619 620 621 622 623
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

N
Nick Piggin 已提交
624 625 626
/*
 * vfsmount lock must be held for write
 */
627
static void __touch_mnt_namespace(struct mnt_namespace *ns)
A
Al Viro 已提交
628 629 630 631 632 633 634
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}

N
Nick Piggin 已提交
635 636 637 638
/*
 * Clear dentry's mounted state if it has no remaining mounts.
 * vfsmount_lock must be held for write.
 */
639
static void dentry_reset_mounted(struct dentry *dentry)
N
Nick Piggin 已提交
640 641 642 643
{
	unsigned u;

	for (u = 0; u < HASH_SIZE; u++) {
644
		struct mount *p;
N
Nick Piggin 已提交
645

A
Al Viro 已提交
646
		list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
647
			if (p->mnt_mountpoint == dentry)
N
Nick Piggin 已提交
648 649 650 651 652 653 654 655
				return;
		}
	}
	spin_lock(&dentry->d_lock);
	dentry->d_flags &= ~DCACHE_MOUNTED;
	spin_unlock(&dentry->d_lock);
}

N
Nick Piggin 已提交
656 657 658
/*
 * vfsmount lock must be held for write
 */
659 660
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
661
	old_path->dentry = mnt->mnt_mountpoint;
662 663
	old_path->mnt = &mnt->mnt_parent->mnt;
	mnt->mnt_parent = mnt;
664
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
665
	list_del_init(&mnt->mnt_child);
A
Al Viro 已提交
666
	list_del_init(&mnt->mnt_hash);
667
	dentry_reset_mounted(old_path->dentry);
L
Linus Torvalds 已提交
668 669
}

N
Nick Piggin 已提交
670 671 672
/*
 * vfsmount lock must be held for write
 */
673
void mnt_set_mountpoint(struct mount *mnt, struct dentry *dentry,
674
			struct mount *child_mnt)
675
{
676
	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
677
	child_mnt->mnt_mountpoint = dget(dentry);
678
	child_mnt->mnt_parent = mnt;
N
Nick Piggin 已提交
679 680 681
	spin_lock(&dentry->d_lock);
	dentry->d_flags |= DCACHE_MOUNTED;
	spin_unlock(&dentry->d_lock);
682 683
}

N
Nick Piggin 已提交
684 685 686
/*
 * vfsmount lock must be held for write
 */
687
static void attach_mnt(struct mount *mnt, struct path *path)
L
Linus Torvalds 已提交
688
{
689
	mnt_set_mountpoint(real_mount(path->mnt), path->dentry, mnt);
A
Al Viro 已提交
690
	list_add_tail(&mnt->mnt_hash, mount_hashtable +
691
			hash(path->mnt, path->dentry));
692
	list_add_tail(&mnt->mnt_child, &real_mount(path->mnt)->mnt_mounts);
693 694 695
}

/*
N
Nick Piggin 已提交
696
 * vfsmount lock must be held for write
697
 */
A
Al Viro 已提交
698
static void commit_tree(struct mount *mnt)
699
{
700
	struct mount *parent = mnt->mnt_parent;
701
	struct mount *m;
702
	LIST_HEAD(head);
A
Al Viro 已提交
703
	struct mnt_namespace *n = parent->mnt_ns;
704

705
	BUG_ON(parent == mnt);
706

A
Al Viro 已提交
707
	list_add_tail(&head, &mnt->mnt_list);
A
Al Viro 已提交
708
	list_for_each_entry(m, &head, mnt_list)
A
Al Viro 已提交
709
		m->mnt_ns = n;
A
Al Viro 已提交
710

711 712
	list_splice(&head, n->list.prev);

A
Al Viro 已提交
713
	list_add_tail(&mnt->mnt_hash, mount_hashtable +
714
				hash(&parent->mnt, mnt->mnt_mountpoint));
715
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
716
	touch_mnt_namespace(n);
L
Linus Torvalds 已提交
717 718
}

719
static struct mount *next_mnt(struct mount *p, struct mount *root)
L
Linus Torvalds 已提交
720
{
721 722
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
L
Linus Torvalds 已提交
723
		while (1) {
724
			if (p == root)
L
Linus Torvalds 已提交
725
				return NULL;
726 727
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
L
Linus Torvalds 已提交
728
				break;
729
			p = p->mnt_parent;
L
Linus Torvalds 已提交
730 731
		}
	}
732
	return list_entry(next, struct mount, mnt_child);
L
Linus Torvalds 已提交
733 734
}

735
static struct mount *skip_mnt_tree(struct mount *p)
R
Ram Pai 已提交
736
{
737 738 739 740
	struct list_head *prev = p->mnt_mounts.prev;
	while (prev != &p->mnt_mounts) {
		p = list_entry(prev, struct mount, mnt_child);
		prev = p->mnt_mounts.prev;
R
Ram Pai 已提交
741 742 743 744
	}
	return p;
}

745 746 747
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
748
	struct mount *mnt;
749 750 751 752 753 754 755 756 757 758
	struct dentry *root;

	if (!type)
		return ERR_PTR(-ENODEV);

	mnt = alloc_vfsmnt(name);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flags & MS_KERNMOUNT)
759
		mnt->mnt.mnt_flags = MNT_INTERNAL;
760 761 762 763 764 765 766

	root = mount_fs(type, flags, name, data);
	if (IS_ERR(root)) {
		free_vfsmnt(mnt);
		return ERR_CAST(root);
	}

767 768
	mnt->mnt.mnt_root = root;
	mnt->mnt.mnt_sb = root->d_sb;
769
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
770
	mnt->mnt_parent = mnt;
A
Andi Kleen 已提交
771
	br_write_lock(&vfsmount_lock);
772
	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
A
Andi Kleen 已提交
773
	br_write_unlock(&vfsmount_lock);
774
	return &mnt->mnt;
775 776 777
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);

778
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
R
Ram Pai 已提交
779
					int flag)
L
Linus Torvalds 已提交
780
{
781
	struct super_block *sb = old->mnt.mnt_sb;
782 783
	struct mount *mnt;
	int err;
L
Linus Torvalds 已提交
784

785 786 787
	mnt = alloc_vfsmnt(old->mnt_devname);
	if (!mnt)
		return ERR_PTR(-ENOMEM);
788

789
	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
790 791 792
		mnt->mnt_group_id = 0; /* not a peer of original */
	else
		mnt->mnt_group_id = old->mnt_group_id;
793

794 795 796 797
	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
		err = mnt_alloc_group_id(mnt);
		if (err)
			goto out_free;
L
Linus Torvalds 已提交
798
	}
799 800 801 802 803 804 805 806 807 808 809

	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD;
	atomic_inc(&sb->s_active);
	mnt->mnt.mnt_sb = sb;
	mnt->mnt.mnt_root = dget(root);
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	br_write_lock(&vfsmount_lock);
	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
	br_write_unlock(&vfsmount_lock);

810 811
	if ((flag & CL_SLAVE) ||
	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831
		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
		mnt->mnt_master = old;
		CLEAR_MNT_SHARED(mnt);
	} else if (!(flag & CL_PRIVATE)) {
		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
			list_add(&mnt->mnt_share, &old->mnt_share);
		if (IS_MNT_SLAVE(old))
			list_add(&mnt->mnt_slave, &old->mnt_slave);
		mnt->mnt_master = old->mnt_master;
	}
	if (flag & CL_MAKE_SHARED)
		set_mnt_shared(mnt);

	/* stick the duplicate mount on the same expiry list
	 * as the original if that was on one */
	if (flag & CL_EXPIRE) {
		if (!list_empty(&old->mnt_expire))
			list_add(&mnt->mnt_expire, &old->mnt_expire);
	}

832
	return mnt;
833 834 835

 out_free:
	free_vfsmnt(mnt);
836
	return ERR_PTR(err);
L
Linus Torvalds 已提交
837 838
}

839
static inline void mntfree(struct mount *mnt)
L
Linus Torvalds 已提交
840
{
841 842
	struct vfsmount *m = &mnt->mnt;
	struct super_block *sb = m->mnt_sb;
N
Nick Piggin 已提交
843

844 845 846 847 848 849
	/*
	 * This probably indicates that somebody messed
	 * up a mnt_want/drop_write() pair.  If this
	 * happens, the filesystem was probably unable
	 * to make r/w->r/o transitions.
	 */
N
npiggin@suse.de 已提交
850
	/*
N
Nick Piggin 已提交
851 852
	 * The locking used to deal with mnt_count decrement provides barriers,
	 * so mnt_get_writers() below is safe.
N
npiggin@suse.de 已提交
853
	 */
854
	WARN_ON(mnt_get_writers(mnt));
855 856 857
	fsnotify_vfsmount_delete(m);
	dput(m->mnt_root);
	free_vfsmnt(mnt);
L
Linus Torvalds 已提交
858 859 860
	deactivate_super(sb);
}

861
static void mntput_no_expire(struct mount *mnt)
N
Nick Piggin 已提交
862 863
{
put_again:
A
Al Viro 已提交
864
#ifdef CONFIG_SMP
A
Andi Kleen 已提交
865
	br_read_lock(&vfsmount_lock);
A
Al Viro 已提交
866 867
	if (likely(mnt->mnt_ns)) {
		/* shouldn't be the last one */
868
		mnt_add_count(mnt, -1);
A
Andi Kleen 已提交
869
		br_read_unlock(&vfsmount_lock);
A
Al Viro 已提交
870
		return;
N
Nick Piggin 已提交
871
	}
A
Andi Kleen 已提交
872
	br_read_unlock(&vfsmount_lock);
N
Nick Piggin 已提交
873

A
Andi Kleen 已提交
874
	br_write_lock(&vfsmount_lock);
875
	mnt_add_count(mnt, -1);
N
Nick Piggin 已提交
876
	if (mnt_get_count(mnt)) {
A
Andi Kleen 已提交
877
		br_write_unlock(&vfsmount_lock);
N
Nick Piggin 已提交
878 879
		return;
	}
N
Nick Piggin 已提交
880
#else
881
	mnt_add_count(mnt, -1);
N
Nick Piggin 已提交
882
	if (likely(mnt_get_count(mnt)))
N
Nick Piggin 已提交
883
		return;
A
Andi Kleen 已提交
884
	br_write_lock(&vfsmount_lock);
A
Al Viro 已提交
885
#endif
886 887 888
	if (unlikely(mnt->mnt_pinned)) {
		mnt_add_count(mnt, mnt->mnt_pinned + 1);
		mnt->mnt_pinned = 0;
A
Andi Kleen 已提交
889
		br_write_unlock(&vfsmount_lock);
890
		acct_auto_close_mnt(&mnt->mnt);
N
Nick Piggin 已提交
891
		goto put_again;
892
	}
A
Andi Kleen 已提交
893

894
	list_del(&mnt->mnt_instance);
A
Andi Kleen 已提交
895
	br_write_unlock(&vfsmount_lock);
N
Nick Piggin 已提交
896 897 898 899 900 901
	mntfree(mnt);
}

void mntput(struct vfsmount *mnt)
{
	if (mnt) {
902
		struct mount *m = real_mount(mnt);
N
Nick Piggin 已提交
903
		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
904 905 906
		if (unlikely(m->mnt_expiry_mark))
			m->mnt_expiry_mark = 0;
		mntput_no_expire(m);
N
Nick Piggin 已提交
907 908 909 910 911 912 913
	}
}
EXPORT_SYMBOL(mntput);

struct vfsmount *mntget(struct vfsmount *mnt)
{
	if (mnt)
914
		mnt_add_count(real_mount(mnt), 1);
N
Nick Piggin 已提交
915 916 917 918
	return mnt;
}
EXPORT_SYMBOL(mntget);

919 920
void mnt_pin(struct vfsmount *mnt)
{
A
Andi Kleen 已提交
921
	br_write_lock(&vfsmount_lock);
922
	real_mount(mnt)->mnt_pinned++;
A
Andi Kleen 已提交
923
	br_write_unlock(&vfsmount_lock);
924 925 926
}
EXPORT_SYMBOL(mnt_pin);

927
void mnt_unpin(struct vfsmount *m)
928
{
929
	struct mount *mnt = real_mount(m);
A
Andi Kleen 已提交
930
	br_write_lock(&vfsmount_lock);
931
	if (mnt->mnt_pinned) {
932
		mnt_add_count(mnt, 1);
933 934
		mnt->mnt_pinned--;
	}
A
Andi Kleen 已提交
935
	br_write_unlock(&vfsmount_lock);
936 937
}
EXPORT_SYMBOL(mnt_unpin);
L
Linus Torvalds 已提交
938

939 940 941 942 943 944 945 946 947 948 949
/* Emit @s to @m, escaping space, tab, newline and backslash. */
static inline void mangle(struct seq_file *m, const char *s)
{
	static const char escaped[] = " \t\n\\";

	seq_escape(m, s, escaped);
}

/*
 * Simple .show_options callback for filesystems which don't want to
 * implement more complex mount option showing.
 *
 * See also save_mount_options().
 */
950
int generic_show_options(struct seq_file *m, struct dentry *root)
951
{
952 953 954
	const char *options;

	rcu_read_lock();
955
	options = rcu_dereference(root->d_sb->s_options);
956 957 958 959 960

	if (options != NULL && options[0]) {
		seq_putc(m, ',');
		mangle(m, options);
	}
961
	rcu_read_unlock();
962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981

	return 0;
}
EXPORT_SYMBOL(generic_show_options);

/*
 * If filesystem uses generic_show_options(), this function should be
 * called from the fill_super() callback.
 *
 * The .remount_fs callback usually needs to be handled in a special
 * way, to make sure, that previous options are not overwritten if the
 * remount fails.
 *
 * Also note, that if the filesystem's .remount_fs function doesn't
 * reset all options to their default value, but changes only newly
 * given options, then the displayed options will not reflect reality
 * any more.
 */
void save_mount_options(struct super_block *sb, char *options)
{
	char *copy;

	/* Only a first-time save is allowed. */
	BUG_ON(sb->s_options);

	/*
	 * A failed kstrdup() stores NULL, which generic_show_options()
	 * treats as "no options".
	 */
	copy = kstrdup(options, GFP_KERNEL);
	rcu_assign_pointer(sb->s_options, copy);
}
EXPORT_SYMBOL(save_mount_options);

987 988 989 990 991 992 993 994 995 996 997
/*
 * Publish @options as the superblock's saved option string and free the
 * previous string, if any, after synchronize_rcu().
 */
void replace_mount_options(struct super_block *sb, char *options)
{
	char *old = sb->s_options;

	rcu_assign_pointer(sb->s_options, options);
	if (!old)
		return;

	/* RCU readers may still hold the old string. */
	synchronize_rcu();
	kfree(old);
}
EXPORT_SYMBOL(replace_mount_options);

998
#ifdef CONFIG_PROC_FS
999
/* iterator; we want it to have access to namespace_sem, thus here... */
L
Linus Torvalds 已提交
1000 1001
/* seq_file .start: take namespace_sem for read and seek to *@pos in the namespace's mount list. */
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_mounts *pm = proc_mounts(m);

	down_read(&namespace_sem);
	return seq_list_start(&pm->ns->list, *pos);
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct proc_mounts *pm = proc_mounts(m);
	struct list_head *mounts = &pm->ns->list;

	/* Advance to the entry after @v in the namespace's mount list. */
	return seq_list_next(v, mounts, pos);
}

static void m_stop(struct seq_file *m, void *v)
{
R
Ram Pai 已提交
1017
	up_read(&namespace_sem);
L
Linus Torvalds 已提交
1018 1019
}

1020
static int m_show(struct seq_file *m, void *v)
1021
{
A
Al Viro 已提交
1022
	struct proc_mounts *p = proc_mounts(m);
A
Al Viro 已提交
1023
	struct mount *r = list_entry(v, struct mount, mnt_list);
1024
	return p->show(m, &r->mnt);
L
Linus Torvalds 已提交
1025 1026
}

1027
const struct seq_operations mounts_op = {
L
Linus Torvalds 已提交
1028 1029 1030
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
1031
	.show	= m_show,
1032
};
1033
#endif  /* CONFIG_PROC_FS */
1034

L
Linus Torvalds 已提交
1035 1036 1037 1038 1039 1040 1041 1042
/**
 * may_umount_tree - check if a mount tree is busy
 * @mnt: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 */
1043
int may_umount_tree(struct vfsmount *m)
L
Linus Torvalds 已提交
1044
{
1045
	struct mount *mnt = real_mount(m);
R
Ram Pai 已提交
1046 1047
	int actual_refs = 0;
	int minimum_refs = 0;
1048
	struct mount *p;
1049
	BUG_ON(!m);
L
Linus Torvalds 已提交
1050

N
Nick Piggin 已提交
1051
	/* write lock needed for mnt_get_count */
A
Andi Kleen 已提交
1052
	br_write_lock(&vfsmount_lock);
1053
	for (p = mnt; p; p = next_mnt(p, mnt)) {
1054
		actual_refs += mnt_get_count(p);
L
Linus Torvalds 已提交
1055 1056
		minimum_refs += 2;
	}
A
Andi Kleen 已提交
1057
	br_write_unlock(&vfsmount_lock);
L
Linus Torvalds 已提交
1058 1059

	if (actual_refs > minimum_refs)
1060
		return 0;
L
Linus Torvalds 已提交
1061

1062
	return 1;
L
Linus Torvalds 已提交
1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081
}

EXPORT_SYMBOL(may_umount_tree);

/**
 * may_umount - check if a mount point is busy
 * @mnt: root of mount
 *
 * This is called to check if a mount point has any
 * open files, pwds, chroots or sub mounts. If the
 * mount has sub mounts this will return busy
 * regardless of whether the sub mounts are busy.
 *
 * Doesn't take quota and stuff into account. IOW, in some cases it will
 * give false negatives. The main reason why it's here is that we need
 * a non-destructive way to look for easily umountable filesystems.
 */
int may_umount(struct vfsmount *mnt)
{
1082
	int ret = 1;
A
Al Viro 已提交
1083
	down_read(&namespace_sem);
A
Andi Kleen 已提交
1084
	br_write_lock(&vfsmount_lock);
1085
	if (propagate_mount_busy(real_mount(mnt), 2))
1086
		ret = 0;
A
Andi Kleen 已提交
1087
	br_write_unlock(&vfsmount_lock);
A
Al Viro 已提交
1088
	up_read(&namespace_sem);
R
Ram Pai 已提交
1089
	return ret;
L
Linus Torvalds 已提交
1090 1091 1092 1093
}

EXPORT_SYMBOL(may_umount);

1094
void release_mounts(struct list_head *head)
R
Ram Pai 已提交
1095
{
1096
	struct mount *mnt;
M
Miklos Szeredi 已提交
1097
	while (!list_empty(head)) {
A
Al Viro 已提交
1098 1099
		mnt = list_first_entry(head, struct mount, mnt_hash);
		list_del_init(&mnt->mnt_hash);
1100
		if (mnt_has_parent(mnt)) {
R
Ram Pai 已提交
1101
			struct dentry *dentry;
1102
			struct mount *m;
N
Nick Piggin 已提交
1103

A
Andi Kleen 已提交
1104
			br_write_lock(&vfsmount_lock);
1105
			dentry = mnt->mnt_mountpoint;
1106
			m = mnt->mnt_parent;
1107
			mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1108
			mnt->mnt_parent = mnt;
1109
			m->mnt_ghosts--;
A
Andi Kleen 已提交
1110
			br_write_unlock(&vfsmount_lock);
R
Ram Pai 已提交
1111
			dput(dentry);
1112
			mntput(&m->mnt);
R
Ram Pai 已提交
1113
		}
1114
		mntput(&mnt->mnt);
R
Ram Pai 已提交
1115 1116 1117
	}
}

N
Nick Piggin 已提交
1118 1119 1120 1121
/*
 * vfsmount lock must be held for write
 * namespace_sem must be held for write
 */
1122
void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
L
Linus Torvalds 已提交
1123
{
A
Al Viro 已提交
1124
	LIST_HEAD(tmp_list);
1125
	struct mount *p;
L
Linus Torvalds 已提交
1126

1127
	for (p = mnt; p; p = next_mnt(p, mnt))
A
Al Viro 已提交
1128
		list_move(&p->mnt_hash, &tmp_list);
L
Linus Torvalds 已提交
1129

R
Ram Pai 已提交
1130
	if (propagate)
A
Al Viro 已提交
1131
		propagate_umount(&tmp_list);
R
Ram Pai 已提交
1132

A
Al Viro 已提交
1133
	list_for_each_entry(p, &tmp_list, mnt_hash) {
1134
		list_del_init(&p->mnt_expire);
A
Al Viro 已提交
1135
		list_del_init(&p->mnt_list);
A
Al Viro 已提交
1136 1137
		__touch_mnt_namespace(p->mnt_ns);
		p->mnt_ns = NULL;
1138
		list_del_init(&p->mnt_child);
1139
		if (mnt_has_parent(p)) {
1140
			p->mnt_parent->mnt_ghosts++;
1141
			dentry_reset_mounted(p->mnt_mountpoint);
1142
		}
1143
		change_mnt_propagation(p, MS_PRIVATE);
L
Linus Torvalds 已提交
1144
	}
A
Al Viro 已提交
1145
	list_splice(&tmp_list, kill);
L
Linus Torvalds 已提交
1146 1147
}

1148
static void shrink_submounts(struct mount *mnt, struct list_head *umounts);
1149

1150
static int do_umount(struct mount *mnt, int flags)
L
Linus Torvalds 已提交
1151
{
1152
	struct super_block *sb = mnt->mnt.mnt_sb;
L
Linus Torvalds 已提交
1153
	int retval;
R
Ram Pai 已提交
1154
	LIST_HEAD(umount_list);
L
Linus Torvalds 已提交
1155

1156
	retval = security_sb_umount(&mnt->mnt, flags);
L
Linus Torvalds 已提交
1157 1158 1159 1160 1161 1162 1163 1164 1165 1166
	if (retval)
		return retval;

	/*
	 * Allow userspace to request a mountpoint be expired rather than
	 * unmounting unconditionally. Unmount only happens if:
	 *  (1) the mark is already set (the mark is cleared by mntput())
	 *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
	 */
	if (flags & MNT_EXPIRE) {
1167
		if (&mnt->mnt == current->fs->root.mnt ||
L
Linus Torvalds 已提交
1168 1169 1170
		    flags & (MNT_FORCE | MNT_DETACH))
			return -EINVAL;

N
Nick Piggin 已提交
1171 1172 1173 1174
		/*
		 * probably don't strictly need the lock here if we examined
		 * all race cases, but it's a slowpath.
		 */
A
Andi Kleen 已提交
1175
		br_write_lock(&vfsmount_lock);
1176
		if (mnt_get_count(mnt) != 2) {
A
Andi Kleen 已提交
1177
			br_write_unlock(&vfsmount_lock);
L
Linus Torvalds 已提交
1178
			return -EBUSY;
N
Nick Piggin 已提交
1179
		}
A
Andi Kleen 已提交
1180
		br_write_unlock(&vfsmount_lock);
L
Linus Torvalds 已提交
1181

1182
		if (!xchg(&mnt->mnt_expiry_mark, 1))
L
Linus Torvalds 已提交
1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195
			return -EAGAIN;
	}

	/*
	 * If we may have to abort operations to get out of this
	 * mount, and they will themselves hold resources we must
	 * allow the fs to do things. In the Unix tradition of
	 * 'Gee thats tricky lets do it in userspace' the umount_begin
	 * might fail to complete on the first run through as other tasks
	 * must return, and the like. Thats for the mount program to worry
	 * about for the moment.
	 */

1196 1197 1198
	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
		sb->s_op->umount_begin(sb);
	}
L
Linus Torvalds 已提交
1199 1200 1201 1202 1203 1204 1205 1206 1207 1208

	/*
	 * No sense to grab the lock for this test, but test itself looks
	 * somewhat bogus. Suggestions for better replacement?
	 * Ho-hum... In principle, we might treat that as umount + switch
	 * to rootfs. GC would eventually take care of the old vfsmount.
	 * Actually it makes sense, especially if rootfs would contain a
	 * /reboot - static binary that would close all descriptors and
	 * call reboot(9). Then init(8) could umount root and exec /reboot.
	 */
1209
	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
L
Linus Torvalds 已提交
1210 1211 1212 1213 1214
		/*
		 * Special case for "unmounting" root ...
		 * we just try to remount it readonly.
		 */
		down_write(&sb->s_umount);
A
Al Viro 已提交
1215
		if (!(sb->s_flags & MS_RDONLY))
L
Linus Torvalds 已提交
1216 1217 1218 1219 1220
			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
		up_write(&sb->s_umount);
		return retval;
	}

R
Ram Pai 已提交
1221
	down_write(&namespace_sem);
A
Andi Kleen 已提交
1222
	br_write_lock(&vfsmount_lock);
A
Al Viro 已提交
1223
	event++;
L
Linus Torvalds 已提交
1224

1225
	if (!(flags & MNT_DETACH))
1226
		shrink_submounts(mnt, &umount_list);
1227

L
Linus Torvalds 已提交
1228
	retval = -EBUSY;
R
Ram Pai 已提交
1229
	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
A
Al Viro 已提交
1230
		if (!list_empty(&mnt->mnt_list))
1231
			umount_tree(mnt, 1, &umount_list);
L
Linus Torvalds 已提交
1232 1233
		retval = 0;
	}
A
Andi Kleen 已提交
1234
	br_write_unlock(&vfsmount_lock);
R
Ram Pai 已提交
1235
	up_write(&namespace_sem);
R
Ram Pai 已提交
1236
	release_mounts(&umount_list);
L
Linus Torvalds 已提交
1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247
	return retval;
}

/*
 * Now umount can handle mount points as well as block devices.
 * This is important for filesystems which use unnamed block devices.
 *
 * We now support a flag for forced unmount like the other 'big iron'
 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 */

1248
SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
	const int known = MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW;
	int lookup_flags = (flags & UMOUNT_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
	struct mount *mnt;
	struct path path;
	int retval;

	/* Reject flag bits we do not know about. */
	if (flags & ~known)
		return -EINVAL;

	retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
	if (retval)
		goto out;

	mnt = real_mount(path.mnt);

	/* Only the root of a mount in the caller's namespace qualifies. */
	retval = -EINVAL;
	if (path.dentry != path.mnt->mnt_root || !check_mnt(mnt))
		goto dput_and_out;

	retval = -EPERM;
	if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
		goto dput_and_out;

	retval = do_umount(mnt, flags);
dput_and_out:
	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
	dput(path.dentry);
	mntput_no_expire(mnt);
out:
	return retval;
}

#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
R
Ram Pai 已提交
1287
 *	The 2.0 compatible umount. No flags.
L
Linus Torvalds 已提交
1288
 */
1289
SYSCALL_DEFINE1(oldumount, char __user *, name)
L
Linus Torvalds 已提交
1290
{
R
Ram Pai 已提交
1291
	return sys_umount(name, 0);
L
Linus Torvalds 已提交
1292 1293 1294 1295
}

#endif

1296
static int mount_is_safe(struct path *path)
L
Linus Torvalds 已提交
1297
{
1298
	if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
L
Linus Torvalds 已提交
1299 1300 1301
		return 0;
	return -EPERM;
#ifdef notyet
1302
	if (S_ISLNK(path->dentry->d_inode->i_mode))
L
Linus Torvalds 已提交
1303
		return -EPERM;
1304
	if (path->dentry->d_inode->i_mode & S_ISVTX) {
1305
		if (current_uid() != path->dentry->d_inode->i_uid)
L
Linus Torvalds 已提交
1306 1307
			return -EPERM;
	}
1308
	if (inode_permission(path->dentry->d_inode, MAY_WRITE))
L
Linus Torvalds 已提交
1309 1310 1311 1312 1313
		return -EPERM;
	return 0;
#endif
}

1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333
static bool mnt_ns_loop(struct path *path)
{
	/* Could bind mounting the mount namespace inode cause a
	 * mount namespace loop?
	 */
	struct inode *inode = path->dentry->d_inode;
	struct proc_inode *ei;
	struct mnt_namespace *mnt_ns;

	if (!proc_ns_inode(inode))
		return false;

	ei = PROC_I(inode);
	if (ei->ns_ops != &mntns_operations)
		return false;

	mnt_ns = ei->ns;
	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
}

1334
struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
R
Ram Pai 已提交
1335
					int flag)
L
Linus Torvalds 已提交
1336
{
1337
	struct mount *res, *p, *q, *r;
1338
	struct path path;
L
Linus Torvalds 已提交
1339

1340
	if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
1341
		return ERR_PTR(-EINVAL);
R
Ram Pai 已提交
1342

R
Ram Pai 已提交
1343
	res = q = clone_mnt(mnt, dentry, flag);
1344 1345 1346
	if (IS_ERR(q))
		return q;

1347
	q->mnt_mountpoint = mnt->mnt_mountpoint;
L
Linus Torvalds 已提交
1348 1349

	p = mnt;
1350
	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
1351
		struct mount *s;
1352
		if (!is_subdir(r->mnt_mountpoint, dentry))
L
Linus Torvalds 已提交
1353 1354
			continue;

1355
		for (s = r; s; s = next_mnt(s, r)) {
1356
			if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) {
R
Ram Pai 已提交
1357 1358 1359
				s = skip_mnt_tree(s);
				continue;
			}
1360 1361 1362
			while (p != s->mnt_parent) {
				p = p->mnt_parent;
				q = q->mnt_parent;
L
Linus Torvalds 已提交
1363
			}
1364
			p = s;
1365
			path.mnt = &q->mnt;
1366
			path.dentry = p->mnt_mountpoint;
1367
			q = clone_mnt(p, p->mnt.mnt_root, flag);
1368 1369
			if (IS_ERR(q))
				goto out;
A
Andi Kleen 已提交
1370
			br_write_lock(&vfsmount_lock);
A
Al Viro 已提交
1371
			list_add_tail(&q->mnt_list, &res->mnt_list);
1372
			attach_mnt(q, &path);
A
Andi Kleen 已提交
1373
			br_write_unlock(&vfsmount_lock);
L
Linus Torvalds 已提交
1374 1375 1376
		}
	}
	return res;
1377
out:
L
Linus Torvalds 已提交
1378
	if (res) {
R
Ram Pai 已提交
1379
		LIST_HEAD(umount_list);
A
Andi Kleen 已提交
1380
		br_write_lock(&vfsmount_lock);
1381
		umount_tree(res, 0, &umount_list);
A
Andi Kleen 已提交
1382
		br_write_unlock(&vfsmount_lock);
R
Ram Pai 已提交
1383
		release_mounts(&umount_list);
L
Linus Torvalds 已提交
1384
	}
1385
	return q;
L
Linus Torvalds 已提交
1386 1387
}

1388 1389
/* Caller should check returned pointer for errors */

A
Al Viro 已提交
1390
struct vfsmount *collect_mounts(struct path *path)
1391
{
1392
	struct mount *tree;
1393
	down_write(&namespace_sem);
1394 1395
	tree = copy_tree(real_mount(path->mnt), path->dentry,
			 CL_COPY_ALL | CL_PRIVATE);
1396
	up_write(&namespace_sem);
1397 1398 1399
	if (IS_ERR(tree))
		return NULL;
	return &tree->mnt;
1400 1401 1402 1403 1404
}

void drop_collected_mounts(struct vfsmount *mnt)
{
	LIST_HEAD(umount_list);
1405
	down_write(&namespace_sem);
A
Andi Kleen 已提交
1406
	br_write_lock(&vfsmount_lock);
1407
	umount_tree(real_mount(mnt), 0, &umount_list);
A
Andi Kleen 已提交
1408
	br_write_unlock(&vfsmount_lock);
1409
	up_write(&namespace_sem);
1410 1411 1412
	release_mounts(&umount_list);
}

A
Al Viro 已提交
1413 1414 1415
int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
		   struct vfsmount *root)
{
A
Al Viro 已提交
1416
	struct mount *mnt;
A
Al Viro 已提交
1417 1418 1419
	int res = f(root, arg);
	if (res)
		return res;
A
Al Viro 已提交
1420 1421
	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
		res = f(&mnt->mnt, arg);
A
Al Viro 已提交
1422 1423 1424 1425 1426 1427
		if (res)
			return res;
	}
	return 0;
}

1428
static void cleanup_group_ids(struct mount *mnt, struct mount *end)
1429
{
1430
	struct mount *p;
1431

1432
	for (p = mnt; p != end; p = next_mnt(p, mnt)) {
1433
		if (p->mnt_group_id && !IS_MNT_SHARED(p))
1434
			mnt_release_group_id(p);
1435 1436 1437
	}
}

1438
/*
 * Allocate a group id for @mnt (and, if @recurse, every mount below it)
 * wherever one is missing on a non-shared mount. On failure the ids
 * handed out so far are released again and the error is returned.
 */
static int invent_group_ids(struct mount *mnt, bool recurse)
{
	struct mount *cur;

	for (cur = mnt; cur; cur = recurse ? next_mnt(cur, mnt) : NULL) {
		int err;

		if (cur->mnt_group_id || IS_MNT_SHARED(cur))
			continue;

		err = mnt_alloc_group_id(cur);
		if (err) {
			/* Undo everything allocated before @cur. */
			cleanup_group_ids(mnt, cur);
			return err;
		}
	}

	return 0;
}

1455 1456
/*
 *  @source_mnt : mount tree to be attached
R
Ram Pai 已提交
1457 1458 1459 1460
 *  @path        : place where the mount tree @source_mnt is attached
 *  @parent_path : if non-null, detach the source_mnt from its parent and
 *  		   store the parent mount and mountpoint dentry.
 *  		   (done when source_mnt is moved)
1461 1462 1463
 *
 *  NOTE: the tables below explain the semantics when a source mount
 *  of a given type is attached to a destination mount of a given type.
R
Ram Pai 已提交
1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
1476 1477 1478 1479 1480 1481 1482 1483 1484
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 * 	 tree of the destination mount and the cloned mount is added to
 * 	 the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
R
Ram Pai 已提交
1485 1486 1487 1488 1489 1490 1491
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 * 	 source mount.
 *
R
Ram Pai 已提交
1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503
 * ---------------------------------------------------------------------------
 * |         		MOVE MOUNT OPERATION                                 |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
R
Ram Pai 已提交
1504 1505 1506
 *
 * (+)  the mount is moved to the destination. And is then propagated to
 * 	all the mounts in the propagation tree of the destination mount.
R
Ram Pai 已提交
1507
 * (+*)  the mount is moved to the destination.
R
Ram Pai 已提交
1508 1509 1510 1511
 * (+++)  the mount is moved to the destination and is then propagated to
 * 	all the mounts belonging to the destination mount's propagation tree.
 * 	the mount is marked as 'shared and slave'.
 * (*)	the mount continues to be a slave at the new location.
1512 1513 1514 1515 1516 1517
 *
 * If the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 */
1518
static int attach_recursive_mnt(struct mount *source_mnt,
1519
			struct path *path, struct path *parent_path)
1520 1521
{
	LIST_HEAD(tree_list);
1522
	struct mount *dest_mnt = real_mount(path->mnt);
1523
	struct dentry *dest_dentry = path->dentry;
1524
	struct mount *child, *p;
1525
	int err;
1526

1527
	if (IS_MNT_SHARED(dest_mnt)) {
1528
		err = invent_group_ids(source_mnt, true);
1529 1530 1531
		if (err)
			goto out;
	}
1532
	err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
1533 1534
	if (err)
		goto out_cleanup_ids;
1535

A
Andi Kleen 已提交
1536
	br_write_lock(&vfsmount_lock);
1537

1538
	if (IS_MNT_SHARED(dest_mnt)) {
1539
		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
1540
			set_mnt_shared(p);
1541
	}
1542
	if (parent_path) {
1543 1544
		detach_mnt(source_mnt, parent_path);
		attach_mnt(source_mnt, path);
A
Al Viro 已提交
1545
		touch_mnt_namespace(source_mnt->mnt_ns);
R
Ram Pai 已提交
1546
	} else {
1547
		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
1548
		commit_tree(source_mnt);
R
Ram Pai 已提交
1549
	}
1550

A
Al Viro 已提交
1551 1552
	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
		list_del_init(&child->mnt_hash);
A
Al Viro 已提交
1553
		commit_tree(child);
1554
	}
A
Andi Kleen 已提交
1555
	br_write_unlock(&vfsmount_lock);
N
Nick Piggin 已提交
1556

1557
	return 0;
1558 1559

 out_cleanup_ids:
1560
	if (IS_MNT_SHARED(dest_mnt))
1561
		cleanup_group_ids(source_mnt, NULL);
1562 1563
 out:
	return err;
1564 1565
}

1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592
/*
 * Lock the mountpoint at @path: takes the dentry's inode i_mutex and
 * namespace_sem (write). If something is already mounted there, @path is
 * advanced to the root of that mount and the attempt is repeated.
 *
 * Returns 0 with both locks held, or -ENOENT (no locks held) if the
 * dentry cannot be mounted on.
 */
static int lock_mount(struct path *path)
{
	struct vfsmount *mnt;

	for (;;) {
		mutex_lock(&path->dentry->d_inode->i_mutex);
		if (unlikely(cant_mount(path->dentry))) {
			mutex_unlock(&path->dentry->d_inode->i_mutex);
			return -ENOENT;
		}
		down_write(&namespace_sem);
		mnt = lookup_mnt(path);
		if (likely(!mnt))
			return 0;

		/* Something is mounted here: step onto its root and retry. */
		up_write(&namespace_sem);
		mutex_unlock(&path->dentry->d_inode->i_mutex);
		path_put(path);
		path->mnt = mnt;
		path->dentry = dget(mnt->mnt_root);
	}
}

static void unlock_mount(struct path *path)
{
	up_write(&namespace_sem);
	mutex_unlock(&path->dentry->d_inode->i_mutex);
}

1593
static int graft_tree(struct mount *mnt, struct path *path)
L
Linus Torvalds 已提交
1594
{
1595
	if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
L
Linus Torvalds 已提交
1596 1597
		return -EINVAL;

1598
	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
1599
	      S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
L
Linus Torvalds 已提交
1600 1601
		return -ENOTDIR;

1602 1603
	if (d_unlinked(path->dentry))
		return -ENOENT;
L
Linus Torvalds 已提交
1604

1605
	return attach_recursive_mnt(mnt, path, NULL);
L
Linus Torvalds 已提交
1606 1607
}

1608 1609 1610 1611 1612 1613
/*
 * Sanity check the flags to change_mnt_propagation.
 */

static int flags_to_propagation_type(int flags)
{
1614
	int type = flags & ~(MS_REC | MS_SILENT);
1615 1616 1617 1618 1619 1620 1621 1622 1623 1624

	/* Fail if any non-propagation flags are set */
	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		return 0;
	/* Only one propagation flag should be set */
	if (!is_power_of_2(type))
		return 0;
	return type;
}

1625 1626 1627
/*
 * recursively change the type of the mountpoint.
 */
1628
static int do_change_type(struct path *path, int flag)
1629
{
1630
	struct mount *m;
1631
	struct mount *mnt = real_mount(path->mnt);
1632
	int recurse = flag & MS_REC;
1633
	int type;
1634
	int err = 0;
1635

1636
	if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
1637 1638
		return -EPERM;

1639
	if (path->dentry != path->mnt->mnt_root)
1640 1641
		return -EINVAL;

1642 1643 1644 1645
	type = flags_to_propagation_type(flag);
	if (!type)
		return -EINVAL;

1646
	down_write(&namespace_sem);
1647 1648 1649 1650 1651 1652
	if (type == MS_SHARED) {
		err = invent_group_ids(mnt, recurse);
		if (err)
			goto out_unlock;
	}

A
Andi Kleen 已提交
1653
	br_write_lock(&vfsmount_lock);
1654
	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
1655
		change_mnt_propagation(m, type);
A
Andi Kleen 已提交
1656
	br_write_unlock(&vfsmount_lock);
1657 1658

 out_unlock:
1659
	up_write(&namespace_sem);
1660
	return err;
1661 1662
}

L
Linus Torvalds 已提交
1663 1664 1665
/*
 * do loopback mount.
 */
A
Al Viro 已提交
1666
static int do_loopback(struct path *path, const char *old_name,
1667
				int recurse)
L
Linus Torvalds 已提交
1668
{
1669
	LIST_HEAD(umount_list);
1670
	struct path old_path;
1671
	struct mount *mnt = NULL, *old;
1672
	int err = mount_is_safe(path);
L
Linus Torvalds 已提交
1673 1674 1675 1676
	if (err)
		return err;
	if (!old_name || !*old_name)
		return -EINVAL;
1677
	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
L
Linus Torvalds 已提交
1678 1679 1680
	if (err)
		return err;

1681 1682 1683 1684
	err = -EINVAL;
	if (mnt_ns_loop(&old_path))
		goto out; 

1685 1686 1687 1688
	err = lock_mount(path);
	if (err)
		goto out;

1689 1690
	old = real_mount(old_path.mnt);

L
Linus Torvalds 已提交
1691
	err = -EINVAL;
1692
	if (IS_MNT_UNBINDABLE(old))
1693
		goto out2;
R
Ram Pai 已提交
1694

A
Al Viro 已提交
1695
	if (!check_mnt(real_mount(path->mnt)) || !check_mnt(old))
1696
		goto out2;
L
Linus Torvalds 已提交
1697

1698
	if (recurse)
1699
		mnt = copy_tree(old, old_path.dentry, 0);
1700
	else
1701
		mnt = clone_mnt(old, old_path.dentry, 0);
1702

1703 1704 1705 1706
	if (IS_ERR(mnt)) {
		err = PTR_ERR(mnt);
		goto out;
	}
1707

1708
	err = graft_tree(mnt, path);
1709
	if (err) {
A
Andi Kleen 已提交
1710
		br_write_lock(&vfsmount_lock);
1711
		umount_tree(mnt, 0, &umount_list);
A
Andi Kleen 已提交
1712
		br_write_unlock(&vfsmount_lock);
1713
	}
1714 1715 1716
out2:
	unlock_mount(path);
	release_mounts(&umount_list);
1717
out:
1718
	path_put(&old_path);
L
Linus Torvalds 已提交
1719 1720 1721
	return err;
}

1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732
/*
 * Bring the read-only state of @mnt in line with MS_RDONLY in @ms_flags.
 * Returns 0, or the error from mnt_make_readonly().
 */
static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
{
	int readonly_request = (ms_flags & MS_RDONLY) ? 1 : 0;

	/* Nothing to do when the mount is already in the requested state. */
	if (readonly_request == __mnt_is_readonly(mnt))
		return 0;

	if (readonly_request)
		return mnt_make_readonly(real_mount(mnt));

	__mnt_unmake_readonly(real_mount(mnt));
	return 0;
}

L
Linus Torvalds 已提交
1739 1740 1741 1742 1743
/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 */
1744
static int do_remount(struct path *path, int flags, int mnt_flags,
L
Linus Torvalds 已提交
1745 1746 1747
		      void *data)
{
	int err;
1748
	struct super_block *sb = path->mnt->mnt_sb;
A
Al Viro 已提交
1749
	struct mount *mnt = real_mount(path->mnt);
L
Linus Torvalds 已提交
1750 1751 1752 1753

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

A
Al Viro 已提交
1754
	if (!check_mnt(mnt))
L
Linus Torvalds 已提交
1755 1756
		return -EINVAL;

1757
	if (path->dentry != path->mnt->mnt_root)
L
Linus Torvalds 已提交
1758 1759
		return -EINVAL;

1760 1761 1762 1763
	err = security_sb_remount(sb, data);
	if (err)
		return err;

L
Linus Torvalds 已提交
1764
	down_write(&sb->s_umount);
1765
	if (flags & MS_BIND)
1766
		err = change_mount_flags(path->mnt, flags);
A
Al Viro 已提交
1767
	else
1768
		err = do_remount_sb(sb, flags, data, 0);
A
Al Viro 已提交
1769
	if (!err) {
A
Andi Kleen 已提交
1770
		br_write_lock(&vfsmount_lock);
A
Al Viro 已提交
1771 1772
		mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
		mnt->mnt.mnt_flags = mnt_flags;
A
Andi Kleen 已提交
1773
		br_write_unlock(&vfsmount_lock);
A
Al Viro 已提交
1774
	}
L
Linus Torvalds 已提交
1775
	up_write(&sb->s_umount);
1776
	if (!err) {
A
Andi Kleen 已提交
1777
		br_write_lock(&vfsmount_lock);
A
Al Viro 已提交
1778
		touch_mnt_namespace(mnt->mnt_ns);
A
Andi Kleen 已提交
1779
		br_write_unlock(&vfsmount_lock);
1780
	}
L
Linus Torvalds 已提交
1781 1782 1783
	return err;
}

1784
/* Returns 1 if any mount in the tree rooted at @mnt is unbindable, else 0. */
static inline int tree_contains_unbindable(struct mount *mnt)
{
	struct mount *cur = mnt;

	while (cur) {
		if (IS_MNT_UNBINDABLE(cur))
			return 1;
		cur = next_mnt(cur, mnt);
	}
	return 0;
}

A
Al Viro 已提交
1794
static int do_move_mount(struct path *path, const char *old_name)
L
Linus Torvalds 已提交
1795
{
1796
	struct path old_path, parent_path;
1797
	struct mount *p;
1798
	struct mount *old;
L
Linus Torvalds 已提交
1799
	int err = 0;
1800
	if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
L
Linus Torvalds 已提交
1801 1802 1803
		return -EPERM;
	if (!old_name || !*old_name)
		return -EINVAL;
1804
	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
L
Linus Torvalds 已提交
1805 1806 1807
	if (err)
		return err;

1808
	err = lock_mount(path);
1809 1810 1811
	if (err < 0)
		goto out;

A
Al Viro 已提交
1812
	old = real_mount(old_path.mnt);
1813
	p = real_mount(path->mnt);
A
Al Viro 已提交
1814

L
Linus Torvalds 已提交
1815
	err = -EINVAL;
1816
	if (!check_mnt(p) || !check_mnt(old))
L
Linus Torvalds 已提交
1817 1818
		goto out1;

1819
	if (d_unlinked(path->dentry))
R
Ram Pai 已提交
1820
		goto out1;
L
Linus Torvalds 已提交
1821 1822

	err = -EINVAL;
1823
	if (old_path.dentry != old_path.mnt->mnt_root)
R
Ram Pai 已提交
1824
		goto out1;
L
Linus Torvalds 已提交
1825

1826
	if (!mnt_has_parent(old))
R
Ram Pai 已提交
1827
		goto out1;
L
Linus Torvalds 已提交
1828

1829 1830
	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
	      S_ISDIR(old_path.dentry->d_inode->i_mode))
R
Ram Pai 已提交
1831 1832 1833 1834
		goto out1;
	/*
	 * Don't move a mount residing in a shared parent.
	 */
1835
	if (IS_MNT_SHARED(old->mnt_parent))
R
Ram Pai 已提交
1836
		goto out1;
R
Ram Pai 已提交
1837 1838 1839 1840
	/*
	 * Don't move a mount tree containing unbindable mounts to a destination
	 * mount which is shared.
	 */
1841
	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
R
Ram Pai 已提交
1842
		goto out1;
L
Linus Torvalds 已提交
1843
	err = -ELOOP;
1844
	for (; mnt_has_parent(p); p = p->mnt_parent)
1845
		if (p == old)
R
Ram Pai 已提交
1846
			goto out1;
L
Linus Torvalds 已提交
1847

1848
	err = attach_recursive_mnt(old, path, &parent_path);
1849
	if (err)
R
Ram Pai 已提交
1850
		goto out1;
L
Linus Torvalds 已提交
1851 1852 1853

	/* if the mount is moved, it should no longer be expire
	 * automatically */
1854
	list_del_init(&old->mnt_expire);
L
Linus Torvalds 已提交
1855
out1:
1856
	unlock_mount(path);
L
Linus Torvalds 已提交
1857 1858
out:
	if (!err)
1859
		path_put(&parent_path);
1860
	path_put(&old_path);
L
Linus Torvalds 已提交
1861 1862 1863
	return err;
}

1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889
/*
 * Record the subtype (the text after the first '.' in @fstype, or "" if
 * there is no '.') in the superblock of @mnt.
 *
 * Returns @mnt on success. On failure the reference on @mnt is dropped
 * and an ERR_PTR() is returned: -EINVAL for an empty subtype after the
 * '.', -ENOMEM if the string cannot be duplicated.
 */
static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
{
	const char *dot = strchr(fstype, '.');
	const char *subtype = dot ? dot + 1 : "";
	int err;

	if (dot && !subtype[0]) {
		err = -EINVAL;
		goto fail;
	}

	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
	if (!mnt->mnt_sb->s_subtype) {
		err = -ENOMEM;
		goto fail;
	}
	return mnt;

fail:
	mntput(mnt);
	return ERR_PTR(err);
}

/*
 * add a mount into a namespace's mount tree
 */
1890
static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
1891 1892 1893 1894 1895
{
	int err;

	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);

1896 1897 1898
	err = lock_mount(path);
	if (err)
		return err;
1899 1900

	err = -EINVAL;
A
Al Viro 已提交
1901 1902 1903 1904 1905 1906 1907 1908
	if (unlikely(!check_mnt(real_mount(path->mnt)))) {
		/* that's acceptable only for automounts done in private ns */
		if (!(mnt_flags & MNT_SHRINKABLE))
			goto unlock;
		/* ... and for those we'd better have mountpoint still alive */
		if (!real_mount(path->mnt)->mnt_ns)
			goto unlock;
	}
1909 1910 1911

	/* Refuse the same filesystem on the same mount point */
	err = -EBUSY;
1912
	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
1913 1914 1915 1916
	    path->mnt->mnt_root == path->dentry)
		goto unlock;

	err = -EINVAL;
1917
	if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
1918 1919
		goto unlock;

1920
	newmnt->mnt.mnt_flags = mnt_flags;
1921 1922 1923
	err = graft_tree(newmnt, path);

unlock:
1924
	unlock_mount(path);
1925 1926
	return err;
}
A
Al Viro 已提交
1927

L
Linus Torvalds 已提交
1928 1929 1930 1931
/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
1932
static int do_new_mount(struct path *path, const char *fstype, int flags,
A
Al Viro 已提交
1933
			int mnt_flags, const char *name, void *data)
L
Linus Torvalds 已提交
1934
{
1935 1936
	struct file_system_type *type;
	struct user_namespace *user_ns;
L
Linus Torvalds 已提交
1937
	struct vfsmount *mnt;
1938
	int err;
L
Linus Torvalds 已提交
1939

1940
	if (!fstype)
L
Linus Torvalds 已提交
1941 1942 1943
		return -EINVAL;

	/* we need capabilities... */
1944 1945
	user_ns = real_mount(path->mnt)->mnt_ns->user_ns;
	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
L
Linus Torvalds 已提交
1946 1947
		return -EPERM;

1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971
	type = get_fs_type(fstype);
	if (!type)
		return -ENODEV;

	if (user_ns != &init_user_ns) {
		if (!(type->fs_flags & FS_USERNS_MOUNT)) {
			put_filesystem(type);
			return -EPERM;
		}
		/* Only in special cases allow devices from mounts
		 * created outside the initial user namespace.
		 */
		if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
			flags |= MS_NODEV;
			mnt_flags |= MNT_NODEV;
		}
	}

	mnt = vfs_kern_mount(type, flags, name, data);
	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
	    !mnt->mnt_sb->s_subtype)
		mnt = fs_set_subtype(mnt, fstype);

	put_filesystem(type);
L
Linus Torvalds 已提交
1972 1973 1974
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

1975
	err = do_add_mount(real_mount(mnt), path, mnt_flags);
1976 1977 1978
	if (err)
		mntput(mnt);
	return err;
L
Linus Torvalds 已提交
1979 1980
}

1981 1982
int finish_automount(struct vfsmount *m, struct path *path)
{
1983
	struct mount *mnt = real_mount(m);
1984 1985 1986 1987
	int err;
	/* The new mount record should have at least 2 refs to prevent it being
	 * expired before we get a chance to add it
	 */
1988
	BUG_ON(mnt_get_count(mnt) < 2);
1989 1990 1991

	if (m->mnt_sb == path->mnt->mnt_sb &&
	    m->mnt_root == path->dentry) {
A
Al Viro 已提交
1992 1993
		err = -ELOOP;
		goto fail;
1994 1995
	}

1996
	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
A
Al Viro 已提交
1997 1998 1999 2000
	if (!err)
		return 0;
fail:
	/* remove m from any expiration list it may be on */
2001
	if (!list_empty(&mnt->mnt_expire)) {
A
Al Viro 已提交
2002
		down_write(&namespace_sem);
A
Andi Kleen 已提交
2003
		br_write_lock(&vfsmount_lock);
2004
		list_del_init(&mnt->mnt_expire);
A
Andi Kleen 已提交
2005
		br_write_unlock(&vfsmount_lock);
A
Al Viro 已提交
2006
		up_write(&namespace_sem);
2007
	}
A
Al Viro 已提交
2008 2009
	mntput(m);
	mntput(m);
2010 2011 2012
	return err;
}

2013 2014 2015 2016 2017 2018 2019 2020
/**
 * mnt_set_expiry - Put a mount on an expiration list
 * @mnt: The mount to list.
 * @expiry_list: The list to add the mount to.
 */
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
	down_write(&namespace_sem);
A
Andi Kleen 已提交
2021
	br_write_lock(&vfsmount_lock);
2022

2023
	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
2024

A
Andi Kleen 已提交
2025
	br_write_unlock(&vfsmount_lock);
2026 2027 2028 2029
	up_write(&namespace_sem);
}
EXPORT_SYMBOL(mnt_set_expiry);

L
Linus Torvalds 已提交
2030 2031 2032 2033 2034 2035 2036
/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
2037
	struct mount *mnt, *next;
L
Linus Torvalds 已提交
2038
	LIST_HEAD(graveyard);
2039
	LIST_HEAD(umounts);
L
Linus Torvalds 已提交
2040 2041 2042 2043

	if (list_empty(mounts))
		return;

2044
	down_write(&namespace_sem);
A
Andi Kleen 已提交
2045
	br_write_lock(&vfsmount_lock);
L
Linus Torvalds 已提交
2046 2047 2048 2049 2050 2051 2052

	/* extract from the expiration list every vfsmount that matches the
	 * following criteria:
	 * - only referenced by its parent vfsmount
	 * - still marked for expiry (marked on the last call here; marks are
	 *   cleared by mntput())
	 */
2053
	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
2054
		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
2055
			propagate_mount_busy(mnt, 1))
L
Linus Torvalds 已提交
2056
			continue;
2057
		list_move(&mnt->mnt_expire, &graveyard);
L
Linus Torvalds 已提交
2058
	}
2059
	while (!list_empty(&graveyard)) {
2060
		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
A
Al Viro 已提交
2061
		touch_mnt_namespace(mnt->mnt_ns);
2062 2063
		umount_tree(mnt, 1, &umounts);
	}
A
Andi Kleen 已提交
2064
	br_write_unlock(&vfsmount_lock);
2065 2066 2067
	up_write(&namespace_sem);

	release_mounts(&umounts);
T
Trond Myklebust 已提交
2068 2069 2070 2071 2072 2073 2074 2075 2076 2077
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);

/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 */
2078
static int select_submounts(struct mount *parent, struct list_head *graveyard)
T
Trond Myklebust 已提交
2079
{
2080
	struct mount *this_parent = parent;
T
Trond Myklebust 已提交
2081 2082 2083 2084
	struct list_head *next;
	int found = 0;

repeat:
2085
	next = this_parent->mnt_mounts.next;
T
Trond Myklebust 已提交
2086
resume:
2087
	while (next != &this_parent->mnt_mounts) {
T
Trond Myklebust 已提交
2088
		struct list_head *tmp = next;
2089
		struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
T
Trond Myklebust 已提交
2090 2091

		next = tmp->next;
2092
		if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
L
Linus Torvalds 已提交
2093
			continue;
T
Trond Myklebust 已提交
2094 2095 2096
		/*
		 * Descend a level if the d_mounts list is non-empty.
		 */
2097
		if (!list_empty(&mnt->mnt_mounts)) {
T
Trond Myklebust 已提交
2098 2099 2100
			this_parent = mnt;
			goto repeat;
		}
L
Linus Torvalds 已提交
2101

2102
		if (!propagate_mount_busy(mnt, 1)) {
2103
			list_move_tail(&mnt->mnt_expire, graveyard);
T
Trond Myklebust 已提交
2104 2105
			found++;
		}
L
Linus Torvalds 已提交
2106
	}
T
Trond Myklebust 已提交
2107 2108 2109 2110
	/*
	 * All done at this level ... ascend and resume the search
	 */
	if (this_parent != parent) {
2111
		next = this_parent->mnt_child.next;
2112
		this_parent = this_parent->mnt_parent;
T
Trond Myklebust 已提交
2113 2114 2115 2116 2117 2118 2119 2120
		goto resume;
	}
	return found;
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
N
Nick Piggin 已提交
2121 2122
 *
 * vfsmount_lock must be held for write
T
Trond Myklebust 已提交
2123
 */
2124
static void shrink_submounts(struct mount *mnt, struct list_head *umounts)
T
Trond Myklebust 已提交
2125 2126
{
	LIST_HEAD(graveyard);
2127
	struct mount *m;
T
Trond Myklebust 已提交
2128 2129

	/* extract submounts of 'mountpoint' from the expiration list */
2130
	while (select_submounts(mnt, &graveyard)) {
2131
		while (!list_empty(&graveyard)) {
2132
			m = list_first_entry(&graveyard, struct mount,
2133
						mnt_expire);
A
Al Viro 已提交
2134
			touch_mnt_namespace(m->mnt_ns);
E
Eric W. Biederman 已提交
2135
			umount_tree(m, 1, umounts);
2136 2137
		}
	}
L
Linus Torvalds 已提交
2138 2139 2140 2141 2142 2143 2144 2145
}

/*
 * Some copy_from_user() implementations do not return the exact number of
 * bytes remaining to copy on a fault.  But copy_mount_options() requires that.
 * Note that this function differs from copy_from_user() in that it will oops
 * on bad values of `to', rather than returning a short copy.
 */
R
Ram Pai 已提交
2146 2147
static long exact_copy_from_user(void *to, const void __user * from,
				 unsigned long n)
L
Linus Torvalds 已提交
2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167
{
	char *t = to;
	const char __user *f = from;
	char c;

	if (!access_ok(VERIFY_READ, from, n))
		return n;

	while (n) {
		if (__get_user(c, f)) {
			memset(t, 0, n);
			break;
		}
		*t++ = c;
		f++;
		n--;
	}
	return n;
}

R
Ram Pai 已提交
2168
/*
 * Copy up to one page of mount options from user space.
 *
 * @data:  user pointer to the options, may be NULL
 * @where: receives the address of a freshly allocated page holding the
 *         copy, or 0 when @data is NULL or on error
 *
 * Returns 0 on success (including @data == NULL), -ENOMEM when no page
 * could be allocated, or -EFAULT when not a single byte could be copied.
 * Whatever part of the page was not copied is zero-filled.
 */
int copy_mount_options(const void __user * data, unsigned long *where)
{
	unsigned long page;
	unsigned long size;
	int copied;

	*where = 0;
	if (!data)
		return 0;

	page = __get_free_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* We only care that *some* data at the address the user
	 * gave us is valid.  Just in case, we'll zero
	 * the remainder of the page.
	 */
	/* copy_from_user cannot cross TASK_SIZE ! */
	size = TASK_SIZE - (unsigned long)data;
	if (size > PAGE_SIZE)
		size = PAGE_SIZE;

	copied = size - exact_copy_from_user((void *)page, data, size);
	if (!copied) {
		free_page(page);
		return -EFAULT;
	}

	if (copied != PAGE_SIZE)
		memset((char *)page + copied, 0, PAGE_SIZE - copied);

	*where = page;
	return 0;
}

2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217
/*
 * Duplicate a user-space string of at most PAGE_SIZE bytes.
 *
 * @data:  user pointer to the string, may be NULL
 * @where: receives the strndup_user() copy, or NULL when @data is NULL;
 *         left untouched on error
 *
 * Returns 0 on success or the error encoded by strndup_user().
 */
int copy_mount_string(const void __user *data, char **where)
{
	char *copy = NULL;

	if (data) {
		copy = strndup_user(data, PAGE_SIZE);
		if (IS_ERR(copy))
			return PTR_ERR(copy);
	}

	*where = copy;
	return 0;
}

L
Linus Torvalds 已提交
2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231
/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
A
Al Viro 已提交
2232 2233
long do_mount(const char *dev_name, const char *dir_name,
		const char *type_page, unsigned long flags, void *data_page)
L
Linus Torvalds 已提交
2234
{
2235
	struct path path;
L
Linus Torvalds 已提交
2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */

	if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
		return -EINVAL;

	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

2251 2252 2253 2254 2255 2256 2257 2258 2259 2260
	/* ... and get the mountpoint */
	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &path,
				   type_page, flags, data_page);
	if (retval)
		goto dput_out;

2261 2262 2263
	/* Default to relatime unless overriden */
	if (!(flags & MS_NOATIME))
		mnt_flags |= MNT_RELATIME;
M
Matthew Garrett 已提交
2264

L
Linus Torvalds 已提交
2265 2266 2267 2268 2269 2270 2271
	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
2272 2273 2274 2275
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
M
Matthew Garrett 已提交
2276 2277
	if (flags & MS_STRICTATIME)
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
2278 2279
	if (flags & MS_RDONLY)
		mnt_flags |= MNT_READONLY;
2280

A
Al Viro 已提交
2281
	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
M
Matthew Garrett 已提交
2282 2283
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
		   MS_STRICTATIME);
L
Linus Torvalds 已提交
2284 2285

	if (flags & MS_REMOUNT)
2286
		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
L
Linus Torvalds 已提交
2287 2288
				    data_page);
	else if (flags & MS_BIND)
2289
		retval = do_loopback(&path, dev_name, flags & MS_REC);
R
Ram Pai 已提交
2290
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
2291
		retval = do_change_type(&path, flags);
L
Linus Torvalds 已提交
2292
	else if (flags & MS_MOVE)
2293
		retval = do_move_mount(&path, dev_name);
L
Linus Torvalds 已提交
2294
	else
2295
		retval = do_new_mount(&path, type_page, flags, mnt_flags,
L
Linus Torvalds 已提交
2296 2297
				      dev_name, data_page);
dput_out:
2298
	path_put(&path);
L
Linus Torvalds 已提交
2299 2300 2301
	return retval;
}

2302 2303 2304 2305 2306 2307
/*
 * Release a mount namespace structure: drop its reference on the owning
 * user namespace, then free the structure itself.
 */
static void free_mnt_ns(struct mnt_namespace *ns)
{
	struct user_namespace *owner = ns->user_ns;

	put_user_ns(owner);
	kfree(ns);
}

2308 2309 2310 2311 2312 2313 2314 2315 2316
/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops.  Even at an
 * implausible increment rate of 10 GHz a 64-bit counter needs
 * 2^64 / 10^10 seconds (roughly 58 years) to wrap, which is effectively
 * never, so we can ignore the possibility.
 */
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);

2317
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
2318 2319 2320 2321 2322 2323
{
	struct mnt_namespace *new_ns;

	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
	if (!new_ns)
		return ERR_PTR(-ENOMEM);
2324
	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
2325 2326 2327 2328 2329
	atomic_set(&new_ns->count, 1);
	new_ns->root = NULL;
	INIT_LIST_HEAD(&new_ns->list);
	init_waitqueue_head(&new_ns->poll);
	new_ns->event = 0;
2330
	new_ns->user_ns = get_user_ns(user_ns);
2331 2332 2333
	return new_ns;
}

2334 2335 2336 2337
/*
 * Allocate a new namespace structure and populate it with contents
 * copied from the namespace of the passed in task structure.
 */
2338
static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2339
		struct user_namespace *user_ns, struct fs_struct *fs)
L
Linus Torvalds 已提交
2340
{
2341
	struct mnt_namespace *new_ns;
A
Al Viro 已提交
2342
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
2343
	struct mount *p, *q;
2344
	struct mount *old = mnt_ns->root;
2345
	struct mount *new;
2346
	int copy_flags;
L
Linus Torvalds 已提交
2347

2348
	new_ns = alloc_mnt_ns(user_ns);
2349 2350
	if (IS_ERR(new_ns))
		return new_ns;
L
Linus Torvalds 已提交
2351

R
Ram Pai 已提交
2352
	down_write(&namespace_sem);
L
Linus Torvalds 已提交
2353
	/* First pass: copy the tree topology */
2354 2355 2356 2357
	copy_flags = CL_COPY_ALL | CL_EXPIRE;
	if (user_ns != mnt_ns->user_ns)
		copy_flags |= CL_SHARED_TO_SLAVE;
	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
2358
	if (IS_ERR(new)) {
R
Ram Pai 已提交
2359
		up_write(&namespace_sem);
2360
		free_mnt_ns(new_ns);
2361
		return ERR_CAST(new);
L
Linus Torvalds 已提交
2362
	}
2363
	new_ns->root = new;
A
Andi Kleen 已提交
2364
	br_write_lock(&vfsmount_lock);
A
Al Viro 已提交
2365
	list_add_tail(&new_ns->list, &new->mnt_list);
A
Andi Kleen 已提交
2366
	br_write_unlock(&vfsmount_lock);
L
Linus Torvalds 已提交
2367 2368 2369 2370 2371 2372

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace.  We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */
2373
	p = old;
2374
	q = new;
L
Linus Torvalds 已提交
2375
	while (p) {
A
Al Viro 已提交
2376
		q->mnt_ns = new_ns;
L
Linus Torvalds 已提交
2377
		if (fs) {
2378 2379 2380
			if (&p->mnt == fs->root.mnt) {
				fs->root.mnt = mntget(&q->mnt);
				rootmnt = &p->mnt;
L
Linus Torvalds 已提交
2381
			}
2382 2383 2384
			if (&p->mnt == fs->pwd.mnt) {
				fs->pwd.mnt = mntget(&q->mnt);
				pwdmnt = &p->mnt;
L
Linus Torvalds 已提交
2385 2386
			}
		}
2387 2388
		p = next_mnt(p, old);
		q = next_mnt(q, new);
L
Linus Torvalds 已提交
2389
	}
R
Ram Pai 已提交
2390
	up_write(&namespace_sem);
L
Linus Torvalds 已提交
2391 2392

	if (rootmnt)
A
Al Viro 已提交
2393
		mntput(rootmnt);
L
Linus Torvalds 已提交
2394
	if (pwdmnt)
A
Al Viro 已提交
2395
		mntput(pwdmnt);
L
Linus Torvalds 已提交
2396

2397 2398 2399
	return new_ns;
}

2400
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2401
		struct user_namespace *user_ns, struct fs_struct *new_fs)
2402
{
2403
	struct mnt_namespace *new_ns;
2404

2405
	BUG_ON(!ns);
2406
	get_mnt_ns(ns);
2407 2408

	if (!(flags & CLONE_NEWNS))
2409
		return ns;
2410

2411
	new_ns = dup_mnt_ns(ns, user_ns, new_fs);
2412

2413
	put_mnt_ns(ns);
2414
	return new_ns;
L
Linus Torvalds 已提交
2415 2416
}

2417 2418 2419 2420
/**
 * create_mnt_ns - creates a private namespace and adds a root filesystem
 * @mnt: pointer to the new root filesystem mountpoint
 */
A
Al Viro 已提交
2421
static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
2422
{
2423
	struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
2424
	if (!IS_ERR(new_ns)) {
A
Al Viro 已提交
2425 2426
		struct mount *mnt = real_mount(m);
		mnt->mnt_ns = new_ns;
2427
		new_ns->root = mnt;
A
Al Viro 已提交
2428
		list_add(&new_ns->list, &mnt->mnt_list);
2429
	} else {
A
Al Viro 已提交
2430
		mntput(m);
2431 2432 2433 2434
	}
	return new_ns;
}

A
Al Viro 已提交
2435 2436 2437
struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
{
	struct mnt_namespace *ns;
A
Al Viro 已提交
2438
	struct super_block *s;
A
Al Viro 已提交
2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454
	struct path path;
	int err;

	ns = create_mnt_ns(mnt);
	if (IS_ERR(ns))
		return ERR_CAST(ns);

	err = vfs_path_lookup(mnt->mnt_root, mnt,
			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);

	put_mnt_ns(ns);

	if (err)
		return ERR_PTR(err);

	/* trade a vfsmount reference for active sb one */
A
Al Viro 已提交
2455 2456
	s = path.mnt->mnt_sb;
	atomic_inc(&s->s_active);
A
Al Viro 已提交
2457 2458
	mntput(path.mnt);
	/* lock the sucker */
A
Al Viro 已提交
2459
	down_write(&s->s_umount);
A
Al Viro 已提交
2460 2461 2462 2463 2464
	/* ... and return the root of (sub)tree on it */
	return path.dentry;
}
EXPORT_SYMBOL(mount_subtree);

2465 2466
/*
 * mount(2) entry point: copy the type, directory, device name and option
 * page in from user space, hand them to do_mount(), then release the
 * copies.  Returns do_mount()'s result or the error of the first copy
 * that failed.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	struct filename *dir;
	unsigned long options;
	char *fstype;
	char *dev;
	int ret;

	ret = copy_mount_string(type, &fstype);
	if (ret < 0)
		return ret;

	dir = getname(dir_name);
	if (IS_ERR(dir)) {
		ret = PTR_ERR(dir);
		goto free_type;
	}

	ret = copy_mount_string(dev_name, &dev);
	if (ret < 0)
		goto put_dir;

	ret = copy_mount_options(data, &options);
	if (ret < 0)
		goto free_dev;

	ret = do_mount(dev, dir->name, fstype, flags, (void *) options);

	/* unwind in reverse order of acquisition */
	free_page(options);
free_dev:
	kfree(dev);
put_dir:
	putname(dir);
free_type:
	kfree(fstype);
	return ret;
}

A
Al Viro 已提交
2506 2507 2508 2509 2510
/*
 * Return true if path is reachable from root
 *
 * namespace_sem or vfsmount_lock is held
 */
2511
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
A
Al Viro 已提交
2512 2513
			 const struct path *root)
{
2514
	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
2515
		dentry = mnt->mnt_mountpoint;
2516
		mnt = mnt->mnt_parent;
A
Al Viro 已提交
2517
	}
2518
	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
A
Al Viro 已提交
2519 2520 2521 2522 2523
}

int path_is_under(struct path *path1, struct path *path2)
{
	int res;
A
Andi Kleen 已提交
2524
	br_read_lock(&vfsmount_lock);
2525
	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
A
Andi Kleen 已提交
2526
	br_read_unlock(&vfsmount_lock);
A
Al Viro 已提交
2527 2528 2529 2530
	return res;
}
EXPORT_SYMBOL(path_is_under);

L
Linus Torvalds 已提交
2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543
/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root as the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and  must not be on the
 * same file  system as the current process root. The put_old  must  be
 * underneath new_root,  i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
N
Neil Brown 已提交
2544 2545 2546 2547
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
 * in this situation.
 *
L
Linus Torvalds 已提交
2548 2549 2550 2551 2552 2553 2554 2555
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
2556 2557
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
		const char __user *, put_old)
L
Linus Torvalds 已提交
2558
{
2559
	struct path new, old, parent_path, root_parent, root;
2560
	struct mount *new_mnt, *root_mnt;
L
Linus Torvalds 已提交
2561 2562
	int error;

2563
	if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
L
Linus Torvalds 已提交
2564 2565
		return -EPERM;

2566
	error = user_path_dir(new_root, &new);
L
Linus Torvalds 已提交
2567 2568 2569
	if (error)
		goto out0;

2570
	error = user_path_dir(put_old, &old);
L
Linus Torvalds 已提交
2571 2572 2573
	if (error)
		goto out1;

2574
	error = security_sb_pivotroot(&old, &new);
2575 2576
	if (error)
		goto out2;
L
Linus Torvalds 已提交
2577

2578
	get_fs_root(current->fs, &root);
2579 2580 2581 2582
	error = lock_mount(&old);
	if (error)
		goto out3;

L
Linus Torvalds 已提交
2583
	error = -EINVAL;
2584 2585
	new_mnt = real_mount(new.mnt);
	root_mnt = real_mount(root.mnt);
2586 2587 2588
	if (IS_MNT_SHARED(real_mount(old.mnt)) ||
		IS_MNT_SHARED(new_mnt->mnt_parent) ||
		IS_MNT_SHARED(root_mnt->mnt_parent))
2589
		goto out4;
A
Al Viro 已提交
2590
	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
2591
		goto out4;
L
Linus Torvalds 已提交
2592
	error = -ENOENT;
2593
	if (d_unlinked(new.dentry))
2594
		goto out4;
2595
	if (d_unlinked(old.dentry))
2596
		goto out4;
L
Linus Torvalds 已提交
2597
	error = -EBUSY;
2598 2599
	if (new.mnt == root.mnt ||
	    old.mnt == root.mnt)
2600
		goto out4; /* loop, on the same file system  */
L
Linus Torvalds 已提交
2601
	error = -EINVAL;
2602
	if (root.mnt->mnt_root != root.dentry)
2603
		goto out4; /* not a mountpoint */
2604
	if (!mnt_has_parent(root_mnt))
2605
		goto out4; /* not attached */
2606
	if (new.mnt->mnt_root != new.dentry)
2607
		goto out4; /* not a mountpoint */
2608
	if (!mnt_has_parent(new_mnt))
2609
		goto out4; /* not attached */
2610
	/* make sure we can reach put_old from new_root */
2611
	if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new))
2612
		goto out4;
A
Andi Kleen 已提交
2613
	br_write_lock(&vfsmount_lock);
2614 2615
	detach_mnt(new_mnt, &parent_path);
	detach_mnt(root_mnt, &root_parent);
2616
	/* mount old root on put_old */
2617
	attach_mnt(root_mnt, &old);
2618
	/* mount new_root on / */
2619
	attach_mnt(new_mnt, &root_parent);
2620
	touch_mnt_namespace(current->nsproxy->mnt_ns);
A
Andi Kleen 已提交
2621
	br_write_unlock(&vfsmount_lock);
2622
	chroot_fs_refs(&root, &new);
L
Linus Torvalds 已提交
2623
	error = 0;
2624 2625 2626 2627 2628 2629 2630
out4:
	unlock_mount(&old);
	if (!error) {
		path_put(&root_parent);
		path_put(&parent_path);
	}
out3:
2631
	path_put(&root);
2632
out2:
2633
	path_put(&old);
L
Linus Torvalds 已提交
2634
out1:
2635
	path_put(&new);
L
Linus Torvalds 已提交
2636 2637 2638 2639 2640 2641 2642
out0:
	return error;
}

/*
 * Boot-time setup of the initial mount tree: mount "rootfs", wrap it in
 * the initial mount namespace, hand that namespace to init_task and make
 * the rootfs root the pwd and root of the current task.  Any failure
 * panics.
 */
static void __init init_mount_tree(void)
{
	struct file_system_type *rootfs_type;
	struct mnt_namespace *init_ns;
	struct vfsmount *rootfs_mnt;
	struct path root;

	rootfs_type = get_fs_type("rootfs");
	if (!rootfs_type)
		panic("Can't find rootfs type");

	rootfs_mnt = vfs_kern_mount(rootfs_type, 0, "rootfs", NULL);
	put_filesystem(rootfs_type);
	if (IS_ERR(rootfs_mnt))
		panic("Can't create rootfs");

	init_ns = create_mnt_ns(rootfs_mnt);
	if (IS_ERR(init_ns))
		panic("Can't allocate initial namespace");

	/* init_task holds its own reference on the namespace */
	init_task.nsproxy->mnt_ns = init_ns;
	get_mnt_ns(init_ns);

	root.mnt = rootfs_mnt;
	root.dentry = rootfs_mnt->mnt_root;

	set_fs_pwd(current->fs, &root);
	set_fs_root(current->fs, &root);
}

2669
/*
 * Boot-time initialisation of the mount layer: namespace_sem, the mount
 * slab cache, the one-page mount hash table, vfsmount_lock, sysfs, the
 * "fs" kobject, rootfs and finally the initial mount tree.
 *
 * Failure to allocate the hash table panics; sysfs and kobject failures
 * are only logged.
 */
void __init mnt_init(void)
{
	struct list_head *bucket;
	int err;

	init_rwsem(&namespace_sem);

	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
	if (!mount_hashtable)
		panic("Failed to allocate mount hash table\n");

	printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE);

	/* every bucket starts out as an empty list */
	for (bucket = mount_hashtable;
	     bucket < mount_hashtable + HASH_SIZE; bucket++)
		INIT_LIST_HEAD(bucket);

	br_lock_init(&vfsmount_lock);

	err = sysfs_init();
	if (err)
		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
			__func__, err);

	fs_kobj = kobject_create_and_add("fs", NULL);
	if (!fs_kobj)
		printk(KERN_WARNING "%s: kobj create error\n", __func__);

	init_rootfs();
	init_mount_tree();
}

2702
void put_mnt_ns(struct mnt_namespace *ns)
L
Linus Torvalds 已提交
2703
{
R
Ram Pai 已提交
2704
	LIST_HEAD(umount_list);
2705

2706
	if (!atomic_dec_and_test(&ns->count))
2707
		return;
R
Ram Pai 已提交
2708
	down_write(&namespace_sem);
A
Andi Kleen 已提交
2709
	br_write_lock(&vfsmount_lock);
2710
	umount_tree(ns->root, 0, &umount_list);
A
Andi Kleen 已提交
2711
	br_write_unlock(&vfsmount_lock);
R
Ram Pai 已提交
2712
	up_write(&namespace_sem);
R
Ram Pai 已提交
2713
	release_mounts(&umount_list);
2714
	free_mnt_ns(ns);
L
Linus Torvalds 已提交
2715
}
2716 2717 2718

struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
{
2719 2720 2721 2722 2723 2724 2725
	struct vfsmount *mnt;
	mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
	if (!IS_ERR(mnt)) {
		/*
		 * it is a longterm mount, don't release mnt until
		 * we unmount before file sys is unregistered
		*/
A
Al Viro 已提交
2726
		real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
2727 2728
	}
	return mnt;
2729 2730
}
EXPORT_SYMBOL_GPL(kern_mount_data);
2731 2732 2733 2734 2735

/*
 * Undo kern_mount_data(): clear the mount's namespace tag under
 * vfsmount_lock and drop the reference.  NULL and ERR_PTR() values of
 * @mnt are ignored.
 */
void kern_unmount(struct vfsmount *mnt)
{
	if (IS_ERR_OR_NULL(mnt))
		return;

	/* release long term mount so mount point can be released */
	br_write_lock(&vfsmount_lock);
	real_mount(mnt)->mnt_ns = NULL;
	br_write_unlock(&vfsmount_lock);
	mntput(mnt);
}
EXPORT_SYMBOL(kern_unmount);
2743 2744 2745

/*
 * Report the result of check_mnt() for the struct mount behind @mnt.
 */
bool our_mnt(struct vfsmount *mnt)
{
	struct mount *m = real_mount(mnt);

	return check_mnt(m);
}
2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775

/*
 * proc_ns_operations ->get: return the mount namespace of @task with a
 * reference taken, or NULL when the task has no nsproxy.
 */
static void *mntns_get(struct task_struct *task)
{
	struct mnt_namespace *found = NULL;
	struct nsproxy *proxy;

	rcu_read_lock();
	proxy = task_nsproxy(task);
	if (proxy) {
		found = proxy->mnt_ns;
		get_mnt_ns(found);
	}
	rcu_read_unlock();

	return found;
}

/*
 * proc_ns_operations ->put: drop a mount namespace reference.
 */
static void mntns_put(void *ns)
{
	struct mnt_namespace *mnt_ns = ns;

	put_mnt_ns(mnt_ns);
}

static int mntns_install(struct nsproxy *nsproxy, void *ns)
{
	struct fs_struct *fs = current->fs;
	struct mnt_namespace *mnt_ns = ns;
	struct path root;

2776 2777
	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
	    !nsown_capable(CAP_SYS_CHROOT))
2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808
		return -EINVAL;

	if (fs->users != 1)
		return -EINVAL;

	get_mnt_ns(mnt_ns);
	put_mnt_ns(nsproxy->mnt_ns);
	nsproxy->mnt_ns = mnt_ns;

	/* Find the root */
	root.mnt    = &mnt_ns->root->mnt;
	root.dentry = mnt_ns->root->mnt.mnt_root;
	path_get(&root);
	while(d_mountpoint(root.dentry) && follow_down_one(&root))
		;

	/* Update the pwd and root */
	set_fs_pwd(fs, &root);
	set_fs_root(fs, &root);

	path_put(&root);
	return 0;
}

/*
 * proc_ns_operations table for the mount namespace: named "mnt", selected
 * by CLONE_NEWNS, and served by the mntns_get/mntns_put/mntns_install
 * callbacks defined above.
 */
const struct proc_ns_operations mntns_operations = {
	.name		= "mnt",
	.type		= CLONE_NEWNS,
	.get		= mntns_get,
	.put		= mntns_put,
	.install	= mntns_install,
};