namei.c 114.0 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
18
#include <linux/export.h>
19
#include <linux/kernel.h>
L
Linus Torvalds 已提交
20 21 22 23
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
R
Robert Love 已提交
24
#include <linux/fsnotify.h>
L
Linus Torvalds 已提交
25 26
#include <linux/personality.h>
#include <linux/security.h>
M
Mimi Zohar 已提交
27
#include <linux/ima.h>
L
Linus Torvalds 已提交
28 29 30
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
31
#include <linux/capability.h>
32
#include <linux/file.h>
33
#include <linux/fcntl.h>
34
#include <linux/device_cgroup.h>
35
#include <linux/fs_struct.h>
36
#include <linux/posix_acl.h>
37
#include <linux/hash.h>
L
Linus Torvalds 已提交
38 39
#include <asm/uaccess.h>

40
#include "internal.h"
41
#include "mount.h"
42

L
Linus Torvalds 已提交
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
L
Lucas De Marchi 已提交
77
 * the name is a symlink pointing to a non-existent name.
L
Linus Torvalds 已提交
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOs, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
110
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
L
Linus Torvalds 已提交
111 112 113 114 115 116 117 118 119 120
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
121

A
Al Viro 已提交
122
/*
 * Number of name bytes copied in the first attempt of getname_flags(),
 * when the string is stored in ->iname of the same allocation as the
 * struct filename header: PATH_MAX minus the offset of ->iname.
 */
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
123

124
struct filename *
125 126
getname_flags(const char __user *filename, int flags, int *empty)
{
A
Al Viro 已提交
127
	struct filename *result;
128
	char *kname;
A
Al Viro 已提交
129
	int len;
130

131 132 133 134
	result = audit_reusename(filename);
	if (result)
		return result;

135
	result = __getname();
136
	if (unlikely(!result))
137 138
		return ERR_PTR(-ENOMEM);

139 140 141 142
	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
A
Al Viro 已提交
143
	kname = (char *)result->iname;
144
	result->name = kname;
145

A
Al Viro 已提交
146
	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
147
	if (unlikely(len < 0)) {
A
Al Viro 已提交
148 149
		__putname(result);
		return ERR_PTR(len);
150
	}
151

152 153 154 155 156 157
	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
A
Al Viro 已提交
158
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
A
Al Viro 已提交
159
		const size_t size = offsetof(struct filename, iname[1]);
160 161
		kname = (char *)result;

A
Al Viro 已提交
162 163 164 165 166 167
		/*
		 * size is chosen that way we to guarantee that
		 * result->iname[0] is within the same object and that
		 * kname can't be equal to result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
A
Al Viro 已提交
168 169 170
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
171 172
		}
		result->name = kname;
A
Al Viro 已提交
173 174 175 176 177 178 179 180 181 182 183
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
184 185
	}

A
Al Viro 已提交
186
	result->refcnt = 1;
187 188 189
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
190
			*empty = 1;
A
Al Viro 已提交
191 192 193 194
		if (!(flags & LOOKUP_EMPTY)) {
			putname(result);
			return ERR_PTR(-ENOENT);
		}
L
Linus Torvalds 已提交
195
	}
196

197
	result->uptr = filename;
198
	result->aname = NULL;
199 200
	audit_getname(result);
	return result;
L
Linus Torvalds 已提交
201 202
}

203 204
/**
 * getname - copy a pathname from user space
 * @filename: user-space pointer to the pathname
 *
 * Shorthand for getname_flags() with no flags and no "empty" report.
 */
struct filename *
getname(const char __user *filename)
{
	return getname_flags(filename, 0, NULL);
}

209 210 211 212
struct filename *
getname_kernel(const char * filename)
{
	struct filename *result;
213
	int len = strlen(filename) + 1;
214 215 216 217 218

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

219
	if (len <= EMBEDDED_NAME_MAX) {
A
Al Viro 已提交
220
		result->name = (char *)result->iname;
221 222 223 224 225 226 227 228 229 230 231 232 233 234 235
	} else if (len <= PATH_MAX) {
		struct filename *tmp;

		tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
236 237
	result->uptr = NULL;
	result->aname = NULL;
238
	result->refcnt = 1;
239
	audit_getname(result);
240 241 242 243

	return result;
}

244
void putname(struct filename *name)
L
Linus Torvalds 已提交
245
{
246 247 248 249 250
	BUG_ON(name->refcnt <= 0);

	if (--name->refcnt > 0)
		return;

A
Al Viro 已提交
251
	if (name->name != name->iname) {
252 253 254 255
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
L
Linus Torvalds 已提交
256 257
}

258 259
static int check_acl(struct inode *inode, int mask)
{
260
#ifdef CONFIG_FS_POSIX_ACL
261 262 263
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
264 265
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
266
	                return -EAGAIN;
267 268 269
		/* no ->get_acl() calls in RCU mode... */
		if (acl == ACL_NOT_CACHED)
			return -ECHILD;
A
Ari Savolainen 已提交
270
	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
271 272
	}

C
Christoph Hellwig 已提交
273 274 275
	acl = get_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
276 277 278 279 280
	if (acl) {
	        int error = posix_acl_permission(inode, acl, mask);
	        posix_acl_release(acl);
	        return error;
	}
281
#endif
282 283 284 285

	return -EAGAIN;
}

286
/*
287
 * This does the basic permission checking
L
Linus Torvalds 已提交
288
 */
289
static int acl_permission_check(struct inode *inode, int mask)
L
Linus Torvalds 已提交
290
{
291
	unsigned int mode = inode->i_mode;
L
Linus Torvalds 已提交
292

293
	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
L
Linus Torvalds 已提交
294 295
		mode >>= 6;
	else {
296
		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
297
			int error = check_acl(inode, mask);
298 299
			if (error != -EAGAIN)
				return error;
L
Linus Torvalds 已提交
300 301 302 303 304 305 306 307 308
		}

		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/*
	 * If the DACs are ok we don't need any capability check.
	 */
309
	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
L
Linus Torvalds 已提交
310
		return 0;
311 312 313 314
	return -EACCES;
}

/**
315
 * generic_permission -  check for access rights on a Posix-like filesystem
316
 * @inode:	inode to check access rights for
317
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
318 319 320 321
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
322 323 324 325 326
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
327
 */
328
int generic_permission(struct inode *inode, int mask)
329 330 331 332
{
	int ret;

	/*
333
	 * Do the basic permission checks.
334
	 */
335
	ret = acl_permission_check(inode, mask);
336 337
	if (ret != -EACCES)
		return ret;
L
Linus Torvalds 已提交
338

339 340
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
341
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
342 343
			return 0;
		if (!(mask & MAY_WRITE))
344 345
			if (capable_wrt_inode_uidgid(inode,
						     CAP_DAC_READ_SEARCH))
346 347 348
				return 0;
		return -EACCES;
	}
L
Linus Torvalds 已提交
349 350
	/*
	 * Read/write DACs are always overridable.
351 352
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
L
Linus Torvalds 已提交
353
	 */
354
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
355
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
L
Linus Torvalds 已提交
356 357 358 359 360
			return 0;

	/*
	 * Searching includes executable on directories, else just read.
	 */
361
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
362
	if (mask == MAY_READ)
363
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
L
Linus Torvalds 已提交
364 365 366 367
			return 0;

	return -EACCES;
}
368
EXPORT_SYMBOL(generic_permission);
L
Linus Torvalds 已提交
369

370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389
/*
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has no special
 * permission function, use the fast case".
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	if (likely(inode->i_opflags & IOP_FASTPERM))
		return generic_permission(inode, mask);

	if (likely(inode->i_op->permission))
		return inode->i_op->permission(inode, mask);

	/* This gets set once for the inode lifetime */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_FASTPERM;
	spin_unlock(&inode->i_lock);

	return generic_permission(inode, mask);
}

C
Christoph Hellwig 已提交
390
/**
D
David Howells 已提交
391 392 393
 * __inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
C
Christoph Hellwig 已提交
394
 *
D
David Howells 已提交
395
 * Check for read/write/execute permissions on an inode.
396 397
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
D
David Howells 已提交
398 399 400
 *
 * This does not check for a read-only file system.  You probably want
 * inode_permission().
C
Christoph Hellwig 已提交
401
 */
D
David Howells 已提交
402
int __inode_permission(struct inode *inode, int mask)
L
Linus Torvalds 已提交
403
{
404
	int retval;
L
Linus Torvalds 已提交
405

406
	if (unlikely(mask & MAY_WRITE)) {
L
Linus Torvalds 已提交
407 408 409 410 411 412 413
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EACCES;
	}

414
	retval = do_inode_permission(inode, mask);
L
Linus Torvalds 已提交
415 416 417
	if (retval)
		return retval;

418 419 420 421
	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

422
	return security_inode_permission(inode, mask);
L
Linus Torvalds 已提交
423
}
424
EXPORT_SYMBOL(__inode_permission);
L
Linus Torvalds 已提交
425

D
David Howells 已提交
426 427 428
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 * Returns -EROFS for a write request on a regular file, directory or
 * symlink of a read-only superblock, 0 otherwise.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	umode_t mode;

	if (likely(!(mask & MAY_WRITE)))
		return 0;

	if (!(sb->s_flags & MS_RDONLY))
		return 0;

	/* Nobody gets write access to a read-only fs. */
	mode = inode->i_mode;
	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
		return -EROFS;

	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct inode *inode, int mask)
{
	/* Superblock-wide check first, then the per-inode checks. */
	int retval = sb_permission(inode->i_sb, inode, mask);

	if (retval)
		return retval;

	return __inode_permission(inode, mask);
}
EXPORT_SYMBOL(inode_permission);
D
David Howells 已提交
468

J
Jan Blunck 已提交
469 470 471 472 473 474
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
475
void path_get(const struct path *path)
J
Jan Blunck 已提交
476 477 478 479 480 481
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

J
Jan Blunck 已提交
482 483 484 485 486 487
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
488
void path_put(const struct path *path)
L
Linus Torvalds 已提交
489
{
J
Jan Blunck 已提交
490 491
	dput(path->dentry);
	mntput(path->mnt);
L
Linus Torvalds 已提交
492
}
J
Jan Blunck 已提交
493
EXPORT_SYMBOL(path_put);
L
Linus Torvalds 已提交
494

495
#define EMBEDDED_LEVELS 2
496 497
struct nameidata {
	struct path	path;
A
Al Viro 已提交
498
	struct qstr	last;
499 500 501
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags;
502
	unsigned	seq, m_seq;
503 504
	int		last_type;
	unsigned	depth;
505
	int		total_link_count;
506 507 508 509
	struct saved {
		struct path link;
		void *cookie;
		const char *name;
A
Al Viro 已提交
510
		struct inode *inode;
511
		unsigned seq;
512
	} *stack, internal[EMBEDDED_LEVELS];
513 514 515 516
	struct filename	*name;
	struct nameidata *saved;
	unsigned	root_seq;
	int		dfd;
517 518
};

519
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
520
{
521 522
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;
523 524
	p->dfd = dfd;
	p->name = name;
525
	p->total_link_count = old ? old->total_link_count : 0;
526
	p->saved = old;
527
	current->nameidata = p;
528 529
}

530
static void restore_nameidata(void)
531
{
532
	struct nameidata *now = current->nameidata, *old = now->saved;
533 534 535 536 537 538 539

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
	if (now->stack != now->internal) {
		kfree(now->stack);
		now->stack = now->internal;
540 541 542 543 544
	}
}

static int __nd_alloc_stack(struct nameidata *nd)
{
A
Al Viro 已提交
545 546 547 548 549 550 551 552 553
	struct saved *p;

	if (nd->flags & LOOKUP_RCU) {
		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
				  GFP_ATOMIC);
		if (unlikely(!p))
			return -ECHILD;
	} else {
		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
554
				  GFP_KERNEL);
A
Al Viro 已提交
555 556 557
		if (unlikely(!p))
			return -ENOMEM;
	}
558 559 560 561 562 563 564
	memcpy(p, nd->internal, sizeof(nd->internal));
	nd->stack = p;
	return 0;
}

static inline int nd_alloc_stack(struct nameidata *nd)
{
565
	if (likely(nd->depth != EMBEDDED_LEVELS))
566 567 568 569 570 571
		return 0;
	if (likely(nd->stack != nd->internal))
		return 0;
	return __nd_alloc_stack(nd);
}

572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592
/*
 * Call ->put_link() for every stacked symlink that still has a cookie,
 * from the top of the stack downwards, and clear the cookie afterwards.
 */
static void drop_links(struct nameidata *nd)
{
	int i;

	for (i = nd->depth; i > 0; i--) {
		struct saved *last = &nd->stack[i - 1];
		struct inode *inode = last->inode;

		if (!last->cookie || !inode->i_op->put_link)
			continue;

		inode->i_op->put_link(inode, last->cookie);
		last->cookie = NULL;
	}
}

static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);
	if (!(nd->flags & LOOKUP_RCU)) {
		int i;
		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
593 594 595 596
		if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
			path_put(&nd->root);
			nd->root.mnt = NULL;
		}
597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637
	} else {
		nd->flags &= ~LOOKUP_RCU;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		rcu_read_unlock();
	}
	nd->depth = 0;
}

/*
 * Try to take references on @path->mnt and @path->dentry, then check the
 * dentry's d_seq against @seq.  Returns true only if all of that worked.
 *
 * path_put is needed afterwards regardless of success or failure; the
 * fields set to NULL on the failure paths are presumably the ones for
 * which no reference was obtained.
 */
static bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
	int res = __legitimize_mnt(path->mnt, nd->m_seq);

	if (unlikely(res)) {
		if (res > 0)
			path->mnt = NULL;
		goto no_dentry;
	}

	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref)))
		goto no_dentry;

	return !read_seqcount_retry(&path->dentry->d_seq, seq);

no_dentry:
	path->dentry = NULL;
	return false;
}

/*
 * Legitimize every stacked symlink path, bottom to top.  On the first
 * failure the link cookies are dropped and nd->depth is cut down to the
 * entries that were attempted (the failing one included), so that only
 * those get path_put() later.
 */
static bool legitimize_links(struct nameidata *nd)
{
	int i;

	for (i = 0; i < nd->depth; i++) {
		struct saved *last = &nd->stack[i];

		if (likely(legitimize_path(nd, &last->link, last->seq)))
			continue;

		drop_links(nd);
		nd->depth = i + 1;
		return false;
	}

	return true;
}

A
Al Viro 已提交
638
/*
N
Nick Piggin 已提交
639
 * Path walking has 2 modes, rcu-walk and ref-walk (see
A
Al Viro 已提交
640 641 642 643 644 645 646
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
N
Nick Piggin 已提交
647 648 649
 */

/**
A
Al Viro 已提交
650 651 652
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry or NULL
653
 * @seq: seq number to check dentry against
654
 * Returns: 0 on success, -ECHILD on failure
N
Nick Piggin 已提交
655
 *
A
Al Viro 已提交
656 657 658
 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd or NULL.  Must be called from rcu-walk context.
659 660
 * Nothing should touch nameidata between unlazy_walk() failure and
 * terminate_walk().
N
Nick Piggin 已提交
661
 */
662
static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq)
N
Nick Piggin 已提交
663 664 665 666
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
667 668

	nd->flags &= ~LOOKUP_RCU;
669 670 671 672 673 674
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
		goto out2;
	if (unlikely(!lockref_get_not_dead(&parent->d_lockref)))
		goto out1;
A
Al Viro 已提交
675

676 677 678 679 680 681 682 683 684 685 686
	/*
	 * For a negative lookup, the lookup sequence point is the parents
	 * sequence point, and it only needs to revalidate the parent dentry.
	 *
	 * For a positive lookup, we need to move both the parent and the
	 * dentry from the RCU domain to be properly refcounted. And the
	 * sequence number in the dentry validates *both* dentry counters,
	 * since we checked the sequence number of the parent after we got
	 * the child sequence number. So we know the parent must still
	 * be valid if the child sequence number is still valid.
	 */
A
Al Viro 已提交
687
	if (!dentry) {
688 689
		if (read_seqcount_retry(&parent->d_seq, nd->seq))
			goto out;
A
Al Viro 已提交
690 691
		BUG_ON(nd->inode != parent->d_inode);
	} else {
692 693
		if (!lockref_get_not_dead(&dentry->d_lockref))
			goto out;
694
		if (read_seqcount_retry(&dentry->d_seq, seq))
695
			goto drop_dentry;
A
Al Viro 已提交
696
	}
697 698 699 700 701 702

	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
703 704 705 706
		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
			rcu_read_unlock();
			dput(dentry);
			return -ECHILD;
707
		}
N
Nick Piggin 已提交
708 709
	}

A
Al Viro 已提交
710
	rcu_read_unlock();
N
Nick Piggin 已提交
711
	return 0;
A
Al Viro 已提交
712

713
drop_dentry:
A
Al Viro 已提交
714
	rcu_read_unlock();
715
	dput(dentry);
716
	goto drop_root_mnt;
717 718 719 720
out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
721
out:
A
Al Viro 已提交
722
	rcu_read_unlock();
723 724 725
drop_root_mnt:
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
N
Nick Piggin 已提交
726 727 728
	return -ECHILD;
}

729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746
/*
 * Leave rcu-walk mode while holding on to @link.
 *
 * @link is legitimized against @seq first.  If that fails, the walk state
 * is torn down here: link cookies dropped, depth reset, nd->path cleared,
 * nd->root.mnt cleared unless LOOKUP_ROOT is set, and RCU mode left.
 * Otherwise the rest of the state is handed to unlazy_walk().
 *
 * Returns 0 on success.  On any failure the reference state of @link is
 * released with path_put() and -ECHILD is returned.
 */
static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq)
{
	if (unlikely(!legitimize_path(nd, link, seq))) {
		drop_links(nd);
		nd->depth = 0;
		nd->flags &= ~LOOKUP_RCU;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		rcu_read_unlock();
	} else if (likely(unlazy_walk(nd, NULL, 0) == 0)) {
		/* The hint covers the whole comparison: success is expected. */
		return 0;
	}
	path_put(link);
	return -ECHILD;
}

747
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
748
{
749
	return dentry->d_op->d_revalidate(dentry, flags);
750 751
}

752 753 754
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
755
 *
756 757 758 759 760
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
761
 */
762
static int complete_walk(struct nameidata *nd)
763
{
A
Al Viro 已提交
764
	struct dentry *dentry = nd->path.dentry;
765 766
	int status;

767 768 769
	if (nd->flags & LOOKUP_RCU) {
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
770
		if (unlikely(unlazy_walk(nd, NULL, 0)))
771 772 773
			return -ECHILD;
	}

A
Al Viro 已提交
774 775 776
	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

777
	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
778 779
		return 0;

780
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
781 782 783
	if (status > 0)
		return 0;

A
Al Viro 已提交
784
	if (!status)
785
		status = -ESTALE;
A
Al Viro 已提交
786

787 788 789
	return status;
}

A
Al Viro 已提交
790
static void set_root(struct nameidata *nd)
A
Al Viro 已提交
791
{
792
	get_fs_root(current->fs, &nd->root);
A
Al Viro 已提交
793 794
}

A
Al Viro 已提交
795
static unsigned set_root_rcu(struct nameidata *nd)
N
Nick Piggin 已提交
796
{
797
	struct fs_struct *fs = current->fs;
798
	unsigned seq;
N
Nick Piggin 已提交
799

800 801 802
	do {
		seq = read_seqcount_begin(&fs->seq);
		nd->root = fs->root;
803
		nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
804
	} while (read_seqcount_retry(&fs->seq, seq));
805
	return nd->root_seq;
N
Nick Piggin 已提交
806 807
}

J
Jan Blunck 已提交
808
static void path_put_conditional(struct path *path, struct nameidata *nd)
809 810
{
	dput(path->dentry);
811
	if (path->mnt != nd->path.mnt)
812 813 814
		mntput(path->mnt);
}

815 816
static inline void path_to_nameidata(const struct path *path,
					struct nameidata *nd)
817
{
N
Nick Piggin 已提交
818 819 820 821
	if (!(nd->flags & LOOKUP_RCU)) {
		dput(nd->path.dentry);
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);
822
	}
N
Nick Piggin 已提交
823
	nd->path.mnt = path->mnt;
824
	nd->path.dentry = path->dentry;
825 826
}

C
Christoph Hellwig 已提交
827 828 829 830
/*
 * Helper to directly jump to a known parsed path from ->follow_link,
 * caller must have taken a reference to path beforehand.
 */
831
void nd_jump_link(struct path *path)
C
Christoph Hellwig 已提交
832
{
833
	struct nameidata *nd = current->nameidata;
C
Christoph Hellwig 已提交
834 835 836 837 838 839 840
	path_put(&nd->path);

	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
	nd->flags |= LOOKUP_JUMPED;
}

841
static inline void put_link(struct nameidata *nd)
842
{
A
Al Viro 已提交
843
	struct saved *last = nd->stack + --nd->depth;
A
Al Viro 已提交
844
	struct inode *inode = last->inode;
845
	if (last->cookie && inode->i_op->put_link)
846
		inode->i_op->put_link(inode, last->cookie);
A
Al Viro 已提交
847 848
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
849 850
}

851 852
/*
 * Link-protection switches: may_follow_link() and may_linkat() return 0
 * without checking anything while the corresponding value is zero.
 */
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
K
Kees Cook 已提交
853 854 855

/**
 * may_follow_link - Check symlink following for unsafe situations
856
 * @nd: nameidata pathwalk data
K
Kees Cook 已提交
857 858 859 860 861 862 863 864 865 866 867 868
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
A
Al Viro 已提交
869
static inline int may_follow_link(struct nameidata *nd)
K
Kees Cook 已提交
870 871 872 873 874 875 876 877
{
	const struct inode *inode;
	const struct inode *parent;

	if (!sysctl_protected_symlinks)
		return 0;

	/* Allowed if owner and follower match. */
A
Al Viro 已提交
878
	inode = nd->stack[0].inode;
879
	if (uid_eq(current_cred()->fsuid, inode->i_uid))
K
Kees Cook 已提交
880 881 882 883 884 885 886 887
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
	parent = nd->path.dentry->d_inode;
	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
		return 0;

	/* Allowed if parent directory and link owner match. */
888
	if (uid_eq(parent->i_uid, inode->i_uid))
K
Kees Cook 已提交
889 890
		return 0;

891 892 893
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

A
Al Viro 已提交
894
	audit_log_link_denied("follow_link", &nd->stack[0].link);
K
Kees Cook 已提交
895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958
	return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @inode: the source inode to hardlink from
 *
 * Return false if at least one of the following conditions:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct inode *inode)
{
	const umode_t mode = inode->i_mode;
	const umode_t sgid_exec = S_ISGID | S_IXGRP;

	/*
	 * Special files, setuid files and executable setgid files should
	 * not get pinned to the filesystem.
	 */
	if (!S_ISREG(mode) ||
	    (mode & S_ISUID) ||
	    (mode & sgid_exec) == sgid_exec)
		return false;

	/* Hardlinking to unreadable or unwritable sources is dangerous. */
	return inode_permission(inode, MAY_READ | MAY_WRITE) == 0;
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
 *  - not CAP_FOWNER
 *
 * Returns 0 if successful, -ve on error.
 */
static int may_linkat(struct path *link)
{
	const struct cred *cred;
	struct inode *inode;

	if (!sysctl_protected_hardlinks)
		return 0;

	cred = current_cred();
	inode = link->dentry->d_inode;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
959
	if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
K
Kees Cook 已提交
960 961 962
	    capable(CAP_FOWNER))
		return 0;

963
	audit_log_link_denied("linkat", link);
K
Kees Cook 已提交
964 965 966
	return -EPERM;
}

967 968
static __always_inline
const char *get_link(struct nameidata *nd)
L
Linus Torvalds 已提交
969
{
970
	struct saved *last = nd->stack + nd->depth - 1;
A
Al Viro 已提交
971
	struct dentry *dentry = last->link.dentry;
A
Al Viro 已提交
972
	struct inode *inode = last->inode;
973
	int error;
974
	const char *res;
L
Linus Torvalds 已提交
975

976 977 978 979
	if (!(nd->flags & LOOKUP_RCU)) {
		touch_atime(&last->link);
		cond_resched();
	} else if (atime_needs_update(&last->link, inode)) {
A
Al Viro 已提交
980 981
		if (unlikely(unlazy_walk(nd, NULL, 0)))
			return ERR_PTR(-ECHILD);
982
		touch_atime(&last->link);
A
Al Viro 已提交
983
	}
984

985 986 987
	error = security_inode_follow_link(dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
988
		return ERR_PTR(error);
989

990
	nd->last_type = LAST_BIND;
991 992
	res = inode->i_link;
	if (!res) {
993 994 995 996
		if (nd->flags & LOOKUP_RCU) {
			if (unlikely(unlazy_walk(nd, NULL, 0)))
				return ERR_PTR(-ECHILD);
		}
997
		res = inode->i_op->follow_link(dentry, &last->cookie);
998
		if (IS_ERR_OR_NULL(res)) {
999
			last->cookie = NULL;
1000 1001 1002 1003
			return res;
		}
	}
	if (*res == '/') {
1004
		if (nd->flags & LOOKUP_RCU) {
1005 1006 1007 1008 1009 1010 1011 1012
			struct dentry *d;
			if (!nd->root.mnt)
				set_root_rcu(nd);
			nd->path = nd->root;
			d = nd->path.dentry;
			nd->inode = d->d_inode;
			nd->seq = nd->root_seq;
			if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
1013
				return ERR_PTR(-ECHILD);
1014 1015 1016 1017 1018 1019 1020
		} else {
			if (!nd->root.mnt)
				set_root(nd);
			path_put(&nd->path);
			nd->path = nd->root;
			path_get(&nd->root);
			nd->inode = nd->path.dentry->d_inode;
1021
		}
1022 1023 1024
		nd->flags |= LOOKUP_JUMPED;
		while (unlikely(*++res == '/'))
			;
L
Linus Torvalds 已提交
1025
	}
1026 1027
	if (!*res)
		res = NULL;
1028 1029
	return res;
}
1030

1031 1032 1033 1034 1035 1036 1037 1038 1039 1040
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
A
Al Viro 已提交
1041
int follow_up(struct path *path)
L
Linus Torvalds 已提交
1042
{
1043 1044
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
L
Linus Torvalds 已提交
1045
	struct dentry *mountpoint;
N
Nick Piggin 已提交
1046

A
Al Viro 已提交
1047
	read_seqlock_excl(&mount_lock);
1048
	parent = mnt->mnt_parent;
A
Al Viro 已提交
1049
	if (parent == mnt) {
A
Al Viro 已提交
1050
		read_sequnlock_excl(&mount_lock);
L
Linus Torvalds 已提交
1051 1052
		return 0;
	}
1053
	mntget(&parent->mnt);
1054
	mountpoint = dget(mnt->mnt_mountpoint);
A
Al Viro 已提交
1055
	read_sequnlock_excl(&mount_lock);
A
Al Viro 已提交
1056 1057 1058
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
1059
	path->mnt = &parent->mnt;
L
Linus Torvalds 已提交
1060 1061
	return 1;
}
1062
EXPORT_SYMBOL(follow_up);
L
Linus Torvalds 已提交
1063

N
Nick Piggin 已提交
1064
/*
1065 1066 1067
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
L
Linus Torvalds 已提交
1068
 */
1069
static int follow_automount(struct path *path, struct nameidata *nd,
1070
			    bool *need_mntput)
N
Nick Piggin 已提交
1071
{
1072
	struct vfsmount *mnt;
1073
	int err;
1074 1075 1076 1077

	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
		return -EREMOTE;

1078 1079 1080 1081 1082 1083 1084 1085 1086 1087
	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
1088
	 */
1089 1090
	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1091 1092 1093
	    path->dentry->d_inode)
		return -EISDIR;

1094 1095
	nd->total_link_count++;
	if (nd->total_link_count >= 40)
1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108
		return -ELOOP;

	mnt = path->dentry->d_op->d_automount(path);
	if (IS_ERR(mnt)) {
		/*
		 * The filesystem is allowed to return -EISDIR here to indicate
		 * it doesn't want to automount.  For instance, autofs would do
		 * this so that its userspace daemon can mount on this dentry.
		 *
		 * However, we can only permit this if it's a terminal point in
		 * the path being looked up; if it wasn't then the remainder of
		 * the path is inaccessible and we should say so.
		 */
1109
		if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
1110 1111
			return -EREMOTE;
		return PTR_ERR(mnt);
N
Nick Piggin 已提交
1112
	}
1113

1114 1115
	if (!mnt) /* mount collision */
		return 0;
N
Nick Piggin 已提交
1116

1117 1118 1119 1120 1121
	if (!*need_mntput) {
		/* lock_mount() may release path->mnt on error */
		mntget(path->mnt);
		*need_mntput = true;
	}
1122
	err = finish_automount(mnt, path);
1123

1124 1125 1126
	switch (err) {
	case -EBUSY:
		/* Someone else made a mount here whilst we were busy */
1127
		return 0;
1128
	case 0:
1129
		path_put(path);
1130 1131 1132
		path->mnt = mnt;
		path->dentry = dget(mnt->mnt_root);
		return 0;
1133 1134
	default:
		return err;
1135
	}
1136

A
Al Viro 已提交
1137 1138
}

1139 1140
/*
 * Handle a dentry that is managed in some way.
1141
 * - Flagged for transit management (autofs)
1142 1143 1144 1145 1146 1147 1148
 * - Flagged as mountpoint
 * - Flagged as automount point
 *
 * This may only be called in refwalk mode.
 *
 * Serialization is taken care of in namespace.c
 */
1149
static int follow_managed(struct path *path, struct nameidata *nd)
L
Linus Torvalds 已提交
1150
{
1151
	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1152 1153
	unsigned managed;
	bool need_mntput = false;
1154
	int ret = 0;
1155 1156 1157 1158 1159 1160 1161

	/* Given that we're not holding a lock here, we retain the value in a
	 * local variable for each dentry as we look at it so that we don't see
	 * the components of that value change under us */
	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       managed &= DCACHE_MANAGED_DENTRY,
	       unlikely(managed != 0)) {
1162 1163 1164 1165 1166
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
1167
			ret = path->dentry->d_op->d_manage(path->dentry, false);
1168
			if (ret < 0)
1169
				break;
1170 1171
		}

1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186
		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				need_mntput = true;
				continue;
			}

			/* Something is mounted on this dentry in another
			 * namespace and/or whatever was mounted there in this
A
Al Viro 已提交
1187 1188
			 * namespace got unmounted before lookup_mnt() could
			 * get it */
1189 1190 1191 1192
		}

		/* Handle an automount point */
		if (managed & DCACHE_NEED_AUTOMOUNT) {
1193
			ret = follow_automount(path, nd, &need_mntput);
1194
			if (ret < 0)
1195
				break;
1196 1197 1198 1199 1200
			continue;
		}

		/* We didn't change the current path point */
		break;
L
Linus Torvalds 已提交
1201
	}
1202 1203 1204 1205 1206

	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	if (ret == -EISDIR)
		ret = 0;
1207 1208 1209 1210 1211
	if (need_mntput)
		nd->flags |= LOOKUP_JUMPED;
	if (unlikely(ret < 0))
		path_put_conditional(path, nd);
	return ret;
L
Linus Torvalds 已提交
1212 1213
}

1214
int follow_down_one(struct path *path)
L
Linus Torvalds 已提交
1215 1216 1217
{
	struct vfsmount *mounted;

A
Al Viro 已提交
1218
	mounted = lookup_mnt(path);
L
Linus Torvalds 已提交
1219
	if (mounted) {
A
Al Viro 已提交
1220 1221 1222 1223
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
L
Linus Torvalds 已提交
1224 1225 1226 1227
		return 1;
	}
	return 0;
}
1228
EXPORT_SYMBOL(follow_down_one);
L
Linus Torvalds 已提交
1229

1230
static inline int managed_dentry_rcu(struct dentry *dentry)
1231
{
1232 1233
	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
		dentry->d_op->d_manage(dentry, true) : 0;
1234 1235
}

1236
/*
1237 1238
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
1239 1240
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1241
			       struct inode **inode, unsigned *seqp)
1242
{
1243
	for (;;) {
1244
		struct mount *mounted;
1245 1246 1247 1248
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
1249 1250 1251
		switch (managed_dentry_rcu(path->dentry)) {
		case -ECHILD:
		default:
1252
			return false;
1253 1254 1255 1256 1257
		case -EISDIR:
			return true;
		case 0:
			break;
		}
1258 1259

		if (!d_mountpoint(path->dentry))
1260
			return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1261

A
Al Viro 已提交
1262
		mounted = __lookup_mnt(path->mnt, path->dentry);
1263 1264
		if (!mounted)
			break;
1265 1266
		path->mnt = &mounted->mnt;
		path->dentry = mounted->mnt.mnt_root;
1267
		nd->flags |= LOOKUP_JUMPED;
1268
		*seqp = read_seqcount_begin(&path->dentry->d_seq);
1269 1270 1271 1272 1273 1274
		/*
		 * Update the inode too. We don't need to re-check the
		 * dentry sequence number here after this d_inode read,
		 * because a mount-point is always pinned.
		 */
		*inode = path->dentry->d_inode;
1275
	}
1276
	return !read_seqretry(&mount_lock, nd->m_seq) &&
1277
		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1278 1279
}

N
Nick Piggin 已提交
1280 1281
static int follow_dotdot_rcu(struct nameidata *nd)
{
1282
	struct inode *inode = nd->inode;
1283 1284
	if (!nd->root.mnt)
		set_root_rcu(nd);
N
Nick Piggin 已提交
1285

1286
	while (1) {
1287
		if (path_equal(&nd->path, &nd->root))
N
Nick Piggin 已提交
1288 1289 1290 1291 1292 1293
			break;
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
			struct dentry *old = nd->path.dentry;
			struct dentry *parent = old->d_parent;
			unsigned seq;

1294
			inode = parent->d_inode;
N
Nick Piggin 已提交
1295
			seq = read_seqcount_begin(&parent->d_seq);
1296 1297
			if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
				return -ECHILD;
N
Nick Piggin 已提交
1298 1299 1300
			nd->path.dentry = parent;
			nd->seq = seq;
			break;
1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315
		} else {
			struct mount *mnt = real_mount(nd->path.mnt);
			struct mount *mparent = mnt->mnt_parent;
			struct dentry *mountpoint = mnt->mnt_mountpoint;
			struct inode *inode2 = mountpoint->d_inode;
			unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
			if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
				return -ECHILD;
			if (&mparent->mnt == nd->path.mnt)
				break;
			/* we know that mountpoint was pinned */
			nd->path.dentry = mountpoint;
			nd->path.mnt = &mparent->mnt;
			inode = inode2;
			nd->seq = seq;
N
Nick Piggin 已提交
1316 1317
		}
	}
1318
	while (unlikely(d_mountpoint(nd->path.dentry))) {
1319 1320
		struct mount *mounted;
		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
1321 1322
		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
			return -ECHILD;
1323 1324 1325 1326
		if (!mounted)
			break;
		nd->path.mnt = &mounted->mnt;
		nd->path.dentry = mounted->mnt.mnt_root;
1327
		inode = nd->path.dentry->d_inode;
1328 1329
		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
	}
1330
	nd->inode = inode;
N
Nick Piggin 已提交
1331 1332 1333
	return 0;
}

1334 1335 1336 1337 1338
/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 */
1339
int follow_down(struct path *path)
1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358
{
	unsigned managed;
	int ret;

	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held.
		 *
		 * We indicate to the filesystem if someone is trying to mount
		 * something here.  This gives autofs the chance to deny anyone
		 * other than its daemon the right to mount on its
		 * superstructure.
		 *
		 * The filesystem may sleep at this point.
		 */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
1359
			ret = path->dentry->d_op->d_manage(
1360
				path->dentry, false);
1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381
			if (ret < 0)
				return ret == -EISDIR ? 0 : ret;
		}

		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (!mounted)
				break;
			dput(path->dentry);
			mntput(path->mnt);
			path->mnt = mounted;
			path->dentry = dget(mounted->mnt_root);
			continue;
		}

		/* Don't handle automount points here */
		break;
	}
	return 0;
}
1382
EXPORT_SYMBOL(follow_down);
1383

1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399
/*
 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
 */
static void follow_mount(struct path *path)
{
	while (d_mountpoint(path->dentry)) {
		struct vfsmount *mounted = lookup_mnt(path);
		if (!mounted)
			break;
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
	}
}

N
Nick Piggin 已提交
1400
static void follow_dotdot(struct nameidata *nd)
L
Linus Torvalds 已提交
1401
{
1402 1403
	if (!nd->root.mnt)
		set_root(nd);
1404

L
Linus Torvalds 已提交
1405
	while(1) {
1406
		struct dentry *old = nd->path.dentry;
L
Linus Torvalds 已提交
1407

A
Al Viro 已提交
1408 1409
		if (nd->path.dentry == nd->root.dentry &&
		    nd->path.mnt == nd->root.mnt) {
L
Linus Torvalds 已提交
1410 1411
			break;
		}
1412
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
A
Al Viro 已提交
1413 1414
			/* rare case of legitimate dget_parent()... */
			nd->path.dentry = dget_parent(nd->path.dentry);
L
Linus Torvalds 已提交
1415 1416 1417
			dput(old);
			break;
		}
A
Al Viro 已提交
1418
		if (!follow_up(&nd->path))
L
Linus Torvalds 已提交
1419 1420
			break;
	}
A
Al Viro 已提交
1421
	follow_mount(&nd->path);
N
Nick Piggin 已提交
1422
	nd->inode = nd->path.dentry->d_inode;
L
Linus Torvalds 已提交
1423 1424
}

1425
/*
M
Miklos Szeredi 已提交
1426 1427 1428 1429 1430
 * This looks up the name in dcache, possibly revalidates the old dentry and
 * allocates a new one if not found or not valid.  In the need_lookup argument
 * returns whether i_op->lookup is necessary.
 *
 * dir->d_inode->i_mutex must be held
1431
 */
M
Miklos Szeredi 已提交
1432
static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
1433
				    unsigned int flags, bool *need_lookup)
1434 1435
{
	struct dentry *dentry;
M
Miklos Szeredi 已提交
1436
	int error;
1437

M
Miklos Szeredi 已提交
1438 1439 1440
	*need_lookup = false;
	dentry = d_lookup(dir, name);
	if (dentry) {
J
Jeff Layton 已提交
1441
		if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1442
			error = d_revalidate(dentry, flags);
M
Miklos Szeredi 已提交
1443 1444 1445 1446
			if (unlikely(error <= 0)) {
				if (error < 0) {
					dput(dentry);
					return ERR_PTR(error);
1447 1448
				} else {
					d_invalidate(dentry);
M
Miklos Szeredi 已提交
1449 1450 1451 1452 1453 1454
					dput(dentry);
					dentry = NULL;
				}
			}
		}
	}
1455

M
Miklos Szeredi 已提交
1456 1457 1458 1459
	if (!dentry) {
		dentry = d_alloc(dir, name);
		if (unlikely(!dentry))
			return ERR_PTR(-ENOMEM);
1460

M
Miklos Szeredi 已提交
1461
		*need_lookup = true;
1462 1463 1464 1465
	}
	return dentry;
}

1466
/*
1467 1468
 * Call i_op->lookup on the dentry.  The dentry must be negative and
 * unhashed.
M
Miklos Szeredi 已提交
1469 1470
 *
 * dir->d_inode->i_mutex must be held
1471
 */
M
Miklos Szeredi 已提交
1472
static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
1473
				  unsigned int flags)
1474 1475 1476 1477
{
	struct dentry *old;

	/* Don't create child dentry for a dead directory. */
M
Miklos Szeredi 已提交
1478
	if (unlikely(IS_DEADDIR(dir))) {
1479
		dput(dentry);
1480
		return ERR_PTR(-ENOENT);
1481
	}
1482

1483
	old = dir->i_op->lookup(dir, dentry, flags);
1484 1485 1486 1487 1488 1489 1490
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
	return dentry;
}

1491
static struct dentry *__lookup_hash(struct qstr *name,
1492
		struct dentry *base, unsigned int flags)
1493
{
M
Miklos Szeredi 已提交
1494
	bool need_lookup;
1495 1496
	struct dentry *dentry;

1497
	dentry = lookup_dcache(name, base, flags, &need_lookup);
M
Miklos Szeredi 已提交
1498 1499
	if (!need_lookup)
		return dentry;
1500

1501
	return lookup_real(base->d_inode, dentry, flags);
1502 1503
}

L
Linus Torvalds 已提交
1504 1505 1506 1507 1508
/*
 *  It's more convoluted than I'd like it to be, but... it's still fairly
 *  small and for now I'd prefer to have fast path as straight as possible.
 *  It _is_ time-critical.
 */
A
Al Viro 已提交
1509
static int lookup_fast(struct nameidata *nd,
1510 1511
		       struct path *path, struct inode **inode,
		       unsigned *seqp)
L
Linus Torvalds 已提交
1512
{
1513
	struct vfsmount *mnt = nd->path.mnt;
N
Nick Piggin 已提交
1514
	struct dentry *dentry, *parent = nd->path.dentry;
A
Al Viro 已提交
1515 1516
	int need_reval = 1;
	int status = 1;
1517 1518
	int err;

1519 1520 1521 1522 1523
	/*
	 * Rename seqlock is not required here because in the off chance
	 * of a false negative due to a concurrent rename, we're going to
	 * do the non-racy lookup, below.
	 */
N
Nick Piggin 已提交
1524 1525
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
1526
		bool negative;
1527
		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
A
Al Viro 已提交
1528 1529 1530
		if (!dentry)
			goto unlazy;

1531 1532 1533 1534
		/*
		 * This sequence count validates that the inode matches
		 * the dentry name information from lookup.
		 */
1535
		*inode = d_backing_inode(dentry);
1536
		negative = d_is_negative(dentry);
1537 1538
		if (read_seqcount_retry(&dentry->d_seq, seq))
			return -ECHILD;
1539 1540
		if (negative)
			return -ENOENT;
1541 1542 1543 1544 1545 1546 1547 1548

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 *
		 * The memory barrier in read_seqcount_begin of child is
		 *  enough, we can use __read_seqcount_retry here.
		 */
N
Nick Piggin 已提交
1549 1550
		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
			return -ECHILD;
A
Al Viro 已提交
1551

1552
		*seqp = seq;
1553
		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1554
			status = d_revalidate(dentry, nd->flags);
A
Al Viro 已提交
1555 1556 1557 1558 1559
			if (unlikely(status <= 0)) {
				if (status != -ECHILD)
					need_reval = 0;
				goto unlazy;
			}
1560
		}
N
Nick Piggin 已提交
1561 1562
		path->mnt = mnt;
		path->dentry = dentry;
1563
		if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
1564
			return 0;
A
Al Viro 已提交
1565
unlazy:
1566
		if (unlazy_walk(nd, dentry, seq))
A
Al Viro 已提交
1567
			return -ECHILD;
A
Al Viro 已提交
1568
	} else {
A
Al Viro 已提交
1569
		dentry = __d_lookup(parent, &nd->last);
1570
	}
A
Al Viro 已提交
1571

1572 1573 1574
	if (unlikely(!dentry))
		goto need_lookup;

A
Al Viro 已提交
1575
	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1576
		status = d_revalidate(dentry, nd->flags);
A
Al Viro 已提交
1577 1578 1579 1580 1581
	if (unlikely(status <= 0)) {
		if (status < 0) {
			dput(dentry);
			return status;
		}
1582 1583 1584
		d_invalidate(dentry);
		dput(dentry);
		goto need_lookup;
1585
	}
M
Miklos Szeredi 已提交
1586

1587 1588 1589 1590
	if (unlikely(d_is_negative(dentry))) {
		dput(dentry);
		return -ENOENT;
	}
1591 1592
	path->mnt = mnt;
	path->dentry = dentry;
1593
	err = follow_managed(path, nd);
1594
	if (likely(!err))
1595
		*inode = d_backing_inode(path->dentry);
1596
	return err;
1597 1598

need_lookup:
M
Miklos Szeredi 已提交
1599 1600 1601 1602
	return 1;
}

/* Fast lookup failed, do it the slow way */
A
Al Viro 已提交
1603
static int lookup_slow(struct nameidata *nd, struct path *path)
M
Miklos Szeredi 已提交
1604 1605 1606 1607
{
	struct dentry *dentry, *parent;

	parent = nd->path.dentry;
1608 1609 1610
	BUG_ON(nd->inode != parent->d_inode);

	mutex_lock(&parent->d_inode->i_mutex);
A
Al Viro 已提交
1611
	dentry = __lookup_hash(&nd->last, parent, nd->flags);
1612 1613 1614
	mutex_unlock(&parent->d_inode->i_mutex);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);
M
Miklos Szeredi 已提交
1615 1616
	path->mnt = nd->path.mnt;
	path->dentry = dentry;
1617
	return follow_managed(path, nd);
L
Linus Torvalds 已提交
1618 1619
}

1620 1621 1622
static inline int may_lookup(struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU) {
1623
		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1624 1625
		if (err != -ECHILD)
			return err;
1626
		if (unlazy_walk(nd, NULL, 0))
1627 1628
			return -ECHILD;
	}
1629
	return inode_permission(nd->inode, MAY_EXEC);
1630 1631
}

1632 1633 1634 1635
/*
 * Act on a "." or ".." component.  Only LAST_DOTDOT needs any work; it is
 * dispatched to the rcuwalk or refwalk implementation.  Returns 0 or the
 * error from follow_dotdot_rcu().
 */
static inline int handle_dots(struct nameidata *nd, int type)
{
	if (type != LAST_DOTDOT)
		return 0;
	if (nd->flags & LOOKUP_RCU)
		return follow_dotdot_rcu(nd);
	follow_dotdot(nd);
	return 0;
}

1643 1644
static int pick_link(struct nameidata *nd, struct path *link,
		     struct inode *inode, unsigned seq)
1645
{
1646
	int error;
A
Al Viro 已提交
1647
	struct saved *last;
1648
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
1649 1650 1651
		path_to_nameidata(link, nd);
		return -ELOOP;
	}
A
Al Viro 已提交
1652
	if (!(nd->flags & LOOKUP_RCU)) {
1653 1654
		if (link->mnt == nd->path.mnt)
			mntget(link->mnt);
1655
	}
1656 1657
	error = nd_alloc_stack(nd);
	if (unlikely(error)) {
A
Al Viro 已提交
1658 1659 1660 1661 1662 1663 1664 1665 1666
		if (error == -ECHILD) {
			if (unlikely(unlazy_link(nd, link, seq)))
				return -ECHILD;
			error = nd_alloc_stack(nd);
		}
		if (error) {
			path_put(link);
			return error;
		}
1667 1668
	}

1669
	last = nd->stack + nd->depth++;
A
Al Viro 已提交
1670
	last->link = *link;
1671
	last->cookie = NULL;
1672
	last->inode = inode;
1673
	last->seq = seq;
1674 1675 1676
	return 1;
}

1677 1678 1679 1680 1681 1682
/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 */
1683
static inline int should_follow_link(struct nameidata *nd, struct path *link,
1684 1685
				     int follow,
				     struct inode *inode, unsigned seq)
1686
{
1687 1688 1689 1690
	if (likely(!d_is_symlink(link->dentry)))
		return 0;
	if (!follow)
		return 0;
1691
	return pick_link(nd, link, inode, seq);
1692 1693
}

1694 1695 1696
enum {WALK_GET = 1, WALK_PUT = 2};

static int walk_component(struct nameidata *nd, int flags)
1697
{
A
Al Viro 已提交
1698
	struct path path;
1699
	struct inode *inode;
1700
	unsigned seq;
1701 1702 1703 1704 1705 1706
	int err;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
1707 1708 1709 1710 1711 1712
	if (unlikely(nd->last_type != LAST_NORM)) {
		err = handle_dots(nd, nd->last_type);
		if (flags & WALK_PUT)
			put_link(nd);
		return err;
	}
1713
	err = lookup_fast(nd, &path, &inode, &seq);
1714
	if (unlikely(err)) {
M
Miklos Szeredi 已提交
1715
		if (err < 0)
1716
			return err;
M
Miklos Szeredi 已提交
1717

A
Al Viro 已提交
1718
		err = lookup_slow(nd, &path);
M
Miklos Szeredi 已提交
1719
		if (err < 0)
1720
			return err;
M
Miklos Szeredi 已提交
1721

1722
		inode = d_backing_inode(path.dentry);
1723
		seq = 0;	/* we are already out of RCU mode */
1724
		err = -ENOENT;
A
Al Viro 已提交
1725
		if (d_is_negative(path.dentry))
1726
			goto out_path_put;
1727
	}
M
Miklos Szeredi 已提交
1728

1729 1730
	if (flags & WALK_PUT)
		put_link(nd);
1731
	err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq);
1732 1733
	if (unlikely(err))
		return err;
A
Al Viro 已提交
1734
	path_to_nameidata(&path, nd);
1735
	nd->inode = inode;
1736
	nd->seq = seq;
1737
	return 0;
M
Miklos Szeredi 已提交
1738 1739

out_path_put:
A
Al Viro 已提交
1740
	path_to_nameidata(&path, nd);
M
Miklos Szeredi 已提交
1741
	return err;
1742 1743
}

1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

1763
#include <asm/word-at-a-time.h>
1764

1765
#ifdef CONFIG_64BIT

/* Reduce the unsigned long accumulator to 32 bits with hash_64(). */
static inline unsigned int fold_hash(unsigned long hash)
{
	return hash_64(hash, 32);
}

#else	/* 32-bit case */

/* The accumulator is passed through unchanged. */
#define fold_hash(x) (x)

#endif

/*
 * Hash @len bytes of @name one unsigned long at a time.  Every full word
 * is added to the accumulator, which is then multiplied by 9; a trailing
 * partial word is masked down to its valid bytes before being added.
 */
unsigned int full_name_hash(const unsigned char *name, unsigned int len)
{
	unsigned long hash = 0;
	unsigned long word;

	for (;;) {
		word = load_unaligned_zeropad(name);
		if (len < sizeof(unsigned long)) {
			/* Tail: mix in only the bytes that belong to the name. */
			hash += bytemask_from_count(len) & word;
			break;
		}
		hash = (hash + word) * 9;
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
		if (!len)
			break;
	}
	return fold_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);

/*
 * Calculate the length and hash of the path component, and
1803
 * return the "hash_len" as the result.
1804
 */
1805
static inline u64 hash_name(const char *name)
1806
{
1807 1808
	unsigned long a, b, adata, bdata, mask, hash, len;
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
1809 1810 1811 1812 1813 1814

	hash = a = 0;
	len = -sizeof(unsigned long);
	do {
		hash = (hash + a) * 9;
		len += sizeof(unsigned long);
1815
		a = load_unaligned_zeropad(name+len);
1816 1817 1818 1819 1820 1821 1822 1823 1824
		b = a ^ REPEAT_BYTE('/');
	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

	adata = prep_zero_mask(a, adata, &constants);
	bdata = prep_zero_mask(b, bdata, &constants);

	mask = create_zero_mask(adata | bdata);

	hash += a & zero_bytemask(mask);
1825
	len += find_zero(mask);
1826
	return hashlen_create(fold_hash(hash), len);
1827 1828 1829 1830
}

#else

L
Linus Torvalds 已提交
1831 1832 1833 1834 1835 1836 1837
/* Bytewise variant: feed each of the @len bytes to partial_name_hash(). */
unsigned int full_name_hash(const unsigned char *name, unsigned int len)
{
	unsigned long hash = init_name_hash();
	unsigned int i;

	for (i = 0; i < len; i++)
		hash = partial_name_hash(name[i], hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
L
Linus Torvalds 已提交
1839

1840 1841 1842 1843
/*
 * We know there's a real path component here of at least
 * one character.
 */
1844
static inline u64 hash_name(const char *name)
1845 1846 1847 1848 1849 1850 1851 1852 1853 1854
{
	unsigned long hash = init_name_hash();
	unsigned long len = 0, c;

	c = (unsigned char)*name;
	do {
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
	} while (c && c != '/');
1855
	return hashlen_create(end_name_hash(hash), len);
1856 1857
}

1858 1859
#endif

L
Linus Torvalds 已提交
1860 1861
/*
 * Name resolution.
1862 1863
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
L
Linus Torvalds 已提交
1864
 *
1865 1866
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
L
Linus Torvalds 已提交
1867
 */
1868
static int link_path_walk(const char *name, struct nameidata *nd)
L
Linus Torvalds 已提交
1869 1870
{
	int err;
A
Al Viro 已提交
1871

L
Linus Torvalds 已提交
1872 1873 1874
	while (*name=='/')
		name++;
	if (!*name)
1875
		return 0;
L
Linus Torvalds 已提交
1876 1877 1878

	/* At this point we know we have a real path component. */
	for(;;) {
1879
		u64 hash_len;
A
Al Viro 已提交
1880
		int type;
L
Linus Torvalds 已提交
1881

1882
		err = may_lookup(nd);
L
Linus Torvalds 已提交
1883
 		if (err)
1884
			return err;
L
Linus Torvalds 已提交
1885

1886
		hash_len = hash_name(name);
L
Linus Torvalds 已提交
1887

A
Al Viro 已提交
1888
		type = LAST_NORM;
1889
		if (name[0] == '.') switch (hashlen_len(hash_len)) {
A
Al Viro 已提交
1890
			case 2:
1891
				if (name[1] == '.') {
A
Al Viro 已提交
1892
					type = LAST_DOTDOT;
A
Al Viro 已提交
1893 1894
					nd->flags |= LOOKUP_JUMPED;
				}
A
Al Viro 已提交
1895 1896 1897 1898
				break;
			case 1:
				type = LAST_DOT;
		}
1899 1900
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
A
Al Viro 已提交
1901
			nd->flags &= ~LOOKUP_JUMPED;
1902
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1903
				struct qstr this = { { .hash_len = hash_len }, .name = name };
1904
				err = parent->d_op->d_hash(parent, &this);
1905
				if (err < 0)
1906
					return err;
1907 1908
				hash_len = this.hash_len;
				name = this.name;
1909 1910
			}
		}
A
Al Viro 已提交
1911

1912 1913
		nd->last.hash_len = hash_len;
		nd->last.name = name;
1914 1915
		nd->last_type = type;

1916 1917
		name += hashlen_len(hash_len);
		if (!*name)
1918
			goto OK;
1919 1920 1921 1922 1923
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
1924 1925
			name++;
		} while (unlikely(*name == '/'));
1926 1927
		if (unlikely(!*name)) {
OK:
1928
			/* pathname body, done */
1929 1930 1931
			if (!nd->depth)
				return 0;
			name = nd->stack[nd->depth - 1].name;
1932
			/* trailing symlink, done */
1933 1934 1935
			if (!name)
				return 0;
			/* last component of nested symlink */
1936
			err = walk_component(nd, WALK_GET | WALK_PUT);
1937
		} else {
1938
			err = walk_component(nd, WALK_GET);
1939
		}
1940
		if (err < 0)
1941
			return err;
L
Linus Torvalds 已提交
1942

1943
		if (err) {
1944
			const char *s = get_link(nd);
1945

1946 1947
			if (unlikely(IS_ERR(s)))
				return PTR_ERR(s);
1948 1949 1950
			err = 0;
			if (unlikely(!s)) {
				/* jumped */
1951
				put_link(nd);
1952
			} else {
1953 1954 1955
				nd->stack[nd->depth - 1].name = name;
				name = s;
				continue;
1956
			}
N
Nick Piggin 已提交
1957
		}
1958 1959
		if (unlikely(!d_can_lookup(nd->path.dentry)))
			return -ENOTDIR;
L
Linus Torvalds 已提交
1960 1961 1962
	}
}

1963
static const char *path_init(struct nameidata *nd, unsigned flags)
N
Nick Piggin 已提交
1964 1965
{
	int retval = 0;
1966
	const char *s = nd->name->name;
N
Nick Piggin 已提交
1967 1968

	nd->last_type = LAST_ROOT; /* if there are only slashes... */
1969
	nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
N
Nick Piggin 已提交
1970
	nd->depth = 0;
1971
	nd->total_link_count = 0;
1972
	if (flags & LOOKUP_ROOT) {
1973 1974
		struct dentry *root = nd->root.dentry;
		struct inode *inode = root->d_inode;
A
Al Viro 已提交
1975
		if (*s) {
M
Miklos Szeredi 已提交
1976
			if (!d_can_lookup(root))
1977
				return ERR_PTR(-ENOTDIR);
A
Al Viro 已提交
1978 1979
			retval = inode_permission(inode, MAY_EXEC);
			if (retval)
1980
				return ERR_PTR(retval);
A
Al Viro 已提交
1981
		}
1982 1983 1984
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
A
Al Viro 已提交
1985
			rcu_read_lock();
1986
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1987
			nd->root_seq = nd->seq;
A
Al Viro 已提交
1988
			nd->m_seq = read_seqbegin(&mount_lock);
1989 1990 1991
		} else {
			path_get(&nd->path);
		}
1992
		return s;
1993 1994
	}

N
Nick Piggin 已提交
1995 1996
	nd->root.mnt = NULL;

A
Al Viro 已提交
1997
	nd->m_seq = read_seqbegin(&mount_lock);
A
Al Viro 已提交
1998
	if (*s == '/') {
A
Al Viro 已提交
1999
		if (flags & LOOKUP_RCU) {
A
Al Viro 已提交
2000
			rcu_read_lock();
2001
			nd->seq = set_root_rcu(nd);
A
Al Viro 已提交
2002 2003 2004 2005 2006
		} else {
			set_root(nd);
			path_get(&nd->root);
		}
		nd->path = nd->root;
2007
	} else if (nd->dfd == AT_FDCWD) {
A
Al Viro 已提交
2008 2009 2010
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;
N
Nick Piggin 已提交
2011

A
Al Viro 已提交
2012
			rcu_read_lock();
N
Nick Piggin 已提交
2013

A
Al Viro 已提交
2014 2015 2016 2017 2018 2019 2020 2021
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
		}
N
Nick Piggin 已提交
2022
	} else {
2023
		/* Caller must check execute permissions on the starting path component */
2024
		struct fd f = fdget_raw(nd->dfd);
N
Nick Piggin 已提交
2025 2026
		struct dentry *dentry;

2027
		if (!f.file)
2028
			return ERR_PTR(-EBADF);
N
Nick Piggin 已提交
2029

2030
		dentry = f.file->f_path.dentry;
N
Nick Piggin 已提交
2031

A
Al Viro 已提交
2032
		if (*s) {
M
Miklos Szeredi 已提交
2033
			if (!d_can_lookup(dentry)) {
2034
				fdput(f);
2035
				return ERR_PTR(-ENOTDIR);
2036
			}
A
Al Viro 已提交
2037
		}
N
Nick Piggin 已提交
2038

2039
		nd->path = f.file->f_path;
A
Al Viro 已提交
2040
		if (flags & LOOKUP_RCU) {
A
Al Viro 已提交
2041
			rcu_read_lock();
A
Al Viro 已提交
2042 2043
			nd->inode = nd->path.dentry->d_inode;
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
A
Al Viro 已提交
2044
		} else {
2045
			path_get(&nd->path);
A
Al Viro 已提交
2046
			nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2047
		}
A
Al Viro 已提交
2048
		fdput(f);
2049
		return s;
N
Nick Piggin 已提交
2050 2051 2052
	}

	nd->inode = nd->path.dentry->d_inode;
2053
	if (!(flags & LOOKUP_RCU))
2054
		return s;
2055
	if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
2056
		return s;
2057 2058 2059
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
	rcu_read_unlock();
2060
	return ERR_PTR(-ECHILD);
2061 2062
}

2063
static const char *trailing_symlink(struct nameidata *nd)
2064 2065
{
	const char *s;
A
Al Viro 已提交
2066
	int error = may_follow_link(nd);
2067
	if (unlikely(error))
2068
		return ERR_PTR(error);
2069
	nd->flags |= LOOKUP_PARENT;
2070
	nd->stack[0].name = NULL;
2071
	s = get_link(nd);
2072
	return s ? s : "";
2073 2074
}

A
Al Viro 已提交
2075
static inline int lookup_last(struct nameidata *nd)
2076 2077 2078 2079 2080
{
	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

	nd->flags &= ~LOOKUP_PARENT;
2081
	return walk_component(nd,
2082 2083 2084 2085 2086
			nd->flags & LOOKUP_FOLLOW
				? nd->depth
					? WALK_PUT | WALK_GET
					: WALK_GET
				: 0);
2087 2088
}

2089
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2090
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
2091
{
2092
	const char *s = path_init(nd, flags);
2093
	int err;
N
Nick Piggin 已提交
2094

2095 2096
	if (IS_ERR(s))
		return PTR_ERR(s);
2097 2098 2099 2100 2101 2102
	while (!(err = link_path_walk(s, nd))
		&& ((err = lookup_last(nd)) > 0)) {
		s = trailing_symlink(nd);
		if (IS_ERR(s)) {
			err = PTR_ERR(s);
			break;
2103 2104
		}
	}
2105 2106
	if (!err)
		err = complete_walk(nd);
2107

2108 2109
	if (!err && nd->flags & LOOKUP_DIRECTORY)
		if (!d_can_lookup(nd->path.dentry))
A
Al Viro 已提交
2110
			err = -ENOTDIR;
2111 2112 2113 2114 2115 2116
	if (!err) {
		*path = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2117
	return err;
A
Al Viro 已提交
2118
}
N
Nick Piggin 已提交
2119

2120
static int filename_lookup(int dfd, struct filename *name, unsigned flags,
2121
			   struct path *path, struct path *root)
A
Al Viro 已提交
2122
{
2123
	int retval;
2124
	struct nameidata nd;
2125 2126
	if (IS_ERR(name))
		return PTR_ERR(name);
2127 2128 2129 2130
	if (unlikely(root)) {
		nd.root = *root;
		flags |= LOOKUP_ROOT;
	}
2131
	set_nameidata(&nd, dfd, name);
2132
	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
A
Al Viro 已提交
2133
	if (unlikely(retval == -ECHILD))
2134
		retval = path_lookupat(&nd, flags, path);
A
Al Viro 已提交
2135
	if (unlikely(retval == -ESTALE))
2136
		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
N
Nick Piggin 已提交
2137

2138
	if (likely(!retval))
2139
		audit_inode(name, path->dentry, flags & LOOKUP_PARENT);
2140
	restore_nameidata();
2141
	putname(name);
2142
	return retval;
L
Linus Torvalds 已提交
2143 2144
}

2145
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2146
static int path_parentat(struct nameidata *nd, unsigned flags,
2147
				struct path *parent)
2148
{
2149
	const char *s = path_init(nd, flags);
2150 2151 2152 2153
	int err;
	if (IS_ERR(s))
		return PTR_ERR(s);
	err = link_path_walk(s, nd);
2154 2155
	if (!err)
		err = complete_walk(nd);
2156 2157 2158 2159 2160 2161
	if (!err) {
		*parent = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2162 2163 2164
	return err;
}

2165
static struct filename *filename_parentat(int dfd, struct filename *name,
2166 2167
				unsigned int flags, struct path *parent,
				struct qstr *last, int *type)
2168 2169
{
	int retval;
2170
	struct nameidata nd;
2171

2172 2173
	if (IS_ERR(name))
		return name;
2174
	set_nameidata(&nd, dfd, name);
2175
	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2176
	if (unlikely(retval == -ECHILD))
2177
		retval = path_parentat(&nd, flags, parent);
2178
	if (unlikely(retval == -ESTALE))
2179
		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2180 2181 2182 2183
	if (likely(!retval)) {
		*last = nd.last;
		*type = nd.last_type;
		audit_inode(name, parent->dentry, LOOKUP_PARENT);
2184 2185 2186
	} else {
		putname(name);
		name = ERR_PTR(retval);
2187
	}
2188
	restore_nameidata();
2189
	return name;
2190 2191
}

A
Al Viro 已提交
2192 2193
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
2194
{
2195 2196
	struct filename *filename;
	struct dentry *d;
2197 2198
	struct qstr last;
	int type;
2199

2200 2201
	filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
				    &last, &type);
2202 2203
	if (IS_ERR(filename))
		return ERR_CAST(filename);
2204
	if (unlikely(type != LAST_NORM)) {
2205
		path_put(path);
2206 2207
		putname(filename);
		return ERR_PTR(-EINVAL);
A
Al Viro 已提交
2208
	}
2209 2210
	mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	d = __lookup_hash(&last, path->dentry, 0);
A
Al Viro 已提交
2211
	if (IS_ERR(d)) {
2212 2213
		mutex_unlock(&path->dentry->d_inode->i_mutex);
		path_put(path);
A
Al Viro 已提交
2214
	}
2215
	putname(filename);
A
Al Viro 已提交
2216
	return d;
2217 2218
}

A
Al Viro 已提交
2219 2220
int kern_path(const char *name, unsigned int flags, struct path *path)
{
2221 2222
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags, path, NULL);
A
Al Viro 已提交
2223
}
2224
EXPORT_SYMBOL(kern_path);
A
Al Viro 已提交
2225

2226 2227 2228 2229 2230 2231
/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
2232
 * @path: pointer to struct path to fill
2233 2234 2235
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
		    const char *name, unsigned int flags,
2236
		    struct path *path)
2237
{
2238 2239
	struct path root = {.mnt = mnt, .dentry = dentry};
	/* the first argument of filename_lookup() is ignored with root */
2240 2241
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags , path, &root);
2242
}
2243
EXPORT_SYMBOL(vfs_path_lookup);
2244

2245
/**
2246
 * lookup_one_len - filesystem helper to lookup single pathname component
2247 2248 2249 2250
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
2251
 * Note that this routine is purely a helper for filesystem usage and should
2252
 * not be called by generic code.
2253
 */
2254 2255 2256
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
	struct qstr this;
A
Al Viro 已提交
2257
	unsigned int c;
2258
	int err;
2259

2260 2261
	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));

A
Al Viro 已提交
2262 2263
	this.name = name;
	this.len = len;
L
Linus Torvalds 已提交
2264
	this.hash = full_name_hash(name, len);
A
Al Viro 已提交
2265 2266 2267
	if (!len)
		return ERR_PTR(-EACCES);

A
Al Viro 已提交
2268 2269 2270 2271 2272
	if (unlikely(name[0] == '.')) {
		if (len < 2 || (len == 2 && name[1] == '.'))
			return ERR_PTR(-EACCES);
	}

A
Al Viro 已提交
2273 2274 2275 2276 2277
	while (len--) {
		c = *(const unsigned char *)name++;
		if (c == '/' || c == '\0')
			return ERR_PTR(-EACCES);
	}
2278 2279 2280 2281 2282
	/*
	 * See if the low-level filesystem might want
	 * to use its own hash..
	 */
	if (base->d_flags & DCACHE_OP_HASH) {
2283
		int err = base->d_op->d_hash(base, &this);
2284 2285 2286
		if (err < 0)
			return ERR_PTR(err);
	}
2287

2288 2289 2290 2291
	err = inode_permission(base->d_inode, MAY_EXEC);
	if (err)
		return ERR_PTR(err);

2292
	return __lookup_hash(&this, base, 0);
2293
}
2294
EXPORT_SYMBOL(lookup_one_len);
2295

2296 2297
/*
 * user_path_at_empty - look up a userland pathname.
 * @dfd:   directory descriptor (or AT_FDCWD) the name is relative to
 * @name:  pathname in user memory
 * @flags: LOOKUP_* flags, passed both to getname_flags() and to the lookup
 * @path:  filled with the result on success
 * @empty: passed through to getname_flags(); may be NULL
 *
 * Returns 0 on success or a negative error from filename_lookup().
 */
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
		 struct path *path, int *empty)
{
	return filename_lookup(dfd, getname_flags(name, flags, empty),
			       flags, path, NULL);
}

/*
 * user_path_at - user_path_at_empty() without the @empty out-parameter.
 * @dfd:   directory descriptor (or AT_FDCWD) the name is relative to
 * @name:  pathname in user memory
 * @flags: LOOKUP_* flags
 * @path:  filled with the result on success
 *
 * Returns 0 on success or a negative error.
 */
int user_path_at(int dfd, const char __user *name, unsigned flags,
		 struct path *path)
{
	return user_path_at_empty(dfd, name, flags, path, NULL);
}
EXPORT_SYMBOL(user_path_at);

/*
 * NB: most callers don't do anything directly with the reference to the
 *     to struct filename, but the nd->last pointer points into the name string
 *     allocated by getname. So we must hold the reference to it until all
 *     path-walking is complete.
 */
A
Al Viro 已提交
2316
static inline struct filename *
2317 2318 2319 2320
user_path_parent(int dfd, const char __user *path,
		 struct path *parent,
		 struct qstr *last,
		 int *type,
2321
		 unsigned int flags)
2322
{
2323
	/* only LOOKUP_REVAL is allowed in extra flags */
2324 2325
	return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL,
				 parent, last, type);
2326 2327
}

2328
/**
2329
 * mountpoint_last - look up last component for umount
2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355
 * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
 * @path: pointer to container for result
 *
 * This is a special lookup_last function just for umount. In this case, we
 * need to resolve the path without doing any revalidation.
 *
 * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
 * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
 * in almost all cases, this lookup will be served out of the dcache. The only
 * cases where it won't are if nd->last refers to a symlink or the path is
 * bogus and it doesn't exist.
 *
 * Returns:
 * -error: if there was an error during lookup. This includes -ENOENT if the
 *         lookup found a negative dentry. The nd->path reference will also be
 *         put in this case.
 *
 * 0:      if we successfully resolved nd->path and found it to not to be a
 *         symlink that needs to be followed. "path" will also be populated.
 *         The nd->path reference will also be put.
 *
 * 1:      if we successfully resolved nd->last and found it to be a symlink
 *         that needs to be followed. "path" will be populated with the path
 *         to the link, and nd->path will *not* be put.
 */
static int
2356
mountpoint_last(struct nameidata *nd, struct path *path)
2357 2358 2359 2360 2361
{
	int error = 0;
	struct dentry *dentry;
	struct dentry *dir = nd->path.dentry;

2362 2363
	/* If we're in rcuwalk, drop out of it to handle last component */
	if (nd->flags & LOOKUP_RCU) {
2364
		if (unlazy_walk(nd, NULL, 0))
2365
			return -ECHILD;
2366 2367 2368 2369 2370 2371
	}

	nd->flags &= ~LOOKUP_PARENT;

	if (unlikely(nd->last_type != LAST_NORM)) {
		error = handle_dots(nd, nd->last_type);
2372
		if (error)
2373
			return error;
2374 2375
		dentry = dget(nd->path.dentry);
		goto done;
2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387
	}

	mutex_lock(&dir->d_inode->i_mutex);
	dentry = d_lookup(dir, &nd->last);
	if (!dentry) {
		/*
		 * No cached dentry. Mounted dentries are pinned in the cache,
		 * so that means that this dentry is probably a symlink or the
		 * path doesn't actually point to a mounted dentry.
		 */
		dentry = d_alloc(dir, &nd->last);
		if (!dentry) {
2388
			mutex_unlock(&dir->d_inode->i_mutex);
2389
			return -ENOMEM;
2390
		}
2391
		dentry = lookup_real(dir->d_inode, dentry, nd->flags);
2392 2393
		if (IS_ERR(dentry)) {
			mutex_unlock(&dir->d_inode->i_mutex);
2394
			return PTR_ERR(dentry);
2395
		}
2396 2397 2398
	}
	mutex_unlock(&dir->d_inode->i_mutex);

2399
done:
2400
	if (d_is_negative(dentry)) {
2401
		dput(dentry);
2402
		return -ENOENT;
2403
	}
2404 2405
	if (nd->depth)
		put_link(nd);
2406
	path->dentry = dentry;
2407
	path->mnt = nd->path.mnt;
2408 2409
	error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW,
				   d_backing_inode(dentry), 0);
2410
	if (unlikely(error))
2411
		return error;
2412
	mntget(path->mnt);
2413
	follow_mount(path);
2414
	return 0;
2415 2416 2417
}

/**
 * path_mountpoint - look up a path to be umounted
 * @nd:		lookup context
 * @flags:	lookup flags
 * @path:	pointer to container for result
 *
 * Look up the given name, but don't attempt to revalidate the last component.
 * Trailing symlinks are followed for as long as mountpoint_last() returns a
 * positive value.
 * Returns 0 and "path" will be valid on success; Returns error otherwise.
 */
static int
path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
{
	const char *s = path_init(nd, flags);
	int err;

	if (IS_ERR(s))
		return PTR_ERR(s);

	while (!(err = link_path_walk(s, nd)) &&
		(err = mountpoint_last(nd, path)) > 0) {
		s = trailing_symlink(nd);
		if (IS_ERR(s)) {
			err = PTR_ERR(s);
			break;
		}
	}
	terminate_walk(nd);
	return err;
}

static int
2446
filename_mountpoint(int dfd, struct filename *name, struct path *path,
A
Al Viro 已提交
2447 2448
			unsigned int flags)
{
2449
	struct nameidata nd;
2450
	int error;
2451 2452
	if (IS_ERR(name))
		return PTR_ERR(name);
2453
	set_nameidata(&nd, dfd, name);
2454
	error = path_mountpoint(&nd, flags | LOOKUP_RCU, path);
A
Al Viro 已提交
2455
	if (unlikely(error == -ECHILD))
2456
		error = path_mountpoint(&nd, flags, path);
A
Al Viro 已提交
2457
	if (unlikely(error == -ESTALE))
2458
		error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
A
Al Viro 已提交
2459
	if (likely(!error))
2460
		audit_inode(name, path->dentry, 0);
2461
	restore_nameidata();
2462
	putname(name);
A
Al Viro 已提交
2463 2464 2465
	return error;
}

2466
/**
2467
 * user_path_mountpoint_at - lookup a path from userland in order to umount it
2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480
 * @dfd:	directory file descriptor
 * @name:	pathname from userland
 * @flags:	lookup flags
 * @path:	pointer to container to hold result
 *
 * A umount is a special case for path walking. We're not actually interested
 * in the inode in this situation, and ESTALE errors can be a problem. We
 * simply want track down the dentry and vfsmount attached at the mountpoint
 * and avoid revalidating the last component.
 *
 * Returns 0 and populates "path" on success.
 */
int
2481
user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2482 2483
			struct path *path)
{
2484
	return filename_mountpoint(dfd, getname(name), path, flags);
2485 2486
}

A
Al Viro 已提交
2487 2488 2489 2490
/*
 * kern_path_mountpoint - mountpoint lookup for a kernel-space pathname.
 * @dfd:   directory descriptor (or AT_FDCWD) the name is relative to
 * @name:  kernel-space pathname
 * @path:  filled with the result on success
 * @flags: LOOKUP_* flags
 *
 * Returns 0 on success or a negative error from filename_mountpoint().
 */
int
kern_path_mountpoint(int dfd, const char *name, struct path *path,
			unsigned int flags)
{
	return filename_mountpoint(dfd, getname_kernel(name), path, flags);
}
EXPORT_SYMBOL(kern_path_mountpoint);

int __check_sticky(struct inode *dir, struct inode *inode)
L
Linus Torvalds 已提交
2496
{
2497
	kuid_t fsuid = current_fsuid();
2498

2499
	if (uid_eq(inode->i_uid, fsuid))
L
Linus Torvalds 已提交
2500
		return 0;
2501
	if (uid_eq(dir->i_uid, fsuid))
L
Linus Torvalds 已提交
2502
		return 0;
2503
	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
L
Linus Torvalds 已提交
2504
}
M
Miklos Szeredi 已提交
2505
EXPORT_SYMBOL(__check_sticky);
L
Linus Torvalds 已提交
2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525

/*
 *	Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do antyhing with
 *     links pointing to it.
 *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 *  9. We can't remove a root or mountpoint.
 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 */
2526
static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
L
Linus Torvalds 已提交
2527
{
2528
	struct inode *inode = d_backing_inode(victim);
L
Linus Torvalds 已提交
2529 2530
	int error;

2531
	if (d_is_negative(victim))
L
Linus Torvalds 已提交
2532
		return -ENOENT;
2533
	BUG_ON(!inode);
L
Linus Torvalds 已提交
2534 2535

	BUG_ON(victim->d_parent->d_inode != dir);
2536
	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
L
Linus Torvalds 已提交
2537

2538
	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2539 2540 2541 2542
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;
2543 2544 2545

	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
L
Linus Torvalds 已提交
2546 2547
		return -EPERM;
	if (isdir) {
M
Miklos Szeredi 已提交
2548
		if (!d_is_dir(victim))
L
Linus Torvalds 已提交
2549 2550 2551
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
M
Miklos Szeredi 已提交
2552
	} else if (d_is_dir(victim))
L
Linus Torvalds 已提交
2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}

/*	Check whether we can create an object with dentry child in directory
 *  dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
 *  3. We should have write and exec permissions on dir
 *  4. We can't do it if dir is immutable (done in permission())
 */
2569
static inline int may_create(struct inode *dir, struct dentry *child)
L
Linus Torvalds 已提交
2570
{
2571
	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
L
Linus Torvalds 已提交
2572 2573 2574 2575
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
2576
	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2577 2578 2579 2580 2581 2582 2583 2584 2585 2586
}

/*
 * p1 and p2 should be directories on the same fs.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p;

	if (p1 == p2) {
I
Ingo Molnar 已提交
2587
		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
L
Linus Torvalds 已提交
2588 2589 2590
		return NULL;
	}

2591
	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2592

2593 2594 2595 2596 2597
	p = d_ancestor(p2, p1);
	if (p) {
		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
		return p;
L
Linus Torvalds 已提交
2598 2599
	}

2600 2601 2602 2603 2604
	p = d_ancestor(p1, p2);
	if (p) {
		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
		return p;
L
Linus Torvalds 已提交
2605 2606
	}

I
Ingo Molnar 已提交
2607
	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
2608
	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2);
L
Linus Torvalds 已提交
2609 2610
	return NULL;
}
2611
EXPORT_SYMBOL(lock_rename);
L
Linus Torvalds 已提交
2612 2613 2614

void unlock_rename(struct dentry *p1, struct dentry *p2)
{
2615
	mutex_unlock(&p1->d_inode->i_mutex);
L
Linus Torvalds 已提交
2616
	if (p1 != p2) {
2617
		mutex_unlock(&p2->d_inode->i_mutex);
2618
		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2619 2620
	}
}
2621
EXPORT_SYMBOL(unlock_rename);
L
Linus Torvalds 已提交
2622

A
Al Viro 已提交
2623
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
A
Al Viro 已提交
2624
		bool want_excl)
L
Linus Torvalds 已提交
2625
{
2626
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
2627 2628 2629
	if (error)
		return error;

A
Al Viro 已提交
2630
	if (!dir->i_op->create)
L
Linus Torvalds 已提交
2631 2632 2633 2634 2635 2636
		return -EACCES;	/* shouldn't it be ENOSYS? */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
A
Al Viro 已提交
2637
	error = dir->i_op->create(dir, dentry, mode, want_excl);
2638
	if (!error)
2639
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
2640 2641
	return error;
}
2642
EXPORT_SYMBOL(vfs_create);
L
Linus Torvalds 已提交
2643

A
Al Viro 已提交
2644
static int may_open(struct path *path, int acc_mode, int flag)
L
Linus Torvalds 已提交
2645
{
2646
	struct dentry *dentry = path->dentry;
L
Linus Torvalds 已提交
2647 2648 2649
	struct inode *inode = dentry->d_inode;
	int error;

A
Al Viro 已提交
2650 2651 2652 2653
	/* O_PATH? */
	if (!acc_mode)
		return 0;

L
Linus Torvalds 已提交
2654 2655 2656
	if (!inode)
		return -ENOENT;

C
Christoph Hellwig 已提交
2657 2658
	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
L
Linus Torvalds 已提交
2659
		return -ELOOP;
C
Christoph Hellwig 已提交
2660 2661 2662 2663 2664 2665
	case S_IFDIR:
		if (acc_mode & MAY_WRITE)
			return -EISDIR;
		break;
	case S_IFBLK:
	case S_IFCHR:
2666
		if (path->mnt->mnt_flags & MNT_NODEV)
L
Linus Torvalds 已提交
2667
			return -EACCES;
C
Christoph Hellwig 已提交
2668 2669 2670
		/*FALLTHRU*/
	case S_IFIFO:
	case S_IFSOCK:
L
Linus Torvalds 已提交
2671
		flag &= ~O_TRUNC;
C
Christoph Hellwig 已提交
2672
		break;
2673
	}
2674

2675
	error = inode_permission(inode, acc_mode);
2676 2677
	if (error)
		return error;
M
Mimi Zohar 已提交
2678

L
Linus Torvalds 已提交
2679 2680 2681 2682
	/*
	 * An append-only file must be opened in append mode for writing.
	 */
	if (IS_APPEND(inode)) {
2683
		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2684
			return -EPERM;
L
Linus Torvalds 已提交
2685
		if (flag & O_TRUNC)
2686
			return -EPERM;
L
Linus Torvalds 已提交
2687 2688 2689
	}

	/* O_NOATIME can only be set by the owner or superuser */
2690
	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2691
		return -EPERM;
L
Linus Torvalds 已提交
2692

2693
	return 0;
2694
}
L
Linus Torvalds 已提交
2695

2696
static int handle_truncate(struct file *filp)
2697
{
2698
	struct path *path = &filp->f_path;
2699 2700 2701 2702 2703 2704 2705
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;
	/*
	 * Refuse to truncate files with mandatory locks held on them.
	 */
2706
	error = locks_verify_locked(filp);
2707
	if (!error)
2708
		error = security_path_truncate(path);
2709 2710 2711
	if (!error) {
		error = do_truncate(path->dentry, 0,
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
2712
				    filp);
2713 2714
	}
	put_write_access(inode);
M
Mimi Zohar 已提交
2715
	return error;
L
Linus Torvalds 已提交
2716 2717
}

2718 2719
/*
 * open_to_namei_flags - adjust open(2) flags for use by the lookup code.
 *
 * When both access-mode bits are set (access mode 3) the value is
 * decremented, turning the access mode into 2; every other flag value is
 * returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	if ((flag & O_ACCMODE) == 3)
		flag--;
	return flag;
}

/*
 * may_o_create - permission checks before creating @dentry in @dir.
 *
 * Runs, in order, security_path_mknod(), the MAY_WRITE | MAY_EXEC
 * permission check on the directory inode and security_inode_create().
 * Returns 0 if all pass, otherwise the first error.
 */
static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
{
	int err;

	err = security_path_mknod(dir, dentry, mode, 0);
	if (!err)
		err = inode_permission(dir->dentry->d_inode,
				       MAY_WRITE | MAY_EXEC);
	if (err)
		return err;

	return security_inode_create(dir->dentry->d_inode, dentry, mode);
}

2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Returns 0 if successful.  The file will have been created and attached to
 * @file by the filesystem calling finish_open().
 *
 * Returns 1 if the file was looked up only or didn't need creating.  The
 * caller will need to perform the open themselves.  @path will have been
 * updated to point to the new dentry.  This may be negative.
 *
 * Returns an error code otherwise.
 */
2751 2752 2753
static int atomic_open(struct nameidata *nd, struct dentry *dentry,
			struct path *path, struct file *file,
			const struct open_flags *op,
2754
			bool got_write, bool need_lookup,
2755
			int *opened)
M
Miklos Szeredi 已提交
2756 2757 2758 2759 2760 2761 2762 2763
{
	struct inode *dir =  nd->path.dentry->d_inode;
	unsigned open_flag = open_to_namei_flags(op->open_flag);
	umode_t mode;
	int error;
	int acc_mode;
	int create_error = 0;
	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
2764
	bool excl;
M
Miklos Szeredi 已提交
2765 2766 2767 2768 2769

	BUG_ON(dentry->d_inode);

	/* Don't create child dentry for a dead directory. */
	if (unlikely(IS_DEADDIR(dir))) {
2770
		error = -ENOENT;
M
Miklos Szeredi 已提交
2771 2772 2773
		goto out;
	}

2774
	mode = op->mode;
M
Miklos Szeredi 已提交
2775 2776 2777
	if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
		mode &= ~current_umask();

2778 2779
	excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT);
	if (excl)
M
Miklos Szeredi 已提交
2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790
		open_flag &= ~O_TRUNC;

	/*
	 * Checking write permission is tricky, bacuse we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists.  But checking existence breaks atomicity.  The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returing the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
	 */
2791 2792 2793
	if (((open_flag & (O_CREAT | O_TRUNC)) ||
	    (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) {
		if (!(open_flag & O_CREAT)) {
M
Miklos Szeredi 已提交
2794 2795 2796 2797 2798 2799 2800
			/*
			 * No O_CREATE -> atomicity not a requirement -> fall
			 * back to lookup + open
			 */
			goto no_open;
		} else if (open_flag & (O_EXCL | O_TRUNC)) {
			/* Fall back and fail with the right error */
2801
			create_error = -EROFS;
M
Miklos Szeredi 已提交
2802 2803 2804
			goto no_open;
		} else {
			/* No side effects, safe to clear O_CREAT */
2805
			create_error = -EROFS;
M
Miklos Szeredi 已提交
2806 2807 2808 2809 2810
			open_flag &= ~O_CREAT;
		}
	}

	if (open_flag & O_CREAT) {
2811
		error = may_o_create(&nd->path, dentry, mode);
M
Miklos Szeredi 已提交
2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822
		if (error) {
			create_error = error;
			if (open_flag & O_EXCL)
				goto no_open;
			open_flag &= ~O_CREAT;
		}
	}

	if (nd->flags & LOOKUP_DIRECTORY)
		open_flag |= O_DIRECTORY;

A
Al Viro 已提交
2823 2824 2825
	file->f_path.dentry = DENTRY_NOT_SET;
	file->f_path.mnt = nd->path.mnt;
	error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode,
2826
				      opened);
A
Al Viro 已提交
2827 2828 2829
	if (error < 0) {
		if (create_error && error == -ENOENT)
			error = create_error;
M
Miklos Szeredi 已提交
2830 2831 2832
		goto out;
	}

A
Al Viro 已提交
2833
	if (error) {	/* returned 1, that is */
A
Al Viro 已提交
2834
		if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
2835
			error = -EIO;
M
Miklos Szeredi 已提交
2836 2837
			goto out;
		}
A
Al Viro 已提交
2838
		if (file->f_path.dentry) {
M
Miklos Szeredi 已提交
2839
			dput(dentry);
A
Al Viro 已提交
2840
			dentry = file->f_path.dentry;
M
Miklos Szeredi 已提交
2841
		}
2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854
		if (*opened & FILE_CREATED)
			fsnotify_create(dir, dentry);
		if (!dentry->d_inode) {
			WARN_ON(*opened & FILE_CREATED);
			if (create_error) {
				error = create_error;
				goto out;
			}
		} else {
			if (excl && !(*opened & FILE_CREATED)) {
				error = -EEXIST;
				goto out;
			}
2855
		}
M
Miklos Szeredi 已提交
2856 2857 2858 2859 2860 2861 2862
		goto looked_up;
	}

	/*
	 * We didn't have the inode before the open, so check open permission
	 * here.
	 */
2863 2864 2865 2866 2867 2868
	acc_mode = op->acc_mode;
	if (*opened & FILE_CREATED) {
		WARN_ON(!(open_flag & O_CREAT));
		fsnotify_create(dir, dentry);
		acc_mode = MAY_OPEN;
	}
2869 2870 2871
	error = may_open(&file->f_path, acc_mode, open_flag);
	if (error)
		fput(file);
M
Miklos Szeredi 已提交
2872 2873 2874

out:
	dput(dentry);
2875
	return error;
M
Miklos Szeredi 已提交
2876 2877 2878

no_open:
	if (need_lookup) {
2879
		dentry = lookup_real(dir, dentry, nd->flags);
M
Miklos Szeredi 已提交
2880
		if (IS_ERR(dentry))
2881
			return PTR_ERR(dentry);
M
Miklos Szeredi 已提交
2882 2883 2884 2885

		if (create_error) {
			int open_flag = op->open_flag;

2886
			error = create_error;
M
Miklos Szeredi 已提交
2887 2888 2889 2890 2891 2892
			if ((open_flag & O_EXCL)) {
				if (!dentry->d_inode)
					goto out;
			} else if (!dentry->d_inode) {
				goto out;
			} else if ((open_flag & O_TRUNC) &&
2893
				   d_is_reg(dentry)) {
M
Miklos Szeredi 已提交
2894 2895 2896 2897 2898 2899 2900 2901
				goto out;
			}
			/* will fail later, go on to get the right error */
		}
	}
looked_up:
	path->dentry = dentry;
	path->mnt = nd->path.mnt;
2902
	return 1;
M
Miklos Szeredi 已提交
2903 2904
}

M
Miklos Szeredi 已提交
2905
/*
2906
 * Look up and maybe create and open the last component.
M
Miklos Szeredi 已提交
2907 2908 2909
 *
 * Must be called with i_mutex held on parent.
 *
2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921
 * Returns 0 if the file was successfully atomically created (if necessary) and
 * opened.  In this case the file will be returned attached to @file.
 *
 * Returns 1 if the file was not completely opened at this time, though lookups
 * and creations will have been performed and the dentry returned in @path will
 * be positive upon return if O_CREAT was specified.  If O_CREAT wasn't
 * specified then a negative dentry may be returned.
 *
 * An error code is returned otherwise.
 *
 * FILE_CREATE will be set in @*opened if the dentry was created and will be
 * cleared otherwise prior to returning.
M
Miklos Szeredi 已提交
2922
 */
2923 2924 2925
static int lookup_open(struct nameidata *nd, struct path *path,
			struct file *file,
			const struct open_flags *op,
2926
			bool got_write, int *opened)
M
Miklos Szeredi 已提交
2927 2928
{
	struct dentry *dir = nd->path.dentry;
2929
	struct inode *dir_inode = dir->d_inode;
M
Miklos Szeredi 已提交
2930 2931
	struct dentry *dentry;
	int error;
2932
	bool need_lookup;
M
Miklos Szeredi 已提交
2933

2934
	*opened &= ~FILE_CREATED;
2935
	dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
M
Miklos Szeredi 已提交
2936
	if (IS_ERR(dentry))
2937
		return PTR_ERR(dentry);
M
Miklos Szeredi 已提交
2938

M
Miklos Szeredi 已提交
2939 2940 2941 2942 2943
	/* Cached positive dentry: will open in f_op->open */
	if (!need_lookup && dentry->d_inode)
		goto out_no_open;

	if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
2944
		return atomic_open(nd, dentry, path, file, op, got_write,
2945
				   need_lookup, opened);
M
Miklos Szeredi 已提交
2946 2947
	}

2948 2949 2950
	if (need_lookup) {
		BUG_ON(dentry->d_inode);

2951
		dentry = lookup_real(dir_inode, dentry, nd->flags);
2952
		if (IS_ERR(dentry))
2953
			return PTR_ERR(dentry);
2954 2955
	}

M
Miklos Szeredi 已提交
2956 2957 2958 2959 2960 2961 2962 2963 2964 2965
	/* Negative dentry, just create the file */
	if (!dentry->d_inode && (op->open_flag & O_CREAT)) {
		umode_t mode = op->mode;
		if (!IS_POSIXACL(dir->d_inode))
			mode &= ~current_umask();
		/*
		 * This write is needed to ensure that a
		 * rw->ro transition does not occur between
		 * the time when the file is created and when
		 * a permanent write count is taken through
2966
		 * the 'struct file' in finish_open().
M
Miklos Szeredi 已提交
2967
		 */
2968 2969
		if (!got_write) {
			error = -EROFS;
M
Miklos Szeredi 已提交
2970
			goto out_dput;
2971
		}
2972
		*opened |= FILE_CREATED;
M
Miklos Szeredi 已提交
2973 2974 2975
		error = security_path_mknod(&nd->path, dentry, mode, 0);
		if (error)
			goto out_dput;
A
Al Viro 已提交
2976 2977
		error = vfs_create(dir->d_inode, dentry, mode,
				   nd->flags & LOOKUP_EXCL);
M
Miklos Szeredi 已提交
2978 2979 2980
		if (error)
			goto out_dput;
	}
M
Miklos Szeredi 已提交
2981
out_no_open:
M
Miklos Szeredi 已提交
2982 2983
	path->dentry = dentry;
	path->mnt = nd->path.mnt;
2984
	return 1;
M
Miklos Szeredi 已提交
2985 2986 2987

out_dput:
	dput(dentry);
2988
	return error;
M
Miklos Szeredi 已提交
2989 2990
}

N
Nick Piggin 已提交
2991
/*
2992
 * Handle the last step of open()
N
Nick Piggin 已提交
2993
 */
2994
static int do_last(struct nameidata *nd,
2995
		   struct file *file, const struct open_flags *op,
A
Al Viro 已提交
2996
		   int *opened)
2997
{
2998
	struct dentry *dir = nd->path.dentry;
2999
	int open_flag = op->open_flag;
M
Miklos Szeredi 已提交
3000
	bool will_truncate = (open_flag & O_TRUNC) != 0;
3001
	bool got_write = false;
A
Al Viro 已提交
3002
	int acc_mode = op->acc_mode;
3003
	unsigned seq;
3004
	struct inode *inode;
3005
	struct path save_parent = { .dentry = NULL, .mnt = NULL };
3006
	struct path path;
3007
	bool retried = false;
A
Al Viro 已提交
3008
	int error;
3009

3010 3011 3012
	nd->flags &= ~LOOKUP_PARENT;
	nd->flags |= op->intent;

3013
	if (nd->last_type != LAST_NORM) {
3014
		error = handle_dots(nd, nd->last_type);
3015
		if (unlikely(error))
3016
			return error;
M
Miklos Szeredi 已提交
3017
		goto finish_open;
3018
	}
3019

3020
	if (!(open_flag & O_CREAT)) {
3021 3022 3023
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		/* we _can_ be in RCU mode here */
3024
		error = lookup_fast(nd, &path, &inode, &seq);
3025 3026 3027 3028
		if (likely(!error))
			goto finish_lookup;

		if (error < 0)
3029
			return error;
3030 3031

		BUG_ON(nd->inode != dir->d_inode);
3032 3033 3034 3035 3036 3037 3038 3039
	} else {
		/* create side of things */
		/*
		 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
		 * has been cleared when we got to the last component we are
		 * about to look up
		 */
		error = complete_walk(nd);
3040
		if (error)
3041
			return error;
3042

A
Al Viro 已提交
3043
		audit_inode(nd->name, dir, LOOKUP_PARENT);
3044
		/* trailing slashes? */
3045 3046
		if (unlikely(nd->last.name[nd->last.len]))
			return -EISDIR;
3047
	}
A
Al Viro 已提交
3048

3049
retry_lookup:
3050 3051 3052 3053 3054 3055 3056 3057 3058 3059
	if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
		error = mnt_want_write(nd->path.mnt);
		if (!error)
			got_write = true;
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
3060
	mutex_lock(&dir->d_inode->i_mutex);
3061
	error = lookup_open(nd, &path, file, op, got_write, opened);
M
Miklos Szeredi 已提交
3062
	mutex_unlock(&dir->d_inode->i_mutex);
3063

3064 3065
	if (error <= 0) {
		if (error)
M
Miklos Szeredi 已提交
3066 3067
			goto out;

3068
		if ((*opened & FILE_CREATED) ||
A
Al Viro 已提交
3069
		    !S_ISREG(file_inode(file)->i_mode))
M
Miklos Szeredi 已提交
3070
			will_truncate = false;
M
Miklos Szeredi 已提交
3071

A
Al Viro 已提交
3072
		audit_inode(nd->name, file->f_path.dentry, 0);
M
Miklos Szeredi 已提交
3073 3074
		goto opened;
	}
3075

3076
	if (*opened & FILE_CREATED) {
3077
		/* Don't check for write permission, don't truncate */
3078
		open_flag &= ~O_TRUNC;
M
Miklos Szeredi 已提交
3079
		will_truncate = false;
A
Al Viro 已提交
3080
		acc_mode = MAY_OPEN;
3081
		path_to_nameidata(&path, nd);
M
Miklos Szeredi 已提交
3082
		goto finish_open_created;
3083 3084 3085
	}

	/*
3086
	 * create/update audit record if it already exists.
3087
	 */
3088
	if (d_is_positive(path.dentry))
A
Al Viro 已提交
3089
		audit_inode(nd->name, path.dentry, 0);
3090

M
Miklos Szeredi 已提交
3091 3092 3093 3094 3095
	/*
	 * If atomic_open() acquired write access it is dropped now due to
	 * possible mount and symlink following (this might be optimized away if
	 * necessary...)
	 */
3096
	if (got_write) {
M
Miklos Szeredi 已提交
3097
		mnt_drop_write(nd->path.mnt);
3098
		got_write = false;
M
Miklos Szeredi 已提交
3099 3100
	}

3101 3102 3103 3104
	if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
		path_to_nameidata(&path, nd);
		return -EEXIST;
	}
3105

3106
	error = follow_managed(&path, nd);
3107 3108
	if (unlikely(error < 0))
		return error;
3109

3110
	BUG_ON(nd->flags & LOOKUP_RCU);
3111
	inode = d_backing_inode(path.dentry);
3112
	seq = 0;	/* out of RCU mode, so the value doesn't matter */
3113
	if (unlikely(d_is_negative(path.dentry))) {
3114
		path_to_nameidata(&path, nd);
3115
		return -ENOENT;
3116
	}
3117
finish_lookup:
3118 3119
	if (nd->depth)
		put_link(nd);
3120 3121
	error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW,
				   inode, seq);
3122
	if (unlikely(error))
3123
		return error;
3124

3125 3126
	if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) {
		path_to_nameidata(&path, nd);
3127
		return -ELOOP;
3128 3129
	}

3130 3131
	if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
		path_to_nameidata(&path, nd);
3132 3133
	} else {
		save_parent.dentry = nd->path.dentry;
3134 3135
		save_parent.mnt = mntget(path.mnt);
		nd->path.dentry = path.dentry;
3136 3137

	}
3138
	nd->inode = inode;
3139
	nd->seq = seq;
3140
	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
3141
finish_open:
3142
	error = complete_walk(nd);
3143 3144
	if (error) {
		path_put(&save_parent);
3145
		return error;
3146
	}
A
Al Viro 已提交
3147
	audit_inode(nd->name, nd->path.dentry, 0);
3148
	error = -EISDIR;
M
Miklos Szeredi 已提交
3149
	if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
3150
		goto out;
3151
	error = -ENOTDIR;
M
Miklos Szeredi 已提交
3152
	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3153
		goto out;
3154
	if (!d_is_reg(nd->path.dentry))
M
Miklos Szeredi 已提交
3155
		will_truncate = false;
3156

3157 3158 3159
	if (will_truncate) {
		error = mnt_want_write(nd->path.mnt);
		if (error)
3160
			goto out;
3161
		got_write = true;
3162
	}
M
Miklos Szeredi 已提交
3163
finish_open_created:
A
Al Viro 已提交
3164
	error = may_open(&nd->path, acc_mode, open_flag);
3165
	if (error)
3166
		goto out;
M
Miklos Szeredi 已提交
3167 3168 3169 3170 3171 3172

	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
	error = vfs_open(&nd->path, file, current_cred());
	if (!error) {
		*opened |= FILE_OPENED;
	} else {
A
Al Viro 已提交
3173
		if (error == -EOPENSTALE)
M
Miklos Szeredi 已提交
3174
			goto stale_open;
3175
		goto out;
M
Miklos Szeredi 已提交
3176
	}
3177
opened:
3178
	error = open_check_o_direct(file);
3179 3180
	if (error)
		goto exit_fput;
3181
	error = ima_file_check(file, op->acc_mode, *opened);
3182 3183 3184 3185
	if (error)
		goto exit_fput;

	if (will_truncate) {
3186
		error = handle_truncate(file);
3187 3188
		if (error)
			goto exit_fput;
3189
	}
3190
out:
3191
	if (got_write)
3192
		mnt_drop_write(nd->path.mnt);
3193
	path_put(&save_parent);
3194
	return error;
3195

3196
exit_fput:
3197 3198
	fput(file);
	goto out;
3199

M
Miklos Szeredi 已提交
3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210
stale_open:
	/* If no saved parent or already retried then can't retry */
	if (!save_parent.dentry || retried)
		goto out;

	BUG_ON(save_parent.dentry != dir);
	path_put(&nd->path);
	nd->path = save_parent;
	nd->inode = dir->d_inode;
	save_parent.mnt = NULL;
	save_parent.dentry = NULL;
3211
	if (got_write) {
M
Miklos Szeredi 已提交
3212
		mnt_drop_write(nd->path.mnt);
3213
		got_write = false;
M
Miklos Szeredi 已提交
3214 3215 3216
	}
	retried = true;
	goto retry_lookup;
3217 3218
}

3219
static int do_tmpfile(struct nameidata *nd, unsigned flags,
3220 3221 3222 3223
		const struct open_flags *op,
		struct file *file, int *opened)
{
	static const struct qstr name = QSTR_INIT("/", 1);
3224
	struct dentry *child;
3225
	struct inode *dir;
3226
	struct path path;
3227
	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
3228 3229
	if (unlikely(error))
		return error;
3230
	error = mnt_want_write(path.mnt);
3231 3232
	if (unlikely(error))
		goto out;
3233
	dir = path.dentry->d_inode;
3234
	/* we want directory to be writable */
3235
	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
3236 3237 3238 3239 3240 3241
	if (error)
		goto out2;
	if (!dir->i_op->tmpfile) {
		error = -EOPNOTSUPP;
		goto out2;
	}
3242
	child = d_alloc(path.dentry, &name);
3243 3244 3245 3246
	if (unlikely(!child)) {
		error = -ENOMEM;
		goto out2;
	}
3247 3248 3249
	dput(path.dentry);
	path.dentry = child;
	error = dir->i_op->tmpfile(dir, child, op->mode);
3250 3251
	if (error)
		goto out2;
3252
	audit_inode(nd->name, child, 0);
3253
	/* Don't check for other permissions, the inode was just created */
3254
	error = may_open(&path, MAY_OPEN, op->open_flag);
3255 3256
	if (error)
		goto out2;
3257 3258
	file->f_path.mnt = path.mnt;
	error = finish_open(file, child, NULL, opened);
3259 3260 3261
	if (error)
		goto out2;
	error = open_check_o_direct(file);
3262
	if (error) {
3263
		fput(file);
3264 3265 3266 3267 3268 3269
	} else if (!(op->open_flag & O_EXCL)) {
		struct inode *inode = file_inode(file);
		spin_lock(&inode->i_lock);
		inode->i_state |= I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
3270
out2:
3271
	mnt_drop_write(path.mnt);
3272
out:
3273
	path_put(&path);
3274 3275 3276
	return error;
}

3277 3278
static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
L
Linus Torvalds 已提交
3279
{
3280
	const char *s;
A
Al Viro 已提交
3281
	struct file *file;
3282
	int opened = 0;
3283
	int error;
N
Nick Piggin 已提交
3284

A
Al Viro 已提交
3285
	file = get_empty_filp();
3286 3287
	if (IS_ERR(file))
		return file;
N
Nick Piggin 已提交
3288

A
Al Viro 已提交
3289
	file->f_flags = op->open_flag;
N
Nick Piggin 已提交
3290

A
Al Viro 已提交
3291
	if (unlikely(file->f_flags & __O_TMPFILE)) {
3292
		error = do_tmpfile(nd, flags, op, file, &opened);
A
Al Viro 已提交
3293
		goto out2;
3294 3295
	}

3296
	s = path_init(nd, flags);
3297 3298 3299 3300
	if (IS_ERR(s)) {
		put_filp(file);
		return ERR_CAST(s);
	}
3301
	while (!(error = link_path_walk(s, nd)) &&
A
Al Viro 已提交
3302
		(error = do_last(nd, file, op, &opened)) > 0) {
A
Al Viro 已提交
3303
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
3304 3305 3306
		s = trailing_symlink(nd);
		if (IS_ERR(s)) {
			error = PTR_ERR(s);
3307
			break;
3308
		}
3309
	}
3310
	terminate_walk(nd);
A
Al Viro 已提交
3311
out2:
3312 3313
	if (!(opened & FILE_OPENED)) {
		BUG_ON(!error);
A
Al Viro 已提交
3314
		put_filp(file);
3315
	}
3316 3317 3318 3319 3320 3321 3322 3323 3324 3325
	if (unlikely(error)) {
		if (error == -EOPENSTALE) {
			if (flags & LOOKUP_RCU)
				error = -ECHILD;
			else
				error = -ESTALE;
		}
		file = ERR_PTR(error);
	}
	return file;
L
Linus Torvalds 已提交
3326 3327
}

3328
struct file *do_filp_open(int dfd, struct filename *pathname,
3329
		const struct open_flags *op)
3330
{
3331
	struct nameidata nd;
3332
	int flags = op->lookup_flags;
3333 3334
	struct file *filp;

3335
	set_nameidata(&nd, dfd, pathname);
3336
	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
3337
	if (unlikely(filp == ERR_PTR(-ECHILD)))
3338
		filp = path_openat(&nd, op, flags);
3339
	if (unlikely(filp == ERR_PTR(-ESTALE)))
3340
		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
3341
	restore_nameidata();
3342 3343 3344
	return filp;
}

A
Al Viro 已提交
3345
struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3346
		const char *name, const struct open_flags *op)
A
Al Viro 已提交
3347
{
3348
	struct nameidata nd;
A
Al Viro 已提交
3349
	struct file *file;
3350
	struct filename *filename;
3351
	int flags = op->lookup_flags | LOOKUP_ROOT;
A
Al Viro 已提交
3352 3353 3354 3355

	nd.root.mnt = mnt;
	nd.root.dentry = dentry;

3356
	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
A
Al Viro 已提交
3357 3358
		return ERR_PTR(-ELOOP);

3359 3360 3361 3362
	filename = getname_kernel(name);
	if (unlikely(IS_ERR(filename)))
		return ERR_CAST(filename);

3363
	set_nameidata(&nd, -1, filename);
3364
	file = path_openat(&nd, op, flags | LOOKUP_RCU);
A
Al Viro 已提交
3365
	if (unlikely(file == ERR_PTR(-ECHILD)))
3366
		file = path_openat(&nd, op, flags);
A
Al Viro 已提交
3367
	if (unlikely(file == ERR_PTR(-ESTALE)))
3368
		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
3369
	restore_nameidata();
3370
	putname(filename);
A
Al Viro 已提交
3371 3372 3373
	return file;
}

3374
static struct dentry *filename_create(int dfd, struct filename *name,
3375
				struct path *path, unsigned int lookup_flags)
L
Linus Torvalds 已提交
3376
{
3377
	struct dentry *dentry = ERR_PTR(-EEXIST);
3378 3379
	struct qstr last;
	int type;
3380
	int err2;
3381 3382 3383 3384 3385 3386 3387 3388 3389
	int error;
	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);

	/*
	 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
	 * other flags passed in are ignored!
	 */
	lookup_flags &= LOOKUP_REVAL;

3390 3391 3392
	name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
	if (IS_ERR(name))
		return ERR_CAST(name);
L
Linus Torvalds 已提交
3393

3394 3395 3396 3397
	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
3398
	if (unlikely(type != LAST_NORM))
A
Al Viro 已提交
3399
		goto out;
3400

3401
	/* don't fail immediately if it's r/o, at least try to report other errors */
3402
	err2 = mnt_want_write(path->mnt);
3403 3404 3405
	/*
	 * Do the final lookup.
	 */
3406 3407 3408
	lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
	mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = __lookup_hash(&last, path->dentry, lookup_flags);
L
Linus Torvalds 已提交
3409
	if (IS_ERR(dentry))
3410
		goto unlock;
3411

3412
	error = -EEXIST;
3413
	if (d_is_positive(dentry))
3414
		goto fail;
3415

3416 3417 3418 3419 3420 3421
	/*
	 * Special case - lookup gave negative, but... we had foo/bar/
	 * From the vfs_mknod() POV we just have a negative dentry -
	 * all is fine. Let's be bastards - you had / on the end, you've
	 * been asking for (non-existent) directory. -ENOENT for you.
	 */
3422
	if (unlikely(!is_dir && last.name[last.len])) {
3423
		error = -ENOENT;
A
Al Viro 已提交
3424
		goto fail;
3425
	}
3426 3427
	if (unlikely(err2)) {
		error = err2;
3428
		goto fail;
3429
	}
3430
	putname(name);
L
Linus Torvalds 已提交
3431 3432
	return dentry;
fail:
3433 3434 3435
	dput(dentry);
	dentry = ERR_PTR(error);
unlock:
3436
	mutex_unlock(&path->dentry->d_inode->i_mutex);
3437
	if (!err2)
3438
		mnt_drop_write(path->mnt);
A
Al Viro 已提交
3439
out:
3440
	path_put(path);
3441
	putname(name);
L
Linus Torvalds 已提交
3442 3443
	return dentry;
}
3444 3445 3446 3447

/*
 * filename_create() for a pathname that lives in kernel memory: the string
 * is wrapped with getname_kernel() and handed straight on.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	return filename_create(dfd, getname_kernel(pathname),
				path, lookup_flags);
}
3451 3452
EXPORT_SYMBOL(kern_path_create);

A
Al Viro 已提交
3453 3454 3455 3456
/*
 * Release everything a successful *_path_create() call left held: the
 * reference on @dentry, the parent's i_mutex, write access to the mount and
 * the reference on @path.
 */
void done_path_create(struct path *path, struct dentry *dentry)
{
	dput(dentry);
	mutex_unlock(&path->dentry->d_inode->i_mutex);
	mnt_drop_write(path->mnt);
	path_put(path);
}
EXPORT_SYMBOL(done_path_create);

A
Al Viro 已提交
3462
/*
 * filename_create() for a userspace pathname: the string is copied in with
 * getname() and handed straight on.
 */
inline struct dentry *user_path_create(int dfd, const char __user *pathname,
				struct path *path, unsigned int lookup_flags)
{
	return filename_create(dfd, getname(pathname), path, lookup_flags);
}
EXPORT_SYMBOL(user_path_create);

A
Al Viro 已提交
3469
/*
 * Create a filesystem node @dentry of type/permissions @mode (and device
 * number @dev) in directory @dir.
 *
 * Checks, in order: may_create(), CAP_MKNOD for character/block devices,
 * presence of the ->mknod() method, the device cgroup and the security
 * hook.  A creation notification is sent on success.
 *
 * Returns 0 or a negative errno; -EPERM covers both a missing capability
 * and a directory without ->mknod().
 */
int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
	int error = may_create(dir, dentry);

	if (error)
		return error;

	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
		return -EPERM;

	if (!dir->i_op->mknod)
		return -EPERM;

	error = devcgroup_inode_mknod(mode, dev);
	if (error)
		return error;

	error = security_inode_mknod(dir, dentry, mode, dev);
	if (error)
		return error;

	error = dir->i_op->mknod(dir, dentry, mode, dev);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
3495
EXPORT_SYMBOL(vfs_mknod);
L
Linus Torvalds 已提交
3496

A
Al Viro 已提交
3497
/*
 * Validate the file-type bits of @mode for mknod(): regular files, device
 * nodes, FIFOs and sockets are accepted (0), directories are refused with
 * -EPERM and every other type with -EINVAL.
 */
static int may_mknod(umode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
	case 0: /* zero mode translates to S_IFREG */
		return 0;
	case S_IFDIR:
		return -EPERM;
	default:
		return -EINVAL;
	}
}

A
Al Viro 已提交
3514
/*
 * mknodat(2): create a regular file, device node, FIFO or socket named
 * @filename relative to @dfd.
 *
 * The type bits are validated by may_mknod() up front, so the switch below
 * only has to dispatch the accepted types.  The umask is applied unless the
 * parent directory uses POSIX ACLs.  The whole operation is repeated once
 * with LOOKUP_REVAL if retry_estale() asks for it.
 */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned, dev)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = 0;

	error = may_mknod(mode);
	if (error)
		return error;
retry:
	dentry = user_path_create(dfd, filename, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = security_path_mknod(&path, dentry, mode, dev);
	if (error)
		goto out;
	switch (mode & S_IFMT) {
	case 0:
	case S_IFREG:
		error = vfs_create(path.dentry->d_inode, dentry, mode, true);
		break;
	case S_IFCHR:
	case S_IFBLK:
		error = vfs_mknod(path.dentry->d_inode, dentry, mode,
				new_decode_dev(dev));
		break;
	case S_IFIFO:
	case S_IFSOCK:
		error = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
		break;
	}
out:
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

A
Al Viro 已提交
3556
/* mknod(2): mknodat() relative to the current working directory. */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
	return sys_mknodat(AT_FDCWD, filename, mode, dev);
}

3561
/*
 * Create directory @dentry in @dir.
 *
 * @mode is reduced to the permission bits plus the sticky bit before it is
 * passed on.  Fails with -EPERM if @dir has no ->mkdir() method and with
 * -EMLINK if the superblock declares a link limit that @dir has already
 * reached.  A mkdir notification is sent on success.
 */
int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int error = may_create(dir, dentry);
	unsigned max_links = dir->i_sb->s_max_links;

	if (error)
		return error;

	if (!dir->i_op->mkdir)
		return -EPERM;

	mode &= (S_IRWXUGO|S_ISVTX);
	error = security_inode_mkdir(dir, dentry, mode);
	if (error)
		return error;

	/* s_max_links == 0 means "no limit" here */
	if (max_links && dir->i_nlink >= max_links)
		return -EMLINK;

	error = dir->i_op->mkdir(dir, dentry, mode);
	if (!error)
		fsnotify_mkdir(dir, dentry);
	return error;
}
3585
EXPORT_SYMBOL(vfs_mkdir);
L
Linus Torvalds 已提交
3586

3587
/*
 * mkdirat(2): create directory @pathname relative to @dfd.
 *
 * LOOKUP_DIRECTORY is passed to user_path_create() so that a trailing slash
 * on the new name is accepted.  The umask is applied unless the parent
 * directory uses POSIX ACLs.  Repeated once with LOOKUP_REVAL if
 * retry_estale() asks for it.
 */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = LOOKUP_DIRECTORY;

retry:
	dentry = user_path_create(dfd, pathname, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = security_path_mkdir(&path, dentry, mode);
	if (!error)
		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

3612
/* mkdir(2): mkdirat() relative to the current working directory. */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
	return sys_mkdirat(AT_FDCWD, pathname, mode);
}

L
Linus Torvalds 已提交
3617
/*
 * The dentry_unhash() helper will try to drop the dentry early: we
 * should have a usage count of 1 if we're the only user of this
 * dentry, and if that is true (possibly after pruning the dcache),
 * then we drop the dentry now.
 *
 * A low-level filesystem can, if it chooses, legally
 * do a
 *
 *	if (!d_unhashed(dentry))
 *		return -EBUSY;
 *
 * if it cannot handle the case of removing a directory
 * that is still in use by something else..
 */
void dentry_unhash(struct dentry *dentry)
{
	shrink_dcache_parent(dentry);
	/* the count check and the unhash happen under d_lock */
	spin_lock(&dentry->d_lock);
	if (dentry->d_lockref.count == 1)
		__d_drop(dentry);
	spin_unlock(&dentry->d_lock);
}
3640
EXPORT_SYMBOL(dentry_unhash);
L
Linus Torvalds 已提交
3641 3642 3643 3644 3645 3646 3647 3648

int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	int error = may_delete(dir, dentry, 1);

	if (error)
		return error;

A
Al Viro 已提交
3649
	if (!dir->i_op->rmdir)
L
Linus Torvalds 已提交
3650 3651
		return -EPERM;

3652
	dget(dentry);
3653
	mutex_lock(&dentry->d_inode->i_mutex);
S
Sage Weil 已提交
3654 3655

	error = -EBUSY;
3656
	if (is_local_mountpoint(dentry))
S
Sage Weil 已提交
3657 3658 3659 3660 3661 3662
		goto out;

	error = security_inode_rmdir(dir, dentry);
	if (error)
		goto out;

3663
	shrink_dcache_parent(dentry);
S
Sage Weil 已提交
3664 3665 3666 3667 3668 3669
	error = dir->i_op->rmdir(dir, dentry);
	if (error)
		goto out;

	dentry->d_inode->i_flags |= S_DEAD;
	dont_mount(dentry);
3670
	detach_mounts(dentry);
S
Sage Weil 已提交
3671 3672

out:
3673
	mutex_unlock(&dentry->d_inode->i_mutex);
3674
	dput(dentry);
S
Sage Weil 已提交
3675
	if (!error)
L
Linus Torvalds 已提交
3676 3677 3678
		d_delete(dentry);
	return error;
}
3679
EXPORT_SYMBOL(vfs_rmdir);
L
Linus Torvalds 已提交
3680

3681
/*
 * Worker for rmdir(2) and unlinkat(2) with AT_REMOVEDIR.
 *
 * Resolves the parent of @pathname, rejects "..", "." and "/" as the last
 * component (-ENOTEMPTY, -EINVAL and -EBUSY respectively), then looks the
 * victim up under the parent's i_mutex and hands it to vfs_rmdir().
 * Repeated once with LOOKUP_REVAL if retry_estale() asks for it.
 *
 * The exitN labels unwind in reverse order of acquisition.
 */
static long do_rmdir(int dfd, const char __user *pathname)
{
	int error = 0;
	struct filename *name;
	struct dentry *dentry;
	struct path path;
	struct qstr last;
	int type;
	unsigned int lookup_flags = 0;
retry:
	name = user_path_parent(dfd, pathname,
				&path, &last, &type, lookup_flags);
	if (IS_ERR(name))
		return PTR_ERR(name);

	switch (type) {
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto exit1;
	case LAST_DOT:
		error = -EINVAL;
		goto exit1;
	case LAST_ROOT:
		error = -EBUSY;
		goto exit1;
	}

	error = mnt_want_write(path.mnt);
	if (error)
		goto exit1;

	mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto exit2;
	if (!dentry->d_inode) {
		error = -ENOENT;
		goto exit3;
	}
	error = security_path_rmdir(&path, dentry);
	if (error)
		goto exit3;
	error = vfs_rmdir(path.dentry->d_inode, dentry);
exit3:
	dput(dentry);
exit2:
	mutex_unlock(&path.dentry->d_inode->i_mutex);
	mnt_drop_write(path.mnt);
exit1:
	path_put(&path);
	putname(name);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

3740
/* rmdir(2): do_rmdir() relative to the current working directory. */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
	return do_rmdir(AT_FDCWD, pathname);
}

3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763
/**
 * vfs_unlink - unlink a filesystem object
 * @dir:	parent directory
 * @dentry:	victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 */
int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
3764
{
J
J. Bruce Fields 已提交
3765
	struct inode *target = dentry->d_inode;
L
Linus Torvalds 已提交
3766 3767 3768 3769 3770
	int error = may_delete(dir, dentry, 0);

	if (error)
		return error;

A
Al Viro 已提交
3771
	if (!dir->i_op->unlink)
L
Linus Torvalds 已提交
3772 3773
		return -EPERM;

J
J. Bruce Fields 已提交
3774
	mutex_lock(&target->i_mutex);
3775
	if (is_local_mountpoint(dentry))
L
Linus Torvalds 已提交
3776 3777 3778
		error = -EBUSY;
	else {
		error = security_inode_unlink(dir, dentry);
3779
		if (!error) {
3780 3781
			error = try_break_deleg(target, delegated_inode);
			if (error)
3782
				goto out;
L
Linus Torvalds 已提交
3783
			error = dir->i_op->unlink(dir, dentry);
3784
			if (!error) {
3785
				dont_mount(dentry);
3786 3787
				detach_mounts(dentry);
			}
3788
		}
L
Linus Torvalds 已提交
3789
	}
3790
out:
J
J. Bruce Fields 已提交
3791
	mutex_unlock(&target->i_mutex);
L
Linus Torvalds 已提交
3792 3793 3794

	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
J
J. Bruce Fields 已提交
3795
		fsnotify_link_count(target);
J
John McCutchan 已提交
3796
		d_delete(dentry);
L
Linus Torvalds 已提交
3797
	}
R
Robert Love 已提交
3798

L
Linus Torvalds 已提交
3799 3800
	return error;
}
3801
EXPORT_SYMBOL(vfs_unlink);
L
Linus Torvalds 已提交
3802 3803 3804

/*
 * Make sure that the actual truncation of the file will occur outside its
3805
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
L
Linus Torvalds 已提交
3806 3807 3808
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 */
3809
static long do_unlinkat(int dfd, const char __user *pathname)
L
Linus Torvalds 已提交
3810
{
3811
	int error;
3812
	struct filename *name;
L
Linus Torvalds 已提交
3813
	struct dentry *dentry;
3814 3815 3816
	struct path path;
	struct qstr last;
	int type;
L
Linus Torvalds 已提交
3817
	struct inode *inode = NULL;
3818
	struct inode *delegated_inode = NULL;
3819 3820
	unsigned int lookup_flags = 0;
retry:
3821 3822
	name = user_path_parent(dfd, pathname,
				&path, &last, &type, lookup_flags);
3823 3824
	if (IS_ERR(name))
		return PTR_ERR(name);
3825

L
Linus Torvalds 已提交
3826
	error = -EISDIR;
3827
	if (type != LAST_NORM)
L
Linus Torvalds 已提交
3828
		goto exit1;
3829

3830
	error = mnt_want_write(path.mnt);
3831 3832
	if (error)
		goto exit1;
3833
retry_deleg:
3834 3835
	mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
L
Linus Torvalds 已提交
3836 3837 3838
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
		/* Why not before? Because we want correct error value */
3839
		if (last.name[last.len])
3840
			goto slashes;
L
Linus Torvalds 已提交
3841
		inode = dentry->d_inode;
3842
		if (d_is_negative(dentry))
3843 3844
			goto slashes;
		ihold(inode);
3845
		error = security_path_unlink(&path, dentry);
3846
		if (error)
3847
			goto exit2;
3848
		error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
3849
exit2:
L
Linus Torvalds 已提交
3850 3851
		dput(dentry);
	}
3852
	mutex_unlock(&path.dentry->d_inode->i_mutex);
L
Linus Torvalds 已提交
3853 3854
	if (inode)
		iput(inode);	/* truncate the inode here */
3855 3856
	inode = NULL;
	if (delegated_inode) {
3857
		error = break_deleg_wait(&delegated_inode);
3858 3859 3860
		if (!error)
			goto retry_deleg;
	}
3861
	mnt_drop_write(path.mnt);
L
Linus Torvalds 已提交
3862
exit1:
3863
	path_put(&path);
L
Linus Torvalds 已提交
3864
	putname(name);
3865 3866 3867 3868 3869
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		inode = NULL;
		goto retry;
	}
L
Linus Torvalds 已提交
3870 3871 3872
	return error;

slashes:
3873 3874
	if (d_is_negative(dentry))
		error = -ENOENT;
M
Miklos Szeredi 已提交
3875
	else if (d_is_dir(dentry))
3876 3877 3878
		error = -EISDIR;
	else
		error = -ENOTDIR;
L
Linus Torvalds 已提交
3879 3880 3881
	goto exit2;
}

3882
/*
 * unlinkat(2): AT_REMOVEDIR is the only flag accepted (anything else is
 * -EINVAL); it selects do_rmdir() instead of do_unlinkat().
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
	if ((flag & ~AT_REMOVEDIR) != 0)
		return -EINVAL;

	if (flag & AT_REMOVEDIR)
		return do_rmdir(dfd, pathname);

	return do_unlinkat(dfd, pathname);
}

3893
/* unlink(2): do_unlinkat() relative to the current working directory. */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
	return do_unlinkat(AT_FDCWD, pathname);
}

3898
int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
L
Linus Torvalds 已提交
3899
{
3900
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
3901 3902 3903 3904

	if (error)
		return error;

A
Al Viro 已提交
3905
	if (!dir->i_op->symlink)
L
Linus Torvalds 已提交
3906 3907 3908 3909 3910 3911 3912
		return -EPERM;

	error = security_inode_symlink(dir, dentry, oldname);
	if (error)
		return error;

	error = dir->i_op->symlink(dir, dentry, oldname);
3913
	if (!error)
3914
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
3915 3916
	return error;
}
3917
EXPORT_SYMBOL(vfs_symlink);
L
Linus Torvalds 已提交
3918

3919 3920
/*
 * symlinkat(2): create a symlink named @newname (relative to @newdfd)
 * whose body is @oldname.
 *
 * @oldname is copied in once with getname() and released on every exit
 * path through out_putname.  The create lookup is repeated with
 * LOOKUP_REVAL added when retry_estale() asks for it.
 */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	int error;
	struct filename *from;
	struct dentry *dentry;
	struct path path;
	unsigned int lookup_flags = 0;

	from = getname(oldname);
	if (IS_ERR(from))
		return PTR_ERR(from);
retry:
	dentry = user_path_create(newdfd, newname, &path, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_putname;

	error = security_path_symlink(&path, dentry, from->name);
	if (!error)
		error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
	/* Pairs with user_path_create(); must run before any retry. */
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out_putname:
	putname(from);
	return error;
}

3950
/* symlink(2): symlinkat() with @newname resolved relative to AT_FDCWD. */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
	return sys_symlinkat(oldname, AT_FDCWD, newname);
}

J
J. Bruce Fields 已提交
3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974
/**
 * vfs_link - create a new link
 * @old_dentry:	object to be linked
 * @dir:	new parent
 * @new_dentry:	where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 */
int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
3975 3976
{
	struct inode *inode = old_dentry->d_inode;
3977
	unsigned max_links = dir->i_sb->s_max_links;
L
Linus Torvalds 已提交
3978 3979 3980 3981 3982
	int error;

	if (!inode)
		return -ENOENT;

3983
	error = may_create(dir, new_dentry);
L
Linus Torvalds 已提交
3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994
	if (error)
		return error;

	if (dir->i_sb != inode->i_sb)
		return -EXDEV;

	/*
	 * A link to an append-only or immutable file cannot be created.
	 */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return -EPERM;
A
Al Viro 已提交
3995
	if (!dir->i_op->link)
L
Linus Torvalds 已提交
3996
		return -EPERM;
3997
	if (S_ISDIR(inode->i_mode))
L
Linus Torvalds 已提交
3998 3999 4000 4001 4002 4003
		return -EPERM;

	error = security_inode_link(old_dentry, dir, new_dentry);
	if (error)
		return error;

4004
	mutex_lock(&inode->i_mutex);
4005
	/* Make sure we don't allow creating hardlink to an unlinked file */
4006
	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
4007
		error =  -ENOENT;
4008 4009
	else if (max_links && inode->i_nlink >= max_links)
		error = -EMLINK;
J
J. Bruce Fields 已提交
4010 4011 4012 4013 4014
	else {
		error = try_break_deleg(inode, delegated_inode);
		if (!error)
			error = dir->i_op->link(old_dentry, dir, new_dentry);
	}
4015 4016 4017 4018 4019 4020

	if (!error && (inode->i_state & I_LINKABLE)) {
		spin_lock(&inode->i_lock);
		inode->i_state &= ~I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
4021
	mutex_unlock(&inode->i_mutex);
4022
	if (!error)
4023
		fsnotify_link(dir, inode, new_dentry);
L
Linus Torvalds 已提交
4024 4025
	return error;
}
4026
EXPORT_SYMBOL(vfs_link);
L
Linus Torvalds 已提交
4027 4028 4029 4030 4031 4032 4033 4034 4035 4036

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 */
4037 4038
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, int, flags)
L
Linus Torvalds 已提交
4039 4040
{
	struct dentry *new_dentry;
4041
	struct path old_path, new_path;
J
J. Bruce Fields 已提交
4042
	struct inode *delegated_inode = NULL;
4043
	int how = 0;
L
Linus Torvalds 已提交
4044 4045
	int error;

4046
	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
4047
		return -EINVAL;
4048
	/*
4049 4050 4051
	 * To use null names we require CAP_DAC_READ_SEARCH
	 * This ensures that not everyone will be able to create
	 * handlink using the passed filedescriptor.
4052
	 */
4053 4054 4055
	if (flags & AT_EMPTY_PATH) {
		if (!capable(CAP_DAC_READ_SEARCH))
			return -ENOENT;
4056
		how = LOOKUP_EMPTY;
4057
	}
4058 4059 4060

	if (flags & AT_SYMLINK_FOLLOW)
		how |= LOOKUP_FOLLOW;
4061
retry:
4062
	error = user_path_at(olddfd, oldname, how, &old_path);
L
Linus Torvalds 已提交
4063
	if (error)
4064 4065
		return error;

4066 4067
	new_dentry = user_path_create(newdfd, newname, &new_path,
					(how & LOOKUP_REVAL));
L
Linus Torvalds 已提交
4068
	error = PTR_ERR(new_dentry);
4069
	if (IS_ERR(new_dentry))
4070 4071 4072 4073 4074
		goto out;

	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto out_dput;
K
Kees Cook 已提交
4075 4076 4077
	error = may_linkat(&old_path);
	if (unlikely(error))
		goto out_dput;
4078
	error = security_path_link(old_path.dentry, &new_path, new_dentry);
4079
	if (error)
4080
		goto out_dput;
J
J. Bruce Fields 已提交
4081
	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
4082
out_dput:
A
Al Viro 已提交
4083
	done_path_create(&new_path, new_dentry);
J
J. Bruce Fields 已提交
4084 4085
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
4086 4087
		if (!error) {
			path_put(&old_path);
J
J. Bruce Fields 已提交
4088
			goto retry;
4089
		}
J
J. Bruce Fields 已提交
4090
	}
4091
	if (retry_estale(error, how)) {
4092
		path_put(&old_path);
4093 4094 4095
		how |= LOOKUP_REVAL;
		goto retry;
	}
L
Linus Torvalds 已提交
4096
out:
4097
	path_put(&old_path);
L
Linus Torvalds 已提交
4098 4099 4100 4101

	return error;
}

4102
/* link(2): linkat() with both names relative to AT_FDCWD and no flags. */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}

4107 4108 4109 4110 4111 4112 4113
/**
 * vfs_rename - rename a filesystem object
 * @old_dir:	parent of source
 * @old_dentry:	source
 * @new_dir:	parent of destination
 * @new_dentry:	destination
 * @delegated_inode: returns an inode needing a delegation break
M
Miklos Szeredi 已提交
4114
 * @flags:	rename flags
4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128
 *
 * The caller must hold multiple mutexes--see lock_rename()).
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
L
Linus Torvalds 已提交
4129 4130 4131
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
4132
 *	a) we can get into loop creation.
L
Linus Torvalds 已提交
4133 4134
 *	b) race potential - two innocent renames can create a loop together.
 *	   That's where 4.4 screws up. Current fix: serialization on
4135
 *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
L
Linus Torvalds 已提交
4136
 *	   story.
4137 4138
 *	c) we have to lock _four_ objects - parents and victim (if it exists),
 *	   and source (if it is not a directory).
4139
 *	   And that - after we got ->i_mutex on parents (until then we don't know
L
Linus Torvalds 已提交
4140 4141
 *	   whether the target exists).  Solution: try to be smart with locking
 *	   order for inodes.  We rely on the fact that tree topology may change
4142
 *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
L
Linus Torvalds 已提交
4143 4144 4145
 *	   move will be locked.  Thus we can rank directories by the tree
 *	   (ancestors first) and rank all non-directories after them.
 *	   That works since everybody except rename does "lock parent, lookup,
4146
 *	   lock child" and rename is under ->s_vfs_rename_mutex.
L
Linus Torvalds 已提交
4147 4148 4149
 *	   HOWEVER, it relies on the assumption that any object with ->lookup()
 *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *	   we'd better make sure that there's no link(2) for them.
4150
 *	d) conversion from fhandle to dentry may come in the wrong moment - when
4151
 *	   we are removing the target. Solution: we will have to grab ->i_mutex
L
Linus Torvalds 已提交
4152
 *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4153
 *	   ->i_mutex on parents, which works but leads to some truly excessive
L
Linus Torvalds 已提交
4154 4155
 *	   locking].
 */
4156 4157
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
	       struct inode *new_dir, struct dentry *new_dentry,
M
Miklos Szeredi 已提交
4158
	       struct inode **delegated_inode, unsigned int flags)
L
Linus Torvalds 已提交
4159
{
4160 4161 4162 4163
	int error;
	bool is_dir = d_is_dir(old_dentry);
	const unsigned char *old_name;
	struct inode *source = old_dentry->d_inode;
S
Sage Weil 已提交
4164
	struct inode *target = new_dentry->d_inode;
M
Miklos Szeredi 已提交
4165 4166
	bool new_is_dir = false;
	unsigned max_links = new_dir->i_sb->s_max_links;
4167 4168 4169 4170 4171 4172 4173 4174

	if (source == target)
		return 0;

	error = may_delete(old_dir, old_dentry, is_dir);
	if (error)
		return error;

M
Miklos Szeredi 已提交
4175
	if (!target) {
4176
		error = may_create(new_dir, new_dentry);
M
Miklos Szeredi 已提交
4177 4178 4179 4180 4181 4182 4183 4184
	} else {
		new_is_dir = d_is_dir(new_dentry);

		if (!(flags & RENAME_EXCHANGE))
			error = may_delete(new_dir, new_dentry, is_dir);
		else
			error = may_delete(new_dir, new_dentry, new_is_dir);
	}
4185 4186 4187
	if (error)
		return error;

M
Miklos Szeredi 已提交
4188
	if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
4189
		return -EPERM;
L
Linus Torvalds 已提交
4190

M
Miklos Szeredi 已提交
4191 4192 4193
	if (flags && !old_dir->i_op->rename2)
		return -EINVAL;

L
Linus Torvalds 已提交
4194 4195 4196 4197
	/*
	 * If we are going to change the parent - check write permissions,
	 * we'll need to flip '..'.
	 */
M
Miklos Szeredi 已提交
4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208
	if (new_dir != old_dir) {
		if (is_dir) {
			error = inode_permission(source, MAY_WRITE);
			if (error)
				return error;
		}
		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
			error = inode_permission(target, MAY_WRITE);
			if (error)
				return error;
		}
L
Linus Torvalds 已提交
4209 4210
	}

4211 4212
	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
				      flags);
L
Linus Torvalds 已提交
4213 4214 4215
	if (error)
		return error;

4216
	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
4217
	dget(new_dentry);
M
Miklos Szeredi 已提交
4218
	if (!is_dir || (flags & RENAME_EXCHANGE))
4219 4220
		lock_two_nondirectories(source, target);
	else if (target)
4221
		mutex_lock(&target->i_mutex);
S
Sage Weil 已提交
4222 4223

	error = -EBUSY;
4224
	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
S
Sage Weil 已提交
4225 4226
		goto out;

M
Miklos Szeredi 已提交
4227
	if (max_links && new_dir != old_dir) {
4228
		error = -EMLINK;
M
Miklos Szeredi 已提交
4229
		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4230
			goto out;
M
Miklos Szeredi 已提交
4231 4232 4233 4234 4235 4236 4237
		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
		    old_dir->i_nlink >= max_links)
			goto out;
	}
	if (is_dir && !(flags & RENAME_EXCHANGE) && target)
		shrink_dcache_parent(new_dentry);
	if (!is_dir) {
4238
		error = try_break_deleg(source, delegated_inode);
4239 4240
		if (error)
			goto out;
M
Miklos Szeredi 已提交
4241 4242 4243 4244 4245
	}
	if (target && !new_is_dir) {
		error = try_break_deleg(target, delegated_inode);
		if (error)
			goto out;
4246
	}
M
Miklos Szeredi 已提交
4247
	if (!old_dir->i_op->rename2) {
M
Miklos Szeredi 已提交
4248 4249 4250
		error = old_dir->i_op->rename(old_dir, old_dentry,
					      new_dir, new_dentry);
	} else {
M
Miklos Szeredi 已提交
4251
		WARN_ON(old_dir->i_op->rename != NULL);
M
Miklos Szeredi 已提交
4252 4253 4254
		error = old_dir->i_op->rename2(old_dir, old_dentry,
					       new_dir, new_dentry, flags);
	}
S
Sage Weil 已提交
4255 4256 4257
	if (error)
		goto out;

M
Miklos Szeredi 已提交
4258
	if (!(flags & RENAME_EXCHANGE) && target) {
4259 4260
		if (is_dir)
			target->i_flags |= S_DEAD;
S
Sage Weil 已提交
4261
		dont_mount(new_dentry);
4262
		detach_mounts(new_dentry);
4263
	}
M
Miklos Szeredi 已提交
4264 4265 4266 4267 4268 4269
	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
		if (!(flags & RENAME_EXCHANGE))
			d_move(old_dentry, new_dentry);
		else
			d_exchange(old_dentry, new_dentry);
	}
S
Sage Weil 已提交
4270
out:
M
Miklos Szeredi 已提交
4271
	if (!is_dir || (flags & RENAME_EXCHANGE))
4272 4273 4274
		unlock_two_nondirectories(source, target);
	else if (target)
		mutex_unlock(&target->i_mutex);
L
Linus Torvalds 已提交
4275
	dput(new_dentry);
M
Miklos Szeredi 已提交
4276
	if (!error) {
4277
		fsnotify_move(old_dir, new_dir, old_name, is_dir,
M
Miklos Szeredi 已提交
4278 4279 4280 4281 4282 4283
			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
		if (flags & RENAME_EXCHANGE) {
			fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
				      new_is_dir, NULL, new_dentry);
		}
	}
R
Robert Love 已提交
4284 4285
	fsnotify_oldname_free(old_name);

L
Linus Torvalds 已提交
4286 4287
	return error;
}
4288
EXPORT_SYMBOL(vfs_rename);
L
Linus Torvalds 已提交
4289

M
Miklos Szeredi 已提交
4290 4291
/*
 * renameat2(2): rename @oldname (relative to @olddfd) to @newname
 * (relative to @newdfd).
 *
 * Flag rules enforced here:
 *  - only RENAME_NOREPLACE, RENAME_EXCHANGE and RENAME_WHITEOUT are
 *    accepted (-EINVAL otherwise);
 *  - RENAME_EXCHANGE cannot be combined with either of the other two
 *    (-EINVAL);
 *  - RENAME_WHITEOUT requires CAP_MKNOD (-EPERM).
 *
 * Cleanup is a ladder of labels, each undoing one acquisition:
 *  exit5: new_dentry, exit4: old_dentry, exit3: rename locks (then the
 *  delegation-break retry and mnt_drop_write()), exit2: new_path/to,
 *  exit1: old_path/from, exit: plain return.
 */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, unsigned int, flags)
{
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
	struct path old_path, new_path;
	struct qstr old_last, new_last;
	int old_type, new_type;
	struct inode *delegated_inode = NULL;
	struct filename *from;
	struct filename *to;
	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
	bool should_retry = false;
	int error;

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		return -EINVAL;

	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
	    (flags & RENAME_EXCHANGE))
		return -EINVAL;

	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
		return -EPERM;

	if (flags & RENAME_EXCHANGE)
		target_flags = 0;

retry:
	from = user_path_parent(olddfd, oldname,
				&old_path, &old_last, &old_type, lookup_flags);
	if (IS_ERR(from)) {
		error = PTR_ERR(from);
		goto exit;
	}

	to = user_path_parent(newdfd, newname,
				&new_path, &new_last, &new_type, lookup_flags);
	if (IS_ERR(to)) {
		error = PTR_ERR(to);
		goto exit1;
	}

	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto exit2;

	error = -EBUSY;
	if (old_type != LAST_NORM)
		goto exit2;

	/* A non-normal last component of the target: -EEXIST or -EBUSY. */
	if (flags & RENAME_NOREPLACE)
		error = -EEXIST;
	if (new_type != LAST_NORM)
		goto exit2;

	error = mnt_want_write(old_path.mnt);
	if (error)
		goto exit2;

retry_deleg:
	trap = lock_rename(new_path.dentry, old_path.dentry);

	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
	error = PTR_ERR(old_dentry);
	if (IS_ERR(old_dentry))
		goto exit3;
	/* source must exist */
	error = -ENOENT;
	if (d_is_negative(old_dentry))
		goto exit4;
	new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto exit4;
	error = -EEXIST;
	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
		goto exit5;
	if (flags & RENAME_EXCHANGE) {
		/* an exchange needs an existing target */
		error = -ENOENT;
		if (d_is_negative(new_dentry))
			goto exit5;

		if (!d_is_dir(new_dentry)) {
			error = -ENOTDIR;
			if (new_last.name[new_last.len])
				goto exit5;
		}
	}
	/* unless the source is a directory trailing slashes give -ENOTDIR */
	if (!d_is_dir(old_dentry)) {
		error = -ENOTDIR;
		if (old_last.name[old_last.len])
			goto exit5;
		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
			goto exit5;
	}
	/* source should not be ancestor of target */
	error = -EINVAL;
	if (old_dentry == trap)
		goto exit5;
	/* target should not be an ancestor of source */
	if (!(flags & RENAME_EXCHANGE))
		error = -ENOTEMPTY;
	if (new_dentry == trap)
		goto exit5;

	error = security_path_rename(&old_path, old_dentry,
				     &new_path, new_dentry, flags);
	if (error)
		goto exit5;
	error = vfs_rename(old_path.dentry->d_inode, old_dentry,
			   new_path.dentry->d_inode, new_dentry,
			   &delegated_inode, flags);
exit5:
	dput(new_dentry);
exit4:
	dput(old_dentry);
exit3:
	unlock_rename(new_path.dentry, old_path.dentry);
	/* Delegation breaks are waited for with the rename locks dropped. */
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	mnt_drop_write(old_path.mnt);
exit2:
	/*
	 * Record the retry decision now, but finish releasing both paths
	 * and names before jumping back to "retry".
	 */
	if (retry_estale(error, lookup_flags))
		should_retry = true;
	path_put(&new_path);
	putname(to);
exit1:
	path_put(&old_path);
	putname(from);
	if (should_retry) {
		should_retry = false;
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
exit:
	return error;
}

M
Miklos Szeredi 已提交
4433 4434 4435 4436 4437 4438
/* renameat(2): renameat2() with no flags. */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	return sys_renameat2(olddfd, oldname, newdfd, newname, 0);
}

4439
/* rename(2): renameat2() with both names relative to AT_FDCWD, no flags. */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
	return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}

M
Miklos Szeredi 已提交
4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457
int vfs_whiteout(struct inode *dir, struct dentry *dentry)
{
	int error = may_create(dir, dentry);
	if (error)
		return error;

	if (!dir->i_op->mknod)
		return -EPERM;

	return dir->i_op->mknod(dir, dentry,
				S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
}
EXPORT_SYMBOL(vfs_whiteout);

A
Al Viro 已提交
4458
/*
 * readlink_copy - copy a link body out to userspace
 * @buffer:	user buffer to fill
 * @buflen:	size of @buffer
 * @link:	link body, or an ERR_PTR() value
 *
 * If @link is an error pointer its error code is returned and nothing
 * is copied.  Otherwise at most @buflen bytes of @link are copied; no
 * terminating NUL is written.  Returns the number of bytes copied,
 * -EFAULT if the copy to userspace fails, or the error carried by @link.
 */
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
	int len = PTR_ERR(link);
	if (IS_ERR(link))
		goto out;

	len = strlen(link);
	/* Silently truncate to the caller's buffer size. */
	if (len > (unsigned) buflen)
		len = buflen;
	if (copy_to_user(buffer, link, len))
		len = -EFAULT;
out:
	return len;
}
EXPORT_SYMBOL(readlink_copy);
L
Linus Torvalds 已提交
4473 4474 4475 4476 4477 4478 4479 4480

/*
 * A helper for ->readlink().  This should be used *ONLY* for symlinks that
 * have ->follow_link() touching nd only in nd_set_link().  Using (or not
 * using) it for any given inode is up to filesystem.
 *
 * The link body is taken from inode->i_link when that is set; otherwise
 * it is obtained from ->follow_link().  The result of readlink_copy()
 * is returned, or the error from ->follow_link().
 *
 * ->put_link() is called only when ->follow_link() handed back a
 * non-NULL cookie.  The cookie starts out NULL so that the i_link path
 * (where ->follow_link() is never called) does not pass an
 * uninitialised pointer to ->put_link().
 */
int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	void *cookie = NULL;
	struct inode *inode = d_inode(dentry);
	const char *link = inode->i_link;
	int res;

	if (!link) {
		link = inode->i_op->follow_link(dentry, &cookie);
		if (IS_ERR(link))
			return PTR_ERR(link);
	}
	res = readlink_copy(buffer, buflen, link);
	if (cookie && inode->i_op->put_link)
		inode->i_op->put_link(inode, cookie);
	return res;
}
EXPORT_SYMBOL(generic_readlink);
L
Linus Torvalds 已提交
4497 4498 4499 4500

/* get the link contents into pagecache */
static char *page_getlink(struct dentry * dentry, struct page **ppage)
{
4501 4502
	char *kaddr;
	struct page *page;
L
Linus Torvalds 已提交
4503
	struct address_space *mapping = dentry->d_inode->i_mapping;
4504
	page = read_mapping_page(mapping, 0, NULL);
L
Linus Torvalds 已提交
4505
	if (IS_ERR(page))
4506
		return (char*)page;
L
Linus Torvalds 已提交
4507
	*ppage = page;
4508 4509 4510
	kaddr = kmap(page);
	nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
	return kaddr;
L
Linus Torvalds 已提交
4511 4512 4513 4514 4515
}

/*
 * page_readlink - copy a page-cache backed link body out to userspace
 *
 * Fetches the link with page_getlink() and passes it (or its error
 * pointer) straight to readlink_copy().  @page stays NULL when
 * page_getlink() fails, so the unmap/release below only runs when a
 * page was actually obtained.
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	struct page *page = NULL;
	int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
	if (page) {
		kunmap(page);
		page_cache_release(page);
	}
	return res;
}
EXPORT_SYMBOL(page_readlink);
L
Linus Torvalds 已提交
4524

4525
const char *page_follow_link_light(struct dentry *dentry, void **cookie)
L
Linus Torvalds 已提交
4526
{
4527
	struct page *page = NULL;
4528 4529 4530 4531
	char *res = page_getlink(dentry, &page);
	if (!IS_ERR(res))
		*cookie = page;
	return res;
L
Linus Torvalds 已提交
4532
}
4533
EXPORT_SYMBOL(page_follow_link_light);
L
Linus Torvalds 已提交
4534

4535
/*
 * page_put_link - ->put_link() for page-cache backed symlinks
 *
 * @cookie is treated as the struct page to unmap and release; the
 * inode argument is not used.
 */
void page_put_link(struct inode *unused, void *cookie)
{
	struct page *page = cookie;
	kunmap(page);
	page_cache_release(page);
}
EXPORT_SYMBOL(page_put_link);
L
Linus Torvalds 已提交
4542

4543 4544 4545 4546
/*
 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
 */
int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
L
Linus Torvalds 已提交
4547 4548
{
	struct address_space *mapping = inode->i_mapping;
4549
	struct page *page;
4550
	void *fsdata;
4551
	int err;
L
Linus Torvalds 已提交
4552
	char *kaddr;
4553 4554 4555
	unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
	if (nofs)
		flags |= AOP_FLAG_NOFS;
L
Linus Torvalds 已提交
4556

4557
retry:
4558
	err = pagecache_write_begin(NULL, mapping, 0, len-1,
4559
				flags, &page, &fsdata);
L
Linus Torvalds 已提交
4560
	if (err)
4561 4562
		goto fail;

4563
	kaddr = kmap_atomic(page);
L
Linus Torvalds 已提交
4564
	memcpy(kaddr, symname, len-1);
4565
	kunmap_atomic(kaddr);
4566 4567 4568

	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
							page, fsdata);
L
Linus Torvalds 已提交
4569 4570
	if (err < 0)
		goto fail;
4571 4572 4573
	if (err < len-1)
		goto retry;

L
Linus Torvalds 已提交
4574 4575 4576 4577 4578
	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}
4579
EXPORT_SYMBOL(__page_symlink);
L
Linus Torvalds 已提交
4580

4581 4582 4583
int page_symlink(struct inode *inode, const char *symname, int len)
{
	return __page_symlink(inode, symname, len,
4584
			!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
4585
}
4586
EXPORT_SYMBOL(page_symlink);
4587

4588
const struct inode_operations page_symlink_inode_operations = {
L
Linus Torvalds 已提交
4589 4590 4591 4592 4593
	.readlink	= generic_readlink,
	.follow_link	= page_follow_link_light,
	.put_link	= page_put_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);