namei.c 112.2 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
18
#include <linux/export.h>
19
#include <linux/kernel.h>
L
Linus Torvalds 已提交
20 21 22 23
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
R
Robert Love 已提交
24
#include <linux/fsnotify.h>
L
Linus Torvalds 已提交
25 26
#include <linux/personality.h>
#include <linux/security.h>
M
Mimi Zohar 已提交
27
#include <linux/ima.h>
L
Linus Torvalds 已提交
28 29 30
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
31
#include <linux/capability.h>
32
#include <linux/file.h>
33
#include <linux/fcntl.h>
34
#include <linux/device_cgroup.h>
35
#include <linux/fs_struct.h>
36
#include <linux/posix_acl.h>
37
#include <linux/hash.h>
L
Linus Torvalds 已提交
38 39
#include <asm/uaccess.h>

40
#include "internal.h"
41
#include "mount.h"
42

L
Linus Torvalds 已提交
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics also affects mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
121
void final_putname(struct filename *name)
L
Linus Torvalds 已提交
122
{
123 124 125 126 127 128
	if (name->separate) {
		__putname(name->name);
		kfree(name);
	} else {
		__putname(name);
	}
129 130
}

131 132
#define EMBEDDED_NAME_MAX	(PATH_MAX - sizeof(struct filename))

133 134 135 136
static struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
	struct filename *result, *err;
137
	int len;
138 139
	long max;
	char *kname;
140

141 142 143 144
	result = audit_reusename(filename);
	if (result)
		return result;

145
	result = __getname();
146
	if (unlikely(!result))
147 148
		return ERR_PTR(-ENOMEM);

149 150 151 152 153
	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
	kname = (char *)result + sizeof(*result);
154
	result->name = kname;
155 156 157 158 159
	result->separate = false;
	max = EMBEDDED_NAME_MAX;

recopy:
	len = strncpy_from_user(kname, filename, max);
160 161
	if (unlikely(len < 0)) {
		err = ERR_PTR(len);
162
		goto error;
163
	}
164

165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
	if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) {
		kname = (char *)result;

		result = kzalloc(sizeof(*result), GFP_KERNEL);
		if (!result) {
			err = ERR_PTR(-ENOMEM);
			result = (struct filename *)kname;
			goto error;
		}
		result->name = kname;
		result->separate = true;
		max = PATH_MAX;
		goto recopy;
	}

186 187 188
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
189
			*empty = 1;
190 191 192
		err = ERR_PTR(-ENOENT);
		if (!(flags & LOOKUP_EMPTY))
			goto error;
L
Linus Torvalds 已提交
193
	}
194 195

	err = ERR_PTR(-ENAMETOOLONG);
196 197 198 199
	if (unlikely(len >= PATH_MAX))
		goto error;

	result->uptr = filename;
200
	result->aname = NULL;
201 202
	audit_getname(result);
	return result;
203 204

error:
205
	final_putname(result);
206
	return err;
L
Linus Torvalds 已提交
207 208
}

209 210
/*
 * getname - copy a pathname from user space with default behaviour
 * @filename: user-space pointer to the pathname
 *
 * Same as getname_flags() with no flags and no "empty" reporting, so an
 * empty pathname yields ERR_PTR(-ENOENT).
 */
struct filename *
getname(const char __user * filename)
{
	return getname_flags(filename, 0, NULL);
}

215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
/*
 * The "getname_kernel()" interface doesn't do pathnames longer
 * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user.
 */
struct filename *
getname_kernel(const char * filename)
{
	struct filename *result;
	char *kname;
	int len;

	len = strlen(filename);
	if (len >= EMBEDDED_NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

	kname = (char *)result + sizeof(*result);
	result->name = kname;
	result->uptr = NULL;
	result->aname = NULL;
	result->separate = false;

	strlcpy(kname, filename, EMBEDDED_NAME_MAX);
	return result;
}

L
Linus Torvalds 已提交
244
#ifdef CONFIG_AUDITSYSCALL
/*
 * putname - release a struct filename, letting audit intercept it
 * @name: filename to release
 *
 * When audit_dummy_context() reports false, @name is handed to
 * audit_putname() and nothing is freed here.  Otherwise the filename is
 * freed immediately with final_putname().
 */
void putname(struct filename *name)
{
	if (unlikely(!audit_dummy_context())) {
		/* plain call: "return expr;" is not valid ISO C in a void function */
		audit_putname(name);
		return;
	}
	final_putname(name);
}
#endif

253 254
static int check_acl(struct inode *inode, int mask)
{
255
#ifdef CONFIG_FS_POSIX_ACL
256 257 258
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
259 260
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
261
	                return -EAGAIN;
262 263 264
		/* no ->get_acl() calls in RCU mode... */
		if (acl == ACL_NOT_CACHED)
			return -ECHILD;
A
Ari Savolainen 已提交
265
	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
266 267
	}

C
Christoph Hellwig 已提交
268 269 270
	acl = get_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
271 272 273 274 275
	if (acl) {
	        int error = posix_acl_permission(inode, acl, mask);
	        posix_acl_release(acl);
	        return error;
	}
276
#endif
277 278 279 280

	return -EAGAIN;
}

281
/*
282
 * This does the basic permission checking
L
Linus Torvalds 已提交
283
 */
284
static int acl_permission_check(struct inode *inode, int mask)
L
Linus Torvalds 已提交
285
{
286
	unsigned int mode = inode->i_mode;
L
Linus Torvalds 已提交
287

288
	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
L
Linus Torvalds 已提交
289 290
		mode >>= 6;
	else {
291
		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
292
			int error = check_acl(inode, mask);
293 294
			if (error != -EAGAIN)
				return error;
L
Linus Torvalds 已提交
295 296 297 298 299 300 301 302 303
		}

		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/*
	 * If the DACs are ok we don't need any capability check.
	 */
304
	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
L
Linus Torvalds 已提交
305
		return 0;
306 307 308 309
	return -EACCES;
}

/**
310
 * generic_permission -  check for access rights on a Posix-like filesystem
311
 * @inode:	inode to check access rights for
312
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
313 314 315 316
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
317 318 319 320 321
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
322
 */
323
int generic_permission(struct inode *inode, int mask)
324 325 326 327
{
	int ret;

	/*
328
	 * Do the basic permission checks.
329
	 */
330
	ret = acl_permission_check(inode, mask);
331 332
	if (ret != -EACCES)
		return ret;
L
Linus Torvalds 已提交
333

334 335
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
336
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
337 338
			return 0;
		if (!(mask & MAY_WRITE))
339 340
			if (capable_wrt_inode_uidgid(inode,
						     CAP_DAC_READ_SEARCH))
341 342 343
				return 0;
		return -EACCES;
	}
L
Linus Torvalds 已提交
344 345
	/*
	 * Read/write DACs are always overridable.
346 347
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
L
Linus Torvalds 已提交
348
	 */
349
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
350
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
L
Linus Torvalds 已提交
351 352 353 354 355
			return 0;

	/*
	 * Searching includes executable on directories, else just read.
	 */
356
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
357
	if (mask == MAY_READ)
358
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
L
Linus Torvalds 已提交
359 360 361 362
			return 0;

	return -EACCES;
}
363
EXPORT_SYMBOL(generic_permission);
L
Linus Torvalds 已提交
364

365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384
/*
 * The common case should go straight to generic_permission() without
 * touching inode->i_op at all.  IOP_FASTPERM in inode->i_opflags caches
 * the fact that this inode has no ->permission method of its own.
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	/* Fast path: already known to have no special permission method. */
	if (likely(inode->i_opflags & IOP_FASTPERM))
		return generic_permission(inode, mask);

	if (likely(inode->i_op->permission))
		return inode->i_op->permission(inode, mask);

	/* No method: remember that; the flag is set once per inode lifetime. */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_FASTPERM;
	spin_unlock(&inode->i_lock);

	return generic_permission(inode, mask);
}

C
Christoph Hellwig 已提交
385
/**
D
David Howells 已提交
386 387 388
 * __inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
C
Christoph Hellwig 已提交
389
 *
D
David Howells 已提交
390
 * Check for read/write/execute permissions on an inode.
391 392
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
D
David Howells 已提交
393 394 395
 *
 * This does not check for a read-only file system.  You probably want
 * inode_permission().
C
Christoph Hellwig 已提交
396
 */
D
David Howells 已提交
397
int __inode_permission(struct inode *inode, int mask)
L
Linus Torvalds 已提交
398
{
399
	int retval;
L
Linus Torvalds 已提交
400

401
	if (unlikely(mask & MAY_WRITE)) {
L
Linus Torvalds 已提交
402 403 404 405 406 407 408
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EACCES;
	}

409
	retval = do_inode_permission(inode, mask);
L
Linus Torvalds 已提交
410 411 412
	if (retval)
		return retval;

413 414 415 416
	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

417
	return security_inode_permission(inode, mask);
L
Linus Torvalds 已提交
418
}
419
EXPORT_SYMBOL(__inode_permission);
L
Linus Torvalds 已提交
420

D
David Howells 已提交
421 422 423
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 *
 * Returns -EROFS when write access is requested to a regular file,
 * directory or symlink on a superblock with MS_RDONLY set; 0 otherwise.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	if (unlikely(mask & MAY_WRITE)) {
		umode_t mode = inode->i_mode;

		/* Nobody gets write access to a read-only fs. */
		if ((sb->s_flags & MS_RDONLY) &&
		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
			return -EROFS;
	}
	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Runs the superblock-wide check (sb_permission()) first and, if that
 * passes, the per-inode check (__inode_permission()).  fs[ug]id is used for
 * the check, so filesystem access rights can be set independently of the
 * "normal" UIDs used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 *
 * Returns 0 if access is allowed, a negative error otherwise.
 */
int inode_permission(struct inode *inode, int mask)
{
	int err = sb_permission(inode->i_sb, inode, mask);

	return err ? err : __inode_permission(inode, mask);
}
462
EXPORT_SYMBOL(inode_permission);
D
David Howells 已提交
463

J
Jan Blunck 已提交
464 465 466 467 468 469
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
470
void path_get(const struct path *path)
J
Jan Blunck 已提交
471 472 473 474 475 476
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

J
Jan Blunck 已提交
477 478 479 480 481 482
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
483
void path_put(const struct path *path)
L
Linus Torvalds 已提交
484
{
J
Jan Blunck 已提交
485 486
	dput(path->dentry);
	mntput(path->mnt);
L
Linus Torvalds 已提交
487
}
J
Jan Blunck 已提交
488
EXPORT_SYMBOL(path_put);
L
Linus Torvalds 已提交
489

490 491 492 493 494 495 496 497 498 499 500 501
/*
 * Per-lookup state handed through the path walking helpers in this file.
 */
struct nameidata {
	/* current position of the walk (vfsmount + dentry) */
	struct path	path;
	/* NOTE(review): presumably the most recently parsed component - confirm */
	struct qstr	last;
	/* root used for absolute paths; filled by set_root()/set_root_rcu() */
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
	/* LOOKUP_* bits such as LOOKUP_RCU, LOOKUP_ROOT, LOOKUP_JUMPED */
	unsigned int	flags;
	/*
	 * seq is compared against a dentry's ->d_seq when leaving rcu-walk;
	 * m_seq is what gets passed to legitimize_mnt().
	 */
	unsigned	seq, m_seq;
	/* follow_link() stores LAST_BIND here before calling ->follow_link() */
	int		last_type;
	/* index into saved_names[] used by nd_set_link()/nd_get_link() */
	unsigned	depth;
	/* one link-body pointer per nesting depth */
	char *saved_names[MAX_NESTED_LINKS + 1];
};

A
Al Viro 已提交
502
/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
A
Al Viro 已提交
514 515 516
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry or NULL
517
 * Returns: 0 on success, -ECHILD on failure
N
Nick Piggin 已提交
518
 *
A
Al Viro 已提交
519 520 521
 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd or NULL.  Must be called from rcu-walk context.
N
Nick Piggin 已提交
522
 */
A
Al Viro 已提交
523
static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
N
Nick Piggin 已提交
524 525 526 527 528
{
	struct fs_struct *fs = current->fs;
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
529 530

	/*
A
Al Viro 已提交
531 532 533 534 535 536
	 * After legitimizing the bastards, terminate_walk()
	 * will do the right thing for non-RCU mode, and all our
	 * subsequent exit cases should rcu_read_unlock()
	 * before returning.  Do vfsmount first; if dentry
	 * can't be legitimized, just set nd->path.dentry to NULL
	 * and rely on dput(NULL) being a no-op.
537
	 */
A
Al Viro 已提交
538
	if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
539 540
		return -ECHILD;
	nd->flags &= ~LOOKUP_RCU;
541

A
Al Viro 已提交
542 543
	if (!lockref_get_not_dead(&parent->d_lockref)) {
		nd->path.dentry = NULL;	
544
		goto out;
A
Al Viro 已提交
545 546
	}

547 548 549 550 551 552 553 554 555 556 557
	/*
	 * For a negative lookup, the lookup sequence point is the parents
	 * sequence point, and it only needs to revalidate the parent dentry.
	 *
	 * For a positive lookup, we need to move both the parent and the
	 * dentry from the RCU domain to be properly refcounted. And the
	 * sequence number in the dentry validates *both* dentry counters,
	 * since we checked the sequence number of the parent after we got
	 * the child sequence number. So we know the parent must still
	 * be valid if the child sequence number is still valid.
	 */
A
Al Viro 已提交
558
	if (!dentry) {
559 560
		if (read_seqcount_retry(&parent->d_seq, nd->seq))
			goto out;
A
Al Viro 已提交
561 562
		BUG_ON(nd->inode != parent->d_inode);
	} else {
563 564 565 566
		if (!lockref_get_not_dead(&dentry->d_lockref))
			goto out;
		if (read_seqcount_retry(&dentry->d_seq, nd->seq))
			goto drop_dentry;
A
Al Viro 已提交
567
	}
568 569 570 571 572 573 574 575 576

	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		spin_lock(&fs->lock);
		if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry)
			goto unlock_and_drop_dentry;
N
Nick Piggin 已提交
577 578 579 580
		path_get(&nd->root);
		spin_unlock(&fs->lock);
	}

A
Al Viro 已提交
581
	rcu_read_unlock();
N
Nick Piggin 已提交
582
	return 0;
A
Al Viro 已提交
583

584 585 586
unlock_and_drop_dentry:
	spin_unlock(&fs->lock);
drop_dentry:
A
Al Viro 已提交
587
	rcu_read_unlock();
588
	dput(dentry);
589
	goto drop_root_mnt;
590
out:
A
Al Viro 已提交
591
	rcu_read_unlock();
592 593 594
drop_root_mnt:
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
N
Nick Piggin 已提交
595 596 597
	return -ECHILD;
}

598
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
599
{
600
	return dentry->d_op->d_revalidate(dentry, flags);
601 602
}

603 604 605
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
606
 *
607 608 609 610 611
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
612
 */
613
static int complete_walk(struct nameidata *nd)
614
{
A
Al Viro 已提交
615
	struct dentry *dentry = nd->path.dentry;
616 617
	int status;

618 619 620 621
	if (nd->flags & LOOKUP_RCU) {
		nd->flags &= ~LOOKUP_RCU;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
622

A
Al Viro 已提交
623
		if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
A
Al Viro 已提交
624
			rcu_read_unlock();
A
Al Viro 已提交
625 626
			return -ECHILD;
		}
627
		if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
A
Al Viro 已提交
628
			rcu_read_unlock();
A
Al Viro 已提交
629
			mntput(nd->path.mnt);
630 631 632
			return -ECHILD;
		}
		if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
A
Al Viro 已提交
633
			rcu_read_unlock();
634
			dput(dentry);
A
Al Viro 已提交
635
			mntput(nd->path.mnt);
636 637
			return -ECHILD;
		}
A
Al Viro 已提交
638
		rcu_read_unlock();
639 640
	}

A
Al Viro 已提交
641 642 643
	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

644
	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
645 646
		return 0;

647
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
648 649 650
	if (status > 0)
		return 0;

A
Al Viro 已提交
651
	if (!status)
652
		status = -ESTALE;
A
Al Viro 已提交
653

654
	path_put(&nd->path);
655 656 657
	return status;
}

A
Al Viro 已提交
658 659
static __always_inline void set_root(struct nameidata *nd)
{
660
	get_fs_root(current->fs, &nd->root);
A
Al Viro 已提交
661 662
}

663 664
static int link_path_walk(const char *, struct nameidata *);

665
static __always_inline unsigned set_root_rcu(struct nameidata *nd)
N
Nick Piggin 已提交
666
{
667 668
	struct fs_struct *fs = current->fs;
	unsigned seq, res;
N
Nick Piggin 已提交
669

670 671 672 673 674 675
	do {
		seq = read_seqcount_begin(&fs->seq);
		nd->root = fs->root;
		res = __read_seqcount_begin(&nd->root.dentry->d_seq);
	} while (read_seqcount_retry(&fs->seq, seq));
	return res;
N
Nick Piggin 已提交
676 677
}

J
Jan Blunck 已提交
678
static void path_put_conditional(struct path *path, struct nameidata *nd)
679 680
{
	dput(path->dentry);
681
	if (path->mnt != nd->path.mnt)
682 683 684
		mntput(path->mnt);
}

685 686
static inline void path_to_nameidata(const struct path *path,
					struct nameidata *nd)
687
{
N
Nick Piggin 已提交
688 689 690 691
	if (!(nd->flags & LOOKUP_RCU)) {
		dput(nd->path.dentry);
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);
692
	}
N
Nick Piggin 已提交
693
	nd->path.mnt = path->mnt;
694
	nd->path.dentry = path->dentry;
695 696
}

C
Christoph Hellwig 已提交
697 698 699 700 701 702 703 704 705 706 707 708 709
/*
 * Jump straight to an already-resolved path from ->follow_link.
 *
 * The caller must hold a reference on @path; the old nd->path reference is
 * dropped here and @path takes its place.  LOOKUP_JUMPED is set on @nd.
 */
void nd_jump_link(struct nameidata *nd, struct path *path)
{
	path_put(&nd->path);

	nd->flags |= LOOKUP_JUMPED;
	nd->path = *path;
	nd->inode = path->dentry->d_inode;
}

710 711 712 713 714 715 716 717 718 719 720 721
/*
 * Store @path as the saved link body for the current nesting depth of @nd.
 */
void nd_set_link(struct nameidata *nd, char *path)
{
	char **slot = &nd->saved_names[nd->depth];

	*slot = path;
}
EXPORT_SYMBOL(nd_set_link);

/*
 * Return the saved link body for the current nesting depth of @nd.
 */
char *nd_get_link(struct nameidata *nd)
{
	const unsigned level = nd->depth;

	return nd->saved_names[level];
}
EXPORT_SYMBOL(nd_get_link);

722 723 724
static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
{
	struct inode *inode = link->dentry->d_inode;
725
	if (inode->i_op->put_link)
726 727 728 729
		inode->i_op->put_link(link->dentry, nd, cookie);
	path_put(link);
}

730 731
/*
 * Link-protection switches, both off (0) by default.
 * may_follow_link() allows everything while sysctl_protected_symlinks is 0;
 * may_linkat() allows everything while sysctl_protected_hardlinks is 0.
 */
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
K
Kees Cook 已提交
732 733 734 735

/**
 * may_follow_link - Check symlink following for unsafe situations
 * @link: The path of the symlink
736
 * @nd: nameidata pathwalk data
K
Kees Cook 已提交
737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
static inline int may_follow_link(struct path *link, struct nameidata *nd)
{
	const struct inode *inode;
	const struct inode *parent;

	if (!sysctl_protected_symlinks)
		return 0;

	/* Allowed if owner and follower match. */
	inode = link->dentry->d_inode;
759
	if (uid_eq(current_cred()->fsuid, inode->i_uid))
K
Kees Cook 已提交
760 761 762 763 764 765 766 767
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
	parent = nd->path.dentry->d_inode;
	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
		return 0;

	/* Allowed if parent directory and link owner match. */
768
	if (uid_eq(parent->i_uid, inode->i_uid))
K
Kees Cook 已提交
769 770
		return 0;

771
	audit_log_link_denied("follow_link", link);
K
Kees Cook 已提交
772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837
	path_put_conditional(link, nd);
	path_put(&nd->path);
	return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @inode: the source inode to hardlink from
 *
 * A source is unsafe, and false is returned, if any of these hold:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct inode *inode)
{
	const umode_t sgid_exec = S_ISGID | S_IXGRP;
	umode_t mode = inode->i_mode;

	/*
	 * Neither special files nor setuid files should get pinned to the
	 * filesystem.
	 */
	if (!S_ISREG(mode) || (mode & S_ISUID))
		return false;

	/* Executable setgid files should not get pinned either. */
	if ((mode & sgid_exec) == sgid_exec)
		return false;

	/* Hardlinking to unreadable or unwritable sources is dangerous. */
	return inode_permission(inode, MAY_READ | MAY_WRITE) == 0;
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
 *  - not CAP_FOWNER
 *
 * Returns 0 if successful, -ve on error.
 */
static int may_linkat(struct path *link)
{
	const struct cred *cred;
	struct inode *inode;

	if (!sysctl_protected_hardlinks)
		return 0;

	cred = current_cred();
	inode = link->dentry->d_inode;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
838
	if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
K
Kees Cook 已提交
839 840 841
	    capable(CAP_FOWNER))
		return 0;

842
	audit_log_link_denied("linkat", link);
K
Kees Cook 已提交
843 844 845
	return -EPERM;
}

A
Al Viro 已提交
846
static __always_inline int
847
follow_link(struct path *link, struct nameidata *nd, void **p)
L
Linus Torvalds 已提交
848
{
849
	struct dentry *dentry = link->dentry;
850 851
	int error;
	char *s;
L
Linus Torvalds 已提交
852

853 854
	BUG_ON(nd->flags & LOOKUP_RCU);

A
Al Viro 已提交
855 856 857
	if (link->mnt == nd->path.mnt)
		mntget(link->mnt);

858 859 860 861
	error = -ELOOP;
	if (unlikely(current->total_link_count >= 40))
		goto out_put_nd_path;

862 863 864
	cond_resched();
	current->total_link_count++;

A
Al Viro 已提交
865
	touch_atime(link);
L
Linus Torvalds 已提交
866
	nd_set_link(nd, NULL);
A
Al Viro 已提交
867

868
	error = security_inode_follow_link(link->dentry, nd);
869 870
	if (error)
		goto out_put_nd_path;
871

872
	nd->last_type = LAST_BIND;
A
Al Viro 已提交
873 874
	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
	error = PTR_ERR(*p);
875
	if (IS_ERR(*p))
876
		goto out_put_nd_path;
877 878 879 880

	error = 0;
	s = nd_get_link(nd);
	if (s) {
881 882 883 884 885 886
		if (unlikely(IS_ERR(s))) {
			path_put(&nd->path);
			put_link(nd, link, *p);
			return PTR_ERR(s);
		}
		if (*s == '/') {
887 888
			if (!nd->root.mnt)
				set_root(nd);
889 890 891 892 893 894 895
			path_put(&nd->path);
			nd->path = nd->root;
			path_get(&nd->root);
			nd->flags |= LOOKUP_JUMPED;
		}
		nd->inode = nd->path.dentry->d_inode;
		error = link_path_walk(s, nd);
C
Christoph Hellwig 已提交
896 897
		if (unlikely(error))
			put_link(nd, link, *p);
L
Linus Torvalds 已提交
898
	}
899 900 901 902

	return error;

out_put_nd_path:
A
Arnd Bergmann 已提交
903
	*p = NULL;
904 905
	path_put(&nd->path);
	path_put(link);
L
Linus Torvalds 已提交
906 907 908
	return error;
}

N
Nick Piggin 已提交
909 910
/*
 * Reference-free variant of follow_up(): replace @path with the mountpoint
 * of its mount in the parent mount, without taking or dropping any
 * references and without taking mount_lock.
 *
 * Returns 1 if @path was moved up, 0 if the parent mount is @path's own
 * mount (nothing above it).
 */
static int follow_up_rcu(struct path *path)
{
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
	struct dentry *mountpoint;

	parent = mnt->mnt_parent;
	if (&parent->mnt == path->mnt)
		return 0;
	mountpoint = mnt->mnt_mountpoint;
	path->dentry = mountpoint;
	path->mnt = &parent->mnt;
	return 1;
}

924 925 926 927 928 929 930 931 932 933
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
A
Al Viro 已提交
934
int follow_up(struct path *path)
L
Linus Torvalds 已提交
935
{
936 937
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
L
Linus Torvalds 已提交
938
	struct dentry *mountpoint;
N
Nick Piggin 已提交
939

A
Al Viro 已提交
940
	read_seqlock_excl(&mount_lock);
941
	parent = mnt->mnt_parent;
A
Al Viro 已提交
942
	if (parent == mnt) {
A
Al Viro 已提交
943
		read_sequnlock_excl(&mount_lock);
L
Linus Torvalds 已提交
944 945
		return 0;
	}
946
	mntget(&parent->mnt);
947
	mountpoint = dget(mnt->mnt_mountpoint);
A
Al Viro 已提交
948
	read_sequnlock_excl(&mount_lock);
A
Al Viro 已提交
949 950 951
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
952
	path->mnt = &parent->mnt;
L
Linus Torvalds 已提交
953 954
	return 1;
}
955
EXPORT_SYMBOL(follow_up);
L
Linus Torvalds 已提交
956

N
Nick Piggin 已提交
957
/*
958 959 960
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
L
Linus Torvalds 已提交
961
 */
962 963
static int follow_automount(struct path *path, unsigned flags,
			    bool *need_mntput)
N
Nick Piggin 已提交
964
{
965
	struct vfsmount *mnt;
966
	int err;
967 968 969 970

	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
		return -EREMOTE;

971 972 973 974 975 976 977 978 979 980
	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
981
	 */
982
	if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
983
		     LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
984 985 986
	    path->dentry->d_inode)
		return -EISDIR;

987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001
	current->total_link_count++;
	if (current->total_link_count >= 40)
		return -ELOOP;

	mnt = path->dentry->d_op->d_automount(path);
	if (IS_ERR(mnt)) {
		/*
		 * The filesystem is allowed to return -EISDIR here to indicate
		 * it doesn't want to automount.  For instance, autofs would do
		 * this so that its userspace daemon can mount on this dentry.
		 *
		 * However, we can only permit this if it's a terminal point in
		 * the path being looked up; if it wasn't then the remainder of
		 * the path is inaccessible and we should say so.
		 */
A
Al Viro 已提交
1002
		if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
1003 1004
			return -EREMOTE;
		return PTR_ERR(mnt);
N
Nick Piggin 已提交
1005
	}
1006

1007 1008
	if (!mnt) /* mount collision */
		return 0;
N
Nick Piggin 已提交
1009

1010 1011 1012 1013 1014
	if (!*need_mntput) {
		/* lock_mount() may release path->mnt on error */
		mntget(path->mnt);
		*need_mntput = true;
	}
1015
	err = finish_automount(mnt, path);
1016

1017 1018 1019
	switch (err) {
	case -EBUSY:
		/* Someone else made a mount here whilst we were busy */
1020
		return 0;
1021
	case 0:
1022
		path_put(path);
1023 1024 1025
		path->mnt = mnt;
		path->dentry = dget(mnt->mnt_root);
		return 0;
1026 1027
	default:
		return err;
1028
	}
1029

A
Al Viro 已提交
1030 1031
}

1032 1033
/*
 * Handle a dentry that is managed in some way.
1034
 * - Flagged for transit management (autofs)
1035 1036 1037 1038 1039 1040 1041 1042
 * - Flagged as mountpoint
 * - Flagged as automount point
 *
 * This may only be called in refwalk mode.
 *
 * Serialization is taken care of in namespace.c
 */
static int follow_managed(struct path *path, unsigned flags)
L
Linus Torvalds 已提交
1043
{
1044
	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1045 1046
	unsigned managed;
	bool need_mntput = false;
1047
	int ret = 0;
1048 1049 1050 1051 1052 1053 1054

	/* Given that we're not holding a lock here, we retain the value in a
	 * local variable for each dentry as we look at it so that we don't see
	 * the components of that value change under us */
	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       managed &= DCACHE_MANAGED_DENTRY,
	       unlikely(managed != 0)) {
1055 1056 1057 1058 1059
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
1060
			ret = path->dentry->d_op->d_manage(path->dentry, false);
1061
			if (ret < 0)
1062
				break;
1063 1064
		}

1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079
		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				need_mntput = true;
				continue;
			}

			/* Something is mounted on this dentry in another
			 * namespace and/or whatever was mounted there in this
A
Al Viro 已提交
1080 1081
			 * namespace got unmounted before lookup_mnt() could
			 * get it */
1082 1083 1084 1085 1086 1087
		}

		/* Handle an automount point */
		if (managed & DCACHE_NEED_AUTOMOUNT) {
			ret = follow_automount(path, flags, &need_mntput);
			if (ret < 0)
1088
				break;
1089 1090 1091 1092 1093
			continue;
		}

		/* We didn't change the current path point */
		break;
L
Linus Torvalds 已提交
1094
	}
1095 1096 1097 1098 1099

	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	if (ret == -EISDIR)
		ret = 0;
1100
	return ret < 0 ? ret : need_mntput;
L
Linus Torvalds 已提交
1101 1102
}

1103
int follow_down_one(struct path *path)
L
Linus Torvalds 已提交
1104 1105 1106
{
	struct vfsmount *mounted;

A
Al Viro 已提交
1107
	mounted = lookup_mnt(path);
L
Linus Torvalds 已提交
1108
	if (mounted) {
A
Al Viro 已提交
1109 1110 1111 1112
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
L
Linus Torvalds 已提交
1113 1114 1115 1116
		return 1;
	}
	return 0;
}
1117
EXPORT_SYMBOL(follow_down_one);
L
Linus Torvalds 已提交
1118

1119
static inline int managed_dentry_rcu(struct dentry *dentry)
1120
{
1121 1122
	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
		dentry->d_op->d_manage(dentry, true) : 0;
1123 1124
}

1125
/*
1126 1127
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
1128 1129
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1130
			       struct inode **inode)
1131
{
1132
	for (;;) {
1133
		struct mount *mounted;
1134 1135 1136 1137
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
1138 1139 1140
		switch (managed_dentry_rcu(path->dentry)) {
		case -ECHILD:
		default:
1141
			return false;
1142 1143 1144 1145 1146
		case -EISDIR:
			return true;
		case 0:
			break;
		}
1147 1148

		if (!d_mountpoint(path->dentry))
1149
			return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1150

A
Al Viro 已提交
1151
		mounted = __lookup_mnt(path->mnt, path->dentry);
1152 1153
		if (!mounted)
			break;
1154 1155
		path->mnt = &mounted->mnt;
		path->dentry = mounted->mnt.mnt_root;
1156
		nd->flags |= LOOKUP_JUMPED;
1157
		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1158 1159 1160 1161 1162 1163
		/*
		 * Update the inode too. We don't need to re-check the
		 * dentry sequence number here after this d_inode read,
		 * because a mount-point is always pinned.
		 */
		*inode = path->dentry->d_inode;
1164
	}
1165
	return !read_seqretry(&mount_lock, nd->m_seq) &&
1166
		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1167 1168
}

N
Nick Piggin 已提交
1169 1170
static int follow_dotdot_rcu(struct nameidata *nd)
{
1171
	struct inode *inode = nd->inode;
1172 1173
	if (!nd->root.mnt)
		set_root_rcu(nd);
N
Nick Piggin 已提交
1174

1175
	while (1) {
N
Nick Piggin 已提交
1176 1177 1178 1179 1180 1181 1182 1183 1184
		if (nd->path.dentry == nd->root.dentry &&
		    nd->path.mnt == nd->root.mnt) {
			break;
		}
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
			struct dentry *old = nd->path.dentry;
			struct dentry *parent = old->d_parent;
			unsigned seq;

1185
			inode = parent->d_inode;
N
Nick Piggin 已提交
1186 1187
			seq = read_seqcount_begin(&parent->d_seq);
			if (read_seqcount_retry(&old->d_seq, nd->seq))
1188
				goto failed;
N
Nick Piggin 已提交
1189 1190 1191 1192 1193 1194
			nd->path.dentry = parent;
			nd->seq = seq;
			break;
		}
		if (!follow_up_rcu(&nd->path))
			break;
1195
		inode = nd->path.dentry->d_inode;
N
Nick Piggin 已提交
1196 1197
		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
	}
1198 1199 1200 1201 1202 1203 1204
	while (d_mountpoint(nd->path.dentry)) {
		struct mount *mounted;
		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
		if (!mounted)
			break;
		nd->path.mnt = &mounted->mnt;
		nd->path.dentry = mounted->mnt.mnt_root;
1205
		inode = nd->path.dentry->d_inode;
1206
		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1207
		if (read_seqretry(&mount_lock, nd->m_seq))
1208 1209
			goto failed;
	}
1210
	nd->inode = inode;
N
Nick Piggin 已提交
1211
	return 0;
1212 1213 1214

failed:
	nd->flags &= ~LOOKUP_RCU;
1215 1216
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
A
Al Viro 已提交
1217
	rcu_read_unlock();
1218
	return -ECHILD;
N
Nick Piggin 已提交
1219 1220
}

1221 1222 1223 1224 1225
/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 */
1226
int follow_down(struct path *path)
1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245
{
	unsigned managed;
	int ret;

	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held.
		 *
		 * We indicate to the filesystem if someone is trying to mount
		 * something here.  This gives autofs the chance to deny anyone
		 * other than its daemon the right to mount on its
		 * superstructure.
		 *
		 * The filesystem may sleep at this point.
		 */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
1246
			ret = path->dentry->d_op->d_manage(
1247
				path->dentry, false);
1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268
			if (ret < 0)
				return ret == -EISDIR ? 0 : ret;
		}

		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (!mounted)
				break;
			dput(path->dentry);
			mntput(path->mnt);
			path->mnt = mounted;
			path->dentry = dget(mounted->mnt_root);
			continue;
		}

		/* Don't handle automount points here */
		break;
	}
	return 0;
}
1269
EXPORT_SYMBOL(follow_down);
1270

1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286
/*
 * refwalk helper for follow_dotdot(): keep stepping onto whatever is
 * mounted on @path until the top of the mountpoint pile is reached.
 */
static void follow_mount(struct path *path)
{
	/* follow_down_one() performs exactly one lookup_mnt() step. */
	while (d_mountpoint(path->dentry) && follow_down_one(path))
		;
}

N
Nick Piggin 已提交
1287
static void follow_dotdot(struct nameidata *nd)
L
Linus Torvalds 已提交
1288
{
1289 1290
	if (!nd->root.mnt)
		set_root(nd);
1291

L
Linus Torvalds 已提交
1292
	while(1) {
1293
		struct dentry *old = nd->path.dentry;
L
Linus Torvalds 已提交
1294

A
Al Viro 已提交
1295 1296
		if (nd->path.dentry == nd->root.dentry &&
		    nd->path.mnt == nd->root.mnt) {
L
Linus Torvalds 已提交
1297 1298
			break;
		}
1299
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
A
Al Viro 已提交
1300 1301
			/* rare case of legitimate dget_parent()... */
			nd->path.dentry = dget_parent(nd->path.dentry);
L
Linus Torvalds 已提交
1302 1303 1304
			dput(old);
			break;
		}
A
Al Viro 已提交
1305
		if (!follow_up(&nd->path))
L
Linus Torvalds 已提交
1306 1307
			break;
	}
A
Al Viro 已提交
1308
	follow_mount(&nd->path);
N
Nick Piggin 已提交
1309
	nd->inode = nd->path.dentry->d_inode;
L
Linus Torvalds 已提交
1310 1311
}

1312
/*
M
Miklos Szeredi 已提交
1313 1314 1315 1316 1317
 * This looks up the name in dcache, possibly revalidates the old dentry and
 * allocates a new one if not found or not valid.  In the need_lookup argument
 * returns whether i_op->lookup is necessary.
 *
 * dir->d_inode->i_mutex must be held
1318
 */
M
Miklos Szeredi 已提交
1319
static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
1320
				    unsigned int flags, bool *need_lookup)
1321 1322
{
	struct dentry *dentry;
M
Miklos Szeredi 已提交
1323
	int error;
1324

M
Miklos Szeredi 已提交
1325 1326 1327
	*need_lookup = false;
	dentry = d_lookup(dir, name);
	if (dentry) {
J
Jeff Layton 已提交
1328
		if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1329
			error = d_revalidate(dentry, flags);
M
Miklos Szeredi 已提交
1330 1331 1332 1333
			if (unlikely(error <= 0)) {
				if (error < 0) {
					dput(dentry);
					return ERR_PTR(error);
1334 1335
				} else {
					d_invalidate(dentry);
M
Miklos Szeredi 已提交
1336 1337 1338 1339 1340 1341
					dput(dentry);
					dentry = NULL;
				}
			}
		}
	}
1342

M
Miklos Szeredi 已提交
1343 1344 1345 1346
	if (!dentry) {
		dentry = d_alloc(dir, name);
		if (unlikely(!dentry))
			return ERR_PTR(-ENOMEM);
1347

M
Miklos Szeredi 已提交
1348
		*need_lookup = true;
1349 1350 1351 1352
	}
	return dentry;
}

1353
/*
1354 1355
 * Call i_op->lookup on the dentry.  The dentry must be negative and
 * unhashed.
M
Miklos Szeredi 已提交
1356 1357
 *
 * dir->d_inode->i_mutex must be held
1358
 */
M
Miklos Szeredi 已提交
1359
static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
1360
				  unsigned int flags)
1361 1362 1363 1364
{
	struct dentry *old;

	/* Don't create child dentry for a dead directory. */
M
Miklos Szeredi 已提交
1365
	if (unlikely(IS_DEADDIR(dir))) {
1366
		dput(dentry);
1367
		return ERR_PTR(-ENOENT);
1368
	}
1369

1370
	old = dir->i_op->lookup(dir, dentry, flags);
1371 1372 1373 1374 1375 1376 1377
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
	return dentry;
}

1378
static struct dentry *__lookup_hash(struct qstr *name,
1379
		struct dentry *base, unsigned int flags)
1380
{
M
Miklos Szeredi 已提交
1381
	bool need_lookup;
1382 1383
	struct dentry *dentry;

1384
	dentry = lookup_dcache(name, base, flags, &need_lookup);
M
Miklos Szeredi 已提交
1385 1386
	if (!need_lookup)
		return dentry;
1387

1388
	return lookup_real(base->d_inode, dentry, flags);
1389 1390
}

L
Linus Torvalds 已提交
1391 1392 1393 1394 1395
/*
 *  It's more convoluted than I'd like it to be, but... it's still fairly
 *  small and for now I'd prefer to have fast path as straight as possible.
 *  It _is_ time-critical.
 */
A
Al Viro 已提交
1396
static int lookup_fast(struct nameidata *nd,
M
Miklos Szeredi 已提交
1397
		       struct path *path, struct inode **inode)
L
Linus Torvalds 已提交
1398
{
1399
	struct vfsmount *mnt = nd->path.mnt;
N
Nick Piggin 已提交
1400
	struct dentry *dentry, *parent = nd->path.dentry;
A
Al Viro 已提交
1401 1402
	int need_reval = 1;
	int status = 1;
1403 1404
	int err;

1405 1406 1407 1408 1409
	/*
	 * Rename seqlock is not required here because in the off chance
	 * of a false negative due to a concurrent rename, we're going to
	 * do the non-racy lookup, below.
	 */
N
Nick Piggin 已提交
1410 1411
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
1412
		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
A
Al Viro 已提交
1413 1414 1415
		if (!dentry)
			goto unlazy;

1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430
		/*
		 * This sequence count validates that the inode matches
		 * the dentry name information from lookup.
		 */
		*inode = dentry->d_inode;
		if (read_seqcount_retry(&dentry->d_seq, seq))
			return -ECHILD;

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 *
		 * The memory barrier in read_seqcount_begin of child is
		 *  enough, we can use __read_seqcount_retry here.
		 */
N
Nick Piggin 已提交
1431 1432 1433
		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
			return -ECHILD;
		nd->seq = seq;
A
Al Viro 已提交
1434

1435
		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1436
			status = d_revalidate(dentry, nd->flags);
A
Al Viro 已提交
1437 1438 1439 1440 1441
			if (unlikely(status <= 0)) {
				if (status != -ECHILD)
					need_reval = 0;
				goto unlazy;
			}
1442
		}
N
Nick Piggin 已提交
1443 1444
		path->mnt = mnt;
		path->dentry = dentry;
1445 1446
		if (likely(__follow_mount_rcu(nd, path, inode)))
			return 0;
A
Al Viro 已提交
1447
unlazy:
A
Al Viro 已提交
1448 1449
		if (unlazy_walk(nd, dentry))
			return -ECHILD;
A
Al Viro 已提交
1450
	} else {
A
Al Viro 已提交
1451
		dentry = __d_lookup(parent, &nd->last);
1452
	}
A
Al Viro 已提交
1453

1454 1455 1456
	if (unlikely(!dentry))
		goto need_lookup;

A
Al Viro 已提交
1457
	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1458
		status = d_revalidate(dentry, nd->flags);
A
Al Viro 已提交
1459 1460 1461 1462 1463
	if (unlikely(status <= 0)) {
		if (status < 0) {
			dput(dentry);
			return status;
		}
1464 1465 1466
		d_invalidate(dentry);
		dput(dentry);
		goto need_lookup;
1467
	}
M
Miklos Szeredi 已提交
1468

1469 1470 1471
	path->mnt = mnt;
	path->dentry = dentry;
	err = follow_managed(path, nd->flags);
1472 1473
	if (unlikely(err < 0)) {
		path_put_conditional(path, nd);
1474
		return err;
1475
	}
1476 1477
	if (err)
		nd->flags |= LOOKUP_JUMPED;
1478
	*inode = path->dentry->d_inode;
L
Linus Torvalds 已提交
1479
	return 0;
1480 1481

need_lookup:
M
Miklos Szeredi 已提交
1482 1483 1484 1485
	return 1;
}

/* Fast lookup failed, do it the slow way */
A
Al Viro 已提交
1486
static int lookup_slow(struct nameidata *nd, struct path *path)
M
Miklos Szeredi 已提交
1487 1488 1489 1490 1491
{
	struct dentry *dentry, *parent;
	int err;

	parent = nd->path.dentry;
1492 1493 1494
	BUG_ON(nd->inode != parent->d_inode);

	mutex_lock(&parent->d_inode->i_mutex);
A
Al Viro 已提交
1495
	dentry = __lookup_hash(&nd->last, parent, nd->flags);
1496 1497 1498
	mutex_unlock(&parent->d_inode->i_mutex);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);
M
Miklos Szeredi 已提交
1499 1500 1501 1502 1503 1504 1505 1506 1507 1508
	path->mnt = nd->path.mnt;
	path->dentry = dentry;
	err = follow_managed(path, nd->flags);
	if (unlikely(err < 0)) {
		path_put_conditional(path, nd);
		return err;
	}
	if (err)
		nd->flags |= LOOKUP_JUMPED;
	return 0;
L
Linus Torvalds 已提交
1509 1510
}

1511 1512 1513
static inline int may_lookup(struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU) {
1514
		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1515 1516
		if (err != -ECHILD)
			return err;
A
Al Viro 已提交
1517
		if (unlazy_walk(nd, NULL))
1518 1519
			return -ECHILD;
	}
1520
	return inode_permission(nd->inode, MAY_EXEC);
1521 1522
}

1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534
/*
 * Handle a "." or ".." component.  Only LAST_DOTDOT needs any work;
 * every other type returns 0 immediately.  Returns -ECHILD if the
 * RCU-mode ".." step failed.
 */
static inline int handle_dots(struct nameidata *nd, int type)
{
	if (type != LAST_DOTDOT)
		return 0;

	if (!(nd->flags & LOOKUP_RCU)) {
		follow_dotdot(nd);
		return 0;
	}
	return follow_dotdot_rcu(nd) ? -ECHILD : 0;
}

1535 1536 1537 1538 1539 1540
static void terminate_walk(struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU)) {
		path_put(&nd->path);
	} else {
		nd->flags &= ~LOOKUP_RCU;
1541 1542
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
A
Al Viro 已提交
1543
		rcu_read_unlock();
1544 1545 1546
	}
}

1547 1548 1549 1550 1551 1552
/*
 * Do we need to follow links?  We _really_ want this check to work
 * without looking at inode->i_op, so the answer comes from the dentry
 * alone: non-symlinks (the common case) always yield 0, symlinks yield
 * @follow.
 */
static inline int should_follow_link(struct dentry *dentry, int follow)
{
	if (unlikely(d_is_symlink(dentry)))
		return follow;
	return 0;
}

1558
static inline int walk_component(struct nameidata *nd, struct path *path,
1559
		int follow)
1560 1561 1562 1563 1564 1565 1566 1567
{
	struct inode *inode;
	int err;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
1568 1569
	if (unlikely(nd->last_type != LAST_NORM))
		return handle_dots(nd, nd->last_type);
A
Al Viro 已提交
1570
	err = lookup_fast(nd, path, &inode);
1571
	if (unlikely(err)) {
M
Miklos Szeredi 已提交
1572 1573 1574
		if (err < 0)
			goto out_err;

A
Al Viro 已提交
1575
		err = lookup_slow(nd, path);
M
Miklos Szeredi 已提交
1576 1577 1578 1579
		if (err < 0)
			goto out_err;

		inode = path->dentry->d_inode;
1580
	}
M
Miklos Szeredi 已提交
1581
	err = -ENOENT;
1582
	if (!inode || d_is_negative(path->dentry))
M
Miklos Szeredi 已提交
1583 1584
		goto out_path_put;

1585
	if (should_follow_link(path->dentry, follow)) {
A
Al Viro 已提交
1586 1587
		if (nd->flags & LOOKUP_RCU) {
			if (unlikely(unlazy_walk(nd, path->dentry))) {
M
Miklos Szeredi 已提交
1588 1589
				err = -ECHILD;
				goto out_err;
A
Al Viro 已提交
1590 1591
			}
		}
1592 1593 1594 1595 1596 1597
		BUG_ON(inode != path->dentry->d_inode);
		return 1;
	}
	path_to_nameidata(path, nd);
	nd->inode = inode;
	return 0;
M
Miklos Szeredi 已提交
1598 1599 1600 1601 1602 1603

out_path_put:
	path_to_nameidata(path, nd);
out_err:
	terminate_walk(nd);
	return err;
1604 1605
}

1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621
/*
 * This limits recursive symlink follows to 8, while
 * limiting consecutive symlinks to 40.
 *
 * Without that kind of total limit, nasty chains of consecutive
 * symlinks can cause almost arbitrarily long lookups.
 */
static inline int nested_symlink(struct path *path, struct nameidata *nd)
{
	int res;

	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
		path_put_conditional(path, nd);
		path_put(&nd->path);
		return -ELOOP;
	}
1622
	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
1623 1624 1625 1626 1627 1628 1629

	nd->depth++;
	current->link_count++;

	do {
		struct path link = *path;
		void *cookie;
1630 1631

		res = follow_link(&link, nd, &cookie);
1632 1633
		if (res)
			break;
1634
		res = walk_component(nd, path, LOOKUP_FOLLOW);
1635
		put_link(nd, &link, cookie);
1636 1637 1638 1639 1640 1641 1642
	} while (res > 0);

	current->link_count--;
	nd->depth--;
	return res;
}

1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a
 *   page-crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

1662
#include <asm/word-at-a-time.h>
1663

1664
#ifdef CONFIG_64BIT
1665 1666 1667

/* 64-bit case: reduce the word-sized running hash via hash_64(hash, 32). */
static inline unsigned int fold_hash(unsigned long hash)
{
	unsigned int folded = hash_64(hash, 32);

	return folded;
}

#else	/* 32-bit case */

/* Identity: on this branch the hash is passed through unchanged. */
#define fold_hash(x) (x)

#endif

/*
 * Hash @len bytes at @name one unsigned long at a time.  Whole words
 * are mixed in as (hash + word) * 9; a trailing partial word is masked
 * down to its first @len remaining bytes and added without the multiply.
 */
unsigned int full_name_hash(const unsigned char *name, unsigned int len)
{
	unsigned long acc = 0;

	for (;;) {
		unsigned long word = load_unaligned_zeropad(name);

		if (len < sizeof(unsigned long)) {
			/* Tail (possibly empty): keep only the bytes in range. */
			acc += bytemask_from_count(len) & word;
			break;
		}

		acc += word;
		acc *= 9;
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
		if (!len)
			break;
	}
	return fold_hash(acc);
}
EXPORT_SYMBOL(full_name_hash);

/*
 * Calculate the length and hash of the path component, and
1702
 * return the "hash_len" as the result.
1703
 */
1704
static inline u64 hash_name(const char *name)
1705
{
1706 1707
	unsigned long a, b, adata, bdata, mask, hash, len;
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
1708 1709 1710 1711 1712 1713

	hash = a = 0;
	len = -sizeof(unsigned long);
	do {
		hash = (hash + a) * 9;
		len += sizeof(unsigned long);
1714
		a = load_unaligned_zeropad(name+len);
1715 1716 1717 1718 1719 1720 1721 1722 1723
		b = a ^ REPEAT_BYTE('/');
	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

	adata = prep_zero_mask(a, adata, &constants);
	bdata = prep_zero_mask(b, bdata, &constants);

	mask = create_zero_mask(adata | bdata);

	hash += a & zero_bytemask(mask);
1724
	len += find_zero(mask);
1725
	return hashlen_create(fold_hash(hash), len);
1726 1727 1728 1729
}

#else

L
Linus Torvalds 已提交
1730 1731 1732 1733 1734 1735 1736
/* Bytewise fallback: fold each of the @len bytes at @name into the hash. */
unsigned int full_name_hash(const unsigned char *name, unsigned int len)
{
	unsigned long hash = init_name_hash();
	unsigned int i;

	for (i = 0; i < len; i++)
		hash = partial_name_hash(name[i], hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
L
Linus Torvalds 已提交
1738

1739 1740 1741 1742
/*
 * We know there's a real path component here of at least
 * one character.
 */
1743
static inline u64 hash_name(const char *name)
1744 1745 1746 1747 1748 1749 1750 1751 1752 1753
{
	unsigned long hash = init_name_hash();
	unsigned long len = 0, c;

	c = (unsigned char)*name;
	do {
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
	} while (c && c != '/');
1754
	return hashlen_create(end_name_hash(hash), len);
1755 1756
}

1757 1758
#endif

L
Linus Torvalds 已提交
1759 1760
/*
 * Name resolution.
1761 1762
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
L
Linus Torvalds 已提交
1763
 *
1764 1765
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
L
Linus Torvalds 已提交
1766
 */
1767
static int link_path_walk(const char *name, struct nameidata *nd)
L
Linus Torvalds 已提交
1768 1769 1770 1771 1772 1773 1774
{
	struct path next;
	int err;
	
	while (*name=='/')
		name++;
	if (!*name)
1775
		return 0;
L
Linus Torvalds 已提交
1776 1777 1778

	/* At this point we know we have a real path component. */
	for(;;) {
1779
		u64 hash_len;
A
Al Viro 已提交
1780
		int type;
L
Linus Torvalds 已提交
1781

1782
		err = may_lookup(nd);
L
Linus Torvalds 已提交
1783 1784 1785
 		if (err)
			break;

1786
		hash_len = hash_name(name);
L
Linus Torvalds 已提交
1787

A
Al Viro 已提交
1788
		type = LAST_NORM;
1789
		if (name[0] == '.') switch (hashlen_len(hash_len)) {
A
Al Viro 已提交
1790
			case 2:
1791
				if (name[1] == '.') {
A
Al Viro 已提交
1792
					type = LAST_DOTDOT;
A
Al Viro 已提交
1793 1794
					nd->flags |= LOOKUP_JUMPED;
				}
A
Al Viro 已提交
1795 1796 1797 1798
				break;
			case 1:
				type = LAST_DOT;
		}
1799 1800
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
A
Al Viro 已提交
1801
			nd->flags &= ~LOOKUP_JUMPED;
1802
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1803
				struct qstr this = { { .hash_len = hash_len }, .name = name };
1804
				err = parent->d_op->d_hash(parent, &this);
1805 1806
				if (err < 0)
					break;
1807 1808
				hash_len = this.hash_len;
				name = this.name;
1809 1810
			}
		}
A
Al Viro 已提交
1811

1812 1813
		nd->last.hash_len = hash_len;
		nd->last.name = name;
1814 1815
		nd->last_type = type;

1816 1817
		name += hashlen_len(hash_len);
		if (!*name)
1818
			return 0;
1819 1820 1821 1822 1823
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
1824 1825 1826
			name++;
		} while (unlikely(*name == '/'));
		if (!*name)
1827 1828
			return 0;

1829
		err = walk_component(nd, &next, LOOKUP_FOLLOW);
1830 1831
		if (err < 0)
			return err;
L
Linus Torvalds 已提交
1832

1833
		if (err) {
1834
			err = nested_symlink(&next, nd);
L
Linus Torvalds 已提交
1835
			if (err)
1836
				return err;
N
Nick Piggin 已提交
1837
		}
M
Miklos Szeredi 已提交
1838
		if (!d_can_lookup(nd->path.dentry)) {
1839 1840 1841
			err = -ENOTDIR; 
			break;
		}
L
Linus Torvalds 已提交
1842
	}
1843
	terminate_walk(nd);
L
Linus Torvalds 已提交
1844 1845 1846
	return err;
}

A
Al Viro 已提交
1847 1848
static int path_init(int dfd, const char *name, unsigned int flags,
		     struct nameidata *nd, struct file **fp)
N
Nick Piggin 已提交
1849 1850 1851 1852
{
	int retval = 0;

	nd->last_type = LAST_ROOT; /* if there are only slashes... */
A
Al Viro 已提交
1853
	nd->flags = flags | LOOKUP_JUMPED;
N
Nick Piggin 已提交
1854
	nd->depth = 0;
1855
	if (flags & LOOKUP_ROOT) {
1856 1857
		struct dentry *root = nd->root.dentry;
		struct inode *inode = root->d_inode;
A
Al Viro 已提交
1858
		if (*name) {
M
Miklos Szeredi 已提交
1859
			if (!d_can_lookup(root))
A
Al Viro 已提交
1860 1861 1862 1863 1864
				return -ENOTDIR;
			retval = inode_permission(inode, MAY_EXEC);
			if (retval)
				return retval;
		}
1865 1866 1867
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
A
Al Viro 已提交
1868
			rcu_read_lock();
1869
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
A
Al Viro 已提交
1870
			nd->m_seq = read_seqbegin(&mount_lock);
1871 1872 1873 1874 1875 1876
		} else {
			path_get(&nd->path);
		}
		return 0;
	}

N
Nick Piggin 已提交
1877 1878
	nd->root.mnt = NULL;

A
Al Viro 已提交
1879
	nd->m_seq = read_seqbegin(&mount_lock);
N
Nick Piggin 已提交
1880
	if (*name=='/') {
A
Al Viro 已提交
1881
		if (flags & LOOKUP_RCU) {
A
Al Viro 已提交
1882
			rcu_read_lock();
1883
			nd->seq = set_root_rcu(nd);
A
Al Viro 已提交
1884 1885 1886 1887 1888
		} else {
			set_root(nd);
			path_get(&nd->root);
		}
		nd->path = nd->root;
N
Nick Piggin 已提交
1889
	} else if (dfd == AT_FDCWD) {
A
Al Viro 已提交
1890 1891 1892
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;
N
Nick Piggin 已提交
1893

A
Al Viro 已提交
1894
			rcu_read_lock();
N
Nick Piggin 已提交
1895

A
Al Viro 已提交
1896 1897 1898 1899 1900 1901 1902 1903
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
		}
N
Nick Piggin 已提交
1904
	} else {
1905
		/* Caller must check execute permissions on the starting path component */
1906
		struct fd f = fdget_raw(dfd);
N
Nick Piggin 已提交
1907 1908
		struct dentry *dentry;

1909 1910
		if (!f.file)
			return -EBADF;
N
Nick Piggin 已提交
1911

1912
		dentry = f.file->f_path.dentry;
N
Nick Piggin 已提交
1913

A
Al Viro 已提交
1914
		if (*name) {
M
Miklos Szeredi 已提交
1915
			if (!d_can_lookup(dentry)) {
1916 1917 1918
				fdput(f);
				return -ENOTDIR;
			}
A
Al Viro 已提交
1919
		}
N
Nick Piggin 已提交
1920

1921
		nd->path = f.file->f_path;
A
Al Viro 已提交
1922
		if (flags & LOOKUP_RCU) {
1923
			if (f.flags & FDPUT_FPUT)
1924
				*fp = f.file;
A
Al Viro 已提交
1925
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
A
Al Viro 已提交
1926
			rcu_read_lock();
A
Al Viro 已提交
1927
		} else {
1928 1929
			path_get(&nd->path);
			fdput(f);
A
Al Viro 已提交
1930
		}
N
Nick Piggin 已提交
1931 1932 1933
	}

	nd->inode = nd->path.dentry->d_inode;
1934 1935 1936 1937 1938 1939 1940 1941
	if (!(flags & LOOKUP_RCU))
		return 0;
	if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
		return 0;
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
	rcu_read_unlock();
	return -ECHILD;
1942 1943
}

1944 1945 1946 1947 1948 1949
static inline int lookup_last(struct nameidata *nd, struct path *path)
{
	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

	nd->flags &= ~LOOKUP_PARENT;
1950
	return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW);
1951 1952
}

1953
/* Returns 0 and nd will be valid on success; returns an error otherwise. */
A
Al Viro 已提交
1954
static int path_lookupat(int dfd, const char *name,
1955 1956
				unsigned int flags, struct nameidata *nd)
{
A
Al Viro 已提交
1957
	struct file *base = NULL;
1958 1959
	struct path path;
	int err;
N
Nick Piggin 已提交
1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974

	/*
	 * Path walking is largely split up into 2 different synchronisation
	 * schemes, rcu-walk and ref-walk (explained in
	 * Documentation/filesystems/path-lookup.txt). These share much of the
	 * path walk code, but some things particularly setup, cleanup, and
	 * following mounts are sufficiently divergent that functions are
	 * duplicated. Typically there is a function foo(), and its RCU
	 * analogue, foo_rcu().
	 *
	 * -ECHILD is the error number of choice (just to avoid clashes) that
	 * is returned if some aspect of an rcu-walk fails. Such an error must
	 * be handled by restarting a traditional ref-walk (which will always
	 * be able to complete).
	 */
1975
	err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
A
Al Viro 已提交
1976

1977
	if (unlikely(err))
1978
		goto out;
A
Al Viro 已提交
1979 1980

	current->total_link_count = 0;
1981 1982 1983 1984 1985 1986 1987
	err = link_path_walk(name, nd);

	if (!err && !(flags & LOOKUP_PARENT)) {
		err = lookup_last(nd, &path);
		while (err > 0) {
			void *cookie;
			struct path link = path;
K
Kees Cook 已提交
1988 1989 1990
			err = may_follow_link(&link, nd);
			if (unlikely(err))
				break;
1991
			nd->flags |= LOOKUP_PARENT;
1992
			err = follow_link(&link, nd, &cookie);
1993 1994 1995
			if (err)
				break;
			err = lookup_last(nd, &path);
1996
			put_link(nd, &link, cookie);
1997 1998
		}
	}
A
Al Viro 已提交
1999

2000 2001
	if (!err)
		err = complete_walk(nd);
2002 2003

	if (!err && nd->flags & LOOKUP_DIRECTORY) {
M
Miklos Szeredi 已提交
2004
		if (!d_can_lookup(nd->path.dentry)) {
2005
			path_put(&nd->path);
A
Al Viro 已提交
2006
			err = -ENOTDIR;
2007 2008
		}
	}
A
Al Viro 已提交
2009

2010
out:
A
Al Viro 已提交
2011 2012
	if (base)
		fput(base);
A
Al Viro 已提交
2013

2014
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
A
Al Viro 已提交
2015 2016 2017
		path_put(&nd->root);
		nd->root.mnt = NULL;
	}
2018
	return err;
A
Al Viro 已提交
2019
}
N
Nick Piggin 已提交
2020

2021
static int filename_lookup(int dfd, struct filename *name,
A
Al Viro 已提交
2022 2023
				unsigned int flags, struct nameidata *nd)
{
2024
	int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd);
A
Al Viro 已提交
2025
	if (unlikely(retval == -ECHILD))
2026
		retval = path_lookupat(dfd, name->name, flags, nd);
A
Al Viro 已提交
2027
	if (unlikely(retval == -ESTALE))
2028 2029
		retval = path_lookupat(dfd, name->name,
						flags | LOOKUP_REVAL, nd);
N
Nick Piggin 已提交
2030

2031
	if (likely(!retval))
2032
		audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
2033
	return retval;
L
Linus Torvalds 已提交
2034 2035
}

2036 2037 2038 2039 2040 2041 2042 2043
static int do_path_lookup(int dfd, const char *name,
				unsigned int flags, struct nameidata *nd)
{
	struct filename filename = { .name = name };

	return filename_lookup(dfd, &filename, flags, nd);
}

A
Al Viro 已提交
2044 2045
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
2046
{
A
Al Viro 已提交
2047 2048 2049 2050 2051 2052 2053 2054 2055 2056
	struct nameidata nd;
	struct dentry *d;
	int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd);
	if (err)
		return ERR_PTR(err);
	if (nd.last_type != LAST_NORM) {
		path_put(&nd.path);
		return ERR_PTR(-EINVAL);
	}
	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2057
	d = __lookup_hash(&nd.last, nd.path.dentry, 0);
A
Al Viro 已提交
2058 2059 2060 2061 2062 2063 2064
	if (IS_ERR(d)) {
		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
		path_put(&nd.path);
		return d;
	}
	*path = nd.path;
	return d;
2065 2066
}

A
Al Viro 已提交
2067 2068 2069 2070 2071 2072 2073 2074
/*
 * Resolve the kernel-supplied pathname @name (dfd is AT_FDCWD) using lookup
 * @flags.  On success the resulting path is copied to @path and 0 is
 * returned; otherwise the error from do_path_lookup() is returned and
 * @path is not written.
 */
int kern_path(const char *name, unsigned int flags, struct path *path)
{
	struct nameidata nd;
	int err;

	err = do_path_lookup(AT_FDCWD, name, flags, &nd);
	if (err)
		return err;

	*path = nd.path;
	return 0;
}
2075
EXPORT_SYMBOL(kern_path);
A
Al Viro 已提交
2076

2077 2078 2079 2080 2081 2082
/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
2083
 * @path: pointer to struct path to fill
2084 2085 2086
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
		    const char *name, unsigned int flags,
2087
		    struct path *path)
2088
{
2089 2090 2091 2092 2093
	struct nameidata nd;
	int err;
	nd.root.dentry = dentry;
	nd.root.mnt = mnt;
	BUG_ON(flags & LOOKUP_PARENT);
2094
	/* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
2095 2096 2097 2098
	err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd);
	if (!err)
		*path = nd.path;
	return err;
2099
}
2100
EXPORT_SYMBOL(vfs_path_lookup);
2101

2102 2103 2104 2105 2106
/*
 * Restricted form of lookup. Doesn't follow links, single-component only,
 * needs parent already locked. Doesn't follow mounts.
 * SMP-safe.
 */
2107
static struct dentry *lookup_hash(struct nameidata *nd)
2108
{
2109
	return __lookup_hash(&nd->last, nd->path.dentry, nd->flags);
L
Linus Torvalds 已提交
2110 2111
}

2112
/**
2113
 * lookup_one_len - filesystem helper to lookup single pathname component
2114 2115 2116 2117
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
2118 2119
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.  Also note that by using this function the
2120 2121 2122
 * nameidata argument is passed to the filesystem methods and a filesystem
 * using this helper needs to be prepared for that.
 */
2123 2124 2125
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
	struct qstr this;
A
Al Viro 已提交
2126
	unsigned int c;
2127
	int err;
2128

2129 2130
	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));

A
Al Viro 已提交
2131 2132
	this.name = name;
	this.len = len;
L
Linus Torvalds 已提交
2133
	this.hash = full_name_hash(name, len);
A
Al Viro 已提交
2134 2135 2136
	if (!len)
		return ERR_PTR(-EACCES);

A
Al Viro 已提交
2137 2138 2139 2140 2141
	if (unlikely(name[0] == '.')) {
		if (len < 2 || (len == 2 && name[1] == '.'))
			return ERR_PTR(-EACCES);
	}

A
Al Viro 已提交
2142 2143 2144 2145 2146
	while (len--) {
		c = *(const unsigned char *)name++;
		if (c == '/' || c == '\0')
			return ERR_PTR(-EACCES);
	}
2147 2148 2149 2150 2151
	/*
	 * See if the low-level filesystem might want
	 * to use its own hash..
	 */
	if (base->d_flags & DCACHE_OP_HASH) {
2152
		int err = base->d_op->d_hash(base, &this);
2153 2154 2155
		if (err < 0)
			return ERR_PTR(err);
	}
2156

2157 2158 2159 2160
	err = inode_permission(base->d_inode, MAY_EXEC);
	if (err)
		return ERR_PTR(err);

2161
	return __lookup_hash(&this, base, 0);
2162
}
2163
EXPORT_SYMBOL(lookup_one_len);
2164

2165 2166
/*
 * Fetch a pathname from userspace with getname_flags() (which also receives
 * @flags and @empty) and resolve it relative to @dfd.
 *
 * LOOKUP_PARENT must not be set in @flags (BUG_ON).  Returns 0 and stores
 * the result in @path on success; otherwise returns the error from
 * getname_flags() or filename_lookup() and leaves @path unwritten.
 */
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
		 struct path *path, int *empty)
{
	struct nameidata nd;
	struct filename *kname;
	int err;

	kname = getname_flags(name, flags, empty);
	if (IS_ERR(kname))
		return PTR_ERR(kname);

	BUG_ON(flags & LOOKUP_PARENT);

	err = filename_lookup(dfd, kname, flags, &nd);
	putname(kname);
	if (err)
		return err;

	*path = nd.path;
	return 0;
}

2183 2184 2185
/*
 * user_path_at_empty() for callers that do not need the @empty result:
 * NULL is passed in its place.
 */
int user_path_at(int dfd, const char __user *name, unsigned flags,
		 struct path *path)
{
	int *no_empty = NULL;

	return user_path_at_empty(dfd, name, flags, path, no_empty);
}
2188
EXPORT_SYMBOL(user_path_at);
2189

2190 2191 2192 2193 2194 2195
/*
 * NB: most callers don't do anything directly with the reference to the
 *     to struct filename, but the nd->last pointer points into the name string
 *     allocated by getname. So we must hold the reference to it until all
 *     path-walking is complete.
 */
2196
static struct filename *
2197 2198
user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
		 unsigned int flags)
2199
{
2200
	struct filename *s = getname(path);
2201 2202
	int error;

2203 2204 2205
	/* only LOOKUP_REVAL is allowed in extra flags */
	flags &= LOOKUP_REVAL;

2206
	if (IS_ERR(s))
2207
		return s;
2208

2209
	error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd);
2210
	if (error) {
2211
		putname(s);
2212 2213
		return ERR_PTR(error);
	}
2214

2215
	return s;
2216 2217
}

2218
/**
2219
 * mountpoint_last - look up last component for umount
2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245
 * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
 * @path: pointer to container for result
 *
 * This is a special lookup_last function just for umount. In this case, we
 * need to resolve the path without doing any revalidation.
 *
 * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
 * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
 * in almost all cases, this lookup will be served out of the dcache. The only
 * cases where it won't are if nd->last refers to a symlink or the path is
 * bogus and it doesn't exist.
 *
 * Returns:
 * -error: if there was an error during lookup. This includes -ENOENT if the
 *         lookup found a negative dentry. The nd->path reference will also be
 *         put in this case.
 *
 * 0:      if we successfully resolved nd->path and found it to not to be a
 *         symlink that needs to be followed. "path" will also be populated.
 *         The nd->path reference will also be put.
 *
 * 1:      if we successfully resolved nd->last and found it to be a symlink
 *         that needs to be followed. "path" will be populated with the path
 *         to the link, and nd->path will *not* be put.
 */
static int
2246
mountpoint_last(struct nameidata *nd, struct path *path)
2247 2248 2249 2250 2251
{
	int error = 0;
	struct dentry *dentry;
	struct dentry *dir = nd->path.dentry;

2252 2253 2254 2255 2256 2257
	/* If we're in rcuwalk, drop out of it to handle last component */
	if (nd->flags & LOOKUP_RCU) {
		if (unlazy_walk(nd, NULL)) {
			error = -ECHILD;
			goto out;
		}
2258 2259 2260 2261 2262 2263
	}

	nd->flags &= ~LOOKUP_PARENT;

	if (unlikely(nd->last_type != LAST_NORM)) {
		error = handle_dots(nd, nd->last_type);
2264 2265 2266 2267
		if (error)
			goto out;
		dentry = dget(nd->path.dentry);
		goto done;
2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280
	}

	mutex_lock(&dir->d_inode->i_mutex);
	dentry = d_lookup(dir, &nd->last);
	if (!dentry) {
		/*
		 * No cached dentry. Mounted dentries are pinned in the cache,
		 * so that means that this dentry is probably a symlink or the
		 * path doesn't actually point to a mounted dentry.
		 */
		dentry = d_alloc(dir, &nd->last);
		if (!dentry) {
			error = -ENOMEM;
2281
			mutex_unlock(&dir->d_inode->i_mutex);
2282
			goto out;
2283
		}
2284 2285
		dentry = lookup_real(dir->d_inode, dentry, nd->flags);
		error = PTR_ERR(dentry);
2286 2287
		if (IS_ERR(dentry)) {
			mutex_unlock(&dir->d_inode->i_mutex);
2288
			goto out;
2289
		}
2290 2291 2292
	}
	mutex_unlock(&dir->d_inode->i_mutex);

2293
done:
2294
	if (!dentry->d_inode || d_is_negative(dentry)) {
2295 2296 2297
		error = -ENOENT;
		dput(dentry);
		goto out;
2298
	}
2299
	path->dentry = dentry;
2300
	path->mnt = nd->path.mnt;
2301
	if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW))
2302
		return 1;
2303
	mntget(path->mnt);
2304 2305 2306
	follow_mount(path);
	error = 0;
out:
2307 2308 2309 2310 2311
	terminate_walk(nd);
	return error;
}

/**
2312
 * path_mountpoint - look up a path to be umounted
2313 2314
 * @dfd:	directory file descriptor to start walk from
 * @name:	full pathname to walk
2315
 * @path:	pointer to container for result
2316 2317 2318
 * @flags:	lookup flags
 *
 * Look up the given name, but don't attempt to revalidate the last component.
2319
 * Returns 0 and "path" will be valid on success; Returns error otherwise.
2320 2321
 */
static int
2322
path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags)
2323 2324 2325 2326 2327 2328 2329
{
	struct file *base = NULL;
	struct nameidata nd;
	int err;

	err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base);
	if (unlikely(err))
2330
		goto out;
2331 2332 2333 2334 2335 2336

	current->total_link_count = 0;
	err = link_path_walk(name, &nd);
	if (err)
		goto out;

2337
	err = mountpoint_last(&nd, path);
2338 2339 2340 2341 2342 2343 2344 2345 2346 2347
	while (err > 0) {
		void *cookie;
		struct path link = *path;
		err = may_follow_link(&link, &nd);
		if (unlikely(err))
			break;
		nd.flags |= LOOKUP_PARENT;
		err = follow_link(&link, &nd, &cookie);
		if (err)
			break;
2348
		err = mountpoint_last(&nd, path);
2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360
		put_link(&nd, &link, cookie);
	}
out:
	if (base)
		fput(base);

	if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
		path_put(&nd.root);

	return err;
}

A
Al Viro 已提交
2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374
/*
 * Run path_mountpoint() for @s, first with LOOKUP_RCU; -ECHILD triggers a
 * retry with the caller's flags only and -ESTALE a retry with LOOKUP_REVAL.
 * On success the resulting dentry is passed to audit_inode().
 */
static int
filename_mountpoint(int dfd, struct filename *s, struct path *path,
			unsigned int flags)
{
	const char *pathname = s->name;
	int err;

	err = path_mountpoint(dfd, pathname, path, flags | LOOKUP_RCU);
	if (unlikely(err == -ECHILD))
		err = path_mountpoint(dfd, pathname, path, flags);
	if (unlikely(err == -ESTALE))
		err = path_mountpoint(dfd, pathname, path,
				      flags | LOOKUP_REVAL);
	if (unlikely(err))
		return err;

	audit_inode(s, path->dentry, 0);
	return 0;
}

2375
/**
2376
 * user_path_mountpoint_at - lookup a path from userland in order to umount it
2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389
 * @dfd:	directory file descriptor
 * @name:	pathname from userland
 * @flags:	lookup flags
 * @path:	pointer to container to hold result
 *
 * A umount is a special case for path walking. We're not actually interested
 * in the inode in this situation, and ESTALE errors can be a problem. We
 * simply want track down the dentry and vfsmount attached at the mountpoint
 * and avoid revalidating the last component.
 *
 * Returns 0 and populates "path" on success.
 */
int
2390
user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2391 2392 2393 2394 2395 2396
			struct path *path)
{
	struct filename *s = getname(name);
	int error;
	if (IS_ERR(s))
		return PTR_ERR(s);
A
Al Viro 已提交
2397
	error = filename_mountpoint(dfd, s, path, flags);
2398 2399 2400 2401
	putname(s);
	return error;
}

A
Al Viro 已提交
2402 2403 2404 2405 2406 2407 2408 2409 2410
int
kern_path_mountpoint(int dfd, const char *name, struct path *path,
			unsigned int flags)
{
	struct filename s = {.name = name};
	return filename_mountpoint(dfd, &s, path, flags);
}
EXPORT_SYMBOL(kern_path_mountpoint);

M
Miklos Szeredi 已提交
2411
int __check_sticky(struct inode *dir, struct inode *inode)
L
Linus Torvalds 已提交
2412
{
2413
	kuid_t fsuid = current_fsuid();
2414

2415
	if (uid_eq(inode->i_uid, fsuid))
L
Linus Torvalds 已提交
2416
		return 0;
2417
	if (uid_eq(dir->i_uid, fsuid))
L
Linus Torvalds 已提交
2418
		return 0;
2419
	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
L
Linus Torvalds 已提交
2420
}
M
Miklos Szeredi 已提交
2421
EXPORT_SYMBOL(__check_sticky);
L
Linus Torvalds 已提交
2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441

/*
 *	Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do antyhing with
 *     links pointing to it.
 *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 *  9. We can't remove a root or mountpoint.
 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 */
2442
static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
L
Linus Torvalds 已提交
2443
{
2444
	struct inode *inode = victim->d_inode;
L
Linus Torvalds 已提交
2445 2446
	int error;

2447
	if (d_is_negative(victim))
L
Linus Torvalds 已提交
2448
		return -ENOENT;
2449
	BUG_ON(!inode);
L
Linus Torvalds 已提交
2450 2451

	BUG_ON(victim->d_parent->d_inode != dir);
2452
	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
L
Linus Torvalds 已提交
2453

2454
	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2455 2456 2457 2458
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;
2459 2460 2461

	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
L
Linus Torvalds 已提交
2462 2463
		return -EPERM;
	if (isdir) {
M
Miklos Szeredi 已提交
2464
		if (!d_is_dir(victim))
L
Linus Torvalds 已提交
2465 2466 2467
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
M
Miklos Szeredi 已提交
2468
	} else if (d_is_dir(victim))
L
Linus Torvalds 已提交
2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}

/*	Check whether we can create an object with dentry child in directory
 *  dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
 *  3. We should have write and exec permissions on dir
 *  4. We can't do it if dir is immutable (done in permission())
 */
2485
static inline int may_create(struct inode *dir, struct dentry *child)
L
Linus Torvalds 已提交
2486
{
2487
	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
L
Linus Torvalds 已提交
2488 2489 2490 2491
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
2492
	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2493 2494 2495 2496 2497 2498 2499 2500 2501 2502
}

/*
 * p1 and p2 should be directories on the same fs.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p;

	if (p1 == p2) {
I
Ingo Molnar 已提交
2503
		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
L
Linus Torvalds 已提交
2504 2505 2506
		return NULL;
	}

2507
	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2508

2509 2510 2511 2512 2513
	p = d_ancestor(p2, p1);
	if (p) {
		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
		return p;
L
Linus Torvalds 已提交
2514 2515
	}

2516 2517 2518 2519 2520
	p = d_ancestor(p1, p2);
	if (p) {
		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
		return p;
L
Linus Torvalds 已提交
2521 2522
	}

I
Ingo Molnar 已提交
2523
	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
2524
	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2);
L
Linus Torvalds 已提交
2525 2526
	return NULL;
}
2527
EXPORT_SYMBOL(lock_rename);
L
Linus Torvalds 已提交
2528 2529 2530

void unlock_rename(struct dentry *p1, struct dentry *p2)
{
2531
	mutex_unlock(&p1->d_inode->i_mutex);
L
Linus Torvalds 已提交
2532
	if (p1 != p2) {
2533
		mutex_unlock(&p2->d_inode->i_mutex);
2534
		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2535 2536
	}
}
2537
EXPORT_SYMBOL(unlock_rename);
L
Linus Torvalds 已提交
2538

A
Al Viro 已提交
2539
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
A
Al Viro 已提交
2540
		bool want_excl)
L
Linus Torvalds 已提交
2541
{
2542
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
2543 2544 2545
	if (error)
		return error;

A
Al Viro 已提交
2546
	if (!dir->i_op->create)
L
Linus Torvalds 已提交
2547 2548 2549 2550 2551 2552
		return -EACCES;	/* shouldn't it be ENOSYS? */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
A
Al Viro 已提交
2553
	error = dir->i_op->create(dir, dentry, mode, want_excl);
2554
	if (!error)
2555
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
2556 2557
	return error;
}
2558
EXPORT_SYMBOL(vfs_create);
L
Linus Torvalds 已提交
2559

A
Al Viro 已提交
2560
static int may_open(struct path *path, int acc_mode, int flag)
L
Linus Torvalds 已提交
2561
{
2562
	struct dentry *dentry = path->dentry;
L
Linus Torvalds 已提交
2563 2564 2565
	struct inode *inode = dentry->d_inode;
	int error;

A
Al Viro 已提交
2566 2567 2568 2569
	/* O_PATH? */
	if (!acc_mode)
		return 0;

L
Linus Torvalds 已提交
2570 2571 2572
	if (!inode)
		return -ENOENT;

C
Christoph Hellwig 已提交
2573 2574
	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
L
Linus Torvalds 已提交
2575
		return -ELOOP;
C
Christoph Hellwig 已提交
2576 2577 2578 2579 2580 2581
	case S_IFDIR:
		if (acc_mode & MAY_WRITE)
			return -EISDIR;
		break;
	case S_IFBLK:
	case S_IFCHR:
2582
		if (path->mnt->mnt_flags & MNT_NODEV)
L
Linus Torvalds 已提交
2583
			return -EACCES;
C
Christoph Hellwig 已提交
2584 2585 2586
		/*FALLTHRU*/
	case S_IFIFO:
	case S_IFSOCK:
L
Linus Torvalds 已提交
2587
		flag &= ~O_TRUNC;
C
Christoph Hellwig 已提交
2588
		break;
2589
	}
2590

2591
	error = inode_permission(inode, acc_mode);
2592 2593
	if (error)
		return error;
M
Mimi Zohar 已提交
2594

L
Linus Torvalds 已提交
2595 2596 2597 2598
	/*
	 * An append-only file must be opened in append mode for writing.
	 */
	if (IS_APPEND(inode)) {
2599
		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2600
			return -EPERM;
L
Linus Torvalds 已提交
2601
		if (flag & O_TRUNC)
2602
			return -EPERM;
L
Linus Torvalds 已提交
2603 2604 2605
	}

	/* O_NOATIME can only be set by the owner or superuser */
2606
	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2607
		return -EPERM;
L
Linus Torvalds 已提交
2608

2609
	return 0;
2610
}
L
Linus Torvalds 已提交
2611

2612
static int handle_truncate(struct file *filp)
2613
{
2614
	struct path *path = &filp->f_path;
2615 2616 2617 2618 2619 2620 2621
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;
	/*
	 * Refuse to truncate files with mandatory locks held on them.
	 */
2622
	error = locks_verify_locked(filp);
2623
	if (!error)
2624
		error = security_path_truncate(path);
2625 2626 2627
	if (!error) {
		error = do_truncate(path->dentry, 0,
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
2628
				    filp);
2629 2630
	}
	put_write_access(inode);
M
Mimi Zohar 已提交
2631
	return error;
L
Linus Torvalds 已提交
2632 2633
}

2634 2635
/*
 * Convert open(2) flags for use by the lookup code: when the access-mode
 * field (flag & O_ACCMODE) equals 3 the value is decremented by one; any
 * other value is returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}

M
Miklos Szeredi 已提交
2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653
/*
 * Checks done before an O_CREAT open may create @dentry in @dir, in order:
 * security_path_mknod(), write+exec permission on the directory inode, and
 * security_inode_create().  The first failure is returned; 0 if all pass.
 */
static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
{
	int error;

	error = security_path_mknod(dir, dentry, mode, 0);
	if (!error)
		error = inode_permission(dir->dentry->d_inode,
					 MAY_WRITE | MAY_EXEC);
	if (!error)
		error = security_inode_create(dir->dentry->d_inode, dentry,
					      mode);
	return error;
}

2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Returns 0 if successful.  The file will have been created and attached to
 * @file by the filesystem calling finish_open().
 *
 * Returns 1 if the file was looked up only or didn't need creating.  The
 * caller will need to perform the open themselves.  @path will have been
 * updated to point to the new dentry.  This may be negative.
 *
 * Returns an error code otherwise.
 */
2667 2668 2669
static int atomic_open(struct nameidata *nd, struct dentry *dentry,
			struct path *path, struct file *file,
			const struct open_flags *op,
2670
			bool got_write, bool need_lookup,
2671
			int *opened)
M
Miklos Szeredi 已提交
2672 2673 2674 2675 2676 2677 2678 2679
{
	struct inode *dir =  nd->path.dentry->d_inode;
	unsigned open_flag = open_to_namei_flags(op->open_flag);
	umode_t mode;
	int error;
	int acc_mode;
	int create_error = 0;
	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
2680
	bool excl;
M
Miklos Szeredi 已提交
2681 2682 2683 2684 2685

	BUG_ON(dentry->d_inode);

	/* Don't create child dentry for a dead directory. */
	if (unlikely(IS_DEADDIR(dir))) {
2686
		error = -ENOENT;
M
Miklos Szeredi 已提交
2687 2688 2689
		goto out;
	}

2690
	mode = op->mode;
M
Miklos Szeredi 已提交
2691 2692 2693
	if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
		mode &= ~current_umask();

2694 2695
	excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT);
	if (excl)
M
Miklos Szeredi 已提交
2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706
		open_flag &= ~O_TRUNC;

	/*
	 * Checking write permission is tricky, bacuse we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists.  But checking existence breaks atomicity.  The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returing the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
	 */
2707 2708 2709
	if (((open_flag & (O_CREAT | O_TRUNC)) ||
	    (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) {
		if (!(open_flag & O_CREAT)) {
M
Miklos Szeredi 已提交
2710 2711 2712 2713 2714 2715 2716
			/*
			 * No O_CREATE -> atomicity not a requirement -> fall
			 * back to lookup + open
			 */
			goto no_open;
		} else if (open_flag & (O_EXCL | O_TRUNC)) {
			/* Fall back and fail with the right error */
2717
			create_error = -EROFS;
M
Miklos Szeredi 已提交
2718 2719 2720
			goto no_open;
		} else {
			/* No side effects, safe to clear O_CREAT */
2721
			create_error = -EROFS;
M
Miklos Szeredi 已提交
2722 2723 2724 2725 2726
			open_flag &= ~O_CREAT;
		}
	}

	if (open_flag & O_CREAT) {
2727
		error = may_o_create(&nd->path, dentry, mode);
M
Miklos Szeredi 已提交
2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738
		if (error) {
			create_error = error;
			if (open_flag & O_EXCL)
				goto no_open;
			open_flag &= ~O_CREAT;
		}
	}

	if (nd->flags & LOOKUP_DIRECTORY)
		open_flag |= O_DIRECTORY;

A
Al Viro 已提交
2739 2740 2741
	file->f_path.dentry = DENTRY_NOT_SET;
	file->f_path.mnt = nd->path.mnt;
	error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode,
2742
				      opened);
A
Al Viro 已提交
2743 2744 2745
	if (error < 0) {
		if (create_error && error == -ENOENT)
			error = create_error;
M
Miklos Szeredi 已提交
2746 2747 2748
		goto out;
	}

A
Al Viro 已提交
2749
	if (error) {	/* returned 1, that is */
A
Al Viro 已提交
2750
		if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
2751
			error = -EIO;
M
Miklos Szeredi 已提交
2752 2753
			goto out;
		}
A
Al Viro 已提交
2754
		if (file->f_path.dentry) {
M
Miklos Szeredi 已提交
2755
			dput(dentry);
A
Al Viro 已提交
2756
			dentry = file->f_path.dentry;
M
Miklos Szeredi 已提交
2757
		}
2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770
		if (*opened & FILE_CREATED)
			fsnotify_create(dir, dentry);
		if (!dentry->d_inode) {
			WARN_ON(*opened & FILE_CREATED);
			if (create_error) {
				error = create_error;
				goto out;
			}
		} else {
			if (excl && !(*opened & FILE_CREATED)) {
				error = -EEXIST;
				goto out;
			}
2771
		}
M
Miklos Szeredi 已提交
2772 2773 2774 2775 2776 2777 2778
		goto looked_up;
	}

	/*
	 * We didn't have the inode before the open, so check open permission
	 * here.
	 */
2779 2780 2781 2782 2783 2784
	acc_mode = op->acc_mode;
	if (*opened & FILE_CREATED) {
		WARN_ON(!(open_flag & O_CREAT));
		fsnotify_create(dir, dentry);
		acc_mode = MAY_OPEN;
	}
2785 2786 2787
	error = may_open(&file->f_path, acc_mode, open_flag);
	if (error)
		fput(file);
M
Miklos Szeredi 已提交
2788 2789 2790

out:
	dput(dentry);
2791
	return error;
M
Miklos Szeredi 已提交
2792 2793 2794

no_open:
	if (need_lookup) {
2795
		dentry = lookup_real(dir, dentry, nd->flags);
M
Miklos Szeredi 已提交
2796
		if (IS_ERR(dentry))
2797
			return PTR_ERR(dentry);
M
Miklos Szeredi 已提交
2798 2799 2800 2801

		if (create_error) {
			int open_flag = op->open_flag;

2802
			error = create_error;
M
Miklos Szeredi 已提交
2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817
			if ((open_flag & O_EXCL)) {
				if (!dentry->d_inode)
					goto out;
			} else if (!dentry->d_inode) {
				goto out;
			} else if ((open_flag & O_TRUNC) &&
				   S_ISREG(dentry->d_inode->i_mode)) {
				goto out;
			}
			/* will fail later, go on to get the right error */
		}
	}
looked_up:
	path->dentry = dentry;
	path->mnt = nd->path.mnt;
2818
	return 1;
M
Miklos Szeredi 已提交
2819 2820
}

M
Miklos Szeredi 已提交
2821
/*
2822
 * Look up and maybe create and open the last component.
M
Miklos Szeredi 已提交
2823 2824 2825
 *
 * Must be called with i_mutex held on parent.
 *
2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837
 * Returns 0 if the file was successfully atomically created (if necessary) and
 * opened.  In this case the file will be returned attached to @file.
 *
 * Returns 1 if the file was not completely opened at this time, though lookups
 * and creations will have been performed and the dentry returned in @path will
 * be positive upon return if O_CREAT was specified.  If O_CREAT wasn't
 * specified then a negative dentry may be returned.
 *
 * An error code is returned otherwise.
 *
 * FILE_CREATE will be set in @*opened if the dentry was created and will be
 * cleared otherwise prior to returning.
M
Miklos Szeredi 已提交
2838
 */
2839 2840 2841
static int lookup_open(struct nameidata *nd, struct path *path,
			struct file *file,
			const struct open_flags *op,
2842
			bool got_write, int *opened)
M
Miklos Szeredi 已提交
2843 2844
{
	struct dentry *dir = nd->path.dentry;
2845
	struct inode *dir_inode = dir->d_inode;
M
Miklos Szeredi 已提交
2846 2847
	struct dentry *dentry;
	int error;
2848
	bool need_lookup;
M
Miklos Szeredi 已提交
2849

2850
	*opened &= ~FILE_CREATED;
2851
	dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
M
Miklos Szeredi 已提交
2852
	if (IS_ERR(dentry))
2853
		return PTR_ERR(dentry);
M
Miklos Szeredi 已提交
2854

M
Miklos Szeredi 已提交
2855 2856 2857 2858 2859
	/* Cached positive dentry: will open in f_op->open */
	if (!need_lookup && dentry->d_inode)
		goto out_no_open;

	if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
2860
		return atomic_open(nd, dentry, path, file, op, got_write,
2861
				   need_lookup, opened);
M
Miklos Szeredi 已提交
2862 2863
	}

2864 2865 2866
	if (need_lookup) {
		BUG_ON(dentry->d_inode);

2867
		dentry = lookup_real(dir_inode, dentry, nd->flags);
2868
		if (IS_ERR(dentry))
2869
			return PTR_ERR(dentry);
2870 2871
	}

M
Miklos Szeredi 已提交
2872 2873 2874 2875 2876 2877 2878 2879 2880 2881
	/* Negative dentry, just create the file */
	if (!dentry->d_inode && (op->open_flag & O_CREAT)) {
		umode_t mode = op->mode;
		if (!IS_POSIXACL(dir->d_inode))
			mode &= ~current_umask();
		/*
		 * This write is needed to ensure that a
		 * rw->ro transition does not occur between
		 * the time when the file is created and when
		 * a permanent write count is taken through
2882
		 * the 'struct file' in finish_open().
M
Miklos Szeredi 已提交
2883
		 */
2884 2885
		if (!got_write) {
			error = -EROFS;
M
Miklos Szeredi 已提交
2886
			goto out_dput;
2887
		}
2888
		*opened |= FILE_CREATED;
M
Miklos Szeredi 已提交
2889 2890 2891
		error = security_path_mknod(&nd->path, dentry, mode, 0);
		if (error)
			goto out_dput;
A
Al Viro 已提交
2892 2893
		error = vfs_create(dir->d_inode, dentry, mode,
				   nd->flags & LOOKUP_EXCL);
M
Miklos Szeredi 已提交
2894 2895 2896
		if (error)
			goto out_dput;
	}
M
Miklos Szeredi 已提交
2897
out_no_open:
M
Miklos Szeredi 已提交
2898 2899
	path->dentry = dentry;
	path->mnt = nd->path.mnt;
2900
	return 1;
M
Miklos Szeredi 已提交
2901 2902 2903

out_dput:
	dput(dentry);
2904
	return error;
M
Miklos Szeredi 已提交
2905 2906
}

N
Nick Piggin 已提交
2907
/*
2908
 * Handle the last step of open()
N
Nick Piggin 已提交
2909
 */
2910 2911
static int do_last(struct nameidata *nd, struct path *path,
		   struct file *file, const struct open_flags *op,
2912
		   int *opened, struct filename *name)
2913
{
2914
	struct dentry *dir = nd->path.dentry;
2915
	int open_flag = op->open_flag;
M
Miklos Szeredi 已提交
2916
	bool will_truncate = (open_flag & O_TRUNC) != 0;
2917
	bool got_write = false;
A
Al Viro 已提交
2918
	int acc_mode = op->acc_mode;
2919
	struct inode *inode;
M
Miklos Szeredi 已提交
2920
	bool symlink_ok = false;
2921 2922
	struct path save_parent = { .dentry = NULL, .mnt = NULL };
	bool retried = false;
A
Al Viro 已提交
2923
	int error;
2924

2925 2926 2927
	nd->flags &= ~LOOKUP_PARENT;
	nd->flags |= op->intent;

2928
	if (nd->last_type != LAST_NORM) {
2929 2930
		error = handle_dots(nd, nd->last_type);
		if (error)
2931
			return error;
M
Miklos Szeredi 已提交
2932
		goto finish_open;
2933
	}
2934

2935
	if (!(open_flag & O_CREAT)) {
2936 2937
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
A
Al Viro 已提交
2938
		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
M
Miklos Szeredi 已提交
2939
			symlink_ok = true;
2940
		/* we _can_ be in RCU mode here */
A
Al Viro 已提交
2941
		error = lookup_fast(nd, path, &inode);
2942 2943 2944 2945
		if (likely(!error))
			goto finish_lookup;

		if (error < 0)
2946
			goto out;
2947 2948

		BUG_ON(nd->inode != dir->d_inode);
2949 2950 2951 2952 2953 2954 2955 2956 2957
	} else {
		/* create side of things */
		/*
		 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
		 * has been cleared when we got to the last component we are
		 * about to look up
		 */
		error = complete_walk(nd);
		if (error)
2958
			return error;
2959

2960
		audit_inode(name, dir, LOOKUP_PARENT);
2961 2962 2963
		error = -EISDIR;
		/* trailing slashes? */
		if (nd->last.name[nd->last.len])
2964
			goto out;
2965
	}
A
Al Viro 已提交
2966

2967
retry_lookup:
2968 2969 2970 2971 2972 2973 2974 2975 2976 2977
	if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
		error = mnt_want_write(nd->path.mnt);
		if (!error)
			got_write = true;
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
2978
	mutex_lock(&dir->d_inode->i_mutex);
2979
	error = lookup_open(nd, path, file, op, got_write, opened);
M
Miklos Szeredi 已提交
2980
	mutex_unlock(&dir->d_inode->i_mutex);
2981

2982 2983
	if (error <= 0) {
		if (error)
M
Miklos Szeredi 已提交
2984 2985
			goto out;

2986
		if ((*opened & FILE_CREATED) ||
A
Al Viro 已提交
2987
		    !S_ISREG(file_inode(file)->i_mode))
M
Miklos Szeredi 已提交
2988
			will_truncate = false;
M
Miklos Szeredi 已提交
2989

2990
		audit_inode(name, file->f_path.dentry, 0);
M
Miklos Szeredi 已提交
2991 2992
		goto opened;
	}
2993

2994
	if (*opened & FILE_CREATED) {
2995
		/* Don't check for write permission, don't truncate */
2996
		open_flag &= ~O_TRUNC;
M
Miklos Szeredi 已提交
2997
		will_truncate = false;
A
Al Viro 已提交
2998
		acc_mode = MAY_OPEN;
M
Miklos Szeredi 已提交
2999
		path_to_nameidata(path, nd);
M
Miklos Szeredi 已提交
3000
		goto finish_open_created;
3001 3002 3003
	}

	/*
3004
	 * create/update audit record if it already exists.
3005
	 */
3006
	if (d_is_positive(path->dentry))
3007
		audit_inode(name, path->dentry, 0);
3008

M
Miklos Szeredi 已提交
3009 3010 3011 3012 3013
	/*
	 * If atomic_open() acquired write access it is dropped now due to
	 * possible mount and symlink following (this might be optimized away if
	 * necessary...)
	 */
3014
	if (got_write) {
M
Miklos Szeredi 已提交
3015
		mnt_drop_write(nd->path.mnt);
3016
		got_write = false;
M
Miklos Szeredi 已提交
3017 3018
	}

3019
	error = -EEXIST;
A
Al Viro 已提交
3020
	if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
3021 3022
		goto exit_dput;

3023 3024 3025
	error = follow_managed(path, nd->flags);
	if (error < 0)
		goto exit_dput;
3026

3027 3028 3029
	if (error)
		nd->flags |= LOOKUP_JUMPED;

3030 3031
	BUG_ON(nd->flags & LOOKUP_RCU);
	inode = path->dentry->d_inode;
3032 3033
finish_lookup:
	/* we _can_ be in RCU mode here */
3034
	error = -ENOENT;
3035
	if (!inode || d_is_negative(path->dentry)) {
3036
		path_to_nameidata(path, nd);
3037
		goto out;
3038
	}
A
Al Viro 已提交
3039

3040
	if (should_follow_link(path->dentry, !symlink_ok)) {
3041 3042 3043
		if (nd->flags & LOOKUP_RCU) {
			if (unlikely(unlazy_walk(nd, path->dentry))) {
				error = -ECHILD;
3044
				goto out;
3045 3046 3047
			}
		}
		BUG_ON(inode != path->dentry->d_inode);
3048
		return 1;
3049
	}
3050

3051 3052 3053 3054 3055 3056 3057 3058
	if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
		path_to_nameidata(path, nd);
	} else {
		save_parent.dentry = nd->path.dentry;
		save_parent.mnt = mntget(path->mnt);
		nd->path.dentry = path->dentry;

	}
3059
	nd->inode = inode;
3060
	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
3061
finish_open:
3062
	error = complete_walk(nd);
3063 3064
	if (error) {
		path_put(&save_parent);
3065
		return error;
3066
	}
3067
	audit_inode(name, nd->path.dentry, 0);
3068
	error = -EISDIR;
M
Miklos Szeredi 已提交
3069
	if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
3070
		goto out;
3071
	error = -ENOTDIR;
M
Miklos Szeredi 已提交
3072
	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3073
		goto out;
3074
	if (!S_ISREG(nd->inode->i_mode))
M
Miklos Szeredi 已提交
3075
		will_truncate = false;
3076

3077 3078 3079
	if (will_truncate) {
		error = mnt_want_write(nd->path.mnt);
		if (error)
3080
			goto out;
3081
		got_write = true;
3082
	}
M
Miklos Szeredi 已提交
3083
finish_open_created:
A
Al Viro 已提交
3084
	error = may_open(&nd->path, acc_mode, open_flag);
3085
	if (error)
3086
		goto out;
M
Miklos Szeredi 已提交
3087 3088 3089 3090 3091 3092

	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
	error = vfs_open(&nd->path, file, current_cred());
	if (!error) {
		*opened |= FILE_OPENED;
	} else {
A
Al Viro 已提交
3093
		if (error == -EOPENSTALE)
M
Miklos Szeredi 已提交
3094
			goto stale_open;
3095
		goto out;
M
Miklos Szeredi 已提交
3096
	}
3097
opened:
3098
	error = open_check_o_direct(file);
3099 3100
	if (error)
		goto exit_fput;
3101
	error = ima_file_check(file, op->acc_mode, *opened);
3102 3103 3104 3105
	if (error)
		goto exit_fput;

	if (will_truncate) {
3106
		error = handle_truncate(file);
3107 3108
		if (error)
			goto exit_fput;
3109
	}
3110
out:
3111
	if (got_write)
3112
		mnt_drop_write(nd->path.mnt);
3113
	path_put(&save_parent);
3114
	terminate_walk(nd);
3115
	return error;
3116 3117 3118

exit_dput:
	path_put_conditional(path, nd);
3119
	goto out;
3120
exit_fput:
3121 3122
	fput(file);
	goto out;
3123

M
Miklos Szeredi 已提交
3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134
stale_open:
	/* If no saved parent or already retried then can't retry */
	if (!save_parent.dentry || retried)
		goto out;

	BUG_ON(save_parent.dentry != dir);
	path_put(&nd->path);
	nd->path = save_parent;
	nd->inode = dir->d_inode;
	save_parent.mnt = NULL;
	save_parent.dentry = NULL;
3135
	if (got_write) {
M
Miklos Szeredi 已提交
3136
		mnt_drop_write(nd->path.mnt);
3137
		got_write = false;
M
Miklos Szeredi 已提交
3138 3139 3140
	}
	retried = true;
	goto retry_lookup;
3141 3142
}

3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180
static int do_tmpfile(int dfd, struct filename *pathname,
		struct nameidata *nd, int flags,
		const struct open_flags *op,
		struct file *file, int *opened)
{
	static const struct qstr name = QSTR_INIT("/", 1);
	struct dentry *dentry, *child;
	struct inode *dir;
	int error = path_lookupat(dfd, pathname->name,
				  flags | LOOKUP_DIRECTORY, nd);
	if (unlikely(error))
		return error;
	error = mnt_want_write(nd->path.mnt);
	if (unlikely(error))
		goto out;
	/* we want directory to be writable */
	error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
	if (error)
		goto out2;
	dentry = nd->path.dentry;
	dir = dentry->d_inode;
	if (!dir->i_op->tmpfile) {
		error = -EOPNOTSUPP;
		goto out2;
	}
	child = d_alloc(dentry, &name);
	if (unlikely(!child)) {
		error = -ENOMEM;
		goto out2;
	}
	nd->flags &= ~LOOKUP_DIRECTORY;
	nd->flags |= op->intent;
	dput(nd->path.dentry);
	nd->path.dentry = child;
	error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode);
	if (error)
		goto out2;
	audit_inode(pathname, nd->path.dentry, 0);
3181 3182
	/* Don't check for other permissions, the inode was just created */
	error = may_open(&nd->path, MAY_OPEN, op->open_flag);
3183 3184 3185 3186 3187 3188 3189
	if (error)
		goto out2;
	file->f_path.mnt = nd->path.mnt;
	error = finish_open(file, nd->path.dentry, NULL, opened);
	if (error)
		goto out2;
	error = open_check_o_direct(file);
3190
	if (error) {
3191
		fput(file);
3192 3193 3194 3195 3196 3197
	} else if (!(op->open_flag & O_EXCL)) {
		struct inode *inode = file_inode(file);
		spin_lock(&inode->i_lock);
		inode->i_state |= I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
3198 3199 3200 3201 3202 3203 3204
out2:
	mnt_drop_write(nd->path.mnt);
out:
	path_put(&nd->path);
	return error;
}

3205
static struct file *path_openat(int dfd, struct filename *pathname,
A
Al Viro 已提交
3206
		struct nameidata *nd, const struct open_flags *op, int flags)
L
Linus Torvalds 已提交
3207
{
3208
	struct file *base = NULL;
A
Al Viro 已提交
3209
	struct file *file;
3210
	struct path path;
3211
	int opened = 0;
3212
	int error;
N
Nick Piggin 已提交
3213

A
Al Viro 已提交
3214
	file = get_empty_filp();
3215 3216
	if (IS_ERR(file))
		return file;
N
Nick Piggin 已提交
3217

A
Al Viro 已提交
3218
	file->f_flags = op->open_flag;
N
Nick Piggin 已提交
3219

A
Al Viro 已提交
3220
	if (unlikely(file->f_flags & __O_TMPFILE)) {
3221 3222 3223 3224
		error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
		goto out;
	}

3225
	error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
N
Nick Piggin 已提交
3226
	if (unlikely(error))
3227
		goto out;
N
Nick Piggin 已提交
3228

3229
	current->total_link_count = 0;
3230
	error = link_path_walk(pathname->name, nd);
N
Nick Piggin 已提交
3231
	if (unlikely(error))
3232
		goto out;
L
Linus Torvalds 已提交
3233

3234 3235
	error = do_last(nd, &path, file, op, &opened, pathname);
	while (unlikely(error > 0)) { /* trailing symlink */
3236
		struct path link = path;
A
Al Viro 已提交
3237
		void *cookie;
3238
		if (!(nd->flags & LOOKUP_FOLLOW)) {
A
Al Viro 已提交
3239 3240
			path_put_conditional(&path, nd);
			path_put(&nd->path);
3241
			error = -ELOOP;
3242 3243
			break;
		}
K
Kees Cook 已提交
3244 3245 3246
		error = may_follow_link(&link, nd);
		if (unlikely(error))
			break;
A
Al Viro 已提交
3247 3248
		nd->flags |= LOOKUP_PARENT;
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
3249
		error = follow_link(&link, nd, &cookie);
3250
		if (unlikely(error))
3251 3252
			break;
		error = do_last(nd, &path, file, op, &opened, pathname);
3253
		put_link(nd, &link, cookie);
3254
	}
A
Al Viro 已提交
3255
out:
A
Al Viro 已提交
3256 3257
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
		path_put(&nd->root);
3258 3259
	if (base)
		fput(base);
3260 3261
	if (!(opened & FILE_OPENED)) {
		BUG_ON(!error);
A
Al Viro 已提交
3262
		put_filp(file);
3263
	}
3264 3265 3266 3267 3268 3269 3270 3271 3272 3273
	if (unlikely(error)) {
		if (error == -EOPENSTALE) {
			if (flags & LOOKUP_RCU)
				error = -ECHILD;
			else
				error = -ESTALE;
		}
		file = ERR_PTR(error);
	}
	return file;
L
Linus Torvalds 已提交
3274 3275
}

3276
struct file *do_filp_open(int dfd, struct filename *pathname,
3277
		const struct open_flags *op)
3278
{
A
Al Viro 已提交
3279
	struct nameidata nd;
3280
	int flags = op->lookup_flags;
3281 3282
	struct file *filp;

A
Al Viro 已提交
3283
	filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
3284
	if (unlikely(filp == ERR_PTR(-ECHILD)))
A
Al Viro 已提交
3285
		filp = path_openat(dfd, pathname, &nd, op, flags);
3286
	if (unlikely(filp == ERR_PTR(-ESTALE)))
A
Al Viro 已提交
3287
		filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
3288 3289 3290
	return filp;
}

A
Al Viro 已提交
3291
struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3292
		const char *name, const struct open_flags *op)
A
Al Viro 已提交
3293 3294 3295
{
	struct nameidata nd;
	struct file *file;
3296
	struct filename filename = { .name = name };
3297
	int flags = op->lookup_flags | LOOKUP_ROOT;
A
Al Viro 已提交
3298 3299 3300 3301

	nd.root.mnt = mnt;
	nd.root.dentry = dentry;

3302
	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
A
Al Viro 已提交
3303 3304
		return ERR_PTR(-ELOOP);

3305
	file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU);
A
Al Viro 已提交
3306
	if (unlikely(file == ERR_PTR(-ECHILD)))
3307
		file = path_openat(-1, &filename, &nd, op, flags);
A
Al Viro 已提交
3308
	if (unlikely(file == ERR_PTR(-ESTALE)))
3309
		file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL);
A
Al Viro 已提交
3310 3311 3312
	return file;
}

3313 3314
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
L
Linus Torvalds 已提交
3315
{
3316
	struct dentry *dentry = ERR_PTR(-EEXIST);
A
Al Viro 已提交
3317
	struct nameidata nd;
3318
	int err2;
3319 3320 3321 3322 3323 3324 3325 3326 3327 3328
	int error;
	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);

	/*
	 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
	 * other flags passed in are ignored!
	 */
	lookup_flags &= LOOKUP_REVAL;

	error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd);
A
Al Viro 已提交
3329 3330
	if (error)
		return ERR_PTR(error);
L
Linus Torvalds 已提交
3331

3332 3333 3334 3335
	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
A
Al Viro 已提交
3336 3337 3338 3339
	if (nd.last_type != LAST_NORM)
		goto out;
	nd.flags &= ~LOOKUP_PARENT;
	nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
3340

3341 3342
	/* don't fail immediately if it's r/o, at least try to report other errors */
	err2 = mnt_want_write(nd.path.mnt);
3343 3344 3345
	/*
	 * Do the final lookup.
	 */
A
Al Viro 已提交
3346 3347
	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = lookup_hash(&nd);
L
Linus Torvalds 已提交
3348
	if (IS_ERR(dentry))
3349
		goto unlock;
3350

3351
	error = -EEXIST;
3352
	if (d_is_positive(dentry))
3353
		goto fail;
3354

3355 3356 3357 3358 3359 3360
	/*
	 * Special case - lookup gave negative, but... we had foo/bar/
	 * From the vfs_mknod() POV we just have a negative dentry -
	 * all is fine. Let's be bastards - you had / on the end, you've
	 * been asking for (non-existent) directory. -ENOENT for you.
	 */
A
Al Viro 已提交
3361
	if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
3362
		error = -ENOENT;
A
Al Viro 已提交
3363
		goto fail;
3364
	}
3365 3366
	if (unlikely(err2)) {
		error = err2;
3367
		goto fail;
3368
	}
A
Al Viro 已提交
3369
	*path = nd.path;
L
Linus Torvalds 已提交
3370 3371
	return dentry;
fail:
3372 3373 3374
	dput(dentry);
	dentry = ERR_PTR(error);
unlock:
A
Al Viro 已提交
3375
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
3376 3377
	if (!err2)
		mnt_drop_write(nd.path.mnt);
A
Al Viro 已提交
3378 3379
out:
	path_put(&nd.path);
L
Linus Torvalds 已提交
3380 3381
	return dentry;
}
3382 3383
EXPORT_SYMBOL(kern_path_create);

A
Al Viro 已提交
3384 3385 3386 3387
/*
 * Undo a successful kern_path_create()/user_path_create(): drop @dentry,
 * unlock the parent's i_mutex, drop write access to the mount and
 * release @path.
 */
void done_path_create(struct path *path, struct dentry *dentry)
{
	dput(dentry);
	mutex_unlock(&path->dentry->d_inode->i_mutex);
	mnt_drop_write(path->mnt);
	path_put(path);
}
EXPORT_SYMBOL(done_path_create);

3393 3394
/*
 * kern_path_create() for a pathname that lives in user space: the name
 * is copied in with getname(), passed on, and released again.  Returns
 * whatever kern_path_create() returns, or the getname() error as an
 * ERR_PTR().
 */
struct dentry *user_path_create(int dfd, const char __user *pathname,
				struct path *path, unsigned int lookup_flags)
{
	struct filename *tmp = getname(pathname);
	struct dentry *res;
	if (IS_ERR(tmp))
		return ERR_CAST(tmp);
	res = kern_path_create(dfd, tmp->name, path, lookup_flags);
	putname(tmp);
	return res;
}
EXPORT_SYMBOL(user_path_create);

A
Al Viro 已提交
3406
/*
 * Create a filesystem node @dentry of type/permissions @mode in @dir.
 *
 * Returns 0 on success or a negative errno.  -EPERM is returned when a
 * character or block device is requested without CAP_MKNOD, and when the
 * directory has no ->mknod() method.  fsnotify_create() is called on
 * success.
 */
int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
	int error = may_create(dir, dentry);

	if (error)
		return error;

	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
		return -EPERM;

	if (!dir->i_op->mknod)
		return -EPERM;

	error = devcgroup_inode_mknod(mode, dev);
	if (error)
		return error;

	error = security_inode_mknod(dir, dentry, mode, dev);
	if (error)
		return error;

	error = dir->i_op->mknod(dir, dentry, mode, dev);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_mknod);
L
Linus Torvalds 已提交
3433

A
Al Viro 已提交
3434
/*
 * Check whether the file type encoded in @mode may be created through
 * mknod: 0 for regular files, devices, FIFOs and sockets (and for a zero
 * type), -EPERM for directories, -EINVAL for anything else.
 */
static int may_mknod(umode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
	case 0: /* zero mode translates to S_IFREG */
		return 0;
	case S_IFDIR:
		return -EPERM;
	default:
		return -EINVAL;
	}
}

A
Al Viro 已提交
3451
/*
 * mknodat(2): create a regular file, device node, FIFO or socket named
 * @filename relative to @dfd.
 *
 * The file type is validated by may_mknod() up front, so the switch
 * below needs no default case.  The whole operation is repeated with
 * LOOKUP_REVAL added when retry_estale() asks for it.
 */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned, dev)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = 0;

	error = may_mknod(mode);
	if (error)
		return error;
retry:
	dentry = user_path_create(dfd, filename, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	/* the umask is applied here unless the parent is IS_POSIXACL() */
	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = security_path_mknod(&path, dentry, mode, dev);
	if (error)
		goto out;
	switch (mode & S_IFMT) {
	case 0:
	case S_IFREG:
		error = vfs_create(path.dentry->d_inode, dentry, mode, true);
		break;
	case S_IFCHR:
	case S_IFBLK:
		error = vfs_mknod(path.dentry->d_inode, dentry, mode,
				  new_decode_dev(dev));
		break;
	case S_IFIFO:
	case S_IFSOCK:
		error = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
		break;
	}
out:
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

A
Al Viro 已提交
3493
/* mknod(2): mknodat() with the path resolved relative to AT_FDCWD. */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
	return sys_mknodat(AT_FDCWD, filename, mode, dev);
}

3498
/*
 * Create directory @dentry in @dir.
 *
 * Only the permission bits and the sticky bit of @mode are kept.
 * Returns 0 on success or a negative errno: -EPERM if the directory has
 * no ->mkdir() method, -EMLINK if the superblock sets a link limit and
 * @dir has already reached it.  fsnotify_mkdir() is called on success.
 */
int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int error = may_create(dir, dentry);
	unsigned max_links = dir->i_sb->s_max_links;

	if (error)
		return error;

	if (!dir->i_op->mkdir)
		return -EPERM;

	mode &= (S_IRWXUGO|S_ISVTX);
	error = security_inode_mkdir(dir, dentry, mode);
	if (error)
		return error;

	if (max_links && dir->i_nlink >= max_links)
		return -EMLINK;

	error = dir->i_op->mkdir(dir, dentry, mode);
	if (!error)
		fsnotify_mkdir(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_mkdir);
L
Linus Torvalds 已提交
3523

3524
/*
 * mkdirat(2): create directory @pathname relative to @dfd.
 *
 * The lookup is done with LOOKUP_DIRECTORY, so a trailing slash on the
 * name is accepted by user_path_create().  The operation is repeated
 * with LOOKUP_REVAL added when retry_estale() asks for it.
 */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = LOOKUP_DIRECTORY;

retry:
	dentry = user_path_create(dfd, pathname, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	/* the umask is applied here unless the parent is IS_POSIXACL() */
	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = security_path_mkdir(&path, dentry, mode);
	if (!error)
		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

3549
/* mkdir(2): mkdirat() with the path resolved relative to AT_FDCWD. */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
	return sys_mkdirat(AT_FDCWD, pathname, mode);
}

L
Linus Torvalds 已提交
3554
/*
 * The dentry_unhash() helper will try to drop the dentry early: we
 * should have a usage count of 1 if we're the only user of this
 * dentry, and if that is true (possibly after pruning the dcache),
 * then we drop the dentry now.
 *
 * A low-level filesystem can, if it chooses, legally
 * do a
 *
 *	if (!d_unhashed(dentry))
 *		return -EBUSY;
 *
 * if it cannot handle the case of removing a directory
 * that is still in use by something else..
 */
void dentry_unhash(struct dentry *dentry)
{
	shrink_dcache_parent(dentry);
	/* the reference count is only examined under d_lock */
	spin_lock(&dentry->d_lock);
	if (dentry->d_lockref.count == 1)
		__d_drop(dentry);
	spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(dentry_unhash);
L
Linus Torvalds 已提交
3578 3579 3580 3581 3582 3583 3584 3585

int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	int error = may_delete(dir, dentry, 1);

	if (error)
		return error;

A
Al Viro 已提交
3586
	if (!dir->i_op->rmdir)
L
Linus Torvalds 已提交
3587 3588
		return -EPERM;

3589
	dget(dentry);
3590
	mutex_lock(&dentry->d_inode->i_mutex);
S
Sage Weil 已提交
3591 3592

	error = -EBUSY;
3593
	if (is_local_mountpoint(dentry))
S
Sage Weil 已提交
3594 3595 3596 3597 3598 3599
		goto out;

	error = security_inode_rmdir(dir, dentry);
	if (error)
		goto out;

3600
	shrink_dcache_parent(dentry);
S
Sage Weil 已提交
3601 3602 3603 3604 3605 3606
	error = dir->i_op->rmdir(dir, dentry);
	if (error)
		goto out;

	dentry->d_inode->i_flags |= S_DEAD;
	dont_mount(dentry);
3607
	detach_mounts(dentry);
S
Sage Weil 已提交
3608 3609

out:
3610
	mutex_unlock(&dentry->d_inode->i_mutex);
3611
	dput(dentry);
S
Sage Weil 已提交
3612
	if (!error)
L
Linus Torvalds 已提交
3613 3614 3615
		d_delete(dentry);
	return error;
}
3616
EXPORT_SYMBOL(vfs_rmdir);
L
Linus Torvalds 已提交
3617

3618
/*
 * Common implementation of rmdir(2) and unlinkat(2) with AT_REMOVEDIR.
 *
 * Looks up the parent of @pathname, rejects "..", "." and the root as
 * last component (-ENOTEMPTY, -EINVAL and -EBUSY respectively), then
 * removes the directory through vfs_rmdir() with the parent's i_mutex
 * held and write access to the mount taken.  The operation is repeated
 * with LOOKUP_REVAL added when retry_estale() asks for it.
 */
static long do_rmdir(int dfd, const char __user *pathname)
{
	int error = 0;
	struct filename *name;
	struct dentry *dentry;
	struct nameidata nd;
	unsigned int lookup_flags = 0;
retry:
	name = user_path_parent(dfd, pathname, &nd, lookup_flags);
	if (IS_ERR(name))
		return PTR_ERR(name);

	switch(nd.last_type) {
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto exit1;
	case LAST_DOT:
		error = -EINVAL;
		goto exit1;
	case LAST_ROOT:
		error = -EBUSY;
		goto exit1;
	}

	nd.flags &= ~LOOKUP_PARENT;
	error = mnt_want_write(nd.path.mnt);
	if (error)
		goto exit1;

	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = lookup_hash(&nd);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto exit2;
	if (!dentry->d_inode) {
		error = -ENOENT;
		goto exit3;
	}
	error = security_path_rmdir(&nd.path, dentry);
	if (error)
		goto exit3;
	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
exit3:
	dput(dentry);
exit2:
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
	mnt_drop_write(nd.path.mnt);
exit1:
	path_put(&nd.path);
	putname(name);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

3675
/* rmdir(2): do_rmdir() with the path resolved relative to AT_FDCWD. */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
	return do_rmdir(AT_FDCWD, pathname);
}

3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698
/**
 * vfs_unlink - unlink a filesystem object
 * @dir:	parent directory
 * @dentry:	victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 */
int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
3699
{
J
J. Bruce Fields 已提交
3700
	struct inode *target = dentry->d_inode;
L
Linus Torvalds 已提交
3701 3702 3703 3704 3705
	int error = may_delete(dir, dentry, 0);

	if (error)
		return error;

A
Al Viro 已提交
3706
	if (!dir->i_op->unlink)
L
Linus Torvalds 已提交
3707 3708
		return -EPERM;

J
J. Bruce Fields 已提交
3709
	mutex_lock(&target->i_mutex);
3710
	if (is_local_mountpoint(dentry))
L
Linus Torvalds 已提交
3711 3712 3713
		error = -EBUSY;
	else {
		error = security_inode_unlink(dir, dentry);
3714
		if (!error) {
3715 3716
			error = try_break_deleg(target, delegated_inode);
			if (error)
3717
				goto out;
L
Linus Torvalds 已提交
3718
			error = dir->i_op->unlink(dir, dentry);
3719
			if (!error) {
3720
				dont_mount(dentry);
3721 3722
				detach_mounts(dentry);
			}
3723
		}
L
Linus Torvalds 已提交
3724
	}
3725
out:
J
J. Bruce Fields 已提交
3726
	mutex_unlock(&target->i_mutex);
L
Linus Torvalds 已提交
3727 3728 3729

	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
J
J. Bruce Fields 已提交
3730
		fsnotify_link_count(target);
J
John McCutchan 已提交
3731
		d_delete(dentry);
L
Linus Torvalds 已提交
3732
	}
R
Robert Love 已提交
3733

L
Linus Torvalds 已提交
3734 3735
	return error;
}
3736
EXPORT_SYMBOL(vfs_unlink);
L
Linus Torvalds 已提交
3737 3738 3739

/*
 * Make sure that the actual truncation of the file will occur outside its
3740
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
L
Linus Torvalds 已提交
3741 3742 3743
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 */
3744
static long do_unlinkat(int dfd, const char __user *pathname)
L
Linus Torvalds 已提交
3745
{
3746
	int error;
3747
	struct filename *name;
L
Linus Torvalds 已提交
3748 3749 3750
	struct dentry *dentry;
	struct nameidata nd;
	struct inode *inode = NULL;
3751
	struct inode *delegated_inode = NULL;
3752 3753 3754
	unsigned int lookup_flags = 0;
retry:
	name = user_path_parent(dfd, pathname, &nd, lookup_flags);
3755 3756
	if (IS_ERR(name))
		return PTR_ERR(name);
3757

L
Linus Torvalds 已提交
3758 3759 3760
	error = -EISDIR;
	if (nd.last_type != LAST_NORM)
		goto exit1;
3761 3762

	nd.flags &= ~LOOKUP_PARENT;
3763 3764 3765
	error = mnt_want_write(nd.path.mnt);
	if (error)
		goto exit1;
3766
retry_deleg:
3767
	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
3768
	dentry = lookup_hash(&nd);
L
Linus Torvalds 已提交
3769 3770 3771
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
		/* Why not before? Because we want correct error value */
3772 3773
		if (nd.last.name[nd.last.len])
			goto slashes;
L
Linus Torvalds 已提交
3774
		inode = dentry->d_inode;
3775
		if (d_is_negative(dentry))
3776 3777
			goto slashes;
		ihold(inode);
3778 3779
		error = security_path_unlink(&nd.path, dentry);
		if (error)
3780
			goto exit2;
3781
		error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode);
3782
exit2:
L
Linus Torvalds 已提交
3783 3784
		dput(dentry);
	}
3785
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
L
Linus Torvalds 已提交
3786 3787
	if (inode)
		iput(inode);	/* truncate the inode here */
3788 3789
	inode = NULL;
	if (delegated_inode) {
3790
		error = break_deleg_wait(&delegated_inode);
3791 3792 3793
		if (!error)
			goto retry_deleg;
	}
3794
	mnt_drop_write(nd.path.mnt);
L
Linus Torvalds 已提交
3795
exit1:
J
Jan Blunck 已提交
3796
	path_put(&nd.path);
L
Linus Torvalds 已提交
3797
	putname(name);
3798 3799 3800 3801 3802
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		inode = NULL;
		goto retry;
	}
L
Linus Torvalds 已提交
3803 3804 3805
	return error;

slashes:
3806 3807
	if (d_is_negative(dentry))
		error = -ENOENT;
M
Miklos Szeredi 已提交
3808
	else if (d_is_dir(dentry))
3809 3810 3811
		error = -EISDIR;
	else
		error = -ENOTDIR;
L
Linus Torvalds 已提交
3812 3813 3814
	goto exit2;
}

3815
/*
 * unlinkat(2): AT_REMOVEDIR is the only flag accepted (-EINVAL for any
 * other bit).  With it the call removes a directory via do_rmdir(),
 * without it the call unlinks via do_unlinkat().
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
	if ((flag & ~AT_REMOVEDIR) != 0)
		return -EINVAL;

	if (flag & AT_REMOVEDIR)
		return do_rmdir(dfd, pathname);

	return do_unlinkat(dfd, pathname);
}

3826
/* unlink(2): do_unlinkat() with the path resolved relative to AT_FDCWD. */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
	return do_unlinkat(AT_FDCWD, pathname);
}

3831
int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
L
Linus Torvalds 已提交
3832
{
3833
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
3834 3835 3836 3837

	if (error)
		return error;

A
Al Viro 已提交
3838
	if (!dir->i_op->symlink)
L
Linus Torvalds 已提交
3839 3840 3841 3842 3843 3844 3845
		return -EPERM;

	error = security_inode_symlink(dir, dentry, oldname);
	if (error)
		return error;

	error = dir->i_op->symlink(dir, dentry, oldname);
3846
	if (!error)
3847
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
3848 3849
	return error;
}
3850
EXPORT_SYMBOL(vfs_symlink);
L
Linus Torvalds 已提交
3851

3852 3853
/*
 * symlinkat(2): create symlink @newname (relative to @newdfd) whose
 * target string is @oldname.
 *
 * @oldname is copied in once and reused if the creation is repeated
 * with LOOKUP_REVAL, which happens when retry_estale() asks for it.
 */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	int error;
	struct filename *from;
	struct dentry *dentry;
	struct path path;
	unsigned int lookup_flags = 0;

	from = getname(oldname);
	if (IS_ERR(from))
		return PTR_ERR(from);
retry:
	dentry = user_path_create(newdfd, newname, &path, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_putname;

	error = security_path_symlink(&path, dentry, from->name);
	if (!error)
		error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out_putname:
	putname(from);
	return error;
}

3883
/* symlink(2): symlinkat() with the new name resolved relative to AT_FDCWD. */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
	return sys_symlinkat(oldname, AT_FDCWD, newname);
}

J
J. Bruce Fields 已提交
3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907
/**
 * vfs_link - create a new link
 * @old_dentry:	object to be linked
 * @dir:	new parent
 * @new_dentry:	where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 */
int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
3908 3909
{
	struct inode *inode = old_dentry->d_inode;
3910
	unsigned max_links = dir->i_sb->s_max_links;
L
Linus Torvalds 已提交
3911 3912 3913 3914 3915
	int error;

	if (!inode)
		return -ENOENT;

3916
	error = may_create(dir, new_dentry);
L
Linus Torvalds 已提交
3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927
	if (error)
		return error;

	if (dir->i_sb != inode->i_sb)
		return -EXDEV;

	/*
	 * A link to an append-only or immutable file cannot be created.
	 */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return -EPERM;
A
Al Viro 已提交
3928
	if (!dir->i_op->link)
L
Linus Torvalds 已提交
3929
		return -EPERM;
3930
	if (S_ISDIR(inode->i_mode))
L
Linus Torvalds 已提交
3931 3932 3933 3934 3935 3936
		return -EPERM;

	error = security_inode_link(old_dentry, dir, new_dentry);
	if (error)
		return error;

3937
	mutex_lock(&inode->i_mutex);
3938
	/* Make sure we don't allow creating hardlink to an unlinked file */
3939
	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
3940
		error =  -ENOENT;
3941 3942
	else if (max_links && inode->i_nlink >= max_links)
		error = -EMLINK;
J
J. Bruce Fields 已提交
3943 3944 3945 3946 3947
	else {
		error = try_break_deleg(inode, delegated_inode);
		if (!error)
			error = dir->i_op->link(old_dentry, dir, new_dentry);
	}
3948 3949 3950 3951 3952 3953

	if (!error && (inode->i_state & I_LINKABLE)) {
		spin_lock(&inode->i_lock);
		inode->i_state &= ~I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
3954
	mutex_unlock(&inode->i_mutex);
3955
	if (!error)
3956
		fsnotify_link(dir, inode, new_dentry);
L
Linus Torvalds 已提交
3957 3958
	return error;
}
3959
EXPORT_SYMBOL(vfs_link);
L
Linus Torvalds 已提交
3960 3961 3962 3963 3964 3965 3966 3967 3968 3969

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 */
3970 3971
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, int, flags)
L
Linus Torvalds 已提交
3972 3973
{
	struct dentry *new_dentry;
3974
	struct path old_path, new_path;
J
J. Bruce Fields 已提交
3975
	struct inode *delegated_inode = NULL;
3976
	int how = 0;
L
Linus Torvalds 已提交
3977 3978
	int error;

3979
	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
3980
		return -EINVAL;
3981
	/*
3982 3983 3984
	 * To use null names we require CAP_DAC_READ_SEARCH
	 * This ensures that not everyone will be able to create
	 * handlink using the passed filedescriptor.
3985
	 */
3986 3987 3988
	if (flags & AT_EMPTY_PATH) {
		if (!capable(CAP_DAC_READ_SEARCH))
			return -ENOENT;
3989
		how = LOOKUP_EMPTY;
3990
	}
3991 3992 3993

	if (flags & AT_SYMLINK_FOLLOW)
		how |= LOOKUP_FOLLOW;
3994
retry:
3995
	error = user_path_at(olddfd, oldname, how, &old_path);
L
Linus Torvalds 已提交
3996
	if (error)
3997 3998
		return error;

3999 4000
	new_dentry = user_path_create(newdfd, newname, &new_path,
					(how & LOOKUP_REVAL));
L
Linus Torvalds 已提交
4001
	error = PTR_ERR(new_dentry);
4002
	if (IS_ERR(new_dentry))
4003 4004 4005 4006 4007
		goto out;

	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto out_dput;
K
Kees Cook 已提交
4008 4009 4010
	error = may_linkat(&old_path);
	if (unlikely(error))
		goto out_dput;
4011
	error = security_path_link(old_path.dentry, &new_path, new_dentry);
4012
	if (error)
4013
		goto out_dput;
J
J. Bruce Fields 已提交
4014
	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
4015
out_dput:
A
Al Viro 已提交
4016
	done_path_create(&new_path, new_dentry);
J
J. Bruce Fields 已提交
4017 4018
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
4019 4020
		if (!error) {
			path_put(&old_path);
J
J. Bruce Fields 已提交
4021
			goto retry;
4022
		}
J
J. Bruce Fields 已提交
4023
	}
4024
	if (retry_estale(error, how)) {
4025
		path_put(&old_path);
4026 4027 4028
		how |= LOOKUP_REVAL;
		goto retry;
	}
L
Linus Torvalds 已提交
4029
out:
4030
	path_put(&old_path);
L
Linus Torvalds 已提交
4031 4032 4033 4034

	return error;
}

4035
/*
 * link(2): linkat() with both names resolved relative to the current
 * working directory (AT_FDCWD) and no flags.
 */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}

4040 4041 4042 4043 4044 4045 4046
/**
 * vfs_rename - rename a filesystem object
 * @old_dir:	parent of source
 * @old_dentry:	source
 * @new_dir:	parent of destination
 * @new_dentry:	destination
 * @delegated_inode: returns an inode needing a delegation break
M
Miklos Szeredi 已提交
4047
 * @flags:	rename flags
4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061
 *
 * The caller must hold multiple mutexes--see lock_rename()).
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
L
Linus Torvalds 已提交
4062 4063 4064
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
4065
 *	a) we can get into loop creation.
L
Linus Torvalds 已提交
4066 4067
 *	b) race potential - two innocent renames can create a loop together.
 *	   That's where 4.4 screws up. Current fix: serialization on
4068
 *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
L
Linus Torvalds 已提交
4069
 *	   story.
4070 4071
 *	c) we have to lock _four_ objects - parents and victim (if it exists),
 *	   and source (if it is not a directory).
4072
 *	   And that - after we got ->i_mutex on parents (until then we don't know
L
Linus Torvalds 已提交
4073 4074
 *	   whether the target exists).  Solution: try to be smart with locking
 *	   order for inodes.  We rely on the fact that tree topology may change
4075
 *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
L
Linus Torvalds 已提交
4076 4077 4078
 *	   move will be locked.  Thus we can rank directories by the tree
 *	   (ancestors first) and rank all non-directories after them.
 *	   That works since everybody except rename does "lock parent, lookup,
4079
 *	   lock child" and rename is under ->s_vfs_rename_mutex.
L
Linus Torvalds 已提交
4080 4081 4082
 *	   HOWEVER, it relies on the assumption that any object with ->lookup()
 *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *	   we'd better make sure that there's no link(2) for them.
4083
 *	d) conversion from fhandle to dentry may come in the wrong moment - when
4084
 *	   we are removing the target. Solution: we will have to grab ->i_mutex
L
Linus Torvalds 已提交
4085
 *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4086
 *	   ->i_mutex on parents, which works but leads to some truly excessive
L
Linus Torvalds 已提交
4087 4088
 *	   locking].
 */
4089 4090
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
	       struct inode *new_dir, struct dentry *new_dentry,
M
Miklos Szeredi 已提交
4091
	       struct inode **delegated_inode, unsigned int flags)
L
Linus Torvalds 已提交
4092
{
4093 4094 4095 4096
	int error;
	bool is_dir = d_is_dir(old_dentry);
	const unsigned char *old_name;
	struct inode *source = old_dentry->d_inode;
S
Sage Weil 已提交
4097
	struct inode *target = new_dentry->d_inode;
M
Miklos Szeredi 已提交
4098 4099
	bool new_is_dir = false;
	unsigned max_links = new_dir->i_sb->s_max_links;
4100 4101 4102 4103 4104 4105 4106 4107

	if (source == target)
		return 0;

	error = may_delete(old_dir, old_dentry, is_dir);
	if (error)
		return error;

M
Miklos Szeredi 已提交
4108
	if (!target) {
4109
		error = may_create(new_dir, new_dentry);
M
Miklos Szeredi 已提交
4110 4111 4112 4113 4114 4115 4116 4117
	} else {
		new_is_dir = d_is_dir(new_dentry);

		if (!(flags & RENAME_EXCHANGE))
			error = may_delete(new_dir, new_dentry, is_dir);
		else
			error = may_delete(new_dir, new_dentry, new_is_dir);
	}
4118 4119 4120
	if (error)
		return error;

M
Miklos Szeredi 已提交
4121
	if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
4122
		return -EPERM;
L
Linus Torvalds 已提交
4123

M
Miklos Szeredi 已提交
4124 4125 4126
	if (flags && !old_dir->i_op->rename2)
		return -EINVAL;

L
Linus Torvalds 已提交
4127 4128 4129 4130
	/*
	 * If we are going to change the parent - check write permissions,
	 * we'll need to flip '..'.
	 */
M
Miklos Szeredi 已提交
4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141
	if (new_dir != old_dir) {
		if (is_dir) {
			error = inode_permission(source, MAY_WRITE);
			if (error)
				return error;
		}
		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
			error = inode_permission(target, MAY_WRITE);
			if (error)
				return error;
		}
L
Linus Torvalds 已提交
4142 4143
	}

4144 4145
	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
				      flags);
L
Linus Torvalds 已提交
4146 4147 4148
	if (error)
		return error;

4149
	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
4150
	dget(new_dentry);
M
Miklos Szeredi 已提交
4151
	if (!is_dir || (flags & RENAME_EXCHANGE))
4152 4153
		lock_two_nondirectories(source, target);
	else if (target)
4154
		mutex_lock(&target->i_mutex);
S
Sage Weil 已提交
4155 4156

	error = -EBUSY;
4157
	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
S
Sage Weil 已提交
4158 4159
		goto out;

M
Miklos Szeredi 已提交
4160
	if (max_links && new_dir != old_dir) {
4161
		error = -EMLINK;
M
Miklos Szeredi 已提交
4162
		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4163
			goto out;
M
Miklos Szeredi 已提交
4164 4165 4166 4167 4168 4169 4170
		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
		    old_dir->i_nlink >= max_links)
			goto out;
	}
	if (is_dir && !(flags & RENAME_EXCHANGE) && target)
		shrink_dcache_parent(new_dentry);
	if (!is_dir) {
4171
		error = try_break_deleg(source, delegated_inode);
4172 4173
		if (error)
			goto out;
M
Miklos Szeredi 已提交
4174 4175 4176 4177 4178
	}
	if (target && !new_is_dir) {
		error = try_break_deleg(target, delegated_inode);
		if (error)
			goto out;
4179
	}
M
Miklos Szeredi 已提交
4180
	if (!old_dir->i_op->rename2) {
M
Miklos Szeredi 已提交
4181 4182 4183
		error = old_dir->i_op->rename(old_dir, old_dentry,
					      new_dir, new_dentry);
	} else {
M
Miklos Szeredi 已提交
4184
		WARN_ON(old_dir->i_op->rename != NULL);
M
Miklos Szeredi 已提交
4185 4186 4187
		error = old_dir->i_op->rename2(old_dir, old_dentry,
					       new_dir, new_dentry, flags);
	}
S
Sage Weil 已提交
4188 4189 4190
	if (error)
		goto out;

M
Miklos Szeredi 已提交
4191
	if (!(flags & RENAME_EXCHANGE) && target) {
4192 4193
		if (is_dir)
			target->i_flags |= S_DEAD;
S
Sage Weil 已提交
4194
		dont_mount(new_dentry);
4195
		detach_mounts(new_dentry);
4196
	}
M
Miklos Szeredi 已提交
4197 4198 4199 4200 4201 4202
	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
		if (!(flags & RENAME_EXCHANGE))
			d_move(old_dentry, new_dentry);
		else
			d_exchange(old_dentry, new_dentry);
	}
S
Sage Weil 已提交
4203
out:
M
Miklos Szeredi 已提交
4204
	if (!is_dir || (flags & RENAME_EXCHANGE))
4205 4206 4207
		unlock_two_nondirectories(source, target);
	else if (target)
		mutex_unlock(&target->i_mutex);
L
Linus Torvalds 已提交
4208
	dput(new_dentry);
M
Miklos Szeredi 已提交
4209
	if (!error) {
4210
		fsnotify_move(old_dir, new_dir, old_name, is_dir,
M
Miklos Szeredi 已提交
4211 4212 4213 4214 4215 4216
			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
		if (flags & RENAME_EXCHANGE) {
			fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
				      new_is_dir, NULL, new_dentry);
		}
	}
R
Robert Love 已提交
4217 4218
	fsnotify_oldname_free(old_name);

L
Linus Torvalds 已提交
4219 4220
	return error;
}
4221
EXPORT_SYMBOL(vfs_rename);
L
Linus Torvalds 已提交
4222

M
Miklos Szeredi 已提交
4223 4224
/*
 * renameat2(2): rename @oldname (relative to @olddfd) to @newname
 * (relative to @newdfd).
 *
 * @flags may contain RENAME_NOREPLACE, RENAME_EXCHANGE and RENAME_WHITEOUT;
 * any other bit, or RENAME_EXCHANGE combined with either of the other two,
 * yields -EINVAL.  RENAME_WHITEOUT additionally requires CAP_MKNOD (-EPERM
 * otherwise).
 *
 * Both parents are looked up, locked with lock_rename(), the last
 * components are looked up under those locks and the result is passed to
 * vfs_rename().  If vfs_rename() reports an inode needing a delegation
 * break, the break is waited for after unlocking and the locked section is
 * redone (retry_deleg).  If retry_estale() asks for it, everything is
 * redone from the parent lookups with LOOKUP_REVAL (retry).
 *
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, unsigned int, flags)
{
	struct dentry *old_dir, *new_dir;
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
	struct nameidata oldnd, newnd;
	struct inode *delegated_inode = NULL;
	struct filename *from;
	struct filename *to;
	unsigned int lookup_flags = 0;
	bool should_retry = false;
	int error;

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		return -EINVAL;

	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
	    (flags & RENAME_EXCHANGE))
		return -EINVAL;

	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
		return -EPERM;

retry:
	from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
	if (IS_ERR(from)) {
		error = PTR_ERR(from);
		goto exit;
	}

	to = user_path_parent(newdfd, newname, &newnd, lookup_flags);
	if (IS_ERR(to)) {
		error = PTR_ERR(to);
		goto exit1;
	}

	error = -EXDEV;
	if (oldnd.path.mnt != newnd.path.mnt)
		goto exit2;

	old_dir = oldnd.path.dentry;
	error = -EBUSY;
	if (oldnd.last_type != LAST_NORM)
		goto exit2;

	new_dir = newnd.path.dentry;
	/* with RENAME_NOREPLACE a non-normal last component is -EEXIST */
	if (flags & RENAME_NOREPLACE)
		error = -EEXIST;
	if (newnd.last_type != LAST_NORM)
		goto exit2;

	error = mnt_want_write(oldnd.path.mnt);
	if (error)
		goto exit2;

	oldnd.flags &= ~LOOKUP_PARENT;
	newnd.flags &= ~LOOKUP_PARENT;
	if (!(flags & RENAME_EXCHANGE))
		newnd.flags |= LOOKUP_RENAME_TARGET;

retry_deleg:
	trap = lock_rename(new_dir, old_dir);

	old_dentry = lookup_hash(&oldnd);
	error = PTR_ERR(old_dentry);
	if (IS_ERR(old_dentry))
		goto exit3;
	/* source must exist */
	error = -ENOENT;
	if (d_is_negative(old_dentry))
		goto exit4;
	new_dentry = lookup_hash(&newnd);
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto exit4;
	error = -EEXIST;
	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
		goto exit5;
	if (flags & RENAME_EXCHANGE) {
		/* for an exchange the target must exist as well */
		error = -ENOENT;
		if (d_is_negative(new_dentry))
			goto exit5;

		if (!d_is_dir(new_dentry)) {
			error = -ENOTDIR;
			if (newnd.last.name[newnd.last.len])
				goto exit5;
		}
	}
	/* unless the source is a directory trailing slashes give -ENOTDIR */
	if (!d_is_dir(old_dentry)) {
		error = -ENOTDIR;
		if (oldnd.last.name[oldnd.last.len])
			goto exit5;
		if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len])
			goto exit5;
	}
	/* source should not be ancestor of target */
	error = -EINVAL;
	if (old_dentry == trap)
		goto exit5;
	/* target should not be an ancestor of source */
	if (!(flags & RENAME_EXCHANGE))
		error = -ENOTEMPTY;
	if (new_dentry == trap)
		goto exit5;

	error = security_path_rename(&oldnd.path, old_dentry,
				     &newnd.path, new_dentry, flags);
	if (error)
		goto exit5;
	error = vfs_rename(old_dir->d_inode, old_dentry,
			   new_dir->d_inode, new_dentry,
			   &delegated_inode, flags);
exit5:
	dput(new_dentry);
exit4:
	dput(old_dentry);
exit3:
	unlock_rename(new_dir, old_dir);
	if (delegated_inode) {
		/* locks are dropped: wait for the break, then redo the lookups */
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	mnt_drop_write(oldnd.path.mnt);
exit2:
	/* remember the decision; the paths and names must be released first */
	if (retry_estale(error, lookup_flags))
		should_retry = true;
	path_put(&newnd.path);
	putname(to);
exit1:
	path_put(&oldnd.path);
	putname(from);
	if (should_retry) {
		should_retry = false;
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
exit:
	return error;
}

M
Miklos Szeredi 已提交
4367 4368 4369 4370 4371 4372
/*
 * renameat(2): same as renameat2() called with an empty flag set.
 */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	const unsigned int no_flags = 0;

	return sys_renameat2(olddfd, oldname, newdfd, newname, no_flags);
}

4373
/*
 * rename(2): renameat2() with both names resolved relative to the current
 * working directory (AT_FDCWD) and no flags.
 */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
	return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}

M
Miklos Szeredi 已提交
4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391
int vfs_whiteout(struct inode *dir, struct dentry *dentry)
{
	int error = may_create(dir, dentry);
	if (error)
		return error;

	if (!dir->i_op->mknod)
		return -EPERM;

	return dir->i_op->mknod(dir, dentry,
				S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
}
EXPORT_SYMBOL(vfs_whiteout);

A
Al Viro 已提交
4392
/*
 * readlink_copy - copy a link body to a user buffer
 * @buffer:	user-space destination
 * @buflen:	size of @buffer in bytes
 * @link:	NUL-terminated link body, or an ERR_PTR()-encoded error
 *
 * Copies min(strlen(@link), @buflen) bytes; the terminating NUL is not
 * copied.  Returns the number of bytes copied, -EFAULT if the copy to
 * user space fails, or the error encoded in @link.
 */
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
	int len = PTR_ERR(link);
	if (IS_ERR(link))
		goto out;

	len = strlen(link);
	/* truncate silently when the caller's buffer is too small */
	if (len > (unsigned) buflen)
		len = buflen;
	if (copy_to_user(buffer, link, len))
		len = -EFAULT;
out:
	return len;
}
EXPORT_SYMBOL(readlink_copy);
L
Linus Torvalds 已提交
4407 4408 4409 4410 4411 4412 4413 4414 4415

/*
 * A helper for ->readlink().  This should be used *ONLY* for symlinks that
 * have ->follow_link() touching nd only in nd_set_link().  Using (or not
 * using) it for any given inode is up to filesystem.
 *
 * Obtains the link body through the inode's ->follow_link(), copies it to
 * @buffer with readlink_copy() and then calls ->put_link() if the inode
 * has one.  Returns the readlink_copy() result, or the error returned by
 * ->follow_link().
 */
int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	struct nameidata nd;
	void *cookie;
	int res;

	/* only the depth field of the on-stack nameidata is set up here */
	nd.depth = 0;
	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
	if (IS_ERR(cookie))
		return PTR_ERR(cookie);

	res = readlink_copy(buffer, buflen, nd_get_link(&nd));
	if (dentry->d_inode->i_op->put_link)
		dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
	return res;
}
EXPORT_SYMBOL(generic_readlink);
L
Linus Torvalds 已提交
4430 4431 4432 4433

/* get the link contents into pagecache */
static char *page_getlink(struct dentry * dentry, struct page **ppage)
{
4434 4435
	char *kaddr;
	struct page *page;
L
Linus Torvalds 已提交
4436
	struct address_space *mapping = dentry->d_inode->i_mapping;
4437
	page = read_mapping_page(mapping, 0, NULL);
L
Linus Torvalds 已提交
4438
	if (IS_ERR(page))
4439
		return (char*)page;
L
Linus Torvalds 已提交
4440
	*ppage = page;
4441 4442 4443
	kaddr = kmap(page);
	nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
	return kaddr;
L
Linus Torvalds 已提交
4444 4445 4446 4447 4448
}

/*
 * page_readlink - ->readlink() for symlinks whose body lives in page 0
 *
 * Fetches the link body with page_getlink(), copies it to @buffer with
 * readlink_copy(), then unmaps and releases the page if one was obtained.
 * Returns the readlink_copy() result.
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	/* stays NULL when page_getlink() fails, so cleanup is skipped */
	struct page *page = NULL;
	int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
	if (page) {
		kunmap(page);
		page_cache_release(page);
	}
	return res;
}
EXPORT_SYMBOL(page_readlink);
L
Linus Torvalds 已提交
4457

4458
void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
L
Linus Torvalds 已提交
4459
{
4460
	struct page *page = NULL;
L
Linus Torvalds 已提交
4461
	nd_set_link(nd, page_getlink(dentry, &page));
4462
	return page;
L
Linus Torvalds 已提交
4463
}
4464
EXPORT_SYMBOL(page_follow_link_light);
L
Linus Torvalds 已提交
4465

4466
/*
 * page_put_link - ->put_link() counterpart of page_follow_link_light()
 *
 * @cookie is the page returned by ->follow_link(); a NULL cookie means
 * there is nothing to undo.  @dentry and @nd are not used.
 */
void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *page = cookie;

	if (page) {
		kunmap(page);
		page_cache_release(page);
	}
}
EXPORT_SYMBOL(page_put_link);
L
Linus Torvalds 已提交
4476

4477 4478 4479 4480
/*
 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
 */
int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
L
Linus Torvalds 已提交
4481 4482
{
	struct address_space *mapping = inode->i_mapping;
4483
	struct page *page;
4484
	void *fsdata;
4485
	int err;
L
Linus Torvalds 已提交
4486
	char *kaddr;
4487 4488 4489
	unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
	if (nofs)
		flags |= AOP_FLAG_NOFS;
L
Linus Torvalds 已提交
4490

4491
retry:
4492
	err = pagecache_write_begin(NULL, mapping, 0, len-1,
4493
				flags, &page, &fsdata);
L
Linus Torvalds 已提交
4494
	if (err)
4495 4496
		goto fail;

4497
	kaddr = kmap_atomic(page);
L
Linus Torvalds 已提交
4498
	memcpy(kaddr, symname, len-1);
4499
	kunmap_atomic(kaddr);
4500 4501 4502

	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
							page, fsdata);
L
Linus Torvalds 已提交
4503 4504
	if (err < 0)
		goto fail;
4505 4506 4507
	if (err < len-1)
		goto retry;

L
Linus Torvalds 已提交
4508 4509 4510 4511 4512
	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}
4513
EXPORT_SYMBOL(__page_symlink);
L
Linus Torvalds 已提交
4514

4515 4516 4517
int page_symlink(struct inode *inode, const char *symname, int len)
{
	return __page_symlink(inode, symname, len,
4518
			!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
4519
}
4520
EXPORT_SYMBOL(page_symlink);
4521

4522
const struct inode_operations page_symlink_inode_operations = {
L
Linus Torvalds 已提交
4523 4524 4525 4526 4527
	.readlink	= generic_readlink,
	.follow_link	= page_follow_link_light,
	.put_link	= page_put_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);