namei.c 93.7 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
18
#include <linux/export.h>
19
#include <linux/kernel.h>
L
Linus Torvalds 已提交
20 21 22 23
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
R
Robert Love 已提交
24
#include <linux/fsnotify.h>
L
Linus Torvalds 已提交
25 26
#include <linux/personality.h>
#include <linux/security.h>
M
Mimi Zohar 已提交
27
#include <linux/ima.h>
L
Linus Torvalds 已提交
28 29 30
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
31
#include <linux/capability.h>
32
#include <linux/file.h>
33
#include <linux/fcntl.h>
34
#include <linux/device_cgroup.h>
35
#include <linux/fs_struct.h>
36
#include <linux/posix_acl.h>
L
Linus Torvalds 已提交
37 38
#include <asm/uaccess.h>

39
#include "internal.h"
40
#include "mount.h"
41

L
Linus Torvalds 已提交
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOs, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
120
/*
 * getname_flags - copy a user-supplied pathname into a kernel buffer
 * @filename: user-space pointer to the pathname
 * @flags:    lookup flags; only LOOKUP_EMPTY is examined here
 * @empty:    if non-NULL, set to 1 when the copied string is empty
 *
 * Returns the buffer obtained from __getname() on success, or an ERR_PTR():
 * -ENOMEM if no buffer could be obtained, the negative value returned by
 * strncpy_from_user() if the copy failed, -ENOENT for an empty name unless
 * LOOKUP_EMPTY is set in @flags, and -ENAMETOOLONG if the copied length is
 * not below PATH_MAX.  On every error path the buffer is released with
 * __putname().
 */
static char *getname_flags(const char __user *filename, int flags, int *empty)
{
	char *result = __getname(), *err;
	int len;

	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

	len = strncpy_from_user(result, filename, PATH_MAX);
	err = ERR_PTR(len);
	if (unlikely(len < 0))
		goto error;

	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
			*empty = 1;
		err = ERR_PTR(-ENOENT);
		if (!(flags & LOOKUP_EMPTY))
			goto error;
	}

	/* Preload the error so falling through to "error" reports it. */
	err = ERR_PTR(-ENAMETOOLONG);
	if (likely(len < PATH_MAX)) {
		audit_getname(result);
		return result;
	}

error:
	__putname(result);
	return err;
}

A
Al Viro 已提交
153 154
/*
 * getname - copy a user-space pathname into a kernel buffer.
 *
 * Thin wrapper around getname_flags() with no lookup flags and no
 * "empty" out-parameter; returns whatever getname_flags() returns.
 */
char *getname(const char __user * filename)
{
	return getname_flags(filename, 0, NULL);
}

L
Linus Torvalds 已提交
158 159 160
#ifdef CONFIG_AUDITSYSCALL
/*
 * putname - release a pathname buffer (CONFIG_AUDITSYSCALL variant).
 * @name: buffer to release
 *
 * When audit_dummy_context() reports false the buffer is handed to
 * audit_putname(); otherwise it is released directly with __putname().
 */
void putname(const char *name)
{
	if (unlikely(!audit_dummy_context()))
		audit_putname(name);
	else
		__putname(name);
}
EXPORT_SYMBOL(putname);
#endif

169 170
static int check_acl(struct inode *inode, int mask)
{
171
#ifdef CONFIG_FS_POSIX_ACL
172 173 174
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
175 176
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
177
	                return -EAGAIN;
178 179 180
		/* no ->get_acl() calls in RCU mode... */
		if (acl == ACL_NOT_CACHED)
			return -ECHILD;
A
Ari Savolainen 已提交
181
	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
182 183 184 185 186
	}

	acl = get_cached_acl(inode, ACL_TYPE_ACCESS);

	/*
187 188 189
	 * A filesystem can force a ACL callback by just never filling the
	 * ACL cache. But normally you'd fill the cache either at inode
	 * instantiation time, or on the first ->get_acl call.
190
	 *
191 192
	 * If the filesystem doesn't have a get_acl() function at all, we'll
	 * just create the negative cache entry.
193 194
	 */
	if (acl == ACL_NOT_CACHED) {
195 196 197 198 199 200 201 202
	        if (inode->i_op->get_acl) {
			acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
			if (IS_ERR(acl))
				return PTR_ERR(acl);
		} else {
		        set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
		        return -EAGAIN;
		}
203 204 205 206 207 208 209
	}

	if (acl) {
	        int error = posix_acl_permission(inode, acl, mask);
	        posix_acl_release(acl);
	        return error;
	}
210
#endif
211 212 213 214

	return -EAGAIN;
}

215
/*
216
 * This does the basic permission checking
L
Linus Torvalds 已提交
217
 */
218
static int acl_permission_check(struct inode *inode, int mask)
L
Linus Torvalds 已提交
219
{
220
	unsigned int mode = inode->i_mode;
L
Linus Torvalds 已提交
221

222
	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
L
Linus Torvalds 已提交
223 224
		mode >>= 6;
	else {
225
		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
226
			int error = check_acl(inode, mask);
227 228
			if (error != -EAGAIN)
				return error;
L
Linus Torvalds 已提交
229 230 231 232 233 234 235 236 237
		}

		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/*
	 * If the DACs are ok we don't need any capability check.
	 */
238
	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
L
Linus Torvalds 已提交
239
		return 0;
240 241 242 243
	return -EACCES;
}

/**
244
 * generic_permission -  check for access rights on a Posix-like filesystem
245
 * @inode:	inode to check access rights for
246
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
247 248 249 250
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
251 252 253 254 255
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
256
 */
257
int generic_permission(struct inode *inode, int mask)
258 259 260 261
{
	int ret;

	/*
262
	 * Do the basic permission checks.
263
	 */
264
	ret = acl_permission_check(inode, mask);
265 266
	if (ret != -EACCES)
		return ret;
L
Linus Torvalds 已提交
267

268 269
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
270
		if (inode_capable(inode, CAP_DAC_OVERRIDE))
271 272
			return 0;
		if (!(mask & MAY_WRITE))
273
			if (inode_capable(inode, CAP_DAC_READ_SEARCH))
274 275 276
				return 0;
		return -EACCES;
	}
L
Linus Torvalds 已提交
277 278
	/*
	 * Read/write DACs are always overridable.
279 280
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
L
Linus Torvalds 已提交
281
	 */
282
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
283
		if (inode_capable(inode, CAP_DAC_OVERRIDE))
L
Linus Torvalds 已提交
284 285 286 287 288
			return 0;

	/*
	 * Searching includes executable on directories, else just read.
	 */
289
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
290
	if (mask == MAY_READ)
291
		if (inode_capable(inode, CAP_DAC_READ_SEARCH))
L
Linus Torvalds 已提交
292 293 294 295 296
			return 0;

	return -EACCES;
}

297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316
/*
 * Dispatch a permission check to the right implementation.
 *
 * IOP_FASTPERM in inode->i_opflags caches the fact that the inode has no
 * ->permission method, so the common case can go straight to
 * generic_permission() without looking at inode->i_op at all.  When the
 * flag is clear and the inode does supply ->permission, that method is
 * used; otherwise the flag is set (under i_lock) before falling back to
 * generic_permission().
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	/* Fast case: already known to have no ->permission method. */
	if (likely(inode->i_opflags & IOP_FASTPERM))
		return generic_permission(inode, mask);

	if (likely(inode->i_op->permission))
		return inode->i_op->permission(inode, mask);

	/* This gets set once for the inode lifetime */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_FASTPERM;
	spin_unlock(&inode->i_lock);

	return generic_permission(inode, mask);
}

C
Christoph Hellwig 已提交
317
/**
D
David Howells 已提交
318 319 320
 * __inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
C
Christoph Hellwig 已提交
321
 *
D
David Howells 已提交
322
 * Check for read/write/execute permissions on an inode.
323 324
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
D
David Howells 已提交
325 326 327
 *
 * This does not check for a read-only file system.  You probably want
 * inode_permission().
C
Christoph Hellwig 已提交
328
 */
D
David Howells 已提交
329
int __inode_permission(struct inode *inode, int mask)
L
Linus Torvalds 已提交
330
{
331
	int retval;
L
Linus Torvalds 已提交
332

333
	if (unlikely(mask & MAY_WRITE)) {
L
Linus Torvalds 已提交
334 335 336 337 338 339 340
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EACCES;
	}

341
	retval = do_inode_permission(inode, mask);
L
Linus Torvalds 已提交
342 343 344
	if (retval)
		return retval;

345 346 347 348
	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

349
	return security_inode_permission(inode, mask);
L
Linus Torvalds 已提交
350 351
}

D
David Howells 已提交
352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
 * @inode: Inode whose file type decides whether the read-only check applies
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 *
 * Returns -EROFS for a write request against a regular file, directory or
 * symlink on a superblock flagged MS_RDONLY, and 0 otherwise.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	umode_t mode;

	/* Only write requests are restricted at the superblock level. */
	if (likely(!(mask & MAY_WRITE)))
		return 0;

	if (!(sb->s_flags & MS_RDONLY))
		return 0;

	/* Nobody gets write access to a read-only fs. */
	mode = inode->i_mode;
	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
		return -EROFS;

	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Runs the superblock-level check first and, if that passes, the
 * inode-level check.  We use fs[ug]id for this, letting us set arbitrary
 * permissions for filesystem access without changing the "normal" UIDs
 * which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct inode *inode, int mask)
{
	int err = sb_permission(inode->i_sb, inode, mask);

	return err ? err : __inode_permission(inode, mask);
}

J
Jan Blunck 已提交
393 394 395 396 397 398 399 400 401 402 403 404 405
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Takes one reference on the vfsmount and one on the dentry that make up
 * @path, in that order.
 */
void path_get(struct path *path)
{
	struct vfsmount *mnt = path->mnt;
	struct dentry *dentry = path->dentry;

	mntget(mnt);
	dget(dentry);
}
EXPORT_SYMBOL(path_get);

J
Jan Blunck 已提交
406 407 408 409 410 411 412
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
void path_put(struct path *path)
L
Linus Torvalds 已提交
413
{
J
Jan Blunck 已提交
414 415
	dput(path->dentry);
	mntput(path->mnt);
L
Linus Torvalds 已提交
416
}
J
Jan Blunck 已提交
417
EXPORT_SYMBOL(path_put);
L
Linus Torvalds 已提交
418

A
Al Viro 已提交
419
/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
A
Al Viro 已提交
431 432 433
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry or NULL
434
 * Returns: 0 on success, -ECHILD on failure
N
Nick Piggin 已提交
435
 *
A
Al Viro 已提交
436 437 438
 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd or NULL.  Must be called from rcu-walk context.
N
Nick Piggin 已提交
439
 */
A
Al Viro 已提交
440
static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
N
Nick Piggin 已提交
441 442 443
{
	struct fs_struct *fs = current->fs;
	struct dentry *parent = nd->path.dentry;
444
	int want_root = 0;
N
Nick Piggin 已提交
445 446

	BUG_ON(!(nd->flags & LOOKUP_RCU));
447 448
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		want_root = 1;
N
Nick Piggin 已提交
449 450 451 452 453 454
		spin_lock(&fs->lock);
		if (nd->root.mnt != fs->root.mnt ||
				nd->root.dentry != fs->root.dentry)
			goto err_root;
	}
	spin_lock(&parent->d_lock);
A
Al Viro 已提交
455 456 457 458 459
	if (!dentry) {
		if (!__d_rcu_to_refcount(parent, nd->seq))
			goto err_parent;
		BUG_ON(nd->inode != parent->d_inode);
	} else {
460 461
		if (dentry->d_parent != parent)
			goto err_parent;
A
Al Viro 已提交
462 463 464 465 466 467 468 469 470 471 472 473 474 475
		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
		if (!__d_rcu_to_refcount(dentry, nd->seq))
			goto err_child;
		/*
		 * If the sequence check on the child dentry passed, then
		 * the child has not been removed from its parent. This
		 * means the parent dentry must be valid and able to take
		 * a reference at this point.
		 */
		BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
		BUG_ON(!parent->d_count);
		parent->d_count++;
		spin_unlock(&dentry->d_lock);
	}
N
Nick Piggin 已提交
476
	spin_unlock(&parent->d_lock);
477
	if (want_root) {
N
Nick Piggin 已提交
478 479 480 481 482 483
		path_get(&nd->root);
		spin_unlock(&fs->lock);
	}
	mntget(nd->path.mnt);

	rcu_read_unlock();
A
Andi Kleen 已提交
484
	br_read_unlock(&vfsmount_lock);
N
Nick Piggin 已提交
485 486
	nd->flags &= ~LOOKUP_RCU;
	return 0;
A
Al Viro 已提交
487 488

err_child:
N
Nick Piggin 已提交
489
	spin_unlock(&dentry->d_lock);
A
Al Viro 已提交
490
err_parent:
N
Nick Piggin 已提交
491 492
	spin_unlock(&parent->d_lock);
err_root:
493
	if (want_root)
N
Nick Piggin 已提交
494 495 496 497
		spin_unlock(&fs->lock);
	return -ECHILD;
}

498
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
499
{
500
	return dentry->d_op->d_revalidate(dentry, flags);
501 502
}

503 504 505
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
506
 *
507 508 509 510 511
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
512
 */
513
static int complete_walk(struct nameidata *nd)
514
{
A
Al Viro 已提交
515
	struct dentry *dentry = nd->path.dentry;
516 517
	int status;

518 519 520 521 522 523 524 525
	if (nd->flags & LOOKUP_RCU) {
		nd->flags &= ~LOOKUP_RCU;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		spin_lock(&dentry->d_lock);
		if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
			spin_unlock(&dentry->d_lock);
			rcu_read_unlock();
A
Andi Kleen 已提交
526
			br_read_unlock(&vfsmount_lock);
527 528 529 530 531 532
			return -ECHILD;
		}
		BUG_ON(nd->inode != dentry->d_inode);
		spin_unlock(&dentry->d_lock);
		mntget(nd->path.mnt);
		rcu_read_unlock();
A
Andi Kleen 已提交
533
		br_read_unlock(&vfsmount_lock);
534 535
	}

A
Al Viro 已提交
536 537 538 539
	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
540 541
		return 0;

A
Al Viro 已提交
542 543 544 545
	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
		return 0;

	/* Note: we do not d_invalidate() */
546
	status = d_revalidate(dentry, nd->flags);
547 548 549
	if (status > 0)
		return 0;

A
Al Viro 已提交
550
	if (!status)
551
		status = -ESTALE;
A
Al Viro 已提交
552

553
	path_put(&nd->path);
554 555 556
	return status;
}

A
Al Viro 已提交
557 558
static __always_inline void set_root(struct nameidata *nd)
{
559 560
	if (!nd->root.mnt)
		get_fs_root(current->fs, &nd->root);
A
Al Viro 已提交
561 562
}

563 564
static int link_path_walk(const char *, struct nameidata *);

N
Nick Piggin 已提交
565 566 567 568
/*
 * Variant of set_root() that does not call get_fs_root(): if nd->root.mnt
 * is unset, copy fs->root into nd->root and sample the root dentry's d_seq
 * into nd->seq, retrying while fs->seq reports a concurrent change.
 */
static __always_inline void set_root_rcu(struct nameidata *nd)
{
	if (!nd->root.mnt) {
		struct fs_struct *fs = current->fs;
		unsigned seq;

		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	}
}

579
static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
L
Linus Torvalds 已提交
580
{
N
Nick Piggin 已提交
581 582
	int ret;

L
Linus Torvalds 已提交
583 584 585 586
	if (IS_ERR(link))
		goto fail;

	if (*link == '/') {
A
Al Viro 已提交
587
		set_root(nd);
J
Jan Blunck 已提交
588
		path_put(&nd->path);
A
Al Viro 已提交
589 590
		nd->path = nd->root;
		path_get(&nd->root);
A
Al Viro 已提交
591
		nd->flags |= LOOKUP_JUMPED;
L
Linus Torvalds 已提交
592
	}
N
Nick Piggin 已提交
593
	nd->inode = nd->path.dentry->d_inode;
C
Christoph Hellwig 已提交
594

N
Nick Piggin 已提交
595 596
	ret = link_path_walk(link, nd);
	return ret;
L
Linus Torvalds 已提交
597
fail:
J
Jan Blunck 已提交
598
	path_put(&nd->path);
L
Linus Torvalds 已提交
599 600 601
	return PTR_ERR(link);
}

J
Jan Blunck 已提交
602
static void path_put_conditional(struct path *path, struct nameidata *nd)
603 604
{
	dput(path->dentry);
605
	if (path->mnt != nd->path.mnt)
606 607 608
		mntput(path->mnt);
}

609 610
static inline void path_to_nameidata(const struct path *path,
					struct nameidata *nd)
611
{
N
Nick Piggin 已提交
612 613 614 615
	if (!(nd->flags & LOOKUP_RCU)) {
		dput(nd->path.dentry);
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);
616
	}
N
Nick Piggin 已提交
617
	nd->path.mnt = path->mnt;
618
	nd->path.dentry = path->dentry;
619 620
}

C
Christoph Hellwig 已提交
621 622 623 624 625 626 627 628 629 630 631 632 633 634 635
/*
 * Helper to directly jump to a known parsed path from ->follow_link,
 * caller must have taken a reference to path beforehand.
 */
void nd_jump_link(struct nameidata *nd, struct path *path)
{
	path_put(&nd->path);

	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
	nd->flags |= LOOKUP_JUMPED;

	BUG_ON(nd->inode->i_op->follow_link);
}

636 637 638
static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
{
	struct inode *inode = link->dentry->d_inode;
639
	if (inode->i_op->put_link)
640 641 642 643
		inode->i_op->put_link(link->dentry, nd, cookie);
	path_put(link);
}

A
Al Viro 已提交
644
static __always_inline int
645
follow_link(struct path *link, struct nameidata *nd, void **p)
L
Linus Torvalds 已提交
646
{
647
	struct dentry *dentry = link->dentry;
648 649
	int error;
	char *s;
L
Linus Torvalds 已提交
650

651 652
	BUG_ON(nd->flags & LOOKUP_RCU);

A
Al Viro 已提交
653 654 655
	if (link->mnt == nd->path.mnt)
		mntget(link->mnt);

656 657 658 659
	error = -ELOOP;
	if (unlikely(current->total_link_count >= 40))
		goto out_put_nd_path;

660 661 662
	cond_resched();
	current->total_link_count++;

A
Al Viro 已提交
663
	touch_atime(link);
L
Linus Torvalds 已提交
664
	nd_set_link(nd, NULL);
A
Al Viro 已提交
665

666
	error = security_inode_follow_link(link->dentry, nd);
667 668
	if (error)
		goto out_put_nd_path;
669

670
	nd->last_type = LAST_BIND;
A
Al Viro 已提交
671 672
	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
	error = PTR_ERR(*p);
673
	if (IS_ERR(*p))
674
		goto out_put_nd_path;
675 676 677 678 679

	error = 0;
	s = nd_get_link(nd);
	if (s) {
		error = __vfs_follow_link(nd, s);
C
Christoph Hellwig 已提交
680 681
		if (unlikely(error))
			put_link(nd, link, *p);
L
Linus Torvalds 已提交
682
	}
683 684 685 686 687 688

	return error;

out_put_nd_path:
	path_put(&nd->path);
	path_put(link);
L
Linus Torvalds 已提交
689 690 691
	return error;
}

N
Nick Piggin 已提交
692 693
/*
 * Step @path from the root of its mount to the mountpoint in the parent
 * mount.  Takes no references and no locks itself; @path is overwritten
 * in place.
 *
 * Returns 1 if @path was moved up, 0 if the mount is its own parent.
 */
static int follow_up_rcu(struct path *path)
{
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
	struct dentry *mountpoint;

	parent = mnt->mnt_parent;
	if (&parent->mnt == path->mnt)
		return 0;
	mountpoint = mnt->mnt_mountpoint;
	path->dentry = mountpoint;
	path->mnt = &parent->mnt;
	return 1;
}

707 708 709 710 711 712 713 714 715 716
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
A
Al Viro 已提交
717
int follow_up(struct path *path)
L
Linus Torvalds 已提交
718
{
719 720
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
L
Linus Torvalds 已提交
721
	struct dentry *mountpoint;
N
Nick Piggin 已提交
722

A
Andi Kleen 已提交
723
	br_read_lock(&vfsmount_lock);
724
	parent = mnt->mnt_parent;
A
Al Viro 已提交
725
	if (parent == mnt) {
A
Andi Kleen 已提交
726
		br_read_unlock(&vfsmount_lock);
L
Linus Torvalds 已提交
727 728
		return 0;
	}
729
	mntget(&parent->mnt);
730
	mountpoint = dget(mnt->mnt_mountpoint);
A
Andi Kleen 已提交
731
	br_read_unlock(&vfsmount_lock);
A
Al Viro 已提交
732 733 734
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
735
	path->mnt = &parent->mnt;
L
Linus Torvalds 已提交
736 737 738
	return 1;
}

N
Nick Piggin 已提交
739
/*
740 741 742
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
L
Linus Torvalds 已提交
743
 */
744 745
static int follow_automount(struct path *path, unsigned flags,
			    bool *need_mntput)
N
Nick Piggin 已提交
746
{
747
	struct vfsmount *mnt;
748
	int err;
749 750 751 752

	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
		return -EREMOTE;

753 754 755 756 757 758 759 760 761 762
	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
763
	 */
764
	if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
765
		     LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
766 767 768
	    path->dentry->d_inode)
		return -EISDIR;

769 770 771 772 773 774 775 776 777 778 779 780 781 782 783
	current->total_link_count++;
	if (current->total_link_count >= 40)
		return -ELOOP;

	mnt = path->dentry->d_op->d_automount(path);
	if (IS_ERR(mnt)) {
		/*
		 * The filesystem is allowed to return -EISDIR here to indicate
		 * it doesn't want to automount.  For instance, autofs would do
		 * this so that its userspace daemon can mount on this dentry.
		 *
		 * However, we can only permit this if it's a terminal point in
		 * the path being looked up; if it wasn't then the remainder of
		 * the path is inaccessible and we should say so.
		 */
A
Al Viro 已提交
784
		if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
785 786
			return -EREMOTE;
		return PTR_ERR(mnt);
N
Nick Piggin 已提交
787
	}
788

789 790
	if (!mnt) /* mount collision */
		return 0;
N
Nick Piggin 已提交
791

792 793 794 795 796
	if (!*need_mntput) {
		/* lock_mount() may release path->mnt on error */
		mntget(path->mnt);
		*need_mntput = true;
	}
797
	err = finish_automount(mnt, path);
798

799 800 801
	switch (err) {
	case -EBUSY:
		/* Someone else made a mount here whilst we were busy */
802
		return 0;
803
	case 0:
804
		path_put(path);
805 806 807
		path->mnt = mnt;
		path->dentry = dget(mnt->mnt_root);
		return 0;
808 809
	default:
		return err;
810
	}
811

A
Al Viro 已提交
812 813
}

814 815
/*
 * Handle a dentry that is managed in some way.
816
 * - Flagged for transit management (autofs)
817 818 819 820 821 822 823 824
 * - Flagged as mountpoint
 * - Flagged as automount point
 *
 * This may only be called in refwalk mode.
 *
 * Serialization is taken care of in namespace.c
 */
static int follow_managed(struct path *path, unsigned flags)
L
Linus Torvalds 已提交
825
{
826
	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
827 828
	unsigned managed;
	bool need_mntput = false;
829
	int ret = 0;
830 831 832 833 834 835 836

	/* Given that we're not holding a lock here, we retain the value in a
	 * local variable for each dentry as we look at it so that we don't see
	 * the components of that value change under us */
	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       managed &= DCACHE_MANAGED_DENTRY,
	       unlikely(managed != 0)) {
837 838 839 840 841
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
842
			ret = path->dentry->d_op->d_manage(path->dentry, false);
843
			if (ret < 0)
844
				break;
845 846
		}

847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869
		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				need_mntput = true;
				continue;
			}

			/* Something is mounted on this dentry in another
			 * namespace and/or whatever was mounted there in this
			 * namespace got unmounted before we managed to get the
			 * vfsmount_lock */
		}

		/* Handle an automount point */
		if (managed & DCACHE_NEED_AUTOMOUNT) {
			ret = follow_automount(path, flags, &need_mntput);
			if (ret < 0)
870
				break;
871 872 873 874 875
			continue;
		}

		/* We didn't change the current path point */
		break;
L
Linus Torvalds 已提交
876
	}
877 878 879 880 881

	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	if (ret == -EISDIR)
		ret = 0;
882
	return ret < 0 ? ret : need_mntput;
L
Linus Torvalds 已提交
883 884
}

885
int follow_down_one(struct path *path)
L
Linus Torvalds 已提交
886 887 888
{
	struct vfsmount *mounted;

A
Al Viro 已提交
889
	mounted = lookup_mnt(path);
L
Linus Torvalds 已提交
890
	if (mounted) {
A
Al Viro 已提交
891 892 893 894
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
L
Linus Torvalds 已提交
895 896 897 898 899
		return 1;
	}
	return 0;
}

900 901 902 903 904 905
static inline bool managed_dentry_might_block(struct dentry *dentry)
{
	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
		dentry->d_op->d_manage(dentry, true) < 0);
}

906
/*
907 908
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
909 910
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
911
			       struct inode **inode)
912
{
913
	for (;;) {
914
		struct mount *mounted;
915 916 917 918
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
919
		if (unlikely(managed_dentry_might_block(path->dentry)))
920
			return false;
921 922 923 924

		if (!d_mountpoint(path->dentry))
			break;

925 926 927
		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
		if (!mounted)
			break;
928 929
		path->mnt = &mounted->mnt;
		path->dentry = mounted->mnt.mnt_root;
930
		nd->flags |= LOOKUP_JUMPED;
931
		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
932 933 934 935 936 937
		/*
		 * Update the inode too. We don't need to re-check the
		 * dentry sequence number here after this d_inode read,
		 * because a mount-point is always pinned.
		 */
		*inode = path->dentry->d_inode;
938 939 940 941
	}
	return true;
}

942
static void follow_mount_rcu(struct nameidata *nd)
943
{
944
	while (d_mountpoint(nd->path.dentry)) {
945
		struct mount *mounted;
946
		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
947 948
		if (!mounted)
			break;
949 950
		nd->path.mnt = &mounted->mnt;
		nd->path.dentry = mounted->mnt.mnt_root;
951
		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
952 953 954
	}
}

N
Nick Piggin 已提交
955 956 957 958
static int follow_dotdot_rcu(struct nameidata *nd)
{
	set_root_rcu(nd);

959
	while (1) {
N
Nick Piggin 已提交
960 961 962 963 964 965 966 967 968 969 970
		if (nd->path.dentry == nd->root.dentry &&
		    nd->path.mnt == nd->root.mnt) {
			break;
		}
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
			struct dentry *old = nd->path.dentry;
			struct dentry *parent = old->d_parent;
			unsigned seq;

			seq = read_seqcount_begin(&parent->d_seq);
			if (read_seqcount_retry(&old->d_seq, nd->seq))
971
				goto failed;
N
Nick Piggin 已提交
972 973 974 975 976 977 978 979
			nd->path.dentry = parent;
			nd->seq = seq;
			break;
		}
		if (!follow_up_rcu(&nd->path))
			break;
		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
	}
980 981
	follow_mount_rcu(nd);
	nd->inode = nd->path.dentry->d_inode;
N
Nick Piggin 已提交
982
	return 0;
983 984 985

failed:
	nd->flags &= ~LOOKUP_RCU;
986 987
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
988
	rcu_read_unlock();
A
Andi Kleen 已提交
989
	br_read_unlock(&vfsmount_lock);
990
	return -ECHILD;
N
Nick Piggin 已提交
991 992
}

993 994 995 996 997
/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 */
998
int follow_down(struct path *path)
999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017
{
	unsigned managed;
	int ret;

	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held.
		 *
		 * We indicate to the filesystem if someone is trying to mount
		 * something here.  This gives autofs the chance to deny anyone
		 * other than its daemon the right to mount on its
		 * superstructure.
		 *
		 * The filesystem may sleep at this point.
		 */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
1018
			ret = path->dentry->d_op->d_manage(
1019
				path->dentry, false);
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041
			if (ret < 0)
				return ret == -EISDIR ? 0 : ret;
		}

		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (!mounted)
				break;
			dput(path->dentry);
			mntput(path->mnt);
			path->mnt = mounted;
			path->dentry = dget(mounted->mnt_root);
			continue;
		}

		/* Don't handle automount points here */
		break;
	}
	return 0;
}

1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057
/*
 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
 */
static void follow_mount(struct path *path)
{
	while (d_mountpoint(path->dentry)) {
		struct vfsmount *mounted = lookup_mnt(path);
		if (!mounted)
			break;
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
	}
}

N
Nick Piggin 已提交
1058
static void follow_dotdot(struct nameidata *nd)
L
Linus Torvalds 已提交
1059
{
A
Al Viro 已提交
1060
	set_root(nd);
1061

L
Linus Torvalds 已提交
1062
	while(1) {
1063
		struct dentry *old = nd->path.dentry;
L
Linus Torvalds 已提交
1064

A
Al Viro 已提交
1065 1066
		if (nd->path.dentry == nd->root.dentry &&
		    nd->path.mnt == nd->root.mnt) {
L
Linus Torvalds 已提交
1067 1068
			break;
		}
1069
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
A
Al Viro 已提交
1070 1071
			/* rare case of legitimate dget_parent()... */
			nd->path.dentry = dget_parent(nd->path.dentry);
L
Linus Torvalds 已提交
1072 1073 1074
			dput(old);
			break;
		}
A
Al Viro 已提交
1075
		if (!follow_up(&nd->path))
L
Linus Torvalds 已提交
1076 1077
			break;
	}
A
Al Viro 已提交
1078
	follow_mount(&nd->path);
N
Nick Piggin 已提交
1079
	nd->inode = nd->path.dentry->d_inode;
L
Linus Torvalds 已提交
1080 1081
}

1082
/*
M
Miklos Szeredi 已提交
1083 1084 1085 1086 1087
 * This looks up the name in dcache, possibly revalidates the old dentry and
 * allocates a new one if not found or not valid.  In the need_lookup argument
 * returns whether i_op->lookup is necessary.
 *
 * dir->d_inode->i_mutex must be held
1088
 */
M
Miklos Szeredi 已提交
1089
static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
1090
				    unsigned int flags, bool *need_lookup)
1091 1092
{
	struct dentry *dentry;
M
Miklos Szeredi 已提交
1093
	int error;
1094

M
Miklos Szeredi 已提交
1095 1096 1097 1098 1099 1100
	*need_lookup = false;
	dentry = d_lookup(dir, name);
	if (dentry) {
		if (d_need_lookup(dentry)) {
			*need_lookup = true;
		} else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1101
			error = d_revalidate(dentry, flags);
M
Miklos Szeredi 已提交
1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112
			if (unlikely(error <= 0)) {
				if (error < 0) {
					dput(dentry);
					return ERR_PTR(error);
				} else if (!d_invalidate(dentry)) {
					dput(dentry);
					dentry = NULL;
				}
			}
		}
	}
1113

M
Miklos Szeredi 已提交
1114 1115 1116 1117
	if (!dentry) {
		dentry = d_alloc(dir, name);
		if (unlikely(!dentry))
			return ERR_PTR(-ENOMEM);
1118

M
Miklos Szeredi 已提交
1119
		*need_lookup = true;
1120 1121 1122 1123
	}
	return dentry;
}

1124
/*
M
Miklos Szeredi 已提交
1125 1126 1127 1128
 * Call i_op->lookup on the dentry.  The dentry must be negative but may be
 * hashed if it was pouplated with DCACHE_NEED_LOOKUP.
 *
 * dir->d_inode->i_mutex must be held
1129
 */
M
Miklos Szeredi 已提交
1130
static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
1131
				  unsigned int flags)
1132 1133 1134 1135
{
	struct dentry *old;

	/* Don't create child dentry for a dead directory. */
M
Miklos Szeredi 已提交
1136
	if (unlikely(IS_DEADDIR(dir))) {
1137
		dput(dentry);
1138
		return ERR_PTR(-ENOENT);
1139
	}
1140

1141
	old = dir->i_op->lookup(dir, dentry, flags);
1142 1143 1144 1145 1146 1147 1148
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
	return dentry;
}

1149
static struct dentry *__lookup_hash(struct qstr *name,
1150
		struct dentry *base, unsigned int flags)
1151
{
M
Miklos Szeredi 已提交
1152
	bool need_lookup;
1153 1154
	struct dentry *dentry;

1155
	dentry = lookup_dcache(name, base, flags, &need_lookup);
M
Miklos Szeredi 已提交
1156 1157
	if (!need_lookup)
		return dentry;
1158

1159
	return lookup_real(base->d_inode, dentry, flags);
1160 1161
}

L
Linus Torvalds 已提交
1162 1163 1164 1165 1166
/*
 *  It's more convoluted than I'd like it to be, but... it's still fairly
 *  small and for now I'd prefer to have fast path as straight as possible.
 *  It _is_ time-critical.
 */
M
Miklos Szeredi 已提交
1167 1168
static int lookup_fast(struct nameidata *nd, struct qstr *name,
		       struct path *path, struct inode **inode)
L
Linus Torvalds 已提交
1169
{
1170
	struct vfsmount *mnt = nd->path.mnt;
N
Nick Piggin 已提交
1171
	struct dentry *dentry, *parent = nd->path.dentry;
A
Al Viro 已提交
1172 1173
	int need_reval = 1;
	int status = 1;
1174 1175
	int err;

1176 1177 1178 1179 1180
	/*
	 * Rename seqlock is not required here because in the off chance
	 * of a false negative due to a concurrent rename, we're going to
	 * do the non-racy lookup, below.
	 */
N
Nick Piggin 已提交
1181 1182
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
1183
		dentry = __d_lookup_rcu(parent, name, &seq, nd->inode);
A
Al Viro 已提交
1184 1185 1186
		if (!dentry)
			goto unlazy;

1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201
		/*
		 * This sequence count validates that the inode matches
		 * the dentry name information from lookup.
		 */
		*inode = dentry->d_inode;
		if (read_seqcount_retry(&dentry->d_seq, seq))
			return -ECHILD;

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 *
		 * The memory barrier in read_seqcount_begin of child is
		 *  enough, we can use __read_seqcount_retry here.
		 */
N
Nick Piggin 已提交
1202 1203 1204
		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
			return -ECHILD;
		nd->seq = seq;
A
Al Viro 已提交
1205

1206 1207
		if (unlikely(d_need_lookup(dentry)))
			goto unlazy;
1208
		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1209
			status = d_revalidate(dentry, nd->flags);
A
Al Viro 已提交
1210 1211 1212 1213 1214
			if (unlikely(status <= 0)) {
				if (status != -ECHILD)
					need_reval = 0;
				goto unlazy;
			}
1215
		}
N
Nick Piggin 已提交
1216 1217
		path->mnt = mnt;
		path->dentry = dentry;
1218 1219 1220 1221 1222
		if (unlikely(!__follow_mount_rcu(nd, path, inode)))
			goto unlazy;
		if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
			goto unlazy;
		return 0;
A
Al Viro 已提交
1223
unlazy:
A
Al Viro 已提交
1224 1225
		if (unlazy_walk(nd, dentry))
			return -ECHILD;
A
Al Viro 已提交
1226 1227
	} else {
		dentry = __d_lookup(parent, name);
1228
	}
A
Al Viro 已提交
1229

1230 1231 1232 1233
	if (unlikely(!dentry))
		goto need_lookup;

	if (unlikely(d_need_lookup(dentry))) {
1234
		dput(dentry);
1235
		goto need_lookup;
A
Al Viro 已提交
1236
	}
1237

A
Al Viro 已提交
1238
	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1239
		status = d_revalidate(dentry, nd->flags);
A
Al Viro 已提交
1240 1241 1242 1243 1244 1245 1246
	if (unlikely(status <= 0)) {
		if (status < 0) {
			dput(dentry);
			return status;
		}
		if (!d_invalidate(dentry)) {
			dput(dentry);
1247
			goto need_lookup;
A
Al Viro 已提交
1248
		}
1249
	}
M
Miklos Szeredi 已提交
1250

1251 1252 1253
	path->mnt = mnt;
	path->dentry = dentry;
	err = follow_managed(path, nd->flags);
1254 1255
	if (unlikely(err < 0)) {
		path_put_conditional(path, nd);
1256
		return err;
1257
	}
1258 1259
	if (err)
		nd->flags |= LOOKUP_JUMPED;
1260
	*inode = path->dentry->d_inode;
L
Linus Torvalds 已提交
1261
	return 0;
1262 1263

need_lookup:
M
Miklos Szeredi 已提交
1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274
	return 1;
}

/*
 * Fast lookup failed, do it the slow way: look @name up with
 * __lookup_hash() while holding the parent directory's i_mutex, then let
 * follow_managed() process the result.
 *
 * Returns 0 with @path filled in, or a negative error.
 */
static int lookup_slow(struct nameidata *nd, struct qstr *name,
		       struct path *path)
{
	struct dentry *parent = nd->path.dentry;
	struct dentry *found;
	int managed;

	BUG_ON(nd->inode != parent->d_inode);

	mutex_lock(&parent->d_inode->i_mutex);
	found = __lookup_hash(name, parent, nd->flags);
	mutex_unlock(&parent->d_inode->i_mutex);
	if (IS_ERR(found))
		return PTR_ERR(found);

	path->mnt = nd->path.mnt;
	path->dentry = found;

	managed = follow_managed(path, nd->flags);
	if (unlikely(managed < 0)) {
		path_put_conditional(path, nd);
		return managed;
	}
	if (managed)
		nd->flags |= LOOKUP_JUMPED;
	return 0;
}

1294 1295 1296
static inline int may_lookup(struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU) {
1297
		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1298 1299
		if (err != -ECHILD)
			return err;
A
Al Viro 已提交
1300
		if (unlazy_walk(nd, NULL))
1301 1302
			return -ECHILD;
	}
1303
	return inode_permission(nd->inode, MAY_EXEC);
1304 1305
}

1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317
/*
 * Process a non-LAST_NORM component.  Only LAST_DOTDOT does any work: it is
 * dispatched to follow_dotdot_rcu() or follow_dotdot() depending on
 * LOOKUP_RCU.  Returns -ECHILD if the rcu variant fails, otherwise 0.
 */
static inline int handle_dots(struct nameidata *nd, int type)
{
	if (type != LAST_DOTDOT)
		return 0;

	if (!(nd->flags & LOOKUP_RCU)) {
		follow_dotdot(nd);
		return 0;
	}

	return follow_dotdot_rcu(nd) ? -ECHILD : 0;
}

1318 1319 1320 1321 1322 1323
static void terminate_walk(struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU)) {
		path_put(&nd->path);
	} else {
		nd->flags &= ~LOOKUP_RCU;
1324 1325
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
1326
		rcu_read_unlock();
A
Andi Kleen 已提交
1327
		br_read_unlock(&vfsmount_lock);
1328 1329 1330
	}
}

1331 1332 1333 1334 1335 1336
/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 */
1337
static inline int should_follow_link(struct inode *inode, int follow)
1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350
{
	if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
		if (likely(inode->i_op->follow_link))
			return follow;

		/* This gets set once for the inode lifetime */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_NOFOLLOW;
		spin_unlock(&inode->i_lock);
	}
	return 0;
}

1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362
static inline int walk_component(struct nameidata *nd, struct path *path,
		struct qstr *name, int type, int follow)
{
	struct inode *inode;
	int err;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
	if (unlikely(type != LAST_NORM))
		return handle_dots(nd, type);
M
Miklos Szeredi 已提交
1363
	err = lookup_fast(nd, name, path, &inode);
1364
	if (unlikely(err)) {
M
Miklos Szeredi 已提交
1365 1366 1367 1368 1369 1370 1371 1372
		if (err < 0)
			goto out_err;

		err = lookup_slow(nd, name, path);
		if (err < 0)
			goto out_err;

		inode = path->dentry->d_inode;
1373
	}
M
Miklos Szeredi 已提交
1374 1375 1376 1377
	err = -ENOENT;
	if (!inode)
		goto out_path_put;

1378
	if (should_follow_link(inode, follow)) {
A
Al Viro 已提交
1379 1380
		if (nd->flags & LOOKUP_RCU) {
			if (unlikely(unlazy_walk(nd, path->dentry))) {
M
Miklos Szeredi 已提交
1381 1382
				err = -ECHILD;
				goto out_err;
A
Al Viro 已提交
1383 1384
			}
		}
1385 1386 1387 1388 1389 1390
		BUG_ON(inode != path->dentry->d_inode);
		return 1;
	}
	path_to_nameidata(path, nd);
	nd->inode = inode;
	return 0;
M
Miklos Szeredi 已提交
1391 1392 1393 1394 1395 1396

out_path_put:
	path_to_nameidata(path, nd);
out_err:
	terminate_walk(nd);
	return err;
1397 1398
}

1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414
/*
 * This limits recursive symlink follows to 8, while
 * limiting consecutive symlinks to 40.
 *
 * Without that kind of total limit, nasty chains of consecutive
 * symlinks can cause almost arbitrarily long lookups.
 *
 * Returns -ELOOP (after dropping @path and nd->path) when
 * current->link_count has reached MAX_NESTED_LINKS; otherwise the result
 * of the last follow_link()/walk_component() call.
 */
static inline int nested_symlink(struct path *path, struct nameidata *nd)
{
	int res;

	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
		path_put_conditional(path, nd);
		path_put(&nd->path);
		return -ELOOP;
	}
	BUG_ON(nd->depth >= MAX_NESTED_LINKS);

	nd->depth++;
	current->link_count++;

	/* Keep following for as long as the result is itself a symlink. */
	for (;;) {
		struct path link = *path;
		void *cookie;

		res = follow_link(&link, nd, &cookie);
		if (res)
			break;

		res = walk_component(nd, path, &nd->last,
				     nd->last_type, LOOKUP_FOLLOW);
		put_link(nd, &link, cookie);
		if (res <= 0)
			break;
	}

	current->link_count--;
	nd->depth--;
	return res;
}

1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456
/*
 * We really don't want to look at inode->i_op->lookup
 * when we don't have to. So we keep a cache bit in
 * the inode ->i_opflags field that says "yes, we can
 * do lookup on this inode".
 *
 * Returns 1 if the inode has a ->lookup method, 0 otherwise.
 */
static inline int can_lookup(struct inode *inode)
{
	if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
		if (likely(!inode->i_op->lookup))
			return 0;

		/* We do this once for the lifetime of the inode */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_LOOKUP;
		spin_unlock(&inode->i_lock);
	}
	return 1;
}

1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - Little-endian machines (so that we can generate the mask
 *   of low bytes efficiently). Again, we *could* do a byte
 *   swapping load on big-endian architectures if that is not
 *   expensive enough to make the optimization worthless.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

1481
#include <asm/word-at-a-time.h>
1482

1483
#ifdef CONFIG_64BIT
1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502

/*
 * Fold a word-sized hash down to an unsigned int by adding the bits above
 * the low sizeof(int) bytes onto it and truncating the sum.
 */
static inline unsigned int fold_hash(unsigned long hash)
{
	const unsigned long upper = hash >> (8*sizeof(int));

	return (unsigned int)(hash + upper);
}

#else	/* 32-bit case */

#define fold_hash(x) (x)

#endif

/*
 * Hash @len bytes of @name one unsigned long at a time.
 *
 * Each whole word is added to the hash, which is then multiplied by 9.
 * A final partial word is masked down to its low @len remaining bytes
 * before being added.  The result is reduced with fold_hash().
 */
unsigned int full_name_hash(const unsigned char *name, unsigned int len)
{
	unsigned long word, tail_mask;
	unsigned long hash = 0;

	while (len >= sizeof(unsigned long)) {
		word = load_unaligned_zeropad(name);
		hash = (hash + word) * 9;
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
		/* Ended exactly on a word boundary: no tail to mix in. */
		if (!len)
			return fold_hash(hash);
	}

	word = load_unaligned_zeropad(name);
	tail_mask = ~(~0ul << len*8);
	hash += tail_mask & word;
	return fold_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);

/*
 * Calculate the length and hash of the path component, and
 * return the length of the component;
 *
 * The name is scanned a word at a time until a word contains either a NUL
 * byte or a '/'.  The folded hash is stored through @hashp.
 */
static inline unsigned long hash_name(const char *name, unsigned int *hashp)
{
	unsigned long a, b, adata, bdata, mask;
	unsigned long hash = 0, len = 0;
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

	for (;;) {
		a = load_unaligned_zeropad(name+len);
		/* Bytes equal to '/' become zero bytes in b. */
		b = a ^ REPEAT_BYTE('/');
		if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))
			break;

		/* A full word of the component: mix it in and advance. */
		hash = (hash + a) * 9;
		len += sizeof(unsigned long);
	}

	adata = prep_zero_mask(a, adata, &constants);
	bdata = prep_zero_mask(b, bdata, &constants);

	mask = create_zero_mask(adata | bdata);

	hash += a & zero_bytemask(mask);
	*hashp = fold_hash(hash);

	return len + find_zero(mask);
}

#else

L
Linus Torvalds 已提交
1551 1552 1553 1554 1555 1556 1557
/*
 * Hash @len bytes of @name one byte at a time: start from init_name_hash(),
 * feed every byte through partial_name_hash(), finish with end_name_hash().
 */
unsigned int full_name_hash(const unsigned char *name, unsigned int len)
{
	const unsigned char *end = name + len;
	unsigned long hash = init_name_hash();

	for (; name != end; name++)
		hash = partial_name_hash(*name, hash);

	return end_name_hash(hash);
}
1558
EXPORT_SYMBOL(full_name_hash);
L
Linus Torvalds 已提交
1559

1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578
/*
 * We know there's a real path component here of at least
 * one character.
 *
 * Hashes bytes up to (not including) the first NUL or '/', stores the
 * finished hash through @hashp and returns the component length.
 */
static inline unsigned long hash_name(const char *name, unsigned int *hashp)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash = init_name_hash();
	unsigned long len = 0;

	/* The first byte is always consumed; see the comment above. */
	do {
		hash = partial_name_hash(p[len], hash);
		len++;
	} while (p[len] != '\0' && p[len] != '/');

	*hashp = end_name_hash(hash);
	return len;
}

1579 1580
#endif

L
Linus Torvalds 已提交
1581 1582
/*
 * Name resolution.
1583 1584
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
L
Linus Torvalds 已提交
1585
 *
1586 1587
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
L
Linus Torvalds 已提交
1588
 */
1589
static int link_path_walk(const char *name, struct nameidata *nd)
L
Linus Torvalds 已提交
1590 1591 1592 1593 1594 1595 1596
{
	struct path next;
	int err;
	
	while (*name=='/')
		name++;
	if (!*name)
1597
		return 0;
L
Linus Torvalds 已提交
1598 1599 1600 1601

	/* At this point we know we have a real path component. */
	for(;;) {
		struct qstr this;
1602
		long len;
A
Al Viro 已提交
1603
		int type;
L
Linus Torvalds 已提交
1604

1605
		err = may_lookup(nd);
L
Linus Torvalds 已提交
1606 1607 1608
 		if (err)
			break;

1609
		len = hash_name(name, &this.hash);
L
Linus Torvalds 已提交
1610
		this.name = name;
1611
		this.len = len;
L
Linus Torvalds 已提交
1612

A
Al Viro 已提交
1613
		type = LAST_NORM;
1614
		if (name[0] == '.') switch (len) {
A
Al Viro 已提交
1615
			case 2:
1616
				if (name[1] == '.') {
A
Al Viro 已提交
1617
					type = LAST_DOTDOT;
A
Al Viro 已提交
1618 1619
					nd->flags |= LOOKUP_JUMPED;
				}
A
Al Viro 已提交
1620 1621 1622 1623
				break;
			case 1:
				type = LAST_DOT;
		}
1624 1625
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
A
Al Viro 已提交
1626
			nd->flags &= ~LOOKUP_JUMPED;
1627 1628 1629 1630 1631 1632 1633
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
				err = parent->d_op->d_hash(parent, nd->inode,
							   &this);
				if (err < 0)
					break;
			}
		}
A
Al Viro 已提交
1634

1635
		if (!name[len])
L
Linus Torvalds 已提交
1636
			goto last_component;
1637 1638 1639 1640 1641 1642 1643 1644
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
			len++;
		} while (unlikely(name[len] == '/'));
		if (!name[len])
1645
			goto last_component;
1646
		name += len;
L
Linus Torvalds 已提交
1647

1648 1649 1650
		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
		if (err < 0)
			return err;
L
Linus Torvalds 已提交
1651

1652
		if (err) {
1653
			err = nested_symlink(&next, nd);
L
Linus Torvalds 已提交
1654
			if (err)
1655
				return err;
N
Nick Piggin 已提交
1656
		}
1657 1658
		if (can_lookup(nd->inode))
			continue;
L
Linus Torvalds 已提交
1659
		err = -ENOTDIR; 
1660
		break;
L
Linus Torvalds 已提交
1661 1662 1663
		/* here ends the main loop */

last_component:
1664 1665
		nd->last = this;
		nd->last_type = type;
1666
		return 0;
L
Linus Torvalds 已提交
1667
	}
1668
	terminate_walk(nd);
L
Linus Torvalds 已提交
1669 1670 1671
	return err;
}

A
Al Viro 已提交
1672 1673
static int path_init(int dfd, const char *name, unsigned int flags,
		     struct nameidata *nd, struct file **fp)
N
Nick Piggin 已提交
1674 1675 1676 1677 1678 1679
{
	int retval = 0;
	int fput_needed;
	struct file *file;

	nd->last_type = LAST_ROOT; /* if there are only slashes... */
A
Al Viro 已提交
1680
	nd->flags = flags | LOOKUP_JUMPED;
N
Nick Piggin 已提交
1681
	nd->depth = 0;
1682 1683
	if (flags & LOOKUP_ROOT) {
		struct inode *inode = nd->root.dentry->d_inode;
A
Al Viro 已提交
1684 1685 1686 1687 1688 1689 1690
		if (*name) {
			if (!inode->i_op->lookup)
				return -ENOTDIR;
			retval = inode_permission(inode, MAY_EXEC);
			if (retval)
				return retval;
		}
1691 1692 1693
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
A
Andi Kleen 已提交
1694
			br_read_lock(&vfsmount_lock);
1695 1696 1697 1698 1699 1700 1701 1702
			rcu_read_lock();
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
		} else {
			path_get(&nd->path);
		}
		return 0;
	}

N
Nick Piggin 已提交
1703 1704 1705
	nd->root.mnt = NULL;

	if (*name=='/') {
A
Al Viro 已提交
1706
		if (flags & LOOKUP_RCU) {
A
Andi Kleen 已提交
1707
			br_read_lock(&vfsmount_lock);
A
Al Viro 已提交
1708 1709 1710 1711 1712 1713 1714
			rcu_read_lock();
			set_root_rcu(nd);
		} else {
			set_root(nd);
			path_get(&nd->root);
		}
		nd->path = nd->root;
N
Nick Piggin 已提交
1715
	} else if (dfd == AT_FDCWD) {
A
Al Viro 已提交
1716 1717 1718
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;
N
Nick Piggin 已提交
1719

A
Andi Kleen 已提交
1720
			br_read_lock(&vfsmount_lock);
A
Al Viro 已提交
1721
			rcu_read_lock();
N
Nick Piggin 已提交
1722

A
Al Viro 已提交
1723 1724 1725 1726 1727 1728 1729 1730
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
		}
N
Nick Piggin 已提交
1731 1732 1733
	} else {
		struct dentry *dentry;

1734
		file = fget_raw_light(dfd, &fput_needed);
N
Nick Piggin 已提交
1735 1736 1737 1738 1739 1740
		retval = -EBADF;
		if (!file)
			goto out_fail;

		dentry = file->f_path.dentry;

A
Al Viro 已提交
1741 1742 1743 1744
		if (*name) {
			retval = -ENOTDIR;
			if (!S_ISDIR(dentry->d_inode->i_mode))
				goto fput_fail;
N
Nick Piggin 已提交
1745

1746
			retval = inode_permission(dentry->d_inode, MAY_EXEC);
A
Al Viro 已提交
1747 1748 1749
			if (retval)
				goto fput_fail;
		}
N
Nick Piggin 已提交
1750 1751

		nd->path = file->f_path;
A
Al Viro 已提交
1752 1753
		if (flags & LOOKUP_RCU) {
			if (fput_needed)
A
Al Viro 已提交
1754
				*fp = file;
A
Al Viro 已提交
1755
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
A
Andi Kleen 已提交
1756
			br_read_lock(&vfsmount_lock);
A
Al Viro 已提交
1757 1758 1759 1760 1761
			rcu_read_lock();
		} else {
			path_get(&file->f_path);
			fput_light(file, fput_needed);
		}
N
Nick Piggin 已提交
1762 1763 1764
	}

	nd->inode = nd->path.dentry->d_inode;
1765
	return 0;
1766

1767 1768 1769 1770 1771 1772
fput_fail:
	fput_light(file, fput_needed);
out_fail:
	return retval;
}

1773 1774 1775 1776 1777 1778 1779 1780 1781 1782
/*
 * Walk the final component recorded in nd->last.
 *
 * If that component is a normal name and the byte right after it is not
 * NUL, LOOKUP_FOLLOW and LOOKUP_DIRECTORY are forced on.  LOOKUP_PARENT is
 * cleared before the component is handed to walk_component().
 */
static inline int lookup_last(struct nameidata *nd, struct path *path)
{
	const int has_trailer = nd->last_type == LAST_NORM &&
				nd->last.name[nd->last.len];

	if (has_trailer)
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
	nd->flags &= ~LOOKUP_PARENT;

	return walk_component(nd, path, &nd->last, nd->last_type,
			      nd->flags & LOOKUP_FOLLOW);
}

1783
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
A
Al Viro 已提交
1784
static int path_lookupat(int dfd, const char *name,
1785 1786
				unsigned int flags, struct nameidata *nd)
{
A
Al Viro 已提交
1787
	struct file *base = NULL;
1788 1789
	struct path path;
	int err;
N
Nick Piggin 已提交
1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804

	/*
	 * Path walking is largely split up into 2 different synchronisation
	 * schemes, rcu-walk and ref-walk (explained in
	 * Documentation/filesystems/path-lookup.txt). These share much of the
	 * path walk code, but some things particularly setup, cleanup, and
	 * following mounts are sufficiently divergent that functions are
	 * duplicated. Typically there is a function foo(), and its RCU
	 * analogue, foo_rcu().
	 *
	 * -ECHILD is the error number of choice (just to avoid clashes) that
	 * is returned if some aspect of an rcu-walk fails. Such an error must
	 * be handled by restarting a traditional ref-walk (which will always
	 * be able to complete).
	 */
1805
	err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
A
Al Viro 已提交
1806

1807 1808
	if (unlikely(err))
		return err;
A
Al Viro 已提交
1809 1810

	current->total_link_count = 0;
1811 1812 1813 1814 1815 1816 1817 1818
	err = link_path_walk(name, nd);

	if (!err && !(flags & LOOKUP_PARENT)) {
		err = lookup_last(nd, &path);
		while (err > 0) {
			void *cookie;
			struct path link = path;
			nd->flags |= LOOKUP_PARENT;
1819
			err = follow_link(&link, nd, &cookie);
1820 1821 1822
			if (err)
				break;
			err = lookup_last(nd, &path);
1823
			put_link(nd, &link, cookie);
1824 1825
		}
	}
A
Al Viro 已提交
1826

1827 1828
	if (!err)
		err = complete_walk(nd);
1829 1830 1831 1832

	if (!err && nd->flags & LOOKUP_DIRECTORY) {
		if (!nd->inode->i_op->lookup) {
			path_put(&nd->path);
A
Al Viro 已提交
1833
			err = -ENOTDIR;
1834 1835
		}
	}
A
Al Viro 已提交
1836

A
Al Viro 已提交
1837 1838
	if (base)
		fput(base);
A
Al Viro 已提交
1839

1840
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
A
Al Viro 已提交
1841 1842 1843
		path_put(&nd->root);
		nd->root.mnt = NULL;
	}
1844
	return err;
A
Al Viro 已提交
1845
}
N
Nick Piggin 已提交
1846

A
Al Viro 已提交
1847 1848 1849 1850 1851 1852 1853 1854
static int do_path_lookup(int dfd, const char *name,
				unsigned int flags, struct nameidata *nd)
{
	int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
	if (unlikely(retval == -ECHILD))
		retval = path_lookupat(dfd, name, flags, nd);
	if (unlikely(retval == -ESTALE))
		retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
N
Nick Piggin 已提交
1855 1856 1857 1858 1859 1860 1861

	if (likely(!retval)) {
		if (unlikely(!audit_dummy_context())) {
			if (nd->path.dentry && nd->inode)
				audit_inode(name, nd->path.dentry);
		}
	}
1862
	return retval;
L
Linus Torvalds 已提交
1863 1864
}

A
Al Viro 已提交
1865 1866
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
1867
{
A
Al Viro 已提交
1868 1869 1870 1871 1872 1873 1874 1875 1876 1877
	struct nameidata nd;
	struct dentry *d;
	int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd);
	if (err)
		return ERR_PTR(err);
	if (nd.last_type != LAST_NORM) {
		path_put(&nd.path);
		return ERR_PTR(-EINVAL);
	}
	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
1878
	d = __lookup_hash(&nd.last, nd.path.dentry, 0);
A
Al Viro 已提交
1879 1880 1881 1882 1883 1884 1885
	if (IS_ERR(d)) {
		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
		path_put(&nd.path);
		return d;
	}
	*path = nd.path;
	return d;
1886 1887
}

A
Al Viro 已提交
1888 1889 1890 1891 1892 1893 1894 1895 1896
/*
 * Look up @name relative to AT_FDCWD with the given lookup @flags.
 * Returns 0 and fills *@path on success; otherwise returns the error from
 * do_path_lookup() and leaves *@path untouched.
 */
int kern_path(const char *name, unsigned int flags, struct path *path)
{
	struct nameidata nd;
	int res = do_path_lookup(AT_FDCWD, name, flags, &nd);

	if (res)
		return res;

	*path = nd.path;
	return 0;
}

1897 1898 1899 1900 1901 1902
/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
1903
 * @path: pointer to struct path to fill
1904 1905 1906
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
		    const char *name, unsigned int flags,
1907
		    struct path *path)
1908
{
1909 1910 1911 1912 1913
	struct nameidata nd;
	int err;
	nd.root.dentry = dentry;
	nd.root.mnt = mnt;
	BUG_ON(flags & LOOKUP_PARENT);
1914
	/* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
1915 1916 1917 1918
	err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd);
	if (!err)
		*path = nd.path;
	return err;
1919 1920
}

1921 1922 1923 1924 1925
/*
 * Restricted form of lookup. Doesn't follow links, single-component only,
 * needs parent already locked. Doesn't follow mounts.
 * SMP-safe.
 */
1926
static struct dentry *lookup_hash(struct nameidata *nd)
1927
{
1928
	return __lookup_hash(&nd->last, nd->path.dentry, nd->flags);
L
Linus Torvalds 已提交
1929 1930
}

1931
/**
1932
 * lookup_one_len - filesystem helper to lookup single pathname component
1933 1934 1935 1936
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
1937 1938
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.  Also note that by using this function the
1939 1940 1941
 * nameidata argument is passed to the filesystem methods and a filesystem
 * using this helper needs to be prepared for that.
 */
1942 1943 1944
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
	struct qstr this;
A
Al Viro 已提交
1945
	unsigned int c;
1946
	int err;
1947

1948 1949
	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));

A
Al Viro 已提交
1950 1951
	this.name = name;
	this.len = len;
L
Linus Torvalds 已提交
1952
	this.hash = full_name_hash(name, len);
A
Al Viro 已提交
1953 1954 1955 1956 1957 1958 1959 1960
	if (!len)
		return ERR_PTR(-EACCES);

	while (len--) {
		c = *(const unsigned char *)name++;
		if (c == '/' || c == '\0')
			return ERR_PTR(-EACCES);
	}
1961 1962 1963 1964 1965 1966 1967 1968 1969
	/*
	 * See if the low-level filesystem might want
	 * to use its own hash..
	 */
	if (base->d_flags & DCACHE_OP_HASH) {
		int err = base->d_op->d_hash(base, base->d_inode, &this);
		if (err < 0)
			return ERR_PTR(err);
	}
1970

1971 1972 1973 1974
	err = inode_permission(base->d_inode, MAY_EXEC);
	if (err)
		return ERR_PTR(err);

1975
	return __lookup_hash(&this, base, 0);
1976 1977
}

1978 1979
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
		 struct path *path, int *empty)
L
Linus Torvalds 已提交
1980
{
1981
	struct nameidata nd;
1982
	char *tmp = getname_flags(name, flags, empty);
L
Linus Torvalds 已提交
1983 1984
	int err = PTR_ERR(tmp);
	if (!IS_ERR(tmp)) {
1985 1986 1987 1988

		BUG_ON(flags & LOOKUP_PARENT);

		err = do_path_lookup(dfd, tmp, flags, &nd);
L
Linus Torvalds 已提交
1989
		putname(tmp);
1990 1991
		if (!err)
			*path = nd.path;
L
Linus Torvalds 已提交
1992 1993 1994 1995
	}
	return err;
}

1996 1997 1998
int user_path_at(int dfd, const char __user *name, unsigned flags,
		 struct path *path)
{
1999
	return user_path_at_empty(dfd, name, flags, path, NULL);
2000 2001
}

2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
static int user_path_parent(int dfd, const char __user *path,
			struct nameidata *nd, char **name)
{
	char *s = getname(path);
	int error;

	if (IS_ERR(s))
		return PTR_ERR(s);

	error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd);
	if (error)
		putname(s);
	else
		*name = s;

	return error;
}

L
Linus Torvalds 已提交
2020 2021 2022 2023 2024 2025
/*
 * It's inline, so penalty for filesystems that don't use sticky bit is
 * minimal.
 */
static inline int check_sticky(struct inode *dir, struct inode *inode)
{
2026
	kuid_t fsuid = current_fsuid();
2027

L
Linus Torvalds 已提交
2028 2029
	if (!(dir->i_mode & S_ISVTX))
		return 0;
2030
	if (uid_eq(inode->i_uid, fsuid))
L
Linus Torvalds 已提交
2031
		return 0;
2032
	if (uid_eq(dir->i_uid, fsuid))
L
Linus Torvalds 已提交
2033
		return 0;
2034
	return !inode_capable(inode, CAP_FOWNER);
L
Linus Torvalds 已提交
2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055
}

/*
 *	Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do antyhing with
 *     links pointing to it.
 *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 *  9. We can't remove a root or mountpoint.
 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 */
2056
static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
L
Linus Torvalds 已提交
2057 2058 2059 2060 2061 2062 2063
{
	int error;

	if (!victim->d_inode)
		return -ENOENT;

	BUG_ON(victim->d_parent->d_inode != dir);
2064
	audit_inode_child(victim, dir);
L
Linus Torvalds 已提交
2065

2066
	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2067 2068 2069 2070 2071
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;
	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
H
Hugh Dickins 已提交
2072
	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
L
Linus Torvalds 已提交
2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095
		return -EPERM;
	if (isdir) {
		if (!S_ISDIR(victim->d_inode->i_mode))
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
	} else if (S_ISDIR(victim->d_inode->i_mode))
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}

/*	Check whether we can create an object with dentry child in directory
 *  dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
 *  3. We should have write and exec permissions on dir
 *  4. We can't do it if dir is immutable (done in permission())
 */
2096
static inline int may_create(struct inode *dir, struct dentry *child)
L
Linus Torvalds 已提交
2097 2098 2099 2100 2101
{
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
2102
	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2103 2104 2105 2106 2107 2108 2109 2110 2111 2112
}

/*
 * p1 and p2 should be directories on the same fs.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p;

	if (p1 == p2) {
I
Ingo Molnar 已提交
2113
		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
L
Linus Torvalds 已提交
2114 2115 2116
		return NULL;
	}

2117
	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2118

2119 2120 2121 2122 2123
	p = d_ancestor(p2, p1);
	if (p) {
		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
		return p;
L
Linus Torvalds 已提交
2124 2125
	}

2126 2127 2128 2129 2130
	p = d_ancestor(p1, p2);
	if (p) {
		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
		return p;
L
Linus Torvalds 已提交
2131 2132
	}

I
Ingo Molnar 已提交
2133 2134
	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
L
Linus Torvalds 已提交
2135 2136 2137 2138 2139
	return NULL;
}

void unlock_rename(struct dentry *p1, struct dentry *p2)
{
2140
	mutex_unlock(&p1->d_inode->i_mutex);
L
Linus Torvalds 已提交
2141
	if (p1 != p2) {
2142
		mutex_unlock(&p2->d_inode->i_mutex);
2143
		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2144 2145 2146
	}
}

A
Al Viro 已提交
2147
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
A
Al Viro 已提交
2148
		bool want_excl)
L
Linus Torvalds 已提交
2149
{
2150
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
2151 2152 2153
	if (error)
		return error;

A
Al Viro 已提交
2154
	if (!dir->i_op->create)
L
Linus Torvalds 已提交
2155 2156 2157 2158 2159 2160
		return -EACCES;	/* shouldn't it be ENOSYS? */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
A
Al Viro 已提交
2161
	error = dir->i_op->create(dir, dentry, mode, want_excl);
2162
	if (!error)
2163
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
2164 2165 2166
	return error;
}

A
Al Viro 已提交
2167
static int may_open(struct path *path, int acc_mode, int flag)
L
Linus Torvalds 已提交
2168
{
2169
	struct dentry *dentry = path->dentry;
L
Linus Torvalds 已提交
2170 2171 2172
	struct inode *inode = dentry->d_inode;
	int error;

A
Al Viro 已提交
2173 2174 2175 2176
	/* O_PATH? */
	if (!acc_mode)
		return 0;

L
Linus Torvalds 已提交
2177 2178 2179
	if (!inode)
		return -ENOENT;

C
Christoph Hellwig 已提交
2180 2181
	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
L
Linus Torvalds 已提交
2182
		return -ELOOP;
C
Christoph Hellwig 已提交
2183 2184 2185 2186 2187 2188
	case S_IFDIR:
		if (acc_mode & MAY_WRITE)
			return -EISDIR;
		break;
	case S_IFBLK:
	case S_IFCHR:
2189
		if (path->mnt->mnt_flags & MNT_NODEV)
L
Linus Torvalds 已提交
2190
			return -EACCES;
C
Christoph Hellwig 已提交
2191 2192 2193
		/*FALLTHRU*/
	case S_IFIFO:
	case S_IFSOCK:
L
Linus Torvalds 已提交
2194
		flag &= ~O_TRUNC;
C
Christoph Hellwig 已提交
2195
		break;
2196
	}
2197

2198
	error = inode_permission(inode, acc_mode);
2199 2200
	if (error)
		return error;
M
Mimi Zohar 已提交
2201

L
Linus Torvalds 已提交
2202 2203 2204 2205
	/*
	 * An append-only file must be opened in append mode for writing.
	 */
	if (IS_APPEND(inode)) {
2206
		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2207
			return -EPERM;
L
Linus Torvalds 已提交
2208
		if (flag & O_TRUNC)
2209
			return -EPERM;
L
Linus Torvalds 已提交
2210 2211 2212
	}

	/* O_NOATIME can only be set by the owner or superuser */
2213
	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2214
		return -EPERM;
L
Linus Torvalds 已提交
2215

2216
	return 0;
2217
}
L
Linus Torvalds 已提交
2218

2219
static int handle_truncate(struct file *filp)
2220
{
2221
	struct path *path = &filp->f_path;
2222 2223 2224 2225 2226 2227 2228 2229 2230
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;
	/*
	 * Refuse to truncate files with mandatory locks held on them.
	 */
	error = locks_verify_locked(inode);
	if (!error)
2231
		error = security_path_truncate(path);
2232 2233 2234
	if (!error) {
		error = do_truncate(path->dentry, 0,
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
2235
				    filp);
2236 2237
	}
	put_write_access(inode);
M
Mimi Zohar 已提交
2238
	return error;
L
Linus Torvalds 已提交
2239 2240
}

/*
 * Adjust open flags before they are handed on: an access-mode field
 * (flag & O_ACCMODE) equal to 3 is decremented by one; every other value
 * is returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	if ((flag & O_ACCMODE) == 3)
		flag--;
	return flag;
}

M
Miklos Szeredi 已提交
2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260
static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
{
	int error = security_path_mknod(dir, dentry, mode, 0);
	if (error)
		return error;

	error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;

	return security_inode_create(dir->dentry->d_inode, dentry, mode);
}

2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Returns 0 if successful.  The file will have been created and attached to
 * @file by the filesystem calling finish_open().
 *
 * Returns 1 if the file was looked up only or didn't need creating.  The
 * caller will need to perform the open themselves.  @path will have been
 * updated to point to the new dentry.  This may be negative.
 *
 * Returns an error code otherwise.
 */
2274 2275 2276 2277 2278
static int atomic_open(struct nameidata *nd, struct dentry *dentry,
			struct path *path, struct file *file,
			const struct open_flags *op,
			bool *want_write, bool need_lookup,
			int *opened)
M
Miklos Szeredi 已提交
2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291
{
	struct inode *dir =  nd->path.dentry->d_inode;
	unsigned open_flag = open_to_namei_flags(op->open_flag);
	umode_t mode;
	int error;
	int acc_mode;
	int create_error = 0;
	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;

	BUG_ON(dentry->d_inode);

	/* Don't create child dentry for a dead directory. */
	if (unlikely(IS_DEADDIR(dir))) {
2292
		error = -ENOENT;
M
Miklos Szeredi 已提交
2293 2294 2295 2296 2297 2298 2299 2300 2301
		goto out;
	}

	mode = op->mode & S_IALLUGO;
	if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
		mode &= ~current_umask();

	if (open_flag & O_EXCL) {
		open_flag &= ~O_TRUNC;
2302
		*opened |= FILE_CREATED;
M
Miklos Szeredi 已提交
2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317
	}

	/*
	 * Checking write permission is tricky, bacuse we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists.  But checking existence breaks atomicity.  The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returing the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
	 */
	if ((open_flag & (O_CREAT | O_TRUNC)) ||
	    (open_flag & O_ACCMODE) != O_RDONLY) {
		error = mnt_want_write(nd->path.mnt);
		if (!error) {
M
Miklos Szeredi 已提交
2318
			*want_write = true;
M
Miklos Szeredi 已提交
2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348
		} else if (!(open_flag & O_CREAT)) {
			/*
			 * No O_CREATE -> atomicity not a requirement -> fall
			 * back to lookup + open
			 */
			goto no_open;
		} else if (open_flag & (O_EXCL | O_TRUNC)) {
			/* Fall back and fail with the right error */
			create_error = error;
			goto no_open;
		} else {
			/* No side effects, safe to clear O_CREAT */
			create_error = error;
			open_flag &= ~O_CREAT;
		}
	}

	if (open_flag & O_CREAT) {
		error = may_o_create(&nd->path, dentry, op->mode);
		if (error) {
			create_error = error;
			if (open_flag & O_EXCL)
				goto no_open;
			open_flag &= ~O_CREAT;
		}
	}

	if (nd->flags & LOOKUP_DIRECTORY)
		open_flag |= O_DIRECTORY;

A
Al Viro 已提交
2349 2350 2351
	file->f_path.dentry = DENTRY_NOT_SET;
	file->f_path.mnt = nd->path.mnt;
	error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode,
2352
				      opened);
A
Al Viro 已提交
2353 2354 2355
	if (error < 0) {
		if (create_error && error == -ENOENT)
			error = create_error;
M
Miklos Szeredi 已提交
2356 2357 2358 2359
		goto out;
	}

	acc_mode = op->acc_mode;
2360
	if (*opened & FILE_CREATED) {
M
Miklos Szeredi 已提交
2361 2362 2363 2364
		fsnotify_create(dir, dentry);
		acc_mode = MAY_OPEN;
	}

A
Al Viro 已提交
2365
	if (error) {	/* returned 1, that is */
A
Al Viro 已提交
2366
		if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
2367
			error = -EIO;
M
Miklos Szeredi 已提交
2368 2369
			goto out;
		}
A
Al Viro 已提交
2370
		if (file->f_path.dentry) {
M
Miklos Szeredi 已提交
2371
			dput(dentry);
A
Al Viro 已提交
2372
			dentry = file->f_path.dentry;
M
Miklos Szeredi 已提交
2373 2374 2375 2376 2377 2378 2379 2380
		}
		goto looked_up;
	}

	/*
	 * We didn't have the inode before the open, so check open permission
	 * here.
	 */
2381 2382 2383
	error = may_open(&file->f_path, acc_mode, open_flag);
	if (error)
		fput(file);
M
Miklos Szeredi 已提交
2384 2385 2386

out:
	dput(dentry);
2387
	return error;
M
Miklos Szeredi 已提交
2388 2389 2390

no_open:
	if (need_lookup) {
2391
		dentry = lookup_real(dir, dentry, nd->flags);
M
Miklos Szeredi 已提交
2392
		if (IS_ERR(dentry))
2393
			return PTR_ERR(dentry);
M
Miklos Szeredi 已提交
2394 2395 2396 2397

		if (create_error) {
			int open_flag = op->open_flag;

2398
			error = create_error;
M
Miklos Szeredi 已提交
2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413
			if ((open_flag & O_EXCL)) {
				if (!dentry->d_inode)
					goto out;
			} else if (!dentry->d_inode) {
				goto out;
			} else if ((open_flag & O_TRUNC) &&
				   S_ISREG(dentry->d_inode->i_mode)) {
				goto out;
			}
			/* will fail later, go on to get the right error */
		}
	}
looked_up:
	path->dentry = dentry;
	path->mnt = nd->path.mnt;
2414
	return 1;
M
Miklos Szeredi 已提交
2415 2416
}

M
Miklos Szeredi 已提交
2417
/*
2418
 * Look up and maybe create and open the last component.
M
Miklos Szeredi 已提交
2419 2420 2421
 *
 * Must be called with i_mutex held on parent.
 *
2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433
 * Returns 0 if the file was successfully atomically created (if necessary) and
 * opened.  In this case the file will be returned attached to @file.
 *
 * Returns 1 if the file was not completely opened at this time, though lookups
 * and creations will have been performed and the dentry returned in @path will
 * be positive upon return if O_CREAT was specified.  If O_CREAT wasn't
 * specified then a negative dentry may be returned.
 *
 * An error code is returned otherwise.
 *
 * FILE_CREATE will be set in @*opened if the dentry was created and will be
 * cleared otherwise prior to returning.
M
Miklos Szeredi 已提交
2434
 */
2435 2436 2437 2438
static int lookup_open(struct nameidata *nd, struct path *path,
			struct file *file,
			const struct open_flags *op,
			bool *want_write, int *opened)
M
Miklos Szeredi 已提交
2439 2440
{
	struct dentry *dir = nd->path.dentry;
2441
	struct inode *dir_inode = dir->d_inode;
M
Miklos Szeredi 已提交
2442 2443
	struct dentry *dentry;
	int error;
2444
	bool need_lookup;
M
Miklos Szeredi 已提交
2445

2446
	*opened &= ~FILE_CREATED;
2447
	dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
M
Miklos Szeredi 已提交
2448
	if (IS_ERR(dentry))
2449
		return PTR_ERR(dentry);
M
Miklos Szeredi 已提交
2450

M
Miklos Szeredi 已提交
2451 2452 2453 2454 2455
	/* Cached positive dentry: will open in f_op->open */
	if (!need_lookup && dentry->d_inode)
		goto out_no_open;

	if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
A
Al Viro 已提交
2456
		return atomic_open(nd, dentry, path, file, op, want_write,
2457
				   need_lookup, opened);
M
Miklos Szeredi 已提交
2458 2459
	}

2460 2461 2462
	if (need_lookup) {
		BUG_ON(dentry->d_inode);

2463
		dentry = lookup_real(dir_inode, dentry, nd->flags);
2464
		if (IS_ERR(dentry))
2465
			return PTR_ERR(dentry);
2466 2467
	}

M
Miklos Szeredi 已提交
2468 2469 2470 2471 2472 2473 2474 2475 2476 2477
	/* Negative dentry, just create the file */
	if (!dentry->d_inode && (op->open_flag & O_CREAT)) {
		umode_t mode = op->mode;
		if (!IS_POSIXACL(dir->d_inode))
			mode &= ~current_umask();
		/*
		 * This write is needed to ensure that a
		 * rw->ro transition does not occur between
		 * the time when the file is created and when
		 * a permanent write count is taken through
2478
		 * the 'struct file' in finish_open().
M
Miklos Szeredi 已提交
2479 2480 2481 2482
		 */
		error = mnt_want_write(nd->path.mnt);
		if (error)
			goto out_dput;
M
Miklos Szeredi 已提交
2483
		*want_write = true;
2484
		*opened |= FILE_CREATED;
M
Miklos Szeredi 已提交
2485 2486 2487
		error = security_path_mknod(&nd->path, dentry, mode, 0);
		if (error)
			goto out_dput;
A
Al Viro 已提交
2488 2489
		error = vfs_create(dir->d_inode, dentry, mode,
				   nd->flags & LOOKUP_EXCL);
M
Miklos Szeredi 已提交
2490 2491 2492
		if (error)
			goto out_dput;
	}
M
Miklos Szeredi 已提交
2493
out_no_open:
M
Miklos Szeredi 已提交
2494 2495
	path->dentry = dentry;
	path->mnt = nd->path.mnt;
2496
	return 1;
M
Miklos Szeredi 已提交
2497 2498 2499

out_dput:
	dput(dentry);
2500
	return error;
M
Miklos Szeredi 已提交
2501 2502
}

N
Nick Piggin 已提交
2503
/*
2504
 * Handle the last step of open()
N
Nick Piggin 已提交
2505
 */
2506 2507 2508
static int do_last(struct nameidata *nd, struct path *path,
		   struct file *file, const struct open_flags *op,
		   int *opened, const char *pathname)
2509
{
2510
	struct dentry *dir = nd->path.dentry;
2511
	int open_flag = op->open_flag;
M
Miklos Szeredi 已提交
2512 2513
	bool will_truncate = (open_flag & O_TRUNC) != 0;
	bool want_write = false;
A
Al Viro 已提交
2514
	int acc_mode = op->acc_mode;
2515
	struct inode *inode;
M
Miklos Szeredi 已提交
2516
	bool symlink_ok = false;
2517 2518
	struct path save_parent = { .dentry = NULL, .mnt = NULL };
	bool retried = false;
A
Al Viro 已提交
2519
	int error;
2520

2521 2522 2523
	nd->flags &= ~LOOKUP_PARENT;
	nd->flags |= op->intent;

2524 2525
	switch (nd->last_type) {
	case LAST_DOTDOT:
2526
	case LAST_DOT:
2527 2528
		error = handle_dots(nd, nd->last_type);
		if (error)
2529
			return error;
2530 2531
		/* fallthrough */
	case LAST_ROOT:
2532
		error = complete_walk(nd);
A
Al Viro 已提交
2533
		if (error)
2534
			return error;
2535
		audit_inode(pathname, nd->path.dentry);
2536
		if (open_flag & O_CREAT) {
2537
			error = -EISDIR;
2538
			goto out;
2539
		}
M
Miklos Szeredi 已提交
2540
		goto finish_open;
2541
	case LAST_BIND:
2542
		error = complete_walk(nd);
A
Al Viro 已提交
2543
		if (error)
2544
			return error;
2545
		audit_inode(pathname, dir);
M
Miklos Szeredi 已提交
2546
		goto finish_open;
2547
	}
2548

2549
	if (!(open_flag & O_CREAT)) {
2550 2551
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
A
Al Viro 已提交
2552
		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
M
Miklos Szeredi 已提交
2553
			symlink_ok = true;
2554
		/* we _can_ be in RCU mode here */
2555
		error = lookup_fast(nd, &nd->last, path, &inode);
2556 2557 2558 2559
		if (likely(!error))
			goto finish_lookup;

		if (error < 0)
2560
			goto out;
2561 2562

		BUG_ON(nd->inode != dir->d_inode);
2563 2564 2565 2566 2567 2568 2569 2570 2571
	} else {
		/* create side of things */
		/*
		 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
		 * has been cleared when we got to the last component we are
		 * about to look up
		 */
		error = complete_walk(nd);
		if (error)
2572
			return error;
2573

2574 2575 2576 2577
		audit_inode(pathname, dir);
		error = -EISDIR;
		/* trailing slashes? */
		if (nd->last.name[nd->last.len])
2578
			goto out;
2579
	}
A
Al Viro 已提交
2580

2581
retry_lookup:
2582
	mutex_lock(&dir->d_inode->i_mutex);
2583
	error = lookup_open(nd, path, file, op, &want_write, opened);
M
Miklos Szeredi 已提交
2584
	mutex_unlock(&dir->d_inode->i_mutex);
2585

2586 2587
	if (error <= 0) {
		if (error)
M
Miklos Szeredi 已提交
2588 2589
			goto out;

2590
		if ((*opened & FILE_CREATED) ||
2591
		    !S_ISREG(file->f_path.dentry->d_inode->i_mode))
M
Miklos Szeredi 已提交
2592
			will_truncate = false;
M
Miklos Szeredi 已提交
2593

2594
		audit_inode(pathname, file->f_path.dentry);
M
Miklos Szeredi 已提交
2595 2596
		goto opened;
	}
2597

2598
	if (*opened & FILE_CREATED) {
2599
		/* Don't check for write permission, don't truncate */
2600
		open_flag &= ~O_TRUNC;
M
Miklos Szeredi 已提交
2601
		will_truncate = false;
A
Al Viro 已提交
2602
		acc_mode = MAY_OPEN;
M
Miklos Szeredi 已提交
2603
		path_to_nameidata(path, nd);
M
Miklos Szeredi 已提交
2604
		goto finish_open_created;
2605 2606 2607 2608 2609 2610 2611
	}

	/*
	 * It already exists.
	 */
	audit_inode(pathname, path->dentry);

M
Miklos Szeredi 已提交
2612 2613 2614 2615 2616 2617 2618
	/*
	 * If atomic_open() acquired write access it is dropped now due to
	 * possible mount and symlink following (this might be optimized away if
	 * necessary...)
	 */
	if (want_write) {
		mnt_drop_write(nd->path.mnt);
M
Miklos Szeredi 已提交
2619
		want_write = false;
M
Miklos Szeredi 已提交
2620 2621
	}

2622
	error = -EEXIST;
2623
	if (open_flag & O_EXCL)
2624 2625
		goto exit_dput;

2626 2627 2628
	error = follow_managed(path, nd->flags);
	if (error < 0)
		goto exit_dput;
2629

2630 2631 2632
	if (error)
		nd->flags |= LOOKUP_JUMPED;

2633 2634
	BUG_ON(nd->flags & LOOKUP_RCU);
	inode = path->dentry->d_inode;
2635 2636
finish_lookup:
	/* we _can_ be in RCU mode here */
2637
	error = -ENOENT;
2638 2639
	if (!inode) {
		path_to_nameidata(path, nd);
2640
		goto out;
2641
	}
A
Al Viro 已提交
2642

2643 2644 2645 2646
	if (should_follow_link(inode, !symlink_ok)) {
		if (nd->flags & LOOKUP_RCU) {
			if (unlikely(unlazy_walk(nd, path->dentry))) {
				error = -ECHILD;
2647
				goto out;
2648 2649 2650
			}
		}
		BUG_ON(inode != path->dentry->d_inode);
2651
		return 1;
2652
	}
2653

2654 2655 2656 2657 2658 2659 2660 2661
	if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
		path_to_nameidata(path, nd);
	} else {
		save_parent.dentry = nd->path.dentry;
		save_parent.mnt = mntget(path->mnt);
		nd->path.dentry = path->dentry;

	}
2662
	nd->inode = inode;
2663 2664
	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
	error = complete_walk(nd);
2665 2666
	if (error) {
		path_put(&save_parent);
2667
		return error;
2668
	}
2669
	error = -EISDIR;
2670
	if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
2671
		goto out;
2672 2673
	error = -ENOTDIR;
	if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
2674
		goto out;
2675
	audit_inode(pathname, nd->path.dentry);
M
Miklos Szeredi 已提交
2676
finish_open:
2677
	if (!S_ISREG(nd->inode->i_mode))
M
Miklos Szeredi 已提交
2678
		will_truncate = false;
2679

2680 2681 2682
	if (will_truncate) {
		error = mnt_want_write(nd->path.mnt);
		if (error)
2683
			goto out;
M
Miklos Szeredi 已提交
2684
		want_write = true;
2685
	}
M
Miklos Szeredi 已提交
2686
finish_open_created:
A
Al Viro 已提交
2687
	error = may_open(&nd->path, acc_mode, open_flag);
2688
	if (error)
2689
		goto out;
A
Al Viro 已提交
2690 2691 2692 2693
	file->f_path.mnt = nd->path.mnt;
	error = finish_open(file, nd->path.dentry, NULL, opened);
	if (error) {
		if (error == -EOPENSTALE)
M
Miklos Szeredi 已提交
2694
			goto stale_open;
2695
		goto out;
M
Miklos Szeredi 已提交
2696
	}
2697
opened:
2698
	error = open_check_o_direct(file);
2699 2700
	if (error)
		goto exit_fput;
2701
	error = ima_file_check(file, op->acc_mode);
2702 2703 2704 2705
	if (error)
		goto exit_fput;

	if (will_truncate) {
2706
		error = handle_truncate(file);
2707 2708
		if (error)
			goto exit_fput;
2709
	}
2710 2711
out:
	if (want_write)
2712
		mnt_drop_write(nd->path.mnt);
2713
	path_put(&save_parent);
2714
	terminate_walk(nd);
2715
	return error;
2716 2717 2718

exit_dput:
	path_put_conditional(path, nd);
2719
	goto out;
2720
exit_fput:
2721 2722
	fput(file);
	goto out;
2723

M
Miklos Szeredi 已提交
2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740
stale_open:
	/* If no saved parent or already retried then can't retry */
	if (!save_parent.dentry || retried)
		goto out;

	BUG_ON(save_parent.dentry != dir);
	path_put(&nd->path);
	nd->path = save_parent;
	nd->inode = dir->d_inode;
	save_parent.mnt = NULL;
	save_parent.dentry = NULL;
	if (want_write) {
		mnt_drop_write(nd->path.mnt);
		want_write = false;
	}
	retried = true;
	goto retry_lookup;
2741 2742
}

2743
static struct file *path_openat(int dfd, const char *pathname,
A
Al Viro 已提交
2744
		struct nameidata *nd, const struct open_flags *op, int flags)
L
Linus Torvalds 已提交
2745
{
2746
	struct file *base = NULL;
A
Al Viro 已提交
2747
	struct file *file;
2748
	struct path path;
2749
	int opened = 0;
2750
	int error;
N
Nick Piggin 已提交
2751

A
Al Viro 已提交
2752 2753
	file = get_empty_filp();
	if (!file)
N
Nick Piggin 已提交
2754 2755
		return ERR_PTR(-ENFILE);

A
Al Viro 已提交
2756
	file->f_flags = op->open_flag;
N
Nick Piggin 已提交
2757

A
Al Viro 已提交
2758
	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
N
Nick Piggin 已提交
2759
	if (unlikely(error))
2760
		goto out;
N
Nick Piggin 已提交
2761

2762
	current->total_link_count = 0;
A
Al Viro 已提交
2763
	error = link_path_walk(pathname, nd);
N
Nick Piggin 已提交
2764
	if (unlikely(error))
2765
		goto out;
L
Linus Torvalds 已提交
2766

2767 2768
	error = do_last(nd, &path, file, op, &opened, pathname);
	while (unlikely(error > 0)) { /* trailing symlink */
2769
		struct path link = path;
A
Al Viro 已提交
2770
		void *cookie;
2771
		if (!(nd->flags & LOOKUP_FOLLOW)) {
A
Al Viro 已提交
2772 2773
			path_put_conditional(&path, nd);
			path_put(&nd->path);
2774
			error = -ELOOP;
2775 2776
			break;
		}
A
Al Viro 已提交
2777 2778
		nd->flags |= LOOKUP_PARENT;
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
2779
		error = follow_link(&link, nd, &cookie);
2780
		if (unlikely(error))
2781 2782
			break;
		error = do_last(nd, &path, file, op, &opened, pathname);
2783
		put_link(nd, &link, cookie);
2784
	}
A
Al Viro 已提交
2785
out:
A
Al Viro 已提交
2786 2787
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
		path_put(&nd->root);
2788 2789
	if (base)
		fput(base);
2790 2791
	if (!(opened & FILE_OPENED)) {
		BUG_ON(!error);
A
Al Viro 已提交
2792
		put_filp(file);
2793
	}
2794 2795 2796 2797 2798 2799 2800 2801 2802 2803
	if (unlikely(error)) {
		if (error == -EOPENSTALE) {
			if (flags & LOOKUP_RCU)
				error = -ECHILD;
			else
				error = -ESTALE;
		}
		file = ERR_PTR(error);
	}
	return file;
L
Linus Torvalds 已提交
2804 2805
}

2806 2807 2808
struct file *do_filp_open(int dfd, const char *pathname,
		const struct open_flags *op, int flags)
{
A
Al Viro 已提交
2809
	struct nameidata nd;
2810 2811
	struct file *filp;

A
Al Viro 已提交
2812
	filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
2813
	if (unlikely(filp == ERR_PTR(-ECHILD)))
A
Al Viro 已提交
2814
		filp = path_openat(dfd, pathname, &nd, op, flags);
2815
	if (unlikely(filp == ERR_PTR(-ESTALE)))
A
Al Viro 已提交
2816
		filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
2817 2818 2819
	return filp;
}

A
Al Viro 已提交
2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830
struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
		const char *name, const struct open_flags *op, int flags)
{
	struct nameidata nd;
	struct file *file;

	nd.root.mnt = mnt;
	nd.root.dentry = dentry;

	flags |= LOOKUP_ROOT;

A
Al Viro 已提交
2831
	if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
A
Al Viro 已提交
2832 2833 2834 2835 2836 2837 2838 2839 2840 2841
		return ERR_PTR(-ELOOP);

	file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
	if (unlikely(file == ERR_PTR(-ECHILD)))
		file = path_openat(-1, name, &nd, op, flags);
	if (unlikely(file == ERR_PTR(-ESTALE)))
		file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
	return file;
}

A
Al Viro 已提交
2842
struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir)
L
Linus Torvalds 已提交
2843
{
2844
	struct dentry *dentry = ERR_PTR(-EEXIST);
A
Al Viro 已提交
2845 2846 2847 2848
	struct nameidata nd;
	int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
	if (error)
		return ERR_PTR(error);
L
Linus Torvalds 已提交
2849

2850 2851 2852 2853
	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
A
Al Viro 已提交
2854 2855 2856 2857
	if (nd.last_type != LAST_NORM)
		goto out;
	nd.flags &= ~LOOKUP_PARENT;
	nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
2858 2859 2860 2861

	/*
	 * Do the final lookup.
	 */
A
Al Viro 已提交
2862 2863
	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = lookup_hash(&nd);
L
Linus Torvalds 已提交
2864 2865
	if (IS_ERR(dentry))
		goto fail;
2866

2867 2868
	if (dentry->d_inode)
		goto eexist;
2869 2870 2871 2872 2873 2874
	/*
	 * Special case - lookup gave negative, but... we had foo/bar/
	 * From the vfs_mknod() POV we just have a negative dentry -
	 * all is fine. Let's be bastards - you had / on the end, you've
	 * been asking for (non-existent) directory. -ENOENT for you.
	 */
A
Al Viro 已提交
2875
	if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
2876 2877
		dput(dentry);
		dentry = ERR_PTR(-ENOENT);
A
Al Viro 已提交
2878
		goto fail;
2879
	}
A
Al Viro 已提交
2880
	*path = nd.path;
L
Linus Torvalds 已提交
2881
	return dentry;
2882
eexist:
L
Linus Torvalds 已提交
2883
	dput(dentry);
2884
	dentry = ERR_PTR(-EEXIST);
L
Linus Torvalds 已提交
2885
fail:
A
Al Viro 已提交
2886 2887 2888
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
out:
	path_put(&nd.path);
L
Linus Torvalds 已提交
2889 2890
	return dentry;
}
2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904
EXPORT_SYMBOL(kern_path_create);

/*
 * User-space front end for kern_path_create(): copies @pathname into the
 * kernel, performs the create-lookup and releases the copy again.  The
 * return value is that of kern_path_create(), or the getname() error.
 */
struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
{
	struct dentry *dentry;
	char *kname = getname(pathname);

	if (IS_ERR(kname))
		return ERR_CAST(kname);

	dentry = kern_path_create(dfd, kname, path, is_dir);
	putname(kname);
	return dentry;
}
EXPORT_SYMBOL(user_path_create);

A
Al Viro 已提交
2905
int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
L
Linus Torvalds 已提交
2906
{
2907
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
2908 2909 2910 2911

	if (error)
		return error;

2912
	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
L
Linus Torvalds 已提交
2913 2914
		return -EPERM;

A
Al Viro 已提交
2915
	if (!dir->i_op->mknod)
L
Linus Torvalds 已提交
2916 2917
		return -EPERM;

2918 2919 2920 2921
	error = devcgroup_inode_mknod(mode, dev);
	if (error)
		return error;

L
Linus Torvalds 已提交
2922 2923 2924 2925 2926
	error = security_inode_mknod(dir, dentry, mode, dev);
	if (error)
		return error;

	error = dir->i_op->mknod(dir, dentry, mode, dev);
2927
	if (!error)
2928
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
2929 2930 2931
	return error;
}

A
Al Viro 已提交
2932
/*
 * Validate the file-type bits of @mode for mknod(2)-style creation.
 *
 * Returns 0 for regular files, character/block devices, FIFOs and sockets
 * (a zero type is accepted as well), -EPERM for directories and -EINVAL for
 * every other type.
 */
static int may_mknod(umode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
	case 0: /* zero mode translates to S_IFREG */
		return 0;
	case S_IFDIR:
		return -EPERM;
	default:
		return -EINVAL;
	}
}

A
Al Viro 已提交
2949
/*
 * mknodat(2): create a filesystem node relative to @dfd.
 *
 * Directories are rejected up front with -EPERM.  user_path_create() hands
 * back the new (negative) dentry together with the parent in @path; the
 * exit path below unlocks the parent's i_mutex and drops @path, so the
 * parent is presumably returned locked (the visible tail of
 * kern_path_create() only unlocks on its failure path).
 *
 * The umask is applied only when the parent does not use POSIX ACLs.
 */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned, dev)
{
	struct dentry *dentry;
	struct path path;
	int error;

	if (S_ISDIR(mode))
		return -EPERM;

	dentry = user_path_create(dfd, filename, &path, 0);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = may_mknod(mode);
	if (error)
		goto out_dput;
	error = mnt_want_write(path.mnt);
	if (error)
		goto out_dput;
	error = security_path_mknod(&path, dentry, mode, dev);
	if (error)
		goto out_drop_write;
	/* may_mknod() has already rejected every type not listed here */
	switch (mode & S_IFMT) {
		case 0: case S_IFREG:
			error = vfs_create(path.dentry->d_inode, dentry, mode, true);
			break;
		case S_IFCHR: case S_IFBLK:
			error = vfs_mknod(path.dentry->d_inode, dentry, mode,
					new_decode_dev(dev));
			break;
		case S_IFIFO: case S_IFSOCK:
			error = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
			break;
	}
out_drop_write:
	mnt_drop_write(path.mnt);
out_dput:
	dput(dentry);
	mutex_unlock(&path.dentry->d_inode->i_mutex);
	path_put(&path);

	return error;
}

A
Al Viro 已提交
2996
/* mknod(2): mknodat(2) with the path resolved relative to AT_FDCWD. */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
	return sys_mknodat(AT_FDCWD, filename, mode, dev);
}

3001
/*
 * Create the directory named by @dentry inside @dir.
 *
 * Fails with the may_create() error, with -EPERM when the filesystem has no
 * ->mkdir, with the security module's error, or with -EMLINK when the
 * superblock sets a link limit (s_max_links != 0) and @dir has reached it.
 * @mode is reduced to the permission bits plus the sticky bit before it is
 * passed on.  A successful ->mkdir() is reported through fsnotify_mkdir().
 */
int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int error = may_create(dir, dentry);
	unsigned max_links = dir->i_sb->s_max_links;

	if (error)
		return error;

	if (!dir->i_op->mkdir)
		return -EPERM;

	mode &= (S_IRWXUGO|S_ISVTX);
	error = security_inode_mkdir(dir, dentry, mode);
	if (error)
		return error;

	if (max_links && dir->i_nlink >= max_links)
		return -EMLINK;

	error = dir->i_op->mkdir(dir, dentry, mode);
	if (!error)
		fsnotify_mkdir(dir, dentry);
	return error;
}

3026
/*
 * mkdirat(2): create a directory relative to @dfd.
 *
 * user_path_create() is called with is_dir = 1.  The umask is applied only
 * when the parent does not use POSIX ACLs.  The exit path releases the new
 * dentry, unlocks the parent's i_mutex and drops @path.
 */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
	struct dentry *dentry;
	struct path path;
	int error;

	dentry = user_path_create(dfd, pathname, &path, 1);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = mnt_want_write(path.mnt);
	if (error)
		goto out_dput;
	error = security_path_mkdir(&path, dentry, mode);
	if (error)
		goto out_drop_write;
	error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
out_drop_write:
	mnt_drop_write(path.mnt);
out_dput:
	dput(dentry);
	mutex_unlock(&path.dentry->d_inode->i_mutex);
	path_put(&path);
	return error;
}

3054
/* mkdir(2): mkdirat(2) with the path resolved relative to AT_FDCWD. */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
	return sys_mkdirat(AT_FDCWD, pathname, mode);
}

L
Linus Torvalds 已提交
3059
/*
 * The dentry_unhash() helper will try to drop the dentry early: we
 * should have a usage count of 1 if we're the only user of this
 * dentry, and if that is true (possibly after pruning the dcache),
 * then we drop the dentry now.
 *
 * A low-level filesystem can, if it chooses, legally
 * do a
 *
 *	if (!d_unhashed(dentry))
 *		return -EBUSY;
 *
 * if it cannot handle the case of removing a directory
 * that is still in use by something else..
 */
void dentry_unhash(struct dentry *dentry)
{
	/* prune children first so that they do not pin this dentry */
	shrink_dcache_parent(dentry);
	spin_lock(&dentry->d_lock);
	if (dentry->d_count == 1)
		__d_drop(dentry);
	spin_unlock(&dentry->d_lock);
}

int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	int error = may_delete(dir, dentry, 1);

	if (error)
		return error;

A
Al Viro 已提交
3090
	if (!dir->i_op->rmdir)
L
Linus Torvalds 已提交
3091 3092
		return -EPERM;

3093
	dget(dentry);
3094
	mutex_lock(&dentry->d_inode->i_mutex);
S
Sage Weil 已提交
3095 3096

	error = -EBUSY;
L
Linus Torvalds 已提交
3097
	if (d_mountpoint(dentry))
S
Sage Weil 已提交
3098 3099 3100 3101 3102 3103
		goto out;

	error = security_inode_rmdir(dir, dentry);
	if (error)
		goto out;

3104
	shrink_dcache_parent(dentry);
S
Sage Weil 已提交
3105 3106 3107 3108 3109 3110 3111 3112
	error = dir->i_op->rmdir(dir, dentry);
	if (error)
		goto out;

	dentry->d_inode->i_flags |= S_DEAD;
	dont_mount(dentry);

out:
3113
	mutex_unlock(&dentry->d_inode->i_mutex);
3114
	dput(dentry);
S
Sage Weil 已提交
3115
	if (!error)
L
Linus Torvalds 已提交
3116 3117 3118 3119
		d_delete(dentry);
	return error;
}

3120
/*
 * Common implementation of rmdir(2) and unlinkat(2) with AT_REMOVEDIR.
 *
 * The parent is resolved with user_path_parent().  A final component of
 * "..", "." or the root is refused with -ENOTEMPTY, -EINVAL and -EBUSY
 * respectively.  Otherwise the parent's i_mutex is taken (I_MUTEX_PARENT
 * class), the victim is looked up and, if it exists, removed through
 * vfs_rmdir() with write access to the mount held.
 *
 * The exitN labels undo the setup in reverse order of acquisition.
 */
static long do_rmdir(int dfd, const char __user *pathname)
{
	int error;
	char * name;
	struct dentry *dentry;
	struct nameidata nd;

	error = user_path_parent(dfd, pathname, &nd, &name);
	if (error)
		return error;

	switch(nd.last_type) {
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto exit1;
	case LAST_DOT:
		error = -EINVAL;
		goto exit1;
	case LAST_ROOT:
		error = -EBUSY;
		goto exit1;
	}

	nd.flags &= ~LOOKUP_PARENT;

	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = lookup_hash(&nd);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto exit2;
	if (!dentry->d_inode) {
		error = -ENOENT;
		goto exit3;
	}
	error = mnt_want_write(nd.path.mnt);
	if (error)
		goto exit3;
	error = security_path_rmdir(&nd.path, dentry);
	if (error)
		goto exit4;
	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
exit4:
	mnt_drop_write(nd.path.mnt);
exit3:
	dput(dentry);
exit2:
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
exit1:
	path_put(&nd.path);
	putname(name);
	return error;
}

3173
/* rmdir(2): do_rmdir() with the path resolved relative to AT_FDCWD. */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
	return do_rmdir(AT_FDCWD, pathname);
}

L
Linus Torvalds 已提交
3178 3179 3180 3181 3182 3183 3184
int vfs_unlink(struct inode *dir, struct dentry *dentry)
{
	int error = may_delete(dir, dentry, 0);

	if (error)
		return error;

A
Al Viro 已提交
3185
	if (!dir->i_op->unlink)
L
Linus Torvalds 已提交
3186 3187
		return -EPERM;

3188
	mutex_lock(&dentry->d_inode->i_mutex);
L
Linus Torvalds 已提交
3189 3190 3191 3192
	if (d_mountpoint(dentry))
		error = -EBUSY;
	else {
		error = security_inode_unlink(dir, dentry);
3193
		if (!error) {
L
Linus Torvalds 已提交
3194
			error = dir->i_op->unlink(dir, dentry);
3195
			if (!error)
3196
				dont_mount(dentry);
3197
		}
L
Linus Torvalds 已提交
3198
	}
3199
	mutex_unlock(&dentry->d_inode->i_mutex);
L
Linus Torvalds 已提交
3200 3201 3202

	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
3203
		fsnotify_link_count(dentry->d_inode);
J
John McCutchan 已提交
3204
		d_delete(dentry);
L
Linus Torvalds 已提交
3205
	}
R
Robert Love 已提交
3206

L
Linus Torvalds 已提交
3207 3208 3209 3210 3211
	return error;
}

/*
 * Make sure that the actual truncation of the file will occur outside its
3212
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
L
Linus Torvalds 已提交
3213 3214 3215
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 */
3216
static long do_unlinkat(int dfd, const char __user *pathname)
L
Linus Torvalds 已提交
3217
{
3218 3219
	int error;
	char *name;
L
Linus Torvalds 已提交
3220 3221 3222 3223
	struct dentry *dentry;
	struct nameidata nd;
	struct inode *inode = NULL;

3224
	error = user_path_parent(dfd, pathname, &nd, &name);
L
Linus Torvalds 已提交
3225
	if (error)
3226 3227
		return error;

L
Linus Torvalds 已提交
3228 3229 3230
	error = -EISDIR;
	if (nd.last_type != LAST_NORM)
		goto exit1;
3231 3232 3233

	nd.flags &= ~LOOKUP_PARENT;

3234
	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
3235
	dentry = lookup_hash(&nd);
L
Linus Torvalds 已提交
3236 3237 3238
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
		/* Why not before? Because we want correct error value */
3239 3240
		if (nd.last.name[nd.last.len])
			goto slashes;
L
Linus Torvalds 已提交
3241
		inode = dentry->d_inode;
3242
		if (!inode)
3243 3244
			goto slashes;
		ihold(inode);
3245 3246 3247
		error = mnt_want_write(nd.path.mnt);
		if (error)
			goto exit2;
3248 3249 3250
		error = security_path_unlink(&nd.path, dentry);
		if (error)
			goto exit3;
3251
		error = vfs_unlink(nd.path.dentry->d_inode, dentry);
3252
exit3:
3253
		mnt_drop_write(nd.path.mnt);
L
Linus Torvalds 已提交
3254 3255 3256
	exit2:
		dput(dentry);
	}
3257
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
L
Linus Torvalds 已提交
3258 3259 3260
	if (inode)
		iput(inode);	/* truncate the inode here */
exit1:
J
Jan Blunck 已提交
3261
	path_put(&nd.path);
L
Linus Torvalds 已提交
3262 3263 3264 3265 3266 3267 3268 3269 3270
	putname(name);
	return error;

slashes:
	error = !dentry->d_inode ? -ENOENT :
		S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
	goto exit2;
}

3271
/*
 * unlinkat(2): AT_REMOVEDIR is the only flag accepted (anything else is
 * -EINVAL).  With it the call behaves like rmdir, without it like unlink.
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
	if ((flag & ~AT_REMOVEDIR) != 0)
		return -EINVAL;

	if (flag & AT_REMOVEDIR)
		return do_rmdir(dfd, pathname);

	return do_unlinkat(dfd, pathname);
}

3282
/* unlink(2): do_unlinkat() with the path resolved relative to AT_FDCWD. */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
	return do_unlinkat(AT_FDCWD, pathname);
}

3287
int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
L
Linus Torvalds 已提交
3288
{
3289
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
3290 3291 3292 3293

	if (error)
		return error;

A
Al Viro 已提交
3294
	if (!dir->i_op->symlink)
L
Linus Torvalds 已提交
3295 3296 3297 3298 3299 3300 3301
		return -EPERM;

	error = security_inode_symlink(dir, dentry, oldname);
	if (error)
		return error;

	error = dir->i_op->symlink(dir, dentry, oldname);
3302
	if (!error)
3303
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
3304 3305 3306
	return error;
}

3307 3308
/*
 * symlinkat(2): create the symlink @newname (relative to @newdfd) pointing
 * at the text @oldname.
 *
 * @oldname is only copied in with getname(); it is not looked up here.
 * The exit path releases the new dentry, unlocks the parent's i_mutex,
 * drops @path and finally frees the copied name.
 */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	int error;
	char *from;
	struct dentry *dentry;
	struct path path;

	from = getname(oldname);
	if (IS_ERR(from))
		return PTR_ERR(from);

	dentry = user_path_create(newdfd, newname, &path, 0);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_putname;

	error = mnt_want_write(path.mnt);
	if (error)
		goto out_dput;
	error = security_path_symlink(&path, dentry, from);
	if (error)
		goto out_drop_write;
	error = vfs_symlink(path.dentry->d_inode, dentry, from);
out_drop_write:
	mnt_drop_write(path.mnt);
out_dput:
	dput(dentry);
	mutex_unlock(&path.dentry->d_inode->i_mutex);
	path_put(&path);
out_putname:
	putname(from);
	return error;
}

3342
/* symlink(2): symlinkat(2) with the new name resolved relative to AT_FDCWD. */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
	return sys_symlinkat(oldname, AT_FDCWD, newname);
}

L
Linus Torvalds 已提交
3347 3348 3349
int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
{
	struct inode *inode = old_dentry->d_inode;
3350
	unsigned max_links = dir->i_sb->s_max_links;
L
Linus Torvalds 已提交
3351 3352 3353 3354 3355
	int error;

	if (!inode)
		return -ENOENT;

3356
	error = may_create(dir, new_dentry);
L
Linus Torvalds 已提交
3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367
	if (error)
		return error;

	if (dir->i_sb != inode->i_sb)
		return -EXDEV;

	/*
	 * A link to an append-only or immutable file cannot be created.
	 */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return -EPERM;
A
Al Viro 已提交
3368
	if (!dir->i_op->link)
L
Linus Torvalds 已提交
3369
		return -EPERM;
3370
	if (S_ISDIR(inode->i_mode))
L
Linus Torvalds 已提交
3371 3372 3373 3374 3375 3376
		return -EPERM;

	error = security_inode_link(old_dentry, dir, new_dentry);
	if (error)
		return error;

3377
	mutex_lock(&inode->i_mutex);
3378 3379 3380
	/* Make sure we don't allow creating hardlink to an unlinked file */
	if (inode->i_nlink == 0)
		error =  -ENOENT;
3381 3382
	else if (max_links && inode->i_nlink >= max_links)
		error = -EMLINK;
3383 3384
	else
		error = dir->i_op->link(old_dentry, dir, new_dentry);
3385
	mutex_unlock(&inode->i_mutex);
3386
	if (!error)
3387
		fsnotify_link(dir, inode, new_dentry);
L
Linus Torvalds 已提交
3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399
	return error;
}

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 */
3400 3401
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, int, flags)
L
Linus Torvalds 已提交
3402 3403
{
	struct dentry *new_dentry;
3404
	struct path old_path, new_path;
3405
	int how = 0;
L
Linus Torvalds 已提交
3406 3407
	int error;

3408
	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
3409
		return -EINVAL;
3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422
	/*
	 * To use null names we require CAP_DAC_READ_SEARCH
	 * This ensures that not everyone will be able to create
	 * handlink using the passed filedescriptor.
	 */
	if (flags & AT_EMPTY_PATH) {
		if (!capable(CAP_DAC_READ_SEARCH))
			return -ENOENT;
		how = LOOKUP_EMPTY;
	}

	if (flags & AT_SYMLINK_FOLLOW)
		how |= LOOKUP_FOLLOW;
3423

3424
	error = user_path_at(olddfd, oldname, how, &old_path);
L
Linus Torvalds 已提交
3425
	if (error)
3426 3427
		return error;

3428
	new_dentry = user_path_create(newdfd, newname, &new_path, 0);
L
Linus Torvalds 已提交
3429
	error = PTR_ERR(new_dentry);
3430
	if (IS_ERR(new_dentry))
3431 3432 3433 3434 3435 3436
		goto out;

	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto out_dput;
	error = mnt_want_write(new_path.mnt);
3437 3438
	if (error)
		goto out_dput;
3439
	error = security_path_link(old_path.dentry, &new_path, new_dentry);
3440 3441
	if (error)
		goto out_drop_write;
3442
	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
3443
out_drop_write:
3444
	mnt_drop_write(new_path.mnt);
3445
out_dput:
3446
	dput(new_dentry);
3447 3448
	mutex_unlock(&new_path.dentry->d_inode->i_mutex);
	path_put(&new_path);
L
Linus Torvalds 已提交
3449
out:
3450
	path_put(&old_path);
L
Linus Torvalds 已提交
3451 3452 3453 3454

	return error;
}

3455
/* link(2): linkat(2) with both names relative to AT_FDCWD and no flags. */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}

L
Linus Torvalds 已提交
3460 3461 3462 3463 3464 3465 3466
/*
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
 *	a) we can get into loop creation. Check is done in is_subdir().
 *	b) race potential - two innocent renames can create a loop together.
 *	   That's where 4.4 screws up. Current fix: serialization on
3467
 *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
L
Linus Torvalds 已提交
3468 3469
 *	   story.
 *	c) we have to lock _three_ objects - parents and victim (if it exists).
3470
 *	   And that - after we got ->i_mutex on parents (until then we don't know
L
Linus Torvalds 已提交
3471 3472
 *	   whether the target exists).  Solution: try to be smart with locking
 *	   order for inodes.  We rely on the fact that tree topology may change
3473
 *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
L
Linus Torvalds 已提交
3474 3475 3476
 *	   move will be locked.  Thus we can rank directories by the tree
 *	   (ancestors first) and rank all non-directories after them.
 *	   That works since everybody except rename does "lock parent, lookup,
3477
 *	   lock child" and rename is under ->s_vfs_rename_mutex.
L
Linus Torvalds 已提交
3478 3479 3480
 *	   HOWEVER, it relies on the assumption that any object with ->lookup()
 *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *	   we'd better make sure that there's no link(2) for them.
3481
 *	d) conversion from fhandle to dentry may come in the wrong moment - when
3482
 *	   we are removing the target. Solution: we will have to grab ->i_mutex
L
Linus Torvalds 已提交
3483
 *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
3484
 *	   ->i_mutex on parents, which works but leads to some truly excessive
L
Linus Torvalds 已提交
3485 3486
 *	   locking].
 */
A
Adrian Bunk 已提交
3487 3488
static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
			  struct inode *new_dir, struct dentry *new_dentry)
L
Linus Torvalds 已提交
3489 3490
{
	int error = 0;
S
Sage Weil 已提交
3491
	struct inode *target = new_dentry->d_inode;
3492
	unsigned max_links = new_dir->i_sb->s_max_links;
L
Linus Torvalds 已提交
3493 3494 3495 3496 3497 3498

	/*
	 * If we are going to change the parent - check write permissions,
	 * we'll need to flip '..'.
	 */
	if (new_dir != old_dir) {
3499
		error = inode_permission(old_dentry->d_inode, MAY_WRITE);
L
Linus Torvalds 已提交
3500 3501 3502 3503 3504 3505 3506 3507
		if (error)
			return error;
	}

	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
	if (error)
		return error;

3508
	dget(new_dentry);
3509
	if (target)
3510
		mutex_lock(&target->i_mutex);
S
Sage Weil 已提交
3511 3512 3513 3514 3515

	error = -EBUSY;
	if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
		goto out;

3516 3517 3518 3519 3520
	error = -EMLINK;
	if (max_links && !target && new_dir != old_dir &&
	    new_dir->i_nlink >= max_links)
		goto out;

3521 3522
	if (target)
		shrink_dcache_parent(new_dentry);
S
Sage Weil 已提交
3523 3524 3525 3526
	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
	if (error)
		goto out;

L
Linus Torvalds 已提交
3527
	if (target) {
S
Sage Weil 已提交
3528 3529
		target->i_flags |= S_DEAD;
		dont_mount(new_dentry);
L
Linus Torvalds 已提交
3530
	}
S
Sage Weil 已提交
3531 3532 3533
out:
	if (target)
		mutex_unlock(&target->i_mutex);
3534
	dput(new_dentry);
3535
	if (!error)
3536 3537
		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
			d_move(old_dentry,new_dentry);
L
Linus Torvalds 已提交
3538 3539 3540
	return error;
}

A
Adrian Bunk 已提交
3541 3542
static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
			    struct inode *new_dir, struct dentry *new_dentry)
L
Linus Torvalds 已提交
3543
{
S
Sage Weil 已提交
3544
	struct inode *target = new_dentry->d_inode;
L
Linus Torvalds 已提交
3545 3546 3547 3548 3549 3550 3551 3552
	int error;

	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
	if (error)
		return error;

	dget(new_dentry);
	if (target)
3553
		mutex_lock(&target->i_mutex);
S
Sage Weil 已提交
3554 3555

	error = -EBUSY;
L
Linus Torvalds 已提交
3556
	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
S
Sage Weil 已提交
3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567
		goto out;

	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
	if (error)
		goto out;

	if (target)
		dont_mount(new_dentry);
	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
		d_move(old_dentry, new_dentry);
out:
L
Linus Torvalds 已提交
3568
	if (target)
3569
		mutex_unlock(&target->i_mutex);
L
Linus Torvalds 已提交
3570 3571 3572 3573 3574 3575 3576 3577 3578
	dput(new_dentry);
	return error;
}

int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
	       struct inode *new_dir, struct dentry *new_dentry)
{
	int error;
	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
3579
	const unsigned char *old_name;
L
Linus Torvalds 已提交
3580 3581 3582 3583 3584 3585 3586 3587 3588

	if (old_dentry->d_inode == new_dentry->d_inode)
 		return 0;
 
	error = may_delete(old_dir, old_dentry, is_dir);
	if (error)
		return error;

	if (!new_dentry->d_inode)
3589
		error = may_create(new_dir, new_dentry);
L
Linus Torvalds 已提交
3590 3591 3592 3593 3594
	else
		error = may_delete(new_dir, new_dentry, is_dir);
	if (error)
		return error;

A
Al Viro 已提交
3595
	if (!old_dir->i_op->rename)
L
Linus Torvalds 已提交
3596 3597
		return -EPERM;

R
Robert Love 已提交
3598 3599
	old_name = fsnotify_oldname_init(old_dentry->d_name.name);

L
Linus Torvalds 已提交
3600 3601 3602 3603
	if (is_dir)
		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
	else
		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
3604 3605
	if (!error)
		fsnotify_move(old_dir, new_dir, old_name, is_dir,
3606
			      new_dentry->d_inode, old_dentry);
R
Robert Love 已提交
3607 3608
	fsnotify_oldname_free(old_name);

L
Linus Torvalds 已提交
3609 3610 3611
	return error;
}

3612 3613
/*
 * renameat(2): rename @oldname (relative to @olddfd) to @newname (relative
 * to @newdfd).
 *
 * Both parents are resolved first.  Different mounts give -EXDEV; a final
 * component on either side that is not a normal name gives -EBUSY.  The two
 * parents are then locked with lock_rename(), whose return value ("trap")
 * is compared against both looked-up dentries to reject renames between an
 * ancestor and its descendant.
 *
 * The exitN labels release resources in reverse order of acquisition.
 */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	struct dentry *old_dir, *new_dir;
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
	struct nameidata oldnd, newnd;
	char *from;
	char *to;
	int error;

	error = user_path_parent(olddfd, oldname, &oldnd, &from);
	if (error)
		goto exit;

	error = user_path_parent(newdfd, newname, &newnd, &to);
	if (error)
		goto exit1;

	error = -EXDEV;
	if (oldnd.path.mnt != newnd.path.mnt)
		goto exit2;

	old_dir = oldnd.path.dentry;
	error = -EBUSY;
	if (oldnd.last_type != LAST_NORM)
		goto exit2;

	new_dir = newnd.path.dentry;
	if (newnd.last_type != LAST_NORM)
		goto exit2;

	oldnd.flags &= ~LOOKUP_PARENT;
	newnd.flags &= ~LOOKUP_PARENT;
	newnd.flags |= LOOKUP_RENAME_TARGET;

	trap = lock_rename(new_dir, old_dir);

	old_dentry = lookup_hash(&oldnd);
	error = PTR_ERR(old_dentry);
	if (IS_ERR(old_dentry))
		goto exit3;
	/* source must exist */
	error = -ENOENT;
	if (!old_dentry->d_inode)
		goto exit4;
	/* unless the source is a directory trailing slashes give -ENOTDIR */
	if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
		error = -ENOTDIR;
		if (oldnd.last.name[oldnd.last.len])
			goto exit4;
		if (newnd.last.name[newnd.last.len])
			goto exit4;
	}
	/* source should not be ancestor of target */
	error = -EINVAL;
	if (old_dentry == trap)
		goto exit4;
	new_dentry = lookup_hash(&newnd);
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto exit4;
	/* target should not be an ancestor of source */
	error = -ENOTEMPTY;
	if (new_dentry == trap)
		goto exit5;

	error = mnt_want_write(oldnd.path.mnt);
	if (error)
		goto exit5;
	error = security_path_rename(&oldnd.path, old_dentry,
				     &newnd.path, new_dentry);
	if (error)
		goto exit6;
	error = vfs_rename(old_dir->d_inode, old_dentry,
				   new_dir->d_inode, new_dentry);
exit6:
	mnt_drop_write(oldnd.path.mnt);
exit5:
	dput(new_dentry);
exit4:
	dput(old_dentry);
exit3:
	unlock_rename(new_dir, old_dir);
exit2:
	path_put(&newnd.path);
	putname(to);
exit1:
	path_put(&oldnd.path);
	putname(from);
exit:
	return error;
}

3706
/* rename(2): renameat(2) with both names resolved relative to AT_FDCWD. */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
}

L
Linus Torvalds 已提交
3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735
/*
 * Copy the link text @link into the user buffer @buffer.
 *
 * @link may be an error pointer, in which case its error code is returned
 * unchanged.  Otherwise at most @buflen bytes of the string are copied (no
 * terminating NUL is written) and the number of bytes copied is returned,
 * or -EFAULT when the copy to user space fails.  @dentry is not used here.
 */
int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
{
	int count;

	if (IS_ERR(link))
		return PTR_ERR(link);

	count = strlen(link);
	if (count > (unsigned) buflen)
		count = buflen;

	return copy_to_user(buffer, link, count) ? -EFAULT : count;
}

/*
 * A helper for ->readlink().  This should be used *ONLY* for symlinks that
 * have ->follow_link() touching nd only in nd_set_link().  Using (or not
 * using) it for any given inode is up to filesystem.
 *
 * A scratch nameidata with only ->depth initialised is handed to
 * ->follow_link(); the link text is then fetched with nd_get_link() and
 * copied out by vfs_readlink().  ->put_link(), when the inode provides it,
 * is called with the cookie returned by ->follow_link().
 */
int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	struct nameidata nd;
	void *cookie;
	int res;

	nd.depth = 0;
	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
	if (IS_ERR(cookie))
		return PTR_ERR(cookie);

	res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
	if (dentry->d_inode->i_op->put_link)
		dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
	return res;
}

/* Exported entry point that simply forwards to __vfs_follow_link(). */
int vfs_follow_link(struct nameidata *nd, const char *link)
{
	int res = __vfs_follow_link(nd, link);

	return res;
}

/* get the link contents into pagecache */
static char *page_getlink(struct dentry * dentry, struct page **ppage)
{
3758 3759
	char *kaddr;
	struct page *page;
L
Linus Torvalds 已提交
3760
	struct address_space *mapping = dentry->d_inode->i_mapping;
3761
	page = read_mapping_page(mapping, 0, NULL);
L
Linus Torvalds 已提交
3762
	if (IS_ERR(page))
3763
		return (char*)page;
L
Linus Torvalds 已提交
3764
	*ppage = page;
3765 3766 3767
	kaddr = kmap(page);
	nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
	return kaddr;
L
Linus Torvalds 已提交
3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781
}

/*
 * ->readlink() for symlinks whose text lives in the page cache: fetch the
 * text with page_getlink(), copy it out with vfs_readlink() (which also
 * copes with an error pointer) and release the page if one was obtained.
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	struct page *pg = NULL;
	char *text;
	int ret;

	text = page_getlink(dentry, &pg);
	ret = vfs_readlink(dentry, buffer, buflen, text);
	if (!pg)
		return ret;

	kunmap(pg);
	page_cache_release(pg);
	return ret;
}

3782
void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
L
Linus Torvalds 已提交
3783
{
3784
	struct page *page = NULL;
L
Linus Torvalds 已提交
3785
	nd_set_link(nd, page_getlink(dentry, &page));
3786
	return page;
L
Linus Torvalds 已提交
3787 3788
}

3789
/*
 * ->put_link() counterpart of page_follow_link_light(): @cookie is the page
 * returned there.  A non-NULL page is unmapped and released; a NULL cookie
 * is a no-op.
 */
void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *page = cookie;

	if (page) {
		kunmap(page);
		page_cache_release(page);
	}
}

3799 3800 3801 3802
/*
 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
 */
int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
L
Linus Torvalds 已提交
3803 3804
{
	struct address_space *mapping = inode->i_mapping;
3805
	struct page *page;
3806
	void *fsdata;
3807
	int err;
L
Linus Torvalds 已提交
3808
	char *kaddr;
3809 3810 3811
	unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
	if (nofs)
		flags |= AOP_FLAG_NOFS;
L
Linus Torvalds 已提交
3812

3813
retry:
3814
	err = pagecache_write_begin(NULL, mapping, 0, len-1,
3815
				flags, &page, &fsdata);
L
Linus Torvalds 已提交
3816
	if (err)
3817 3818
		goto fail;

3819
	kaddr = kmap_atomic(page);
L
Linus Torvalds 已提交
3820
	memcpy(kaddr, symname, len-1);
3821
	kunmap_atomic(kaddr);
3822 3823 3824

	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
							page, fsdata);
L
Linus Torvalds 已提交
3825 3826
	if (err < 0)
		goto fail;
3827 3828 3829
	if (err < len-1)
		goto retry;

L
Linus Torvalds 已提交
3830 3831 3832 3833 3834 3835
	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}

3836 3837 3838
int page_symlink(struct inode *inode, const char *symname, int len)
{
	return __page_symlink(inode, symname, len,
3839
			!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
3840 3841
}

3842
const struct inode_operations page_symlink_inode_operations = {
L
Linus Torvalds 已提交
3843 3844 3845 3846 3847
	.readlink	= generic_readlink,
	.follow_link	= page_follow_link_light,
	.put_link	= page_put_link,
};

3848
/* Symbols from this file made available to modules. */
EXPORT_SYMBOL(user_path_at);
EXPORT_SYMBOL(follow_down_one);
EXPORT_SYMBOL(follow_down);
EXPORT_SYMBOL(follow_up);
EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
EXPORT_SYMBOL(getname);
EXPORT_SYMBOL(lock_rename);
EXPORT_SYMBOL(lookup_one_len);
EXPORT_SYMBOL(page_follow_link_light);
EXPORT_SYMBOL(page_put_link);
EXPORT_SYMBOL(page_readlink);
EXPORT_SYMBOL(__page_symlink);
EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);
EXPORT_SYMBOL(kern_path);
EXPORT_SYMBOL(vfs_path_lookup);
EXPORT_SYMBOL(inode_permission);
EXPORT_SYMBOL(unlock_rename);
EXPORT_SYMBOL(vfs_create);
EXPORT_SYMBOL(vfs_follow_link);
EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
EXPORT_SYMBOL(generic_permission);
EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(vfs_rename);
EXPORT_SYMBOL(vfs_rmdir);
EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_unlink);
EXPORT_SYMBOL(dentry_unhash);
EXPORT_SYMBOL(generic_readlink);