namei.c 119.8 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
18
#include <linux/export.h>
19
#include <linux/kernel.h>
L
Linus Torvalds 已提交
20 21 22 23
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
R
Robert Love 已提交
24
#include <linux/fsnotify.h>
L
Linus Torvalds 已提交
25 26
#include <linux/personality.h>
#include <linux/security.h>
M
Mimi Zohar 已提交
27
#include <linux/ima.h>
L
Linus Torvalds 已提交
28 29 30
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
31
#include <linux/capability.h>
32
#include <linux/file.h>
33
#include <linux/fcntl.h>
34
#include <linux/device_cgroup.h>
35
#include <linux/fs_struct.h>
36
#include <linux/posix_acl.h>
37
#include <linux/hash.h>
38
#include <linux/bitops.h>
L
Linus Torvalds 已提交
39 40
#include <asm/uaccess.h>

41
#include "internal.h"
42
#include "mount.h"
43

L
Linus Torvalds 已提交
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
L
Lucas De Marchi 已提交
78
 * the name is a symlink pointing to a non-existent name.
L
Linus Torvalds 已提交
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOs, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
111
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
L
Linus Torvalds 已提交
112 113 114 115 116 117 118 119 120 121
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
122

A
Al Viro 已提交
123
/*
 * Number of pathname bytes available after the struct filename header
 * when the header and the name share a single PATH_MAX-sized buffer.
 */
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
124

125
struct filename *
126 127
getname_flags(const char __user *filename, int flags, int *empty)
{
A
Al Viro 已提交
128
	struct filename *result;
129
	char *kname;
A
Al Viro 已提交
130
	int len;
131

132 133 134 135
	result = audit_reusename(filename);
	if (result)
		return result;

136
	result = __getname();
137
	if (unlikely(!result))
138 139
		return ERR_PTR(-ENOMEM);

140 141 142 143
	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
A
Al Viro 已提交
144
	kname = (char *)result->iname;
145
	result->name = kname;
146

A
Al Viro 已提交
147
	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
148
	if (unlikely(len < 0)) {
A
Al Viro 已提交
149 150
		__putname(result);
		return ERR_PTR(len);
151
	}
152

153 154 155 156 157 158
	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
A
Al Viro 已提交
159
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
A
Al Viro 已提交
160
		const size_t size = offsetof(struct filename, iname[1]);
161 162
		kname = (char *)result;

A
Al Viro 已提交
163 164 165 166 167 168
		/*
		 * size is chosen that way we to guarantee that
		 * result->iname[0] is within the same object and that
		 * kname can't be equal to result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
A
Al Viro 已提交
169 170 171
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
172 173
		}
		result->name = kname;
A
Al Viro 已提交
174 175 176 177 178 179 180 181 182 183 184
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
185 186
	}

A
Al Viro 已提交
187
	result->refcnt = 1;
188 189 190
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
191
			*empty = 1;
A
Al Viro 已提交
192 193 194 195
		if (!(flags & LOOKUP_EMPTY)) {
			putname(result);
			return ERR_PTR(-ENOENT);
		}
L
Linus Torvalds 已提交
196
	}
197

198
	result->uptr = filename;
199
	result->aname = NULL;
200 201
	audit_getname(result);
	return result;
L
Linus Torvalds 已提交
202 203
}

204 205
/*
 * getname - copy a pathname from userspace with default behaviour
 * @filename: userspace pointer to the pathname
 *
 * Equivalent to getname_flags() with no flags and no "empty" reporting,
 * so an empty pathname yields ERR_PTR(-ENOENT).
 */
struct filename *getname(const char __user *filename)
{
	return getname_flags(filename, 0, NULL);
}

210 211 212 213
struct filename *
getname_kernel(const char * filename)
{
	struct filename *result;
214
	int len = strlen(filename) + 1;
215 216 217 218 219

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

220
	if (len <= EMBEDDED_NAME_MAX) {
A
Al Viro 已提交
221
		result->name = (char *)result->iname;
222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
	} else if (len <= PATH_MAX) {
		struct filename *tmp;

		tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
237 238
	result->uptr = NULL;
	result->aname = NULL;
239
	result->refcnt = 1;
240
	audit_getname(result);
241 242 243 244

	return result;
}

245
void putname(struct filename *name)
L
Linus Torvalds 已提交
246
{
247 248 249 250 251
	BUG_ON(name->refcnt <= 0);

	if (--name->refcnt > 0)
		return;

A
Al Viro 已提交
252
	if (name->name != name->iname) {
253 254 255 256
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
L
Linus Torvalds 已提交
257 258
}

259 260
static int check_acl(struct inode *inode, int mask)
{
261
#ifdef CONFIG_FS_POSIX_ACL
262 263 264
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
265 266
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
267
	                return -EAGAIN;
268
		/* no ->get_acl() calls in RCU mode... */
269
		if (is_uncached_acl(acl))
270
			return -ECHILD;
A
Ari Savolainen 已提交
271
	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
272 273
	}

C
Christoph Hellwig 已提交
274 275 276
	acl = get_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
277 278 279 280 281
	if (acl) {
	        int error = posix_acl_permission(inode, acl, mask);
	        posix_acl_release(acl);
	        return error;
	}
282
#endif
283 284 285 286

	return -EAGAIN;
}

287
/*
288
 * This does the basic permission checking
L
Linus Torvalds 已提交
289
 */
290
static int acl_permission_check(struct inode *inode, int mask)
L
Linus Torvalds 已提交
291
{
292
	unsigned int mode = inode->i_mode;
L
Linus Torvalds 已提交
293

294
	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
L
Linus Torvalds 已提交
295 296
		mode >>= 6;
	else {
297
		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
298
			int error = check_acl(inode, mask);
299 300
			if (error != -EAGAIN)
				return error;
L
Linus Torvalds 已提交
301 302 303 304 305 306 307 308 309
		}

		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/*
	 * If the DACs are ok we don't need any capability check.
	 */
310
	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
L
Linus Torvalds 已提交
311
		return 0;
312 313 314 315
	return -EACCES;
}

/**
316
 * generic_permission -  check for access rights on a Posix-like filesystem
317
 * @inode:	inode to check access rights for
318
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
319 320 321 322
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
323 324 325 326 327
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
328
 */
329
int generic_permission(struct inode *inode, int mask)
330 331 332 333
{
	int ret;

	/*
334
	 * Do the basic permission checks.
335
	 */
336
	ret = acl_permission_check(inode, mask);
337 338
	if (ret != -EACCES)
		return ret;
L
Linus Torvalds 已提交
339

340 341
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
342
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
343 344
			return 0;
		if (!(mask & MAY_WRITE))
345 346
			if (capable_wrt_inode_uidgid(inode,
						     CAP_DAC_READ_SEARCH))
347 348 349
				return 0;
		return -EACCES;
	}
L
Linus Torvalds 已提交
350 351
	/*
	 * Read/write DACs are always overridable.
352 353
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
L
Linus Torvalds 已提交
354
	 */
355
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
356
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
L
Linus Torvalds 已提交
357 358 359 360 361
			return 0;

	/*
	 * Searching includes executable on directories, else just read.
	 */
362
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
363
	if (mask == MAY_READ)
364
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
L
Linus Torvalds 已提交
365 366 367 368
			return 0;

	return -EACCES;
}
369
EXPORT_SYMBOL(generic_permission);
L
Linus Torvalds 已提交
370

371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390
/*
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has no special
 * permission function, use the fast case".
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	/* Fast case: already known to have no ->permission() method. */
	if (likely(inode->i_opflags & IOP_FASTPERM))
		return generic_permission(inode, mask);

	if (likely(inode->i_op->permission))
		return inode->i_op->permission(inode, mask);

	/* This gets set once for the inode lifetime */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_FASTPERM;
	spin_unlock(&inode->i_lock);

	return generic_permission(inode, mask);
}

C
Christoph Hellwig 已提交
391
/**
D
David Howells 已提交
392 393 394
 * __inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
C
Christoph Hellwig 已提交
395
 *
D
David Howells 已提交
396
 * Check for read/write/execute permissions on an inode.
397 398
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
D
David Howells 已提交
399 400 401
 *
 * This does not check for a read-only file system.  You probably want
 * inode_permission().
C
Christoph Hellwig 已提交
402
 */
D
David Howells 已提交
403
int __inode_permission(struct inode *inode, int mask)
L
Linus Torvalds 已提交
404
{
405
	int retval;
L
Linus Torvalds 已提交
406

407
	if (unlikely(mask & MAY_WRITE)) {
L
Linus Torvalds 已提交
408 409 410 411 412 413 414
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EACCES;
	}

415
	retval = do_inode_permission(inode, mask);
L
Linus Torvalds 已提交
416 417 418
	if (retval)
		return retval;

419 420 421 422
	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

423
	return security_inode_permission(inode, mask);
L
Linus Torvalds 已提交
424
}
425
EXPORT_SYMBOL(__inode_permission);
L
Linus Torvalds 已提交
426

D
David Howells 已提交
427 428 429
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 *
 * Returns -EROFS for a write request to a regular file, directory or
 * symlink on a superblock with MS_RDONLY set, and 0 otherwise.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	umode_t mode;

	if (likely(!(mask & MAY_WRITE)))
		return 0;

	if (!(sb->s_flags & MS_RDONLY))
		return 0;

	/* Nobody gets write access to a read-only fs. */
	mode = inode->i_mode;
	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
		return -EROFS;

	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 *
 * The superblock-level check runs first; its error, if any, is returned
 * without consulting __inode_permission().
 */
int inode_permission(struct inode *inode, int mask)
{
	int sb_err = sb_permission(inode->i_sb, inode, mask);

	return sb_err ? sb_err : __inode_permission(inode, mask);
}
EXPORT_SYMBOL(inode_permission);
D
David Howells 已提交
469

J
Jan Blunck 已提交
470 471 472 473 474 475
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
476
void path_get(const struct path *path)
J
Jan Blunck 已提交
477 478 479 480 481 482
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

J
Jan Blunck 已提交
483 484 485 486 487 488
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
489
void path_put(const struct path *path)
L
Linus Torvalds 已提交
490
{
J
Jan Blunck 已提交
491 492
	dput(path->dentry);
	mntput(path->mnt);
L
Linus Torvalds 已提交
493
}
J
Jan Blunck 已提交
494
EXPORT_SYMBOL(path_put);
L
Linus Torvalds 已提交
495

496
#define EMBEDDED_LEVELS 2
497 498
struct nameidata {
	struct path	path;
A
Al Viro 已提交
499
	struct qstr	last;
500 501 502
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags;
503
	unsigned	seq, m_seq;
504 505
	int		last_type;
	unsigned	depth;
506
	int		total_link_count;
507 508
	struct saved {
		struct path link;
509
		struct delayed_call done;
510
		const char *name;
511
		unsigned seq;
512
	} *stack, internal[EMBEDDED_LEVELS];
513 514
	struct filename	*name;
	struct nameidata *saved;
515
	struct inode	*link_inode;
516 517
	unsigned	root_seq;
	int		dfd;
518 519
};

520
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
521
{
522 523
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;
524 525
	p->dfd = dfd;
	p->name = name;
526
	p->total_link_count = old ? old->total_link_count : 0;
527
	p->saved = old;
528
	current->nameidata = p;
529 530
}

531
static void restore_nameidata(void)
532
{
533
	struct nameidata *now = current->nameidata, *old = now->saved;
534 535 536 537

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
538
	if (now->stack != now->internal)
539
		kfree(now->stack);
540 541 542 543
}

static int __nd_alloc_stack(struct nameidata *nd)
{
A
Al Viro 已提交
544 545 546 547 548 549 550 551 552
	struct saved *p;

	if (nd->flags & LOOKUP_RCU) {
		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
				  GFP_ATOMIC);
		if (unlikely(!p))
			return -ECHILD;
	} else {
		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
553
				  GFP_KERNEL);
A
Al Viro 已提交
554 555 556
		if (unlikely(!p))
			return -ENOMEM;
	}
557 558 559 560 561
	memcpy(p, nd->internal, sizeof(nd->internal));
	nd->stack = p;
	return 0;
}

562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579
/**
 * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
 * @path: path to verify
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 *
 * Returns true when the mount's root is the root of its superblock, or
 * when is_subdir() reports path->dentry as being below the mount's root.
 */
static bool path_connected(const struct path *path)
{
	struct vfsmount *mnt = path->mnt;

	/* Only bind mounts can have disconnected paths */
	return mnt->mnt_root == mnt->mnt_sb->s_root ||
	       is_subdir(path->dentry, mnt->mnt_root);
}

580 581
static inline int nd_alloc_stack(struct nameidata *nd)
{
582
	if (likely(nd->depth != EMBEDDED_LEVELS))
583 584 585 586 587 588
		return 0;
	if (likely(nd->stack != nd->internal))
		return 0;
	return __nd_alloc_stack(nd);
}

589 590 591 592 593
/*
 * Run and then clear the delayed call of every symlink stack entry in
 * use, from the most recently pushed entry down to the first one.
 * nd->depth itself is left untouched.
 */
static void drop_links(struct nameidata *nd)
{
	int i;

	for (i = (int)nd->depth - 1; i >= 0; i--) {
		struct saved *entry = &nd->stack[i];

		do_delayed_call(&entry->done);
		clear_delayed_call(&entry->done);
	}
}

/*
 * End a path walk and release what it holds.
 *
 * In rcu-walk mode nothing is reference counted, so the function only
 * leaves RCU mode and forgets the root (unless LOOKUP_ROOT is set).  In
 * ref-walk mode the references on nd->path, on every stacked symlink and
 * on nd->root (again unless LOOKUP_ROOT is set) are dropped.  In both
 * modes the delayed calls are run first and nd->depth ends up as 0.
 */
static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);

	if (nd->flags & LOOKUP_RCU) {
		nd->flags &= ~LOOKUP_RCU;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		rcu_read_unlock();
	} else {
		int i;

		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
		if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
			path_put(&nd->root);
			nd->root.mnt = NULL;
		}
	}

	nd->depth = 0;
}

/*
 * Try to turn the rcu-walk pair @path into a properly referenced one and
 * check that path->dentry->d_seq still matches @seq.
 *
 * path_put is needed afterwards regardless of success or failure; the
 * members for which no reference could be obtained are set to NULL so
 * that such a path_put() only drops what was actually acquired.
 */
static bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
	int mnt_res = __legitimize_mnt(path->mnt, nd->m_seq);

	if (unlikely(mnt_res)) {
		/* A positive result additionally clears the mount. */
		if (mnt_res > 0)
			path->mnt = NULL;
		goto no_dentry;
	}

	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref)))
		goto no_dentry;

	return !read_seqcount_retry(&path->dentry->d_seq, seq);

no_dentry:
	path->dentry = NULL;
	return false;
}

/*
 * Legitimize every symlink on the stack, oldest first.  On the first
 * failure the delayed calls of all entries are run and nd->depth is
 * trimmed to cover the entries processed so far, the failing one
 * included.  Returns true only if every entry was legitimized.
 */
static bool legitimize_links(struct nameidata *nd)
{
	int i;

	for (i = 0; i < nd->depth; i++) {
		struct saved *entry = &nd->stack[i];

		if (likely(legitimize_path(nd, &entry->link, entry->seq)))
			continue;

		drop_links(nd);
		nd->depth = i + 1;
		return false;
	}

	return true;
}

A
Al Viro 已提交
652
/*
N
Nick Piggin 已提交
653
 * Path walking has 2 modes, rcu-walk and ref-walk (see
A
Al Viro 已提交
654 655
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
M
Mike Marshall 已提交
656
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
A
Al Viro 已提交
657 658 659 660
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
N
Nick Piggin 已提交
661 662 663
 */

/**
A
Al Viro 已提交
664 665 666
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry or NULL
667
 * @seq: seq number to check dentry against
668
 * Returns: 0 on success, -ECHILD on failure
N
Nick Piggin 已提交
669
 *
A
Al Viro 已提交
670 671 672
 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd or NULL.  Must be called from rcu-walk context.
673 674
 * Nothing should touch nameidata between unlazy_walk() failure and
 * terminate_walk().
N
Nick Piggin 已提交
675
 */
676
static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq)
N
Nick Piggin 已提交
677 678 679 680
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
681 682

	nd->flags &= ~LOOKUP_RCU;
683 684 685 686 687 688
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
		goto out2;
	if (unlikely(!lockref_get_not_dead(&parent->d_lockref)))
		goto out1;
A
Al Viro 已提交
689

690 691 692 693 694 695 696 697 698 699 700
	/*
	 * For a negative lookup, the lookup sequence point is the parents
	 * sequence point, and it only needs to revalidate the parent dentry.
	 *
	 * For a positive lookup, we need to move both the parent and the
	 * dentry from the RCU domain to be properly refcounted. And the
	 * sequence number in the dentry validates *both* dentry counters,
	 * since we checked the sequence number of the parent after we got
	 * the child sequence number. So we know the parent must still
	 * be valid if the child sequence number is still valid.
	 */
A
Al Viro 已提交
701
	if (!dentry) {
702 703
		if (read_seqcount_retry(&parent->d_seq, nd->seq))
			goto out;
A
Al Viro 已提交
704 705
		BUG_ON(nd->inode != parent->d_inode);
	} else {
706 707
		if (!lockref_get_not_dead(&dentry->d_lockref))
			goto out;
708
		if (read_seqcount_retry(&dentry->d_seq, seq))
709
			goto drop_dentry;
A
Al Viro 已提交
710
	}
711 712 713 714 715 716

	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
717 718 719 720
		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
			rcu_read_unlock();
			dput(dentry);
			return -ECHILD;
721
		}
N
Nick Piggin 已提交
722 723
	}

A
Al Viro 已提交
724
	rcu_read_unlock();
N
Nick Piggin 已提交
725
	return 0;
A
Al Viro 已提交
726

727
drop_dentry:
A
Al Viro 已提交
728
	rcu_read_unlock();
729
	dput(dentry);
730
	goto drop_root_mnt;
731 732 733 734
out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
735
out:
A
Al Viro 已提交
736
	rcu_read_unlock();
737 738 739
drop_root_mnt:
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
N
Nick Piggin 已提交
740 741 742
	return -ECHILD;
}

743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760
/*
 * unlazy_link - leave rcu-walk mode while also pinning a symlink
 * @nd: nameidata pathwalk data
 * @link: path of the symlink to legitimize
 * @seq: sequence number to check link->dentry against
 *
 * Legitimizes @link and then switches the walk itself to ref-walk mode
 * via unlazy_walk().  If @link cannot be legitimized, @nd is reset by
 * hand (links dropped, path and, unless LOOKUP_ROOT is set, root
 * forgotten) and RCU mode is left.
 *
 * Returns 0 on success.  On any failure the reference state of @link is
 * released with path_put() and -ECHILD is returned.
 */
static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq)
{
	if (unlikely(!legitimize_path(nd, link, seq))) {
		drop_links(nd);
		nd->depth = 0;
		nd->flags &= ~LOOKUP_RCU;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		rcu_read_unlock();
	} else if (likely(unlazy_walk(nd, NULL, 0) == 0)) {
		/*
		 * The comparison belongs inside likely(): the expected
		 * outcome is unlazy_walk() returning 0.
		 */
		return 0;
	}
	path_put(link);
	return -ECHILD;
}

761
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
762
{
763
	return dentry->d_op->d_revalidate(dentry, flags);
764 765
}

766 767 768
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
769
 *
770 771 772 773 774
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
775
 */
776
static int complete_walk(struct nameidata *nd)
777
{
A
Al Viro 已提交
778
	struct dentry *dentry = nd->path.dentry;
779 780
	int status;

781 782 783
	if (nd->flags & LOOKUP_RCU) {
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
784
		if (unlikely(unlazy_walk(nd, NULL, 0)))
785 786 787
			return -ECHILD;
	}

A
Al Viro 已提交
788 789 790
	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

791
	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
792 793
		return 0;

794
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
795 796 797
	if (status > 0)
		return 0;

A
Al Viro 已提交
798
	if (!status)
799
		status = -ESTALE;
A
Al Viro 已提交
800

801 802 803
	return status;
}

A
Al Viro 已提交
804
static void set_root(struct nameidata *nd)
N
Nick Piggin 已提交
805
{
806
	struct fs_struct *fs = current->fs;
N
Nick Piggin 已提交
807

808 809 810 811 812 813 814 815 816 817 818
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
	}
N
Nick Piggin 已提交
819 820
}

J
Jan Blunck 已提交
821
static void path_put_conditional(struct path *path, struct nameidata *nd)
822 823
{
	dput(path->dentry);
824
	if (path->mnt != nd->path.mnt)
825 826 827
		mntput(path->mnt);
}

828 829
static inline void path_to_nameidata(const struct path *path,
					struct nameidata *nd)
830
{
N
Nick Piggin 已提交
831 832 833 834
	if (!(nd->flags & LOOKUP_RCU)) {
		dput(nd->path.dentry);
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);
835
	}
N
Nick Piggin 已提交
836
	nd->path.mnt = path->mnt;
837
	nd->path.dentry = path->dentry;
838 839
}

840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859
/*
 * Move the walk to nd->root and flag the jump with LOOKUP_JUMPED.
 *
 * In ref-walk mode the reference on the old position is dropped and one
 * is taken on the root.  In rcu-walk mode the position is switched
 * without references and the root dentry is checked against
 * nd->root_seq; -ECHILD is returned if it changed.  Returns 0 otherwise.
 */
static int nd_jump_root(struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU)) {
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	} else {
		struct dentry *root_dentry;

		nd->path = nd->root;
		root_dentry = nd->path.dentry;
		nd->inode = root_dentry->d_inode;
		nd->seq = nd->root_seq;
		if (unlikely(read_seqcount_retry(&root_dentry->d_seq, nd->seq)))
			return -ECHILD;
	}

	nd->flags |= LOOKUP_JUMPED;
	return 0;
}

C
Christoph Hellwig 已提交
860
/*
861
 * Helper to directly jump to a known parsed path from ->get_link,
C
Christoph Hellwig 已提交
862 863
 * caller must have taken a reference to path beforehand.
 */
864
void nd_jump_link(struct path *path)
C
Christoph Hellwig 已提交
865
{
866
	struct nameidata *nd = current->nameidata;
C
Christoph Hellwig 已提交
867 868 869 870 871 872 873
	path_put(&nd->path);

	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
	nd->flags |= LOOKUP_JUMPED;
}

874
static inline void put_link(struct nameidata *nd)
875
{
A
Al Viro 已提交
876
	struct saved *last = nd->stack + --nd->depth;
877
	do_delayed_call(&last->done);
A
Al Viro 已提交
878 879
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
880 881
}

882 883
/*
 * Link-protection switches, 0 by default.  While the respective value
 * is zero, may_follow_link() and may_linkat() return 0 without doing any
 * further checks.
 */
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
K
Kees Cook 已提交
884 885 886

/**
 * may_follow_link - Check symlink following for unsafe situations
887
 * @nd: nameidata pathwalk data
K
Kees Cook 已提交
888 889 890 891 892 893 894 895 896 897 898 899
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
A
Al Viro 已提交
900
static inline int may_follow_link(struct nameidata *nd)
K
Kees Cook 已提交
901 902 903 904 905 906 907 908
{
	const struct inode *inode;
	const struct inode *parent;

	if (!sysctl_protected_symlinks)
		return 0;

	/* Allowed if owner and follower match. */
909
	inode = nd->link_inode;
910
	if (uid_eq(current_cred()->fsuid, inode->i_uid))
K
Kees Cook 已提交
911 912 913
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
A
Al Viro 已提交
914
	parent = nd->inode;
K
Kees Cook 已提交
915 916 917 918
	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
		return 0;

	/* Allowed if parent directory and link owner match. */
919
	if (uid_eq(parent->i_uid, inode->i_uid))
K
Kees Cook 已提交
920 921
		return 0;

922 923 924
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

A
Al Viro 已提交
925
	audit_log_link_denied("follow_link", &nd->stack[0].link);
K
Kees Cook 已提交
926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971
	return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @inode: the source inode to hardlink from
 *
 * Return false if at least one of the following conditions:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct inode *inode)
{
	const umode_t sgid_exec = S_ISGID | S_IXGRP;
	umode_t mode = inode->i_mode;

	/*
	 * Special files, setuid files and executable setgid files should
	 * not get pinned to the filesystem.
	 */
	if (!S_ISREG(mode) ||
	    (mode & S_ISUID) ||
	    (mode & sgid_exec) == sgid_exec)
		return false;

	/* Hardlinking to unreadable or unwritable sources is dangerous. */
	return inode_permission(inode, MAY_READ | MAY_WRITE) == 0;
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
972
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
K
Kees Cook 已提交
973 974 975 976 977 978 979 980 981 982 983 984 985 986 987
 *
 * Returns 0 if successful, -ve on error.
 */
static int may_linkat(struct path *link)
{
	struct inode *inode;

	if (!sysctl_protected_hardlinks)
		return 0;

	inode = link->dentry->d_inode;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
988
	if (inode_owner_or_capable(inode) || safe_hardlink_source(inode))
K
Kees Cook 已提交
989 990
		return 0;

991
	audit_log_link_denied("linkat", link);
K
Kees Cook 已提交
992 993 994
	return -EPERM;
}

995 996
static __always_inline
const char *get_link(struct nameidata *nd)
L
Linus Torvalds 已提交
997
{
998
	struct saved *last = nd->stack + nd->depth - 1;
A
Al Viro 已提交
999
	struct dentry *dentry = last->link.dentry;
1000
	struct inode *inode = nd->link_inode;
1001
	int error;
1002
	const char *res;
L
Linus Torvalds 已提交
1003

1004 1005 1006 1007
	if (!(nd->flags & LOOKUP_RCU)) {
		touch_atime(&last->link);
		cond_resched();
	} else if (atime_needs_update(&last->link, inode)) {
A
Al Viro 已提交
1008 1009
		if (unlikely(unlazy_walk(nd, NULL, 0)))
			return ERR_PTR(-ECHILD);
1010
		touch_atime(&last->link);
A
Al Viro 已提交
1011
	}
1012

1013 1014 1015
	error = security_inode_follow_link(dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
1016
		return ERR_PTR(error);
1017

1018
	nd->last_type = LAST_BIND;
1019 1020
	res = inode->i_link;
	if (!res) {
1021 1022 1023
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
1024
		if (nd->flags & LOOKUP_RCU) {
1025
			res = get(NULL, inode, &last->done);
1026 1027 1028
			if (res == ERR_PTR(-ECHILD)) {
				if (unlikely(unlazy_walk(nd, NULL, 0)))
					return ERR_PTR(-ECHILD);
1029
				res = get(dentry, inode, &last->done);
1030 1031
			}
		} else {
1032
			res = get(dentry, inode, &last->done);
1033
		}
1034
		if (IS_ERR_OR_NULL(res))
1035 1036 1037
			return res;
	}
	if (*res == '/') {
1038 1039
		if (!nd->root.mnt)
			set_root(nd);
1040 1041
		if (unlikely(nd_jump_root(nd)))
			return ERR_PTR(-ECHILD);
1042 1043
		while (unlikely(*++res == '/'))
			;
L
Linus Torvalds 已提交
1044
	}
1045 1046
	if (!*res)
		res = NULL;
1047 1048
	return res;
}
1049

1050 1051 1052 1053 1054 1055 1056 1057 1058 1059
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
A
Al Viro 已提交
1060
int follow_up(struct path *path)
L
Linus Torvalds 已提交
1061
{
1062 1063
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
L
Linus Torvalds 已提交
1064
	struct dentry *mountpoint;
N
Nick Piggin 已提交
1065

A
Al Viro 已提交
1066
	read_seqlock_excl(&mount_lock);
1067
	parent = mnt->mnt_parent;
A
Al Viro 已提交
1068
	if (parent == mnt) {
A
Al Viro 已提交
1069
		read_sequnlock_excl(&mount_lock);
L
Linus Torvalds 已提交
1070 1071
		return 0;
	}
1072
	mntget(&parent->mnt);
1073
	mountpoint = dget(mnt->mnt_mountpoint);
A
Al Viro 已提交
1074
	read_sequnlock_excl(&mount_lock);
A
Al Viro 已提交
1075 1076 1077
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
1078
	path->mnt = &parent->mnt;
L
Linus Torvalds 已提交
1079 1080
	return 1;
}
1081
EXPORT_SYMBOL(follow_up);
L
Linus Torvalds 已提交
1082

N
Nick Piggin 已提交
1083
/*
1084 1085 1086
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
L
Linus Torvalds 已提交
1087
 */
1088
static int follow_automount(struct path *path, struct nameidata *nd,
1089
			    bool *need_mntput)
N
Nick Piggin 已提交
1090
{
1091
	struct vfsmount *mnt;
1092
	int err;
1093 1094 1095 1096

	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
		return -EREMOTE;

1097 1098 1099 1100 1101 1102 1103 1104 1105 1106
	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
1107
	 */
1108 1109
	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1110 1111 1112
	    path->dentry->d_inode)
		return -EISDIR;

1113 1114
	nd->total_link_count++;
	if (nd->total_link_count >= 40)
1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127
		return -ELOOP;

	mnt = path->dentry->d_op->d_automount(path);
	if (IS_ERR(mnt)) {
		/*
		 * The filesystem is allowed to return -EISDIR here to indicate
		 * it doesn't want to automount.  For instance, autofs would do
		 * this so that its userspace daemon can mount on this dentry.
		 *
		 * However, we can only permit this if it's a terminal point in
		 * the path being looked up; if it wasn't then the remainder of
		 * the path is inaccessible and we should say so.
		 */
1128
		if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
1129 1130
			return -EREMOTE;
		return PTR_ERR(mnt);
N
Nick Piggin 已提交
1131
	}
1132

1133 1134
	if (!mnt) /* mount collision */
		return 0;
N
Nick Piggin 已提交
1135

1136 1137 1138 1139 1140
	if (!*need_mntput) {
		/* lock_mount() may release path->mnt on error */
		mntget(path->mnt);
		*need_mntput = true;
	}
1141
	err = finish_automount(mnt, path);
1142

1143 1144 1145
	switch (err) {
	case -EBUSY:
		/* Someone else made a mount here whilst we were busy */
1146
		return 0;
1147
	case 0:
1148
		path_put(path);
1149 1150 1151
		path->mnt = mnt;
		path->dentry = dget(mnt->mnt_root);
		return 0;
1152 1153
	default:
		return err;
1154
	}
1155

A
Al Viro 已提交
1156 1157
}

1158 1159
/*
 * Handle a dentry that is managed in some way.
1160
 * - Flagged for transit management (autofs)
1161 1162 1163 1164 1165 1166 1167
 * - Flagged as mountpoint
 * - Flagged as automount point
 *
 * This may only be called in refwalk mode.
 *
 * Serialization is taken care of in namespace.c
 */
1168
static int follow_managed(struct path *path, struct nameidata *nd)
L
Linus Torvalds 已提交
1169
{
1170
	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1171 1172
	unsigned managed;
	bool need_mntput = false;
1173
	int ret = 0;
1174 1175 1176 1177 1178 1179 1180

	/* Given that we're not holding a lock here, we retain the value in a
	 * local variable for each dentry as we look at it so that we don't see
	 * the components of that value change under us */
	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       managed &= DCACHE_MANAGED_DENTRY,
	       unlikely(managed != 0)) {
1181 1182 1183 1184 1185
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
1186
			ret = path->dentry->d_op->d_manage(path->dentry, false);
1187
			if (ret < 0)
1188
				break;
1189 1190
		}

1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205
		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				need_mntput = true;
				continue;
			}

			/* Something is mounted on this dentry in another
			 * namespace and/or whatever was mounted there in this
A
Al Viro 已提交
1206 1207
			 * namespace got unmounted before lookup_mnt() could
			 * get it */
1208 1209 1210 1211
		}

		/* Handle an automount point */
		if (managed & DCACHE_NEED_AUTOMOUNT) {
1212
			ret = follow_automount(path, nd, &need_mntput);
1213
			if (ret < 0)
1214
				break;
1215 1216 1217 1218 1219
			continue;
		}

		/* We didn't change the current path point */
		break;
L
Linus Torvalds 已提交
1220
	}
1221 1222 1223

	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
1224 1225
	if (ret == -EISDIR || !ret)
		ret = 1;
1226 1227 1228 1229 1230
	if (need_mntput)
		nd->flags |= LOOKUP_JUMPED;
	if (unlikely(ret < 0))
		path_put_conditional(path, nd);
	return ret;
L
Linus Torvalds 已提交
1231 1232
}

1233
int follow_down_one(struct path *path)
L
Linus Torvalds 已提交
1234 1235 1236
{
	struct vfsmount *mounted;

A
Al Viro 已提交
1237
	mounted = lookup_mnt(path);
L
Linus Torvalds 已提交
1238
	if (mounted) {
A
Al Viro 已提交
1239 1240 1241 1242
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
L
Linus Torvalds 已提交
1243 1244 1245 1246
		return 1;
	}
	return 0;
}
1247
EXPORT_SYMBOL(follow_down_one);
L
Linus Torvalds 已提交
1248

1249
static inline int managed_dentry_rcu(struct dentry *dentry)
1250
{
1251 1252
	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
		dentry->d_op->d_manage(dentry, true) : 0;
1253 1254
}

1255
/*
1256 1257
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
1258 1259
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1260
			       struct inode **inode, unsigned *seqp)
1261
{
1262
	for (;;) {
1263
		struct mount *mounted;
1264 1265 1266 1267
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
1268 1269 1270
		switch (managed_dentry_rcu(path->dentry)) {
		case -ECHILD:
		default:
1271
			return false;
1272 1273 1274 1275 1276
		case -EISDIR:
			return true;
		case 0:
			break;
		}
1277 1278

		if (!d_mountpoint(path->dentry))
1279
			return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1280

A
Al Viro 已提交
1281
		mounted = __lookup_mnt(path->mnt, path->dentry);
1282 1283
		if (!mounted)
			break;
1284 1285
		path->mnt = &mounted->mnt;
		path->dentry = mounted->mnt.mnt_root;
1286
		nd->flags |= LOOKUP_JUMPED;
1287
		*seqp = read_seqcount_begin(&path->dentry->d_seq);
1288 1289 1290 1291 1292 1293
		/*
		 * Update the inode too. We don't need to re-check the
		 * dentry sequence number here after this d_inode read,
		 * because a mount-point is always pinned.
		 */
		*inode = path->dentry->d_inode;
1294
	}
1295
	return !read_seqretry(&mount_lock, nd->m_seq) &&
1296
		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1297 1298
}

N
Nick Piggin 已提交
1299 1300
static int follow_dotdot_rcu(struct nameidata *nd)
{
1301
	struct inode *inode = nd->inode;
N
Nick Piggin 已提交
1302

1303
	while (1) {
1304
		if (path_equal(&nd->path, &nd->root))
N
Nick Piggin 已提交
1305 1306 1307 1308 1309 1310
			break;
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
			struct dentry *old = nd->path.dentry;
			struct dentry *parent = old->d_parent;
			unsigned seq;

1311
			inode = parent->d_inode;
N
Nick Piggin 已提交
1312
			seq = read_seqcount_begin(&parent->d_seq);
1313 1314
			if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
				return -ECHILD;
N
Nick Piggin 已提交
1315 1316
			nd->path.dentry = parent;
			nd->seq = seq;
1317 1318
			if (unlikely(!path_connected(&nd->path)))
				return -ENOENT;
N
Nick Piggin 已提交
1319
			break;
1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334
		} else {
			struct mount *mnt = real_mount(nd->path.mnt);
			struct mount *mparent = mnt->mnt_parent;
			struct dentry *mountpoint = mnt->mnt_mountpoint;
			struct inode *inode2 = mountpoint->d_inode;
			unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
			if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
				return -ECHILD;
			if (&mparent->mnt == nd->path.mnt)
				break;
			/* we know that mountpoint was pinned */
			nd->path.dentry = mountpoint;
			nd->path.mnt = &mparent->mnt;
			inode = inode2;
			nd->seq = seq;
N
Nick Piggin 已提交
1335 1336
		}
	}
1337
	while (unlikely(d_mountpoint(nd->path.dentry))) {
1338 1339
		struct mount *mounted;
		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
1340 1341
		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
			return -ECHILD;
1342 1343 1344 1345
		if (!mounted)
			break;
		nd->path.mnt = &mounted->mnt;
		nd->path.dentry = mounted->mnt.mnt_root;
1346
		inode = nd->path.dentry->d_inode;
1347 1348
		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
	}
1349
	nd->inode = inode;
N
Nick Piggin 已提交
1350 1351 1352
	return 0;
}

1353 1354 1355 1356 1357
/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 */
1358
int follow_down(struct path *path)
1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377
{
	unsigned managed;
	int ret;

	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held.
		 *
		 * We indicate to the filesystem if someone is trying to mount
		 * something here.  This gives autofs the chance to deny anyone
		 * other than its daemon the right to mount on its
		 * superstructure.
		 *
		 * The filesystem may sleep at this point.
		 */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
1378
			ret = path->dentry->d_op->d_manage(
1379
				path->dentry, false);
1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400
			if (ret < 0)
				return ret == -EISDIR ? 0 : ret;
		}

		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (!mounted)
				break;
			dput(path->dentry);
			mntput(path->mnt);
			path->mnt = mounted;
			path->dentry = dget(mounted->mnt_root);
			continue;
		}

		/* Don't handle automount points here */
		break;
	}
	return 0;
}
1401
EXPORT_SYMBOL(follow_down);
1402

1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418
/*
 * Refwalk helper for follow_dotdot(): keep stepping onto whatever is
 * mounted on @path until the top of the mountpoint pile is reached.
 */
static void follow_mount(struct path *path)
{
	struct vfsmount *covering;

	for (;;) {
		if (!d_mountpoint(path->dentry))
			return;

		covering = lookup_mnt(path);
		if (!covering)
			return;

		/* trade the references on the old location for the new one */
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = covering;
		path->dentry = dget(covering->mnt_root);
	}
}

1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429
/*
 * Replace path->dentry with its parent, moving the reference along.
 * Returns 0, or -ENOENT if the resulting path fails path_connected().
 */
static int path_parent_directory(struct path *path)
{
	struct dentry *child = path->dentry;

	/* rare case of legitimate dget_parent()... */
	path->dentry = dget_parent(child);
	dput(child);

	return unlikely(!path_connected(path)) ? -ENOENT : 0;
}

1430
static int follow_dotdot(struct nameidata *nd)
L
Linus Torvalds 已提交
1431 1432
{
	while(1) {
A
Al Viro 已提交
1433 1434
		if (nd->path.dentry == nd->root.dentry &&
		    nd->path.mnt == nd->root.mnt) {
L
Linus Torvalds 已提交
1435 1436
			break;
		}
1437
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
1438 1439 1440
			int ret = path_parent_directory(&nd->path);
			if (ret)
				return ret;
L
Linus Torvalds 已提交
1441 1442
			break;
		}
A
Al Viro 已提交
1443
		if (!follow_up(&nd->path))
L
Linus Torvalds 已提交
1444 1445
			break;
	}
A
Al Viro 已提交
1446
	follow_mount(&nd->path);
N
Nick Piggin 已提交
1447
	nd->inode = nd->path.dentry->d_inode;
1448
	return 0;
L
Linus Torvalds 已提交
1449 1450
}

1451
/*
M
Miklos Szeredi 已提交
1452 1453 1454
 * This looks up the name in dcache, possibly revalidates the old dentry and
 * allocates a new one if not found or not valid.  In the need_lookup argument
 * returns whether i_op->lookup is necessary.
1455
 */
1456 1457
static struct dentry *lookup_dcache(const struct qstr *name,
				    struct dentry *dir,
1458
				    unsigned int flags)
1459 1460
{
	struct dentry *dentry;
M
Miklos Szeredi 已提交
1461
	int error;
1462

M
Miklos Szeredi 已提交
1463 1464
	dentry = d_lookup(dir, name);
	if (dentry) {
J
Jeff Layton 已提交
1465
		if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1466
			error = d_revalidate(dentry, flags);
M
Miklos Szeredi 已提交
1467
			if (unlikely(error <= 0)) {
1468
				if (!error)
1469
					d_invalidate(dentry);
1470 1471
				dput(dentry);
				return ERR_PTR(error);
M
Miklos Szeredi 已提交
1472 1473 1474
			}
		}
	}
1475 1476 1477
	return dentry;
}

1478
/*
1479 1480
 * Call i_op->lookup on the dentry.  The dentry must be negative and
 * unhashed.
M
Miklos Szeredi 已提交
1481 1482
 *
 * dir->d_inode->i_mutex must be held
1483
 */
M
Miklos Szeredi 已提交
1484
static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
1485
				  unsigned int flags)
1486 1487 1488 1489
{
	struct dentry *old;

	/* Don't create child dentry for a dead directory. */
M
Miklos Szeredi 已提交
1490
	if (unlikely(IS_DEADDIR(dir))) {
1491
		dput(dentry);
1492
		return ERR_PTR(-ENOENT);
1493
	}
1494

1495
	old = dir->i_op->lookup(dir, dentry, flags);
1496 1497 1498 1499 1500 1501 1502
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
	return dentry;
}

1503
static struct dentry *__lookup_hash(const struct qstr *name,
1504
		struct dentry *base, unsigned int flags)
1505
{
1506
	struct dentry *dentry = lookup_dcache(name, base, flags);
1507

1508
	if (dentry)
M
Miklos Szeredi 已提交
1509
		return dentry;
1510

1511 1512 1513 1514
	dentry = d_alloc(base, name);
	if (unlikely(!dentry))
		return ERR_PTR(-ENOMEM);

1515
	return lookup_real(base->d_inode, dentry, flags);
1516 1517
}

A
Al Viro 已提交
1518
static int lookup_fast(struct nameidata *nd,
1519 1520
		       struct path *path, struct inode **inode,
		       unsigned *seqp)
L
Linus Torvalds 已提交
1521
{
1522
	struct vfsmount *mnt = nd->path.mnt;
N
Nick Piggin 已提交
1523
	struct dentry *dentry, *parent = nd->path.dentry;
A
Al Viro 已提交
1524
	int status = 1;
1525 1526
	int err;

1527 1528
	/*
	 * Rename seqlock is not required here because in the off chance
A
Al Viro 已提交
1529 1530
	 * of a false negative due to a concurrent rename, the caller is
	 * going to fall back to non-racy lookup.
1531
	 */
N
Nick Piggin 已提交
1532 1533
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
1534
		bool negative;
1535
		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
A
Al Viro 已提交
1536 1537 1538
		if (unlikely(!dentry)) {
			if (unlazy_walk(nd, NULL, 0))
				return -ECHILD;
1539
			return 0;
A
Al Viro 已提交
1540
		}
A
Al Viro 已提交
1541

1542 1543 1544 1545
		/*
		 * This sequence count validates that the inode matches
		 * the dentry name information from lookup.
		 */
1546
		*inode = d_backing_inode(dentry);
1547
		negative = d_is_negative(dentry);
A
Al Viro 已提交
1548
		if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
1549 1550 1551 1552 1553 1554 1555 1556 1557
			return -ECHILD;

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 *
		 * The memory barrier in read_seqcount_begin of child is
		 *  enough, we can use __read_seqcount_retry here.
		 */
A
Al Viro 已提交
1558
		if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
N
Nick Piggin 已提交
1559
			return -ECHILD;
A
Al Viro 已提交
1560

1561
		*seqp = seq;
A
Al Viro 已提交
1562
		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
1563
			status = d_revalidate(dentry, nd->flags);
A
Al Viro 已提交
1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578
		if (unlikely(status <= 0)) {
			if (unlazy_walk(nd, dentry, seq))
				return -ECHILD;
			if (status == -ECHILD)
				status = d_revalidate(dentry, nd->flags);
		} else {
			/*
			 * Note: do negative dentry check after revalidation in
			 * case that drops it.
			 */
			if (unlikely(negative))
				return -ENOENT;
			path->mnt = mnt;
			path->dentry = dentry;
			if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
1579
				return 1;
A
Al Viro 已提交
1580 1581
			if (unlazy_walk(nd, dentry, seq))
				return -ECHILD;
1582
		}
A
Al Viro 已提交
1583
	} else {
A
Al Viro 已提交
1584
		dentry = __d_lookup(parent, &nd->last);
A
Al Viro 已提交
1585
		if (unlikely(!dentry))
1586
			return 0;
A
Al Viro 已提交
1587 1588
		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
			status = d_revalidate(dentry, nd->flags);
1589
	}
A
Al Viro 已提交
1590
	if (unlikely(status <= 0)) {
1591
		if (!status)
A
Al Viro 已提交
1592
			d_invalidate(dentry);
1593
		dput(dentry);
A
Al Viro 已提交
1594
		return status;
1595
	}
1596 1597 1598 1599
	if (unlikely(d_is_negative(dentry))) {
		dput(dentry);
		return -ENOENT;
	}
A
Al Viro 已提交
1600

1601 1602
	path->mnt = mnt;
	path->dentry = dentry;
1603
	err = follow_managed(path, nd);
1604
	if (likely(err > 0))
1605
		*inode = d_backing_inode(path->dentry);
1606
	return err;
M
Miklos Szeredi 已提交
1607 1608 1609
}

/* Fast lookup failed, do it the slow way */
1610 1611 1612
static struct dentry *lookup_slow(const struct qstr *name,
				  struct dentry *dir,
				  unsigned int flags)
M
Miklos Szeredi 已提交
1613
{
A
Al Viro 已提交
1614
	struct dentry *dentry = ERR_PTR(-ENOENT), *old;
1615
	struct inode *inode = dir->d_inode;
1616
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1617

1618
	inode_lock_shared(inode);
1619
	/* Don't go there if it's already dead */
A
Al Viro 已提交
1620 1621 1622
	if (unlikely(IS_DEADDIR(inode)))
		goto out;
again:
1623
	dentry = d_alloc_parallel(dir, name, &wq);
A
Al Viro 已提交
1624 1625 1626
	if (IS_ERR(dentry))
		goto out;
	if (unlikely(!d_in_lookup(dentry))) {
1627 1628 1629 1630
		if ((dentry->d_flags & DCACHE_OP_REVALIDATE) &&
		    !(flags & LOOKUP_NO_REVAL)) {
			int error = d_revalidate(dentry, flags);
			if (unlikely(error <= 0)) {
A
Al Viro 已提交
1631
				if (!error) {
1632
					d_invalidate(dentry);
A
Al Viro 已提交
1633 1634 1635
					dput(dentry);
					goto again;
				}
1636 1637 1638 1639
				dput(dentry);
				dentry = ERR_PTR(error);
			}
		}
A
Al Viro 已提交
1640 1641 1642 1643 1644 1645
	} else {
		old = inode->i_op->lookup(inode, dentry, flags);
		d_lookup_done(dentry);
		if (unlikely(old)) {
			dput(dentry);
			dentry = old;
1646 1647
		}
	}
A
Al Viro 已提交
1648
out:
1649
	inode_unlock_shared(inode);
1650
	return dentry;
L
Linus Torvalds 已提交
1651 1652
}

1653 1654 1655
static inline int may_lookup(struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU) {
1656
		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1657 1658
		if (err != -ECHILD)
			return err;
1659
		if (unlazy_walk(nd, NULL, 0))
1660 1661
			return -ECHILD;
	}
1662
	return inode_permission(nd->inode, MAY_EXEC);
1663 1664
}

1665 1666 1667
static inline int handle_dots(struct nameidata *nd, int type)
{
	if (type == LAST_DOTDOT) {
1668 1669
		if (!nd->root.mnt)
			set_root(nd);
1670
		if (nd->flags & LOOKUP_RCU) {
1671
			return follow_dotdot_rcu(nd);
1672
		} else
1673
			return follow_dotdot(nd);
1674 1675 1676 1677
	}
	return 0;
}

1678 1679
static int pick_link(struct nameidata *nd, struct path *link,
		     struct inode *inode, unsigned seq)
1680
{
1681
	int error;
A
Al Viro 已提交
1682
	struct saved *last;
1683
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
1684 1685 1686
		path_to_nameidata(link, nd);
		return -ELOOP;
	}
A
Al Viro 已提交
1687
	if (!(nd->flags & LOOKUP_RCU)) {
1688 1689
		if (link->mnt == nd->path.mnt)
			mntget(link->mnt);
1690
	}
1691 1692
	error = nd_alloc_stack(nd);
	if (unlikely(error)) {
A
Al Viro 已提交
1693 1694 1695 1696 1697 1698 1699 1700 1701
		if (error == -ECHILD) {
			if (unlikely(unlazy_link(nd, link, seq)))
				return -ECHILD;
			error = nd_alloc_stack(nd);
		}
		if (error) {
			path_put(link);
			return error;
		}
1702 1703
	}

1704
	last = nd->stack + nd->depth++;
A
Al Viro 已提交
1705
	last->link = *link;
1706 1707
	clear_delayed_call(&last->done);
	nd->link_inode = inode;
1708
	last->seq = seq;
1709 1710 1711
	return 1;
}

1712 1713 1714 1715 1716 1717
/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 */
1718
static inline int should_follow_link(struct nameidata *nd, struct path *link,
1719 1720
				     int follow,
				     struct inode *inode, unsigned seq)
1721
{
1722 1723 1724 1725
	if (likely(!d_is_symlink(link->dentry)))
		return 0;
	if (!follow)
		return 0;
1726 1727 1728 1729 1730
	/* make sure that d_is_symlink above matches inode */
	if (nd->flags & LOOKUP_RCU) {
		if (read_seqcount_retry(&link->dentry->d_seq, seq))
			return -ECHILD;
	}
1731
	return pick_link(nd, link, inode, seq);
1732 1733
}

1734 1735 1736
enum {WALK_GET = 1, WALK_PUT = 2};

static int walk_component(struct nameidata *nd, int flags)
1737
{
A
Al Viro 已提交
1738
	struct path path;
1739
	struct inode *inode;
1740
	unsigned seq;
1741 1742 1743 1744 1745 1746
	int err;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
1747 1748 1749 1750 1751 1752
	if (unlikely(nd->last_type != LAST_NORM)) {
		err = handle_dots(nd, nd->last_type);
		if (flags & WALK_PUT)
			put_link(nd);
		return err;
	}
1753
	err = lookup_fast(nd, &path, &inode, &seq);
1754
	if (unlikely(err <= 0)) {
M
Miklos Szeredi 已提交
1755
		if (err < 0)
1756
			return err;
1757 1758 1759 1760
		path.dentry = lookup_slow(&nd->last, nd->path.dentry,
					  nd->flags);
		if (IS_ERR(path.dentry))
			return PTR_ERR(path.dentry);
1761

1762 1763 1764
		path.mnt = nd->path.mnt;
		err = follow_managed(&path, nd);
		if (unlikely(err < 0))
1765
			return err;
M
Miklos Szeredi 已提交
1766

1767 1768 1769 1770 1771
		if (unlikely(d_is_negative(path.dentry))) {
			path_to_nameidata(&path, nd);
			return -ENOENT;
		}

1772
		seq = 0;	/* we are already out of RCU mode */
1773
		inode = d_backing_inode(path.dentry);
1774
	}
M
Miklos Szeredi 已提交
1775

1776 1777
	if (flags & WALK_PUT)
		put_link(nd);
1778
	err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq);
1779 1780
	if (unlikely(err))
		return err;
A
Al Viro 已提交
1781
	path_to_nameidata(&path, nd);
1782
	nd->inode = inode;
1783
	nd->seq = seq;
1784 1785 1786
	return 0;
}

1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

1806
#include <asm/word-at-a-time.h>
1807

1808
#ifdef HASH_MIX
1809

1810
/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
1811

1812
#elif defined(CONFIG_64BIT)
1813
/*
1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
1841
 */
1842 1843 1844 1845 1846
/*
 * HASH_MIX(x, y, a) - mix the input word @a into the state words @x and @y
 * (64-bit variant, rotate constants 12 and 45).  @x and @y are assigned
 * to several times, so they must be plain lvalues without side effects.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)
1847

1848
/*
1849 1850 1851
 * Fold two longs into one 32-bit hash value.  This must be fast, but
 * latency isn't quite as critical, as there is a fair bit of additional
 * work done before the hash value is used.
1852
 */
1853
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
1854
{
1855 1856 1857
	y ^= x * GOLDEN_RATIO_64;
	y *= GOLDEN_RATIO_64;
	return y >> 32;
1858 1859
}

1860 1861
#else	/* 32-bit case */

1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876
/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 */
/*
 * HASH_MIX(x, y, a) - mix the input word @a into the state words @x and @y
 * (32-bit variant, rotate constants 7 and 20).  @x and @y are assigned
 * to several times, so they must be plain lvalues without side effects.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)
1877

1878
/* Fold the two state words into one hash value (32-bit variant). */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	return __hash_32(y ^ __hash_32(x));
}

1884 1885
#endif

1886 1887 1888 1889 1890 1891 1892
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing
 * 0..sizeof(long)-1 payload bytes, to match the way that hash_name()
 * iterates until it finds the delimiter after the name.
 */
unsigned int full_name_hash(const char *name, unsigned int len)
{
	unsigned long a, x = 0, y = 0;

	for (;;) {
		if (!len)
			goto done;
		a = load_unaligned_zeropad(name);
		if (len < sizeof(unsigned long))
			break;
		HASH_MIX(x, y, a);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}
	/* partial last word: keep only the @len valid bytes */
	x ^= a & bytemask_from_count(len);
done:
	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);

1913 1914 1915
/* Return the "hash_len" (hash and length) of a null-terminated string */
u64 hashlen_string(const char *name)
{
1916
	unsigned long a = 0, x = 0, y = 0, adata, mask, len;
1917 1918 1919 1920
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

	len = -sizeof(unsigned long);
	do {
1921
		HASH_MIX(x, y, a);
1922 1923 1924 1925 1926 1927
		len += sizeof(unsigned long);
		a = load_unaligned_zeropad(name+len);
	} while (!has_zero(a, &adata, &constants));

	adata = prep_zero_mask(a, adata, &constants);
	mask = create_zero_mask(adata);
1928
	x ^= a & zero_bytemask(mask);
1929

1930
	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
1931 1932 1933
}
EXPORT_SYMBOL(hashlen_string);

1934 1935
/*
 * Calculate the length and hash of the path component, and
1936
 * return the "hash_len" as the result.
1937
 */
1938
static inline u64 hash_name(const char *name)
1939
{
1940
	unsigned long a = 0, b, x = 0, y = 0, adata, bdata, mask, len;
1941
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
1942 1943 1944

	len = -sizeof(unsigned long);
	do {
1945
		HASH_MIX(x, y, a);
1946
		len += sizeof(unsigned long);
1947
		a = load_unaligned_zeropad(name+len);
1948 1949 1950 1951 1952 1953
		b = a ^ REPEAT_BYTE('/');
	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

	adata = prep_zero_mask(a, adata, &constants);
	bdata = prep_zero_mask(b, bdata, &constants);
	mask = create_zero_mask(adata | bdata);
1954
	x ^= a & zero_bytemask(mask);
1955

1956
	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
1957 1958
}

1959
#else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
1960

1961 1962
/*
 * Hash exactly @len bytes starting at @name, one byte at a time, and
 * return the finished hash value.
 */
unsigned int full_name_hash(const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash();
	const char *end = name + len;

	for (; name != end; name++)
		hash = partial_name_hash((unsigned char)*name, hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
L
Linus Torvalds 已提交
1970

1971
/* Return the "hash_len" (hash and length) of a null-terminated string */
1972
u64 hashlen_string(const char *name)
1973 1974 1975 1976 1977
{
	unsigned long hash = init_name_hash();
	unsigned long len = 0, c;

	c = (unsigned char)*name;
1978
	while (c) {
1979 1980 1981
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
1982
	}
1983 1984
	return hashlen_create(end_name_hash(hash), len);
}
1985
EXPORT_SYMBOL(hashlen_string);
1986

1987 1988 1989 1990
/*
 * We know there's a real path component here of at least
 * one character.
 */
1991
static inline u64 hash_name(const char *name)
1992 1993 1994 1995 1996 1997 1998 1999 2000 2001
{
	unsigned long hash = init_name_hash();
	unsigned long len = 0, c;

	c = (unsigned char)*name;
	do {
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
	} while (c && c != '/');
2002
	return hashlen_create(end_name_hash(hash), len);
2003 2004
}

2005 2006
#endif

L
Linus Torvalds 已提交
2007 2008
/*
 * Name resolution.
2009 2010
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
L
Linus Torvalds 已提交
2011
 *
2012 2013
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
L
Linus Torvalds 已提交
2014
 */
2015
static int link_path_walk(const char *name, struct nameidata *nd)
L
Linus Torvalds 已提交
2016 2017
{
	int err;
A
Al Viro 已提交
2018

L
Linus Torvalds 已提交
2019 2020 2021
	while (*name=='/')
		name++;
	if (!*name)
2022
		return 0;
L
Linus Torvalds 已提交
2023 2024 2025

	/* At this point we know we have a real path component. */
	for(;;) {
2026
		u64 hash_len;
A
Al Viro 已提交
2027
		int type;
L
Linus Torvalds 已提交
2028

2029
		err = may_lookup(nd);
2030
		if (err)
2031
			return err;
L
Linus Torvalds 已提交
2032

2033
		hash_len = hash_name(name);
L
Linus Torvalds 已提交
2034

A
Al Viro 已提交
2035
		type = LAST_NORM;
2036
		if (name[0] == '.') switch (hashlen_len(hash_len)) {
A
Al Viro 已提交
2037
			case 2:
2038
				if (name[1] == '.') {
A
Al Viro 已提交
2039
					type = LAST_DOTDOT;
A
Al Viro 已提交
2040 2041
					nd->flags |= LOOKUP_JUMPED;
				}
A
Al Viro 已提交
2042 2043 2044 2045
				break;
			case 1:
				type = LAST_DOT;
		}
2046 2047
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
A
Al Viro 已提交
2048
			nd->flags &= ~LOOKUP_JUMPED;
2049
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
2050
				struct qstr this = { { .hash_len = hash_len }, .name = name };
2051
				err = parent->d_op->d_hash(parent, &this);
2052
				if (err < 0)
2053
					return err;
2054 2055
				hash_len = this.hash_len;
				name = this.name;
2056 2057
			}
		}
A
Al Viro 已提交
2058

2059 2060
		nd->last.hash_len = hash_len;
		nd->last.name = name;
2061 2062
		nd->last_type = type;

2063 2064
		name += hashlen_len(hash_len);
		if (!*name)
2065
			goto OK;
2066 2067 2068 2069 2070
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
2071 2072
			name++;
		} while (unlikely(*name == '/'));
2073 2074
		if (unlikely(!*name)) {
OK:
2075
			/* pathname body, done */
2076 2077 2078
			if (!nd->depth)
				return 0;
			name = nd->stack[nd->depth - 1].name;
2079
			/* trailing symlink, done */
2080 2081 2082
			if (!name)
				return 0;
			/* last component of nested symlink */
2083
			err = walk_component(nd, WALK_GET | WALK_PUT);
2084
		} else {
2085
			err = walk_component(nd, WALK_GET);
2086
		}
2087
		if (err < 0)
2088
			return err;
L
Linus Torvalds 已提交
2089

2090
		if (err) {
2091
			const char *s = get_link(nd);
2092

2093
			if (IS_ERR(s))
2094
				return PTR_ERR(s);
2095 2096 2097
			err = 0;
			if (unlikely(!s)) {
				/* jumped */
2098
				put_link(nd);
2099
			} else {
2100 2101 2102
				nd->stack[nd->depth - 1].name = name;
				name = s;
				continue;
2103
			}
N
Nick Piggin 已提交
2104
		}
2105 2106 2107 2108 2109
		if (unlikely(!d_can_lookup(nd->path.dentry))) {
			if (nd->flags & LOOKUP_RCU) {
				if (unlazy_walk(nd, NULL, 0))
					return -ECHILD;
			}
2110
			return -ENOTDIR;
2111
		}
L
Linus Torvalds 已提交
2112 2113 2114
	}
}

2115
static const char *path_init(struct nameidata *nd, unsigned flags)
N
Nick Piggin 已提交
2116 2117
{
	int retval = 0;
2118
	const char *s = nd->name->name;
N
Nick Piggin 已提交
2119 2120

	nd->last_type = LAST_ROOT; /* if there are only slashes... */
2121
	nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
N
Nick Piggin 已提交
2122
	nd->depth = 0;
2123
	if (flags & LOOKUP_ROOT) {
2124 2125
		struct dentry *root = nd->root.dentry;
		struct inode *inode = root->d_inode;
A
Al Viro 已提交
2126
		if (*s) {
M
Miklos Szeredi 已提交
2127
			if (!d_can_lookup(root))
2128
				return ERR_PTR(-ENOTDIR);
A
Al Viro 已提交
2129 2130
			retval = inode_permission(inode, MAY_EXEC);
			if (retval)
2131
				return ERR_PTR(retval);
A
Al Viro 已提交
2132
		}
2133 2134 2135
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
A
Al Viro 已提交
2136
			rcu_read_lock();
2137
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2138
			nd->root_seq = nd->seq;
A
Al Viro 已提交
2139
			nd->m_seq = read_seqbegin(&mount_lock);
2140 2141 2142
		} else {
			path_get(&nd->path);
		}
2143
		return s;
2144 2145
	}

N
Nick Piggin 已提交
2146
	nd->root.mnt = NULL;
2147 2148
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
N
Nick Piggin 已提交
2149

A
Al Viro 已提交
2150
	nd->m_seq = read_seqbegin(&mount_lock);
A
Al Viro 已提交
2151
	if (*s == '/') {
2152
		if (flags & LOOKUP_RCU)
A
Al Viro 已提交
2153
			rcu_read_lock();
2154
		set_root(nd);
2155
		if (likely(!nd_jump_root(nd)))
2156
			return s;
2157
		nd->root.mnt = NULL;
2158 2159
		rcu_read_unlock();
		return ERR_PTR(-ECHILD);
2160
	} else if (nd->dfd == AT_FDCWD) {
A
Al Viro 已提交
2161 2162 2163
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;
N
Nick Piggin 已提交
2164

A
Al Viro 已提交
2165
			rcu_read_lock();
N
Nick Piggin 已提交
2166

A
Al Viro 已提交
2167 2168 2169
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
2170
				nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2171 2172 2173 2174
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
2175
			nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2176
		}
2177
		return s;
N
Nick Piggin 已提交
2178
	} else {
2179
		/* Caller must check execute permissions on the starting path component */
2180
		struct fd f = fdget_raw(nd->dfd);
N
Nick Piggin 已提交
2181 2182
		struct dentry *dentry;

2183
		if (!f.file)
2184
			return ERR_PTR(-EBADF);
N
Nick Piggin 已提交
2185

2186
		dentry = f.file->f_path.dentry;
N
Nick Piggin 已提交
2187

A
Al Viro 已提交
2188
		if (*s) {
M
Miklos Szeredi 已提交
2189
			if (!d_can_lookup(dentry)) {
2190
				fdput(f);
2191
				return ERR_PTR(-ENOTDIR);
2192
			}
A
Al Viro 已提交
2193
		}
N
Nick Piggin 已提交
2194

2195
		nd->path = f.file->f_path;
A
Al Viro 已提交
2196
		if (flags & LOOKUP_RCU) {
A
Al Viro 已提交
2197
			rcu_read_lock();
A
Al Viro 已提交
2198 2199
			nd->inode = nd->path.dentry->d_inode;
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
A
Al Viro 已提交
2200
		} else {
2201
			path_get(&nd->path);
A
Al Viro 已提交
2202
			nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2203
		}
A
Al Viro 已提交
2204
		fdput(f);
2205
		return s;
N
Nick Piggin 已提交
2206
	}
2207 2208
}

2209
static const char *trailing_symlink(struct nameidata *nd)
2210 2211
{
	const char *s;
A
Al Viro 已提交
2212
	int error = may_follow_link(nd);
2213
	if (unlikely(error))
2214
		return ERR_PTR(error);
2215
	nd->flags |= LOOKUP_PARENT;
2216
	nd->stack[0].name = NULL;
2217
	s = get_link(nd);
2218
	return s ? s : "";
2219 2220
}

A
Al Viro 已提交
2221
static inline int lookup_last(struct nameidata *nd)
2222 2223 2224 2225 2226
{
	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

	nd->flags &= ~LOOKUP_PARENT;
2227
	return walk_component(nd,
2228 2229 2230 2231 2232
			nd->flags & LOOKUP_FOLLOW
				? nd->depth
					? WALK_PUT | WALK_GET
					: WALK_GET
				: 0);
2233 2234
}

2235
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2236
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
2237
{
2238
	const char *s = path_init(nd, flags);
2239
	int err;
N
Nick Piggin 已提交
2240

2241 2242
	if (IS_ERR(s))
		return PTR_ERR(s);
2243 2244 2245 2246 2247 2248
	while (!(err = link_path_walk(s, nd))
		&& ((err = lookup_last(nd)) > 0)) {
		s = trailing_symlink(nd);
		if (IS_ERR(s)) {
			err = PTR_ERR(s);
			break;
2249 2250
		}
	}
2251 2252
	if (!err)
		err = complete_walk(nd);
2253

2254 2255
	if (!err && nd->flags & LOOKUP_DIRECTORY)
		if (!d_can_lookup(nd->path.dentry))
A
Al Viro 已提交
2256
			err = -ENOTDIR;
2257 2258 2259 2260 2261 2262
	if (!err) {
		*path = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2263
	return err;
A
Al Viro 已提交
2264
}
N
Nick Piggin 已提交
2265

2266
static int filename_lookup(int dfd, struct filename *name, unsigned flags,
2267
			   struct path *path, struct path *root)
A
Al Viro 已提交
2268
{
2269
	int retval;
2270
	struct nameidata nd;
2271 2272
	if (IS_ERR(name))
		return PTR_ERR(name);
2273 2274 2275 2276
	if (unlikely(root)) {
		nd.root = *root;
		flags |= LOOKUP_ROOT;
	}
2277
	set_nameidata(&nd, dfd, name);
2278
	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
A
Al Viro 已提交
2279
	if (unlikely(retval == -ECHILD))
2280
		retval = path_lookupat(&nd, flags, path);
A
Al Viro 已提交
2281
	if (unlikely(retval == -ESTALE))
2282
		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
N
Nick Piggin 已提交
2283

2284
	if (likely(!retval))
2285
		audit_inode(name, path->dentry, flags & LOOKUP_PARENT);
2286
	restore_nameidata();
2287
	putname(name);
2288
	return retval;
L
Linus Torvalds 已提交
2289 2290
}

2291
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2292
static int path_parentat(struct nameidata *nd, unsigned flags,
2293
				struct path *parent)
2294
{
2295
	const char *s = path_init(nd, flags);
2296 2297 2298 2299
	int err;
	if (IS_ERR(s))
		return PTR_ERR(s);
	err = link_path_walk(s, nd);
2300 2301
	if (!err)
		err = complete_walk(nd);
2302 2303 2304 2305 2306 2307
	if (!err) {
		*parent = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2308 2309 2310
	return err;
}

2311
static struct filename *filename_parentat(int dfd, struct filename *name,
2312 2313
				unsigned int flags, struct path *parent,
				struct qstr *last, int *type)
2314 2315
{
	int retval;
2316
	struct nameidata nd;
2317

2318 2319
	if (IS_ERR(name))
		return name;
2320
	set_nameidata(&nd, dfd, name);
2321
	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2322
	if (unlikely(retval == -ECHILD))
2323
		retval = path_parentat(&nd, flags, parent);
2324
	if (unlikely(retval == -ESTALE))
2325
		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2326 2327 2328 2329
	if (likely(!retval)) {
		*last = nd.last;
		*type = nd.last_type;
		audit_inode(name, parent->dentry, LOOKUP_PARENT);
2330 2331 2332
	} else {
		putname(name);
		name = ERR_PTR(retval);
2333
	}
2334
	restore_nameidata();
2335
	return name;
2336 2337
}

A
Al Viro 已提交
2338 2339
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
2340
{
2341 2342
	struct filename *filename;
	struct dentry *d;
2343 2344
	struct qstr last;
	int type;
2345

2346 2347
	filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
				    &last, &type);
2348 2349
	if (IS_ERR(filename))
		return ERR_CAST(filename);
2350
	if (unlikely(type != LAST_NORM)) {
2351
		path_put(path);
2352 2353
		putname(filename);
		return ERR_PTR(-EINVAL);
A
Al Viro 已提交
2354
	}
A
Al Viro 已提交
2355
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
2356
	d = __lookup_hash(&last, path->dentry, 0);
A
Al Viro 已提交
2357
	if (IS_ERR(d)) {
A
Al Viro 已提交
2358
		inode_unlock(path->dentry->d_inode);
2359
		path_put(path);
A
Al Viro 已提交
2360
	}
2361
	putname(filename);
A
Al Viro 已提交
2362
	return d;
2363 2364
}

A
Al Viro 已提交
2365 2366
int kern_path(const char *name, unsigned int flags, struct path *path)
{
2367 2368
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags, path, NULL);
A
Al Viro 已提交
2369
}
2370
EXPORT_SYMBOL(kern_path);
A
Al Viro 已提交
2371

2372 2373 2374 2375 2376 2377
/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
2378
 * @path: pointer to struct path to fill
2379 2380 2381
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
		    const char *name, unsigned int flags,
2382
		    struct path *path)
2383
{
2384 2385
	struct path root = {.mnt = mnt, .dentry = dentry};
	/* the first argument of filename_lookup() is ignored with root */
2386 2387
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags , path, &root);
2388
}
2389
EXPORT_SYMBOL(vfs_path_lookup);
2390

M
Miklos Szeredi 已提交
2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417
/**
 * lookup_hash - lookup single pathname component on already hashed name
 * @name:	name and hash to lookup
 * @base:	base directory to lookup from
 *
 * The name must have been verified and hashed (see lookup_one_len()).  Using
 * this after just full_name_hash() is unsafe.
 *
 * This function also doesn't check for search permission on base directory.
 *
 * Use lookup_one_len_unlocked() instead, unless you really know what you are
 * doing.
 *
 * Do not hold i_mutex; this helper takes i_mutex if necessary.
 */
struct dentry *lookup_hash(const struct qstr *name, struct dentry *base)
{
	struct dentry *found = lookup_dcache(name, base, 0);

	/* Anything non-NULL from lookup_dcache() is returned unchanged. */
	if (found)
		return found;
	return lookup_slow(name, base, 0);
}
EXPORT_SYMBOL(lookup_hash);

2418
/**
2419
 * lookup_one_len - filesystem helper to lookup single pathname component
2420 2421 2422 2423
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
2424
 * Note that this routine is purely a helper for filesystem usage and should
2425
 * not be called by generic code.
2426 2427
 *
 * The caller must hold base->i_mutex.
2428
 */
2429 2430 2431
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
	struct qstr this;
A
Al Viro 已提交
2432
	unsigned int c;
2433
	int err;
2434

A
Al Viro 已提交
2435
	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2436

A
Al Viro 已提交
2437 2438
	this.name = name;
	this.len = len;
L
Linus Torvalds 已提交
2439
	this.hash = full_name_hash(name, len);
A
Al Viro 已提交
2440 2441 2442
	if (!len)
		return ERR_PTR(-EACCES);

A
Al Viro 已提交
2443 2444 2445 2446 2447
	if (unlikely(name[0] == '.')) {
		if (len < 2 || (len == 2 && name[1] == '.'))
			return ERR_PTR(-EACCES);
	}

A
Al Viro 已提交
2448 2449 2450 2451 2452
	while (len--) {
		c = *(const unsigned char *)name++;
		if (c == '/' || c == '\0')
			return ERR_PTR(-EACCES);
	}
2453 2454 2455 2456 2457
	/*
	 * See if the low-level filesystem might want
	 * to use its own hash..
	 */
	if (base->d_flags & DCACHE_OP_HASH) {
2458
		int err = base->d_op->d_hash(base, &this);
2459 2460 2461
		if (err < 0)
			return ERR_PTR(err);
	}
2462

2463 2464 2465 2466
	err = inode_permission(base->d_inode, MAY_EXEC);
	if (err)
		return ERR_PTR(err);

2467
	return __lookup_hash(&this, base, 0);
2468
}
2469
EXPORT_SYMBOL(lookup_one_len);
2470

2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519
/**
 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	length of @name in bytes; must be positive
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The component is rejected with -EACCES when @len is not positive, when
 * it is "." or "..", or when it contains a '/' or NUL byte within @len.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_len_unlocked(const char *name,
				       struct dentry *base, int len)
{
	struct qstr this;
	unsigned int c;
	int err;

	/*
	 * Validate the length before hashing: full_name_hash() takes an
	 * unsigned length, so a negative @len must never reach it.
	 */
	if (len <= 0)
		return ERR_PTR(-EACCES);

	this.name = name;
	this.len = len;
	this.hash = full_name_hash(name, len);

	if (unlikely(name[0] == '.')) {
		if (len < 2 || (len == 2 && name[1] == '.'))
			return ERR_PTR(-EACCES);
	}

	while (len--) {
		c = *(const unsigned char *)name++;
		if (c == '/' || c == '\0')
			return ERR_PTR(-EACCES);
	}
	/*
	 * See if the low-level filesystem might want
	 * to use its own hash..
	 */
	if (base->d_flags & DCACHE_OP_HASH) {
		/* Reuse the function-level err rather than shadowing it. */
		err = base->d_op->d_hash(base, &this);
		if (err < 0)
			return ERR_PTR(err);
	}

	err = inode_permission(base->d_inode, MAY_EXEC);
	if (err)
		return ERR_PTR(err);

	return lookup_hash(&this, base);
}
EXPORT_SYMBOL(lookup_one_len_unlocked);

2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551
#ifdef CONFIG_UNIX98_PTYS
/*
 * Find something mounted on "pts" in the same directory as the input
 * path.  On success @path is updated in place and 0 is returned;
 * -ENOENT is returned when no "pts" entry is found.
 */
int path_pts(struct path *path)
{
	struct dentry *parent, *pts;
	struct qstr this;
	int ret = path_parent_directory(path);

	if (ret)
		return ret;

	parent = path->dentry;
	this.name = "pts";
	this.len = 3;
	pts = d_hash_and_lookup(parent, &this);
	if (!pts)
		return -ENOENT;

	/* Swap the parent for the child, then step onto whatever is mounted. */
	path->dentry = pts;
	dput(parent);
	follow_mount(path);
	return 0;
}
#endif

2552 2553
/*
 * Look up the userspace pathname @name relative to @dfd and store the
 * result in @path.  @empty is passed through to getname_flags().
 */
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
		 struct path *path, int *empty)
{
	struct filename *filename = getname_flags(name, flags, empty);

	return filename_lookup(dfd, filename, flags, path, NULL);
}
EXPORT_SYMBOL(user_path_at_empty);
2559

2560 2561 2562 2563 2564 2565
/*
 * NB: most callers don't do anything directly with the reference to the
 *     to struct filename, but the nd->last pointer points into the name string
 *     allocated by getname. So we must hold the reference to it until all
 *     path-walking is complete.
 */
A
Al Viro 已提交
2566
static inline struct filename *
2567 2568 2569 2570
user_path_parent(int dfd, const char __user *path,
		 struct path *parent,
		 struct qstr *last,
		 int *type,
2571
		 unsigned int flags)
2572
{
2573
	/* only LOOKUP_REVAL is allowed in extra flags */
2574 2575
	return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL,
				 parent, last, type);
2576 2577
}

2578
/**
2579
 * mountpoint_last - look up last component for umount
2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605
 * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
 * @path: pointer to container for result
 *
 * This is a special lookup_last function just for umount. In this case, we
 * need to resolve the path without doing any revalidation.
 *
 * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
 * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
 * in almost all cases, this lookup will be served out of the dcache. The only
 * cases where it won't are if nd->last refers to a symlink or the path is
 * bogus and it doesn't exist.
 *
 * Returns:
 * -error: if there was an error during lookup. This includes -ENOENT if the
 *         lookup found a negative dentry. The nd->path reference will also be
 *         put in this case.
 *
 * 0:      if we successfully resolved nd->path and found it to not to be a
 *         symlink that needs to be followed. "path" will also be populated.
 *         The nd->path reference will also be put.
 *
 * 1:      if we successfully resolved nd->last and found it to be a symlink
 *         that needs to be followed. "path" will be populated with the path
 *         to the link, and nd->path will *not* be put.
 */
static int
2606
mountpoint_last(struct nameidata *nd, struct path *path)
2607 2608 2609 2610 2611
{
	int error = 0;
	struct dentry *dentry;
	struct dentry *dir = nd->path.dentry;

2612 2613
	/* If we're in rcuwalk, drop out of it to handle last component */
	if (nd->flags & LOOKUP_RCU) {
2614
		if (unlazy_walk(nd, NULL, 0))
2615
			return -ECHILD;
2616 2617 2618 2619 2620 2621
	}

	nd->flags &= ~LOOKUP_PARENT;

	if (unlikely(nd->last_type != LAST_NORM)) {
		error = handle_dots(nd, nd->last_type);
2622
		if (error)
2623
			return error;
2624
		dentry = dget(nd->path.dentry);
2625 2626
	} else {
		dentry = d_lookup(dir, &nd->last);
2627
		if (!dentry) {
2628 2629 2630 2631 2632 2633 2634 2635 2636 2637
			/*
			 * No cached dentry. Mounted dentries are pinned in the
			 * cache, so that means that this dentry is probably
			 * a symlink or the path doesn't actually point
			 * to a mounted dentry.
			 */
			dentry = lookup_slow(&nd->last, dir,
					     nd->flags | LOOKUP_NO_REVAL);
			if (IS_ERR(dentry))
				return PTR_ERR(dentry);
2638
		}
2639
	}
2640
	if (d_is_negative(dentry)) {
2641
		dput(dentry);
2642
		return -ENOENT;
2643
	}
2644 2645
	if (nd->depth)
		put_link(nd);
2646
	path->dentry = dentry;
2647
	path->mnt = nd->path.mnt;
2648 2649
	error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW,
				   d_backing_inode(dentry), 0);
2650
	if (unlikely(error))
2651
		return error;
2652
	mntget(path->mnt);
2653
	follow_mount(path);
2654
	return 0;
2655 2656 2657
}

/**
 * path_mountpoint - look up a path to be umounted
 * @nd:		lookup context
 * @flags:	lookup flags
 * @path:	pointer to container for result
 *
 * Look up the given name, but don't attempt to revalidate the last component.
 * Returns 0 and "path" will be valid on success; Returns error otherwise.
 */
static int
path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
{
	const char *s = path_init(nd, flags);
	int err;

	if (IS_ERR(s))
		return PTR_ERR(s);

	/* Walk the body, then the last component; repeat per trailing symlink. */
	for (;;) {
		err = link_path_walk(s, nd);
		if (err)
			break;
		err = mountpoint_last(nd, path);
		if (err <= 0)
			break;
		s = trailing_symlink(nd);
		if (IS_ERR(s)) {
			err = PTR_ERR(s);
			break;
		}
	}
	terminate_walk(nd);
	return err;
}

A
Al Viro 已提交
2685
static int
2686
filename_mountpoint(int dfd, struct filename *name, struct path *path,
A
Al Viro 已提交
2687 2688
			unsigned int flags)
{
2689
	struct nameidata nd;
2690
	int error;
2691 2692
	if (IS_ERR(name))
		return PTR_ERR(name);
2693
	set_nameidata(&nd, dfd, name);
2694
	error = path_mountpoint(&nd, flags | LOOKUP_RCU, path);
A
Al Viro 已提交
2695
	if (unlikely(error == -ECHILD))
2696
		error = path_mountpoint(&nd, flags, path);
A
Al Viro 已提交
2697
	if (unlikely(error == -ESTALE))
2698
		error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
A
Al Viro 已提交
2699
	if (likely(!error))
2700
		audit_inode(name, path->dentry, 0);
2701
	restore_nameidata();
2702
	putname(name);
A
Al Viro 已提交
2703 2704 2705
	return error;
}

2706
/**
2707
 * user_path_mountpoint_at - lookup a path from userland in order to umount it
2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720
 * @dfd:	directory file descriptor
 * @name:	pathname from userland
 * @flags:	lookup flags
 * @path:	pointer to container to hold result
 *
 * A umount is a special case for path walking. We're not actually interested
 * in the inode in this situation, and ESTALE errors can be a problem. We
 * simply want track down the dentry and vfsmount attached at the mountpoint
 * and avoid revalidating the last component.
 *
 * Returns 0 and populates "path" on success.
 */
int
2721
user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2722 2723
			struct path *path)
{
2724
	return filename_mountpoint(dfd, getname(name), path, flags);
2725 2726
}

A
Al Viro 已提交
2727 2728 2729 2730
/*
 * Kernel-pathname counterpart of user_path_mountpoint_at(): the name is
 * taken through getname_kernel() before the mountpoint lookup.
 */
int
kern_path_mountpoint(int dfd, const char *name, struct path *path,
			unsigned int flags)
{
	struct filename *filename = getname_kernel(name);

	return filename_mountpoint(dfd, filename, path, flags);
}
EXPORT_SYMBOL(kern_path_mountpoint);

M
Miklos Szeredi 已提交
2735
int __check_sticky(struct inode *dir, struct inode *inode)
L
Linus Torvalds 已提交
2736
{
2737
	kuid_t fsuid = current_fsuid();
2738

2739
	if (uid_eq(inode->i_uid, fsuid))
L
Linus Torvalds 已提交
2740
		return 0;
2741
	if (uid_eq(dir->i_uid, fsuid))
L
Linus Torvalds 已提交
2742
		return 0;
2743
	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
L
Linus Torvalds 已提交
2744
}
M
Miklos Szeredi 已提交
2745
EXPORT_SYMBOL(__check_sticky);
L
Linus Torvalds 已提交
2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765

/*
 *	Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do antyhing with
 *     links pointing to it.
 *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 *  9. We can't remove a root or mountpoint.
 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 */
2766
static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
L
Linus Torvalds 已提交
2767
{
2768
	struct inode *inode = d_backing_inode(victim);
L
Linus Torvalds 已提交
2769 2770
	int error;

2771
	if (d_is_negative(victim))
L
Linus Torvalds 已提交
2772
		return -ENOENT;
2773
	BUG_ON(!inode);
L
Linus Torvalds 已提交
2774 2775

	BUG_ON(victim->d_parent->d_inode != dir);
2776
	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
L
Linus Torvalds 已提交
2777

2778
	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2779 2780 2781 2782
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;
2783 2784 2785

	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
L
Linus Torvalds 已提交
2786 2787
		return -EPERM;
	if (isdir) {
M
Miklos Szeredi 已提交
2788
		if (!d_is_dir(victim))
L
Linus Torvalds 已提交
2789 2790 2791
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
M
Miklos Szeredi 已提交
2792
	} else if (d_is_dir(victim))
L
Linus Torvalds 已提交
2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}

/*	Check whether we can create an object with dentry child in directory
 *  dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
 *  3. We should have write and exec permissions on dir
 *  4. We can't do it if dir is immutable (done in permission())
 */
2809
static inline int may_create(struct inode *dir, struct dentry *child)
L
Linus Torvalds 已提交
2810
{
2811
	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
L
Linus Torvalds 已提交
2812 2813 2814 2815
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
2816
	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2817 2818 2819 2820 2821 2822 2823 2824 2825 2826
}

/*
 * p1 and p2 should be directories on the same fs.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p;

	if (p1 == p2) {
A
Al Viro 已提交
2827
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
L
Linus Torvalds 已提交
2828 2829 2830
		return NULL;
	}

2831
	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2832

2833 2834
	p = d_ancestor(p2, p1);
	if (p) {
A
Al Viro 已提交
2835 2836
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
2837
		return p;
L
Linus Torvalds 已提交
2838 2839
	}

2840 2841
	p = d_ancestor(p1, p2);
	if (p) {
A
Al Viro 已提交
2842 2843
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
2844
		return p;
L
Linus Torvalds 已提交
2845 2846
	}

A
Al Viro 已提交
2847 2848
	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
L
Linus Torvalds 已提交
2849 2850
	return NULL;
}
2851
EXPORT_SYMBOL(lock_rename);
L
Linus Torvalds 已提交
2852 2853 2854

void unlock_rename(struct dentry *p1, struct dentry *p2)
{
A
Al Viro 已提交
2855
	inode_unlock(p1->d_inode);
L
Linus Torvalds 已提交
2856
	if (p1 != p2) {
A
Al Viro 已提交
2857
		inode_unlock(p2->d_inode);
2858
		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2859 2860
	}
}
2861
EXPORT_SYMBOL(unlock_rename);
L
Linus Torvalds 已提交
2862

A
Al Viro 已提交
2863
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
A
Al Viro 已提交
2864
		bool want_excl)
L
Linus Torvalds 已提交
2865
{
2866
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
2867 2868 2869
	if (error)
		return error;

A
Al Viro 已提交
2870
	if (!dir->i_op->create)
L
Linus Torvalds 已提交
2871 2872 2873 2874 2875 2876
		return -EACCES;	/* shouldn't it be ENOSYS? */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
A
Al Viro 已提交
2877
	error = dir->i_op->create(dir, dentry, mode, want_excl);
2878
	if (!error)
2879
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
2880 2881
	return error;
}
2882
EXPORT_SYMBOL(vfs_create);
L
Linus Torvalds 已提交
2883

2884 2885 2886 2887 2888 2889
bool may_open_dev(const struct path *path)
{
	return !(path->mnt->mnt_flags & MNT_NODEV) &&
		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}

A
Al Viro 已提交
2890
static int may_open(struct path *path, int acc_mode, int flag)
L
Linus Torvalds 已提交
2891
{
2892
	struct dentry *dentry = path->dentry;
L
Linus Torvalds 已提交
2893 2894 2895 2896 2897 2898
	struct inode *inode = dentry->d_inode;
	int error;

	if (!inode)
		return -ENOENT;

C
Christoph Hellwig 已提交
2899 2900
	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
L
Linus Torvalds 已提交
2901
		return -ELOOP;
C
Christoph Hellwig 已提交
2902 2903 2904 2905 2906 2907
	case S_IFDIR:
		if (acc_mode & MAY_WRITE)
			return -EISDIR;
		break;
	case S_IFBLK:
	case S_IFCHR:
2908
		if (!may_open_dev(path))
L
Linus Torvalds 已提交
2909
			return -EACCES;
C
Christoph Hellwig 已提交
2910 2911 2912
		/*FALLTHRU*/
	case S_IFIFO:
	case S_IFSOCK:
L
Linus Torvalds 已提交
2913
		flag &= ~O_TRUNC;
C
Christoph Hellwig 已提交
2914
		break;
2915
	}
2916

A
Al Viro 已提交
2917
	error = inode_permission(inode, MAY_OPEN | acc_mode);
2918 2919
	if (error)
		return error;
M
Mimi Zohar 已提交
2920

L
Linus Torvalds 已提交
2921 2922 2923 2924
	/*
	 * An append-only file must be opened in append mode for writing.
	 */
	if (IS_APPEND(inode)) {
2925
		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2926
			return -EPERM;
L
Linus Torvalds 已提交
2927
		if (flag & O_TRUNC)
2928
			return -EPERM;
L
Linus Torvalds 已提交
2929 2930 2931
	}

	/* O_NOATIME can only be set by the owner or superuser */
2932
	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2933
		return -EPERM;
L
Linus Torvalds 已提交
2934

2935
	return 0;
2936
}
L
Linus Torvalds 已提交
2937

2938
static int handle_truncate(struct file *filp)
2939
{
2940
	struct path *path = &filp->f_path;
2941 2942 2943 2944 2945 2946 2947
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;
	/*
	 * Refuse to truncate files with mandatory locks held on them.
	 */
2948
	error = locks_verify_locked(filp);
2949
	if (!error)
2950
		error = security_path_truncate(path);
2951 2952 2953
	if (!error) {
		error = do_truncate(path->dentry, 0,
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
2954
				    filp);
2955 2956
	}
	put_write_access(inode);
M
Mimi Zohar 已提交
2957
	return error;
L
Linus Torvalds 已提交
2958 2959
}

2960 2961
/*
 * open_to_namei_flags - adjust open flags before they are passed to
 * ->atomic_open().
 *
 * An access mode of 3 (both low access-mode bits set) is decremented to 2;
 * every other value, and every bit outside O_ACCMODE, is returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	if ((flag & O_ACCMODE) == 3)
		flag--;
	return flag;
}

2967
/*
 * may_o_create - may @dentry be created in directory @dir with @mode?
 *
 * Runs, in order, the path-based mknod security hook, a MAY_WRITE|MAY_EXEC
 * permission check on the directory inode and the inode-based create
 * security hook.  Returns 0 or the first negative errno encountered.
 */
static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
{
	int error = security_path_mknod(dir, dentry, mode, 0);
	if (error)
		return error;

	error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;

	return security_inode_create(dir->dentry->d_inode, dentry, mode);
}

2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Returns 0 if successful.  The file will have been created and attached to
 * @file by the filesystem calling finish_open().
 *
 * Returns 1 if the file was looked up only or didn't need creating.  The
 * caller will need to perform the open themselves.  @path will have been
 * updated to point to the new dentry.  This may be negative.
 *
 * Returns an error code otherwise.
 */
2993 2994 2995
static int atomic_open(struct nameidata *nd, struct dentry *dentry,
			struct path *path, struct file *file,
			const struct open_flags *op,
2996
			int open_flag, umode_t mode,
2997
			int *opened)
M
Miklos Szeredi 已提交
2998
{
2999
	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
M
Miklos Szeredi 已提交
3000 3001 3002
	struct inode *dir =  nd->path.dentry->d_inode;
	int error;

3003
	if (!(~open_flag & (O_EXCL | O_CREAT)))	/* both O_EXCL and O_CREAT */
M
Miklos Szeredi 已提交
3004 3005 3006 3007 3008
		open_flag &= ~O_TRUNC;

	if (nd->flags & LOOKUP_DIRECTORY)
		open_flag |= O_DIRECTORY;

A
Al Viro 已提交
3009 3010
	file->f_path.dentry = DENTRY_NOT_SET;
	file->f_path.mnt = nd->path.mnt;
3011 3012 3013
	error = dir->i_op->atomic_open(dir, dentry, file,
				       open_to_namei_flags(open_flag),
				       mode, opened);
3014
	d_lookup_done(dentry);
3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029
	if (!error) {
		/*
		 * We didn't have the inode before the open, so check open
		 * permission here.
		 */
		int acc_mode = op->acc_mode;
		if (*opened & FILE_CREATED) {
			WARN_ON(!(open_flag & O_CREAT));
			fsnotify_create(dir, dentry);
			acc_mode = 0;
		}
		error = may_open(&file->f_path, acc_mode, open_flag);
		if (WARN_ON(error > 0))
			error = -EINVAL;
	} else if (error > 0) {
A
Al Viro 已提交
3030
		if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
3031
			error = -EIO;
3032
		} else {
3033 3034 3035
			if (file->f_path.dentry) {
				dput(dentry);
				dentry = file->f_path.dentry;
3036
			}
3037 3038 3039 3040 3041
			if (*opened & FILE_CREATED)
				fsnotify_create(dir, dentry);
			path->dentry = dentry;
			path->mnt = nd->path.mnt;
			return 1;
3042
		}
M
Miklos Szeredi 已提交
3043 3044
	}
	dput(dentry);
3045
	return error;
M
Miklos Szeredi 已提交
3046 3047
}

M
Miklos Szeredi 已提交
3048
/*
3049
 * Look up and maybe create and open the last component.
M
Miklos Szeredi 已提交
3050 3051 3052
 *
 * Must be called with i_mutex held on parent.
 *
3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064
 * Returns 0 if the file was successfully atomically created (if necessary) and
 * opened.  In this case the file will be returned attached to @file.
 *
 * Returns 1 if the file was not completely opened at this time, though lookups
 * and creations will have been performed and the dentry returned in @path will
 * be positive upon return if O_CREAT was specified.  If O_CREAT wasn't
 * specified then a negative dentry may be returned.
 *
 * An error code is returned otherwise.
 *
 * FILE_CREATE will be set in @*opened if the dentry was created and will be
 * cleared otherwise prior to returning.
M
Miklos Szeredi 已提交
3065
 */
3066 3067 3068
static int lookup_open(struct nameidata *nd, struct path *path,
			struct file *file,
			const struct open_flags *op,
3069
			bool got_write, int *opened)
M
Miklos Szeredi 已提交
3070 3071
{
	struct dentry *dir = nd->path.dentry;
3072
	struct inode *dir_inode = dir->d_inode;
3073
	int open_flag = op->open_flag;
M
Miklos Szeredi 已提交
3074
	struct dentry *dentry;
3075 3076
	int error, create_error = 0;
	umode_t mode = op->mode;
3077
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
M
Miklos Szeredi 已提交
3078

3079 3080
	if (unlikely(IS_DEADDIR(dir_inode)))
		return -ENOENT;
M
Miklos Szeredi 已提交
3081

3082
	*opened &= ~FILE_CREATED;
3083 3084 3085 3086 3087 3088 3089 3090 3091
	dentry = d_lookup(dir, &nd->last);
	for (;;) {
		if (!dentry) {
			dentry = d_alloc_parallel(dir, &nd->last, &wq);
			if (IS_ERR(dentry))
				return PTR_ERR(dentry);
		}
		if (d_in_lookup(dentry))
			break;
M
Miklos Szeredi 已提交
3092

3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105
		if (!(dentry->d_flags & DCACHE_OP_REVALIDATE))
			break;

		error = d_revalidate(dentry, nd->flags);
		if (likely(error > 0))
			break;
		if (error)
			goto out_dput;
		d_invalidate(dentry);
		dput(dentry);
		dentry = NULL;
	}
	if (dentry->d_inode) {
3106
		/* Cached positive dentry: will open in f_op->open */
M
Miklos Szeredi 已提交
3107
		goto out_no_open;
3108
	}
M
Miklos Szeredi 已提交
3109

3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142
	/*
	 * Checking write permission is tricky, bacuse we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists.  But checking existence breaks atomicity.  The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returing the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
	 */
	if (open_flag & O_CREAT) {
		if (!IS_POSIXACL(dir->d_inode))
			mode &= ~current_umask();
		if (unlikely(!got_write)) {
			create_error = -EROFS;
			open_flag &= ~O_CREAT;
			if (open_flag & (O_EXCL | O_TRUNC))
				goto no_open;
			/* No side effects, safe to clear O_CREAT */
		} else {
			create_error = may_o_create(&nd->path, dentry, mode);
			if (create_error) {
				open_flag &= ~O_CREAT;
				if (open_flag & O_EXCL)
					goto no_open;
			}
		}
	} else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) &&
		   unlikely(!got_write)) {
		/*
		 * No O_CREATE -> atomicity not a requirement -> fall
		 * back to lookup + open
		 */
		goto no_open;
M
Miklos Szeredi 已提交
3143 3144
	}

3145
	if (dir_inode->i_op->atomic_open) {
3146 3147 3148 3149 3150
		error = atomic_open(nd, dentry, path, file, op, open_flag,
				    mode, opened);
		if (unlikely(error == -ENOENT) && create_error)
			error = create_error;
		return error;
M
Miklos Szeredi 已提交
3151
	}
3152

3153
no_open:
3154
	if (d_in_lookup(dentry)) {
3155 3156
		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
							     nd->flags);
3157
		d_lookup_done(dentry);
3158 3159 3160 3161 3162 3163 3164 3165
		if (unlikely(res)) {
			if (IS_ERR(res)) {
				error = PTR_ERR(res);
				goto out_dput;
			}
			dput(dentry);
			dentry = res;
		}
3166 3167
	}

M
Miklos Szeredi 已提交
3168
	/* Negative dentry, just create the file */
3169
	if (!dentry->d_inode && (open_flag & O_CREAT)) {
3170
		*opened |= FILE_CREATED;
3171 3172 3173
		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
		if (!dir_inode->i_op->create) {
			error = -EACCES;
M
Miklos Szeredi 已提交
3174
			goto out_dput;
3175 3176
		}
		error = dir_inode->i_op->create(dir_inode, dentry, mode,
3177
						open_flag & O_EXCL);
M
Miklos Szeredi 已提交
3178 3179
		if (error)
			goto out_dput;
3180
		fsnotify_create(dir_inode, dentry);
M
Miklos Szeredi 已提交
3181
	}
3182 3183 3184
	if (unlikely(create_error) && !dentry->d_inode) {
		error = create_error;
		goto out_dput;
M
Miklos Szeredi 已提交
3185
	}
M
Miklos Szeredi 已提交
3186
out_no_open:
M
Miklos Szeredi 已提交
3187 3188
	path->dentry = dentry;
	path->mnt = nd->path.mnt;
3189
	return 1;
M
Miklos Szeredi 已提交
3190 3191 3192

out_dput:
	dput(dentry);
3193
	return error;
M
Miklos Szeredi 已提交
3194 3195
}

N
Nick Piggin 已提交
3196
/*
3197
 * Handle the last step of open()
N
Nick Piggin 已提交
3198
 */
3199
static int do_last(struct nameidata *nd,
3200
		   struct file *file, const struct open_flags *op,
A
Al Viro 已提交
3201
		   int *opened)
3202
{
3203
	struct dentry *dir = nd->path.dentry;
3204
	int open_flag = op->open_flag;
M
Miklos Szeredi 已提交
3205
	bool will_truncate = (open_flag & O_TRUNC) != 0;
3206
	bool got_write = false;
A
Al Viro 已提交
3207
	int acc_mode = op->acc_mode;
3208
	unsigned seq;
3209
	struct inode *inode;
3210
	struct path save_parent = { .dentry = NULL, .mnt = NULL };
3211
	struct path path;
3212
	bool retried = false;
A
Al Viro 已提交
3213
	int error;
3214

3215 3216 3217
	nd->flags &= ~LOOKUP_PARENT;
	nd->flags |= op->intent;

3218
	if (nd->last_type != LAST_NORM) {
3219
		error = handle_dots(nd, nd->last_type);
3220
		if (unlikely(error))
3221
			return error;
M
Miklos Szeredi 已提交
3222
		goto finish_open;
3223
	}
3224

3225
	if (!(open_flag & O_CREAT)) {
3226 3227 3228
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		/* we _can_ be in RCU mode here */
3229
		error = lookup_fast(nd, &path, &inode, &seq);
3230
		if (likely(error > 0))
3231 3232 3233
			goto finish_lookup;

		if (error < 0)
3234
			return error;
3235 3236

		BUG_ON(nd->inode != dir->d_inode);
A
Al Viro 已提交
3237
		BUG_ON(nd->flags & LOOKUP_RCU);
3238 3239 3240 3241 3242 3243 3244 3245
	} else {
		/* create side of things */
		/*
		 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
		 * has been cleared when we got to the last component we are
		 * about to look up
		 */
		error = complete_walk(nd);
3246
		if (error)
3247
			return error;
3248

A
Al Viro 已提交
3249
		audit_inode(nd->name, dir, LOOKUP_PARENT);
3250
		/* trailing slashes? */
3251 3252
		if (unlikely(nd->last.name[nd->last.len]))
			return -EISDIR;
3253
	}
A
Al Viro 已提交
3254

3255
retry_lookup:
3256
	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
3257 3258 3259 3260 3261 3262 3263 3264 3265
		error = mnt_want_write(nd->path.mnt);
		if (!error)
			got_write = true;
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
3266 3267 3268 3269
	if (open_flag & O_CREAT)
		inode_lock(dir->d_inode);
	else
		inode_lock_shared(dir->d_inode);
3270
	error = lookup_open(nd, &path, file, op, got_write, opened);
3271 3272 3273 3274
	if (open_flag & O_CREAT)
		inode_unlock(dir->d_inode);
	else
		inode_unlock_shared(dir->d_inode);
3275

3276 3277
	if (error <= 0) {
		if (error)
M
Miklos Szeredi 已提交
3278 3279
			goto out;

3280
		if ((*opened & FILE_CREATED) ||
A
Al Viro 已提交
3281
		    !S_ISREG(file_inode(file)->i_mode))
M
Miklos Szeredi 已提交
3282
			will_truncate = false;
M
Miklos Szeredi 已提交
3283

A
Al Viro 已提交
3284
		audit_inode(nd->name, file->f_path.dentry, 0);
M
Miklos Szeredi 已提交
3285 3286
		goto opened;
	}
3287

3288
	if (*opened & FILE_CREATED) {
3289
		/* Don't check for write permission, don't truncate */
3290
		open_flag &= ~O_TRUNC;
M
Miklos Szeredi 已提交
3291
		will_truncate = false;
A
Al Viro 已提交
3292
		acc_mode = 0;
3293
		path_to_nameidata(&path, nd);
M
Miklos Szeredi 已提交
3294
		goto finish_open_created;
3295 3296
	}

M
Miklos Szeredi 已提交
3297 3298 3299 3300 3301
	/*
	 * If atomic_open() acquired write access it is dropped now due to
	 * possible mount and symlink following (this might be optimized away if
	 * necessary...)
	 */
3302
	if (got_write) {
M
Miklos Szeredi 已提交
3303
		mnt_drop_write(nd->path.mnt);
3304
		got_write = false;
M
Miklos Szeredi 已提交
3305 3306
	}

A
Al Viro 已提交
3307 3308 3309 3310 3311 3312 3313 3314 3315 3316
	if (unlikely(d_is_negative(path.dentry))) {
		path_to_nameidata(&path, nd);
		return -ENOENT;
	}

	/*
	 * create/update audit record if it already exists.
	 */
	audit_inode(nd->name, path.dentry, 0);

3317 3318 3319 3320
	if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
		path_to_nameidata(&path, nd);
		return -EEXIST;
	}
3321

3322
	error = follow_managed(&path, nd);
3323 3324
	if (unlikely(error < 0))
		return error;
3325

3326
	seq = 0;	/* out of RCU mode, so the value doesn't matter */
3327
	inode = d_backing_inode(path.dentry);
3328
finish_lookup:
3329 3330
	if (nd->depth)
		put_link(nd);
3331 3332
	error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW,
				   inode, seq);
3333
	if (unlikely(error))
3334
		return error;
3335

3336 3337
	if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
		path_to_nameidata(&path, nd);
3338 3339
	} else {
		save_parent.dentry = nd->path.dentry;
3340 3341
		save_parent.mnt = mntget(path.mnt);
		nd->path.dentry = path.dentry;
3342 3343

	}
3344
	nd->inode = inode;
3345
	nd->seq = seq;
3346
	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
3347
finish_open:
3348
	error = complete_walk(nd);
3349 3350
	if (error) {
		path_put(&save_parent);
3351
		return error;
3352
	}
A
Al Viro 已提交
3353
	audit_inode(nd->name, nd->path.dentry, 0);
3354
	error = -EISDIR;
M
Miklos Szeredi 已提交
3355
	if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
3356
		goto out;
3357
	error = -ENOTDIR;
M
Miklos Szeredi 已提交
3358
	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3359
		goto out;
3360
	if (!d_is_reg(nd->path.dentry))
M
Miklos Szeredi 已提交
3361
		will_truncate = false;
3362

3363 3364 3365
	if (will_truncate) {
		error = mnt_want_write(nd->path.mnt);
		if (error)
3366
			goto out;
3367
		got_write = true;
3368
	}
M
Miklos Szeredi 已提交
3369
finish_open_created:
3370 3371 3372
	error = may_open(&nd->path, acc_mode, open_flag);
	if (error)
		goto out;
M
Miklos Szeredi 已提交
3373 3374 3375 3376 3377
	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
	error = vfs_open(&nd->path, file, current_cred());
	if (!error) {
		*opened |= FILE_OPENED;
	} else {
A
Al Viro 已提交
3378
		if (error == -EOPENSTALE)
M
Miklos Szeredi 已提交
3379
			goto stale_open;
3380
		goto out;
M
Miklos Szeredi 已提交
3381
	}
3382
opened:
3383
	error = open_check_o_direct(file);
3384 3385 3386
	if (!error)
		error = ima_file_check(file, op->acc_mode, *opened);
	if (!error && will_truncate)
3387
		error = handle_truncate(file);
3388
out:
3389 3390
	if (unlikely(error) && (*opened & FILE_OPENED))
		fput(file);
3391 3392 3393 3394
	if (unlikely(error > 0)) {
		WARN_ON(1);
		error = -EINVAL;
	}
3395
	if (got_write)
3396
		mnt_drop_write(nd->path.mnt);
3397
	path_put(&save_parent);
3398
	return error;
3399

M
Miklos Szeredi 已提交
3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410
stale_open:
	/* If no saved parent or already retried then can't retry */
	if (!save_parent.dentry || retried)
		goto out;

	BUG_ON(save_parent.dentry != dir);
	path_put(&nd->path);
	nd->path = save_parent;
	nd->inode = dir->d_inode;
	save_parent.mnt = NULL;
	save_parent.dentry = NULL;
3411
	if (got_write) {
M
Miklos Szeredi 已提交
3412
		mnt_drop_write(nd->path.mnt);
3413
		got_write = false;
M
Miklos Szeredi 已提交
3414 3415 3416
	}
	retried = true;
	goto retry_lookup;
3417 3418
}

3419
static int do_tmpfile(struct nameidata *nd, unsigned flags,
3420 3421 3422 3423
		const struct open_flags *op,
		struct file *file, int *opened)
{
	static const struct qstr name = QSTR_INIT("/", 1);
3424
	struct dentry *child;
3425
	struct inode *dir;
3426
	struct path path;
3427
	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
3428 3429
	if (unlikely(error))
		return error;
3430
	error = mnt_want_write(path.mnt);
3431 3432
	if (unlikely(error))
		goto out;
3433
	dir = path.dentry->d_inode;
3434
	/* we want directory to be writable */
3435
	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
3436 3437 3438 3439 3440 3441
	if (error)
		goto out2;
	if (!dir->i_op->tmpfile) {
		error = -EOPNOTSUPP;
		goto out2;
	}
3442
	child = d_alloc(path.dentry, &name);
3443 3444 3445 3446
	if (unlikely(!child)) {
		error = -ENOMEM;
		goto out2;
	}
3447 3448 3449
	dput(path.dentry);
	path.dentry = child;
	error = dir->i_op->tmpfile(dir, child, op->mode);
3450 3451
	if (error)
		goto out2;
3452
	audit_inode(nd->name, child, 0);
3453
	/* Don't check for other permissions, the inode was just created */
A
Al Viro 已提交
3454
	error = may_open(&path, 0, op->open_flag);
3455 3456
	if (error)
		goto out2;
3457 3458
	file->f_path.mnt = path.mnt;
	error = finish_open(file, child, NULL, opened);
3459 3460 3461
	if (error)
		goto out2;
	error = open_check_o_direct(file);
3462
	if (error) {
3463
		fput(file);
3464 3465 3466 3467 3468 3469
	} else if (!(op->open_flag & O_EXCL)) {
		struct inode *inode = file_inode(file);
		spin_lock(&inode->i_lock);
		inode->i_state |= I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
3470
out2:
3471
	mnt_drop_write(path.mnt);
3472
out:
3473
	path_put(&path);
3474 3475 3476
	return error;
}

3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488
static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
	struct path path;
	int error = path_lookupat(nd, flags, &path);
	if (!error) {
		audit_inode(nd->name, path.dentry, 0);
		error = vfs_open(&path, file, current_cred());
		path_put(&path);
	}
	return error;
}

3489 3490
static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
L
Linus Torvalds 已提交
3491
{
3492
	const char *s;
A
Al Viro 已提交
3493
	struct file *file;
3494
	int opened = 0;
3495
	int error;
N
Nick Piggin 已提交
3496

A
Al Viro 已提交
3497
	file = get_empty_filp();
3498 3499
	if (IS_ERR(file))
		return file;
N
Nick Piggin 已提交
3500

A
Al Viro 已提交
3501
	file->f_flags = op->open_flag;
N
Nick Piggin 已提交
3502

A
Al Viro 已提交
3503
	if (unlikely(file->f_flags & __O_TMPFILE)) {
3504
		error = do_tmpfile(nd, flags, op, file, &opened);
A
Al Viro 已提交
3505
		goto out2;
3506 3507
	}

3508 3509 3510 3511 3512 3513 3514
	if (unlikely(file->f_flags & O_PATH)) {
		error = do_o_path(nd, flags, file);
		if (!error)
			opened |= FILE_OPENED;
		goto out2;
	}

3515
	s = path_init(nd, flags);
3516 3517 3518 3519
	if (IS_ERR(s)) {
		put_filp(file);
		return ERR_CAST(s);
	}
3520
	while (!(error = link_path_walk(s, nd)) &&
A
Al Viro 已提交
3521
		(error = do_last(nd, file, op, &opened)) > 0) {
A
Al Viro 已提交
3522
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
3523 3524 3525
		s = trailing_symlink(nd);
		if (IS_ERR(s)) {
			error = PTR_ERR(s);
3526
			break;
3527
		}
3528
	}
3529
	terminate_walk(nd);
A
Al Viro 已提交
3530
out2:
3531 3532
	if (!(opened & FILE_OPENED)) {
		BUG_ON(!error);
A
Al Viro 已提交
3533
		put_filp(file);
3534
	}
3535 3536 3537 3538 3539 3540 3541 3542 3543 3544
	if (unlikely(error)) {
		if (error == -EOPENSTALE) {
			if (flags & LOOKUP_RCU)
				error = -ECHILD;
			else
				error = -ESTALE;
		}
		file = ERR_PTR(error);
	}
	return file;
L
Linus Torvalds 已提交
3545 3546
}

3547
struct file *do_filp_open(int dfd, struct filename *pathname,
3548
		const struct open_flags *op)
3549
{
3550
	struct nameidata nd;
3551
	int flags = op->lookup_flags;
3552 3553
	struct file *filp;

3554
	set_nameidata(&nd, dfd, pathname);
3555
	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
3556
	if (unlikely(filp == ERR_PTR(-ECHILD)))
3557
		filp = path_openat(&nd, op, flags);
3558
	if (unlikely(filp == ERR_PTR(-ESTALE)))
3559
		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
3560
	restore_nameidata();
3561 3562 3563
	return filp;
}

A
Al Viro 已提交
3564
struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3565
		const char *name, const struct open_flags *op)
A
Al Viro 已提交
3566
{
3567
	struct nameidata nd;
A
Al Viro 已提交
3568
	struct file *file;
3569
	struct filename *filename;
3570
	int flags = op->lookup_flags | LOOKUP_ROOT;
A
Al Viro 已提交
3571 3572 3573 3574

	nd.root.mnt = mnt;
	nd.root.dentry = dentry;

3575
	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
A
Al Viro 已提交
3576 3577
		return ERR_PTR(-ELOOP);

3578
	filename = getname_kernel(name);
3579
	if (IS_ERR(filename))
3580 3581
		return ERR_CAST(filename);

3582
	set_nameidata(&nd, -1, filename);
3583
	file = path_openat(&nd, op, flags | LOOKUP_RCU);
A
Al Viro 已提交
3584
	if (unlikely(file == ERR_PTR(-ECHILD)))
3585
		file = path_openat(&nd, op, flags);
A
Al Viro 已提交
3586
	if (unlikely(file == ERR_PTR(-ESTALE)))
3587
		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
3588
	restore_nameidata();
3589
	putname(filename);
A
Al Viro 已提交
3590 3591 3592
	return file;
}

3593
static struct dentry *filename_create(int dfd, struct filename *name,
3594
				struct path *path, unsigned int lookup_flags)
L
Linus Torvalds 已提交
3595
{
3596
	struct dentry *dentry = ERR_PTR(-EEXIST);
3597 3598
	struct qstr last;
	int type;
3599
	int err2;
3600 3601 3602 3603 3604 3605 3606 3607 3608
	int error;
	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);

	/*
	 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
	 * other flags passed in are ignored!
	 */
	lookup_flags &= LOOKUP_REVAL;

3609 3610 3611
	name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
	if (IS_ERR(name))
		return ERR_CAST(name);
L
Linus Torvalds 已提交
3612

3613 3614 3615 3616
	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
3617
	if (unlikely(type != LAST_NORM))
A
Al Viro 已提交
3618
		goto out;
3619

3620
	/* don't fail immediately if it's r/o, at least try to report other errors */
3621
	err2 = mnt_want_write(path->mnt);
3622 3623 3624
	/*
	 * Do the final lookup.
	 */
3625
	lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
A
Al Viro 已提交
3626
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
3627
	dentry = __lookup_hash(&last, path->dentry, lookup_flags);
L
Linus Torvalds 已提交
3628
	if (IS_ERR(dentry))
3629
		goto unlock;
3630

3631
	error = -EEXIST;
3632
	if (d_is_positive(dentry))
3633
		goto fail;
3634

3635 3636 3637 3638 3639 3640
	/*
	 * Special case - lookup gave negative, but... we had foo/bar/
	 * From the vfs_mknod() POV we just have a negative dentry -
	 * all is fine. Let's be bastards - you had / on the end, you've
	 * been asking for (non-existent) directory. -ENOENT for you.
	 */
3641
	if (unlikely(!is_dir && last.name[last.len])) {
3642
		error = -ENOENT;
A
Al Viro 已提交
3643
		goto fail;
3644
	}
3645 3646
	if (unlikely(err2)) {
		error = err2;
3647
		goto fail;
3648
	}
3649
	putname(name);
L
Linus Torvalds 已提交
3650 3651
	return dentry;
fail:
3652 3653 3654
	dput(dentry);
	dentry = ERR_PTR(error);
unlock:
A
Al Viro 已提交
3655
	inode_unlock(path->dentry->d_inode);
3656
	if (!err2)
3657
		mnt_drop_write(path->mnt);
A
Al Viro 已提交
3658
out:
3659
	path_put(path);
3660
	putname(name);
L
Linus Torvalds 已提交
3661 3662
	return dentry;
}
3663 3664 3665 3666

/*
 * kern_path_create - filename_create() for a kernel-space pathname string.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	return filename_create(dfd, getname_kernel(pathname),
				path, lookup_flags);
}
3670 3671
EXPORT_SYMBOL(kern_path_create);

A
Al Viro 已提交
3672 3673 3674
void done_path_create(struct path *path, struct dentry *dentry)
{
	dput(dentry);
A
Al Viro 已提交
3675
	inode_unlock(path->dentry->d_inode);
3676
	mnt_drop_write(path->mnt);
A
Al Viro 已提交
3677 3678 3679 3680
	path_put(path);
}
EXPORT_SYMBOL(done_path_create);

A
Al Viro 已提交
3681
/*
 * user_path_create - filename_create() for a user-space pathname pointer.
 */
inline struct dentry *user_path_create(int dfd, const char __user *pathname,
				struct path *path, unsigned int lookup_flags)
{
	return filename_create(dfd, getname(pathname), path, lookup_flags);
}
EXPORT_SYMBOL(user_path_create);

A
Al Viro 已提交
3688
/*
 * vfs_mknod - create a filesystem node.
 * @dir:	parent directory inode
 * @dentry:	dentry for the new node
 * @mode:	type and permission bits
 * @dev:	device number for device nodes
 *
 * Returns 0 or a negative errno.  -EPERM is returned when a character or
 * block device is requested without CAP_MKNOD, or when the directory has no
 * ->mknod() operation.  fsnotify_create() is called on success.
 */
int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
	int error = may_create(dir, dentry);

	if (error)
		return error;

	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
		return -EPERM;

	if (!dir->i_op->mknod)
		return -EPERM;

	error = devcgroup_inode_mknod(mode, dev);
	if (error)
		return error;

	error = security_inode_mknod(dir, dentry, mode, dev);
	if (error)
		return error;

	error = dir->i_op->mknod(dir, dentry, mode, dev);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
3714
EXPORT_SYMBOL(vfs_mknod);
L
Linus Torvalds 已提交
3715

A
Al Viro 已提交
3716
/*
 * may_mknod - validate the file type requested from mknod.
 *
 * Returns 0 for regular files (including a zero type), character and block
 * devices, FIFOs and sockets; -EPERM for directories; -EINVAL for any other
 * type.
 */
static int may_mknod(umode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
	case 0: /* zero mode translates to S_IFREG */
		return 0;
	case S_IFDIR:
		return -EPERM;
	default:
		return -EINVAL;
	}
}

A
Al Viro 已提交
3733
/*
 * mknodat(2): create a regular file, device node, FIFO or socket at
 * @filename relative to @dfd.  The whole operation is repeated once with
 * LOOKUP_REVAL when retry_estale() asks for it.
 */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned, dev)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = 0;

	error = may_mknod(mode);
	if (error)
		return error;
retry:
	dentry = user_path_create(dfd, filename, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = security_path_mknod(&path, dentry, mode, dev);
	if (error)
		goto out;
	/* may_mknod() above has already rejected every other file type */
	switch (mode & S_IFMT) {
	case 0: case S_IFREG:
		error = vfs_create(path.dentry->d_inode,dentry,mode,true);
		if (!error)
			ima_post_path_mknod(dentry);
		break;
	case S_IFCHR: case S_IFBLK:
		error = vfs_mknod(path.dentry->d_inode,dentry,mode,
				new_decode_dev(dev));
		break;
	case S_IFIFO: case S_IFSOCK:
		error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
		break;
	}
out:
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

A
Al Viro 已提交
3777
/* mknod(2): mknodat() relative to the current working directory. */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
	return sys_mknodat(AT_FDCWD, filename, mode, dev);
}

3782
/*
 * vfs_mkdir - create a directory.
 * @dir:	parent directory inode
 * @dentry:	dentry for the new directory
 * @mode:	permission bits; everything outside S_IRWXUGO|S_ISVTX is masked
 *
 * Returns 0 or a negative errno: -EPERM when the parent has no ->mkdir(),
 * -EMLINK when the superblock sets a link limit and the parent has reached
 * it.  fsnotify_mkdir() is called on success.
 */
int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int error = may_create(dir, dentry);
	unsigned max_links = dir->i_sb->s_max_links;

	if (error)
		return error;

	if (!dir->i_op->mkdir)
		return -EPERM;

	mode &= (S_IRWXUGO|S_ISVTX);
	error = security_inode_mkdir(dir, dentry, mode);
	if (error)
		return error;

	/* max_links == 0 means the filesystem imposes no limit here */
	if (max_links && dir->i_nlink >= max_links)
		return -EMLINK;

	error = dir->i_op->mkdir(dir, dentry, mode);
	if (!error)
		fsnotify_mkdir(dir, dentry);
	return error;
}
}
3806
EXPORT_SYMBOL(vfs_mkdir);
L
Linus Torvalds 已提交
3807

3808
/*
 * mkdirat(2): create a directory at @pathname relative to @dfd.  The whole
 * operation is repeated once with LOOKUP_REVAL when retry_estale() asks for
 * it.
 */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = LOOKUP_DIRECTORY;

retry:
	dentry = user_path_create(dfd, pathname, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = security_path_mkdir(&path, dentry, mode);
	if (!error)
		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

3833
/* mkdir(2): mkdirat() relative to the current working directory. */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
	return sys_mkdirat(AT_FDCWD, pathname, mode);
}

L
Linus Torvalds 已提交
3838 3839 3840 3841 3842 3843 3844
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	int error = may_delete(dir, dentry, 1);

	if (error)
		return error;

A
Al Viro 已提交
3845
	if (!dir->i_op->rmdir)
L
Linus Torvalds 已提交
3846 3847
		return -EPERM;

3848
	dget(dentry);
A
Al Viro 已提交
3849
	inode_lock(dentry->d_inode);
S
Sage Weil 已提交
3850 3851

	error = -EBUSY;
3852
	if (is_local_mountpoint(dentry))
S
Sage Weil 已提交
3853 3854 3855 3856 3857 3858
		goto out;

	error = security_inode_rmdir(dir, dentry);
	if (error)
		goto out;

3859
	shrink_dcache_parent(dentry);
S
Sage Weil 已提交
3860 3861 3862 3863 3864 3865
	error = dir->i_op->rmdir(dir, dentry);
	if (error)
		goto out;

	dentry->d_inode->i_flags |= S_DEAD;
	dont_mount(dentry);
3866
	detach_mounts(dentry);
S
Sage Weil 已提交
3867 3868

out:
A
Al Viro 已提交
3869
	inode_unlock(dentry->d_inode);
3870
	dput(dentry);
S
Sage Weil 已提交
3871
	if (!error)
L
Linus Torvalds 已提交
3872 3873 3874
		d_delete(dentry);
	return error;
}
3875
EXPORT_SYMBOL(vfs_rmdir);
L
Linus Torvalds 已提交
3876

3877
/*
 * do_rmdir - common implementation of directory removal by pathname.
 * @dfd:	directory fd the lookup is relative to
 * @pathname:	user-space pathname
 *
 * A last component of ".." yields -ENOTEMPTY, "." yields -EINVAL and the
 * root yields -EBUSY.  The whole operation is repeated once with
 * LOOKUP_REVAL when retry_estale() asks for it.  Returns 0 or a negative
 * errno.
 */
static long do_rmdir(int dfd, const char __user *pathname)
{
	int error = 0;
	struct filename *name;
	struct dentry *dentry;
	struct path path;
	struct qstr last;
	int type;
	unsigned int lookup_flags = 0;
retry:
	name = user_path_parent(dfd, pathname,
				&path, &last, &type, lookup_flags);
	if (IS_ERR(name))
		return PTR_ERR(name);

	switch (type) {
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto exit1;
	case LAST_DOT:
		error = -EINVAL;
		goto exit1;
	case LAST_ROOT:
		error = -EBUSY;
		goto exit1;
	}

	error = mnt_want_write(path.mnt);
	if (error)
		goto exit1;

	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto exit2;
	if (!dentry->d_inode) {
		error = -ENOENT;
		goto exit3;
	}
	error = security_path_rmdir(&path, dentry);
	if (error)
		goto exit3;
	error = vfs_rmdir(path.dentry->d_inode, dentry);
exit3:
	dput(dentry);
exit2:
	inode_unlock(path.dentry->d_inode);
	mnt_drop_write(path.mnt);
exit1:
	path_put(&path);
	putname(name);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

3936
/* rmdir(2): do_rmdir() relative to the current working directory. */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
	return do_rmdir(AT_FDCWD, pathname);
}

3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959
/**
 * vfs_unlink - unlink a filesystem object
 * @dir:	parent directory
 * @dentry:	victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 */
int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
3960
{
J
J. Bruce Fields 已提交
3961
	struct inode *target = dentry->d_inode;
L
Linus Torvalds 已提交
3962 3963 3964 3965 3966
	int error = may_delete(dir, dentry, 0);

	if (error)
		return error;

A
Al Viro 已提交
3967
	if (!dir->i_op->unlink)
L
Linus Torvalds 已提交
3968 3969
		return -EPERM;

A
Al Viro 已提交
3970
	inode_lock(target);
3971
	if (is_local_mountpoint(dentry))
L
Linus Torvalds 已提交
3972 3973 3974
		error = -EBUSY;
	else {
		error = security_inode_unlink(dir, dentry);
3975
		if (!error) {
3976 3977
			error = try_break_deleg(target, delegated_inode);
			if (error)
3978
				goto out;
L
Linus Torvalds 已提交
3979
			error = dir->i_op->unlink(dir, dentry);
3980
			if (!error) {
3981
				dont_mount(dentry);
3982 3983
				detach_mounts(dentry);
			}
3984
		}
L
Linus Torvalds 已提交
3985
	}
3986
out:
A
Al Viro 已提交
3987
	inode_unlock(target);
L
Linus Torvalds 已提交
3988 3989 3990

	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
J
J. Bruce Fields 已提交
3991
		fsnotify_link_count(target);
J
John McCutchan 已提交
3992
		d_delete(dentry);
L
Linus Torvalds 已提交
3993
	}
R
Robert Love 已提交
3994

L
Linus Torvalds 已提交
3995 3996
	return error;
}
3997
EXPORT_SYMBOL(vfs_unlink);
L
Linus Torvalds 已提交
3998 3999 4000

/*
 * Make sure that the actual truncation of the file will occur outside its
4001
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
L
Linus Torvalds 已提交
4002 4003 4004
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 */
4005
static long do_unlinkat(int dfd, const char __user *pathname)
L
Linus Torvalds 已提交
4006
{
4007
	int error;
4008
	struct filename *name;
L
Linus Torvalds 已提交
4009
	struct dentry *dentry;
4010 4011 4012
	struct path path;
	struct qstr last;
	int type;
L
Linus Torvalds 已提交
4013
	struct inode *inode = NULL;
4014
	struct inode *delegated_inode = NULL;
4015 4016
	unsigned int lookup_flags = 0;
retry:
4017 4018
	name = user_path_parent(dfd, pathname,
				&path, &last, &type, lookup_flags);
4019 4020
	if (IS_ERR(name))
		return PTR_ERR(name);
4021

L
Linus Torvalds 已提交
4022
	error = -EISDIR;
4023
	if (type != LAST_NORM)
L
Linus Torvalds 已提交
4024
		goto exit1;
4025

4026
	error = mnt_want_write(path.mnt);
4027 4028
	if (error)
		goto exit1;
4029
retry_deleg:
A
Al Viro 已提交
4030
	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
4031
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
L
Linus Torvalds 已提交
4032 4033 4034
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
		/* Why not before? Because we want correct error value */
4035
		if (last.name[last.len])
4036
			goto slashes;
L
Linus Torvalds 已提交
4037
		inode = dentry->d_inode;
4038
		if (d_is_negative(dentry))
4039 4040
			goto slashes;
		ihold(inode);
4041
		error = security_path_unlink(&path, dentry);
4042
		if (error)
4043
			goto exit2;
4044
		error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
4045
exit2:
L
Linus Torvalds 已提交
4046 4047
		dput(dentry);
	}
A
Al Viro 已提交
4048
	inode_unlock(path.dentry->d_inode);
L
Linus Torvalds 已提交
4049 4050
	if (inode)
		iput(inode);	/* truncate the inode here */
4051 4052
	inode = NULL;
	if (delegated_inode) {
4053
		error = break_deleg_wait(&delegated_inode);
4054 4055 4056
		if (!error)
			goto retry_deleg;
	}
4057
	mnt_drop_write(path.mnt);
L
Linus Torvalds 已提交
4058
exit1:
4059
	path_put(&path);
L
Linus Torvalds 已提交
4060
	putname(name);
4061 4062 4063 4064 4065
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		inode = NULL;
		goto retry;
	}
L
Linus Torvalds 已提交
4066 4067 4068
	return error;

slashes:
4069 4070
	if (d_is_negative(dentry))
		error = -ENOENT;
M
Miklos Szeredi 已提交
4071
	else if (d_is_dir(dentry))
4072 4073 4074
		error = -EISDIR;
	else
		error = -ENOTDIR;
L
Linus Torvalds 已提交
4075 4076 4077
	goto exit2;
}

4078
/*
 * unlinkat(2) entry point.  AT_REMOVEDIR is the only flag bit accepted;
 * any other bit yields -EINVAL.  With AT_REMOVEDIR the request goes to
 * do_rmdir(), otherwise to do_unlinkat().
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
	if (flag & ~AT_REMOVEDIR)
		return -EINVAL;

	return (flag & AT_REMOVEDIR) ? do_rmdir(dfd, pathname)
				     : do_unlinkat(dfd, pathname);
}

4089
/*
 * unlink(2) entry point: calls do_unlinkat() with AT_FDCWD as the
 * directory descriptor.
 */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
	const int dfd = AT_FDCWD;

	return do_unlinkat(dfd, pathname);
}

4094
int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
L
Linus Torvalds 已提交
4095
{
4096
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
4097 4098 4099 4100

	if (error)
		return error;

A
Al Viro 已提交
4101
	if (!dir->i_op->symlink)
L
Linus Torvalds 已提交
4102 4103 4104 4105 4106 4107 4108
		return -EPERM;

	error = security_inode_symlink(dir, dentry, oldname);
	if (error)
		return error;

	error = dir->i_op->symlink(dir, dentry, oldname);
4109
	if (!error)
4110
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
4111 4112
	return error;
}
4113
EXPORT_SYMBOL(vfs_symlink);
L
Linus Torvalds 已提交
4114

4115 4116
/*
 * symlinkat(2) entry point: create @newname (relative to @newdfd) as a
 * symlink containing @oldname.
 *
 * The link text is copied in once with getname(); the creation itself is
 * repeated with LOOKUP_REVAL added whenever retry_estale() asks for it.
 */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	unsigned int lookup_flags = 0;
	struct filename *target;
	struct dentry *dentry;
	struct path parent;
	int err;

	target = getname(oldname);
	if (IS_ERR(target))
		return PTR_ERR(target);

	for (;;) {
		dentry = user_path_create(newdfd, newname, &parent,
					  lookup_flags);
		if (IS_ERR(dentry)) {
			err = PTR_ERR(dentry);
			break;
		}

		err = security_path_symlink(&parent, dentry, target->name);
		if (!err)
			err = vfs_symlink(parent.dentry->d_inode, dentry,
					  target->name);
		done_path_create(&parent, dentry);

		if (!retry_estale(err, lookup_flags))
			break;
		lookup_flags |= LOOKUP_REVAL;
	}

	putname(target);
	return err;
}

4146
/*
 * symlink(2) entry point: forwards to sys_symlinkat() with AT_FDCWD as
 * the directory descriptor for @newname.
 */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
	const int newdfd = AT_FDCWD;

	return sys_symlinkat(oldname, newdfd, newname);
}

J
J. Bruce Fields 已提交
4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170
/**
 * vfs_link - create a new link
 * @old_dentry:	object to be linked
 * @dir:	new parent
 * @new_dentry:	where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 */
int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
4171 4172
{
	struct inode *inode = old_dentry->d_inode;
4173
	unsigned max_links = dir->i_sb->s_max_links;
L
Linus Torvalds 已提交
4174 4175 4176 4177 4178
	int error;

	if (!inode)
		return -ENOENT;

4179
	error = may_create(dir, new_dentry);
L
Linus Torvalds 已提交
4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190
	if (error)
		return error;

	if (dir->i_sb != inode->i_sb)
		return -EXDEV;

	/*
	 * A link to an append-only or immutable file cannot be created.
	 */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return -EPERM;
A
Al Viro 已提交
4191
	if (!dir->i_op->link)
L
Linus Torvalds 已提交
4192
		return -EPERM;
4193
	if (S_ISDIR(inode->i_mode))
L
Linus Torvalds 已提交
4194 4195 4196 4197 4198 4199
		return -EPERM;

	error = security_inode_link(old_dentry, dir, new_dentry);
	if (error)
		return error;

A
Al Viro 已提交
4200
	inode_lock(inode);
4201
	/* Make sure we don't allow creating hardlink to an unlinked file */
4202
	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
4203
		error =  -ENOENT;
4204 4205
	else if (max_links && inode->i_nlink >= max_links)
		error = -EMLINK;
J
J. Bruce Fields 已提交
4206 4207 4208 4209 4210
	else {
		error = try_break_deleg(inode, delegated_inode);
		if (!error)
			error = dir->i_op->link(old_dentry, dir, new_dentry);
	}
4211 4212 4213 4214 4215 4216

	if (!error && (inode->i_state & I_LINKABLE)) {
		spin_lock(&inode->i_lock);
		inode->i_state &= ~I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
A
Al Viro 已提交
4217
	inode_unlock(inode);
4218
	if (!error)
4219
		fsnotify_link(dir, inode, new_dentry);
L
Linus Torvalds 已提交
4220 4221
	return error;
}
4222
EXPORT_SYMBOL(vfs_link);
L
Linus Torvalds 已提交
4223 4224 4225 4226 4227 4228 4229 4230 4231 4232

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 */
4233 4234
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, int, flags)
L
Linus Torvalds 已提交
4235 4236
{
	struct dentry *new_dentry;
4237
	struct path old_path, new_path;
J
J. Bruce Fields 已提交
4238
	struct inode *delegated_inode = NULL;
4239
	int how = 0;
L
Linus Torvalds 已提交
4240 4241
	int error;

4242
	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
4243
		return -EINVAL;
4244
	/*
4245 4246 4247
	 * To use null names we require CAP_DAC_READ_SEARCH
	 * This ensures that not everyone will be able to create
	 * handlink using the passed filedescriptor.
4248
	 */
4249 4250 4251
	if (flags & AT_EMPTY_PATH) {
		if (!capable(CAP_DAC_READ_SEARCH))
			return -ENOENT;
4252
		how = LOOKUP_EMPTY;
4253
	}
4254 4255 4256

	if (flags & AT_SYMLINK_FOLLOW)
		how |= LOOKUP_FOLLOW;
4257
retry:
4258
	error = user_path_at(olddfd, oldname, how, &old_path);
L
Linus Torvalds 已提交
4259
	if (error)
4260 4261
		return error;

4262 4263
	new_dentry = user_path_create(newdfd, newname, &new_path,
					(how & LOOKUP_REVAL));
L
Linus Torvalds 已提交
4264
	error = PTR_ERR(new_dentry);
4265
	if (IS_ERR(new_dentry))
4266 4267 4268 4269 4270
		goto out;

	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto out_dput;
K
Kees Cook 已提交
4271 4272 4273
	error = may_linkat(&old_path);
	if (unlikely(error))
		goto out_dput;
4274
	error = security_path_link(old_path.dentry, &new_path, new_dentry);
4275
	if (error)
4276
		goto out_dput;
J
J. Bruce Fields 已提交
4277
	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
4278
out_dput:
A
Al Viro 已提交
4279
	done_path_create(&new_path, new_dentry);
J
J. Bruce Fields 已提交
4280 4281
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
4282 4283
		if (!error) {
			path_put(&old_path);
J
J. Bruce Fields 已提交
4284
			goto retry;
4285
		}
J
J. Bruce Fields 已提交
4286
	}
4287
	if (retry_estale(error, how)) {
4288
		path_put(&old_path);
4289 4290 4291
		how |= LOOKUP_REVAL;
		goto retry;
	}
L
Linus Torvalds 已提交
4292
out:
4293
	path_put(&old_path);
L
Linus Torvalds 已提交
4294 4295 4296 4297

	return error;
}

4298
/*
 * link(2) entry point: forwards to sys_linkat() with AT_FDCWD for both
 * directory descriptors and no flags.
 */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
	const int no_flags = 0;

	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, no_flags);
}

4303 4304 4305 4306 4307 4308 4309
/**
 * vfs_rename - rename a filesystem object
 * @old_dir:	parent of source
 * @old_dentry:	source
 * @new_dir:	parent of destination
 * @new_dentry:	destination
 * @delegated_inode: returns an inode needing a delegation break
M
Miklos Szeredi 已提交
4310
 * @flags:	rename flags
4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324
 *
 * The caller must hold multiple mutexes--see lock_rename()).
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
L
Linus Torvalds 已提交
4325 4326 4327
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
4328
 *	a) we can get into loop creation.
L
Linus Torvalds 已提交
4329 4330
 *	b) race potential - two innocent renames can create a loop together.
 *	   That's where 4.4 screws up. Current fix: serialization on
4331
 *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
L
Linus Torvalds 已提交
4332
 *	   story.
4333 4334
 *	c) we have to lock _four_ objects - parents and victim (if it exists),
 *	   and source (if it is not a directory).
4335
 *	   And that - after we got ->i_mutex on parents (until then we don't know
L
Linus Torvalds 已提交
4336 4337
 *	   whether the target exists).  Solution: try to be smart with locking
 *	   order for inodes.  We rely on the fact that tree topology may change
4338
 *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
L
Linus Torvalds 已提交
4339 4340 4341
 *	   move will be locked.  Thus we can rank directories by the tree
 *	   (ancestors first) and rank all non-directories after them.
 *	   That works since everybody except rename does "lock parent, lookup,
4342
 *	   lock child" and rename is under ->s_vfs_rename_mutex.
L
Linus Torvalds 已提交
4343 4344 4345
 *	   HOWEVER, it relies on the assumption that any object with ->lookup()
 *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *	   we'd better make sure that there's no link(2) for them.
4346
 *	d) conversion from fhandle to dentry may come in the wrong moment - when
4347
 *	   we are removing the target. Solution: we will have to grab ->i_mutex
L
Linus Torvalds 已提交
4348
 *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4349
 *	   ->i_mutex on parents, which works but leads to some truly excessive
L
Linus Torvalds 已提交
4350 4351
 *	   locking].
 */
4352 4353
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
	       struct inode *new_dir, struct dentry *new_dentry,
M
Miklos Szeredi 已提交
4354
	       struct inode **delegated_inode, unsigned int flags)
L
Linus Torvalds 已提交
4355
{
4356 4357 4358 4359
	int error;
	bool is_dir = d_is_dir(old_dentry);
	const unsigned char *old_name;
	struct inode *source = old_dentry->d_inode;
S
Sage Weil 已提交
4360
	struct inode *target = new_dentry->d_inode;
M
Miklos Szeredi 已提交
4361 4362
	bool new_is_dir = false;
	unsigned max_links = new_dir->i_sb->s_max_links;
4363

4364 4365 4366 4367 4368
	/*
	 * Check source == target.
	 * On overlayfs need to look at underlying inodes.
	 */
	if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0))
4369 4370 4371 4372 4373 4374
		return 0;

	error = may_delete(old_dir, old_dentry, is_dir);
	if (error)
		return error;

M
Miklos Szeredi 已提交
4375
	if (!target) {
4376
		error = may_create(new_dir, new_dentry);
M
Miklos Szeredi 已提交
4377 4378 4379 4380 4381 4382 4383 4384
	} else {
		new_is_dir = d_is_dir(new_dentry);

		if (!(flags & RENAME_EXCHANGE))
			error = may_delete(new_dir, new_dentry, is_dir);
		else
			error = may_delete(new_dir, new_dentry, new_is_dir);
	}
4385 4386 4387
	if (error)
		return error;

M
Miklos Szeredi 已提交
4388
	if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
4389
		return -EPERM;
L
Linus Torvalds 已提交
4390

M
Miklos Szeredi 已提交
4391 4392 4393
	if (flags && !old_dir->i_op->rename2)
		return -EINVAL;

L
Linus Torvalds 已提交
4394 4395 4396 4397
	/*
	 * If we are going to change the parent - check write permissions,
	 * we'll need to flip '..'.
	 */
M
Miklos Szeredi 已提交
4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408
	if (new_dir != old_dir) {
		if (is_dir) {
			error = inode_permission(source, MAY_WRITE);
			if (error)
				return error;
		}
		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
			error = inode_permission(target, MAY_WRITE);
			if (error)
				return error;
		}
L
Linus Torvalds 已提交
4409 4410
	}

4411 4412
	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
				      flags);
L
Linus Torvalds 已提交
4413 4414 4415
	if (error)
		return error;

4416
	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
4417
	dget(new_dentry);
M
Miklos Szeredi 已提交
4418
	if (!is_dir || (flags & RENAME_EXCHANGE))
4419 4420
		lock_two_nondirectories(source, target);
	else if (target)
A
Al Viro 已提交
4421
		inode_lock(target);
S
Sage Weil 已提交
4422 4423

	error = -EBUSY;
4424
	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
S
Sage Weil 已提交
4425 4426
		goto out;

M
Miklos Szeredi 已提交
4427
	if (max_links && new_dir != old_dir) {
4428
		error = -EMLINK;
M
Miklos Szeredi 已提交
4429
		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4430
			goto out;
M
Miklos Szeredi 已提交
4431 4432 4433 4434 4435 4436 4437
		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
		    old_dir->i_nlink >= max_links)
			goto out;
	}
	if (is_dir && !(flags & RENAME_EXCHANGE) && target)
		shrink_dcache_parent(new_dentry);
	if (!is_dir) {
4438
		error = try_break_deleg(source, delegated_inode);
4439 4440
		if (error)
			goto out;
M
Miklos Szeredi 已提交
4441 4442 4443 4444 4445
	}
	if (target && !new_is_dir) {
		error = try_break_deleg(target, delegated_inode);
		if (error)
			goto out;
4446
	}
M
Miklos Szeredi 已提交
4447
	if (!old_dir->i_op->rename2) {
M
Miklos Szeredi 已提交
4448 4449 4450
		error = old_dir->i_op->rename(old_dir, old_dentry,
					      new_dir, new_dentry);
	} else {
M
Miklos Szeredi 已提交
4451
		WARN_ON(old_dir->i_op->rename != NULL);
M
Miklos Szeredi 已提交
4452 4453 4454
		error = old_dir->i_op->rename2(old_dir, old_dentry,
					       new_dir, new_dentry, flags);
	}
S
Sage Weil 已提交
4455 4456 4457
	if (error)
		goto out;

M
Miklos Szeredi 已提交
4458
	if (!(flags & RENAME_EXCHANGE) && target) {
4459 4460
		if (is_dir)
			target->i_flags |= S_DEAD;
S
Sage Weil 已提交
4461
		dont_mount(new_dentry);
4462
		detach_mounts(new_dentry);
4463
	}
M
Miklos Szeredi 已提交
4464 4465 4466 4467 4468 4469
	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
		if (!(flags & RENAME_EXCHANGE))
			d_move(old_dentry, new_dentry);
		else
			d_exchange(old_dentry, new_dentry);
	}
S
Sage Weil 已提交
4470
out:
M
Miklos Szeredi 已提交
4471
	if (!is_dir || (flags & RENAME_EXCHANGE))
4472 4473
		unlock_two_nondirectories(source, target);
	else if (target)
A
Al Viro 已提交
4474
		inode_unlock(target);
L
Linus Torvalds 已提交
4475
	dput(new_dentry);
M
Miklos Szeredi 已提交
4476
	if (!error) {
4477
		fsnotify_move(old_dir, new_dir, old_name, is_dir,
M
Miklos Szeredi 已提交
4478 4479 4480 4481 4482 4483
			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
		if (flags & RENAME_EXCHANGE) {
			fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
				      new_is_dir, NULL, new_dentry);
		}
	}
R
Robert Love 已提交
4484 4485
	fsnotify_oldname_free(old_name);

L
Linus Torvalds 已提交
4486 4487
	return error;
}
4488
EXPORT_SYMBOL(vfs_rename);
L
Linus Torvalds 已提交
4489

M
Miklos Szeredi 已提交
4490 4491
/*
 * renameat2(2) entry point.
 *
 * Accepted flags are RENAME_NOREPLACE, RENAME_EXCHANGE and
 * RENAME_WHITEOUT; RENAME_EXCHANGE cannot be combined with either of the
 * other two, and RENAME_WHITEOUT requires CAP_MKNOD.
 *
 * Both parents are looked up, must be on the same mount, and are locked
 * with lock_rename() before the last components are resolved.  The
 * operation is restarted from the locking step after a successful
 * delegation break, and from the path lookup (with LOOKUP_REVAL) when
 * retry_estale() asks for it.
 */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, unsigned int, flags)
{
	const bool exchange = (flags & RENAME_EXCHANGE) != 0;
	const bool noreplace = (flags & RENAME_NOREPLACE) != 0;
	unsigned int target_flags = exchange ? 0 : LOOKUP_RENAME_TARGET;
	unsigned int lookup_flags = 0;
	struct inode *delegated_inode = NULL;
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
	struct path old_path, new_path;
	struct qstr old_last, new_last;
	int old_type, new_type;
	struct filename *from;
	struct filename *to;
	bool should_retry = false;
	int err;

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		return -EINVAL;

	if (exchange && (flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
		return -EINVAL;

	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
		return -EPERM;

retry:
	from = user_path_parent(olddfd, oldname, &old_path, &old_last,
				&old_type, lookup_flags);
	if (IS_ERR(from)) {
		err = PTR_ERR(from);
		return err;
	}

	to = user_path_parent(newdfd, newname, &new_path, &new_last,
			      &new_type, lookup_flags);
	if (IS_ERR(to)) {
		err = PTR_ERR(to);
		goto put_old_parent;
	}

	if (old_path.mnt != new_path.mnt) {
		err = -EXDEV;
		goto put_new_parent;
	}

	if (old_type != LAST_NORM) {
		err = -EBUSY;
		goto put_new_parent;
	}

	if (new_type != LAST_NORM) {
		err = noreplace ? -EEXIST : -EBUSY;
		goto put_new_parent;
	}

	err = mnt_want_write(old_path.mnt);
	if (err)
		goto put_new_parent;

retry_deleg:
	trap = lock_rename(new_path.dentry, old_path.dentry);

	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
	if (IS_ERR(old_dentry)) {
		err = PTR_ERR(old_dentry);
		goto unlock_parents;
	}
	/* source must exist */
	if (d_is_negative(old_dentry)) {
		err = -ENOENT;
		goto dput_old;
	}

	new_dentry = __lookup_hash(&new_last, new_path.dentry,
				   lookup_flags | target_flags);
	if (IS_ERR(new_dentry)) {
		err = PTR_ERR(new_dentry);
		goto dput_old;
	}
	if (noreplace && d_is_positive(new_dentry)) {
		err = -EEXIST;
		goto dput_new;
	}
	if (exchange) {
		/* an exchange needs an existing target */
		if (d_is_negative(new_dentry)) {
			err = -ENOENT;
			goto dput_new;
		}
		if (!d_is_dir(new_dentry) && new_last.name[new_last.len]) {
			err = -ENOTDIR;
			goto dput_new;
		}
	}
	/* unless the source is a directory trailing slashes give -ENOTDIR */
	if (!d_is_dir(old_dentry)) {
		if (old_last.name[old_last.len] ||
		    (!exchange && new_last.name[new_last.len])) {
			err = -ENOTDIR;
			goto dput_new;
		}
	}
	/* source should not be ancestor of target */
	if (old_dentry == trap) {
		err = -EINVAL;
		goto dput_new;
	}
	/* target should not be an ancestor of source */
	if (new_dentry == trap) {
		err = exchange ? -EINVAL : -ENOTEMPTY;
		goto dput_new;
	}

	err = security_path_rename(&old_path, old_dentry,
				   &new_path, new_dentry, flags);
	if (!err)
		err = vfs_rename(old_path.dentry->d_inode, old_dentry,
				 new_path.dentry->d_inode, new_dentry,
				 &delegated_inode, flags);
dput_new:
	dput(new_dentry);
dput_old:
	dput(old_dentry);
unlock_parents:
	unlock_rename(new_path.dentry, old_path.dentry);
	if (delegated_inode) {
		err = break_deleg_wait(&delegated_inode);
		if (!err)
			goto retry_deleg;
	}
	mnt_drop_write(old_path.mnt);
put_new_parent:
	if (retry_estale(err, lookup_flags))
		should_retry = true;
	path_put(&new_path);
	putname(to);
put_old_parent:
	path_put(&old_path);
	putname(from);
	if (should_retry) {
		should_retry = false;
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return err;
}

M
Miklos Szeredi 已提交
4633 4634 4635 4636 4637 4638
/*
 * renameat(2) entry point: forwards to sys_renameat2() with no flags.
 */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	const unsigned int no_flags = 0;

	return sys_renameat2(olddfd, oldname, newdfd, newname, no_flags);
}

4639
/*
 * rename(2) entry point: forwards to sys_renameat2() with AT_FDCWD for
 * both directory descriptors and no flags.
 */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
	const unsigned int no_flags = 0;

	return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, no_flags);
}

M
Miklos Szeredi 已提交
4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657
/*
 * Create a whiteout at @dentry in directory @dir.
 *
 * The whiteout is made through the directory's ->mknod method as a
 * character device with mode S_IFCHR | WHITEOUT_MODE and device number
 * WHITEOUT_DEV.  Returns the may_create() error if that check fails,
 * -EPERM if the directory has no ->mknod, otherwise the ->mknod() result.
 */
int vfs_whiteout(struct inode *dir, struct dentry *dentry)
{
	int err;

	err = may_create(dir, dentry);
	if (err)
		return err;

	if (!dir->i_op->mknod)
		return -EPERM;

	err = dir->i_op->mknod(dir, dentry,
			       S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
	return err;
}
EXPORT_SYMBOL(vfs_whiteout);

A
Al Viro 已提交
4658
/*
 * Copy the link body @link into the user buffer @buffer.
 *
 * If @link is an error pointer, its error code is returned unchanged.
 * Otherwise up to @buflen bytes of the string (no terminating NUL) are
 * copied and the number of bytes copied is returned, or -EFAULT when
 * copy_to_user() reports a failure.
 */
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
	int count;

	if (IS_ERR(link))
		return PTR_ERR(link);

	count = strlen(link);
	/* compared as unsigned, exactly like the clamp it implements */
	if ((unsigned) buflen < count)
		count = buflen;

	return copy_to_user(buffer, link, count) ? -EFAULT : count;
}

/*
 * A helper for ->readlink().  This should be used *ONLY* for symlinks
 * whose ->get_link() does not call nd_jump_link().  Whether to use it
 * for any given inode is up to the filesystem.
 *
 * The link body comes from inode->i_link when that is set, otherwise from
 * ->get_link(); any delayed call registered by ->get_link() is run after
 * the body has been copied to userspace.
 */
int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	DEFINE_DELAYED_CALL(cleanup);
	struct inode *inode = d_inode(dentry);
	const char *body = inode->i_link;
	int copied;

	if (!body) {
		body = inode->i_op->get_link(dentry, inode, &cleanup);
		if (IS_ERR(body))
			return PTR_ERR(body);
	}

	copied = readlink_copy(buffer, buflen, body);
	do_delayed_call(&cleanup);

	return copied;
}
EXPORT_SYMBOL(generic_readlink);
L
Linus Torvalds 已提交
4695 4696

/* get the link contents into pagecache */
4697
const char *page_get_link(struct dentry *dentry, struct inode *inode,
4698
			  struct delayed_call *callback)
L
Linus Torvalds 已提交
4699
{
4700 4701
	char *kaddr;
	struct page *page;
4702 4703
	struct address_space *mapping = inode->i_mapping;

4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716
	if (!dentry) {
		page = find_get_page(mapping, 0);
		if (!page)
			return ERR_PTR(-ECHILD);
		if (!PageUptodate(page)) {
			put_page(page);
			return ERR_PTR(-ECHILD);
		}
	} else {
		page = read_mapping_page(mapping, 0, NULL);
		if (IS_ERR(page))
			return (char*)page;
	}
4717
	set_delayed_call(callback, page_put_link, page);
4718 4719
	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
	kaddr = page_address(page);
4720
	nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
4721
	return kaddr;
L
Linus Torvalds 已提交
4722 4723
}

4724
EXPORT_SYMBOL(page_get_link);
L
Linus Torvalds 已提交
4725

4726
/*
 * Delayed-call callback registered by page_get_link(): @arg is passed
 * straight to put_page().
 */
void page_put_link(void *arg)
{
	void *pg = arg;

	put_page(pg);
}
EXPORT_SYMBOL(page_put_link);
L
Linus Torvalds 已提交
4731

4732 4733
/*
 * ->readlink() helper built on page_get_link(): fetch the link body for
 * @dentry, copy it to @buffer with readlink_copy(), then run the delayed
 * call set up by page_get_link().  Returns the readlink_copy() result.
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	DEFINE_DELAYED_CALL(cleanup);
	const char *body;
	int copied;

	body = page_get_link(dentry, d_inode(dentry), &cleanup);
	copied = readlink_copy(buffer, buflen, body);
	do_delayed_call(&cleanup);

	return copied;
}
EXPORT_SYMBOL(page_readlink);

4743 4744 4745 4746
/*
 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
 */
int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
L
Linus Torvalds 已提交
4747 4748
{
	struct address_space *mapping = inode->i_mapping;
4749
	struct page *page;
4750
	void *fsdata;
4751
	int err;
4752 4753 4754
	unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
	if (nofs)
		flags |= AOP_FLAG_NOFS;
L
Linus Torvalds 已提交
4755

4756
retry:
4757
	err = pagecache_write_begin(NULL, mapping, 0, len-1,
4758
				flags, &page, &fsdata);
L
Linus Torvalds 已提交
4759
	if (err)
4760 4761
		goto fail;

4762
	memcpy(page_address(page), symname, len-1);
4763 4764 4765

	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
							page, fsdata);
L
Linus Torvalds 已提交
4766 4767
	if (err < 0)
		goto fail;
4768 4769 4770
	if (err < len-1)
		goto retry;

L
Linus Torvalds 已提交
4771 4772 4773 4774 4775
	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}
4776
EXPORT_SYMBOL(__page_symlink);
L
Linus Torvalds 已提交
4777

4778 4779 4780
int page_symlink(struct inode *inode, const char *symname, int len)
{
	return __page_symlink(inode, symname, len,
4781
			!mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
4782
}
4783
EXPORT_SYMBOL(page_symlink);
4784

4785
const struct inode_operations page_symlink_inode_operations = {
L
Linus Torvalds 已提交
4786
	.readlink	= generic_readlink,
4787
	.get_link	= page_get_link,
L
Linus Torvalds 已提交
4788 4789
};
EXPORT_SYMBOL(page_symlink_inode_operations);