namei.c 121.5 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
L
Linus Torvalds 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
19
#include <linux/export.h>
20
#include <linux/kernel.h>
L
Linus Torvalds 已提交
21 22 23 24
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
R
Robert Love 已提交
25
#include <linux/fsnotify.h>
L
Linus Torvalds 已提交
26 27
#include <linux/personality.h>
#include <linux/security.h>
M
Mimi Zohar 已提交
28
#include <linux/ima.h>
L
Linus Torvalds 已提交
29 30 31
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
32
#include <linux/capability.h>
33
#include <linux/file.h>
34
#include <linux/fcntl.h>
35
#include <linux/device_cgroup.h>
36
#include <linux/fs_struct.h>
37
#include <linux/posix_acl.h>
38
#include <linux/hash.h>
39
#include <linux/bitops.h>
40
#include <linux/init_task.h>
41
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
42

43
#include "internal.h"
44
#include "mount.h"
45

L
Linus Torvalds 已提交
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics also affects mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
124

A
Al Viro 已提交
125
/*
 * Room for an embedded name: PATH_MAX minus the offset of ->iname within
 * struct filename.
 */
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
126

127
struct filename *
128 129
getname_flags(const char __user *filename, int flags, int *empty)
{
A
Al Viro 已提交
130
	struct filename *result;
131
	char *kname;
A
Al Viro 已提交
132
	int len;
133

134 135 136 137
	result = audit_reusename(filename);
	if (result)
		return result;

138
	result = __getname();
139
	if (unlikely(!result))
140 141
		return ERR_PTR(-ENOMEM);

142 143 144 145
	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
A
Al Viro 已提交
146
	kname = (char *)result->iname;
147
	result->name = kname;
148

A
Al Viro 已提交
149
	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
150
	if (unlikely(len < 0)) {
A
Al Viro 已提交
151 152
		__putname(result);
		return ERR_PTR(len);
153
	}
154

155 156 157 158 159 160
	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
A
Al Viro 已提交
161
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
A
Al Viro 已提交
162
		const size_t size = offsetof(struct filename, iname[1]);
163 164
		kname = (char *)result;

A
Al Viro 已提交
165 166 167 168 169 170
		/*
		 * size is chosen that way we to guarantee that
		 * result->iname[0] is within the same object and that
		 * kname can't be equal to result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
A
Al Viro 已提交
171 172 173
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
174 175
		}
		result->name = kname;
A
Al Viro 已提交
176 177 178 179 180 181 182 183 184 185 186
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
187 188
	}

A
Al Viro 已提交
189
	result->refcnt = 1;
190 191 192
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
193
			*empty = 1;
A
Al Viro 已提交
194 195 196 197
		if (!(flags & LOOKUP_EMPTY)) {
			putname(result);
			return ERR_PTR(-ENOENT);
		}
L
Linus Torvalds 已提交
198
	}
199

200
	result->uptr = filename;
201
	result->aname = NULL;
202 203
	audit_getname(result);
	return result;
L
Linus Torvalds 已提交
204 205
}

206 207
/**
 * getname - copy a user-space pathname with no special flags
 * @filename: user-space pointer to the pathname
 *
 * Shorthand for getname_flags() with @flags == 0 and no "empty" out-parameter.
 */
struct filename *
getname(const char __user *filename)
{
	const int no_flags = 0;

	return getname_flags(filename, no_flags, NULL);
}

212 213 214 215
struct filename *
getname_kernel(const char * filename)
{
	struct filename *result;
216
	int len = strlen(filename) + 1;
217 218 219 220 221

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

222
	if (len <= EMBEDDED_NAME_MAX) {
A
Al Viro 已提交
223
		result->name = (char *)result->iname;
224
	} else if (len <= PATH_MAX) {
225
		const size_t size = offsetof(struct filename, iname[1]);
226 227
		struct filename *tmp;

228
		tmp = kmalloc(size, GFP_KERNEL);
229 230 231 232 233 234 235 236 237 238 239
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
240 241
	result->uptr = NULL;
	result->aname = NULL;
242
	result->refcnt = 1;
243
	audit_getname(result);
244 245 246 247

	return result;
}

248
void putname(struct filename *name)
L
Linus Torvalds 已提交
249
{
250 251 252 253 254
	BUG_ON(name->refcnt <= 0);

	if (--name->refcnt > 0)
		return;

A
Al Viro 已提交
255
	if (name->name != name->iname) {
256 257 258 259
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
L
Linus Torvalds 已提交
260 261
}

262 263
static int check_acl(struct inode *inode, int mask)
{
264
#ifdef CONFIG_FS_POSIX_ACL
265 266 267
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
268 269
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
270
	                return -EAGAIN;
271
		/* no ->get_acl() calls in RCU mode... */
272
		if (is_uncached_acl(acl))
273
			return -ECHILD;
A
Ari Savolainen 已提交
274
	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
275 276
	}

C
Christoph Hellwig 已提交
277 278 279
	acl = get_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
280 281 282 283 284
	if (acl) {
	        int error = posix_acl_permission(inode, acl, mask);
	        posix_acl_release(acl);
	        return error;
	}
285
#endif
286 287 288 289

	return -EAGAIN;
}

290
/*
291
 * This does the basic permission checking
L
Linus Torvalds 已提交
292
 */
293
static int acl_permission_check(struct inode *inode, int mask)
L
Linus Torvalds 已提交
294
{
295
	unsigned int mode = inode->i_mode;
L
Linus Torvalds 已提交
296

297
	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
L
Linus Torvalds 已提交
298 299
		mode >>= 6;
	else {
300
		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
301
			int error = check_acl(inode, mask);
302 303
			if (error != -EAGAIN)
				return error;
L
Linus Torvalds 已提交
304 305 306 307 308 309 310 311 312
		}

		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/*
	 * If the DACs are ok we don't need any capability check.
	 */
313
	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
L
Linus Torvalds 已提交
314
		return 0;
315 316 317 318
	return -EACCES;
}

/**
319
 * generic_permission -  check for access rights on a Posix-like filesystem
320
 * @inode:	inode to check access rights for
321
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
322 323 324 325
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
326 327 328 329 330
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
331
 */
332
int generic_permission(struct inode *inode, int mask)
333 334 335 336
{
	int ret;

	/*
337
	 * Do the basic permission checks.
338
	 */
339
	ret = acl_permission_check(inode, mask);
340 341
	if (ret != -EACCES)
		return ret;
L
Linus Torvalds 已提交
342

343 344 345
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (!(mask & MAY_WRITE))
346 347
			if (capable_wrt_inode_uidgid(inode,
						     CAP_DAC_READ_SEARCH))
348
				return 0;
349
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
L
Linus Torvalds 已提交
350
			return 0;
351 352
		return -EACCES;
	}
L
Linus Torvalds 已提交
353 354 355 356

	/*
	 * Searching includes executable on directories, else just read.
	 */
357
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
358
	if (mask == MAY_READ)
359
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
L
Linus Torvalds 已提交
360
			return 0;
361 362 363 364 365 366 367 368
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
			return 0;
L
Linus Torvalds 已提交
369 370 371

	return -EACCES;
}
372
EXPORT_SYMBOL(generic_permission);
L
Linus Torvalds 已提交
373

374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393
/*
 * Call generic_permission() directly whenever possible, without looking at
 * inode->i_op.  IOP_FASTPERM in inode->i_opflags caches the fact that the
 * inode has no ->permission method of its own.
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	if (likely(inode->i_opflags & IOP_FASTPERM))
		return generic_permission(inode, mask);

	if (likely(inode->i_op->permission))
		return inode->i_op->permission(inode, mask);

	/* This gets set once for the inode lifetime */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_FASTPERM;
	spin_unlock(&inode->i_lock);

	return generic_permission(inode, mask);
}

D
David Howells 已提交
394 395 396
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
397
 * @inode: Inode to check permission on
D
David Howells 已提交
398 399 400 401 402 403 404 405 406 407
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	if (unlikely(mask & MAY_WRITE)) {
		umode_t mode = inode->i_mode;

		/* Nobody gets write access to a read-only fs. */
408
		if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
D
David Howells 已提交
409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431
			return -EROFS;
	}
	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct inode *inode, int mask)
{
	int retval;

	retval = sb_permission(inode->i_sb, inode, mask);
	if (retval)
		return retval;
432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457

	if (unlikely(mask & MAY_WRITE)) {
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EPERM;

		/*
		 * Updating mtime will likely cause i_uid and i_gid to be
		 * written back improperly if their true value is unknown
		 * to the vfs.
		 */
		if (HAS_UNMAPPED_ID(inode))
			return -EACCES;
	}

	retval = do_inode_permission(inode, mask);
	if (retval)
		return retval;

	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

	return security_inode_permission(inode, mask);
D
David Howells 已提交
458
}
459
EXPORT_SYMBOL(inode_permission);
D
David Howells 已提交
460

J
Jan Blunck 已提交
461 462 463 464 465 466
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
467
void path_get(const struct path *path)
J
Jan Blunck 已提交
468 469 470 471 472 473
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

J
Jan Blunck 已提交
474 475 476 477 478 479
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
480
void path_put(const struct path *path)
L
Linus Torvalds 已提交
481
{
J
Jan Blunck 已提交
482 483
	dput(path->dentry);
	mntput(path->mnt);
L
Linus Torvalds 已提交
484
}
J
Jan Blunck 已提交
485
EXPORT_SYMBOL(path_put);
L
Linus Torvalds 已提交
486

487
/* Number of struct saved entries embedded in struct nameidata (->internal). */
#define EMBEDDED_LEVELS 2
488 489
struct nameidata {
	struct path	path;
A
Al Viro 已提交
490
	struct qstr	last;
491 492 493
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags;
494
	unsigned	seq, m_seq;
495 496
	int		last_type;
	unsigned	depth;
497
	int		total_link_count;
498 499
	struct saved {
		struct path link;
500
		struct delayed_call done;
501
		const char *name;
502
		unsigned seq;
503
	} *stack, internal[EMBEDDED_LEVELS];
504 505
	struct filename	*name;
	struct nameidata *saved;
506
	struct inode	*link_inode;
507 508
	unsigned	root_seq;
	int		dfd;
509
} __randomize_layout;
510

511
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
512
{
513 514
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;
515 516
	p->dfd = dfd;
	p->name = name;
517
	p->total_link_count = old ? old->total_link_count : 0;
518
	p->saved = old;
519
	current->nameidata = p;
520 521
}

522
static void restore_nameidata(void)
523
{
524
	struct nameidata *now = current->nameidata, *old = now->saved;
525 526 527 528

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
529
	if (now->stack != now->internal)
530
		kfree(now->stack);
531 532 533 534
}

static int __nd_alloc_stack(struct nameidata *nd)
{
A
Al Viro 已提交
535 536 537
	struct saved *p;

	if (nd->flags & LOOKUP_RCU) {
538
		p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
A
Al Viro 已提交
539 540 541 542
				  GFP_ATOMIC);
		if (unlikely(!p))
			return -ECHILD;
	} else {
543
		p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
544
				  GFP_KERNEL);
A
Al Viro 已提交
545 546 547
		if (unlikely(!p))
			return -ENOMEM;
	}
548 549 550 551 552
	memcpy(p, nd->internal, sizeof(nd->internal));
	nd->stack = p;
	return 0;
}

553 554 555 556 557 558 559 560 561 562
/**
 * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
 * @path: nameidate to verify
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 */
static bool path_connected(const struct path *path)
{
	struct vfsmount *mnt = path->mnt;
563
	struct super_block *sb = mnt->mnt_sb;
564

565 566
	/* Bind mounts and multi-root filesystems can have disconnected paths */
	if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
567 568 569 570 571
		return true;

	return is_subdir(path->dentry, mnt->mnt_root);
}

572 573
static inline int nd_alloc_stack(struct nameidata *nd)
{
574
	if (likely(nd->depth != EMBEDDED_LEVELS))
575 576 577 578 579 580
		return 0;
	if (likely(nd->stack != nd->internal))
		return 0;
	return __nd_alloc_stack(nd);
}

581 582 583 584 585
/*
 * Run and then clear the delayed call of every stacked link, from the top
 * of the stack down to entry 0.  nd->depth itself is not modified.
 */
static void drop_links(struct nameidata *nd)
{
	int i;

	for (i = nd->depth; i--; ) {
		struct saved *entry = &nd->stack[i];

		do_delayed_call(&entry->done);
		clear_delayed_call(&entry->done);
	}
}

static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);
	if (!(nd->flags & LOOKUP_RCU)) {
		int i;
		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
599
		if (nd->flags & LOOKUP_ROOT_GRABBED) {
600
			path_put(&nd->root);
601
			nd->flags &= ~LOOKUP_ROOT_GRABBED;
602
		}
603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641
	} else {
		nd->flags &= ~LOOKUP_RCU;
		rcu_read_unlock();
	}
	nd->depth = 0;
}

/* path_put is needed afterwards regardless of success or failure */
static bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
	int res = __legitimize_mnt(path->mnt, nd->m_seq);
	if (unlikely(res)) {
		if (res > 0)
			path->mnt = NULL;
		path->dentry = NULL;
		return false;
	}
	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		path->dentry = NULL;
		return false;
	}
	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

/*
 * Legitimize every stacked link path, bottom up.  On the first failure the
 * delayed calls are dropped and nd->depth is trimmed to include the failing
 * entry, so that later cleanup puts exactly the entries touched so far.
 */
static bool legitimize_links(struct nameidata *nd)
{
	int i = 0;

	while (i < nd->depth) {
		struct saved *entry = &nd->stack[i];

		if (unlikely(!legitimize_path(nd, &entry->link, entry->seq))) {
			drop_links(nd);
			nd->depth = i + 1;
			return false;
		}
		i++;
	}
	return true;
}

642 643 644 645
static bool legitimize_root(struct nameidata *nd)
{
	if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT))
		return true;
646
	nd->flags |= LOOKUP_ROOT_GRABBED;
647 648 649
	return legitimize_path(nd, &nd->root, nd->root_seq);
}

A
Al Viro 已提交
650
/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
A
Al Viro 已提交
662 663
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
664
 * Returns: 0 on success, -ECHILD on failure
N
Nick Piggin 已提交
665
 *
A
Al Viro 已提交
666 667 668
 * unlazy_walk attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.
 * Must be called from rcu-walk context.
669 670
 * Nothing should touch nameidata between unlazy_walk() failure and
 * terminate_walk().
N
Nick Piggin 已提交
671
 */
A
Al Viro 已提交
672
static int unlazy_walk(struct nameidata *nd)
N
Nick Piggin 已提交
673 674 675 676
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
677

A
Al Viro 已提交
678 679 680
	nd->flags &= ~LOOKUP_RCU;
	if (unlikely(!legitimize_links(nd)))
		goto out1;
681 682
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out;
683 684
	if (unlikely(!legitimize_root(nd)))
		goto out;
A
Al Viro 已提交
685 686 687 688
	rcu_read_unlock();
	BUG_ON(nd->inode != parent->d_inode);
	return 0;

689
out1:
A
Al Viro 已提交
690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out:
	rcu_read_unlock();
	return -ECHILD;
}

/**
 * unlazy_child - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry
 * @seq: seq number to check dentry against
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between unlazy_child() failure and
 * terminate_walk().
 */
static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
	BUG_ON(!(nd->flags & LOOKUP_RCU));

714
	nd->flags &= ~LOOKUP_RCU;
715 716 717 718
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
		goto out2;
A
Al Viro 已提交
719
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
720
		goto out1;
A
Al Viro 已提交
721

722
	/*
A
Al Viro 已提交
723 724 725 726 727
	 * We need to move both the parent and the dentry from the RCU domain
	 * to be properly refcounted. And the sequence number in the dentry
	 * validates *both* dentry counters, since we checked the sequence
	 * number of the parent after we got the child sequence number. So we
	 * know the parent must still be valid if the child sequence number is
728
	 */
A
Al Viro 已提交
729 730
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
731 732
	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
		goto out_dput;
733 734 735 736
	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
737 738
	if (unlikely(!legitimize_root(nd)))
		goto out_dput;
A
Al Viro 已提交
739
	rcu_read_unlock();
N
Nick Piggin 已提交
740
	return 0;
A
Al Viro 已提交
741

742 743 744 745
out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
746
out:
A
Al Viro 已提交
747
	rcu_read_unlock();
748 749 750 751
	return -ECHILD;
out_dput:
	rcu_read_unlock();
	dput(dentry);
N
Nick Piggin 已提交
752 753 754
	return -ECHILD;
}

755
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
756
{
757 758 759 760
	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
		return dentry->d_op->d_revalidate(dentry, flags);
	else
		return 1;
761 762
}

763 764 765
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
766
 *
767 768 769 770 771
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
772
 */
773
static int complete_walk(struct nameidata *nd)
774
{
A
Al Viro 已提交
775
	struct dentry *dentry = nd->path.dentry;
776 777
	int status;

778 779 780
	if (nd->flags & LOOKUP_RCU) {
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
A
Al Viro 已提交
781
		if (unlikely(unlazy_walk(nd)))
782 783 784
			return -ECHILD;
	}

A
Al Viro 已提交
785 786 787
	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

788
	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
789 790
		return 0;

791
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
792 793 794
	if (status > 0)
		return 0;

A
Al Viro 已提交
795
	if (!status)
796
		status = -ESTALE;
A
Al Viro 已提交
797

798 799 800
	return status;
}

A
Al Viro 已提交
801
static void set_root(struct nameidata *nd)
N
Nick Piggin 已提交
802
{
803
	struct fs_struct *fs = current->fs;
N
Nick Piggin 已提交
804

805 806 807 808 809 810 811 812 813 814
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
815
		nd->flags |= LOOKUP_ROOT_GRABBED;
816
	}
N
Nick Piggin 已提交
817 818
}

J
Jan Blunck 已提交
819
static void path_put_conditional(struct path *path, struct nameidata *nd)
820 821
{
	dput(path->dentry);
822
	if (path->mnt != nd->path.mnt)
823 824 825
		mntput(path->mnt);
}

826 827
static inline void path_to_nameidata(const struct path *path,
					struct nameidata *nd)
828
{
N
Nick Piggin 已提交
829 830 831 832
	if (!(nd->flags & LOOKUP_RCU)) {
		dput(nd->path.dentry);
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);
833
	}
N
Nick Piggin 已提交
834
	nd->path.mnt = path->mnt;
835
	nd->path.dentry = path->dentry;
836 837
}

838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857
/*
 * Restart the walk at nd->root and mark it LOOKUP_JUMPED.
 *
 * In RCU mode the root dentry's sequence number is rechecked and -ECHILD is
 * returned if it changed; in ref-walk mode the reference on the old path is
 * swapped for one on the root.  Returns 0 on success.
 */
static int nd_jump_root(struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU)) {
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	} else {
		struct dentry *root_dentry;

		nd->path = nd->root;
		root_dentry = nd->path.dentry;
		nd->inode = root_dentry->d_inode;
		nd->seq = nd->root_seq;
		if (unlikely(read_seqcount_retry(&root_dentry->d_seq, nd->seq)))
			return -ECHILD;
	}

	nd->flags |= LOOKUP_JUMPED;
	return 0;
}

C
Christoph Hellwig 已提交
858
/*
859
 * Helper to directly jump to a known parsed path from ->get_link,
C
Christoph Hellwig 已提交
860 861
 * caller must have taken a reference to path beforehand.
 */
862
void nd_jump_link(struct path *path)
C
Christoph Hellwig 已提交
863
{
864
	struct nameidata *nd = current->nameidata;
C
Christoph Hellwig 已提交
865 866 867 868 869 870 871
	path_put(&nd->path);

	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
	nd->flags |= LOOKUP_JUMPED;
}

872
static inline void put_link(struct nameidata *nd)
873
{
A
Al Viro 已提交
874
	struct saved *last = nd->stack + --nd->depth;
875
	do_delayed_call(&last->done);
A
Al Viro 已提交
876 877
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
878 879
}

880 881
/*
 * Link/open protection switches; all start out as 0 (disabled).
 * may_follow_link() reads the symlink one, may_linkat() the hardlink one,
 * may_create_in_sticky() the FIFO and regular-file ones.
 */
int sysctl_protected_symlinks __read_mostly;
int sysctl_protected_hardlinks __read_mostly;
int sysctl_protected_fifos __read_mostly;
int sysctl_protected_regular __read_mostly;
K
Kees Cook 已提交
884 885 886

/**
 * may_follow_link - Check symlink following for unsafe situations
887
 * @nd: nameidata pathwalk data
K
Kees Cook 已提交
888 889 890 891 892 893 894 895 896 897 898 899
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
A
Al Viro 已提交
900
static inline int may_follow_link(struct nameidata *nd)
K
Kees Cook 已提交
901 902 903
{
	const struct inode *inode;
	const struct inode *parent;
904
	kuid_t puid;
K
Kees Cook 已提交
905 906 907 908 909

	if (!sysctl_protected_symlinks)
		return 0;

	/* Allowed if owner and follower match. */
910
	inode = nd->link_inode;
911
	if (uid_eq(current_cred()->fsuid, inode->i_uid))
K
Kees Cook 已提交
912 913 914
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
A
Al Viro 已提交
915
	parent = nd->inode;
K
Kees Cook 已提交
916 917 918 919
	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
		return 0;

	/* Allowed if parent directory and link owner match. */
920 921
	puid = parent->i_uid;
	if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
K
Kees Cook 已提交
922 923
		return 0;

924 925 926
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

927
	audit_inode(nd->name, nd->stack[0].link.dentry, 0);
K
Kees Cook 已提交
928
	audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
K
Kees Cook 已提交
929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974
	return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @inode: the source inode to hardlink from
 *
 * Return false if at least one of the following conditions:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct inode *inode)
{
	const umode_t mode = inode->i_mode;
	const umode_t sgid_exec = S_ISGID | S_IXGRP;

	/* Special files should not get pinned to the filesystem. */
	if (!S_ISREG(mode))
		return false;

	/* Setuid files should not get pinned to the filesystem. */
	if (mode & S_ISUID)
		return false;

	/* Executable setgid files should not get pinned to the filesystem. */
	if ((mode & sgid_exec) == sgid_exec)
		return false;

	/* Hardlinking to unreadable or unwritable sources is dangerous. */
	return !inode_permission(inode, MAY_READ | MAY_WRITE);
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
975
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
K
Kees Cook 已提交
976 977 978 979 980
 *
 * Returns 0 if successful, -ve on error.
 */
static int may_linkat(struct path *link)
{
981 982 983 984 985
	struct inode *inode = link->dentry->d_inode;

	/* Inode writeback is not safe when the uid or gid are invalid. */
	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
		return -EOVERFLOW;
K
Kees Cook 已提交
986 987 988 989 990 991 992

	if (!sysctl_protected_hardlinks)
		return 0;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
993
	if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
K
Kees Cook 已提交
994 995
		return 0;

K
Kees Cook 已提交
996
	audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
K
Kees Cook 已提交
997 998 999
	return -EPERM;
}

1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033
/**
 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
 *			  should be allowed, or not, on files that already
 *			  exist.
 * @dir: the sticky parent directory
 * @inode: the inode of the file to open
 *
 * Block an O_CREAT open of a FIFO (or a regular file) when:
 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
 *   - the file already exists
 *   - we are in a sticky directory
 *   - we don't own the file
 *   - the owner of the directory doesn't own the file
 *   - the directory is world writable
 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
 * the directory doesn't have to be world writable: being group writable will
 * be enough.
 *
 * Returns 0 if the open is allowed, -ve on error.
 */
static int may_create_in_sticky(struct dentry * const dir,
				struct inode * const inode)
{
	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
	    (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
	    likely(!(dir->d_inode->i_mode & S_ISVTX)) ||
	    uid_eq(inode->i_uid, dir->d_inode->i_uid) ||
	    uid_eq(current_fsuid(), inode->i_uid))
		return 0;

	if (likely(dir->d_inode->i_mode & 0002) ||
	    (dir->d_inode->i_mode & 0020 &&
	     ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
	      (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
K
Kees Cook 已提交
1034 1035 1036 1037
		const char *operation = S_ISFIFO(inode->i_mode) ?
					"sticky_create_fifo" :
					"sticky_create_regular";
		audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
1038 1039 1040 1041 1042
		return -EACCES;
	}
	return 0;
}

1043 1044
static __always_inline
const char *get_link(struct nameidata *nd)
L
Linus Torvalds 已提交
1045
{
1046
	struct saved *last = nd->stack + nd->depth - 1;
A
Al Viro 已提交
1047
	struct dentry *dentry = last->link.dentry;
1048
	struct inode *inode = nd->link_inode;
1049
	int error;
1050
	const char *res;
L
Linus Torvalds 已提交
1051

1052 1053 1054
	if (!(nd->flags & LOOKUP_RCU)) {
		touch_atime(&last->link);
		cond_resched();
1055
	} else if (atime_needs_update(&last->link, inode)) {
A
Al Viro 已提交
1056
		if (unlikely(unlazy_walk(nd)))
A
Al Viro 已提交
1057
			return ERR_PTR(-ECHILD);
1058
		touch_atime(&last->link);
A
Al Viro 已提交
1059
	}
1060

1061 1062 1063
	error = security_inode_follow_link(dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
1064
		return ERR_PTR(error);
1065

1066
	nd->last_type = LAST_BIND;
1067
	res = READ_ONCE(inode->i_link);
1068
	if (!res) {
1069 1070 1071
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
1072
		if (nd->flags & LOOKUP_RCU) {
1073
			res = get(NULL, inode, &last->done);
1074
			if (res == ERR_PTR(-ECHILD)) {
A
Al Viro 已提交
1075
				if (unlikely(unlazy_walk(nd)))
1076
					return ERR_PTR(-ECHILD);
1077
				res = get(dentry, inode, &last->done);
1078 1079
			}
		} else {
1080
			res = get(dentry, inode, &last->done);
1081
		}
1082
		if (IS_ERR_OR_NULL(res))
1083 1084 1085
			return res;
	}
	if (*res == '/') {
1086 1087
		if (!nd->root.mnt)
			set_root(nd);
1088 1089
		if (unlikely(nd_jump_root(nd)))
			return ERR_PTR(-ECHILD);
1090 1091
		while (unlikely(*++res == '/'))
			;
L
Linus Torvalds 已提交
1092
	}
1093 1094
	if (!*res)
		res = NULL;
1095 1096
	return res;
}
1097

1098 1099 1100 1101 1102 1103 1104 1105 1106 1107
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
A
Al Viro 已提交
1108
int follow_up(struct path *path)
L
Linus Torvalds 已提交
1109
{
1110 1111
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
L
Linus Torvalds 已提交
1112
	struct dentry *mountpoint;
N
Nick Piggin 已提交
1113

A
Al Viro 已提交
1114
	read_seqlock_excl(&mount_lock);
1115
	parent = mnt->mnt_parent;
A
Al Viro 已提交
1116
	if (parent == mnt) {
A
Al Viro 已提交
1117
		read_sequnlock_excl(&mount_lock);
L
Linus Torvalds 已提交
1118 1119
		return 0;
	}
1120
	mntget(&parent->mnt);
1121
	mountpoint = dget(mnt->mnt_mountpoint);
A
Al Viro 已提交
1122
	read_sequnlock_excl(&mount_lock);
A
Al Viro 已提交
1123 1124 1125
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
1126
	path->mnt = &parent->mnt;
L
Linus Torvalds 已提交
1127 1128
	return 1;
}
1129
EXPORT_SYMBOL(follow_up);
L
Linus Torvalds 已提交
1130

N
Nick Piggin 已提交
1131
/*
1132 1133 1134
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
L
Linus Torvalds 已提交
1135
 */
1136
static int follow_automount(struct path *path, struct nameidata *nd,
1137
			    bool *need_mntput)
N
Nick Piggin 已提交
1138
{
1139
	struct vfsmount *mnt;
1140
	int err;
1141 1142 1143 1144

	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
		return -EREMOTE;

1145 1146 1147 1148 1149 1150 1151 1152 1153 1154
	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
1155
	 */
1156
	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1157 1158 1159
			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
	    path->dentry->d_inode)
		return -EISDIR;
1160

1161 1162
	nd->total_link_count++;
	if (nd->total_link_count >= 40)
1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175
		return -ELOOP;

	mnt = path->dentry->d_op->d_automount(path);
	if (IS_ERR(mnt)) {
		/*
		 * The filesystem is allowed to return -EISDIR here to indicate
		 * it doesn't want to automount.  For instance, autofs would do
		 * this so that its userspace daemon can mount on this dentry.
		 *
		 * However, we can only permit this if it's a terminal point in
		 * the path being looked up; if it wasn't then the remainder of
		 * the path is inaccessible and we should say so.
		 */
1176
		if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
1177 1178
			return -EREMOTE;
		return PTR_ERR(mnt);
N
Nick Piggin 已提交
1179
	}
1180

1181 1182
	if (!mnt) /* mount collision */
		return 0;
N
Nick Piggin 已提交
1183

1184 1185 1186 1187 1188
	if (!*need_mntput) {
		/* lock_mount() may release path->mnt on error */
		mntget(path->mnt);
		*need_mntput = true;
	}
1189
	err = finish_automount(mnt, path);
1190

1191 1192 1193
	switch (err) {
	case -EBUSY:
		/* Someone else made a mount here whilst we were busy */
1194
		return 0;
1195
	case 0:
1196
		path_put(path);
1197 1198 1199
		path->mnt = mnt;
		path->dentry = dget(mnt->mnt_root);
		return 0;
1200 1201
	default:
		return err;
1202
	}
1203

A
Al Viro 已提交
1204 1205
}

1206 1207
/*
 * Handle a dentry that is managed in some way.
1208
 * - Flagged for transit management (autofs)
1209 1210 1211 1212
 * - Flagged as mountpoint
 * - Flagged as automount point
 *
 * This may only be called in refwalk mode.
1213
 * On success path->dentry is known positive.
1214 1215 1216
 *
 * Serialization is taken care of in namespace.c
 */
1217
static int follow_managed(struct path *path, struct nameidata *nd)
L
Linus Torvalds 已提交
1218
{
1219
	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1220
	unsigned flags;
1221
	bool need_mntput = false;
1222
	int ret = 0;
1223 1224 1225 1226

	/* Given that we're not holding a lock here, we retain the value in a
	 * local variable for each dentry as we look at it so that we don't see
	 * the components of that value change under us */
1227
	while (flags = smp_load_acquire(&path->dentry->d_flags),
1228
	       unlikely(flags & DCACHE_MANAGED_DENTRY)) {
1229 1230
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
1231
		if (flags & DCACHE_MANAGE_TRANSIT) {
1232 1233
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
1234
			ret = path->dentry->d_op->d_manage(path, false);
1235
			if (ret < 0)
1236
				break;
1237 1238
		}

1239
		/* Transit to a mounted filesystem. */
1240
		if (flags & DCACHE_MOUNTED) {
1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				need_mntput = true;
				continue;
			}

			/* Something is mounted on this dentry in another
			 * namespace and/or whatever was mounted there in this
A
Al Viro 已提交
1254 1255
			 * namespace got unmounted before lookup_mnt() could
			 * get it */
1256 1257 1258
		}

		/* Handle an automount point */
1259
		if (flags & DCACHE_NEED_AUTOMOUNT) {
1260
			ret = follow_automount(path, nd, &need_mntput);
1261
			if (ret < 0)
1262
				break;
1263 1264 1265 1266 1267
			continue;
		}

		/* We didn't change the current path point */
		break;
L
Linus Torvalds 已提交
1268
	}
1269 1270 1271

	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
1272 1273
	if (need_mntput)
		nd->flags |= LOOKUP_JUMPED;
1274 1275 1276 1277
	if (ret == -EISDIR || !ret)
		ret = 1;
	if (ret > 0 && unlikely(d_flags_negative(flags)))
		ret = -ENOENT;
1278 1279 1280
	if (unlikely(ret < 0))
		path_put_conditional(path, nd);
	return ret;
L
Linus Torvalds 已提交
1281 1282
}

1283
int follow_down_one(struct path *path)
L
Linus Torvalds 已提交
1284 1285 1286
{
	struct vfsmount *mounted;

A
Al Viro 已提交
1287
	mounted = lookup_mnt(path);
L
Linus Torvalds 已提交
1288
	if (mounted) {
A
Al Viro 已提交
1289 1290 1291 1292
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
L
Linus Torvalds 已提交
1293 1294 1295 1296
		return 1;
	}
	return 0;
}
1297
EXPORT_SYMBOL(follow_down_one);
L
Linus Torvalds 已提交
1298

1299
static inline int managed_dentry_rcu(const struct path *path)
1300
{
1301 1302
	return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
		path->dentry->d_op->d_manage(path, true) : 0;
1303 1304
}

1305
/*
1306 1307
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
1308 1309
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1310
			       struct inode **inode, unsigned *seqp)
1311
{
1312
	for (;;) {
1313
		struct mount *mounted;
1314 1315 1316 1317
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
1318
		switch (managed_dentry_rcu(path)) {
1319 1320
		case -ECHILD:
		default:
1321
			return false;
1322 1323 1324 1325 1326
		case -EISDIR:
			return true;
		case 0:
			break;
		}
1327 1328

		if (!d_mountpoint(path->dentry))
1329
			return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1330

A
Al Viro 已提交
1331
		mounted = __lookup_mnt(path->mnt, path->dentry);
1332 1333
		if (!mounted)
			break;
1334 1335
		path->mnt = &mounted->mnt;
		path->dentry = mounted->mnt.mnt_root;
1336
		nd->flags |= LOOKUP_JUMPED;
1337
		*seqp = read_seqcount_begin(&path->dentry->d_seq);
1338 1339 1340 1341 1342 1343
		/*
		 * Update the inode too. We don't need to re-check the
		 * dentry sequence number here after this d_inode read,
		 * because a mount-point is always pinned.
		 */
		*inode = path->dentry->d_inode;
1344
	}
1345
	return !read_seqretry(&mount_lock, nd->m_seq) &&
1346
		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1347 1348
}

N
Nick Piggin 已提交
1349 1350
static int follow_dotdot_rcu(struct nameidata *nd)
{
1351
	struct inode *inode = nd->inode;
N
Nick Piggin 已提交
1352

1353
	while (1) {
1354
		if (path_equal(&nd->path, &nd->root))
N
Nick Piggin 已提交
1355 1356 1357 1358 1359 1360
			break;
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
			struct dentry *old = nd->path.dentry;
			struct dentry *parent = old->d_parent;
			unsigned seq;

1361
			inode = parent->d_inode;
N
Nick Piggin 已提交
1362
			seq = read_seqcount_begin(&parent->d_seq);
1363 1364
			if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
				return -ECHILD;
N
Nick Piggin 已提交
1365 1366
			nd->path.dentry = parent;
			nd->seq = seq;
1367 1368
			if (unlikely(!path_connected(&nd->path)))
				return -ENOENT;
N
Nick Piggin 已提交
1369
			break;
1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384
		} else {
			struct mount *mnt = real_mount(nd->path.mnt);
			struct mount *mparent = mnt->mnt_parent;
			struct dentry *mountpoint = mnt->mnt_mountpoint;
			struct inode *inode2 = mountpoint->d_inode;
			unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
			if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
				return -ECHILD;
			if (&mparent->mnt == nd->path.mnt)
				break;
			/* we know that mountpoint was pinned */
			nd->path.dentry = mountpoint;
			nd->path.mnt = &mparent->mnt;
			inode = inode2;
			nd->seq = seq;
N
Nick Piggin 已提交
1385 1386
		}
	}
1387
	while (unlikely(d_mountpoint(nd->path.dentry))) {
1388 1389
		struct mount *mounted;
		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
1390 1391
		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
			return -ECHILD;
1392 1393 1394 1395
		if (!mounted)
			break;
		nd->path.mnt = &mounted->mnt;
		nd->path.dentry = mounted->mnt.mnt_root;
1396
		inode = nd->path.dentry->d_inode;
1397 1398
		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
	}
1399
	nd->inode = inode;
N
Nick Piggin 已提交
1400 1401 1402
	return 0;
}

1403 1404 1405 1406 1407
/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 */
1408
int follow_down(struct path *path)
1409 1410 1411 1412
{
	unsigned managed;
	int ret;

1413
	while (managed = READ_ONCE(path->dentry->d_flags),
1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427
	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held.
		 *
		 * We indicate to the filesystem if someone is trying to mount
		 * something here.  This gives autofs the chance to deny anyone
		 * other than its daemon the right to mount on its
		 * superstructure.
		 *
		 * The filesystem may sleep at this point.
		 */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
1428
			ret = path->dentry->d_op->d_manage(path, false);
1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449
			if (ret < 0)
				return ret == -EISDIR ? 0 : ret;
		}

		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (!mounted)
				break;
			dput(path->dentry);
			mntput(path->mnt);
			path->mnt = mounted;
			path->dentry = dget(mounted->mnt_root);
			continue;
		}

		/* Don't handle automount points here */
		break;
	}
	return 0;
}
1450
EXPORT_SYMBOL(follow_down);
1451

1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467
/*
 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot().
 * Each step drops the references held through @path and takes new ones
 * on the mount found by lookup_mnt() and on its root dentry.
 */
static void follow_mount(struct path *path)
{
	for (;;) {
		struct vfsmount *covering;

		if (!d_mountpoint(path->dentry))
			return;
		covering = lookup_mnt(path);
		if (!covering)
			return;

		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = covering;
		path->dentry = dget(covering->mnt_root);
	}
}

1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478
/*
 * Replace path->dentry with its parent, moving the dentry reference
 * along.  Returns 0, or -ENOENT when the resulting path fails
 * path_connected().
 */
static int path_parent_directory(struct path *path)
{
	struct dentry *child = path->dentry;

	/* rare case of legitimate dget_parent()... */
	path->dentry = dget_parent(child);
	dput(child);

	if (likely(path_connected(path)))
		return 0;
	return -ENOENT;
}

1479
static int follow_dotdot(struct nameidata *nd)
L
Linus Torvalds 已提交
1480 1481
{
	while(1) {
1482
		if (path_equal(&nd->path, &nd->root))
L
Linus Torvalds 已提交
1483
			break;
1484
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
1485 1486 1487
			int ret = path_parent_directory(&nd->path);
			if (ret)
				return ret;
L
Linus Torvalds 已提交
1488 1489
			break;
		}
A
Al Viro 已提交
1490
		if (!follow_up(&nd->path))
L
Linus Torvalds 已提交
1491 1492
			break;
	}
A
Al Viro 已提交
1493
	follow_mount(&nd->path);
N
Nick Piggin 已提交
1494
	nd->inode = nd->path.dentry->d_inode;
1495
	return 0;
L
Linus Torvalds 已提交
1496 1497
}

1498
/*
1499 1500
 * This looks up the name in dcache and possibly revalidates the found dentry.
 * NULL is returned if the dentry does not exist in the cache.
1501
 */
1502 1503
static struct dentry *lookup_dcache(const struct qstr *name,
				    struct dentry *dir,
1504
				    unsigned int flags)
1505
{
1506
	struct dentry *dentry = d_lookup(dir, name);
M
Miklos Szeredi 已提交
1507
	if (dentry) {
1508 1509 1510 1511 1512 1513
		int error = d_revalidate(dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error)
				d_invalidate(dentry);
			dput(dentry);
			return ERR_PTR(error);
M
Miklos Szeredi 已提交
1514 1515
		}
	}
1516 1517 1518
	return dentry;
}

1519
/*
1520 1521 1522 1523 1524
 * Parent directory has inode locked exclusive.  This is one
 * and only case when ->lookup() gets called on non in-lookup
 * dentries - as the matter of fact, this only gets called
 * when directory is guaranteed to have no in-lookup children
 * at all.
1525
 */
1526
static struct dentry *__lookup_hash(const struct qstr *name,
1527
		struct dentry *base, unsigned int flags)
1528
{
1529
	struct dentry *dentry = lookup_dcache(name, base, flags);
1530 1531
	struct dentry *old;
	struct inode *dir = base->d_inode;
1532

1533
	if (dentry)
M
Miklos Szeredi 已提交
1534
		return dentry;
1535

1536 1537 1538 1539
	/* Don't create child dentry for a dead directory. */
	if (unlikely(IS_DEADDIR(dir)))
		return ERR_PTR(-ENOENT);

1540 1541 1542 1543
	dentry = d_alloc(base, name);
	if (unlikely(!dentry))
		return ERR_PTR(-ENOMEM);

1544 1545 1546 1547 1548 1549
	old = dir->i_op->lookup(dir, dentry, flags);
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
	return dentry;
1550 1551
}

A
Al Viro 已提交
1552
static int lookup_fast(struct nameidata *nd,
1553 1554
		       struct path *path, struct inode **inode,
		       unsigned *seqp)
L
Linus Torvalds 已提交
1555
{
1556
	struct vfsmount *mnt = nd->path.mnt;
N
Nick Piggin 已提交
1557
	struct dentry *dentry, *parent = nd->path.dentry;
A
Al Viro 已提交
1558
	int status = 1;
1559 1560
	int err;

1561 1562
	/*
	 * Rename seqlock is not required here because in the off chance
A
Al Viro 已提交
1563 1564
	 * of a false negative due to a concurrent rename, the caller is
	 * going to fall back to non-racy lookup.
1565
	 */
N
Nick Piggin 已提交
1566 1567
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
1568
		bool negative;
1569
		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
A
Al Viro 已提交
1570
		if (unlikely(!dentry)) {
A
Al Viro 已提交
1571
			if (unlazy_walk(nd))
A
Al Viro 已提交
1572
				return -ECHILD;
1573
			return 0;
A
Al Viro 已提交
1574
		}
A
Al Viro 已提交
1575

1576 1577 1578 1579
		/*
		 * This sequence count validates that the inode matches
		 * the dentry name information from lookup.
		 */
1580
		*inode = d_backing_inode(dentry);
1581
		negative = d_is_negative(dentry);
A
Al Viro 已提交
1582
		if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
1583 1584 1585 1586 1587 1588 1589 1590 1591
			return -ECHILD;

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 *
		 * The memory barrier in read_seqcount_begin of child is
		 *  enough, we can use __read_seqcount_retry here.
		 */
A
Al Viro 已提交
1592
		if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
N
Nick Piggin 已提交
1593
			return -ECHILD;
A
Al Viro 已提交
1594

1595
		*seqp = seq;
1596
		status = d_revalidate(dentry, nd->flags);
1597
		if (likely(status > 0)) {
A
Al Viro 已提交
1598 1599 1600 1601 1602 1603 1604 1605 1606
			/*
			 * Note: do negative dentry check after revalidation in
			 * case that drops it.
			 */
			if (unlikely(negative))
				return -ENOENT;
			path->mnt = mnt;
			path->dentry = dentry;
			if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
1607
				return 1;
1608
		}
A
Al Viro 已提交
1609
		if (unlazy_child(nd, dentry, seq))
1610 1611 1612 1613
			return -ECHILD;
		if (unlikely(status == -ECHILD))
			/* we'd been told to redo it in non-rcu mode */
			status = d_revalidate(dentry, nd->flags);
A
Al Viro 已提交
1614
	} else {
A
Al Viro 已提交
1615
		dentry = __d_lookup(parent, &nd->last);
A
Al Viro 已提交
1616
		if (unlikely(!dentry))
1617
			return 0;
1618
		status = d_revalidate(dentry, nd->flags);
1619
	}
A
Al Viro 已提交
1620
	if (unlikely(status <= 0)) {
1621
		if (!status)
A
Al Viro 已提交
1622
			d_invalidate(dentry);
1623
		dput(dentry);
A
Al Viro 已提交
1624
		return status;
1625
	}
A
Al Viro 已提交
1626

1627 1628
	path->mnt = mnt;
	path->dentry = dentry;
1629
	err = follow_managed(path, nd);
1630
	if (likely(err > 0))
1631
		*inode = d_backing_inode(path->dentry);
1632
	return err;
M
Miklos Szeredi 已提交
1633 1634 1635
}

/* Fast lookup failed, do it the slow way */
A
Al Viro 已提交
1636 1637 1638
static struct dentry *__lookup_slow(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
M
Miklos Szeredi 已提交
1639
{
A
Al Viro 已提交
1640
	struct dentry *dentry, *old;
1641
	struct inode *inode = dir->d_inode;
1642
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1643 1644

	/* Don't go there if it's already dead */
A
Al Viro 已提交
1645
	if (unlikely(IS_DEADDIR(inode)))
A
Al Viro 已提交
1646
		return ERR_PTR(-ENOENT);
A
Al Viro 已提交
1647
again:
1648
	dentry = d_alloc_parallel(dir, name, &wq);
A
Al Viro 已提交
1649
	if (IS_ERR(dentry))
A
Al Viro 已提交
1650
		return dentry;
A
Al Viro 已提交
1651
	if (unlikely(!d_in_lookup(dentry))) {
1652 1653 1654 1655
		int error = d_revalidate(dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error) {
				d_invalidate(dentry);
1656
				dput(dentry);
1657
				goto again;
1658
			}
1659 1660
			dput(dentry);
			dentry = ERR_PTR(error);
1661
		}
A
Al Viro 已提交
1662 1663 1664 1665 1666 1667
	} else {
		old = inode->i_op->lookup(inode, dentry, flags);
		d_lookup_done(dentry);
		if (unlikely(old)) {
			dput(dentry);
			dentry = old;
1668 1669
		}
	}
1670
	return dentry;
L
Linus Torvalds 已提交
1671 1672
}

A
Al Viro 已提交
1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684
/*
 * Run __lookup_slow() with the directory's inode lock held shared.
 * Returns whatever __lookup_slow() returns.
 */
static struct dentry *lookup_slow(const struct qstr *name,
				  struct dentry *dir,
				  unsigned int flags)
{
	struct inode *parent = dir->d_inode;
	struct dentry *found;

	inode_lock_shared(parent);
	found = __lookup_slow(name, dir, flags);
	inode_unlock_shared(parent);

	return found;
}

1685 1686 1687
static inline int may_lookup(struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU) {
1688
		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1689 1690
		if (err != -ECHILD)
			return err;
A
Al Viro 已提交
1691
		if (unlazy_walk(nd))
1692 1693
			return -ECHILD;
	}
1694
	return inode_permission(nd->inode, MAY_EXEC);
1695 1696
}

1697 1698 1699
static inline int handle_dots(struct nameidata *nd, int type)
{
	if (type == LAST_DOTDOT) {
1700 1701
		if (!nd->root.mnt)
			set_root(nd);
1702
		if (nd->flags & LOOKUP_RCU) {
1703
			return follow_dotdot_rcu(nd);
1704
		} else
1705
			return follow_dotdot(nd);
1706 1707 1708 1709
	}
	return 0;
}

1710 1711
static int pick_link(struct nameidata *nd, struct path *link,
		     struct inode *inode, unsigned seq)
1712
{
1713
	int error;
A
Al Viro 已提交
1714
	struct saved *last;
1715
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
1716 1717 1718
		path_to_nameidata(link, nd);
		return -ELOOP;
	}
A
Al Viro 已提交
1719
	if (!(nd->flags & LOOKUP_RCU)) {
1720 1721
		if (link->mnt == nd->path.mnt)
			mntget(link->mnt);
1722
	}
1723 1724
	error = nd_alloc_stack(nd);
	if (unlikely(error)) {
A
Al Viro 已提交
1725
		if (error == -ECHILD) {
1726 1727 1728 1729 1730 1731 1732
			if (unlikely(!legitimize_path(nd, link, seq))) {
				drop_links(nd);
				nd->depth = 0;
				nd->flags &= ~LOOKUP_RCU;
				nd->path.mnt = NULL;
				nd->path.dentry = NULL;
				rcu_read_unlock();
A
Al Viro 已提交
1733
			} else if (likely(unlazy_walk(nd)) == 0)
1734
				error = nd_alloc_stack(nd);
A
Al Viro 已提交
1735 1736 1737 1738 1739
		}
		if (error) {
			path_put(link);
			return error;
		}
1740 1741
	}

1742
	last = nd->stack + nd->depth++;
A
Al Viro 已提交
1743
	last->link = *link;
1744 1745
	clear_delayed_call(&last->done);
	nd->link_inode = inode;
1746
	last->seq = seq;
1747 1748 1749
	return 1;
}

1750
/*
 * Flag bits for the 'flags' argument of step_into()/walk_component():
 *  WALK_FOLLOW - follow a symlink at this step even if LOOKUP_FOLLOW is
 *		  not set in nd->flags
 *  WALK_MORE   - do not call put_link() on the link currently being walked
 */
enum {WALK_FOLLOW = 1, WALK_MORE = 2};
1751

1752 1753 1754 1755 1756 1757
/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 */
1758 1759
static inline int step_into(struct nameidata *nd, struct path *path,
			    int flags, struct inode *inode, unsigned seq)
1760
{
1761 1762
	if (!(flags & WALK_MORE) && nd->depth)
		put_link(nd);
1763 1764 1765 1766 1767 1768
	if (likely(!d_is_symlink(path->dentry)) ||
	   !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) {
		/* not a symlink or should not follow */
		path_to_nameidata(path, nd);
		nd->inode = inode;
		nd->seq = seq;
1769
		return 0;
1770
	}
1771 1772
	/* make sure that d_is_symlink above matches inode */
	if (nd->flags & LOOKUP_RCU) {
1773
		if (read_seqcount_retry(&path->dentry->d_seq, seq))
1774 1775
			return -ECHILD;
	}
1776
	return pick_link(nd, path, inode, seq);
1777 1778
}

1779
static int walk_component(struct nameidata *nd, int flags)
1780
{
A
Al Viro 已提交
1781
	struct path path;
1782
	struct inode *inode;
1783
	unsigned seq;
1784 1785 1786 1787 1788 1789
	int err;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
1790 1791
	if (unlikely(nd->last_type != LAST_NORM)) {
		err = handle_dots(nd, nd->last_type);
A
Al Viro 已提交
1792
		if (!(flags & WALK_MORE) && nd->depth)
1793 1794 1795
			put_link(nd);
		return err;
	}
1796
	err = lookup_fast(nd, &path, &inode, &seq);
1797
	if (unlikely(err <= 0)) {
M
Miklos Szeredi 已提交
1798
		if (err < 0)
1799
			return err;
1800 1801 1802 1803
		path.dentry = lookup_slow(&nd->last, nd->path.dentry,
					  nd->flags);
		if (IS_ERR(path.dentry))
			return PTR_ERR(path.dentry);
1804

1805 1806 1807
		path.mnt = nd->path.mnt;
		err = follow_managed(&path, nd);
		if (unlikely(err < 0))
1808
			return err;
M
Miklos Szeredi 已提交
1809

1810
		seq = 0;	/* we are already out of RCU mode */
1811
		inode = d_backing_inode(path.dentry);
1812
	}
M
Miklos Szeredi 已提交
1813

1814
	return step_into(nd, &path, flags, inode, seq);
1815 1816
}

1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

1836
#include <asm/word-at-a-time.h>
1837

1838
#ifdef HASH_MIX
1839

1840
/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
1841

1842
#elif defined(CONFIG_64BIT)
1843
/*
1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
1871
 */
1872 1873 1874 1875 1876
/*
 * HASH_MIX(x, y, a) - fold the word @a into the two-word state (@x, @y).
 * @x and @y are assigned to and expanded several times without extra
 * parentheses, so pass plain side-effect-free lvalues.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)
1877

1878
/*
1879 1880 1881
 * Fold two longs into one 32-bit hash value.  This must be fast, but
 * latency isn't quite as critical, as there is a fair bit of additional
 * work done before the hash value is used.
1882
 */
1883
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
1884
{
1885 1886 1887
	y ^= x * GOLDEN_RATIO_64;
	y *= GOLDEN_RATIO_64;
	return y >> 32;
1888 1889
}

1890 1891
#else	/* 32-bit case */

1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906
/*
 * HASH_MIX(x, y, a) - fold the word @a into the two-word state (@x, @y),
 * 32-bit variant.  @x and @y are assigned to and expanded several times
 * without extra parentheses, so pass plain side-effect-free lvalues.
 *
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)
1907

1908
/*
 * Fold the two state words into one 32-bit hash value by hashing @x,
 * mixing the result into @y and hashing once more.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	unsigned long mixed = y ^ __hash_32(x);

	return __hash_32(mixed);
}

1914 1915
#endif

1916 1917 1918 1919 1920 1921 1922
/*
 * Return the hash of a string of known length.  This is carfully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 */
1923
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	/* x, y: mixing state; y is seeded with the salt pointer value. */
	unsigned long a, x = 0, y = (unsigned long)salt;

	for (;;) {
		/* Nothing left at all: skip the partial-word step. */
		if (!len)
			goto done;
		a = load_unaligned_zeropad(name);
		/* Less than a full word remains; handle it after the loop. */
		if (len < sizeof(unsigned long))
			break;
		HASH_MIX(x, y, a);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}
	/* Fold in the trailing partial word, masked down to len bytes. */
	x ^= a & bytemask_from_count(len);
done:
	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);

1943
/* Return the "hash_len" (hash and length) of a null-terminated string */
1944
u64 hashlen_string(const void *salt, const char *name)
1945
{
1946 1947
	unsigned long a = 0, x = 0, y = (unsigned long)salt;
	unsigned long adata, mask, len;
1948 1949
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

1950 1951 1952
	len = 0;
	goto inside;

1953
	do {
1954
		HASH_MIX(x, y, a);
1955
		len += sizeof(unsigned long);
1956
inside:
1957 1958 1959 1960 1961
		a = load_unaligned_zeropad(name+len);
	} while (!has_zero(a, &adata, &constants));

	adata = prep_zero_mask(a, adata, &constants);
	mask = create_zero_mask(adata);
1962
	x ^= a & zero_bytemask(mask);
1963

1964
	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
1965 1966 1967
}
EXPORT_SYMBOL(hashlen_string);

1968 1969
/*
 * Calculate the length and hash of the path component, and
1970
 * return the "hash_len" as the result.
1971
 */
1972
static inline u64 hash_name(const void *salt, const char *name)
1973
{
1974 1975
	unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
	unsigned long adata, bdata, mask, len;
1976
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
1977

1978 1979 1980
	len = 0;
	goto inside;

1981
	do {
1982
		HASH_MIX(x, y, a);
1983
		len += sizeof(unsigned long);
1984
inside:
1985
		a = load_unaligned_zeropad(name+len);
1986 1987 1988 1989 1990 1991
		b = a ^ REPEAT_BYTE('/');
	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

	adata = prep_zero_mask(a, adata, &constants);
	bdata = prep_zero_mask(b, bdata, &constants);
	mask = create_zero_mask(adata | bdata);
1992
	x ^= a & zero_bytemask(mask);
1993

1994
	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
1995 1996
}

1997
#else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
1998

1999
/* Return the hash of a string of known length */
2000
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);

	/* Feed exactly len bytes, one at a time, as unsigned values. */
	while (len--)
		hash = partial_name_hash((unsigned char)*name++, hash);
	return end_name_hash(hash);
}
2007
EXPORT_SYMBOL(full_name_hash);
L
Linus Torvalds 已提交
2008

2009
/* Return the "hash_len" (hash and length) of a null-terminated string */
2010
u64 hashlen_string(const void *salt, const char *name)
{
	unsigned long hash = init_name_hash(salt);
	unsigned long len = 0, c;

	/* Hash every byte up to, but not including, the terminating NUL. */
	c = (unsigned char)*name;
	while (c) {
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
	}
	return hashlen_create(end_name_hash(hash), len);
}
2023
EXPORT_SYMBOL(hashlen_string);
2024

2025 2026 2027 2028
/*
 * We know there's a real path component here of at least
 * one character.
 */
2029
static inline u64 hash_name(const void *salt, const char *name)
{
	unsigned long hash = init_name_hash(salt);
	unsigned long len = 0, c;

	/*
	 * do/while: the first byte is hashed unconditionally, the loop then
	 * stops at the first NUL or '/' that follows.
	 */
	c = (unsigned char)*name;
	do {
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
	} while (c && c != '/');
	return hashlen_create(end_name_hash(hash), len);
}

2043 2044
#endif

L
Linus Torvalds 已提交
2045 2046
/*
 * Name resolution.
2047 2048
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
L
Linus Torvalds 已提交
2049
 *
2050 2051
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
L
Linus Torvalds 已提交
2052
 */
2053
static int link_path_walk(const char *name, struct nameidata *nd)
L
Linus Torvalds 已提交
2054 2055
{
	int err;
A
Al Viro 已提交
2056

2057 2058
	if (IS_ERR(name))
		return PTR_ERR(name);
L
Linus Torvalds 已提交
2059 2060 2061
	while (*name=='/')
		name++;
	if (!*name)
2062
		return 0;
L
Linus Torvalds 已提交
2063 2064 2065

	/* At this point we know we have a real path component. */
	for(;;) {
2066
		u64 hash_len;
A
Al Viro 已提交
2067
		int type;
L
Linus Torvalds 已提交
2068

2069
		err = may_lookup(nd);
2070
		if (err)
2071
			return err;
L
Linus Torvalds 已提交
2072

2073
		hash_len = hash_name(nd->path.dentry, name);
L
Linus Torvalds 已提交
2074

A
Al Viro 已提交
2075
		type = LAST_NORM;
2076
		if (name[0] == '.') switch (hashlen_len(hash_len)) {
A
Al Viro 已提交
2077
			case 2:
2078
				if (name[1] == '.') {
A
Al Viro 已提交
2079
					type = LAST_DOTDOT;
A
Al Viro 已提交
2080 2081
					nd->flags |= LOOKUP_JUMPED;
				}
A
Al Viro 已提交
2082 2083 2084 2085
				break;
			case 1:
				type = LAST_DOT;
		}
2086 2087
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
A
Al Viro 已提交
2088
			nd->flags &= ~LOOKUP_JUMPED;
2089
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
2090
				struct qstr this = { { .hash_len = hash_len }, .name = name };
2091
				err = parent->d_op->d_hash(parent, &this);
2092
				if (err < 0)
2093
					return err;
2094 2095
				hash_len = this.hash_len;
				name = this.name;
2096 2097
			}
		}
A
Al Viro 已提交
2098

2099 2100
		nd->last.hash_len = hash_len;
		nd->last.name = name;
2101 2102
		nd->last_type = type;

2103 2104
		name += hashlen_len(hash_len);
		if (!*name)
2105
			goto OK;
2106 2107 2108 2109 2110
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
2111 2112
			name++;
		} while (unlikely(*name == '/'));
2113 2114
		if (unlikely(!*name)) {
OK:
2115
			/* pathname body, done */
2116 2117 2118
			if (!nd->depth)
				return 0;
			name = nd->stack[nd->depth - 1].name;
2119
			/* trailing symlink, done */
2120 2121 2122
			if (!name)
				return 0;
			/* last component of nested symlink */
2123
			err = walk_component(nd, WALK_FOLLOW);
A
Al Viro 已提交
2124 2125
		} else {
			/* not the last component */
2126
			err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
2127
		}
2128
		if (err < 0)
2129
			return err;
L
Linus Torvalds 已提交
2130

2131
		if (err) {
2132
			const char *s = get_link(nd);
2133

2134
			if (IS_ERR(s))
2135
				return PTR_ERR(s);
2136 2137 2138
			err = 0;
			if (unlikely(!s)) {
				/* jumped */
2139
				put_link(nd);
2140
			} else {
2141 2142 2143
				nd->stack[nd->depth - 1].name = name;
				name = s;
				continue;
2144
			}
N
Nick Piggin 已提交
2145
		}
2146 2147
		if (unlikely(!d_can_lookup(nd->path.dentry))) {
			if (nd->flags & LOOKUP_RCU) {
A
Al Viro 已提交
2148
				if (unlazy_walk(nd))
2149 2150
					return -ECHILD;
			}
2151
			return -ENOTDIR;
2152
		}
L
Linus Torvalds 已提交
2153 2154 2155
	}
}

2156
/* must be paired with terminate_walk() */
2157
static const char *path_init(struct nameidata *nd, unsigned flags)
N
Nick Piggin 已提交
2158
{
2159
	const char *s = nd->name->name;
N
Nick Piggin 已提交
2160

2161 2162
	if (!*s)
		flags &= ~LOOKUP_RCU;
2163 2164
	if (flags & LOOKUP_RCU)
		rcu_read_lock();
2165

N
Nick Piggin 已提交
2166
	nd->last_type = LAST_ROOT; /* if there are only slashes... */
2167
	nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
N
Nick Piggin 已提交
2168
	nd->depth = 0;
2169
	if (flags & LOOKUP_ROOT) {
2170 2171
		struct dentry *root = nd->root.dentry;
		struct inode *inode = root->d_inode;
2172 2173
		if (*s && unlikely(!d_can_lookup(root)))
			return ERR_PTR(-ENOTDIR);
2174 2175 2176 2177
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2178
			nd->root_seq = nd->seq;
A
Al Viro 已提交
2179
			nd->m_seq = read_seqbegin(&mount_lock);
2180 2181 2182
		} else {
			path_get(&nd->path);
		}
2183
		return s;
2184 2185
	}

N
Nick Piggin 已提交
2186
	nd->root.mnt = NULL;
2187 2188
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
N
Nick Piggin 已提交
2189

A
Al Viro 已提交
2190
	nd->m_seq = read_seqbegin(&mount_lock);
A
Al Viro 已提交
2191
	if (*s == '/') {
2192
		set_root(nd);
2193
		if (likely(!nd_jump_root(nd)))
2194 2195
			return s;
		return ERR_PTR(-ECHILD);
2196
	} else if (nd->dfd == AT_FDCWD) {
A
Al Viro 已提交
2197 2198 2199
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;
N
Nick Piggin 已提交
2200

A
Al Viro 已提交
2201 2202 2203
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
2204
				nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2205 2206 2207 2208
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
2209
			nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2210
		}
2211
		return s;
N
Nick Piggin 已提交
2212
	} else {
2213
		/* Caller must check execute permissions on the starting path component */
2214
		struct fd f = fdget_raw(nd->dfd);
N
Nick Piggin 已提交
2215 2216
		struct dentry *dentry;

2217
		if (!f.file)
2218
			return ERR_PTR(-EBADF);
N
Nick Piggin 已提交
2219

2220
		dentry = f.file->f_path.dentry;
N
Nick Piggin 已提交
2221

2222 2223 2224
		if (*s && unlikely(!d_can_lookup(dentry))) {
			fdput(f);
			return ERR_PTR(-ENOTDIR);
A
Al Viro 已提交
2225
		}
N
Nick Piggin 已提交
2226

2227
		nd->path = f.file->f_path;
A
Al Viro 已提交
2228
		if (flags & LOOKUP_RCU) {
A
Al Viro 已提交
2229 2230
			nd->inode = nd->path.dentry->d_inode;
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
A
Al Viro 已提交
2231
		} else {
2232
			path_get(&nd->path);
A
Al Viro 已提交
2233
			nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2234
		}
A
Al Viro 已提交
2235
		fdput(f);
2236
		return s;
N
Nick Piggin 已提交
2237
	}
2238 2239
}

2240
static const char *trailing_symlink(struct nameidata *nd)
2241 2242
{
	const char *s;
A
Al Viro 已提交
2243
	int error = may_follow_link(nd);
2244
	if (unlikely(error))
2245
		return ERR_PTR(error);
2246
	nd->flags |= LOOKUP_PARENT;
2247
	nd->stack[0].name = NULL;
2248
	s = get_link(nd);
2249
	return s ? s : "";
2250 2251
}

A
Al Viro 已提交
2252
static inline int lookup_last(struct nameidata *nd)
2253 2254 2255 2256 2257
{
	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

	nd->flags &= ~LOOKUP_PARENT;
A
Al Viro 已提交
2258
	return walk_component(nd, 0);
2259 2260
}

2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289
static int handle_lookup_down(struct nameidata *nd)
{
	struct path path = nd->path;
	struct inode *inode = nd->inode;
	unsigned seq = nd->seq;
	int err;

	if (nd->flags & LOOKUP_RCU) {
		/*
		 * don't bother with unlazy_walk on failure - we are
		 * at the very beginning of walk, so we lose nothing
		 * if we simply redo everything in non-RCU mode
		 */
		if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq)))
			return -ECHILD;
	} else {
		dget(path.dentry);
		err = follow_managed(&path, nd);
		if (unlikely(err < 0))
			return err;
		inode = d_backing_inode(path.dentry);
		seq = 0;
	}
	path_to_nameidata(&path, nd);
	nd->inode = inode;
	nd->seq = seq;
	return 0;
}

2290
/* Returns 0 and nd will be valid on success; returns an error otherwise. */
2291
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
2292
{
2293
	const char *s = path_init(nd, flags);
2294
	int err;
N
Nick Piggin 已提交
2295

2296
	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
2297
		err = handle_lookup_down(nd);
2298 2299
		if (unlikely(err < 0))
			s = ERR_PTR(err);
2300 2301
	}

2302 2303 2304
	while (!(err = link_path_walk(s, nd))
		&& ((err = lookup_last(nd)) > 0)) {
		s = trailing_symlink(nd);
2305
	}
2306 2307
	if (!err)
		err = complete_walk(nd);
2308

2309 2310
	if (!err && nd->flags & LOOKUP_DIRECTORY)
		if (!d_can_lookup(nd->path.dentry))
A
Al Viro 已提交
2311
			err = -ENOTDIR;
2312 2313 2314 2315 2316 2317
	if (!err) {
		*path = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2318
	return err;
A
Al Viro 已提交
2319
}
N
Nick Piggin 已提交
2320

2321 2322
int filename_lookup(int dfd, struct filename *name, unsigned flags,
		    struct path *path, struct path *root)
A
Al Viro 已提交
2323
{
2324
	int retval;
2325
	struct nameidata nd;
2326 2327
	if (IS_ERR(name))
		return PTR_ERR(name);
2328 2329 2330 2331
	if (unlikely(root)) {
		nd.root = *root;
		flags |= LOOKUP_ROOT;
	}
2332
	set_nameidata(&nd, dfd, name);
2333
	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
A
Al Viro 已提交
2334
	if (unlikely(retval == -ECHILD))
2335
		retval = path_lookupat(&nd, flags, path);
A
Al Viro 已提交
2336
	if (unlikely(retval == -ESTALE))
2337
		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
N
Nick Piggin 已提交
2338

2339
	if (likely(!retval))
2340
		audit_inode(name, path->dentry, 0);
2341
	restore_nameidata();
2342
	putname(name);
2343
	return retval;
L
Linus Torvalds 已提交
2344 2345
}

2346
/* Returns 0 and nd will be valid on success; returns an error otherwise. */
2347
static int path_parentat(struct nameidata *nd, unsigned flags,
2348
				struct path *parent)
2349
{
2350
	const char *s = path_init(nd, flags);
2351
	int err = link_path_walk(s, nd);
2352 2353
	if (!err)
		err = complete_walk(nd);
2354 2355 2356 2357 2358 2359
	if (!err) {
		*parent = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2360 2361 2362
	return err;
}

2363
static struct filename *filename_parentat(int dfd, struct filename *name,
2364 2365
				unsigned int flags, struct path *parent,
				struct qstr *last, int *type)
2366 2367
{
	int retval;
2368
	struct nameidata nd;
2369

2370 2371
	if (IS_ERR(name))
		return name;
2372
	set_nameidata(&nd, dfd, name);
2373
	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2374
	if (unlikely(retval == -ECHILD))
2375
		retval = path_parentat(&nd, flags, parent);
2376
	if (unlikely(retval == -ESTALE))
2377
		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2378 2379 2380
	if (likely(!retval)) {
		*last = nd.last;
		*type = nd.last_type;
2381
		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
2382 2383 2384
	} else {
		putname(name);
		name = ERR_PTR(retval);
2385
	}
2386
	restore_nameidata();
2387
	return name;
2388 2389
}

A
Al Viro 已提交
2390 2391
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
2392
{
2393 2394
	struct filename *filename;
	struct dentry *d;
2395 2396
	struct qstr last;
	int type;
2397

2398 2399
	filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
				    &last, &type);
2400 2401
	if (IS_ERR(filename))
		return ERR_CAST(filename);
2402
	if (unlikely(type != LAST_NORM)) {
2403
		path_put(path);
2404 2405
		putname(filename);
		return ERR_PTR(-EINVAL);
A
Al Viro 已提交
2406
	}
A
Al Viro 已提交
2407
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
2408
	d = __lookup_hash(&last, path->dentry, 0);
A
Al Viro 已提交
2409
	if (IS_ERR(d)) {
A
Al Viro 已提交
2410
		inode_unlock(path->dentry->d_inode);
2411
		path_put(path);
A
Al Viro 已提交
2412
	}
2413
	putname(filename);
A
Al Viro 已提交
2414
	return d;
2415 2416
}

A
Al Viro 已提交
2417 2418
int kern_path(const char *name, unsigned int flags, struct path *path)
{
2419 2420
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags, path, NULL);
A
Al Viro 已提交
2421
}
2422
EXPORT_SYMBOL(kern_path);
A
Al Viro 已提交
2423

2424 2425 2426 2427 2428 2429
/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
2430
 * @path: pointer to struct path to fill
2431 2432 2433
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
		    const char *name, unsigned int flags,
2434
		    struct path *path)
2435
{
2436 2437
	struct path root = {.mnt = mnt, .dentry = dentry};
	/* the first argument of filename_lookup() is ignored with root */
2438 2439
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags , path, &root);
2440
}
2441
EXPORT_SYMBOL(vfs_path_lookup);
2442

2443 2444
static int lookup_one_len_common(const char *name, struct dentry *base,
				 int len, struct qstr *this)
2445
{
2446 2447 2448
	this->name = name;
	this->len = len;
	this->hash = full_name_hash(base, name, len);
A
Al Viro 已提交
2449
	if (!len)
2450
		return -EACCES;
A
Al Viro 已提交
2451

A
Al Viro 已提交
2452 2453
	if (unlikely(name[0] == '.')) {
		if (len < 2 || (len == 2 && name[1] == '.'))
2454
			return -EACCES;
A
Al Viro 已提交
2455 2456
	}

A
Al Viro 已提交
2457
	while (len--) {
2458
		unsigned int c = *(const unsigned char *)name++;
A
Al Viro 已提交
2459
		if (c == '/' || c == '\0')
2460
			return -EACCES;
A
Al Viro 已提交
2461
	}
2462 2463 2464 2465 2466
	/*
	 * See if the low-level filesystem might want
	 * to use its own hash..
	 */
	if (base->d_flags & DCACHE_OP_HASH) {
2467
		int err = base->d_op->d_hash(base, this);
2468
		if (err < 0)
2469
			return err;
2470
	}
2471

2472 2473 2474
	return inode_permission(base->d_inode, MAY_EXEC);
}

2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503
/**
 * try_lookup_one_len - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Look up a dentry by name in the dcache, returning NULL if it does not
 * currently exist.  The function does not try to create a dentry.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
	struct qstr this;
	int err;

	WARN_ON_ONCE(!inode_is_locked(base->d_inode));

	err = lookup_one_len_common(name, base, len, &this);
	if (err)
		return ERR_PTR(err);

	return lookup_dcache(&this, base, 0);
}
EXPORT_SYMBOL(try_lookup_one_len);

2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516
/**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
2517
	struct dentry *dentry;
2518 2519 2520 2521 2522 2523
	struct qstr this;
	int err;

	WARN_ON_ONCE(!inode_is_locked(base->d_inode));

	err = lookup_one_len_common(name, base, len, &this);
2524 2525 2526
	if (err)
		return ERR_PTR(err);

2527 2528
	dentry = lookup_dcache(&this, base, 0);
	return dentry ? dentry : __lookup_slow(&this, base, 0);
2529
}
2530
EXPORT_SYMBOL(lookup_one_len);
2531

2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548
/**
 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_len_unlocked(const char *name,
				       struct dentry *base, int len)
{
	struct qstr this;
	int err;
	struct dentry *ret;

	err = lookup_one_len_common(name, base, len, &this);
	if (err)
		return ERR_PTR(err);

	/* Try the dcache first; fall back to lookup_slow() if that gives NULL. */
	ret = lookup_dcache(&this, base, 0);
	if (!ret)
		ret = lookup_slow(&this, base, 0);
	return ret;
}
EXPORT_SYMBOL(lookup_one_len_unlocked);

A
Al Viro 已提交
2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573
/*
 * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
 * on negatives.  Returns known positive or ERR_PTR(); that's what
 * most of the users want.  Note that pinned negative with unlocked parent
 * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
 * need to be very careful; pinned positives have ->d_inode stable, so
 * this one avoids such problems.
 */
struct dentry *lookup_positive_unlocked(const char *name,
				       struct dentry *base, int len)
{
	struct dentry *ret = lookup_one_len_unlocked(name, base, len);
2574
	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
A
Al Viro 已提交
2575 2576 2577 2578 2579 2580 2581
		dput(ret);
		ret = ERR_PTR(-ENOENT);
	}
	return ret;
}
EXPORT_SYMBOL(lookup_positive_unlocked);

2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609
#ifdef CONFIG_UNIX98_PTYS
/*
 * Replace @path with whatever is mounted on "pts" in the parent directory
 * of @path.  Returns 0 on success, the error from
 * path_parent_directory(), or -ENOENT when "pts" cannot be looked up.
 */
int path_pts(struct path *path)
{
	/* Find something mounted on "pts" in the same directory as
	 * the input path.
	 */
	struct dentry *child, *parent;
	struct qstr this;
	int ret;

	ret = path_parent_directory(path);
	if (ret)
		return ret;

	parent = path->dentry;
	this.name = "pts";
	this.len = 3;
	child = d_hash_and_lookup(parent, &this);
	/*
	 * d_hash_and_lookup() can return an ERR_PTR() as well as NULL; an
	 * error pointer must not be stored in path->dentry.
	 */
	if (IS_ERR_OR_NULL(child))
		return -ENOENT;

	path->dentry = child;
	dput(parent);
	follow_mount(path);
	return 0;
}
#endif

2610 2611
/*
 * Look up a userspace-supplied pathname relative to @dfd and store the
 * result in @path.  @flags and @empty are passed on to getname_flags().
 * Returns what filename_lookup() returns.
 */
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
		 struct path *path, int *empty)
{
	return filename_lookup(dfd, getname_flags(name, flags, empty),
			       flags, path, NULL);
}
2616
EXPORT_SYMBOL(user_path_at_empty);
2617

2618
/**
2619
 * path_mountpoint - look up a path to be umounted
2620
 * @nd:		lookup context
2621
 * @flags:	lookup flags
2622
 * @path:	pointer to container for result
2623 2624
 *
 * Look up the given name, but don't attempt to revalidate the last component.
2625
 * Returns 0 and "path" will be valid on success; Returns error otherwise.
2626 2627
 */
static int
2628
path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
2629
{
2630
	const char *s = path_init(nd, flags);
2631
	int err;
2632

2633
	while (!(err = link_path_walk(s, nd)) &&
2634
		(err = lookup_last(nd)) > 0) {
2635
		s = trailing_symlink(nd);
2636
	}
2637 2638 2639 2640
	if (!err && (nd->flags & LOOKUP_RCU))
		err = unlazy_walk(nd);
	if (!err)
		err = handle_lookup_down(nd);
2641 2642 2643 2644 2645
	if (!err) {
		*path = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
2646
	terminate_walk(nd);
2647 2648 2649
	return err;
}

A
Al Viro 已提交
2650
static int
2651
filename_mountpoint(int dfd, struct filename *name, struct path *path,
A
Al Viro 已提交
2652 2653
			unsigned int flags)
{
2654
	struct nameidata nd;
2655
	int error;
2656 2657
	if (IS_ERR(name))
		return PTR_ERR(name);
2658
	set_nameidata(&nd, dfd, name);
2659
	error = path_mountpoint(&nd, flags | LOOKUP_RCU, path);
A
Al Viro 已提交
2660
	if (unlikely(error == -ECHILD))
2661
		error = path_mountpoint(&nd, flags, path);
A
Al Viro 已提交
2662
	if (unlikely(error == -ESTALE))
2663
		error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
A
Al Viro 已提交
2664
	if (likely(!error))
2665
		audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL);
2666
	restore_nameidata();
2667
	putname(name);
A
Al Viro 已提交
2668 2669 2670
	return error;
}

2671
/**
2672
 * user_path_mountpoint_at - lookup a path from userland in order to umount it
2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685
 * @dfd:	directory file descriptor
 * @name:	pathname from userland
 * @flags:	lookup flags
 * @path:	pointer to container to hold result
 *
 * A umount is a special case for path walking. We're not actually interested
 * in the inode in this situation, and ESTALE errors can be a problem. We
 * simply want to track down the dentry and vfsmount attached at the mountpoint
 * and avoid revalidating the last component.
 *
 * Returns 0 and populates "path" on success.
 */
int
2686
user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2687 2688
			struct path *path)
{
2689
	return filename_mountpoint(dfd, getname(name), path, flags);
2690 2691
}

A
Al Viro 已提交
2692 2693 2694 2695
/*
 * Kernel-pathname counterpart of user_path_mountpoint_at(): same lookup,
 * but @name is a kernel string wrapped with getname_kernel().
 */
int
kern_path_mountpoint(int dfd, const char *name, struct path *path,
			unsigned int flags)
{
	return filename_mountpoint(dfd, getname_kernel(name), path, flags);
}
EXPORT_SYMBOL(kern_path_mountpoint);

M
Miklos Szeredi 已提交
2700
int __check_sticky(struct inode *dir, struct inode *inode)
L
Linus Torvalds 已提交
2701
{
2702
	kuid_t fsuid = current_fsuid();
2703

2704
	if (uid_eq(inode->i_uid, fsuid))
L
Linus Torvalds 已提交
2705
		return 0;
2706
	if (uid_eq(dir->i_uid, fsuid))
L
Linus Torvalds 已提交
2707
		return 0;
2708
	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
L
Linus Torvalds 已提交
2709
}
M
Miklos Szeredi 已提交
2710
EXPORT_SYMBOL(__check_sticky);
L
Linus Torvalds 已提交
2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724

/*
 *	Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
 *     links pointing to it.
2725 2726 2727 2728 2729
 *  7. If the victim has an unknown uid or gid we can't change the inode.
 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 10. We can't remove a root or mountpoint.
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
L
Linus Torvalds 已提交
2730 2731
 *     nfs_async_unlink().
 */
2732
static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
{
	struct inode *inode = d_backing_inode(victim);
	int error;

	if (d_is_negative(victim))
		return -ENOENT;
	BUG_ON(!inode);

	/* The victim must actually be a child of dir. */
	BUG_ON(victim->d_parent->d_inode != dir);

	/* Inode writeback is not safe when the uid or gid are invalid. */
	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
		return -EOVERFLOW;

	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;

	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
		return -EPERM;
	/* The victim's type must match what the caller asked to remove. */
	if (isdir) {
		if (!d_is_dir(victim))
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
	} else if (d_is_dir(victim))
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}

/*	Check whether we can create an object with dentry child in directory
 *  dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
2777 2778 2779
 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
 *  4. We should have write and exec permissions on dir
 *  5. We can't do it if dir is immutable (done in permission())
L
Linus Torvalds 已提交
2780
 */
2781
static inline int may_create(struct inode *dir, struct dentry *child)
L
Linus Torvalds 已提交
2782
{
2783
	struct user_namespace *s_user_ns;
2784
	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
L
Linus Torvalds 已提交
2785 2786 2787 2788
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
2789 2790 2791 2792
	s_user_ns = dir->i_sb->s_user_ns;
	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
	    !kgid_has_mapping(s_user_ns, current_fsgid()))
		return -EOVERFLOW;
2793
	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2794 2795 2796 2797 2798 2799 2800 2801 2802 2803
}

/*
 * p1 and p2 should be directories on the same fs.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p;

	if (p1 == p2) {
A
Al Viro 已提交
2804
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
L
Linus Torvalds 已提交
2805 2806 2807
		return NULL;
	}

2808
	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2809

2810 2811
	p = d_ancestor(p2, p1);
	if (p) {
A
Al Viro 已提交
2812 2813
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
2814
		return p;
L
Linus Torvalds 已提交
2815 2816
	}

2817 2818
	p = d_ancestor(p1, p2);
	if (p) {
A
Al Viro 已提交
2819 2820
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
2821
		return p;
L
Linus Torvalds 已提交
2822 2823
	}

A
Al Viro 已提交
2824 2825
	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
L
Linus Torvalds 已提交
2826 2827
	return NULL;
}
2828
EXPORT_SYMBOL(lock_rename);
L
Linus Torvalds 已提交
2829 2830 2831

void unlock_rename(struct dentry *p1, struct dentry *p2)
{
A
Al Viro 已提交
2832
	inode_unlock(p1->d_inode);
L
Linus Torvalds 已提交
2833
	if (p1 != p2) {
A
Al Viro 已提交
2834
		inode_unlock(p2->d_inode);
2835
		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2836 2837
	}
}
2838
EXPORT_SYMBOL(unlock_rename);
L
Linus Torvalds 已提交
2839

A
Al Viro 已提交
2840
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
A
Al Viro 已提交
2841
		bool want_excl)
L
Linus Torvalds 已提交
2842
{
2843
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
2844 2845 2846
	if (error)
		return error;

A
Al Viro 已提交
2847
	if (!dir->i_op->create)
L
Linus Torvalds 已提交
2848 2849 2850 2851 2852 2853
		return -EACCES;	/* shouldn't it be ENOSYS? */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
A
Al Viro 已提交
2854
	error = dir->i_op->create(dir, dentry, mode, want_excl);
2855
	if (!error)
2856
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
2857 2858
	return error;
}
2859
EXPORT_SYMBOL(vfs_create);
L
Linus Torvalds 已提交
2860

A
Al Viro 已提交
2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881
/*
 * Create a regular-file object for @dentry using the caller-supplied
 * constructor @f, which is called as f(dentry, mode, arg).  Performs the
 * same may_create() and security checks as a normal create and sends the
 * create notification on success.  Returns 0 or the first error seen.
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
		int (*f)(struct dentry *, umode_t, void *),
		void *arg)
{
	struct inode *parent = dentry->d_parent->d_inode;
	int rc;

	rc = may_create(parent, dentry);
	if (rc)
		return rc;

	/* Keep only the permission bits and force the type to regular file. */
	mode = (mode & S_IALLUGO) | S_IFREG;

	rc = security_inode_create(parent, dentry, mode);
	if (rc)
		return rc;

	rc = f(dentry, mode, arg);
	if (rc)
		return rc;

	fsnotify_create(parent, dentry);
	return 0;
}
EXPORT_SYMBOL(vfs_mkobj);

2882 2883 2884 2885 2886 2887
bool may_open_dev(const struct path *path)
{
	return !(path->mnt->mnt_flags & MNT_NODEV) &&
		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}

A
Al Viro 已提交
2888
static int may_open(const struct path *path, int acc_mode, int flag)
L
Linus Torvalds 已提交
2889
{
2890
	struct dentry *dentry = path->dentry;
L
Linus Torvalds 已提交
2891 2892 2893 2894 2895 2896
	struct inode *inode = dentry->d_inode;
	int error;

	if (!inode)
		return -ENOENT;

C
Christoph Hellwig 已提交
2897 2898
	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
L
Linus Torvalds 已提交
2899
		return -ELOOP;
C
Christoph Hellwig 已提交
2900 2901 2902 2903 2904 2905
	case S_IFDIR:
		if (acc_mode & MAY_WRITE)
			return -EISDIR;
		break;
	case S_IFBLK:
	case S_IFCHR:
2906
		if (!may_open_dev(path))
L
Linus Torvalds 已提交
2907
			return -EACCES;
C
Christoph Hellwig 已提交
2908 2909 2910
		/*FALLTHRU*/
	case S_IFIFO:
	case S_IFSOCK:
L
Linus Torvalds 已提交
2911
		flag &= ~O_TRUNC;
C
Christoph Hellwig 已提交
2912
		break;
2913
	}
2914

A
Al Viro 已提交
2915
	error = inode_permission(inode, MAY_OPEN | acc_mode);
2916 2917
	if (error)
		return error;
M
Mimi Zohar 已提交
2918

L
Linus Torvalds 已提交
2919 2920 2921 2922
	/*
	 * An append-only file must be opened in append mode for writing.
	 */
	if (IS_APPEND(inode)) {
2923
		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2924
			return -EPERM;
L
Linus Torvalds 已提交
2925
		if (flag & O_TRUNC)
2926
			return -EPERM;
L
Linus Torvalds 已提交
2927 2928 2929
	}

	/* O_NOATIME can only be set by the owner or superuser */
2930
	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2931
		return -EPERM;
L
Linus Torvalds 已提交
2932

2933
	return 0;
2934
}
L
Linus Torvalds 已提交
2935

2936
static int handle_truncate(struct file *filp)
2937
{
A
Al Viro 已提交
2938
	const struct path *path = &filp->f_path;
2939 2940 2941 2942 2943 2944 2945
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;
	/*
	 * Refuse to truncate files with mandatory locks held on them.
	 */
2946
	error = locks_verify_locked(filp);
2947
	if (!error)
2948
		error = security_path_truncate(path);
2949 2950 2951
	if (!error) {
		error = do_truncate(path->dentry, 0,
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
2952
				    filp);
2953 2954
	}
	put_write_access(inode);
M
Mimi Zohar 已提交
2955
	return error;
L
Linus Torvalds 已提交
2956 2957
}

2958 2959
/*
 * Convert open flags for the filesystem's ->atomic_open(): when both
 * access-mode bits are set (value 3) the flag word is decremented by
 * one, turning that access mode into 2; every other value is returned
 * unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}

2965
/*
 * Permission checks for creating @dentry in directory @dir as part of
 * an O_CREAT open.
 *
 * Checks, in order: security_path_mknod(), that the current fsuid and
 * fsgid have a mapping in the superblock's user namespace (-EOVERFLOW
 * otherwise), write+exec permission on the directory inode, and
 * security_inode_create().  Returns 0 or the first error.
 */
static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
{
	struct inode *dir_inode = dir->dentry->d_inode;
	struct user_namespace *s_user_ns;
	int err;

	err = security_path_mknod(dir, dentry, mode, 0);
	if (err)
		return err;

	s_user_ns = dir->dentry->d_sb->s_user_ns;
	if (!kuid_has_mapping(s_user_ns, current_fsuid()))
		return -EOVERFLOW;
	if (!kgid_has_mapping(s_user_ns, current_fsgid()))
		return -EOVERFLOW;

	err = inode_permission(dir_inode, MAY_WRITE | MAY_EXEC);
	if (err)
		return err;

	return security_inode_create(dir_inode, dentry, mode);
}

2984 2985 2986 2987 2988 2989 2990
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Returns 0 if successful.  The file will have been created and attached to
 * @file by the filesystem calling finish_open().
 *
2991 2992 2993
 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
 * be set.  The caller will need to perform the open themselves.  @path will
 * have been updated to point to the new dentry.  This may be negative.
2994 2995 2996
 *
 * Returns an error code otherwise.
 */
2997 2998 2999
static int atomic_open(struct nameidata *nd, struct dentry *dentry,
			struct path *path, struct file *file,
			const struct open_flags *op,
3000
			int open_flag, umode_t mode)
M
Miklos Szeredi 已提交
3001
{
3002
	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
M
Miklos Szeredi 已提交
3003 3004 3005
	struct inode *dir =  nd->path.dentry->d_inode;
	int error;

3006
	if (!(~open_flag & (O_EXCL | O_CREAT)))	/* both O_EXCL and O_CREAT */
M
Miklos Szeredi 已提交
3007 3008 3009 3010 3011
		open_flag &= ~O_TRUNC;

	if (nd->flags & LOOKUP_DIRECTORY)
		open_flag |= O_DIRECTORY;

A
Al Viro 已提交
3012 3013
	file->f_path.dentry = DENTRY_NOT_SET;
	file->f_path.mnt = nd->path.mnt;
3014
	error = dir->i_op->atomic_open(dir, dentry, file,
3015
				       open_to_namei_flags(open_flag), mode);
3016
	d_lookup_done(dentry);
3017
	if (!error) {
3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032
		if (file->f_mode & FMODE_OPENED) {
			/*
			 * We didn't have the inode before the open, so check open
			 * permission here.
			 */
			int acc_mode = op->acc_mode;
			if (file->f_mode & FMODE_CREATED) {
				WARN_ON(!(open_flag & O_CREAT));
				fsnotify_create(dir, dentry);
				acc_mode = 0;
			}
			error = may_open(&file->f_path, acc_mode, open_flag);
			if (WARN_ON(error > 0))
				error = -EINVAL;
		} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
3033
			error = -EIO;
3034
		} else {
3035 3036 3037
			if (file->f_path.dentry) {
				dput(dentry);
				dentry = file->f_path.dentry;
3038
			}
3039
			if (file->f_mode & FMODE_CREATED)
3040
				fsnotify_create(dir, dentry);
A
Al Viro 已提交
3041 3042 3043 3044 3045
			if (unlikely(d_is_negative(dentry))) {
				error = -ENOENT;
			} else {
				path->dentry = dentry;
				path->mnt = nd->path.mnt;
3046
				return 0;
A
Al Viro 已提交
3047
			}
3048
		}
M
Miklos Szeredi 已提交
3049 3050
	}
	dput(dentry);
3051
	return error;
M
Miklos Szeredi 已提交
3052 3053
}

M
Miklos Szeredi 已提交
3054
/*
3055
 * Look up and maybe create and open the last component.
M
Miklos Szeredi 已提交
3056
 *
3057
 * Must be called with parent locked (exclusive in O_CREAT case).
3058
 *
3059 3060 3061 3062 3063 3064 3065
 * Returns 0 on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These case are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case dentry returned in @path might be negative if O_CREAT
 * hadn't been specified.
3066
 *
3067
 * An error code is returned on failure.
M
Miklos Szeredi 已提交
3068
 */
3069 3070 3071
static int lookup_open(struct nameidata *nd, struct path *path,
			struct file *file,
			const struct open_flags *op,
3072
			bool got_write)
M
Miklos Szeredi 已提交
3073 3074
{
	struct dentry *dir = nd->path.dentry;
3075
	struct inode *dir_inode = dir->d_inode;
3076
	int open_flag = op->open_flag;
M
Miklos Szeredi 已提交
3077
	struct dentry *dentry;
3078 3079
	int error, create_error = 0;
	umode_t mode = op->mode;
3080
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
M
Miklos Szeredi 已提交
3081

3082 3083
	if (unlikely(IS_DEADDIR(dir_inode)))
		return -ENOENT;
M
Miklos Szeredi 已提交
3084

3085
	file->f_mode &= ~FMODE_CREATED;
3086 3087 3088 3089 3090 3091 3092 3093 3094
	dentry = d_lookup(dir, &nd->last);
	for (;;) {
		if (!dentry) {
			dentry = d_alloc_parallel(dir, &nd->last, &wq);
			if (IS_ERR(dentry))
				return PTR_ERR(dentry);
		}
		if (d_in_lookup(dentry))
			break;
M
Miklos Szeredi 已提交
3095

3096 3097 3098 3099 3100 3101 3102 3103 3104 3105
		error = d_revalidate(dentry, nd->flags);
		if (likely(error > 0))
			break;
		if (error)
			goto out_dput;
		d_invalidate(dentry);
		dput(dentry);
		dentry = NULL;
	}
	if (dentry->d_inode) {
3106
		/* Cached positive dentry: will open in f_op->open */
M
Miklos Szeredi 已提交
3107
		goto out_no_open;
3108
	}
M
Miklos Szeredi 已提交
3109

3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142
	/*
	 * Checking write permission is tricky, bacuse we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists.  But checking existence breaks atomicity.  The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returing the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
	 */
	if (open_flag & O_CREAT) {
		if (!IS_POSIXACL(dir->d_inode))
			mode &= ~current_umask();
		if (unlikely(!got_write)) {
			create_error = -EROFS;
			open_flag &= ~O_CREAT;
			if (open_flag & (O_EXCL | O_TRUNC))
				goto no_open;
			/* No side effects, safe to clear O_CREAT */
		} else {
			create_error = may_o_create(&nd->path, dentry, mode);
			if (create_error) {
				open_flag &= ~O_CREAT;
				if (open_flag & O_EXCL)
					goto no_open;
			}
		}
	} else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) &&
		   unlikely(!got_write)) {
		/*
		 * No O_CREATE -> atomicity not a requirement -> fall
		 * back to lookup + open
		 */
		goto no_open;
M
Miklos Szeredi 已提交
3143 3144
	}

3145
	if (dir_inode->i_op->atomic_open) {
3146
		error = atomic_open(nd, dentry, path, file, op, open_flag,
3147
				    mode);
3148 3149 3150
		if (unlikely(error == -ENOENT) && create_error)
			error = create_error;
		return error;
M
Miklos Szeredi 已提交
3151
	}
3152

3153
no_open:
3154
	if (d_in_lookup(dentry)) {
3155 3156
		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
							     nd->flags);
3157
		d_lookup_done(dentry);
3158 3159 3160 3161 3162 3163 3164 3165
		if (unlikely(res)) {
			if (IS_ERR(res)) {
				error = PTR_ERR(res);
				goto out_dput;
			}
			dput(dentry);
			dentry = res;
		}
3166 3167
	}

M
Miklos Szeredi 已提交
3168
	/* Negative dentry, just create the file */
3169
	if (!dentry->d_inode && (open_flag & O_CREAT)) {
3170
		file->f_mode |= FMODE_CREATED;
3171 3172 3173
		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
		if (!dir_inode->i_op->create) {
			error = -EACCES;
M
Miklos Szeredi 已提交
3174
			goto out_dput;
3175 3176
		}
		error = dir_inode->i_op->create(dir_inode, dentry, mode,
3177
						open_flag & O_EXCL);
M
Miklos Szeredi 已提交
3178 3179
		if (error)
			goto out_dput;
3180
		fsnotify_create(dir_inode, dentry);
M
Miklos Szeredi 已提交
3181
	}
3182 3183 3184
	if (unlikely(create_error) && !dentry->d_inode) {
		error = create_error;
		goto out_dput;
M
Miklos Szeredi 已提交
3185
	}
M
Miklos Szeredi 已提交
3186
out_no_open:
M
Miklos Szeredi 已提交
3187 3188
	path->dentry = dentry;
	path->mnt = nd->path.mnt;
3189
	return 0;
M
Miklos Szeredi 已提交
3190 3191 3192

out_dput:
	dput(dentry);
3193
	return error;
M
Miklos Szeredi 已提交
3194 3195
}

N
Nick Piggin 已提交
3196
/*
3197
 * Handle the last step of open()
N
Nick Piggin 已提交
3198
 */
3199
static int do_last(struct nameidata *nd,
3200
		   struct file *file, const struct open_flags *op)
3201
{
3202
	struct dentry *dir = nd->path.dentry;
3203
	int open_flag = op->open_flag;
M
Miklos Szeredi 已提交
3204
	bool will_truncate = (open_flag & O_TRUNC) != 0;
3205
	bool got_write = false;
A
Al Viro 已提交
3206
	int acc_mode = op->acc_mode;
3207
	unsigned seq;
3208
	struct inode *inode;
3209
	struct path path;
A
Al Viro 已提交
3210
	int error;
3211

3212 3213 3214
	nd->flags &= ~LOOKUP_PARENT;
	nd->flags |= op->intent;

3215
	if (nd->last_type != LAST_NORM) {
3216
		error = handle_dots(nd, nd->last_type);
3217
		if (unlikely(error))
3218
			return error;
M
Miklos Szeredi 已提交
3219
		goto finish_open;
3220
	}
3221

3222
	if (!(open_flag & O_CREAT)) {
3223 3224 3225
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		/* we _can_ be in RCU mode here */
3226
		error = lookup_fast(nd, &path, &inode, &seq);
3227
		if (likely(error > 0))
3228 3229 3230
			goto finish_lookup;

		if (error < 0)
3231
			return error;
3232 3233

		BUG_ON(nd->inode != dir->d_inode);
A
Al Viro 已提交
3234
		BUG_ON(nd->flags & LOOKUP_RCU);
3235 3236 3237 3238 3239 3240 3241 3242
	} else {
		/* create side of things */
		/*
		 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
		 * has been cleared when we got to the last component we are
		 * about to look up
		 */
		error = complete_walk(nd);
3243
		if (error)
3244
			return error;
3245

3246
		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
3247
		/* trailing slashes? */
3248 3249
		if (unlikely(nd->last.name[nd->last.len]))
			return -EISDIR;
3250
	}
A
Al Viro 已提交
3251

3252
	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
3253 3254 3255 3256 3257 3258 3259 3260 3261
		error = mnt_want_write(nd->path.mnt);
		if (!error)
			got_write = true;
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
3262 3263 3264 3265
	if (open_flag & O_CREAT)
		inode_lock(dir->d_inode);
	else
		inode_lock_shared(dir->d_inode);
3266
	error = lookup_open(nd, &path, file, op, got_write);
3267 3268 3269 3270
	if (open_flag & O_CREAT)
		inode_unlock(dir->d_inode);
	else
		inode_unlock_shared(dir->d_inode);
3271

3272 3273
	if (error)
		goto out;
M
Miklos Szeredi 已提交
3274

3275
	if (file->f_mode & FMODE_OPENED) {
3276
		if ((file->f_mode & FMODE_CREATED) ||
A
Al Viro 已提交
3277
		    !S_ISREG(file_inode(file)->i_mode))
M
Miklos Szeredi 已提交
3278
			will_truncate = false;
M
Miklos Szeredi 已提交
3279

A
Al Viro 已提交
3280
		audit_inode(nd->name, file->f_path.dentry, 0);
M
Miklos Szeredi 已提交
3281 3282
		goto opened;
	}
3283

3284
	if (file->f_mode & FMODE_CREATED) {
3285
		/* Don't check for write permission, don't truncate */
3286
		open_flag &= ~O_TRUNC;
M
Miklos Szeredi 已提交
3287
		will_truncate = false;
A
Al Viro 已提交
3288
		acc_mode = 0;
3289
		path_to_nameidata(&path, nd);
M
Miklos Szeredi 已提交
3290
		goto finish_open_created;
3291 3292
	}

M
Miklos Szeredi 已提交
3293 3294 3295 3296 3297
	/*
	 * If atomic_open() acquired write access it is dropped now due to
	 * possible mount and symlink following (this might be optimized away if
	 * necessary...)
	 */
3298
	if (got_write) {
M
Miklos Szeredi 已提交
3299
		mnt_drop_write(nd->path.mnt);
3300
		got_write = false;
M
Miklos Szeredi 已提交
3301 3302
	}

A
Al Viro 已提交
3303 3304 3305 3306
	error = follow_managed(&path, nd);
	if (unlikely(error < 0))
		return error;

A
Al Viro 已提交
3307 3308 3309 3310 3311
	/*
	 * create/update audit record if it already exists.
	 */
	audit_inode(nd->name, path.dentry, 0);

3312 3313 3314 3315
	if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
		path_to_nameidata(&path, nd);
		return -EEXIST;
	}
3316

3317
	seq = 0;	/* out of RCU mode, so the value doesn't matter */
3318
	inode = d_backing_inode(path.dentry);
3319
finish_lookup:
3320
	error = step_into(nd, &path, 0, inode, seq);
3321
	if (unlikely(error))
3322
		return error;
3323
finish_open:
3324
	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
3325
	error = complete_walk(nd);
A
Al Viro 已提交
3326
	if (error)
3327
		return error;
A
Al Viro 已提交
3328
	audit_inode(nd->name, nd->path.dentry, 0);
3329 3330 3331 3332 3333 3334 3335 3336 3337
	if (open_flag & O_CREAT) {
		error = -EISDIR;
		if (d_is_dir(nd->path.dentry))
			goto out;
		error = may_create_in_sticky(dir,
					     d_backing_inode(nd->path.dentry));
		if (unlikely(error))
			goto out;
	}
3338
	error = -ENOTDIR;
M
Miklos Szeredi 已提交
3339
	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3340
		goto out;
3341
	if (!d_is_reg(nd->path.dentry))
M
Miklos Szeredi 已提交
3342
		will_truncate = false;
3343

3344 3345 3346
	if (will_truncate) {
		error = mnt_want_write(nd->path.mnt);
		if (error)
3347
			goto out;
3348
		got_write = true;
3349
	}
M
Miklos Szeredi 已提交
3350
finish_open_created:
3351 3352 3353
	error = may_open(&nd->path, acc_mode, open_flag);
	if (error)
		goto out;
3354
	BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
3355
	error = vfs_open(&nd->path, file);
A
Al Viro 已提交
3356
	if (error)
3357
		goto out;
3358
opened:
3359
	error = ima_file_check(file, op->acc_mode);
3360
	if (!error && will_truncate)
3361
		error = handle_truncate(file);
3362
out:
3363 3364 3365 3366
	if (unlikely(error > 0)) {
		WARN_ON(1);
		error = -EINVAL;
	}
3367
	if (got_write)
3368
		mnt_drop_write(nd->path.mnt);
3369
	return error;
3370 3371
}

3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386
/*
 * Create an unnamed temporary file in the directory @dentry.
 *
 * Requires write+exec permission on the directory and a ->tmpfile()
 * method (-EOPNOTSUPP otherwise).  A child dentry is allocated with
 * d_alloc() and handed to ->tmpfile().  Unless O_EXCL is set in
 * @open_flag, the new inode is marked I_LINKABLE.
 *
 * Returns the child dentry on success or an ERR_PTR() on failure.
 */
struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
{
	struct inode *dir = dentry->d_inode;
	struct dentry *child = NULL;
	struct inode *inode;
	int error;

	/* we want directory to be writable */
	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
	if (error)
		goto out_err;

	if (!dir->i_op->tmpfile) {
		error = -EOPNOTSUPP;
		goto out_err;
	}

	child = d_alloc(dentry, &slash_name);
	if (unlikely(!child)) {
		error = -ENOMEM;
		goto out_err;
	}

	error = dir->i_op->tmpfile(dir, child, mode);
	if (error)
		goto out_err;

	inode = child->d_inode;
	if (unlikely(!inode)) {
		error = -ENOENT;
		goto out_err;
	}

	if (!(open_flag & O_EXCL)) {
		spin_lock(&inode->i_lock);
		inode->i_state |= I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
	ima_post_create_tmpfile(inode);
	return child;

out_err:
	dput(child);
	return ERR_PTR(error);
}
EXPORT_SYMBOL(vfs_tmpfile);

3411
static int do_tmpfile(struct nameidata *nd, unsigned flags,
3412
		const struct open_flags *op,
3413
		struct file *file)
3414
{
3415 3416
	struct dentry *child;
	struct path path;
3417
	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
3418 3419
	if (unlikely(error))
		return error;
3420
	error = mnt_want_write(path.mnt);
3421 3422
	if (unlikely(error))
		goto out;
3423 3424
	child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
	error = PTR_ERR(child);
3425
	if (IS_ERR(child))
3426
		goto out2;
3427 3428
	dput(path.dentry);
	path.dentry = child;
3429
	audit_inode(nd->name, child, 0);
3430
	/* Don't check for other permissions, the inode was just created */
A
Al Viro 已提交
3431
	error = may_open(&path, 0, op->open_flag);
3432 3433
	if (error)
		goto out2;
3434
	file->f_path.mnt = path.mnt;
3435
	error = finish_open(file, child, NULL);
3436
out2:
3437
	mnt_drop_write(path.mnt);
3438
out:
3439
	path_put(&path);
3440 3441 3442
	return error;
}

3443 3444 3445 3446 3447 3448
static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
	struct path path;
	int error = path_lookupat(nd, flags, &path);
	if (!error) {
		audit_inode(nd->name, path.dentry, 0);
3449
		error = vfs_open(&path, file);
3450 3451 3452 3453 3454
		path_put(&path);
	}
	return error;
}

3455 3456
static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
L
Linus Torvalds 已提交
3457
{
A
Al Viro 已提交
3458
	struct file *file;
3459
	int error;
N
Nick Piggin 已提交
3460

3461
	file = alloc_empty_file(op->open_flag, current_cred());
3462 3463
	if (IS_ERR(file))
		return file;
N
Nick Piggin 已提交
3464

A
Al Viro 已提交
3465
	if (unlikely(file->f_flags & __O_TMPFILE)) {
3466
		error = do_tmpfile(nd, flags, op, file);
3467
	} else if (unlikely(file->f_flags & O_PATH)) {
3468
		error = do_o_path(nd, flags, file);
3469 3470 3471 3472 3473 3474 3475 3476
	} else {
		const char *s = path_init(nd, flags);
		while (!(error = link_path_walk(s, nd)) &&
			(error = do_last(nd, file, op)) > 0) {
			nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
			s = trailing_symlink(nd);
		}
		terminate_walk(nd);
3477
	}
3478
	if (likely(!error)) {
3479
		if (likely(file->f_mode & FMODE_OPENED))
3480 3481 3482
			return file;
		WARN_ON(1);
		error = -EINVAL;
3483
	}
3484 3485 3486 3487 3488 3489
	fput(file);
	if (error == -EOPENSTALE) {
		if (flags & LOOKUP_RCU)
			error = -ECHILD;
		else
			error = -ESTALE;
3490
	}
3491
	return ERR_PTR(error);
L
Linus Torvalds 已提交
3492 3493
}

3494
struct file *do_filp_open(int dfd, struct filename *pathname,
3495
		const struct open_flags *op)
3496
{
3497
	struct nameidata nd;
3498
	int flags = op->lookup_flags;
3499 3500
	struct file *filp;

3501
	set_nameidata(&nd, dfd, pathname);
3502
	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
3503
	if (unlikely(filp == ERR_PTR(-ECHILD)))
3504
		filp = path_openat(&nd, op, flags);
3505
	if (unlikely(filp == ERR_PTR(-ESTALE)))
3506
		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
3507
	restore_nameidata();
3508 3509 3510
	return filp;
}

A
Al Viro 已提交
3511
struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3512
		const char *name, const struct open_flags *op)
A
Al Viro 已提交
3513
{
3514
	struct nameidata nd;
A
Al Viro 已提交
3515
	struct file *file;
3516
	struct filename *filename;
3517
	int flags = op->lookup_flags | LOOKUP_ROOT;
A
Al Viro 已提交
3518 3519 3520 3521

	nd.root.mnt = mnt;
	nd.root.dentry = dentry;

3522
	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
A
Al Viro 已提交
3523 3524
		return ERR_PTR(-ELOOP);

3525
	filename = getname_kernel(name);
3526
	if (IS_ERR(filename))
3527 3528
		return ERR_CAST(filename);

3529
	set_nameidata(&nd, -1, filename);
3530
	file = path_openat(&nd, op, flags | LOOKUP_RCU);
A
Al Viro 已提交
3531
	if (unlikely(file == ERR_PTR(-ECHILD)))
3532
		file = path_openat(&nd, op, flags);
A
Al Viro 已提交
3533
	if (unlikely(file == ERR_PTR(-ESTALE)))
3534
		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
3535
	restore_nameidata();
3536
	putname(filename);
A
Al Viro 已提交
3537 3538 3539
	return file;
}

3540
static struct dentry *filename_create(int dfd, struct filename *name,
3541
				struct path *path, unsigned int lookup_flags)
L
Linus Torvalds 已提交
3542
{
3543
	struct dentry *dentry = ERR_PTR(-EEXIST);
3544 3545
	struct qstr last;
	int type;
3546
	int err2;
3547 3548 3549 3550 3551 3552 3553 3554 3555
	int error;
	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);

	/*
	 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
	 * other flags passed in are ignored!
	 */
	lookup_flags &= LOOKUP_REVAL;

3556 3557 3558
	name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
	if (IS_ERR(name))
		return ERR_CAST(name);
L
Linus Torvalds 已提交
3559

3560 3561 3562 3563
	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
3564
	if (unlikely(type != LAST_NORM))
A
Al Viro 已提交
3565
		goto out;
3566

3567
	/* don't fail immediately if it's r/o, at least try to report other errors */
3568
	err2 = mnt_want_write(path->mnt);
3569 3570 3571
	/*
	 * Do the final lookup.
	 */
3572
	lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
A
Al Viro 已提交
3573
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
3574
	dentry = __lookup_hash(&last, path->dentry, lookup_flags);
L
Linus Torvalds 已提交
3575
	if (IS_ERR(dentry))
3576
		goto unlock;
3577

3578
	error = -EEXIST;
3579
	if (d_is_positive(dentry))
3580
		goto fail;
3581

3582 3583 3584 3585 3586 3587
	/*
	 * Special case - lookup gave negative, but... we had foo/bar/
	 * From the vfs_mknod() POV we just have a negative dentry -
	 * all is fine. Let's be bastards - you had / on the end, you've
	 * been asking for (non-existent) directory. -ENOENT for you.
	 */
3588
	if (unlikely(!is_dir && last.name[last.len])) {
3589
		error = -ENOENT;
A
Al Viro 已提交
3590
		goto fail;
3591
	}
3592 3593
	if (unlikely(err2)) {
		error = err2;
3594
		goto fail;
3595
	}
3596
	putname(name);
L
Linus Torvalds 已提交
3597 3598
	return dentry;
fail:
3599 3600 3601
	dput(dentry);
	dentry = ERR_PTR(error);
unlock:
A
Al Viro 已提交
3602
	inode_unlock(path->dentry->d_inode);
3603
	if (!err2)
3604
		mnt_drop_write(path->mnt);
A
Al Viro 已提交
3605
out:
3606
	path_put(path);
3607
	putname(name);
L
Linus Torvalds 已提交
3608 3609
	return dentry;
}
3610 3611 3612 3613

/*
 * filename_create() for a kernel-space pathname: the string is wrapped
 * with getname_kernel() and passed on together with the other
 * arguments.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	struct filename *name = getname_kernel(pathname);

	return filename_create(dfd, name, path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);

A
Al Viro 已提交
3619 3620 3621
void done_path_create(struct path *path, struct dentry *dentry)
{
	dput(dentry);
A
Al Viro 已提交
3622
	inode_unlock(path->dentry->d_inode);
3623
	mnt_drop_write(path->mnt);
A
Al Viro 已提交
3624 3625 3626 3627
	path_put(path);
}
EXPORT_SYMBOL(done_path_create);

A
Al Viro 已提交
3628
/*
 * filename_create() for a user-space pathname: the string is fetched
 * with getname() and passed on together with the other arguments.
 */
inline struct dentry *user_path_create(int dfd, const char __user *pathname,
				struct path *path, unsigned int lookup_flags)
{
	struct filename *name = getname(pathname);

	return filename_create(dfd, name, path, lookup_flags);
}
EXPORT_SYMBOL(user_path_create);

A
Al Viro 已提交
3635
/*
 * Create a filesystem node for @dentry in directory @dir.
 *
 * Character and block devices additionally require CAP_MKNOD.  The
 * request then passes through devcgroup_inode_mknod() and
 * security_inode_mknod() before the directory's ->mknod() method is
 * called; fsnotify_create() is called once ->mknod() has succeeded.
 *
 * Returns 0 on success, -EPERM when the capability or the method is
 * missing, otherwise the error of the step that failed.
 */
int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
	int err;

	err = may_create(dir, dentry);
	if (err)
		return err;

	if (S_ISCHR(mode) || S_ISBLK(mode)) {
		if (!capable(CAP_MKNOD))
			return -EPERM;
	}

	if (!dir->i_op->mknod)
		return -EPERM;

	err = devcgroup_inode_mknod(mode, dev);
	if (err)
		return err;

	err = security_inode_mknod(dir, dentry, mode, dev);
	if (err)
		return err;

	err = dir->i_op->mknod(dir, dentry, mode, dev);
	if (err)
		return err;

	fsnotify_create(dir, dentry);
	return 0;
}
EXPORT_SYMBOL(vfs_mknod);
L
Linus Torvalds 已提交
3662

A
Al Viro 已提交
3663
/*
 * Validate the file-type bits of a mknod() mode.
 *
 * Returns 0 for regular files, devices, FIFOs, sockets and a zero type
 * (which the caller treats as S_IFREG), -EPERM for directories and
 * -EINVAL for any other type.
 */
static int may_mknod(umode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFDIR:
		return -EPERM;
	case 0: /* zero mode translates to S_IFREG */
	case S_IFREG:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
		return 0;
	default:
		return -EINVAL;
	}
}

3680 3681
/*
 * Implementation of mknod()/mknodat().
 *
 * The mode is validated by may_mknod(), the target is set up with
 * user_path_create(), and the node is created with vfs_create() for
 * regular files or vfs_mknod() for devices, FIFOs and sockets.  The
 * umask is applied unless the parent inode is IS_POSIXACL().  The whole
 * operation is retried with LOOKUP_REVAL when retry_estale() asks for
 * it.
 *
 * Returns 0 on success or the first error encountered.
 */
long do_mknodat(int dfd, const char __user *filename, umode_t mode,
		unsigned int dev)
{
	unsigned int lookup_flags = 0;
	struct dentry *dentry;
	struct inode *dir;
	struct path path;
	int error;

	error = may_mknod(mode);
	if (error)
		return error;

retry:
	dentry = user_path_create(dfd, filename, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	dir = path.dentry->d_inode;
	if (!IS_POSIXACL(dir))
		mode &= ~current_umask();

	error = security_path_mknod(&path, dentry, mode, dev);
	if (error)
		goto out;

	switch (mode & S_IFMT) {
	case 0:
	case S_IFREG:
		error = vfs_create(dir, dentry, mode, true);
		if (!error)
			ima_post_path_mknod(dentry);
		break;
	case S_IFCHR:
	case S_IFBLK:
		error = vfs_mknod(dir, dentry, mode, new_decode_dev(dev));
		break;
	case S_IFIFO:
	case S_IFSOCK:
		/* no device number for FIFOs and sockets */
		error = vfs_mknod(dir, dentry, mode, 0);
		break;
	}
out:
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

3724 3725 3726 3727 3728 3729
/* mknodat() system call: forwards all four arguments to do_mknodat(). */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned int, dev)
{
	return do_mknodat(dfd, filename, mode, dev);
}

A
Al Viro 已提交
3730
/* mknod() system call: do_mknodat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
	return do_mknodat(AT_FDCWD, filename, mode, dev);
}

3735
/*
 * Create a directory for @dentry in directory @dir.
 *
 * Only the permission bits and S_ISVTX of @mode are kept.  The request
 * goes through may_create() and security_inode_mkdir(); if the
 * superblock sets s_max_links and the parent's link count has reached
 * it, -EMLINK is returned.  fsnotify_mkdir() is called once the
 * directory's ->mkdir() method has succeeded.
 *
 * Returns 0 on success, -EPERM when the directory has no ->mkdir()
 * method, otherwise the error of the step that failed.
 */
int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	unsigned max_links;
	int err;

	err = may_create(dir, dentry);
	max_links = dir->i_sb->s_max_links;
	if (err)
		return err;

	if (!dir->i_op->mkdir)
		return -EPERM;

	mode &= (S_IRWXUGO | S_ISVTX);

	err = security_inode_mkdir(dir, dentry, mode);
	if (err)
		return err;

	if (max_links && dir->i_nlink >= max_links)
		return -EMLINK;

	err = dir->i_op->mkdir(dir, dentry, mode);
	if (err)
		return err;

	fsnotify_mkdir(dir, dentry);
	return 0;
}
EXPORT_SYMBOL(vfs_mkdir);
L
Linus Torvalds 已提交
3760

3761
/*
 * Implementation of mkdir()/mkdirat().
 *
 * The target is set up with user_path_create() (LOOKUP_DIRECTORY), the
 * umask is applied unless the parent inode is IS_POSIXACL(), and the
 * directory is created with vfs_mkdir() if security_path_mkdir()
 * allows it.  The whole operation is repeated with LOOKUP_REVAL when
 * retry_estale() asks for it.
 *
 * Returns 0 on success or the first error encountered.
 */
long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
{
	unsigned int lookup_flags = LOOKUP_DIRECTORY;
	struct dentry *dentry;
	struct path path;
	int error;

	for (;;) {
		dentry = user_path_create(dfd, pathname, &path, lookup_flags);
		if (IS_ERR(dentry))
			return PTR_ERR(dentry);

		if (!IS_POSIXACL(path.dentry->d_inode))
			mode &= ~current_umask();

		error = security_path_mkdir(&path, dentry, mode);
		if (!error)
			error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
		done_path_create(&path, dentry);

		if (!retry_estale(error, lookup_flags))
			break;
		lookup_flags |= LOOKUP_REVAL;
	}
	return error;
}

3786 3787 3788 3789 3790
/* mkdirat() system call: forwards all three arguments to do_mkdirat(). */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(dfd, pathname, mode);
}

3791
/* mkdir() system call: do_mkdirat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(AT_FDCWD, pathname, mode);
}

L
Linus Torvalds 已提交
3796 3797 3798 3799 3800 3801 3802
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	int error = may_delete(dir, dentry, 1);

	if (error)
		return error;

A
Al Viro 已提交
3803
	if (!dir->i_op->rmdir)
L
Linus Torvalds 已提交
3804 3805
		return -EPERM;

3806
	dget(dentry);
A
Al Viro 已提交
3807
	inode_lock(dentry->d_inode);
S
Sage Weil 已提交
3808 3809

	error = -EBUSY;
3810
	if (is_local_mountpoint(dentry))
S
Sage Weil 已提交
3811 3812 3813 3814 3815 3816 3817 3818 3819 3820
		goto out;

	error = security_inode_rmdir(dir, dentry);
	if (error)
		goto out;

	error = dir->i_op->rmdir(dir, dentry);
	if (error)
		goto out;

3821
	shrink_dcache_parent(dentry);
S
Sage Weil 已提交
3822 3823
	dentry->d_inode->i_flags |= S_DEAD;
	dont_mount(dentry);
3824
	detach_mounts(dentry);
3825
	fsnotify_rmdir(dir, dentry);
S
Sage Weil 已提交
3826 3827

out:
A
Al Viro 已提交
3828
	inode_unlock(dentry->d_inode);
3829
	dput(dentry);
S
Sage Weil 已提交
3830
	if (!error)
L
Linus Torvalds 已提交
3831 3832 3833
		d_delete(dentry);
	return error;
}
3834
EXPORT_SYMBOL(vfs_rmdir);
L
Linus Torvalds 已提交
3835

3836
/*
 * Implementation of rmdir().
 *
 * Resolves the parent of @pathname, rejects "..", "." and the root as
 * the last component (-ENOTEMPTY, -EINVAL and -EBUSY respectively),
 * then looks the victim up under the parent's inode lock and removes it
 * with vfs_rmdir().  The whole operation is retried with LOOKUP_REVAL
 * when retry_estale() asks for it.
 *
 * Returns 0 on success or the first error encountered.
 */
long do_rmdir(int dfd, const char __user *pathname)
{
	unsigned int lookup_flags = 0;
	struct filename *name;
	struct dentry *dentry;
	struct path path;
	struct qstr last;
	int error = 0;
	int type;

retry:
	name = filename_parentat(dfd, getname(pathname), lookup_flags,
				&path, &last, &type);
	if (IS_ERR(name))
		return PTR_ERR(name);

	switch (type) {
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto out_path;
	case LAST_DOT:
		error = -EINVAL;
		goto out_path;
	case LAST_ROOT:
		error = -EBUSY;
		goto out_path;
	}

	error = mnt_want_write(path.mnt);
	if (error)
		goto out_path;

	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
	if (IS_ERR(dentry)) {
		error = PTR_ERR(dentry);
		goto out_unlock;
	}

	if (!dentry->d_inode) {
		error = -ENOENT;
	} else {
		error = security_path_rmdir(&path, dentry);
		if (!error)
			error = vfs_rmdir(path.dentry->d_inode, dentry);
	}
	dput(dentry);

out_unlock:
	inode_unlock(path.dentry->d_inode);
	mnt_drop_write(path.mnt);
out_path:
	path_put(&path);
	putname(name);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

3895
/* rmdir() system call: do_rmdir() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
	return do_rmdir(AT_FDCWD, pathname);
}

3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918
/**
 * vfs_unlink - unlink a filesystem object
 * @dir:	parent directory
 * @dentry:	victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 */
int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
3919
{
J
J. Bruce Fields 已提交
3920
	struct inode *target = dentry->d_inode;
L
Linus Torvalds 已提交
3921 3922 3923 3924 3925
	int error = may_delete(dir, dentry, 0);

	if (error)
		return error;

A
Al Viro 已提交
3926
	if (!dir->i_op->unlink)
L
Linus Torvalds 已提交
3927 3928
		return -EPERM;

A
Al Viro 已提交
3929
	inode_lock(target);
3930
	if (is_local_mountpoint(dentry))
L
Linus Torvalds 已提交
3931 3932 3933
		error = -EBUSY;
	else {
		error = security_inode_unlink(dir, dentry);
3934
		if (!error) {
3935 3936
			error = try_break_deleg(target, delegated_inode);
			if (error)
3937
				goto out;
L
Linus Torvalds 已提交
3938
			error = dir->i_op->unlink(dir, dentry);
3939
			if (!error) {
3940
				dont_mount(dentry);
3941
				detach_mounts(dentry);
3942
				fsnotify_unlink(dir, dentry);
3943
			}
3944
		}
L
Linus Torvalds 已提交
3945
	}
3946
out:
A
Al Viro 已提交
3947
	inode_unlock(target);
L
Linus Torvalds 已提交
3948 3949 3950

	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
J
J. Bruce Fields 已提交
3951
		fsnotify_link_count(target);
J
John McCutchan 已提交
3952
		d_delete(dentry);
L
Linus Torvalds 已提交
3953
	}
R
Robert Love 已提交
3954

L
Linus Torvalds 已提交
3955 3956
	return error;
}
3957
EXPORT_SYMBOL(vfs_unlink);
L
Linus Torvalds 已提交
3958 3959 3960

/*
 * Make sure that the actual truncation of the file will occur outside its
3961
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
L
Linus Torvalds 已提交
3962 3963 3964
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 */
3965
long do_unlinkat(int dfd, struct filename *name)
L
Linus Torvalds 已提交
3966
{
3967
	int error;
L
Linus Torvalds 已提交
3968
	struct dentry *dentry;
3969 3970 3971
	struct path path;
	struct qstr last;
	int type;
L
Linus Torvalds 已提交
3972
	struct inode *inode = NULL;
3973
	struct inode *delegated_inode = NULL;
3974 3975
	unsigned int lookup_flags = 0;
retry:
3976
	name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
3977 3978
	if (IS_ERR(name))
		return PTR_ERR(name);
3979

L
Linus Torvalds 已提交
3980
	error = -EISDIR;
3981
	if (type != LAST_NORM)
L
Linus Torvalds 已提交
3982
		goto exit1;
3983

3984
	error = mnt_want_write(path.mnt);
3985 3986
	if (error)
		goto exit1;
3987
retry_deleg:
A
Al Viro 已提交
3988
	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
3989
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
L
Linus Torvalds 已提交
3990 3991 3992
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
		/* Why not before? Because we want correct error value */
3993
		if (last.name[last.len])
3994
			goto slashes;
L
Linus Torvalds 已提交
3995
		inode = dentry->d_inode;
3996
		if (d_is_negative(dentry))
3997 3998
			goto slashes;
		ihold(inode);
3999
		error = security_path_unlink(&path, dentry);
4000
		if (error)
4001
			goto exit2;
4002
		error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
4003
exit2:
L
Linus Torvalds 已提交
4004 4005
		dput(dentry);
	}
A
Al Viro 已提交
4006
	inode_unlock(path.dentry->d_inode);
L
Linus Torvalds 已提交
4007 4008
	if (inode)
		iput(inode);	/* truncate the inode here */
4009 4010
	inode = NULL;
	if (delegated_inode) {
4011
		error = break_deleg_wait(&delegated_inode);
4012 4013 4014
		if (!error)
			goto retry_deleg;
	}
4015
	mnt_drop_write(path.mnt);
L
Linus Torvalds 已提交
4016
exit1:
4017
	path_put(&path);
4018 4019 4020 4021 4022
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		inode = NULL;
		goto retry;
	}
4023
	putname(name);
L
Linus Torvalds 已提交
4024 4025 4026
	return error;

slashes:
4027 4028
	if (d_is_negative(dentry))
		error = -ENOENT;
M
Miklos Szeredi 已提交
4029
	else if (d_is_dir(dentry))
4030 4031 4032
		error = -EISDIR;
	else
		error = -ENOTDIR;
L
Linus Torvalds 已提交
4033 4034 4035
	goto exit2;
}

4036
/*
 * unlinkat(2) entry point.  AT_REMOVEDIR is the only accepted flag; when it
 * is set the request goes to do_rmdir(), otherwise to do_unlinkat().
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
	if (flag & ~AT_REMOVEDIR)
		return -EINVAL;

	if (!(flag & AT_REMOVEDIR))
		return do_unlinkat(dfd, getname(pathname));

	return do_rmdir(dfd, pathname);
}

4047
/* unlink(2) entry point: do_unlinkat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
	struct filename *name = getname(pathname);

	return do_unlinkat(AT_FDCWD, name);
}

4052
int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
L
Linus Torvalds 已提交
4053
{
4054
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
4055 4056 4057 4058

	if (error)
		return error;

A
Al Viro 已提交
4059
	if (!dir->i_op->symlink)
L
Linus Torvalds 已提交
4060 4061 4062 4063 4064 4065 4066
		return -EPERM;

	error = security_inode_symlink(dir, dentry, oldname);
	if (error)
		return error;

	error = dir->i_op->symlink(dir, dentry, oldname);
4067
	if (!error)
4068
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
4069 4070
	return error;
}
4071
EXPORT_SYMBOL(vfs_symlink);
L
Linus Torvalds 已提交
4072

4073 4074
/*
 * Common implementation of symlink(2)/symlinkat(2): create @newname
 * (looked up relative to @newdfd) as a symlink whose body is @oldname.
 *
 * The creation is attempted again with LOOKUP_REVAL added whenever
 * retry_estale() asks for it.
 */
long do_symlinkat(const char __user *oldname, int newdfd,
		  const char __user *newname)
{
	unsigned int lookup_flags = 0;
	struct filename *target;
	struct dentry *link;
	struct path parent;
	int error;

	target = getname(oldname);
	if (IS_ERR(target))
		return PTR_ERR(target);

	for (;;) {
		link = user_path_create(newdfd, newname, &parent, lookup_flags);
		if (IS_ERR(link)) {
			error = PTR_ERR(link);
			break;
		}

		error = security_path_symlink(&parent, link, target->name);
		if (!error)
			error = vfs_symlink(parent.dentry->d_inode, link,
					    target->name);
		done_path_create(&parent, link);

		if (!retry_estale(error, lookup_flags))
			break;
		lookup_flags |= LOOKUP_REVAL;
	}

	putname(target);
	return error;
}

4104 4105 4106 4107 4108 4109
/* symlinkat(2) entry point: forwards its arguments unchanged to do_symlinkat(). */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	return do_symlinkat(oldname, newdfd, newname);
}

4110
/* symlink(2) entry point: do_symlinkat() with AT_FDCWD as the new directory fd. */
SYSCALL_DEFINE2(symlink, const char __user *, oldname,
		const char __user *, newname)
{
	const int newdfd = AT_FDCWD;

	return do_symlinkat(oldname, newdfd, newname);
}

J
J. Bruce Fields 已提交
4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134
/**
 * vfs_link - create a new link
 * @old_dentry:	object to be linked
 * @dir:	new parent
 * @new_dentry:	where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 */
int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
4135 4136
{
	struct inode *inode = old_dentry->d_inode;
4137
	unsigned max_links = dir->i_sb->s_max_links;
L
Linus Torvalds 已提交
4138 4139 4140 4141 4142
	int error;

	if (!inode)
		return -ENOENT;

4143
	error = may_create(dir, new_dentry);
L
Linus Torvalds 已提交
4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154
	if (error)
		return error;

	if (dir->i_sb != inode->i_sb)
		return -EXDEV;

	/*
	 * A link to an append-only or immutable file cannot be created.
	 */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return -EPERM;
4155 4156 4157 4158 4159 4160 4161
	/*
	 * Updating the link count will likely cause i_uid and i_gid to
	 * be writen back improperly if their true value is unknown to
	 * the vfs.
	 */
	if (HAS_UNMAPPED_ID(inode))
		return -EPERM;
A
Al Viro 已提交
4162
	if (!dir->i_op->link)
L
Linus Torvalds 已提交
4163
		return -EPERM;
4164
	if (S_ISDIR(inode->i_mode))
L
Linus Torvalds 已提交
4165 4166 4167 4168 4169 4170
		return -EPERM;

	error = security_inode_link(old_dentry, dir, new_dentry);
	if (error)
		return error;

A
Al Viro 已提交
4171
	inode_lock(inode);
4172
	/* Make sure we don't allow creating hardlink to an unlinked file */
4173
	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
4174
		error =  -ENOENT;
4175 4176
	else if (max_links && inode->i_nlink >= max_links)
		error = -EMLINK;
J
J. Bruce Fields 已提交
4177 4178 4179 4180 4181
	else {
		error = try_break_deleg(inode, delegated_inode);
		if (!error)
			error = dir->i_op->link(old_dentry, dir, new_dentry);
	}
4182 4183 4184 4185 4186 4187

	if (!error && (inode->i_state & I_LINKABLE)) {
		spin_lock(&inode->i_lock);
		inode->i_state &= ~I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
A
Al Viro 已提交
4188
	inode_unlock(inode);
4189
	if (!error)
4190
		fsnotify_link(dir, inode, new_dentry);
L
Linus Torvalds 已提交
4191 4192
	return error;
}
4193
EXPORT_SYMBOL(vfs_link);
L
Linus Torvalds 已提交
4194 4195 4196 4197 4198 4199 4200 4201 4202 4203

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 */
4204 4205
int do_linkat(int olddfd, const char __user *oldname, int newdfd,
	      const char __user *newname, int flags)
L
Linus Torvalds 已提交
4206 4207
{
	struct dentry *new_dentry;
4208
	struct path old_path, new_path;
J
J. Bruce Fields 已提交
4209
	struct inode *delegated_inode = NULL;
4210
	int how = 0;
L
Linus Torvalds 已提交
4211 4212
	int error;

4213
	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
4214
		return -EINVAL;
4215
	/*
4216 4217 4218
	 * To use null names we require CAP_DAC_READ_SEARCH
	 * This ensures that not everyone will be able to create
	 * handlink using the passed filedescriptor.
4219
	 */
4220 4221 4222
	if (flags & AT_EMPTY_PATH) {
		if (!capable(CAP_DAC_READ_SEARCH))
			return -ENOENT;
4223
		how = LOOKUP_EMPTY;
4224
	}
4225 4226 4227

	if (flags & AT_SYMLINK_FOLLOW)
		how |= LOOKUP_FOLLOW;
4228
retry:
4229
	error = user_path_at(olddfd, oldname, how, &old_path);
L
Linus Torvalds 已提交
4230
	if (error)
4231 4232
		return error;

4233 4234
	new_dentry = user_path_create(newdfd, newname, &new_path,
					(how & LOOKUP_REVAL));
L
Linus Torvalds 已提交
4235
	error = PTR_ERR(new_dentry);
4236
	if (IS_ERR(new_dentry))
4237 4238 4239 4240 4241
		goto out;

	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto out_dput;
K
Kees Cook 已提交
4242 4243 4244
	error = may_linkat(&old_path);
	if (unlikely(error))
		goto out_dput;
4245
	error = security_path_link(old_path.dentry, &new_path, new_dentry);
4246
	if (error)
4247
		goto out_dput;
J
J. Bruce Fields 已提交
4248
	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
4249
out_dput:
A
Al Viro 已提交
4250
	done_path_create(&new_path, new_dentry);
J
J. Bruce Fields 已提交
4251 4252
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
4253 4254
		if (!error) {
			path_put(&old_path);
J
J. Bruce Fields 已提交
4255
			goto retry;
4256
		}
J
J. Bruce Fields 已提交
4257
	}
4258
	if (retry_estale(error, how)) {
4259
		path_put(&old_path);
4260 4261 4262
		how |= LOOKUP_REVAL;
		goto retry;
	}
L
Linus Torvalds 已提交
4263
out:
4264
	path_put(&old_path);
L
Linus Torvalds 已提交
4265 4266 4267 4268

	return error;
}

4269 4270 4271 4272 4273 4274
/* linkat(2) entry point: forwards its arguments unchanged to do_linkat(). */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, int, flags)
{
	return do_linkat(olddfd, oldname, newdfd, newname, flags);
}

4275
/* link(2) entry point: do_linkat() with AT_FDCWD for both directory fds and no flags. */
SYSCALL_DEFINE2(link, const char __user *, oldname,
		const char __user *, newname)
{
	const int no_flags = 0;

	return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, no_flags);
}

4280 4281 4282 4283 4284 4285 4286
/**
 * vfs_rename - rename a filesystem object
 * @old_dir:	parent of source
 * @old_dentry:	source
 * @new_dir:	parent of destination
 * @new_dentry:	destination
 * @delegated_inode: returns an inode needing a delegation break
M
Miklos Szeredi 已提交
4287
 * @flags:	rename flags
4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301
 *
 * The caller must hold multiple mutexes--see lock_rename()).
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
L
Linus Torvalds 已提交
4302 4303 4304
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
4305
 *
4306
 *	a) we can get into loop creation.
L
Linus Torvalds 已提交
4307 4308
 *	b) race potential - two innocent renames can create a loop together.
 *	   That's where 4.4 screws up. Current fix: serialization on
4309
 *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
L
Linus Torvalds 已提交
4310
 *	   story.
4311 4312
 *	c) we have to lock _four_ objects - parents and victim (if it exists),
 *	   and source (if it is not a directory).
4313
 *	   And that - after we got ->i_mutex on parents (until then we don't know
L
Linus Torvalds 已提交
4314 4315
 *	   whether the target exists).  Solution: try to be smart with locking
 *	   order for inodes.  We rely on the fact that tree topology may change
4316
 *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
L
Linus Torvalds 已提交
4317 4318 4319
 *	   move will be locked.  Thus we can rank directories by the tree
 *	   (ancestors first) and rank all non-directories after them.
 *	   That works since everybody except rename does "lock parent, lookup,
4320
 *	   lock child" and rename is under ->s_vfs_rename_mutex.
L
Linus Torvalds 已提交
4321 4322 4323
 *	   HOWEVER, it relies on the assumption that any object with ->lookup()
 *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *	   we'd better make sure that there's no link(2) for them.
4324
 *	d) conversion from fhandle to dentry may come in the wrong moment - when
4325
 *	   we are removing the target. Solution: we will have to grab ->i_mutex
L
Linus Torvalds 已提交
4326
 *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4327
 *	   ->i_mutex on parents, which works but leads to some truly excessive
L
Linus Torvalds 已提交
4328 4329
 *	   locking].
 */
4330 4331
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
	       struct inode *new_dir, struct dentry *new_dentry,
M
Miklos Szeredi 已提交
4332
	       struct inode **delegated_inode, unsigned int flags)
L
Linus Torvalds 已提交
4333
{
4334 4335 4336
	int error;
	bool is_dir = d_is_dir(old_dentry);
	struct inode *source = old_dentry->d_inode;
S
Sage Weil 已提交
4337
	struct inode *target = new_dentry->d_inode;
M
Miklos Szeredi 已提交
4338 4339
	bool new_is_dir = false;
	unsigned max_links = new_dir->i_sb->s_max_links;
A
Al Viro 已提交
4340
	struct name_snapshot old_name;
4341

4342
	if (source == target)
4343 4344 4345 4346 4347 4348
		return 0;

	error = may_delete(old_dir, old_dentry, is_dir);
	if (error)
		return error;

M
Miklos Szeredi 已提交
4349
	if (!target) {
4350
		error = may_create(new_dir, new_dentry);
M
Miklos Szeredi 已提交
4351 4352 4353 4354 4355 4356 4357 4358
	} else {
		new_is_dir = d_is_dir(new_dentry);

		if (!(flags & RENAME_EXCHANGE))
			error = may_delete(new_dir, new_dentry, is_dir);
		else
			error = may_delete(new_dir, new_dentry, new_is_dir);
	}
4359 4360 4361
	if (error)
		return error;

4362
	if (!old_dir->i_op->rename)
4363
		return -EPERM;
L
Linus Torvalds 已提交
4364 4365 4366 4367 4368

	/*
	 * If we are going to change the parent - check write permissions,
	 * we'll need to flip '..'.
	 */
M
Miklos Szeredi 已提交
4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379
	if (new_dir != old_dir) {
		if (is_dir) {
			error = inode_permission(source, MAY_WRITE);
			if (error)
				return error;
		}
		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
			error = inode_permission(target, MAY_WRITE);
			if (error)
				return error;
		}
L
Linus Torvalds 已提交
4380 4381
	}

4382 4383
	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
				      flags);
L
Linus Torvalds 已提交
4384 4385 4386
	if (error)
		return error;

A
Al Viro 已提交
4387
	take_dentry_name_snapshot(&old_name, old_dentry);
4388
	dget(new_dentry);
M
Miklos Szeredi 已提交
4389
	if (!is_dir || (flags & RENAME_EXCHANGE))
4390 4391
		lock_two_nondirectories(source, target);
	else if (target)
A
Al Viro 已提交
4392
		inode_lock(target);
S
Sage Weil 已提交
4393 4394

	error = -EBUSY;
4395
	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
S
Sage Weil 已提交
4396 4397
		goto out;

M
Miklos Szeredi 已提交
4398
	if (max_links && new_dir != old_dir) {
4399
		error = -EMLINK;
M
Miklos Szeredi 已提交
4400
		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4401
			goto out;
M
Miklos Szeredi 已提交
4402 4403 4404 4405 4406
		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
		    old_dir->i_nlink >= max_links)
			goto out;
	}
	if (!is_dir) {
4407
		error = try_break_deleg(source, delegated_inode);
4408 4409
		if (error)
			goto out;
M
Miklos Szeredi 已提交
4410 4411 4412 4413 4414
	}
	if (target && !new_is_dir) {
		error = try_break_deleg(target, delegated_inode);
		if (error)
			goto out;
4415
	}
4416
	error = old_dir->i_op->rename(old_dir, old_dentry,
M
Miklos Szeredi 已提交
4417
				       new_dir, new_dentry, flags);
S
Sage Weil 已提交
4418 4419 4420
	if (error)
		goto out;

M
Miklos Szeredi 已提交
4421
	if (!(flags & RENAME_EXCHANGE) && target) {
4422 4423
		if (is_dir) {
			shrink_dcache_parent(new_dentry);
4424
			target->i_flags |= S_DEAD;
4425
		}
S
Sage Weil 已提交
4426
		dont_mount(new_dentry);
4427
		detach_mounts(new_dentry);
4428
	}
M
Miklos Szeredi 已提交
4429 4430 4431 4432 4433 4434
	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
		if (!(flags & RENAME_EXCHANGE))
			d_move(old_dentry, new_dentry);
		else
			d_exchange(old_dentry, new_dentry);
	}
S
Sage Weil 已提交
4435
out:
M
Miklos Szeredi 已提交
4436
	if (!is_dir || (flags & RENAME_EXCHANGE))
4437 4438
		unlock_two_nondirectories(source, target);
	else if (target)
A
Al Viro 已提交
4439
		inode_unlock(target);
L
Linus Torvalds 已提交
4440
	dput(new_dentry);
M
Miklos Szeredi 已提交
4441
	if (!error) {
4442
		fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
M
Miklos Szeredi 已提交
4443 4444
			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
		if (flags & RENAME_EXCHANGE) {
4445
			fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
M
Miklos Szeredi 已提交
4446 4447 4448
				      new_is_dir, NULL, new_dentry);
		}
	}
A
Al Viro 已提交
4449
	release_dentry_name_snapshot(&old_name);
R
Robert Love 已提交
4450

L
Linus Torvalds 已提交
4451 4452
	return error;
}
4453
EXPORT_SYMBOL(vfs_rename);
L
Linus Torvalds 已提交
4454

4455 4456
/*
 * Common implementation of rename(2), renameat(2) and renameat2(2).
 *
 * Validates @flags, looks up both parent directories, locks them with
 * lock_rename(), looks up the source and target dentries, applies the
 * trailing-slash and ancestor checks and finally calls vfs_rename().
 * The locked section is repeated after a successful delegation break;
 * the whole lookup is repeated with LOOKUP_REVAL when retry_estale()
 * asks for it.
 */
static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
			const char __user *newname, unsigned int flags)
{
	const bool exchange = flags & RENAME_EXCHANGE;
	const bool noreplace = flags & RENAME_NOREPLACE;
	unsigned int target_flags = exchange ? 0 : LOOKUP_RENAME_TARGET;
	unsigned int lookup_flags = 0;
	struct inode *delegated_inode = NULL;
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
	struct path old_path, new_path;
	struct qstr old_last, new_last;
	int old_type, new_type;
	struct filename *from;
	struct filename *to;
	bool should_retry = false;
	int error;

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		return -EINVAL;

	/* RENAME_EXCHANGE cannot be combined with NOREPLACE or WHITEOUT. */
	if (exchange && (flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
		return -EINVAL;

	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
		return -EPERM;

retry:
	from = filename_parentat(olddfd, getname(oldname), lookup_flags,
				&old_path, &old_last, &old_type);
	if (IS_ERR(from))
		return PTR_ERR(from);

	to = filename_parentat(newdfd, getname(newname), lookup_flags,
				&new_path, &new_last, &new_type);
	if (IS_ERR(to)) {
		error = PTR_ERR(to);
		goto put_old_parent;
	}

	if (old_path.mnt != new_path.mnt) {
		error = -EXDEV;
		goto put_new_parent;
	}

	if (old_type != LAST_NORM) {
		error = -EBUSY;
		goto put_new_parent;
	}

	if (new_type != LAST_NORM) {
		error = noreplace ? -EEXIST : -EBUSY;
		goto put_new_parent;
	}

	error = mnt_want_write(old_path.mnt);
	if (error)
		goto put_new_parent;

retry_deleg:
	trap = lock_rename(new_path.dentry, old_path.dentry);

	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
	if (IS_ERR(old_dentry)) {
		error = PTR_ERR(old_dentry);
		goto unlock_parents;
	}

	/* source must exist */
	if (d_is_negative(old_dentry)) {
		error = -ENOENT;
		goto put_old_dentry;
	}

	new_dentry = __lookup_hash(&new_last, new_path.dentry,
				   lookup_flags | target_flags);
	if (IS_ERR(new_dentry)) {
		error = PTR_ERR(new_dentry);
		goto put_old_dentry;
	}

	if (noreplace && d_is_positive(new_dentry)) {
		error = -EEXIST;
		goto put_new_dentry;
	}

	if (exchange) {
		/* an exchange needs an existing target */
		if (d_is_negative(new_dentry)) {
			error = -ENOENT;
			goto put_new_dentry;
		}
		if (!d_is_dir(new_dentry) && new_last.name[new_last.len]) {
			error = -ENOTDIR;
			goto put_new_dentry;
		}
	}

	/* unless the source is a directory trailing slashes give -ENOTDIR */
	if (!d_is_dir(old_dentry)) {
		if (old_last.name[old_last.len] ||
		    (!exchange && new_last.name[new_last.len])) {
			error = -ENOTDIR;
			goto put_new_dentry;
		}
	}

	/* source should not be ancestor of target */
	if (old_dentry == trap) {
		error = -EINVAL;
		goto put_new_dentry;
	}

	/* target should not be an ancestor of source */
	if (new_dentry == trap) {
		error = exchange ? -EINVAL : -ENOTEMPTY;
		goto put_new_dentry;
	}

	error = security_path_rename(&old_path, old_dentry,
				     &new_path, new_dentry, flags);
	if (!error)
		error = vfs_rename(old_path.dentry->d_inode, old_dentry,
				   new_path.dentry->d_inode, new_dentry,
				   &delegated_inode, flags);

put_new_dentry:
	dput(new_dentry);
put_old_dentry:
	dput(old_dentry);
unlock_parents:
	unlock_rename(new_path.dentry, old_path.dentry);
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	mnt_drop_write(old_path.mnt);

put_new_parent:
	/* Decide now; the retry itself happens after all refs are dropped. */
	if (retry_estale(error, lookup_flags))
		should_retry = true;
	path_put(&new_path);
	putname(to);

put_old_parent:
	path_put(&old_path);
	putname(from);
	if (should_retry) {
		should_retry = false;
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}

	return error;
}

4598 4599 4600 4601 4602 4603
/* renameat2(2) entry point: forwards its arguments unchanged to do_renameat2(). */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, unsigned int, flags)
{
	return do_renameat2(olddfd, oldname, newdfd, newname, flags);
}

M
Miklos Szeredi 已提交
4604 4605 4606
/* renameat(2) entry point: do_renameat2() with no rename flags. */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	const unsigned int no_flags = 0;

	return do_renameat2(olddfd, oldname, newdfd, newname, no_flags);
}

4610
/* rename(2) entry point: do_renameat2() with AT_FDCWD for both directory fds and no flags. */
SYSCALL_DEFINE2(rename, const char __user *, oldname,
		const char __user *, newname)
{
	const unsigned int no_flags = 0;

	return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, no_flags);
}

M
Miklos Szeredi 已提交
4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628
int vfs_whiteout(struct inode *dir, struct dentry *dentry)
{
	int error = may_create(dir, dentry);
	if (error)
		return error;

	if (!dir->i_op->mknod)
		return -EPERM;

	return dir->i_op->mknod(dir, dentry,
				S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
}
EXPORT_SYMBOL(vfs_whiteout);

A
Al Viro 已提交
4629
/*
 * Copy the symlink body @link into the user buffer @buffer, truncating it
 * to @buflen bytes.  No terminating NUL is copied.
 *
 * If @link is an error pointer, that error is returned.  Otherwise returns
 * the number of bytes copied, or -EFAULT if copy_to_user() fails.
 */
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
	int len;

	if (IS_ERR(link))
		return PTR_ERR(link);

	len = strlen(link);
	if (len > (unsigned) buflen)
		len = buflen;

	return copy_to_user(buffer, link, len) ? -EFAULT : len;
}

4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656
/**
 * vfs_readlink - copy symlink body into userspace buffer
 * @dentry: dentry on which to get symbolic link
 * @buffer: user memory pointer
 * @buflen: size of buffer
 *
 * Does not touch atime.  That's up to the caller if necessary
 *
 * Does not call security hook.
 */
int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	struct inode *inode = d_inode(dentry);
	DEFINE_DELAYED_CALL(cleanup);
	const char *body;
	int copied;

	/*
	 * IOP_DEFAULT_READLINK not set yet: use the inode's own ->readlink
	 * if it has one, reject non-symlinks, otherwise set the flag (under
	 * i_lock) so this block is skipped next time.
	 */
	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
		if (unlikely(inode->i_op->readlink))
			return inode->i_op->readlink(dentry, buffer, buflen);

		if (!d_is_symlink(dentry))
			return -EINVAL;

		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_DEFAULT_READLINK;
		spin_unlock(&inode->i_lock);
	}

	/* Use inode->i_link when it is set, else ask ->get_link. */
	body = READ_ONCE(inode->i_link);
	if (!body) {
		body = inode->i_op->get_link(dentry, inode, &cleanup);
		if (IS_ERR(body))
			return PTR_ERR(body);
	}

	copied = readlink_copy(buffer, buflen, body);
	do_delayed_call(&cleanup);

	return copied;
}
EXPORT_SYMBOL(vfs_readlink);
L
Linus Torvalds 已提交
4684

M
Miklos Szeredi 已提交
4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709
/**
 * vfs_get_link - get symlink body
 * @dentry: dentry on which to get symbolic link
 * @done: caller needs to free returned data with this
 *
 * Calls security hook and i_op->get_link() on the supplied inode.
 *
 * It does not touch atime.  That's up to the caller if necessary.
 *
 * Does not work on "special" symlinks like /proc/$$/fd/N
 *
 * Returns ERR_PTR(-EINVAL) if @dentry is not a symlink, the security
 * hook's error as an error pointer if it refuses, otherwise whatever
 * ->get_link() returns.
 */
const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
{
	struct inode *inode = d_inode(dentry);
	int err;

	if (!d_is_symlink(dentry))
		return ERR_PTR(-EINVAL);

	err = security_inode_readlink(dentry);
	if (err)
		return ERR_PTR(err);

	return inode->i_op->get_link(dentry, inode, done);
}
EXPORT_SYMBOL(vfs_get_link);

L
Linus Torvalds 已提交
4710
/* get the link contents into pagecache */
4711
const char *page_get_link(struct dentry *dentry, struct inode *inode,
4712
			  struct delayed_call *callback)
L
Linus Torvalds 已提交
4713
{
4714 4715
	char *kaddr;
	struct page *page;
4716 4717
	struct address_space *mapping = inode->i_mapping;

4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730
	if (!dentry) {
		page = find_get_page(mapping, 0);
		if (!page)
			return ERR_PTR(-ECHILD);
		if (!PageUptodate(page)) {
			put_page(page);
			return ERR_PTR(-ECHILD);
		}
	} else {
		page = read_mapping_page(mapping, 0, NULL);
		if (IS_ERR(page))
			return (char*)page;
	}
4731
	set_delayed_call(callback, page_put_link, page);
4732 4733
	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
	kaddr = page_address(page);
4734
	nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
4735
	return kaddr;
L
Linus Torvalds 已提交
4736 4737
}

4738
EXPORT_SYMBOL(page_get_link);
L
Linus Torvalds 已提交
4739

4740
/*
 * Delayed-call callback installed by page_get_link(): @arg is the page
 * whose reference is dropped here.
 */
void page_put_link(void *arg)
{
	struct page *page = arg;

	put_page(page);
}
EXPORT_SYMBOL(page_put_link);
L
Linus Torvalds 已提交
4745

4746 4747
/*
 * Fetch the symlink body through page_get_link(), copy it to the user
 * buffer with readlink_copy(), then run the delayed call.  Returns what
 * readlink_copy() returns.
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	DEFINE_DELAYED_CALL(cleanup);
	const char *body = page_get_link(dentry, d_inode(dentry), &cleanup);
	int copied = readlink_copy(buffer, buflen, body);

	do_delayed_call(&cleanup);
	return copied;
}
EXPORT_SYMBOL(page_readlink);

4757 4758 4759 4760
/*
 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
 */
int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
L
Linus Torvalds 已提交
4761 4762
{
	struct address_space *mapping = inode->i_mapping;
4763
	struct page *page;
4764
	void *fsdata;
4765
	int err;
4766
	unsigned int flags = 0;
4767 4768
	if (nofs)
		flags |= AOP_FLAG_NOFS;
L
Linus Torvalds 已提交
4769

4770
retry:
4771
	err = pagecache_write_begin(NULL, mapping, 0, len-1,
4772
				flags, &page, &fsdata);
L
Linus Torvalds 已提交
4773
	if (err)
4774 4775
		goto fail;

4776
	memcpy(page_address(page), symname, len-1);
4777 4778 4779

	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
							page, fsdata);
L
Linus Torvalds 已提交
4780 4781
	if (err < 0)
		goto fail;
4782 4783 4784
	if (err < len-1)
		goto retry;

L
Linus Torvalds 已提交
4785 4786 4787 4788 4789
	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}
4790
EXPORT_SYMBOL(__page_symlink);
L
Linus Torvalds 已提交
4791

4792 4793 4794
int page_symlink(struct inode *inode, const char *symname, int len)
{
	return __page_symlink(inode, symname, len,
4795
			!mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
4796
}
4797
EXPORT_SYMBOL(page_symlink);
4798

4799
const struct inode_operations page_symlink_inode_operations = {
4800
	.get_link	= page_get_link,
L
Linus Torvalds 已提交
4801 4802
};
EXPORT_SYMBOL(page_symlink_inode_operations);