// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
19
#include <linux/export.h>
20
#include <linux/kernel.h>
L
Linus Torvalds 已提交
21 22 23 24
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
R
Robert Love 已提交
25
#include <linux/fsnotify.h>
L
Linus Torvalds 已提交
26 27
#include <linux/personality.h>
#include <linux/security.h>
M
Mimi Zohar 已提交
28
#include <linux/ima.h>
L
Linus Torvalds 已提交
29 30 31
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
32
#include <linux/capability.h>
33
#include <linux/file.h>
34
#include <linux/fcntl.h>
35
#include <linux/device_cgroup.h>
36
#include <linux/fs_struct.h>
37
#include <linux/posix_acl.h>
38
#include <linux/hash.h>
39
#include <linux/bitops.h>
40
#include <linux/init_task.h>
41
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
42

43
#include "internal.h"
44
#include "mount.h"
45

L
Linus Torvalds 已提交
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOs, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
124

A
Al Viro 已提交
125
/*
 * Copy limit used when the pathname is stored in the iname[] member of
 * struct filename: PATH_MAX minus the offset of iname[] inside the struct.
 */
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
126

127
struct filename *
128 129
getname_flags(const char __user *filename, int flags, int *empty)
{
A
Al Viro 已提交
130
	struct filename *result;
131
	char *kname;
A
Al Viro 已提交
132
	int len;
133

134 135 136 137
	result = audit_reusename(filename);
	if (result)
		return result;

138
	result = __getname();
139
	if (unlikely(!result))
140 141
		return ERR_PTR(-ENOMEM);

142 143 144 145
	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
A
Al Viro 已提交
146
	kname = (char *)result->iname;
147
	result->name = kname;
148

A
Al Viro 已提交
149
	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
150
	if (unlikely(len < 0)) {
A
Al Viro 已提交
151 152
		__putname(result);
		return ERR_PTR(len);
153
	}
154

155 156 157 158 159 160
	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
A
Al Viro 已提交
161
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
A
Al Viro 已提交
162
		const size_t size = offsetof(struct filename, iname[1]);
163 164
		kname = (char *)result;

A
Al Viro 已提交
165 166 167 168 169 170
		/*
		 * size is chosen that way we to guarantee that
		 * result->iname[0] is within the same object and that
		 * kname can't be equal to result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
A
Al Viro 已提交
171 172 173
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
174 175
		}
		result->name = kname;
A
Al Viro 已提交
176 177 178 179 180 181 182 183 184 185 186
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
187 188
	}

A
Al Viro 已提交
189
	result->refcnt = 1;
190 191 192
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
193
			*empty = 1;
A
Al Viro 已提交
194 195 196 197
		if (!(flags & LOOKUP_EMPTY)) {
			putname(result);
			return ERR_PTR(-ENOENT);
		}
L
Linus Torvalds 已提交
198
	}
199

200
	result->uptr = filename;
201
	result->aname = NULL;
202 203
	audit_getname(result);
	return result;
L
Linus Torvalds 已提交
204 205
}

206 207
/*
 * getname - getname_flags() with no flags and no "empty" reporting.
 *
 * Because LOOKUP_EMPTY is not passed, an empty pathname yields
 * ERR_PTR(-ENOENT).
 */
struct filename *
getname(const char __user * filename)
{
	return getname_flags(filename, 0, NULL);
}

212 213 214 215
struct filename *
getname_kernel(const char * filename)
{
	struct filename *result;
216
	int len = strlen(filename) + 1;
217 218 219 220 221

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

222
	if (len <= EMBEDDED_NAME_MAX) {
A
Al Viro 已提交
223
		result->name = (char *)result->iname;
224
	} else if (len <= PATH_MAX) {
225
		const size_t size = offsetof(struct filename, iname[1]);
226 227
		struct filename *tmp;

228
		tmp = kmalloc(size, GFP_KERNEL);
229 230 231 232 233 234 235 236 237 238 239
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
240 241
	result->uptr = NULL;
	result->aname = NULL;
242
	result->refcnt = 1;
243
	audit_getname(result);
244 245 246 247

	return result;
}

248
void putname(struct filename *name)
L
Linus Torvalds 已提交
249
{
250 251 252 253 254
	BUG_ON(name->refcnt <= 0);

	if (--name->refcnt > 0)
		return;

A
Al Viro 已提交
255
	if (name->name != name->iname) {
256 257 258 259
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
L
Linus Torvalds 已提交
260 261
}

262 263
static int check_acl(struct inode *inode, int mask)
{
264
#ifdef CONFIG_FS_POSIX_ACL
265 266 267
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
268 269
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
270
	                return -EAGAIN;
271
		/* no ->get_acl() calls in RCU mode... */
272
		if (is_uncached_acl(acl))
273
			return -ECHILD;
274
	        return posix_acl_permission(inode, acl, mask);
275 276
	}

C
Christoph Hellwig 已提交
277 278 279
	acl = get_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
280 281 282 283 284
	if (acl) {
	        int error = posix_acl_permission(inode, acl, mask);
	        posix_acl_release(acl);
	        return error;
	}
285
#endif
286 287 288 289

	return -EAGAIN;
}

290
/*
291 292 293 294
 * This does the basic UNIX permission checking.
 *
 * Note that the POSIX ACL check cares about the MAY_NOT_BLOCK bit,
 * for RCU walking.
L
Linus Torvalds 已提交
295
 */
296
static int acl_permission_check(struct inode *inode, int mask)
L
Linus Torvalds 已提交
297
{
298
	unsigned int mode = inode->i_mode;
L
Linus Torvalds 已提交
299

300 301 302
	/* Are we the owner? If so, ACL's don't matter */
	if (likely(uid_eq(current_fsuid(), inode->i_uid))) {
		mask &= 7;
L
Linus Torvalds 已提交
303
		mode >>= 6;
304 305
		return (mask & ~mode) ? -EACCES : 0;
	}
L
Linus Torvalds 已提交
306

307 308 309 310 311
	/* Do we have ACL's? */
	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
		int error = check_acl(inode, mask);
		if (error != -EAGAIN)
			return error;
L
Linus Torvalds 已提交
312 313
	}

314 315 316
	/* Only RWX matters for group/other mode bits */
	mask &= 7;

L
Linus Torvalds 已提交
317
	/*
318 319 320
	 * Are the group permissions different from
	 * the other permissions in the bits we care
	 * about? Need to check group ownership if so.
L
Linus Torvalds 已提交
321
	 */
322 323 324 325 326 327 328
	if (mask & (mode ^ (mode >> 3))) {
		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/* Bits in 'mode' clear that we require? */
	return (mask & ~mode) ? -EACCES : 0;
329 330 331
}

/**
332
 * generic_permission -  check for access rights on a Posix-like filesystem
333
 * @inode:	inode to check access rights for
334 335
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 *		%MAY_NOT_BLOCK ...)
336 337 338 339
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
340 341 342 343 344
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
345
 */
346
int generic_permission(struct inode *inode, int mask)
347 348 349 350
{
	int ret;

	/*
351
	 * Do the basic permission checks.
352
	 */
353
	ret = acl_permission_check(inode, mask);
354 355
	if (ret != -EACCES)
		return ret;
L
Linus Torvalds 已提交
356

357 358 359
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (!(mask & MAY_WRITE))
360 361
			if (capable_wrt_inode_uidgid(inode,
						     CAP_DAC_READ_SEARCH))
362
				return 0;
363
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
L
Linus Torvalds 已提交
364
			return 0;
365 366
		return -EACCES;
	}
L
Linus Torvalds 已提交
367 368 369 370

	/*
	 * Searching includes executable on directories, else just read.
	 */
371
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
372
	if (mask == MAY_READ)
373
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
L
Linus Torvalds 已提交
374
			return 0;
375 376 377 378 379 380 381 382
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
			return 0;
L
Linus Torvalds 已提交
383 384 385

	return -EACCES;
}
386
EXPORT_SYMBOL(generic_permission);
L
Linus Torvalds 已提交
387

388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407
/*
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has no special
 * permission function, use the fast case".
 *
 * If the inode does provide ->permission, that method is called instead
 * and the flag is left alone.
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	if (likely(inode->i_opflags & IOP_FASTPERM))
		return generic_permission(inode, mask);

	if (likely(inode->i_op->permission))
		return inode->i_op->permission(inode, mask);

	/* This gets set once for the inode lifetime */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_FASTPERM;
	spin_unlock(&inode->i_lock);

	return generic_permission(inode, mask);
}

D
David Howells 已提交
408 409 410
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
411
 * @inode: Inode to check permission on
D
David Howells 已提交
412 413 414 415 416 417 418 419 420 421
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	if (unlikely(mask & MAY_WRITE)) {
		umode_t mode = inode->i_mode;

		/* Nobody gets write access to a read-only fs. */
422
		if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
D
David Howells 已提交
423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445
			return -EROFS;
	}
	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct inode *inode, int mask)
{
	int retval;

	retval = sb_permission(inode->i_sb, inode, mask);
	if (retval)
		return retval;
446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471

	if (unlikely(mask & MAY_WRITE)) {
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EPERM;

		/*
		 * Updating mtime will likely cause i_uid and i_gid to be
		 * written back improperly if their true value is unknown
		 * to the vfs.
		 */
		if (HAS_UNMAPPED_ID(inode))
			return -EACCES;
	}

	retval = do_inode_permission(inode, mask);
	if (retval)
		return retval;

	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

	return security_inode_permission(inode, mask);
D
David Howells 已提交
472
}
473
EXPORT_SYMBOL(inode_permission);
D
David Howells 已提交
474

J
Jan Blunck 已提交
475 476 477 478 479 480
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
481
void path_get(const struct path *path)
J
Jan Blunck 已提交
482 483 484 485 486 487
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

J
Jan Blunck 已提交
488 489 490 491 492 493
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
494
void path_put(const struct path *path)
L
Linus Torvalds 已提交
495
{
J
Jan Blunck 已提交
496 497
	dput(path->dentry);
	mntput(path->mnt);
L
Linus Torvalds 已提交
498
}
J
Jan Blunck 已提交
499
EXPORT_SYMBOL(path_put);
L
Linus Torvalds 已提交
500

501
#define EMBEDDED_LEVELS 2
502 503
struct nameidata {
	struct path	path;
A
Al Viro 已提交
504
	struct qstr	last;
505 506
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
507
	unsigned int	flags, state;
508
	unsigned	seq, m_seq, r_seq;
509 510
	int		last_type;
	unsigned	depth;
511
	int		total_link_count;
512 513
	struct saved {
		struct path link;
514
		struct delayed_call done;
515
		const char *name;
516
		unsigned seq;
517
	} *stack, internal[EMBEDDED_LEVELS];
518 519 520 521
	struct filename	*name;
	struct nameidata *saved;
	unsigned	root_seq;
	int		dfd;
522 523
	kuid_t		dir_uid;
	umode_t		dir_mode;
524
} __randomize_layout;
525

526 527 528 529
/*
 * Bits kept in nameidata->state:
 *   ND_ROOT_PRESET  - nd->root is managed by the VFS user; legitimize_root()
 *                     and complete_walk() leave it alone.
 *   ND_ROOT_GRABBED - set by set_root() (ref-walk) and legitimize_root();
 *                     terminate_walk() then does path_put(&nd->root).
 *   ND_JUMPED       - set by nd_jump_root()/nd_jump_link(); complete_walk()
 *                     only considers ->d_weak_revalidate() when it is set.
 */
#define ND_ROOT_PRESET 1
#define ND_ROOT_GRABBED 2
#define ND_JUMPED 4

530
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
531
{
532 533
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;
534 535
	p->dfd = dfd;
	p->name = name;
536
	p->total_link_count = old ? old->total_link_count : 0;
537
	p->saved = old;
538
	p->state = 0;
539
	current->nameidata = p;
540 541
}

542
static void restore_nameidata(void)
543
{
544
	struct nameidata *now = current->nameidata, *old = now->saved;
545 546 547 548

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
549
	if (now->stack != now->internal)
550
		kfree(now->stack);
551 552
}

553
static bool nd_alloc_stack(struct nameidata *nd)
554
{
A
Al Viro 已提交
555 556
	struct saved *p;

557 558 559 560
	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
			 nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
	if (unlikely(!p))
		return false;
561 562
	memcpy(p, nd->internal, sizeof(nd->internal));
	nd->stack = p;
563
	return true;
564 565
}

566
/**
567
 * path_connected - Verify that a dentry is below mnt.mnt_root
568 569 570 571
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 */
572
static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
573
{
574
	struct super_block *sb = mnt->mnt_sb;
575

576 577
	/* Bind mounts can have disconnected paths */
	if (mnt->mnt_root == sb->s_root)
578 579
		return true;

580
	return is_subdir(dentry, mnt->mnt_root);
581 582
}

583 584 585 586 587
/*
 * Run and then clear the delayed call of every link stack entry in use,
 * from the most recently pushed entry down to the first.  nd->depth itself
 * is not modified.
 */
static void drop_links(struct nameidata *nd)
{
	int idx;

	for (idx = nd->depth; idx--; ) {
		struct saved *entry = &nd->stack[idx];

		do_delayed_call(&entry->done);
		clear_delayed_call(&entry->done);
	}
}

static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);
	if (!(nd->flags & LOOKUP_RCU)) {
		int i;
		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
601
		if (nd->state & ND_ROOT_GRABBED) {
602
			path_put(&nd->root);
603
			nd->state &= ~ND_ROOT_GRABBED;
604
		}
605 606 607 608 609 610 611 612
	} else {
		nd->flags &= ~LOOKUP_RCU;
		rcu_read_unlock();
	}
	nd->depth = 0;
}

/* path_put is needed afterwards regardless of success or failure */
613
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
614
{
615
	int res = __legitimize_mnt(path->mnt, mseq);
616 617 618 619 620 621 622 623 624 625 626 627 628
	if (unlikely(res)) {
		if (res > 0)
			path->mnt = NULL;
		path->dentry = NULL;
		return false;
	}
	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		path->dentry = NULL;
		return false;
	}
	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

629 630 631
static inline bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
A
Al Viro 已提交
632
	return __legitimize_path(path, seq, nd->m_seq);
633 634
}

635 636 637 638 639 640 641 642 643 644 645 646 647 648
/*
 * Legitimize every link on the stack, oldest first.
 *
 * On the first failure all delayed calls are run via drop_links() and
 * nd->depth is cut down to include the failing entry but nothing after it,
 * so that later cleanup only visits entries that were actually processed.
 * Returns true if every entry was legitimized.
 */
static bool legitimize_links(struct nameidata *nd)
{
	int i;

	for (i = 0; i < nd->depth; i++) {
		struct saved *entry = &nd->stack[i];

		if (likely(legitimize_path(nd, &entry->link, entry->seq)))
			continue;

		drop_links(nd);
		nd->depth = i + 1;
		return false;
	}
	return true;
}

649 650
static bool legitimize_root(struct nameidata *nd)
{
651 652 653 654 655 656 657 658
	/*
	 * For scoped-lookups (where nd->root has been zeroed), we need to
	 * restart the whole lookup from scratch -- because set_root() is wrong
	 * for these lookups (nd->dfd is the root, not the filesystem root).
	 */
	if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED))
		return false;
	/* Nothing to do if nd->root is zero or is managed by the VFS user. */
659
	if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
660
		return true;
661
	nd->state |= ND_ROOT_GRABBED;
662 663 664
	return legitimize_path(nd, &nd->root, nd->root_seq);
}

A
Al Viro 已提交
665
/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
677
 * try_to_unlazy - try to switch to ref-walk mode.
A
Al Viro 已提交
678
 * @nd: nameidata pathwalk data
679
 * Returns: true on success, false on failure
N
Nick Piggin 已提交
680
 *
681
 * try_to_unlazy attempts to legitimize the current nd->path and nd->root
A
Al Viro 已提交
682 683
 * for ref-walk mode.
 * Must be called from rcu-walk context.
684
 * Nothing should touch nameidata between try_to_unlazy() failure and
685
 * terminate_walk().
N
Nick Piggin 已提交
686
 */
687
static bool try_to_unlazy(struct nameidata *nd)
N
Nick Piggin 已提交
688 689 690 691
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
692

A
Al Viro 已提交
693 694 695
	nd->flags &= ~LOOKUP_RCU;
	if (unlikely(!legitimize_links(nd)))
		goto out1;
696 697
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out;
698 699
	if (unlikely(!legitimize_root(nd)))
		goto out;
A
Al Viro 已提交
700 701
	rcu_read_unlock();
	BUG_ON(nd->inode != parent->d_inode);
702
	return true;
A
Al Viro 已提交
703

704
out1:
A
Al Viro 已提交
705 706 707 708
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out:
	rcu_read_unlock();
709
	return false;
A
Al Viro 已提交
710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728
}

/**
 * unlazy_child - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry
 * @seq: seq number to check dentry against
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between unlazy_child() failure and
 * terminate_walk().
 */
static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
	BUG_ON(!(nd->flags & LOOKUP_RCU));

729
	nd->flags &= ~LOOKUP_RCU;
730 731 732 733
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
		goto out2;
A
Al Viro 已提交
734
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
735
		goto out1;
A
Al Viro 已提交
736

737
	/*
A
Al Viro 已提交
738 739 740 741 742
	 * We need to move both the parent and the dentry from the RCU domain
	 * to be properly refcounted. And the sequence number in the dentry
	 * validates *both* dentry counters, since we checked the sequence
	 * number of the parent after we got the child sequence number. So we
	 * know the parent must still be valid if the child sequence number is
743
	 */
A
Al Viro 已提交
744 745
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
746 747
	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
		goto out_dput;
748 749 750 751
	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
752 753
	if (unlikely(!legitimize_root(nd)))
		goto out_dput;
A
Al Viro 已提交
754
	rcu_read_unlock();
N
Nick Piggin 已提交
755
	return 0;
A
Al Viro 已提交
756

757 758 759 760
out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
761
out:
A
Al Viro 已提交
762
	rcu_read_unlock();
763 764 765 766
	return -ECHILD;
out_dput:
	rcu_read_unlock();
	dput(dentry);
N
Nick Piggin 已提交
767 768 769
	return -ECHILD;
}

770
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
771
{
772 773 774 775
	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
		return dentry->d_op->d_revalidate(dentry, flags);
	else
		return 1;
776 777
}

778 779 780
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
781
 *
782 783 784 785 786
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
787
 */
788
static int complete_walk(struct nameidata *nd)
789
{
A
Al Viro 已提交
790
	struct dentry *dentry = nd->path.dentry;
791 792
	int status;

793
	if (nd->flags & LOOKUP_RCU) {
794 795 796 797
		/*
		 * We don't want to zero nd->root for scoped-lookups or
		 * externally-managed nd->root.
		 */
798 799 800
		if (!(nd->state & ND_ROOT_PRESET))
			if (!(nd->flags & LOOKUP_IS_SCOPED))
				nd->root.mnt = NULL;
801
		if (!try_to_unlazy(nd))
802 803 804
			return -ECHILD;
	}

805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825
	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
		/*
		 * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
		 * ever step outside the root during lookup" and should already
		 * be guaranteed by the rest of namei, we want to avoid a namei
		 * BUG resulting in userspace being given a path that was not
		 * scoped within the root at some point during the lookup.
		 *
		 * So, do a final sanity-check to make sure that in the
		 * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
		 * we won't silently return an fd completely outside of the
		 * requested root to userspace.
		 *
		 * Userspace could move the path outside the root after this
		 * check, but as discussed elsewhere this is not a concern (the
		 * resolved file was inside the root at some point).
		 */
		if (!path_is_under(&nd->path, &nd->root))
			return -EXDEV;
	}

826
	if (likely(!(nd->state & ND_JUMPED)))
A
Al Viro 已提交
827 828
		return 0;

829
	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
830 831
		return 0;

832
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
833 834 835
	if (status > 0)
		return 0;

A
Al Viro 已提交
836
	if (!status)
837
		status = -ESTALE;
A
Al Viro 已提交
838

839 840 841
	return status;
}

842
static int set_root(struct nameidata *nd)
N
Nick Piggin 已提交
843
{
844
	struct fs_struct *fs = current->fs;
N
Nick Piggin 已提交
845

846 847 848 849 850 851 852 853
	/*
	 * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
	 * still have to ensure it doesn't happen because it will cause a breakout
	 * from the dirfd.
	 */
	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
		return -ENOTRECOVERABLE;

854 855 856 857 858 859 860 861 862 863
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
864
		nd->state |= ND_ROOT_GRABBED;
865
	}
866
	return 0;
N
Nick Piggin 已提交
867 868
}

869 870
static int nd_jump_root(struct nameidata *nd)
{
871 872
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return -EXDEV;
873 874 875 876 877
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		/* Absolute path arguments to path_init() are allowed. */
		if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
			return -EXDEV;
	}
878 879 880 881 882
	if (!nd->root.mnt) {
		int error = set_root(nd);
		if (error)
			return error;
	}
883 884 885 886 887 888 889 890 891 892 893 894 895 896
	if (nd->flags & LOOKUP_RCU) {
		struct dentry *d;
		nd->path = nd->root;
		d = nd->path.dentry;
		nd->inode = d->d_inode;
		nd->seq = nd->root_seq;
		if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
			return -ECHILD;
	} else {
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	}
897
	nd->state |= ND_JUMPED;
898 899 900
	return 0;
}

C
Christoph Hellwig 已提交
901
/*
902
 * Helper to directly jump to a known parsed path from ->get_link,
C
Christoph Hellwig 已提交
903 904
 * caller must have taken a reference to path beforehand.
 */
905
int nd_jump_link(struct path *path)
C
Christoph Hellwig 已提交
906
{
907
	int error = -ELOOP;
908
	struct nameidata *nd = current->nameidata;
C
Christoph Hellwig 已提交
909

910 911 912
	if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
		goto err;

913 914 915 916 917
	error = -EXDEV;
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		if (nd->path.mnt != path->mnt)
			goto err;
	}
918 919 920
	/* Not currently safe for scoped-lookups. */
	if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
		goto err;
921

922
	path_put(&nd->path);
C
Christoph Hellwig 已提交
923 924
	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
925
	nd->state |= ND_JUMPED;
926
	return 0;
927 928 929 930

err:
	path_put(path);
	return error;
C
Christoph Hellwig 已提交
931 932
}

933
static inline void put_link(struct nameidata *nd)
934
{
A
Al Viro 已提交
935
	struct saved *last = nd->stack + --nd->depth;
936
	do_delayed_call(&last->done);
A
Al Viro 已提交
937 938
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
939 940
}

941 942
/*
 * Link-protection switches, both initialised to 0.
 * may_follow_link() applies no restriction while
 * sysctl_protected_symlinks is 0.
 */
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
943 944
/*
 * Protection switches for FIFOs and regular files; no explicit initialiser,
 * so they start out as 0.
 */
int sysctl_protected_fifos __read_mostly;
int sysctl_protected_regular __read_mostly;
K
Kees Cook 已提交
945 946 947

/**
 * may_follow_link - Check symlink following for unsafe situations
948
 * @nd: nameidata pathwalk data
K
Kees Cook 已提交
949 950 951 952 953 954 955 956 957 958 959 960
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
961
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
K
Kees Cook 已提交
962 963 964 965 966
{
	if (!sysctl_protected_symlinks)
		return 0;

	/* Allowed if owner and follower match. */
967
	if (uid_eq(current_cred()->fsuid, inode->i_uid))
K
Kees Cook 已提交
968 969 970
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
971
	if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
K
Kees Cook 已提交
972 973 974
		return 0;

	/* Allowed if parent directory and link owner match. */
975
	if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid))
K
Kees Cook 已提交
976 977
		return 0;

978 979 980
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

981
	audit_inode(nd->name, nd->stack[0].link.dentry, 0);
K
Kees Cook 已提交
982
	audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
K
Kees Cook 已提交
983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028
	return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @inode: the source inode to hardlink from
 *
 * Return false if at least one of the following conditions:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct inode *inode)
{
	const umode_t mode = inode->i_mode;
	const umode_t sgid_exec = S_ISGID | S_IXGRP;

	/*
	 * None of these should get pinned to the filesystem by a stranger:
	 * special files, setuid files, and setgid files that are also
	 * group-executable.
	 */
	if (!S_ISREG(mode) || (mode & S_ISUID) ||
	    (mode & sgid_exec) == sgid_exec)
		return false;

	/* A source the caller can't both read and write is dangerous too. */
	return !inode_permission(inode, MAY_READ | MAY_WRITE);
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
1029
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
K
Kees Cook 已提交
1030 1031 1032
 *
 * Returns 0 if successful, -ve on error.
 */
1033
int may_linkat(struct path *link)
K
Kees Cook 已提交
1034
{
1035 1036 1037 1038 1039
	struct inode *inode = link->dentry->d_inode;

	/* Inode writeback is not safe when the uid or gid are invalid. */
	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
		return -EOVERFLOW;
K
Kees Cook 已提交
1040 1041 1042 1043 1044 1045 1046

	if (!sysctl_protected_hardlinks)
		return 0;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
1047
	if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
K
Kees Cook 已提交
1048 1049
		return 0;

K
Kees Cook 已提交
1050
	audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
K
Kees Cook 已提交
1051 1052 1053
	return -EPERM;
}

1054 1055 1056 1057
/**
 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
 *			  should be allowed, or not, on files that already
 *			  exist.
1058 1059
 * @dir_mode: mode bits of directory
 * @dir_uid: owner of directory
1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074
 * @inode: the inode of the file to open
 *
 * Block an O_CREAT open of a FIFO (or a regular file) when:
 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
 *   - the file already exists
 *   - we are in a sticky directory
 *   - we don't own the file
 *   - the owner of the directory doesn't own the file
 *   - the directory is world writable
 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
 * the directory doesn't have to be world writable: being group writable will
 * be enough.
 *
 * Returns 0 if the open is allowed, -ve on error.
 */
1075
static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
1076 1077 1078 1079
				struct inode * const inode)
{
	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
	    (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
1080 1081
	    likely(!(dir_mode & S_ISVTX)) ||
	    uid_eq(inode->i_uid, dir_uid) ||
1082 1083 1084
	    uid_eq(current_fsuid(), inode->i_uid))
		return 0;

1085 1086
	if (likely(dir_mode & 0002) ||
	    (dir_mode & 0020 &&
1087 1088
	     ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
	      (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
K
Kees Cook 已提交
1089 1090 1091 1092
		const char *operation = S_ISFIFO(inode->i_mode) ?
					"sticky_create_fifo" :
					"sticky_create_regular";
		audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
1093 1094 1095 1096 1097
		return -EACCES;
	}
	return 0;
}

1098 1099 1100 1101 1102 1103 1104 1105 1106 1107
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
A
Al Viro 已提交
1108
int follow_up(struct path *path)
L
Linus Torvalds 已提交
1109
{
1110 1111
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
L
Linus Torvalds 已提交
1112
	struct dentry *mountpoint;
N
Nick Piggin 已提交
1113

A
Al Viro 已提交
1114
	read_seqlock_excl(&mount_lock);
1115
	parent = mnt->mnt_parent;
A
Al Viro 已提交
1116
	if (parent == mnt) {
A
Al Viro 已提交
1117
		read_sequnlock_excl(&mount_lock);
L
Linus Torvalds 已提交
1118 1119
		return 0;
	}
1120
	mntget(&parent->mnt);
1121
	mountpoint = dget(mnt->mnt_mountpoint);
A
Al Viro 已提交
1122
	read_sequnlock_excl(&mount_lock);
A
Al Viro 已提交
1123 1124 1125
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
1126
	path->mnt = &parent->mnt;
L
Linus Torvalds 已提交
1127 1128
	return 1;
}
1129
EXPORT_SYMBOL(follow_up);
L
Linus Torvalds 已提交
1130

A
Al Viro 已提交
1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150
/*
 * Starting from mount @m, climb towards the parent mounts looking for the
 * place ".." should land on when leaving a mount root.
 *
 * A level is skipped when the mountpoint is itself the root of the parent
 * mount.  The climb stops without a result when the mountpoint coincides
 * with @root or when @m has no parent left.
 *
 * On success @path is filled with bare pointers (no references are taken
 * here) and *@seqp with the sampled d_seq of the mountpoint dentry.
 *
 * Returns true if a mountpoint was found, false otherwise.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
				  struct path *path, unsigned *seqp)
{
	while (mnt_has_parent(m)) {
		struct dentry *mountpoint = m->mnt_mountpoint;

		m = m->mnt_parent;
		/* never climb past the walk's root */
		if (unlikely(root->dentry == mountpoint &&
			     root->mnt == &m->mnt))
			break;
		/* mounted on the parent's own root: keep climbing */
		if (mountpoint != m->mnt.mnt_root) {
			path->mnt = &m->mnt;
			path->dentry = mountpoint;
			*seqp = read_seqcount_begin(&mountpoint->d_seq);
			return true;
		}
	}
	return false;
}

1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175
/*
 * Variant of choose_mountpoint_rcu() for callers outside RCU mode: the
 * search runs under rcu_read_lock() and is repeated until its outcome
 * can be trusted.
 *
 * A negative result is accepted only if mount_lock did not change during
 * the search.  A positive result is accepted once __legitimize_path()
 * succeeds (presumably that is what turns @path into a referenced path -
 * confirm against its definition); if it fails, @path is released with
 * path_put() outside the RCU section and the search is retried.
 *
 * Returns true with @path filled in, or false if there is nowhere to go.
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
			      struct path *path)
{
	bool found;

	rcu_read_lock();
	while (1) {
		unsigned seq, mseq = read_seqbegin(&mount_lock);

		found = choose_mountpoint_rcu(m, root, path, &seq);
		if (unlikely(!found)) {
			/* trust "not found" only on a stable mount_lock */
			if (!read_seqretry(&mount_lock, mseq))
				break;
		} else {
			if (likely(__legitimize_path(path, seq, mseq)))
				break;
			/* path_put() is done outside the RCU section */
			rcu_read_unlock();
			path_put(path);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
	return found;
}

N
Nick Piggin 已提交
1176
/*
1177 1178 1179
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
L
Linus Torvalds 已提交
1180
 */
1181
static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
N
Nick Piggin 已提交
1182
{
1183
	struct dentry *dentry = path->dentry;
1184

1185 1186 1187 1188 1189 1190 1191 1192 1193 1194
	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
1195
	 */
1196
	if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1197
			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1198
	    dentry->d_inode)
1199
		return -EISDIR;
1200

1201
	if (count && (*count)++ >= MAXSYMLINKS)
1202 1203
		return -ELOOP;

1204
	return finish_automount(dentry->d_op->d_automount(path), path);
A
Al Viro 已提交
1205 1206
}

1207
/*
A
Al Viro 已提交
1208 1209 1210 1211
 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
 * dentries are pinned but not locked here, so negative dentry can go
 * positive right under us.  Use of smp_load_acquire() provides a barrier
 * sufficient for ->d_inode and ->d_flags consistency.
1212
 */
A
Al Viro 已提交
1213 1214
static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
			     int *count, unsigned lookup_flags)
L
Linus Torvalds 已提交
1215
{
A
Al Viro 已提交
1216
	struct vfsmount *mnt = path->mnt;
1217
	bool need_mntput = false;
1218
	int ret = 0;
1219

A
Al Viro 已提交
1220
	while (flags & DCACHE_MANAGED_DENTRY) {
1221 1222
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
1223
		if (flags & DCACHE_MANAGE_TRANSIT) {
1224
			ret = path->dentry->d_op->d_manage(path, false);
1225
			flags = smp_load_acquire(&path->dentry->d_flags);
1226
			if (ret < 0)
1227
				break;
1228 1229
		}

A
Al Viro 已提交
1230
		if (flags & DCACHE_MOUNTED) {	// something's mounted on it..
1231
			struct vfsmount *mounted = lookup_mnt(path);
A
Al Viro 已提交
1232
			if (mounted) {		// ... in our namespace
1233 1234 1235 1236 1237
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
A
Al Viro 已提交
1238 1239
				// here we know it's positive
				flags = path->dentry->d_flags;
1240 1241 1242 1243 1244
				need_mntput = true;
				continue;
			}
		}

A
Al Viro 已提交
1245 1246
		if (!(flags & DCACHE_NEED_AUTOMOUNT))
			break;
1247

A
Al Viro 已提交
1248 1249 1250 1251 1252
		// uncovered automount point
		ret = follow_automount(path, count, lookup_flags);
		flags = smp_load_acquire(&path->dentry->d_flags);
		if (ret < 0)
			break;
L
Linus Torvalds 已提交
1253
	}
1254

A
Al Viro 已提交
1255 1256 1257 1258 1259 1260
	if (ret == -EISDIR)
		ret = 0;
	// possible if you race with several mount --move
	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	if (!ret && unlikely(d_flags_negative(flags)))
1261
		ret = -ENOENT;
A
Al Viro 已提交
1262
	*jumped = need_mntput;
1263
	return ret;
L
Linus Torvalds 已提交
1264 1265
}

A
Al Viro 已提交
1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280
/*
 * Inline front end of mount traversal.  Samples ->d_flags once; a managed
 * dentry is handed to __traverse_mounts(), anything else is settled here.
 */
static inline int traverse_mounts(struct path *path, bool *jumped,
				  int *count, unsigned lookup_flags)
{
	unsigned d_flags = smp_load_acquire(&path->dentry->d_flags);

	/* slow path: mounts, automounts or ->d_manage() to deal with */
	if (unlikely(d_flags & DCACHE_MANAGED_DENTRY))
		return __traverse_mounts(path, d_flags, jumped, count,
					 lookup_flags);

	/* fastpath: nothing to cross, only negativity left to report */
	*jumped = false;
	return unlikely(d_flags_negative(d_flags)) ? -ENOENT : 0;
}

1281
int follow_down_one(struct path *path)
L
Linus Torvalds 已提交
1282 1283 1284
{
	struct vfsmount *mounted;

A
Al Viro 已提交
1285
	mounted = lookup_mnt(path);
L
Linus Torvalds 已提交
1286
	if (mounted) {
A
Al Viro 已提交
1287 1288 1289 1290
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
L
Linus Torvalds 已提交
1291 1292 1293 1294
		return 1;
	}
	return 0;
}
1295
EXPORT_SYMBOL(follow_down_one);
L
Linus Torvalds 已提交
1296

A
Al Viro 已提交
1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313
/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 */
int follow_down(struct path *path)
{
	struct vfsmount *start_mnt = path->mnt;
	bool crossed;	/* required by traverse_mounts(), not used here */
	int err;

	/* no link counter and no lookup flags for this caller */
	err = traverse_mounts(path, &crossed, NULL, 0);

	/* we ended up on another mount: the caller's old one gets released */
	if (start_mnt != path->mnt)
		mntput(start_mnt);

	return err;
}
EXPORT_SYMBOL(follow_down);

1314
/*
1315 1316
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
1317 1318
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1319
			       struct inode **inode, unsigned *seqp)
1320
{
A
Al Viro 已提交
1321 1322 1323 1324 1325 1326 1327 1328 1329
	struct dentry *dentry = path->dentry;
	unsigned int flags = dentry->d_flags;

	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
		return true;

	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
		return false;

1330 1331 1332 1333 1334
	for (;;) {
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
A
Al Viro 已提交
1335 1336 1337 1338 1339
		if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
			int res = dentry->d_op->d_manage(path, true);
			if (res)
				return res == -EISDIR;
			flags = dentry->d_flags;
1340
		}
1341

A
Al Viro 已提交
1342 1343 1344 1345 1346
		if (flags & DCACHE_MOUNTED) {
			struct mount *mounted = __lookup_mnt(path->mnt, dentry);
			if (mounted) {
				path->mnt = &mounted->mnt;
				dentry = path->dentry = mounted->mnt.mnt_root;
1347
				nd->state |= ND_JUMPED;
A
Al Viro 已提交
1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362
				*seqp = read_seqcount_begin(&dentry->d_seq);
				*inode = dentry->d_inode;
				/*
				 * We don't need to re-check ->d_seq after this
				 * ->d_inode read - there will be an RCU delay
				 * between mount hash removal and ->mnt_root
				 * becoming unpinned.
				 */
				flags = dentry->d_flags;
				continue;
			}
			if (read_seqretry(&mount_lock, nd->m_seq))
				return false;
		}
		return !(flags & DCACHE_NEED_AUTOMOUNT);
1363
	}
1364 1365
}

1366 1367 1368
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
			  struct path *path, struct inode **inode,
			  unsigned int *seqp)
1369
{
A
Al Viro 已提交
1370
	bool jumped;
1371
	int ret;
1372

1373 1374
	path->mnt = nd->path.mnt;
	path->dentry = dentry;
1375 1376 1377 1378 1379
	if (nd->flags & LOOKUP_RCU) {
		unsigned int seq = *seqp;
		if (unlikely(!*inode))
			return -ENOENT;
		if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
A
Al Viro 已提交
1380
			return 0;
1381 1382 1383 1384 1385 1386
		if (unlazy_child(nd, dentry, seq))
			return -ECHILD;
		// *path might've been clobbered by __follow_mount_rcu()
		path->mnt = nd->path.mnt;
		path->dentry = dentry;
	}
A
Al Viro 已提交
1387 1388 1389 1390 1391
	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
	if (jumped) {
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			ret = -EXDEV;
		else
1392
			nd->state |= ND_JUMPED;
A
Al Viro 已提交
1393 1394 1395 1396 1397 1398
	}
	if (unlikely(ret)) {
		dput(path->dentry);
		if (path->mnt != nd->path.mnt)
			mntput(path->mnt);
	} else {
1399 1400 1401 1402 1403 1404
		*inode = d_backing_inode(path->dentry);
		*seqp = 0; /* out of RCU mode, so the value doesn't matter */
	}
	return ret;
}

1405
/*
 * This looks up the name in dcache and possibly revalidates the found dentry.
 * NULL is returned if the dentry does not exist in the cache.
 *
 * If d_revalidate() returns 0 the dentry is invalidated and ERR_PTR(0) is
 * returned; a negative result is returned as ERR_PTR(error).  In both cases
 * the reference obtained from d_lookup() is dropped first.
 */
static struct dentry *lookup_dcache(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
{
	struct dentry *dentry = d_lookup(dir, name);
	if (dentry) {
		int error = d_revalidate(dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error)
				d_invalidate(dentry);
			dput(dentry);
			return ERR_PTR(error);
		}
	}
	return dentry;
}

1426
/*
1427 1428 1429 1430 1431
 * Parent directory has inode locked exclusive.  This is one
 * and only case when ->lookup() gets called on non in-lookup
 * dentries - as the matter of fact, this only gets called
 * when directory is guaranteed to have no in-lookup children
 * at all.
1432
 */
1433
static struct dentry *__lookup_hash(const struct qstr *name,
1434
		struct dentry *base, unsigned int flags)
1435
{
1436
	struct dentry *dentry = lookup_dcache(name, base, flags);
1437 1438
	struct dentry *old;
	struct inode *dir = base->d_inode;
1439

1440
	if (dentry)
M
Miklos Szeredi 已提交
1441
		return dentry;
1442

1443 1444 1445 1446
	/* Don't create child dentry for a dead directory. */
	if (unlikely(IS_DEADDIR(dir)))
		return ERR_PTR(-ENOENT);

1447 1448 1449 1450
	dentry = d_alloc(base, name);
	if (unlikely(!dentry))
		return ERR_PTR(-ENOMEM);

1451 1452 1453 1454 1455 1456
	old = dir->i_op->lookup(dir, dentry, flags);
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
	return dentry;
1457 1458
}

1459 1460 1461
static struct dentry *lookup_fast(struct nameidata *nd,
				  struct inode **inode,
			          unsigned *seqp)
L
Linus Torvalds 已提交
1462
{
N
Nick Piggin 已提交
1463
	struct dentry *dentry, *parent = nd->path.dentry;
A
Al Viro 已提交
1464
	int status = 1;
1465

1466 1467
	/*
	 * Rename seqlock is not required here because in the off chance
A
Al Viro 已提交
1468 1469
	 * of a false negative due to a concurrent rename, the caller is
	 * going to fall back to non-racy lookup.
1470
	 */
N
Nick Piggin 已提交
1471 1472
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
1473
		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
A
Al Viro 已提交
1474
		if (unlikely(!dentry)) {
1475
			if (!try_to_unlazy(nd))
1476 1477
				return ERR_PTR(-ECHILD);
			return NULL;
A
Al Viro 已提交
1478
		}
A
Al Viro 已提交
1479

1480 1481 1482 1483
		/*
		 * This sequence count validates that the inode matches
		 * the dentry name information from lookup.
		 */
1484
		*inode = d_backing_inode(dentry);
A
Al Viro 已提交
1485
		if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
1486
			return ERR_PTR(-ECHILD);
1487 1488 1489 1490 1491 1492 1493 1494

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 *
		 * The memory barrier in read_seqcount_begin of child is
		 *  enough, we can use __read_seqcount_retry here.
		 */
A
Al Viro 已提交
1495
		if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
1496
			return ERR_PTR(-ECHILD);
A
Al Viro 已提交
1497

1498
		*seqp = seq;
1499
		status = d_revalidate(dentry, nd->flags);
1500
		if (likely(status > 0))
1501
			return dentry;
A
Al Viro 已提交
1502
		if (unlazy_child(nd, dentry, seq))
1503
			return ERR_PTR(-ECHILD);
1504 1505 1506
		if (unlikely(status == -ECHILD))
			/* we'd been told to redo it in non-rcu mode */
			status = d_revalidate(dentry, nd->flags);
A
Al Viro 已提交
1507
	} else {
A
Al Viro 已提交
1508
		dentry = __d_lookup(parent, &nd->last);
A
Al Viro 已提交
1509
		if (unlikely(!dentry))
1510
			return NULL;
1511
		status = d_revalidate(dentry, nd->flags);
1512
	}
A
Al Viro 已提交
1513
	if (unlikely(status <= 0)) {
1514
		if (!status)
A
Al Viro 已提交
1515
			d_invalidate(dentry);
1516
		dput(dentry);
1517
		return ERR_PTR(status);
1518
	}
1519
	return dentry;
M
Miklos Szeredi 已提交
1520 1521 1522
}

/* Fast lookup failed, do it the slow way */
A
Al Viro 已提交
1523 1524 1525
static struct dentry *__lookup_slow(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
M
Miklos Szeredi 已提交
1526
{
A
Al Viro 已提交
1527
	struct dentry *dentry, *old;
1528
	struct inode *inode = dir->d_inode;
1529
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1530 1531

	/* Don't go there if it's already dead */
A
Al Viro 已提交
1532
	if (unlikely(IS_DEADDIR(inode)))
A
Al Viro 已提交
1533
		return ERR_PTR(-ENOENT);
A
Al Viro 已提交
1534
again:
1535
	dentry = d_alloc_parallel(dir, name, &wq);
A
Al Viro 已提交
1536
	if (IS_ERR(dentry))
A
Al Viro 已提交
1537
		return dentry;
A
Al Viro 已提交
1538
	if (unlikely(!d_in_lookup(dentry))) {
1539 1540 1541 1542
		int error = d_revalidate(dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error) {
				d_invalidate(dentry);
1543
				dput(dentry);
1544
				goto again;
1545
			}
1546 1547
			dput(dentry);
			dentry = ERR_PTR(error);
1548
		}
A
Al Viro 已提交
1549 1550 1551 1552 1553 1554
	} else {
		old = inode->i_op->lookup(inode, dentry, flags);
		d_lookup_done(dentry);
		if (unlikely(old)) {
			dput(dentry);
			dentry = old;
1555 1556
		}
	}
1557
	return dentry;
L
Linus Torvalds 已提交
1558 1559
}

A
Al Viro 已提交
1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571
/* Run __lookup_slow() with the directory's inode lock held shared. */
static struct dentry *lookup_slow(const struct qstr *name,
				  struct dentry *dir,
				  unsigned int flags)
{
	struct inode *dir_inode = dir->d_inode;
	struct dentry *found;

	inode_lock_shared(dir_inode);
	found = __lookup_slow(name, dir, flags);
	inode_unlock_shared(dir_inode);

	return found;
}

1572 1573 1574
static inline int may_lookup(struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU) {
1575
		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1576
		if (err != -ECHILD || !try_to_unlazy(nd))
1577 1578
			return err;
	}
1579
	return inode_permission(nd->inode, MAY_EXEC);
1580 1581
}

1582 1583 1584 1585
static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
{
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
		return -ELOOP;
1586 1587 1588 1589 1590

	if (likely(nd->depth != EMBEDDED_LEVELS))
		return 0;
	if (likely(nd->stack != nd->internal))
		return 0;
1591
	if (likely(nd_alloc_stack(nd)))
1592
		return 0;
1593 1594 1595 1596

	if (nd->flags & LOOKUP_RCU) {
		// we need to grab link before we do unlazy.  And we can't skip
		// unlazy even if we fail to grab the link - cleanup needs it
1597
		bool grabbed_link = legitimize_path(nd, link, seq);
1598

1599
		if (!try_to_unlazy(nd) != 0 || !grabbed_link)
1600 1601 1602 1603
			return -ECHILD;

		if (nd_alloc_stack(nd))
			return 0;
1604
	}
1605
	return -ENOMEM;
1606 1607
}

1608 1609
/*
 * Flag bits for walk_component(), step_into() and pick_link():
 *  WALK_TRAILING - a symlink is followed only if LOOKUP_FOLLOW is set
 *		    (step_into()); may_follow_link() is applied to it
 *		    (pick_link()).
 *  WALK_MORE     - walk_component() does not put_link() the link currently
 *		    on the stack.
 *  WALK_NOFOLLOW - step_into() never follows a symlink.
 */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

1610
static const char *pick_link(struct nameidata *nd, struct path *link,
1611
		     struct inode *inode, unsigned seq, int flags)
1612
{
A
Al Viro 已提交
1613
	struct saved *last;
1614
	const char *res;
1615
	int error = reserve_stack(nd, link, seq);
1616

1617
	if (unlikely(error)) {
1618
		if (!(nd->flags & LOOKUP_RCU))
A
Al Viro 已提交
1619
			path_put(link);
1620
		return ERR_PTR(error);
1621
	}
1622
	last = nd->stack + nd->depth++;
A
Al Viro 已提交
1623
	last->link = *link;
1624
	clear_delayed_call(&last->done);
1625
	last->seq = seq;
1626

1627
	if (flags & WALK_TRAILING) {
1628 1629 1630 1631 1632
		error = may_follow_link(nd, inode);
		if (unlikely(error))
			return ERR_PTR(error);
	}

1633 1634
	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
			unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
1635 1636 1637 1638 1639 1640
		return ERR_PTR(-ELOOP);

	if (!(nd->flags & LOOKUP_RCU)) {
		touch_atime(&last->link);
		cond_resched();
	} else if (atime_needs_update(&last->link, inode)) {
1641
		if (!try_to_unlazy(nd))
1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657
			return ERR_PTR(-ECHILD);
		touch_atime(&last->link);
	}

	error = security_inode_follow_link(link->dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
		return ERR_PTR(error);

	res = READ_ONCE(inode->i_link);
	if (!res) {
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
		if (nd->flags & LOOKUP_RCU) {
			res = get(NULL, inode, &last->done);
1658
			if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679
				res = get(link->dentry, inode, &last->done);
		} else {
			res = get(link->dentry, inode, &last->done);
		}
		if (!res)
			goto all_done;
		if (IS_ERR(res))
			return res;
	}
	if (*res == '/') {
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		while (unlikely(*++res == '/'))
			;
	}
	if (*res)
		return res;
all_done: // pure jump
	put_link(nd);
	return NULL;
1680 1681
}

1682 1683 1684 1685 1686 1687
/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 */
1688
static const char *step_into(struct nameidata *nd, int flags,
A
Al Viro 已提交
1689
		     struct dentry *dentry, struct inode *inode, unsigned seq)
1690
{
A
Al Viro 已提交
1691 1692 1693 1694
	struct path path;
	int err = handle_mounts(nd, dentry, &path, &inode, &seq);

	if (err < 0)
1695
		return ERR_PTR(err);
A
Al Viro 已提交
1696
	if (likely(!d_is_symlink(path.dentry)) ||
1697
	   ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
A
Al Viro 已提交
1698
	   (flags & WALK_NOFOLLOW)) {
1699
		/* not a symlink or should not follow */
1700 1701 1702 1703 1704 1705
		if (!(nd->flags & LOOKUP_RCU)) {
			dput(nd->path.dentry);
			if (nd->path.mnt != path.mnt)
				mntput(nd->path.mnt);
		}
		nd->path = path;
1706 1707
		nd->inode = inode;
		nd->seq = seq;
1708
		return NULL;
1709
	}
1710
	if (nd->flags & LOOKUP_RCU) {
1711
		/* make sure that d_is_symlink above matches inode */
A
Al Viro 已提交
1712
		if (read_seqcount_retry(&path.dentry->d_seq, seq))
1713
			return ERR_PTR(-ECHILD);
1714 1715 1716
	} else {
		if (path.mnt == nd->path.mnt)
			mntget(path.mnt);
1717
	}
1718
	return pick_link(nd, &path, inode, seq, flags);
1719 1720
}

1721 1722 1723
static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
					struct inode **inodep,
					unsigned *seqp)
1724
{
A
Al Viro 已提交
1725
	struct dentry *parent, *old;
1726

A
Al Viro 已提交
1727 1728 1729
	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
A
Al Viro 已提交
1730
		struct path path;
1731
		unsigned seq;
A
Al Viro 已提交
1732 1733 1734
		if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
					   &nd->root, &path, &seq))
			goto in_root;
1735 1736 1737 1738 1739 1740 1741 1742
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-ECHILD);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		nd->seq = seq;
		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
			return ERR_PTR(-ECHILD);
		/* we know that mountpoint was pinned */
1743
	}
A
Al Viro 已提交
1744 1745 1746 1747 1748 1749 1750 1751 1752 1753
	old = nd->path.dentry;
	parent = old->d_parent;
	*inodep = parent->d_inode;
	*seqp = read_seqcount_begin(&parent->d_seq);
	if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
		return ERR_PTR(-ECHILD);
	if (unlikely(!path_connected(nd->path.mnt, parent)))
		return ERR_PTR(-ECHILD);
	return parent;
in_root:
1754 1755
	if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
		return ERR_PTR(-ECHILD);
1756 1757 1758
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-ECHILD);
	return NULL;
1759 1760
}

1761 1762 1763
static struct dentry *follow_dotdot(struct nameidata *nd,
				 struct inode **inodep,
				 unsigned *seqp)
1764
{
A
Al Viro 已提交
1765 1766 1767 1768 1769
	struct dentry *parent;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
1770 1771 1772 1773 1774
		struct path path;

		if (!choose_mountpoint(real_mount(nd->path.mnt),
				       &nd->root, &path))
			goto in_root;
1775 1776
		path_put(&nd->path);
		nd->path = path;
1777
		nd->inode = path.dentry->d_inode;
1778 1779
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-EXDEV);
1780
	}
A
Al Viro 已提交
1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791
	/* rare case of legitimate dget_parent()... */
	parent = dget_parent(nd->path.dentry);
	if (unlikely(!path_connected(nd->path.mnt, parent))) {
		dput(parent);
		return ERR_PTR(-ENOENT);
	}
	*seqp = 0;
	*inodep = parent->d_inode;
	return parent;

in_root:
1792 1793 1794 1795
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-EXDEV);
	dget(nd->path.dentry);
	return NULL;
1796 1797
}

1798
static const char *handle_dots(struct nameidata *nd, int type)
1799 1800
{
	if (type == LAST_DOTDOT) {
1801
		const char *error = NULL;
1802 1803 1804
		struct dentry *parent;
		struct inode *inode;
		unsigned seq;
1805 1806

		if (!nd->root.mnt) {
1807
			error = ERR_PTR(set_root(nd));
1808 1809 1810 1811
			if (error)
				return error;
		}
		if (nd->flags & LOOKUP_RCU)
1812
			parent = follow_dotdot_rcu(nd, &inode, &seq);
1813
		else
1814 1815 1816 1817 1818 1819 1820 1821 1822 1823
			parent = follow_dotdot(nd, &inode, &seq);
		if (IS_ERR(parent))
			return ERR_CAST(parent);
		if (unlikely(!parent))
			error = step_into(nd, WALK_NOFOLLOW,
					 nd->path.dentry, nd->inode, nd->seq);
		else
			error = step_into(nd, WALK_NOFOLLOW,
					 parent, inode, seq);
		if (unlikely(error))
1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834
			return error;

		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
			/*
			 * If there was a racing rename or mount along our
			 * path, then we can't be sure that ".." hasn't jumped
			 * above nd->root (and so userspace should retry or use
			 * some fallback).
			 */
			smp_rmb();
			if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
1835
				return ERR_PTR(-EAGAIN);
1836
			if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
1837
				return ERR_PTR(-EAGAIN);
1838 1839
		}
	}
1840
	return NULL;
1841 1842
}

1843
static const char *walk_component(struct nameidata *nd, int flags)
1844
{
1845
	struct dentry *dentry;
1846
	struct inode *inode;
1847
	unsigned seq;
1848 1849 1850 1851 1852
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
1853
	if (unlikely(nd->last_type != LAST_NORM)) {
A
Al Viro 已提交
1854
		if (!(flags & WALK_MORE) && nd->depth)
1855
			put_link(nd);
1856
		return handle_dots(nd, nd->last_type);
1857
	}
1858 1859
	dentry = lookup_fast(nd, &inode, &seq);
	if (IS_ERR(dentry))
1860
		return ERR_CAST(dentry);
1861
	if (unlikely(!dentry)) {
1862 1863
		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
		if (IS_ERR(dentry))
1864
			return ERR_CAST(dentry);
1865
	}
1866 1867
	if (!(flags & WALK_MORE) && nd->depth)
		put_link(nd);
1868
	return step_into(nd, flags, dentry, inode, seq);
1869 1870
}

1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation.
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with a
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

1890
#include <asm/word-at-a-time.h>
1891

1892
#ifdef HASH_MIX
1893

1894
/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
1895

1896
#elif defined(CONFIG_64BIT)
1897
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 */
1926 1927 1928 1929 1930
/*
 * One mixing step on the two state words x and y with input word a
 * (64-bit variant).  x and y are modified in place and are evaluated
 * more than once, so pass plain lvalues only.
 */
#define HASH_MIX(x, y, a)		\
	(	x ^= (a),		\
		y ^= x,			\
		x = rol64(x, 12),	\
		x += y,			\
		y = rol64(y, 45),	\
		y *= 9			)
1931

1932
/*
1933 1934 1935
 * Fold two longs into one 32-bit hash value.  This must be fast, but
 * latency isn't quite as critical, as there is a fair bit of additional
 * work done before the hash value is used.
1936
 */
1937
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
1938
{
1939 1940 1941
	y ^= x * GOLDEN_RATIO_64;
	y *= GOLDEN_RATIO_64;
	return y >> 32;
1942 1943
}

1944 1945
#else	/* 32-bit case */

1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960
/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 */
/*
 * One mixing step on the two state words x and y with input word a
 * (32-bit variant).  x and y are modified in place and are evaluated
 * more than once, so pass plain lvalues only.
 */
#define HASH_MIX(x, y, a)		\
	(	x ^= (a),		\
		y ^= x,			\
		x = rol32(x, 7),	\
		x += y,			\
		y = rol32(y, 20),	\
		y *= 9			)
1961

1962
/*
 * Fold two longs into one 32-bit hash value (32-bit variant): hash x
 * with __hash_32(), XOR the result into y and hash once more.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	return __hash_32(y ^ __hash_32(x));
}

1968 1969
#endif

1970 1971 1972 1973 1974 1975 1976
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing
 * 0..sizeof(long)-1 payload bytes, to match the way that hash_name()
 * iterates until it finds the delimiter after the name.
 *
 * @salt: pointer whose value seeds the hash state
 * @name: bytes to hash; need not be NUL-terminated
 * @len:  number of bytes of @name to hash
 *
 * Full words are fed through HASH_MIX(); the trailing partial word is
 * masked with bytemask_from_count() and XORed into x before folding.
 * When @len is an exact multiple of the word size no partial word is
 * loaded at all.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long a, x = 0, y = (unsigned long)salt;

	for (;;) {
		if (!len)
			goto done;
		a = load_unaligned_zeropad(name);
		if (len < sizeof(unsigned long))
			break;
		HASH_MIX(x, y, a);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}
	/* Only the low @len bytes of the last word belong to the name. */
	x ^= a & bytemask_from_count(len);
done:
	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);

1997
/* Return the "hash_len" (hash and length) of a null-terminated string */
1998
u64 hashlen_string(const void *salt, const char *name)
1999
{
2000 2001
	unsigned long a = 0, x = 0, y = (unsigned long)salt;
	unsigned long adata, mask, len;
2002 2003
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

2004 2005 2006
	len = 0;
	goto inside;

2007
	do {
2008
		HASH_MIX(x, y, a);
2009
		len += sizeof(unsigned long);
2010
inside:
2011 2012 2013 2014 2015
		a = load_unaligned_zeropad(name+len);
	} while (!has_zero(a, &adata, &constants));

	adata = prep_zero_mask(a, adata, &constants);
	mask = create_zero_mask(adata);
2016
	x ^= a & zero_bytemask(mask);
2017

2018
	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2019 2020 2021
}
EXPORT_SYMBOL(hashlen_string);

2022 2023
/*
 * Calculate the length and hash of the path component, and
2024
 * return the "hash_len" as the result.
2025
 */
2026
static inline u64 hash_name(const void *salt, const char *name)
2027
{
2028 2029
	unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
	unsigned long adata, bdata, mask, len;
2030
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2031

2032 2033 2034
	len = 0;
	goto inside;

2035
	do {
2036
		HASH_MIX(x, y, a);
2037
		len += sizeof(unsigned long);
2038
inside:
2039
		a = load_unaligned_zeropad(name+len);
2040 2041 2042 2043 2044 2045
		b = a ^ REPEAT_BYTE('/');
	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

	adata = prep_zero_mask(a, adata, &constants);
	bdata = prep_zero_mask(b, bdata, &constants);
	mask = create_zero_mask(adata | bdata);
2046
	x ^= a & zero_bytemask(mask);
2047

2048
	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2049 2050
}

2051
#else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2052

2053
/*
 * Return the hash of a string of known length (byte-at-a-time version).
 *
 * @salt: seed passed to init_name_hash()
 * @name: bytes to hash; need not be NUL-terminated
 * @len:  number of bytes of @name to hash
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);
	while (len--)
		hash = partial_name_hash((unsigned char)*name++, hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
L
Linus Torvalds 已提交
2062

2063
/* Return the "hash_len" (hash and length) of a null-terminated string */
2064
u64 hashlen_string(const void *salt, const char *name)
2065
{
2066
	unsigned long hash = init_name_hash(salt);
2067 2068 2069
	unsigned long len = 0, c;

	c = (unsigned char)*name;
2070
	while (c) {
2071 2072 2073
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
2074
	}
2075 2076
	return hashlen_create(end_name_hash(hash), len);
}
2077
EXPORT_SYMBOL(hashlen_string);
2078

2079 2080 2081 2082
/*
 * We know there's a real path component here of at least
 * one character.
 */
2083
static inline u64 hash_name(const void *salt, const char *name)
2084
{
2085
	unsigned long hash = init_name_hash(salt);
2086 2087 2088 2089 2090 2091 2092 2093
	unsigned long len = 0, c;

	c = (unsigned char)*name;
	do {
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
	} while (c && c != '/');
2094
	return hashlen_create(end_name_hash(hash), len);
2095 2096
}

2097 2098
#endif

L
Linus Torvalds 已提交
2099 2100
/*
 * Name resolution.
2101 2102
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
L
Linus Torvalds 已提交
2103
 *
2104 2105
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
L
Linus Torvalds 已提交
2106
 */
2107
static int link_path_walk(const char *name, struct nameidata *nd)
L
Linus Torvalds 已提交
2108
{
2109
	int depth = 0; // depth <= nd->depth
L
Linus Torvalds 已提交
2110
	int err;
A
Al Viro 已提交
2111

2112
	nd->last_type = LAST_ROOT;
2113
	nd->flags |= LOOKUP_PARENT;
2114 2115
	if (IS_ERR(name))
		return PTR_ERR(name);
L
Linus Torvalds 已提交
2116 2117 2118
	while (*name=='/')
		name++;
	if (!*name)
2119
		return 0;
L
Linus Torvalds 已提交
2120 2121 2122

	/* At this point we know we have a real path component. */
	for(;;) {
2123
		const char *link;
2124
		u64 hash_len;
A
Al Viro 已提交
2125
		int type;
L
Linus Torvalds 已提交
2126

2127
		err = may_lookup(nd);
2128
		if (err)
2129
			return err;
L
Linus Torvalds 已提交
2130

2131
		hash_len = hash_name(nd->path.dentry, name);
L
Linus Torvalds 已提交
2132

A
Al Viro 已提交
2133
		type = LAST_NORM;
2134
		if (name[0] == '.') switch (hashlen_len(hash_len)) {
A
Al Viro 已提交
2135
			case 2:
2136
				if (name[1] == '.') {
A
Al Viro 已提交
2137
					type = LAST_DOTDOT;
2138
					nd->state |= ND_JUMPED;
A
Al Viro 已提交
2139
				}
A
Al Viro 已提交
2140 2141 2142 2143
				break;
			case 1:
				type = LAST_DOT;
		}
2144 2145
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
2146
			nd->state &= ~ND_JUMPED;
2147
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
2148
				struct qstr this = { { .hash_len = hash_len }, .name = name };
2149
				err = parent->d_op->d_hash(parent, &this);
2150
				if (err < 0)
2151
					return err;
2152 2153
				hash_len = this.hash_len;
				name = this.name;
2154 2155
			}
		}
A
Al Viro 已提交
2156

2157 2158
		nd->last.hash_len = hash_len;
		nd->last.name = name;
2159 2160
		nd->last_type = type;

2161 2162
		name += hashlen_len(hash_len);
		if (!*name)
2163
			goto OK;
2164 2165 2166 2167 2168
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
2169 2170
			name++;
		} while (unlikely(*name == '/'));
2171 2172
		if (unlikely(!*name)) {
OK:
2173
			/* pathname or trailing symlink, done */
2174
			if (!depth) {
2175 2176
				nd->dir_uid = nd->inode->i_uid;
				nd->dir_mode = nd->inode->i_mode;
2177
				nd->flags &= ~LOOKUP_PARENT;
2178
				return 0;
2179
			}
2180
			/* last component of nested symlink */
2181
			name = nd->stack[--depth].name;
2182
			link = walk_component(nd, 0);
A
Al Viro 已提交
2183 2184
		} else {
			/* not the last component */
2185
			link = walk_component(nd, WALK_MORE);
2186
		}
2187 2188 2189 2190
		if (unlikely(link)) {
			if (IS_ERR(link))
				return PTR_ERR(link);
			/* a symlink to follow */
2191
			nd->stack[depth++].name = name;
2192 2193
			name = link;
			continue;
N
Nick Piggin 已提交
2194
		}
2195 2196
		if (unlikely(!d_can_lookup(nd->path.dentry))) {
			if (nd->flags & LOOKUP_RCU) {
2197
				if (!try_to_unlazy(nd))
2198 2199
					return -ECHILD;
			}
2200
			return -ENOTDIR;
2201
		}
L
Linus Torvalds 已提交
2202 2203 2204
	}
}

2205
/* must be paired with terminate_walk() */
2206
static const char *path_init(struct nameidata *nd, unsigned flags)
N
Nick Piggin 已提交
2207
{
2208
	int error;
2209
	const char *s = nd->name->name;
N
Nick Piggin 已提交
2210

2211 2212
	if (!*s)
		flags &= ~LOOKUP_RCU;
2213 2214
	if (flags & LOOKUP_RCU)
		rcu_read_lock();
2215

2216 2217
	nd->flags = flags;
	nd->state |= ND_JUMPED;
N
Nick Piggin 已提交
2218
	nd->depth = 0;
2219 2220 2221 2222 2223

	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
	smp_rmb();

2224
	if (nd->state & ND_ROOT_PRESET) {
2225 2226
		struct dentry *root = nd->root.dentry;
		struct inode *inode = root->d_inode;
2227 2228
		if (*s && unlikely(!d_can_lookup(root)))
			return ERR_PTR(-ENOTDIR);
2229 2230 2231
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
2232
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2233
			nd->root_seq = nd->seq;
2234 2235 2236
		} else {
			path_get(&nd->path);
		}
2237
		return s;
2238 2239
	}

N
Nick Piggin 已提交
2240
	nd->root.mnt = NULL;
2241 2242
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
N
Nick Piggin 已提交
2243

2244 2245
	/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
	if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
2246 2247 2248 2249
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		return s;
2250 2251 2252 2253
	}

	/* Relative pathname -- get the starting-point it is relative to. */
	if (nd->dfd == AT_FDCWD) {
A
Al Viro 已提交
2254 2255 2256
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;
N
Nick Piggin 已提交
2257

A
Al Viro 已提交
2258 2259 2260
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
2261
				nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2262 2263 2264 2265
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
2266
			nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2267
		}
N
Nick Piggin 已提交
2268
	} else {
2269
		/* Caller must check execute permissions on the starting path component */
2270
		struct fd f = fdget_raw(nd->dfd);
N
Nick Piggin 已提交
2271 2272
		struct dentry *dentry;

2273
		if (!f.file)
2274
			return ERR_PTR(-EBADF);
N
Nick Piggin 已提交
2275

2276
		dentry = f.file->f_path.dentry;
N
Nick Piggin 已提交
2277

2278 2279 2280
		if (*s && unlikely(!d_can_lookup(dentry))) {
			fdput(f);
			return ERR_PTR(-ENOTDIR);
A
Al Viro 已提交
2281
		}
N
Nick Piggin 已提交
2282

2283
		nd->path = f.file->f_path;
A
Al Viro 已提交
2284
		if (flags & LOOKUP_RCU) {
A
Al Viro 已提交
2285 2286
			nd->inode = nd->path.dentry->d_inode;
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
A
Al Viro 已提交
2287
		} else {
2288
			path_get(&nd->path);
A
Al Viro 已提交
2289
			nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2290
		}
A
Al Viro 已提交
2291
		fdput(f);
N
Nick Piggin 已提交
2292
	}
2293

2294 2295 2296 2297 2298 2299 2300
	/* For scoped-lookups we need to set the root to the dirfd as well. */
	if (flags & LOOKUP_IS_SCOPED) {
		nd->root = nd->path;
		if (flags & LOOKUP_RCU) {
			nd->root_seq = nd->seq;
		} else {
			path_get(&nd->root);
2301
			nd->state |= ND_ROOT_GRABBED;
2302 2303 2304
		}
	}
	return s;
2305 2306
}

2307
static inline const char *lookup_last(struct nameidata *nd)
2308 2309 2310 2311
{
	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

2312
	return walk_component(nd, WALK_TRAILING);
2313 2314
}

2315 2316
static int handle_lookup_down(struct nameidata *nd)
{
2317
	if (!(nd->flags & LOOKUP_RCU))
2318
		dget(nd->path.dentry);
2319 2320
	return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
			nd->path.dentry, nd->inode, nd->seq));
2321 2322
}

2323
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2324
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
2325
{
2326
	const char *s = path_init(nd, flags);
2327
	int err;
N
Nick Piggin 已提交
2328

2329
	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
2330
		err = handle_lookup_down(nd);
2331 2332
		if (unlikely(err < 0))
			s = ERR_PTR(err);
2333 2334
	}

2335 2336 2337
	while (!(err = link_path_walk(s, nd)) &&
	       (s = lookup_last(nd)) != NULL)
		;
2338 2339
	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
		err = handle_lookup_down(nd);
2340
		nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
2341
	}
2342 2343
	if (!err)
		err = complete_walk(nd);
2344

2345 2346
	if (!err && nd->flags & LOOKUP_DIRECTORY)
		if (!d_can_lookup(nd->path.dentry))
A
Al Viro 已提交
2347
			err = -ENOTDIR;
2348 2349 2350 2351 2352 2353
	if (!err) {
		*path = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2354
	return err;
A
Al Viro 已提交
2355
}
N
Nick Piggin 已提交
2356

2357 2358
int filename_lookup(int dfd, struct filename *name, unsigned flags,
		    struct path *path, struct path *root)
A
Al Viro 已提交
2359
{
2360
	int retval;
2361
	struct nameidata nd;
2362 2363
	if (IS_ERR(name))
		return PTR_ERR(name);
2364
	set_nameidata(&nd, dfd, name);
2365 2366
	if (unlikely(root)) {
		nd.root = *root;
2367
		nd.state = ND_ROOT_PRESET;
2368
	}
2369
	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
A
Al Viro 已提交
2370
	if (unlikely(retval == -ECHILD))
2371
		retval = path_lookupat(&nd, flags, path);
A
Al Viro 已提交
2372
	if (unlikely(retval == -ESTALE))
2373
		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
N
Nick Piggin 已提交
2374

2375
	if (likely(!retval))
2376 2377
		audit_inode(name, path->dentry,
			    flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
2378
	restore_nameidata();
2379
	putname(name);
2380
	return retval;
L
Linus Torvalds 已提交
2381 2382
}

2383
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2384
static int path_parentat(struct nameidata *nd, unsigned flags,
2385
				struct path *parent)
2386
{
2387
	const char *s = path_init(nd, flags);
2388
	int err = link_path_walk(s, nd);
2389 2390
	if (!err)
		err = complete_walk(nd);
2391 2392 2393 2394 2395 2396
	if (!err) {
		*parent = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2397 2398 2399
	return err;
}

2400
static struct filename *filename_parentat(int dfd, struct filename *name,
2401 2402
				unsigned int flags, struct path *parent,
				struct qstr *last, int *type)
2403 2404
{
	int retval;
2405
	struct nameidata nd;
2406

2407 2408
	if (IS_ERR(name))
		return name;
2409
	set_nameidata(&nd, dfd, name);
2410
	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2411
	if (unlikely(retval == -ECHILD))
2412
		retval = path_parentat(&nd, flags, parent);
2413
	if (unlikely(retval == -ESTALE))
2414
		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2415 2416 2417
	if (likely(!retval)) {
		*last = nd.last;
		*type = nd.last_type;
2418
		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
2419 2420 2421
	} else {
		putname(name);
		name = ERR_PTR(retval);
2422
	}
2423
	restore_nameidata();
2424
	return name;
2425 2426
}

A
Al Viro 已提交
2427 2428
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
2429
{
2430 2431
	struct filename *filename;
	struct dentry *d;
2432 2433
	struct qstr last;
	int type;
2434

2435 2436
	filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
				    &last, &type);
2437 2438
	if (IS_ERR(filename))
		return ERR_CAST(filename);
2439
	if (unlikely(type != LAST_NORM)) {
2440
		path_put(path);
2441 2442
		putname(filename);
		return ERR_PTR(-EINVAL);
A
Al Viro 已提交
2443
	}
A
Al Viro 已提交
2444
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
2445
	d = __lookup_hash(&last, path->dentry, 0);
A
Al Viro 已提交
2446
	if (IS_ERR(d)) {
A
Al Viro 已提交
2447
		inode_unlock(path->dentry->d_inode);
2448
		path_put(path);
A
Al Viro 已提交
2449
	}
2450
	putname(filename);
A
Al Viro 已提交
2451
	return d;
2452 2453
}

A
Al Viro 已提交
2454 2455
int kern_path(const char *name, unsigned int flags, struct path *path)
{
2456 2457
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags, path, NULL);
A
Al Viro 已提交
2458
}
2459
EXPORT_SYMBOL(kern_path);
A
Al Viro 已提交
2460

2461 2462 2463 2464 2465 2466
/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
2467
 * @path: pointer to struct path to fill
2468 2469 2470
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
		    const char *name, unsigned int flags,
2471
		    struct path *path)
2472
{
2473 2474
	struct path root = {.mnt = mnt, .dentry = dentry};
	/* the first argument of filename_lookup() is ignored with root */
2475 2476
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags , path, &root);
2477
}
2478
EXPORT_SYMBOL(vfs_path_lookup);
2479

2480 2481
static int lookup_one_len_common(const char *name, struct dentry *base,
				 int len, struct qstr *this)
2482
{
2483 2484 2485
	this->name = name;
	this->len = len;
	this->hash = full_name_hash(base, name, len);
A
Al Viro 已提交
2486
	if (!len)
2487
		return -EACCES;
A
Al Viro 已提交
2488

A
Al Viro 已提交
2489 2490
	if (unlikely(name[0] == '.')) {
		if (len < 2 || (len == 2 && name[1] == '.'))
2491
			return -EACCES;
A
Al Viro 已提交
2492 2493
	}

A
Al Viro 已提交
2494
	while (len--) {
2495
		unsigned int c = *(const unsigned char *)name++;
A
Al Viro 已提交
2496
		if (c == '/' || c == '\0')
2497
			return -EACCES;
A
Al Viro 已提交
2498
	}
2499 2500 2501 2502 2503
	/*
	 * See if the low-level filesystem might want
	 * to use its own hash..
	 */
	if (base->d_flags & DCACHE_OP_HASH) {
2504
		int err = base->d_op->d_hash(base, this);
2505
		if (err < 0)
2506
			return err;
2507
	}
2508

2509 2510 2511
	return inode_permission(base->d_inode, MAY_EXEC);
}

2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540
/**
 * try_lookup_one_len - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Look up a dentry by name in the dcache, returning NULL if it does not
 * currently exist.  The function does not try to create a dentry.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
	struct qstr this;
	int err;

	WARN_ON_ONCE(!inode_is_locked(base->d_inode));

	err = lookup_one_len_common(name, base, len, &this);
	if (err)
		return ERR_PTR(err);

	return lookup_dcache(&this, base, 0);
}
EXPORT_SYMBOL(try_lookup_one_len);

2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553
/**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
2554
	struct dentry *dentry;
2555 2556 2557 2558 2559 2560
	struct qstr this;
	int err;

	WARN_ON_ONCE(!inode_is_locked(base->d_inode));

	err = lookup_one_len_common(name, base, len, &this);
2561 2562 2563
	if (err)
		return ERR_PTR(err);

2564 2565
	dentry = lookup_dcache(&this, base, 0);
	return dentry ? dentry : __lookup_slow(&this, base, 0);
2566
}
2567
EXPORT_SYMBOL(lookup_one_len);
2568

2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585
/**
 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	number of bytes of @name that make up the component
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 *
 * Tries the dcache first and falls back to lookup_slow() when the
 * dcache has no answer.  Validation failures are reported as an
 * ERR_PTR().
 */
struct dentry *lookup_one_len_unlocked(const char *name,
				       struct dentry *base, int len)
{
	struct qstr this;
	int err;
	struct dentry *ret;

	err = lookup_one_len_common(name, base, len, &this);
	if (err)
		return ERR_PTR(err);

	ret = lookup_dcache(&this, base, 0);
	if (!ret)
		ret = lookup_slow(&this, base, 0);
	return ret;
}
EXPORT_SYMBOL(lookup_one_len_unlocked);

A
Al Viro 已提交
2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610
/*
 * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
 * on negatives.  Returns known positive or ERR_PTR(); that's what
 * most of the users want.  Note that pinned negative with unlocked parent
 * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
 * need to be very careful; pinned positives have ->d_inode stable, so
 * this one avoids such problems.
 */
struct dentry *lookup_positive_unlocked(const char *name,
				       struct dentry *base, int len)
{
	struct dentry *ret = lookup_one_len_unlocked(name, base, len);
2611
	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
A
Al Viro 已提交
2612 2613 2614 2615 2616 2617 2618
		dput(ret);
		ret = ERR_PTR(-ENOENT);
	}
	return ret;
}
EXPORT_SYMBOL(lookup_positive_unlocked);

2619 2620 2621 2622 2623 2624
#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
	/* Find something mounted on "pts" in the same directory as
	 * the input path.
	 */
2625 2626
	struct dentry *parent = dget_parent(path->dentry);
	struct dentry *child;
A
Al Viro 已提交
2627
	struct qstr this = QSTR_INIT("pts", 3);
2628

2629 2630
	if (unlikely(!path_connected(path->mnt, parent))) {
		dput(parent);
2631
		return -ENOENT;
2632
	}
2633 2634
	dput(path->dentry);
	path->dentry = parent;
2635 2636 2637 2638 2639 2640
	child = d_hash_and_lookup(parent, &this);
	if (!child)
		return -ENOENT;

	path->dentry = child;
	dput(parent);
A
Al Viro 已提交
2641
	follow_down(path);
2642 2643 2644 2645
	return 0;
}
#endif

2646 2647
/*
 * Resolve the user-space pathname @name relative to @dfd and store the
 * result in @path.  @name is copied in with getname_flags(), which also
 * receives @empty.  Returns 0 on success or a negative errno, as
 * filename_lookup() does.
 */
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
		 struct path *path, int *empty)
{
	return filename_lookup(dfd, getname_flags(name, flags, empty),
			       flags, path, NULL);
}
EXPORT_SYMBOL(user_path_at_empty);
2653

M
Miklos Szeredi 已提交
2654
int __check_sticky(struct inode *dir, struct inode *inode)
L
Linus Torvalds 已提交
2655
{
2656
	kuid_t fsuid = current_fsuid();
2657

2658
	if (uid_eq(inode->i_uid, fsuid))
L
Linus Torvalds 已提交
2659
		return 0;
2660
	if (uid_eq(dir->i_uid, fsuid))
L
Linus Torvalds 已提交
2661
		return 0;
2662
	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
L
Linus Torvalds 已提交
2663
}
M
Miklos Szeredi 已提交
2664
EXPORT_SYMBOL(__check_sticky);
L
Linus Torvalds 已提交
2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678

/*
 *	Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do antyhing with
 *     links pointing to it.
2679 2680 2681 2682 2683
 *  7. If the victim has an unknown uid or gid we can't change the inode.
 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 10. We can't remove a root or mountpoint.
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
L
Linus Torvalds 已提交
2684 2685
 *     nfs_async_unlink().
 */
2686
static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
L
Linus Torvalds 已提交
2687
{
2688
	struct inode *inode = d_backing_inode(victim);
L
Linus Torvalds 已提交
2689 2690
	int error;

2691
	if (d_is_negative(victim))
L
Linus Torvalds 已提交
2692
		return -ENOENT;
2693
	BUG_ON(!inode);
L
Linus Torvalds 已提交
2694 2695

	BUG_ON(victim->d_parent->d_inode != dir);
2696 2697 2698 2699 2700

	/* Inode writeback is not safe when the uid or gid are invalid. */
	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
		return -EOVERFLOW;

2701
	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
L
Linus Torvalds 已提交
2702

2703
	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2704 2705 2706 2707
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;
2708 2709

	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
2710
	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
L
Linus Torvalds 已提交
2711 2712
		return -EPERM;
	if (isdir) {
M
Miklos Szeredi 已提交
2713
		if (!d_is_dir(victim))
L
Linus Torvalds 已提交
2714 2715 2716
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
M
Miklos Szeredi 已提交
2717
	} else if (d_is_dir(victim))
L
Linus Torvalds 已提交
2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}

/*	Check whether we can create an object with dentry child in directory
 *  dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
2731 2732 2733
 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
 *  4. We should have write and exec permissions on dir
 *  5. We can't do it if dir is immutable (done in permission())
L
Linus Torvalds 已提交
2734
 */
2735
static inline int may_create(struct inode *dir, struct dentry *child)
L
Linus Torvalds 已提交
2736
{
2737
	struct user_namespace *s_user_ns;
2738
	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
L
Linus Torvalds 已提交
2739 2740 2741 2742
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
2743 2744 2745 2746
	s_user_ns = dir->i_sb->s_user_ns;
	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
	    !kgid_has_mapping(s_user_ns, current_fsgid()))
		return -EOVERFLOW;
2747
	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2748 2749 2750 2751 2752 2753 2754 2755 2756 2757
}

/*
 * p1 and p2 should be directories on the same fs.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p;

	if (p1 == p2) {
A
Al Viro 已提交
2758
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
L
Linus Torvalds 已提交
2759 2760 2761
		return NULL;
	}

2762
	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2763

2764 2765
	p = d_ancestor(p2, p1);
	if (p) {
A
Al Viro 已提交
2766 2767
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
2768
		return p;
L
Linus Torvalds 已提交
2769 2770
	}

2771 2772
	p = d_ancestor(p1, p2);
	if (p) {
A
Al Viro 已提交
2773 2774
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
2775
		return p;
L
Linus Torvalds 已提交
2776 2777
	}

A
Al Viro 已提交
2778 2779
	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
L
Linus Torvalds 已提交
2780 2781
	return NULL;
}
2782
EXPORT_SYMBOL(lock_rename);
L
Linus Torvalds 已提交
2783 2784 2785

void unlock_rename(struct dentry *p1, struct dentry *p2)
{
A
Al Viro 已提交
2786
	inode_unlock(p1->d_inode);
L
Linus Torvalds 已提交
2787
	if (p1 != p2) {
A
Al Viro 已提交
2788
		inode_unlock(p2->d_inode);
2789
		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2790 2791
	}
}
2792
EXPORT_SYMBOL(unlock_rename);
L
Linus Torvalds 已提交
2793

A
Al Viro 已提交
2794
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
A
Al Viro 已提交
2795
		bool want_excl)
L
Linus Torvalds 已提交
2796
{
2797
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
2798 2799 2800
	if (error)
		return error;

A
Al Viro 已提交
2801
	if (!dir->i_op->create)
L
Linus Torvalds 已提交
2802 2803 2804 2805 2806 2807
		return -EACCES;	/* shouldn't it be ENOSYS? */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
A
Al Viro 已提交
2808
	error = dir->i_op->create(dir, dentry, mode, want_excl);
2809
	if (!error)
2810
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
2811 2812
	return error;
}
2813
EXPORT_SYMBOL(vfs_create);
L
Linus Torvalds 已提交
2814

A
Al Viro 已提交
2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835
/*
 * Create an object for @dentry via the caller-supplied callback @f.
 *
 * @dentry: dentry of the object to create; its parent is the directory
 * @mode:   requested mode; masked with S_IALLUGO and forced to S_IFREG
 * @f:      callback doing the actual creation, called as f(dentry, mode, arg)
 * @arg:    opaque argument handed to @f
 *
 * Returns 0 on success, in which case fsnotify_create() is called, or
 * a negative errno from may_create(), the security hook or @f.
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
		int (*f)(struct dentry *, umode_t, void *),
		void *arg)
{
	struct inode *parent_inode = dentry->d_parent->d_inode;
	int rc;

	rc = may_create(parent_inode, dentry);
	if (rc)
		return rc;

	mode = (mode & S_IALLUGO) | S_IFREG;

	rc = security_inode_create(parent_inode, dentry, mode);
	if (rc)
		return rc;

	rc = f(dentry, mode, arg);
	if (rc)
		return rc;

	fsnotify_create(parent_inode, dentry);
	return 0;
}
EXPORT_SYMBOL(vfs_mkobj);

2836 2837 2838 2839 2840 2841
/*
 * may_open_dev - tell whether device nodes may be opened through @path
 *
 * Returns false when either the mount carries MNT_NODEV or the mount's
 * superblock carries SB_I_NODEV, true otherwise.
 */
bool may_open_dev(const struct path *path)
{
	if (path->mnt->mnt_flags & MNT_NODEV)
		return false;
	if (path->mnt->mnt_sb->s_iflags & SB_I_NODEV)
		return false;
	return true;
}

A
Al Viro 已提交
2842
static int may_open(const struct path *path, int acc_mode, int flag)
L
Linus Torvalds 已提交
2843
{
2844
	struct dentry *dentry = path->dentry;
L
Linus Torvalds 已提交
2845 2846 2847 2848 2849 2850
	struct inode *inode = dentry->d_inode;
	int error;

	if (!inode)
		return -ENOENT;

C
Christoph Hellwig 已提交
2851 2852
	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
L
Linus Torvalds 已提交
2853
		return -ELOOP;
C
Christoph Hellwig 已提交
2854
	case S_IFDIR:
2855
		if (acc_mode & MAY_WRITE)
C
Christoph Hellwig 已提交
2856
			return -EISDIR;
2857 2858
		if (acc_mode & MAY_EXEC)
			return -EACCES;
C
Christoph Hellwig 已提交
2859 2860 2861
		break;
	case S_IFBLK:
	case S_IFCHR:
2862
		if (!may_open_dev(path))
L
Linus Torvalds 已提交
2863
			return -EACCES;
K
Kees Cook 已提交
2864
		fallthrough;
C
Christoph Hellwig 已提交
2865 2866
	case S_IFIFO:
	case S_IFSOCK:
K
Kees Cook 已提交
2867 2868
		if (acc_mode & MAY_EXEC)
			return -EACCES;
L
Linus Torvalds 已提交
2869
		flag &= ~O_TRUNC;
C
Christoph Hellwig 已提交
2870
		break;
2871 2872 2873 2874
	case S_IFREG:
		if ((acc_mode & MAY_EXEC) && path_noexec(path))
			return -EACCES;
		break;
2875
	}
2876

A
Al Viro 已提交
2877
	error = inode_permission(inode, MAY_OPEN | acc_mode);
2878 2879
	if (error)
		return error;
M
Mimi Zohar 已提交
2880

L
Linus Torvalds 已提交
2881 2882 2883 2884
	/*
	 * An append-only file must be opened in append mode for writing.
	 */
	if (IS_APPEND(inode)) {
2885
		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2886
			return -EPERM;
L
Linus Torvalds 已提交
2887
		if (flag & O_TRUNC)
2888
			return -EPERM;
L
Linus Torvalds 已提交
2889 2890 2891
	}

	/* O_NOATIME can only be set by the owner or superuser */
2892
	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2893
		return -EPERM;
L
Linus Torvalds 已提交
2894

2895
	return 0;
2896
}
L
Linus Torvalds 已提交
2897

2898
static int handle_truncate(struct file *filp)
2899
{
A
Al Viro 已提交
2900
	const struct path *path = &filp->f_path;
2901 2902 2903 2904 2905 2906 2907
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;
	/*
	 * Refuse to truncate files with mandatory locks held on them.
	 */
2908
	error = locks_verify_locked(filp);
2909
	if (!error)
2910
		error = security_path_truncate(path);
2911 2912 2913
	if (!error) {
		error = do_truncate(path->dentry, 0,
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
2914
				    filp);
2915 2916
	}
	put_write_access(inode);
M
Mimi Zohar 已提交
2917
	return error;
L
Linus Torvalds 已提交
2918 2919
}

2920 2921
/*
 * open_to_namei_flags - adjust open flags before handing them to
 * ->atomic_open()
 * @flag: open flags
 *
 * When the access-mode bits (O_ACCMODE) hold the value 3, the flag word is
 * decremented by one so that the access mode becomes 2; every other bit is
 * left untouched.  Any other access mode is returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	if ((flag & O_ACCMODE) == 3)
		flag--;
	return flag;
}

2927
/*
 * may_o_create - permission checks for creating a file on open(O_CREAT)
 * @dir:	path of the parent directory
 * @dentry:	dentry of the file that would be created
 * @mode:	mode the file would be created with
 *
 * Runs, in order: security_path_mknod(), a check that the caller's fsuid
 * and fsgid both have a mapping in the superblock's user namespace
 * (-EOVERFLOW if not), inode_permission() for MAY_WRITE | MAY_EXEC on the
 * directory, and security_inode_create().
 *
 * Returns 0 if creation is allowed, otherwise the first error.
 */
static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
{
	struct user_namespace *s_user_ns;
	int error = security_path_mknod(dir, dentry, mode, 0);
	if (error)
		return error;

	s_user_ns = dir->dentry->d_sb->s_user_ns;
	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
	    !kgid_has_mapping(s_user_ns, current_fsgid()))
		return -EOVERFLOW;

	error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;

	return security_inode_create(dir->dentry->d_inode, dentry, mode);
}

2946 2947 2948 2949 2950 2951 2952
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Returns 0 if successful.  The file will have been created and attached to
 * @file by the filesystem calling finish_open().
 *
2953 2954 2955
 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
 * be set.  The caller will need to perform the open themselves.  @path will
 * have been updated to point to the new dentry.  This may be negative.
2956 2957 2958
 *
 * Returns an error code otherwise.
 */
2959 2960 2961
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
				  struct file *file,
				  int open_flag, umode_t mode)
M
Miklos Szeredi 已提交
2962
{
2963
	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
M
Miklos Szeredi 已提交
2964 2965 2966 2967 2968 2969
	struct inode *dir =  nd->path.dentry->d_inode;
	int error;

	if (nd->flags & LOOKUP_DIRECTORY)
		open_flag |= O_DIRECTORY;

A
Al Viro 已提交
2970 2971
	file->f_path.dentry = DENTRY_NOT_SET;
	file->f_path.mnt = nd->path.mnt;
2972
	error = dir->i_op->atomic_open(dir, dentry, file,
2973
				       open_to_namei_flags(open_flag), mode);
2974
	d_lookup_done(dentry);
2975
	if (!error) {
2976
		if (file->f_mode & FMODE_OPENED) {
2977 2978 2979 2980
			if (unlikely(dentry != file->f_path.dentry)) {
				dput(dentry);
				dentry = dget(file->f_path.dentry);
			}
2981
		} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
2982
			error = -EIO;
2983
		} else {
2984 2985 2986
			if (file->f_path.dentry) {
				dput(dentry);
				dentry = file->f_path.dentry;
2987
			}
2988
			if (unlikely(d_is_negative(dentry)))
A
Al Viro 已提交
2989
				error = -ENOENT;
2990
		}
M
Miklos Szeredi 已提交
2991
	}
2992 2993 2994 2995 2996
	if (error) {
		dput(dentry);
		dentry = ERR_PTR(error);
	}
	return dentry;
M
Miklos Szeredi 已提交
2997 2998
}

M
Miklos Szeredi 已提交
2999
/*
3000
 * Look up and maybe create and open the last component.
M
Miklos Szeredi 已提交
3001
 *
3002
 * Must be called with parent locked (exclusive in O_CREAT case).
3003
 *
3004 3005 3006 3007 3008 3009 3010
 * Returns 0 on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These case are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case dentry returned in @path might be negative if O_CREAT
 * hadn't been specified.
3011
 *
3012
 * An error code is returned on failure.
M
Miklos Szeredi 已提交
3013
 */
3014 3015 3016
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
				  const struct open_flags *op,
				  bool got_write)
M
Miklos Szeredi 已提交
3017 3018
{
	struct dentry *dir = nd->path.dentry;
3019
	struct inode *dir_inode = dir->d_inode;
3020
	int open_flag = op->open_flag;
M
Miklos Szeredi 已提交
3021
	struct dentry *dentry;
3022 3023
	int error, create_error = 0;
	umode_t mode = op->mode;
3024
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
M
Miklos Szeredi 已提交
3025

3026
	if (unlikely(IS_DEADDIR(dir_inode)))
3027
		return ERR_PTR(-ENOENT);
M
Miklos Szeredi 已提交
3028

3029
	file->f_mode &= ~FMODE_CREATED;
3030 3031 3032 3033 3034
	dentry = d_lookup(dir, &nd->last);
	for (;;) {
		if (!dentry) {
			dentry = d_alloc_parallel(dir, &nd->last, &wq);
			if (IS_ERR(dentry))
3035
				return dentry;
3036 3037 3038
		}
		if (d_in_lookup(dentry))
			break;
M
Miklos Szeredi 已提交
3039

3040 3041 3042 3043 3044 3045 3046 3047 3048 3049
		error = d_revalidate(dentry, nd->flags);
		if (likely(error > 0))
			break;
		if (error)
			goto out_dput;
		d_invalidate(dentry);
		dput(dentry);
		dentry = NULL;
	}
	if (dentry->d_inode) {
3050
		/* Cached positive dentry: will open in f_op->open */
3051
		return dentry;
3052
	}
M
Miklos Szeredi 已提交
3053

3054 3055 3056 3057 3058 3059 3060 3061 3062
	/*
	 * Checking write permission is tricky, bacuse we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists.  But checking existence breaks atomicity.  The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returing the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
	 */
3063 3064
	if (unlikely(!got_write))
		open_flag &= ~O_TRUNC;
3065
	if (open_flag & O_CREAT) {
3066 3067
		if (open_flag & O_EXCL)
			open_flag &= ~O_TRUNC;
3068 3069
		if (!IS_POSIXACL(dir->d_inode))
			mode &= ~current_umask();
3070
		if (likely(got_write))
3071
			create_error = may_o_create(&nd->path, dentry, mode);
3072 3073
		else
			create_error = -EROFS;
M
Miklos Szeredi 已提交
3074
	}
3075 3076
	if (create_error)
		open_flag &= ~O_CREAT;
3077
	if (dir_inode->i_op->atomic_open) {
3078
		dentry = atomic_open(nd, dentry, file, open_flag, mode);
3079 3080 3081
		if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
			dentry = ERR_PTR(create_error);
		return dentry;
M
Miklos Szeredi 已提交
3082
	}
3083

3084
	if (d_in_lookup(dentry)) {
3085 3086
		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
							     nd->flags);
3087
		d_lookup_done(dentry);
3088 3089 3090 3091 3092 3093 3094 3095
		if (unlikely(res)) {
			if (IS_ERR(res)) {
				error = PTR_ERR(res);
				goto out_dput;
			}
			dput(dentry);
			dentry = res;
		}
3096 3097
	}

M
Miklos Szeredi 已提交
3098
	/* Negative dentry, just create the file */
3099
	if (!dentry->d_inode && (open_flag & O_CREAT)) {
3100
		file->f_mode |= FMODE_CREATED;
3101 3102 3103
		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
		if (!dir_inode->i_op->create) {
			error = -EACCES;
M
Miklos Szeredi 已提交
3104
			goto out_dput;
3105 3106
		}
		error = dir_inode->i_op->create(dir_inode, dentry, mode,
3107
						open_flag & O_EXCL);
M
Miklos Szeredi 已提交
3108 3109 3110
		if (error)
			goto out_dput;
	}
3111 3112 3113
	if (unlikely(create_error) && !dentry->d_inode) {
		error = create_error;
		goto out_dput;
M
Miklos Szeredi 已提交
3114
	}
3115
	return dentry;
M
Miklos Szeredi 已提交
3116 3117 3118

out_dput:
	dput(dentry);
3119
	return ERR_PTR(error);
M
Miklos Szeredi 已提交
3120 3121
}

3122
static const char *open_last_lookups(struct nameidata *nd,
3123
		   struct file *file, const struct open_flags *op)
3124
{
3125
	struct dentry *dir = nd->path.dentry;
3126
	int open_flag = op->open_flag;
3127
	bool got_write = false;
3128
	unsigned seq;
3129
	struct inode *inode;
3130
	struct dentry *dentry;
3131
	const char *res;
3132

3133 3134
	nd->flags |= op->intent;

3135
	if (nd->last_type != LAST_NORM) {
3136 3137
		if (nd->depth)
			put_link(nd);
3138
		return handle_dots(nd, nd->last_type);
3139
	}
3140

3141
	if (!(open_flag & O_CREAT)) {
3142 3143 3144
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		/* we _can_ be in RCU mode here */
3145 3146
		dentry = lookup_fast(nd, &inode, &seq);
		if (IS_ERR(dentry))
3147
			return ERR_CAST(dentry);
3148
		if (likely(dentry))
3149 3150
			goto finish_lookup;

A
Al Viro 已提交
3151
		BUG_ON(nd->flags & LOOKUP_RCU);
3152 3153
	} else {
		/* create side of things */
3154
		if (nd->flags & LOOKUP_RCU) {
3155 3156
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
3157
		}
3158
		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
3159
		/* trailing slashes? */
3160
		if (unlikely(nd->last.name[nd->last.len]))
3161
			return ERR_PTR(-EISDIR);
3162
	}
A
Al Viro 已提交
3163

3164
	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
3165
		got_write = !mnt_want_write(nd->path.mnt);
3166 3167 3168 3169 3170 3171
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
3172 3173 3174 3175
	if (open_flag & O_CREAT)
		inode_lock(dir->d_inode);
	else
		inode_lock_shared(dir->d_inode);
3176
	dentry = lookup_open(nd, file, op, got_write);
3177 3178
	if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
		fsnotify_create(dir->d_inode, dentry);
3179 3180 3181 3182
	if (open_flag & O_CREAT)
		inode_unlock(dir->d_inode);
	else
		inode_unlock_shared(dir->d_inode);
3183

3184
	if (got_write)
3185
		mnt_drop_write(nd->path.mnt);
M
Miklos Szeredi 已提交
3186

3187 3188 3189
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);

3190
	if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
3191 3192
		dput(nd->path.dentry);
		nd->path.dentry = dentry;
3193
		return NULL;
3194 3195
	}

3196
finish_lookup:
3197 3198
	if (nd->depth)
		put_link(nd);
3199
	res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
3200
	if (unlikely(res))
3201
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
3202
	return res;
3203 3204 3205 3206 3207
}

/*
 * Handle the last step of open()
 */
3208
static int do_open(struct nameidata *nd,
3209 3210 3211 3212 3213 3214 3215
		   struct file *file, const struct open_flags *op)
{
	int open_flag = op->open_flag;
	bool do_truncate;
	int acc_mode;
	int error;

3216 3217 3218 3219 3220
	if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
		error = complete_walk(nd);
		if (error)
			return error;
	}
3221 3222
	if (!(file->f_mode & FMODE_CREATED))
		audit_inode(nd->name, nd->path.dentry, 0);
3223
	if (open_flag & O_CREAT) {
3224 3225
		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
			return -EEXIST;
3226
		if (d_is_dir(nd->path.dentry))
3227
			return -EISDIR;
3228
		error = may_create_in_sticky(nd->dir_mode, nd->dir_uid,
3229 3230
					     d_backing_inode(nd->path.dentry));
		if (unlikely(error))
3231
			return error;
3232
	}
M
Miklos Szeredi 已提交
3233
	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3234
		return -ENOTDIR;
3235

3236 3237
	do_truncate = false;
	acc_mode = op->acc_mode;
3238 3239 3240 3241
	if (file->f_mode & FMODE_CREATED) {
		/* Don't check for write permission, don't truncate */
		open_flag &= ~O_TRUNC;
		acc_mode = 0;
3242
	} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
3243 3244
		error = mnt_want_write(nd->path.mnt);
		if (error)
3245
			return error;
3246
		do_truncate = true;
3247
	}
3248
	error = may_open(&nd->path, acc_mode, open_flag);
3249
	if (!error && !(file->f_mode & FMODE_OPENED))
A
Al Viro 已提交
3250
		error = vfs_open(&nd->path, file);
3251 3252 3253
	if (!error)
		error = ima_file_check(file, op->acc_mode);
	if (!error && do_truncate)
3254
		error = handle_truncate(file);
3255 3256 3257 3258
	if (unlikely(error > 0)) {
		WARN_ON(1);
		error = -EINVAL;
	}
3259
	if (do_truncate)
3260
		mnt_drop_write(nd->path.mnt);
3261
	return error;
3262 3263
}

3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278
/*
 * vfs_tmpfile - create an unnamed temporary file in a directory
 * @dentry:	dentry of the directory to create the file in
 * @mode:	mode handed to the filesystem's ->tmpfile()
 * @open_flag:	open flags; unless O_EXCL is set the new inode is marked
 *		I_LINKABLE
 *
 * Returns the dentry of the new file, or an ERR_PTR():
 * the inode_permission() error if the directory is not writable and
 * searchable, -EOPNOTSUPP if the directory has no ->tmpfile() method,
 * -ENOMEM if d_alloc() fails, the ->tmpfile() error, or -ENOENT if
 * ->tmpfile() succeeded but left the dentry negative.
 */
struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
{
	struct dentry *child = NULL;
	struct inode *dir = dentry->d_inode;
	struct inode *inode;
	int error;

	/* we want directory to be writable */
	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
	if (error)
		goto out_err;
	error = -EOPNOTSUPP;
	if (!dir->i_op->tmpfile)
		goto out_err;
	error = -ENOMEM;
	child = d_alloc(dentry, &slash_name);
	if (unlikely(!child))
		goto out_err;
	error = dir->i_op->tmpfile(dir, child, mode);
	if (error)
		goto out_err;
	error = -ENOENT;
	inode = child->d_inode;
	if (unlikely(!inode))
		goto out_err;
	if (!(open_flag & O_EXCL)) {
		spin_lock(&inode->i_lock);
		inode->i_state |= I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
	ima_post_create_tmpfile(inode);
	return child;

out_err:
	/* child is still NULL on the exits taken before d_alloc() succeeded */
	dput(child);
	return ERR_PTR(error);
}
EXPORT_SYMBOL(vfs_tmpfile);

3303
static int do_tmpfile(struct nameidata *nd, unsigned flags,
3304
		const struct open_flags *op,
3305
		struct file *file)
3306
{
3307 3308
	struct dentry *child;
	struct path path;
3309
	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
3310 3311
	if (unlikely(error))
		return error;
3312
	error = mnt_want_write(path.mnt);
3313 3314
	if (unlikely(error))
		goto out;
3315 3316
	child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
	error = PTR_ERR(child);
3317
	if (IS_ERR(child))
3318
		goto out2;
3319 3320
	dput(path.dentry);
	path.dentry = child;
3321
	audit_inode(nd->name, child, 0);
3322
	/* Don't check for other permissions, the inode was just created */
A
Al Viro 已提交
3323
	error = may_open(&path, 0, op->open_flag);
3324 3325
	if (error)
		goto out2;
3326
	file->f_path.mnt = path.mnt;
3327
	error = finish_open(file, child, NULL);
3328
out2:
3329
	mnt_drop_write(path.mnt);
3330
out:
3331
	path_put(&path);
3332 3333 3334
	return error;
}

3335 3336 3337 3338 3339 3340
/*
 * do_o_path - open path for the O_PATH case
 * @nd:		lookup state
 * @flags:	lookup flags
 * @file:	file to open on the looked-up path
 *
 * Resolves the path with path_lookupat() and, if that succeeds, audits
 * the dentry, opens the file with vfs_open() and drops the path reference.
 *
 * Returns the path_lookupat() error, or else the vfs_open() result.
 */
static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
	struct path path;
	int error = path_lookupat(nd, flags, &path);
	if (!error) {
		audit_inode(nd->name, path.dentry, 0);
		error = vfs_open(&path, file);
		path_put(&path);
	}
	return error;
}

3347 3348
static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
L
Linus Torvalds 已提交
3349
{
A
Al Viro 已提交
3350
	struct file *file;
3351
	int error;
N
Nick Piggin 已提交
3352

3353
	file = alloc_empty_file(op->open_flag, current_cred());
3354 3355
	if (IS_ERR(file))
		return file;
N
Nick Piggin 已提交
3356

A
Al Viro 已提交
3357
	if (unlikely(file->f_flags & __O_TMPFILE)) {
3358
		error = do_tmpfile(nd, flags, op, file);
3359
	} else if (unlikely(file->f_flags & O_PATH)) {
3360
		error = do_o_path(nd, flags, file);
3361 3362 3363
	} else {
		const char *s = path_init(nd, flags);
		while (!(error = link_path_walk(s, nd)) &&
3364
		       (s = open_last_lookups(nd, file, op)) != NULL)
3365
			;
3366 3367
		if (!error)
			error = do_open(nd, file, op);
3368
		terminate_walk(nd);
3369
	}
3370
	if (likely(!error)) {
3371
		if (likely(file->f_mode & FMODE_OPENED))
3372 3373 3374
			return file;
		WARN_ON(1);
		error = -EINVAL;
3375
	}
3376 3377 3378 3379 3380 3381
	fput(file);
	if (error == -EOPENSTALE) {
		if (flags & LOOKUP_RCU)
			error = -ECHILD;
		else
			error = -ESTALE;
3382
	}
3383
	return ERR_PTR(error);
L
Linus Torvalds 已提交
3384 3385
}

3386
struct file *do_filp_open(int dfd, struct filename *pathname,
3387
		const struct open_flags *op)
3388
{
3389
	struct nameidata nd;
3390
	int flags = op->lookup_flags;
3391 3392
	struct file *filp;

3393
	set_nameidata(&nd, dfd, pathname);
3394
	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
3395
	if (unlikely(filp == ERR_PTR(-ECHILD)))
3396
		filp = path_openat(&nd, op, flags);
3397
	if (unlikely(filp == ERR_PTR(-ESTALE)))
3398
		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
3399
	restore_nameidata();
3400 3401 3402
	return filp;
}

A
Al Viro 已提交
3403
struct file *do_file_open_root(const struct path *root,
3404
		const char *name, const struct open_flags *op)
A
Al Viro 已提交
3405
{
3406
	struct nameidata nd;
A
Al Viro 已提交
3407
	struct file *file;
3408
	struct filename *filename;
3409
	int flags = op->lookup_flags;
A
Al Viro 已提交
3410

A
Al Viro 已提交
3411
	if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN)
A
Al Viro 已提交
3412 3413
		return ERR_PTR(-ELOOP);

3414
	filename = getname_kernel(name);
3415
	if (IS_ERR(filename))
3416 3417
		return ERR_CAST(filename);

3418
	set_nameidata(&nd, -1, filename);
3419 3420
	nd.root = *root;
	nd.state = ND_ROOT_PRESET;
3421
	file = path_openat(&nd, op, flags | LOOKUP_RCU);
A
Al Viro 已提交
3422
	if (unlikely(file == ERR_PTR(-ECHILD)))
3423
		file = path_openat(&nd, op, flags);
A
Al Viro 已提交
3424
	if (unlikely(file == ERR_PTR(-ESTALE)))
3425
		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
3426
	restore_nameidata();
3427
	putname(filename);
A
Al Viro 已提交
3428 3429 3430
	return file;
}

3431
static struct dentry *filename_create(int dfd, struct filename *name,
3432
				struct path *path, unsigned int lookup_flags)
L
Linus Torvalds 已提交
3433
{
3434
	struct dentry *dentry = ERR_PTR(-EEXIST);
3435 3436
	struct qstr last;
	int type;
3437
	int err2;
3438 3439 3440 3441 3442 3443 3444 3445 3446
	int error;
	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);

	/*
	 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
	 * other flags passed in are ignored!
	 */
	lookup_flags &= LOOKUP_REVAL;

3447 3448 3449
	name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
	if (IS_ERR(name))
		return ERR_CAST(name);
L
Linus Torvalds 已提交
3450

3451 3452 3453 3454
	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
3455
	if (unlikely(type != LAST_NORM))
A
Al Viro 已提交
3456
		goto out;
3457

3458
	/* don't fail immediately if it's r/o, at least try to report other errors */
3459
	err2 = mnt_want_write(path->mnt);
3460 3461 3462
	/*
	 * Do the final lookup.
	 */
3463
	lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
A
Al Viro 已提交
3464
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
3465
	dentry = __lookup_hash(&last, path->dentry, lookup_flags);
L
Linus Torvalds 已提交
3466
	if (IS_ERR(dentry))
3467
		goto unlock;
3468

3469
	error = -EEXIST;
3470
	if (d_is_positive(dentry))
3471
		goto fail;
3472

3473 3474 3475 3476 3477 3478
	/*
	 * Special case - lookup gave negative, but... we had foo/bar/
	 * From the vfs_mknod() POV we just have a negative dentry -
	 * all is fine. Let's be bastards - you had / on the end, you've
	 * been asking for (non-existent) directory. -ENOENT for you.
	 */
3479
	if (unlikely(!is_dir && last.name[last.len])) {
3480
		error = -ENOENT;
A
Al Viro 已提交
3481
		goto fail;
3482
	}
3483 3484
	if (unlikely(err2)) {
		error = err2;
3485
		goto fail;
3486
	}
3487
	putname(name);
L
Linus Torvalds 已提交
3488 3489
	return dentry;
fail:
3490 3491 3492
	dput(dentry);
	dentry = ERR_PTR(error);
unlock:
A
Al Viro 已提交
3493
	inode_unlock(path->dentry->d_inode);
3494
	if (!err2)
3495
		mnt_drop_write(path->mnt);
A
Al Viro 已提交
3496
out:
3497
	path_put(path);
3498
	putname(name);
L
Linus Torvalds 已提交
3499 3500
	return dentry;
}
3501 3502 3503 3504

/*
 * kern_path_create - filename_create() for a kernel-space pathname
 * @dfd:		directory file descriptor the lookup starts from
 * @pathname:		kernel-space name of the object to create
 * @path:		filled in with the parent directory
 * @lookup_flags:	passed through to filename_create()
 *
 * Wraps @pathname with getname_kernel() and returns whatever
 * filename_create() returns.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	return filename_create(dfd, getname_kernel(pathname),
				path, lookup_flags);
}
3508 3509
EXPORT_SYMBOL(kern_path_create);

A
Al Viro 已提交
3510 3511 3512
void done_path_create(struct path *path, struct dentry *dentry)
{
	dput(dentry);
A
Al Viro 已提交
3513
	inode_unlock(path->dentry->d_inode);
3514
	mnt_drop_write(path->mnt);
A
Al Viro 已提交
3515 3516 3517 3518
	path_put(path);
}
EXPORT_SYMBOL(done_path_create);

A
Al Viro 已提交
3519
/*
 * user_path_create - filename_create() for a user-space pathname
 * @dfd:		directory file descriptor the lookup starts from
 * @pathname:		user-space name of the object to create
 * @path:		filled in with the parent directory
 * @lookup_flags:	passed through to filename_create()
 *
 * Fetches @pathname with getname() and returns whatever
 * filename_create() returns.
 */
inline struct dentry *user_path_create(int dfd, const char __user *pathname,
				struct path *path, unsigned int lookup_flags)
{
	return filename_create(dfd, getname(pathname), path, lookup_flags);
}
EXPORT_SYMBOL(user_path_create);

A
Al Viro 已提交
3526
/*
 * vfs_mknod - create a device node, FIFO or socket
 * @dir:	inode of the parent directory
 * @dentry:	dentry of the node to create
 * @mode:	file type and permission bits of the new node
 * @dev:	device number of the new node
 *
 * Character and block devices require CAP_MKNOD, except for a character
 * device whose number is WHITEOUT_DEV.
 *
 * Returns 0 on success or a negative errno: the may_create() error,
 * -EPERM for a missing capability or a missing ->mknod() method, or the
 * error from devcgroup_inode_mknod(), security_inode_mknod() or
 * ->mknod().  fsnotify_create() is called only after ->mknod() succeeded.
 */
int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
	int error = may_create(dir, dentry);

	if (error)
		return error;

	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
	    !capable(CAP_MKNOD))
		return -EPERM;

	if (!dir->i_op->mknod)
		return -EPERM;

	error = devcgroup_inode_mknod(mode, dev);
	if (error)
		return error;

	error = security_inode_mknod(dir, dentry, mode, dev);
	if (error)
		return error;

	error = dir->i_op->mknod(dir, dentry, mode, dev);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
3554
EXPORT_SYMBOL(vfs_mknod);
L
Linus Torvalds 已提交
3555

A
Al Viro 已提交
3556
/*
 * may_mknod - validate the file type requested from mknod
 * @mode: mode argument; only the S_IFMT bits are examined
 *
 * Returns 0 for regular files, character and block devices, FIFOs,
 * sockets and a zero type; -EPERM for directories; -EINVAL for any other
 * type.
 */
static int may_mknod(umode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
	case 0: /* zero mode translates to S_IFREG */
		return 0;
	case S_IFDIR:
		return -EPERM;
	default:
		return -EINVAL;
	}
}

3573
/*
 * do_mknodat - common implementation of the mknod and mknodat syscalls
 * @dfd:	directory file descriptor the lookup starts from
 * @filename:	user-space name of the node to create
 * @mode:	file type and permission bits; the umask is applied unless
 *		the parent directory has POSIX ACLs enabled
 * @dev:	encoded device number, decoded with new_decode_dev() for
 *		character and block devices
 *
 * Regular files (type 0 or S_IFREG) go through vfs_create(), devices
 * through vfs_mknod() with the decoded device number, FIFOs and sockets
 * through vfs_mknod() with device number 0.  The whole operation is
 * retried with LOOKUP_REVAL when retry_estale() asks for it.
 *
 * Returns 0 on success or a negative errno.
 */
static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
		unsigned int dev)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = 0;

	error = may_mknod(mode);
	if (error)
		return error;
retry:
	dentry = user_path_create(dfd, filename, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = security_path_mknod(&path, dentry, mode, dev);
	if (error)
		goto out;
	switch (mode & S_IFMT) {
		case 0: case S_IFREG:
			error = vfs_create(path.dentry->d_inode,dentry,mode,true);
			if (!error)
				ima_post_path_mknod(dentry);
			break;
		case S_IFCHR: case S_IFBLK:
			error = vfs_mknod(path.dentry->d_inode,dentry,mode,
					new_decode_dev(dev));
			break;
		case S_IFIFO: case S_IFSOCK:
			error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
			break;
	}
out:
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

3617 3618 3619 3620 3621 3622
/* mknodat syscall entry: forwards all four arguments to do_mknodat(). */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned int, dev)
{
	return do_mknodat(dfd, filename, mode, dev);
}

A
Al Viro 已提交
3623
/* mknod syscall entry: do_mknodat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
	return do_mknodat(AT_FDCWD, filename, mode, dev);
}

3628
/*
 * vfs_mkdir - create a directory
 * @dir:	inode of the parent directory
 * @dentry:	dentry of the directory to create
 * @mode:	requested mode; reduced to S_IRWXUGO | S_ISVTX
 *
 * Returns 0 on success or a negative errno: the may_create() error,
 * -EPERM if the parent has no ->mkdir() method, the
 * security_inode_mkdir() error, -EMLINK if the superblock sets a non-zero
 * s_max_links and the parent's link count has reached it, or the
 * ->mkdir() error.  fsnotify_mkdir() is called only after ->mkdir()
 * succeeded.
 */
int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int error = may_create(dir, dentry);
	unsigned max_links = dir->i_sb->s_max_links;

	if (error)
		return error;

	if (!dir->i_op->mkdir)
		return -EPERM;

	mode &= (S_IRWXUGO|S_ISVTX);
	error = security_inode_mkdir(dir, dentry, mode);
	if (error)
		return error;

	if (max_links && dir->i_nlink >= max_links)
		return -EMLINK;

	error = dir->i_op->mkdir(dir, dentry, mode);
	if (!error)
		fsnotify_mkdir(dir, dentry);
	return error;
}
3652
EXPORT_SYMBOL(vfs_mkdir);
L
Linus Torvalds 已提交
3653

3654
/*
 * do_mkdirat - common implementation of the mkdir and mkdirat syscalls
 * @dfd:	directory file descriptor the lookup starts from
 * @pathname:	user-space name of the directory to create
 * @mode:	requested mode; the umask is applied unless the parent
 *		directory has POSIX ACLs enabled
 *
 * The create lookup is done with LOOKUP_DIRECTORY.  The whole operation
 * is retried with LOOKUP_REVAL when retry_estale() asks for it.
 *
 * Returns 0 on success or a negative errno.
 */
static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = LOOKUP_DIRECTORY;

retry:
	dentry = user_path_create(dfd, pathname, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = security_path_mkdir(&path, dentry, mode);
	if (!error)
		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

3679 3680 3681 3682 3683
/* mkdirat syscall entry: forwards all three arguments to do_mkdirat(). */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(dfd, pathname, mode);
}

3684
/* mkdir syscall entry: do_mkdirat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(AT_FDCWD, pathname, mode);
}

L
Linus Torvalds 已提交
3689 3690 3691 3692 3693 3694 3695
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	int error = may_delete(dir, dentry, 1);

	if (error)
		return error;

A
Al Viro 已提交
3696
	if (!dir->i_op->rmdir)
L
Linus Torvalds 已提交
3697 3698
		return -EPERM;

3699
	dget(dentry);
A
Al Viro 已提交
3700
	inode_lock(dentry->d_inode);
S
Sage Weil 已提交
3701 3702

	error = -EBUSY;
3703
	if (is_local_mountpoint(dentry))
S
Sage Weil 已提交
3704 3705 3706 3707 3708 3709 3710 3711 3712 3713
		goto out;

	error = security_inode_rmdir(dir, dentry);
	if (error)
		goto out;

	error = dir->i_op->rmdir(dir, dentry);
	if (error)
		goto out;

3714
	shrink_dcache_parent(dentry);
S
Sage Weil 已提交
3715 3716
	dentry->d_inode->i_flags |= S_DEAD;
	dont_mount(dentry);
3717
	detach_mounts(dentry);
3718
	fsnotify_rmdir(dir, dentry);
S
Sage Weil 已提交
3719 3720

out:
A
Al Viro 已提交
3721
	inode_unlock(dentry->d_inode);
3722
	dput(dentry);
S
Sage Weil 已提交
3723
	if (!error)
L
Linus Torvalds 已提交
3724 3725 3726
		d_delete(dentry);
	return error;
}
3727
EXPORT_SYMBOL(vfs_rmdir);
L
Linus Torvalds 已提交
3728

3729
long do_rmdir(int dfd, struct filename *name)
L
Linus Torvalds 已提交
3730 3731 3732
{
	int error = 0;
	struct dentry *dentry;
3733 3734 3735
	struct path path;
	struct qstr last;
	int type;
3736 3737
	unsigned int lookup_flags = 0;
retry:
3738
	name = filename_parentat(dfd, name, lookup_flags,
A
Al Viro 已提交
3739
				&path, &last, &type);
3740 3741
	if (IS_ERR(name))
		return PTR_ERR(name);
L
Linus Torvalds 已提交
3742

3743
	switch (type) {
3744 3745 3746 3747 3748 3749 3750 3751 3752
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto exit1;
	case LAST_DOT:
		error = -EINVAL;
		goto exit1;
	case LAST_ROOT:
		error = -EBUSY;
		goto exit1;
L
Linus Torvalds 已提交
3753
	}
3754

3755
	error = mnt_want_write(path.mnt);
3756 3757
	if (error)
		goto exit1;
3758

A
Al Viro 已提交
3759
	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
3760
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
L
Linus Torvalds 已提交
3761
	error = PTR_ERR(dentry);
3762 3763
	if (IS_ERR(dentry))
		goto exit2;
3764 3765 3766 3767
	if (!dentry->d_inode) {
		error = -ENOENT;
		goto exit3;
	}
3768
	error = security_path_rmdir(&path, dentry);
3769
	if (error)
3770
		goto exit3;
3771
	error = vfs_rmdir(path.dentry->d_inode, dentry);
3772
exit3:
3773 3774
	dput(dentry);
exit2:
A
Al Viro 已提交
3775
	inode_unlock(path.dentry->d_inode);
3776
	mnt_drop_write(path.mnt);
L
Linus Torvalds 已提交
3777
exit1:
3778
	path_put(&path);
3779 3780 3781 3782
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
A
Al Viro 已提交
3783
	putname(name);
L
Linus Torvalds 已提交
3784 3785 3786
	return error;
}

3787
/*
 * rmdir syscall entry: fetches the user-space name with getname() and
 * hands it to do_rmdir() with AT_FDCWD as the directory fd.
 */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
	return do_rmdir(AT_FDCWD, getname(pathname));
}

3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810
/**
 * vfs_unlink - unlink a filesystem object
 * @dir:	parent directory
 * @dentry:	victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 */
int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
3811
{
J
J. Bruce Fields 已提交
3812
	struct inode *target = dentry->d_inode;
L
Linus Torvalds 已提交
3813 3814 3815 3816 3817
	int error = may_delete(dir, dentry, 0);

	if (error)
		return error;

A
Al Viro 已提交
3818
	if (!dir->i_op->unlink)
L
Linus Torvalds 已提交
3819 3820
		return -EPERM;

A
Al Viro 已提交
3821
	inode_lock(target);
3822
	if (is_local_mountpoint(dentry))
L
Linus Torvalds 已提交
3823 3824 3825
		error = -EBUSY;
	else {
		error = security_inode_unlink(dir, dentry);
3826
		if (!error) {
3827 3828
			error = try_break_deleg(target, delegated_inode);
			if (error)
3829
				goto out;
L
Linus Torvalds 已提交
3830
			error = dir->i_op->unlink(dir, dentry);
3831
			if (!error) {
3832
				dont_mount(dentry);
3833
				detach_mounts(dentry);
3834
				fsnotify_unlink(dir, dentry);
3835
			}
3836
		}
L
Linus Torvalds 已提交
3837
	}
3838
out:
A
Al Viro 已提交
3839
	inode_unlock(target);
L
Linus Torvalds 已提交
3840 3841 3842

	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
J
J. Bruce Fields 已提交
3843
		fsnotify_link_count(target);
J
John McCutchan 已提交
3844
		d_delete(dentry);
L
Linus Torvalds 已提交
3845
	}
R
Robert Love 已提交
3846

L
Linus Torvalds 已提交
3847 3848
	return error;
}
3849
EXPORT_SYMBOL(vfs_unlink);
L
Linus Torvalds 已提交
3850 3851 3852

/*
 * Make sure that the actual truncation of the file will occur outside its
3853
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
L
Linus Torvalds 已提交
3854 3855 3856
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 */
3857
long do_unlinkat(int dfd, struct filename *name)
L
Linus Torvalds 已提交
3858
{
3859
	int error;
L
Linus Torvalds 已提交
3860
	struct dentry *dentry;
3861 3862 3863
	struct path path;
	struct qstr last;
	int type;
L
Linus Torvalds 已提交
3864
	struct inode *inode = NULL;
3865
	struct inode *delegated_inode = NULL;
3866 3867
	unsigned int lookup_flags = 0;
retry:
3868
	name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
3869 3870
	if (IS_ERR(name))
		return PTR_ERR(name);
3871

L
Linus Torvalds 已提交
3872
	error = -EISDIR;
3873
	if (type != LAST_NORM)
L
Linus Torvalds 已提交
3874
		goto exit1;
3875

3876
	error = mnt_want_write(path.mnt);
3877 3878
	if (error)
		goto exit1;
3879
retry_deleg:
A
Al Viro 已提交
3880
	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
3881
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
L
Linus Torvalds 已提交
3882 3883 3884
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
		/* Why not before? Because we want correct error value */
3885
		if (last.name[last.len])
3886
			goto slashes;
L
Linus Torvalds 已提交
3887
		inode = dentry->d_inode;
3888
		if (d_is_negative(dentry))
3889 3890
			goto slashes;
		ihold(inode);
3891
		error = security_path_unlink(&path, dentry);
3892
		if (error)
3893
			goto exit2;
3894
		error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
3895
exit2:
L
Linus Torvalds 已提交
3896 3897
		dput(dentry);
	}
A
Al Viro 已提交
3898
	inode_unlock(path.dentry->d_inode);
L
Linus Torvalds 已提交
3899 3900
	if (inode)
		iput(inode);	/* truncate the inode here */
3901 3902
	inode = NULL;
	if (delegated_inode) {
3903
		error = break_deleg_wait(&delegated_inode);
3904 3905 3906
		if (!error)
			goto retry_deleg;
	}
3907
	mnt_drop_write(path.mnt);
L
Linus Torvalds 已提交
3908
exit1:
3909
	path_put(&path);
3910 3911 3912 3913 3914
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		inode = NULL;
		goto retry;
	}
3915
	putname(name);
L
Linus Torvalds 已提交
3916 3917 3918
	return error;

slashes:
3919 3920
	if (d_is_negative(dentry))
		error = -ENOENT;
M
Miklos Szeredi 已提交
3921
	else if (d_is_dir(dentry))
3922 3923 3924
		error = -EISDIR;
	else
		error = -ENOTDIR;
L
Linus Torvalds 已提交
3925 3926 3927
	goto exit2;
}

3928
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
3929 3930 3931 3932 3933
{
	if ((flag & ~AT_REMOVEDIR) != 0)
		return -EINVAL;

	if (flag & AT_REMOVEDIR)
3934
		return do_rmdir(dfd, getname(pathname));
3935
	return do_unlinkat(dfd, getname(pathname));
3936 3937
}

3938
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
3939
{
3940
	return do_unlinkat(AT_FDCWD, getname(pathname));
3941 3942
}

3943
int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
L
Linus Torvalds 已提交
3944
{
3945
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
3946 3947 3948 3949

	if (error)
		return error;

A
Al Viro 已提交
3950
	if (!dir->i_op->symlink)
L
Linus Torvalds 已提交
3951 3952 3953 3954 3955 3956 3957
		return -EPERM;

	error = security_inode_symlink(dir, dentry, oldname);
	if (error)
		return error;

	error = dir->i_op->symlink(dir, dentry, oldname);
3958
	if (!error)
3959
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
3960 3961
	return error;
}
3962
EXPORT_SYMBOL(vfs_symlink);
L
Linus Torvalds 已提交
3963

3964
static long do_symlinkat(const char __user *oldname, int newdfd,
3965
		  const char __user *newname)
L
Linus Torvalds 已提交
3966
{
3967
	int error;
3968
	struct filename *from;
3969
	struct dentry *dentry;
3970
	struct path path;
3971
	unsigned int lookup_flags = 0;
L
Linus Torvalds 已提交
3972 3973

	from = getname(oldname);
3974
	if (IS_ERR(from))
L
Linus Torvalds 已提交
3975
		return PTR_ERR(from);
3976 3977
retry:
	dentry = user_path_create(newdfd, newname, &path, lookup_flags);
3978 3979
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
3980
		goto out_putname;
3981

3982
	error = security_path_symlink(&path, dentry, from->name);
3983
	if (!error)
3984
		error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
A
Al Viro 已提交
3985
	done_path_create(&path, dentry);
3986 3987 3988 3989
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
3990
out_putname:
L
Linus Torvalds 已提交
3991 3992 3993 3994
	putname(from);
	return error;
}

3995 3996 3997 3998 3999 4000
/*
 * symlinkat(2) entry point: passes its three arguments through to
 * do_symlinkat() and returns that function's result.
 */
SYSCALL_DEFINE3(symlinkat,
		const char __user *, oldname,
		int, newdfd,
		const char __user *, newname)
{
	return do_symlinkat(oldname, newdfd, newname);
}

4001
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
4002
{
4003
	return do_symlinkat(oldname, AT_FDCWD, newname);
4004 4005
}

J
J. Bruce Fields 已提交
4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025
/**
 * vfs_link - create a new link
 * @old_dentry:	object to be linked
 * @dir:	new parent
 * @new_dentry:	where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 */
int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
4026 4027
{
	struct inode *inode = old_dentry->d_inode;
4028
	unsigned max_links = dir->i_sb->s_max_links;
L
Linus Torvalds 已提交
4029 4030 4031 4032 4033
	int error;

	if (!inode)
		return -ENOENT;

4034
	error = may_create(dir, new_dentry);
L
Linus Torvalds 已提交
4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045
	if (error)
		return error;

	if (dir->i_sb != inode->i_sb)
		return -EXDEV;

	/*
	 * A link to an append-only or immutable file cannot be created.
	 */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return -EPERM;
4046 4047 4048 4049 4050 4051 4052
	/*
	 * Updating the link count will likely cause i_uid and i_gid to
	 * be writen back improperly if their true value is unknown to
	 * the vfs.
	 */
	if (HAS_UNMAPPED_ID(inode))
		return -EPERM;
A
Al Viro 已提交
4053
	if (!dir->i_op->link)
L
Linus Torvalds 已提交
4054
		return -EPERM;
4055
	if (S_ISDIR(inode->i_mode))
L
Linus Torvalds 已提交
4056 4057 4058 4059 4060 4061
		return -EPERM;

	error = security_inode_link(old_dentry, dir, new_dentry);
	if (error)
		return error;

A
Al Viro 已提交
4062
	inode_lock(inode);
4063
	/* Make sure we don't allow creating hardlink to an unlinked file */
4064
	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
4065
		error =  -ENOENT;
4066 4067
	else if (max_links && inode->i_nlink >= max_links)
		error = -EMLINK;
J
J. Bruce Fields 已提交
4068 4069 4070 4071 4072
	else {
		error = try_break_deleg(inode, delegated_inode);
		if (!error)
			error = dir->i_op->link(old_dentry, dir, new_dentry);
	}
4073 4074 4075 4076 4077 4078

	if (!error && (inode->i_state & I_LINKABLE)) {
		spin_lock(&inode->i_lock);
		inode->i_state &= ~I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
A
Al Viro 已提交
4079
	inode_unlock(inode);
4080
	if (!error)
4081
		fsnotify_link(dir, inode, new_dentry);
L
Linus Torvalds 已提交
4082 4083
	return error;
}
4084
EXPORT_SYMBOL(vfs_link);
L
Linus Torvalds 已提交
4085 4086 4087 4088 4089 4090 4091 4092 4093 4094

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 */
4095
static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
4096
	      const char __user *newname, int flags)
L
Linus Torvalds 已提交
4097 4098
{
	struct dentry *new_dentry;
4099
	struct path old_path, new_path;
J
J. Bruce Fields 已提交
4100
	struct inode *delegated_inode = NULL;
4101
	int how = 0;
L
Linus Torvalds 已提交
4102 4103
	int error;

4104
	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
4105
		return -EINVAL;
4106
	/*
4107 4108 4109
	 * To use null names we require CAP_DAC_READ_SEARCH
	 * This ensures that not everyone will be able to create
	 * handlink using the passed filedescriptor.
4110
	 */
4111 4112 4113
	if (flags & AT_EMPTY_PATH) {
		if (!capable(CAP_DAC_READ_SEARCH))
			return -ENOENT;
4114
		how = LOOKUP_EMPTY;
4115
	}
4116 4117 4118

	if (flags & AT_SYMLINK_FOLLOW)
		how |= LOOKUP_FOLLOW;
4119
retry:
4120
	error = user_path_at(olddfd, oldname, how, &old_path);
L
Linus Torvalds 已提交
4121
	if (error)
4122 4123
		return error;

4124 4125
	new_dentry = user_path_create(newdfd, newname, &new_path,
					(how & LOOKUP_REVAL));
L
Linus Torvalds 已提交
4126
	error = PTR_ERR(new_dentry);
4127
	if (IS_ERR(new_dentry))
4128 4129 4130 4131 4132
		goto out;

	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto out_dput;
K
Kees Cook 已提交
4133 4134 4135
	error = may_linkat(&old_path);
	if (unlikely(error))
		goto out_dput;
4136
	error = security_path_link(old_path.dentry, &new_path, new_dentry);
4137
	if (error)
4138
		goto out_dput;
J
J. Bruce Fields 已提交
4139
	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
4140
out_dput:
A
Al Viro 已提交
4141
	done_path_create(&new_path, new_dentry);
J
J. Bruce Fields 已提交
4142 4143
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
4144 4145
		if (!error) {
			path_put(&old_path);
J
J. Bruce Fields 已提交
4146
			goto retry;
4147
		}
J
J. Bruce Fields 已提交
4148
	}
4149
	if (retry_estale(error, how)) {
4150
		path_put(&old_path);
4151 4152 4153
		how |= LOOKUP_REVAL;
		goto retry;
	}
L
Linus Torvalds 已提交
4154
out:
4155
	path_put(&old_path);
L
Linus Torvalds 已提交
4156 4157 4158 4159

	return error;
}

4160 4161 4162 4163 4164 4165
/*
 * linkat(2) entry point: passes all five arguments through to do_linkat()
 * and returns that function's result.
 */
SYSCALL_DEFINE5(linkat,
		int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname,
		int, flags)
{
	return do_linkat(olddfd, oldname, newdfd, newname, flags);
}

4166
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
4167
{
4168
	return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
4169 4170
}

4171 4172 4173 4174 4175 4176 4177
/**
 * vfs_rename - rename a filesystem object
 * @old_dir:	parent of source
 * @old_dentry:	source
 * @new_dir:	parent of destination
 * @new_dentry:	destination
 * @delegated_inode: returns an inode needing a delegation break
M
Miklos Szeredi 已提交
4178
 * @flags:	rename flags
4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192
 *
 * The caller must hold multiple mutexes--see lock_rename()).
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
L
Linus Torvalds 已提交
4193 4194 4195
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
4196
 *
4197
 *	a) we can get into loop creation.
L
Linus Torvalds 已提交
4198 4199
 *	b) race potential - two innocent renames can create a loop together.
 *	   That's where 4.4 screws up. Current fix: serialization on
4200
 *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
L
Linus Torvalds 已提交
4201
 *	   story.
4202 4203
 *	c) we have to lock _four_ objects - parents and victim (if it exists),
 *	   and source (if it is not a directory).
4204
 *	   And that - after we got ->i_mutex on parents (until then we don't know
L
Linus Torvalds 已提交
4205 4206
 *	   whether the target exists).  Solution: try to be smart with locking
 *	   order for inodes.  We rely on the fact that tree topology may change
4207
 *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
L
Linus Torvalds 已提交
4208 4209 4210
 *	   move will be locked.  Thus we can rank directories by the tree
 *	   (ancestors first) and rank all non-directories after them.
 *	   That works since everybody except rename does "lock parent, lookup,
4211
 *	   lock child" and rename is under ->s_vfs_rename_mutex.
L
Linus Torvalds 已提交
4212 4213 4214
 *	   HOWEVER, it relies on the assumption that any object with ->lookup()
 *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *	   we'd better make sure that there's no link(2) for them.
4215
 *	d) conversion from fhandle to dentry may come in the wrong moment - when
4216
 *	   we are removing the target. Solution: we will have to grab ->i_mutex
L
Linus Torvalds 已提交
4217
 *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4218
 *	   ->i_mutex on parents, which works but leads to some truly excessive
L
Linus Torvalds 已提交
4219 4220
 *	   locking].
 */
4221 4222
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
	       struct inode *new_dir, struct dentry *new_dentry,
M
Miklos Szeredi 已提交
4223
	       struct inode **delegated_inode, unsigned int flags)
L
Linus Torvalds 已提交
4224
{
4225 4226 4227
	int error;
	bool is_dir = d_is_dir(old_dentry);
	struct inode *source = old_dentry->d_inode;
S
Sage Weil 已提交
4228
	struct inode *target = new_dentry->d_inode;
M
Miklos Szeredi 已提交
4229 4230
	bool new_is_dir = false;
	unsigned max_links = new_dir->i_sb->s_max_links;
A
Al Viro 已提交
4231
	struct name_snapshot old_name;
4232

4233
	if (source == target)
4234 4235 4236 4237 4238 4239
		return 0;

	error = may_delete(old_dir, old_dentry, is_dir);
	if (error)
		return error;

M
Miklos Szeredi 已提交
4240
	if (!target) {
4241
		error = may_create(new_dir, new_dentry);
M
Miklos Szeredi 已提交
4242 4243 4244 4245 4246 4247 4248 4249
	} else {
		new_is_dir = d_is_dir(new_dentry);

		if (!(flags & RENAME_EXCHANGE))
			error = may_delete(new_dir, new_dentry, is_dir);
		else
			error = may_delete(new_dir, new_dentry, new_is_dir);
	}
4250 4251 4252
	if (error)
		return error;

4253
	if (!old_dir->i_op->rename)
4254
		return -EPERM;
L
Linus Torvalds 已提交
4255 4256 4257 4258 4259

	/*
	 * If we are going to change the parent - check write permissions,
	 * we'll need to flip '..'.
	 */
M
Miklos Szeredi 已提交
4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270
	if (new_dir != old_dir) {
		if (is_dir) {
			error = inode_permission(source, MAY_WRITE);
			if (error)
				return error;
		}
		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
			error = inode_permission(target, MAY_WRITE);
			if (error)
				return error;
		}
L
Linus Torvalds 已提交
4271 4272
	}

4273 4274
	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
				      flags);
L
Linus Torvalds 已提交
4275 4276 4277
	if (error)
		return error;

A
Al Viro 已提交
4278
	take_dentry_name_snapshot(&old_name, old_dentry);
4279
	dget(new_dentry);
M
Miklos Szeredi 已提交
4280
	if (!is_dir || (flags & RENAME_EXCHANGE))
4281 4282
		lock_two_nondirectories(source, target);
	else if (target)
A
Al Viro 已提交
4283
		inode_lock(target);
S
Sage Weil 已提交
4284 4285

	error = -EBUSY;
4286
	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
S
Sage Weil 已提交
4287 4288
		goto out;

M
Miklos Szeredi 已提交
4289
	if (max_links && new_dir != old_dir) {
4290
		error = -EMLINK;
M
Miklos Szeredi 已提交
4291
		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4292
			goto out;
M
Miklos Szeredi 已提交
4293 4294 4295 4296 4297
		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
		    old_dir->i_nlink >= max_links)
			goto out;
	}
	if (!is_dir) {
4298
		error = try_break_deleg(source, delegated_inode);
4299 4300
		if (error)
			goto out;
M
Miklos Szeredi 已提交
4301 4302 4303 4304 4305
	}
	if (target && !new_is_dir) {
		error = try_break_deleg(target, delegated_inode);
		if (error)
			goto out;
4306
	}
4307
	error = old_dir->i_op->rename(old_dir, old_dentry,
M
Miklos Szeredi 已提交
4308
				       new_dir, new_dentry, flags);
S
Sage Weil 已提交
4309 4310 4311
	if (error)
		goto out;

M
Miklos Szeredi 已提交
4312
	if (!(flags & RENAME_EXCHANGE) && target) {
4313 4314
		if (is_dir) {
			shrink_dcache_parent(new_dentry);
4315
			target->i_flags |= S_DEAD;
4316
		}
S
Sage Weil 已提交
4317
		dont_mount(new_dentry);
4318
		detach_mounts(new_dentry);
4319
	}
M
Miklos Szeredi 已提交
4320 4321 4322 4323 4324 4325
	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
		if (!(flags & RENAME_EXCHANGE))
			d_move(old_dentry, new_dentry);
		else
			d_exchange(old_dentry, new_dentry);
	}
S
Sage Weil 已提交
4326
out:
M
Miklos Szeredi 已提交
4327
	if (!is_dir || (flags & RENAME_EXCHANGE))
4328 4329
		unlock_two_nondirectories(source, target);
	else if (target)
A
Al Viro 已提交
4330
		inode_unlock(target);
L
Linus Torvalds 已提交
4331
	dput(new_dentry);
M
Miklos Szeredi 已提交
4332
	if (!error) {
4333
		fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
M
Miklos Szeredi 已提交
4334 4335
			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
		if (flags & RENAME_EXCHANGE) {
4336
			fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
M
Miklos Szeredi 已提交
4337 4338 4339
				      new_is_dir, NULL, new_dentry);
		}
	}
A
Al Viro 已提交
4340
	release_dentry_name_snapshot(&old_name);
R
Robert Love 已提交
4341

L
Linus Torvalds 已提交
4342 4343
	return error;
}
4344
EXPORT_SYMBOL(vfs_rename);
L
Linus Torvalds 已提交
4345

4346 4347
static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
			const char __user *newname, unsigned int flags)
L
Linus Torvalds 已提交
4348
{
4349 4350
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
4351 4352 4353
	struct path old_path, new_path;
	struct qstr old_last, new_last;
	int old_type, new_type;
4354
	struct inode *delegated_inode = NULL;
4355 4356
	struct filename *from;
	struct filename *to;
4357
	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
4358
	bool should_retry = false;
4359
	int error;
M
Miklos Szeredi 已提交
4360

M
Miklos Szeredi 已提交
4361
	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
M
Miklos Szeredi 已提交
4362 4363
		return -EINVAL;

M
Miklos Szeredi 已提交
4364 4365
	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
	    (flags & RENAME_EXCHANGE))
M
Miklos Szeredi 已提交
4366 4367
		return -EINVAL;

4368 4369 4370
	if (flags & RENAME_EXCHANGE)
		target_flags = 0;

4371
retry:
A
Al Viro 已提交
4372 4373
	from = filename_parentat(olddfd, getname(oldname), lookup_flags,
				&old_path, &old_last, &old_type);
4374 4375
	if (IS_ERR(from)) {
		error = PTR_ERR(from);
L
Linus Torvalds 已提交
4376
		goto exit;
4377
	}
L
Linus Torvalds 已提交
4378

A
Al Viro 已提交
4379 4380
	to = filename_parentat(newdfd, getname(newname), lookup_flags,
				&new_path, &new_last, &new_type);
4381 4382
	if (IS_ERR(to)) {
		error = PTR_ERR(to);
L
Linus Torvalds 已提交
4383
		goto exit1;
4384
	}
L
Linus Torvalds 已提交
4385 4386

	error = -EXDEV;
4387
	if (old_path.mnt != new_path.mnt)
L
Linus Torvalds 已提交
4388 4389 4390
		goto exit2;

	error = -EBUSY;
4391
	if (old_type != LAST_NORM)
L
Linus Torvalds 已提交
4392 4393
		goto exit2;

M
Miklos Szeredi 已提交
4394 4395
	if (flags & RENAME_NOREPLACE)
		error = -EEXIST;
4396
	if (new_type != LAST_NORM)
L
Linus Torvalds 已提交
4397 4398
		goto exit2;

4399
	error = mnt_want_write(old_path.mnt);
4400 4401 4402
	if (error)
		goto exit2;

4403
retry_deleg:
4404
	trap = lock_rename(new_path.dentry, old_path.dentry);
L
Linus Torvalds 已提交
4405

4406
	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
L
Linus Torvalds 已提交
4407 4408 4409 4410 4411
	error = PTR_ERR(old_dentry);
	if (IS_ERR(old_dentry))
		goto exit3;
	/* source must exist */
	error = -ENOENT;
4412
	if (d_is_negative(old_dentry))
L
Linus Torvalds 已提交
4413
		goto exit4;
4414
	new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
M
Miklos Szeredi 已提交
4415 4416 4417 4418 4419 4420
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto exit4;
	error = -EEXIST;
	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
		goto exit5;
M
Miklos Szeredi 已提交
4421 4422 4423 4424 4425 4426 4427
	if (flags & RENAME_EXCHANGE) {
		error = -ENOENT;
		if (d_is_negative(new_dentry))
			goto exit5;

		if (!d_is_dir(new_dentry)) {
			error = -ENOTDIR;
4428
			if (new_last.name[new_last.len])
M
Miklos Szeredi 已提交
4429 4430 4431
				goto exit5;
		}
	}
L
Linus Torvalds 已提交
4432
	/* unless the source is a directory trailing slashes give -ENOTDIR */
M
Miklos Szeredi 已提交
4433
	if (!d_is_dir(old_dentry)) {
L
Linus Torvalds 已提交
4434
		error = -ENOTDIR;
4435
		if (old_last.name[old_last.len])
M
Miklos Szeredi 已提交
4436
			goto exit5;
4437
		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
M
Miklos Szeredi 已提交
4438
			goto exit5;
L
Linus Torvalds 已提交
4439 4440 4441 4442
	}
	/* source should not be ancestor of target */
	error = -EINVAL;
	if (old_dentry == trap)
M
Miklos Szeredi 已提交
4443
		goto exit5;
L
Linus Torvalds 已提交
4444
	/* target should not be an ancestor of source */
M
Miklos Szeredi 已提交
4445 4446
	if (!(flags & RENAME_EXCHANGE))
		error = -ENOTEMPTY;
L
Linus Torvalds 已提交
4447 4448 4449
	if (new_dentry == trap)
		goto exit5;

4450 4451
	error = security_path_rename(&old_path, old_dentry,
				     &new_path, new_dentry, flags);
4452
	if (error)
4453
		goto exit5;
4454 4455
	error = vfs_rename(old_path.dentry->d_inode, old_dentry,
			   new_path.dentry->d_inode, new_dentry,
M
Miklos Szeredi 已提交
4456
			   &delegated_inode, flags);
L
Linus Torvalds 已提交
4457 4458 4459 4460 4461
exit5:
	dput(new_dentry);
exit4:
	dput(old_dentry);
exit3:
4462
	unlock_rename(new_path.dentry, old_path.dentry);
4463 4464 4465 4466 4467
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
4468
	mnt_drop_write(old_path.mnt);
L
Linus Torvalds 已提交
4469
exit2:
4470 4471
	if (retry_estale(error, lookup_flags))
		should_retry = true;
4472
	path_put(&new_path);
4473
	putname(to);
L
Linus Torvalds 已提交
4474
exit1:
4475
	path_put(&old_path);
L
Linus Torvalds 已提交
4476
	putname(from);
4477 4478 4479 4480 4481
	if (should_retry) {
		should_retry = false;
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
4482
exit:
L
Linus Torvalds 已提交
4483 4484 4485
	return error;
}

4486 4487 4488 4489 4490 4491
/*
 * renameat2(2) entry point: passes all five arguments through to
 * do_renameat2() and returns that function's result.
 */
SYSCALL_DEFINE5(renameat2,
		int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname,
		unsigned int, flags)
{
	return do_renameat2(olddfd, oldname, newdfd, newname, flags);
}

M
Miklos Szeredi 已提交
4492 4493 4494
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
4495
	return do_renameat2(olddfd, oldname, newdfd, newname, 0);
M
Miklos Szeredi 已提交
4496 4497
}

4498
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
4499
{
4500
	return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
4501 4502
}

A
Al Viro 已提交
4503
int readlink_copy(char __user *buffer, int buflen, const char *link)
L
Linus Torvalds 已提交
4504
{
A
Al Viro 已提交
4505
	int len = PTR_ERR(link);
L
Linus Torvalds 已提交
4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517
	if (IS_ERR(link))
		goto out;

	len = strlen(link);
	if (len > (unsigned) buflen)
		len = buflen;
	if (copy_to_user(buffer, link, len))
		len = -EFAULT;
out:
	return len;
}

4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530
/**
 * vfs_readlink - copy symlink body into userspace buffer
 * @dentry: dentry on which to get symbolic link
 * @buffer: user memory pointer
 * @buflen: size of buffer
 *
 * Does not touch atime.  That's up to the caller if necessary
 *
 * Does not call security hook.
 */
int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	struct inode *inode = d_inode(dentry);
4531 4532 4533
	DEFINE_DELAYED_CALL(done);
	const char *link;
	int res;
4534

4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545
	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
		if (unlikely(inode->i_op->readlink))
			return inode->i_op->readlink(dentry, buffer, buflen);

		if (!d_is_symlink(dentry))
			return -EINVAL;

		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_DEFAULT_READLINK;
		spin_unlock(&inode->i_lock);
	}
4546

4547
	link = READ_ONCE(inode->i_link);
4548 4549 4550 4551 4552 4553 4554 4555
	if (!link) {
		link = inode->i_op->get_link(dentry, inode, &done);
		if (IS_ERR(link))
			return PTR_ERR(link);
	}
	res = readlink_copy(buffer, buflen, link);
	do_delayed_call(&done);
	return res;
4556 4557
}
EXPORT_SYMBOL(vfs_readlink);
L
Linus Torvalds 已提交
4558

M
Miklos Szeredi 已提交
4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583
/**
 * vfs_get_link - get symlink body
 * @dentry: dentry on which to get symbolic link
 * @done: caller needs to free returned data with this
 *
 * Runs the security hook and then i_op->get_link() on the inode behind
 * @dentry.
 *
 * atime is left alone; updating it is the caller's business if needed.
 *
 * Not usable for "special" symlinks such as /proc/$$/fd/N.
 *
 * Return: ERR_PTR(-EINVAL) if @dentry is not a symlink, the error from
 * security_inode_readlink() as an ERR_PTR() if the hook refuses, otherwise
 * whatever ->get_link() returns.
 */
const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
{
	struct inode *inode = d_inode(dentry);
	const char *denied;

	if (!d_is_symlink(dentry))
		return ERR_PTR(-EINVAL);

	/* A zero return from the hook becomes a NULL pointer here. */
	denied = ERR_PTR(security_inode_readlink(dentry));
	if (denied)
		return denied;

	return inode->i_op->get_link(dentry, inode, done);
}
EXPORT_SYMBOL(vfs_get_link);

L
Linus Torvalds 已提交
4584
/* get the link contents into pagecache */
4585
const char *page_get_link(struct dentry *dentry, struct inode *inode,
4586
			  struct delayed_call *callback)
L
Linus Torvalds 已提交
4587
{
4588 4589
	char *kaddr;
	struct page *page;
4590 4591
	struct address_space *mapping = inode->i_mapping;

4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604
	if (!dentry) {
		page = find_get_page(mapping, 0);
		if (!page)
			return ERR_PTR(-ECHILD);
		if (!PageUptodate(page)) {
			put_page(page);
			return ERR_PTR(-ECHILD);
		}
	} else {
		page = read_mapping_page(mapping, 0, NULL);
		if (IS_ERR(page))
			return (char*)page;
	}
4605
	set_delayed_call(callback, page_put_link, page);
4606 4607
	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
	kaddr = page_address(page);
4608
	nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
4609
	return kaddr;
L
Linus Torvalds 已提交
4610 4611
}

4612
EXPORT_SYMBOL(page_get_link);
L
Linus Torvalds 已提交
4613

/*
 * Delayed-call callback installed by page_get_link(): drops the page
 * reference passed in @arg.
 */
void page_put_link(void *arg)
{
	put_page(arg);
}
EXPORT_SYMBOL(page_put_link);

4620 4621
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
4622
	DEFINE_DELAYED_CALL(done);
4623 4624
	int res = readlink_copy(buffer, buflen,
				page_get_link(dentry, d_inode(dentry),
4625 4626
					      &done));
	do_delayed_call(&done);
4627 4628 4629 4630
	return res;
}
EXPORT_SYMBOL(page_readlink);

4631 4632 4633 4634
/*
 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
 */
int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
L
Linus Torvalds 已提交
4635 4636
{
	struct address_space *mapping = inode->i_mapping;
4637
	struct page *page;
4638
	void *fsdata;
4639
	int err;
4640
	unsigned int flags = 0;
4641 4642
	if (nofs)
		flags |= AOP_FLAG_NOFS;
L
Linus Torvalds 已提交
4643

4644
retry:
4645
	err = pagecache_write_begin(NULL, mapping, 0, len-1,
4646
				flags, &page, &fsdata);
L
Linus Torvalds 已提交
4647
	if (err)
4648 4649
		goto fail;

4650
	memcpy(page_address(page), symname, len-1);
4651 4652 4653

	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
							page, fsdata);
L
Linus Torvalds 已提交
4654 4655
	if (err < 0)
		goto fail;
4656 4657 4658
	if (err < len-1)
		goto retry;

L
Linus Torvalds 已提交
4659 4660 4661 4662 4663
	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}
4664
EXPORT_SYMBOL(__page_symlink);
L
Linus Torvalds 已提交
4665

4666 4667 4668
int page_symlink(struct inode *inode, const char *symname, int len)
{
	return __page_symlink(inode, symname, len,
4669
			!mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
4670
}
4671
EXPORT_SYMBOL(page_symlink);
4672

4673
const struct inode_operations page_symlink_inode_operations = {
4674
	.get_link	= page_get_link,
L
Linus Torvalds 已提交
4675 4676
};
EXPORT_SYMBOL(page_symlink_inode_operations);