namei.c 130.9 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
L
Linus Torvalds 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
19
#include <linux/export.h>
20
#include <linux/kernel.h>
L
Linus Torvalds 已提交
21 22 23 24
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
R
Robert Love 已提交
25
#include <linux/fsnotify.h>
L
Linus Torvalds 已提交
26 27
#include <linux/personality.h>
#include <linux/security.h>
M
Mimi Zohar 已提交
28
#include <linux/ima.h>
L
Linus Torvalds 已提交
29 30 31
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
32
#include <linux/capability.h>
33
#include <linux/file.h>
34
#include <linux/fcntl.h>
35
#include <linux/device_cgroup.h>
36
#include <linux/fs_struct.h>
37
#include <linux/posix_acl.h>
38
#include <linux/hash.h>
39
#include <linux/bitops.h>
40
#include <linux/init_task.h>
41
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
42

43
#include "internal.h"
44
#include "mount.h"
45

L
Linus Torvalds 已提交
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
L
Lucas De Marchi 已提交
80
 * the name is a symlink pointing to a non-existent name.
L
Linus Torvalds 已提交
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
113
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
L
Linus Torvalds 已提交
114 115 116 117 118 119 120 121 122 123
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
124

A
Al Viro 已提交
125
/*
 * Bytes available for the pathname when struct filename is embedded at the
 * start of a PATH_MAX-sized names_cache buffer: everything from ->iname on.
 */
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
126

127
struct filename *
128 129
getname_flags(const char __user *filename, int flags, int *empty)
{
A
Al Viro 已提交
130
	struct filename *result;
131
	char *kname;
A
Al Viro 已提交
132
	int len;
133

134 135 136 137
	result = audit_reusename(filename);
	if (result)
		return result;

138
	result = __getname();
139
	if (unlikely(!result))
140 141
		return ERR_PTR(-ENOMEM);

142 143 144 145
	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
A
Al Viro 已提交
146
	kname = (char *)result->iname;
147
	result->name = kname;
148

A
Al Viro 已提交
149
	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
150
	if (unlikely(len < 0)) {
A
Al Viro 已提交
151 152
		__putname(result);
		return ERR_PTR(len);
153
	}
154

155 156 157 158 159 160
	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
A
Al Viro 已提交
161
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
A
Al Viro 已提交
162
		const size_t size = offsetof(struct filename, iname[1]);
163 164
		kname = (char *)result;

A
Al Viro 已提交
165 166 167 168 169 170
		/*
		 * size is chosen that way we to guarantee that
		 * result->iname[0] is within the same object and that
		 * kname can't be equal to result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
A
Al Viro 已提交
171 172 173
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
174 175
		}
		result->name = kname;
A
Al Viro 已提交
176 177 178 179 180 181 182 183 184 185 186
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
187 188
	}

A
Al Viro 已提交
189
	result->refcnt = 1;
190 191 192
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
193
			*empty = 1;
A
Al Viro 已提交
194 195 196 197
		if (!(flags & LOOKUP_EMPTY)) {
			putname(result);
			return ERR_PTR(-ENOENT);
		}
L
Linus Torvalds 已提交
198
	}
199

200
	result->uptr = filename;
201
	result->aname = NULL;
202 203
	audit_getname(result);
	return result;
L
Linus Torvalds 已提交
204 205
}

206 207
/*
 * getname - getname_flags() with no lookup flags and no empty-name report.
 * An empty pathname therefore yields ERR_PTR(-ENOENT).
 */
struct filename *
getname(const char __user *filename)
{
	return getname_flags(filename, 0, NULL);
}

212 213 214 215
struct filename *
getname_kernel(const char * filename)
{
	struct filename *result;
216
	int len = strlen(filename) + 1;
217 218 219 220 221

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

222
	if (len <= EMBEDDED_NAME_MAX) {
A
Al Viro 已提交
223
		result->name = (char *)result->iname;
224
	} else if (len <= PATH_MAX) {
225
		const size_t size = offsetof(struct filename, iname[1]);
226 227
		struct filename *tmp;

228
		tmp = kmalloc(size, GFP_KERNEL);
229 230 231 232 233 234 235 236 237 238 239
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
240 241
	result->uptr = NULL;
	result->aname = NULL;
242
	result->refcnt = 1;
243
	audit_getname(result);
244 245 246 247

	return result;
}

248
void putname(struct filename *name)
L
Linus Torvalds 已提交
249
{
250 251 252 253 254
	BUG_ON(name->refcnt <= 0);

	if (--name->refcnt > 0)
		return;

A
Al Viro 已提交
255
	if (name->name != name->iname) {
256 257 258 259
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
L
Linus Torvalds 已提交
260 261
}

262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
/**
 * check_acl - perform ACL permission checking
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the ACL permission checking. Since this function
 * retrieve POSIX acls it needs to know whether it is called from a blocking or
 * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
 */
static int check_acl(struct user_namespace *mnt_userns,
		     struct inode *inode, int mask)
280
{
281
#ifdef CONFIG_FS_POSIX_ACL
282 283 284
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
285 286
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
287
	                return -EAGAIN;
288
		/* no ->get_acl() calls in RCU mode... */
289
		if (is_uncached_acl(acl))
290
			return -ECHILD;
291
	        return posix_acl_permission(mnt_userns, inode, acl, mask);
292 293
	}

C
Christoph Hellwig 已提交
294 295 296
	acl = get_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
297
	if (acl) {
298
	        int error = posix_acl_permission(mnt_userns, inode, acl, mask);
299 300 301
	        posix_acl_release(acl);
	        return error;
	}
302
#endif
303 304 305 306

	return -EAGAIN;
}

307 308 309 310 311 312 313 314 315
/**
 * acl_permission_check - perform basic UNIX permission checking
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the basic UNIX permission checking. Since this
 * function may retrieve POSIX acls it needs to know whether it is called from a
 * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
316
 *
317 318 319 320 321
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
L
Linus Torvalds 已提交
322
 */
323 324
static int acl_permission_check(struct user_namespace *mnt_userns,
				struct inode *inode, int mask)
L
Linus Torvalds 已提交
325
{
326
	unsigned int mode = inode->i_mode;
327
	kuid_t i_uid;
L
Linus Torvalds 已提交
328

329
	/* Are we the owner? If so, ACL's don't matter */
330 331
	i_uid = i_uid_into_mnt(mnt_userns, inode);
	if (likely(uid_eq(current_fsuid(), i_uid))) {
332
		mask &= 7;
L
Linus Torvalds 已提交
333
		mode >>= 6;
334 335
		return (mask & ~mode) ? -EACCES : 0;
	}
L
Linus Torvalds 已提交
336

337 338
	/* Do we have ACL's? */
	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
339
		int error = check_acl(mnt_userns, inode, mask);
340 341
		if (error != -EAGAIN)
			return error;
L
Linus Torvalds 已提交
342 343
	}

344 345 346
	/* Only RWX matters for group/other mode bits */
	mask &= 7;

L
Linus Torvalds 已提交
347
	/*
348 349 350
	 * Are the group permissions different from
	 * the other permissions in the bits we care
	 * about? Need to check group ownership if so.
L
Linus Torvalds 已提交
351
	 */
352
	if (mask & (mode ^ (mode >> 3))) {
353 354
		kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
		if (in_group_p(kgid))
355 356 357 358 359
			mode >>= 3;
	}

	/* Bits in 'mode' clear that we require? */
	return (mask & ~mode) ? -EACCES : 0;
360 361 362
}

/**
363
 * generic_permission -  check for access rights on a Posix-like filesystem
364
 * @mnt_userns:	user namespace of the mount the inode was found from
365
 * @inode:	inode to check access rights for
366 367
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 *		%MAY_NOT_BLOCK ...)
368 369 370 371
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
372 373 374 375 376
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
377 378 379 380 381 382
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
383
 */
384 385
int generic_permission(struct user_namespace *mnt_userns, struct inode *inode,
		       int mask)
386 387 388 389
{
	int ret;

	/*
390
	 * Do the basic permission checks.
391
	 */
392
	ret = acl_permission_check(mnt_userns, inode, mask);
393 394
	if (ret != -EACCES)
		return ret;
L
Linus Torvalds 已提交
395

396 397 398
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (!(mask & MAY_WRITE))
399
			if (capable_wrt_inode_uidgid(mnt_userns, inode,
400
						     CAP_DAC_READ_SEARCH))
401
				return 0;
402
		if (capable_wrt_inode_uidgid(mnt_userns, inode,
403
					     CAP_DAC_OVERRIDE))
L
Linus Torvalds 已提交
404
			return 0;
405 406
		return -EACCES;
	}
L
Linus Torvalds 已提交
407 408 409 410

	/*
	 * Searching includes executable on directories, else just read.
	 */
411
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
412
	if (mask == MAY_READ)
413
		if (capable_wrt_inode_uidgid(mnt_userns, inode,
414
					     CAP_DAC_READ_SEARCH))
L
Linus Torvalds 已提交
415
			return 0;
416 417 418 419 420 421
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
422
		if (capable_wrt_inode_uidgid(mnt_userns, inode,
423
					     CAP_DAC_OVERRIDE))
424
			return 0;
L
Linus Torvalds 已提交
425 426 427

	return -EACCES;
}
428
EXPORT_SYMBOL(generic_permission);
L
Linus Torvalds 已提交
429

430 431 432 433 434 435
/**
 * do_inode_permission - UNIX permission checking
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
436 437 438 439 440
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has not special
 * permission function, use the fast case".
 */
441 442
static inline int do_inode_permission(struct user_namespace *mnt_userns,
				      struct inode *inode, int mask)
443 444 445
{
	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
		if (likely(inode->i_op->permission))
446
			return inode->i_op->permission(mnt_userns, inode, mask);
447 448 449 450 451 452

		/* This gets set once for the inode lifetime */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_FASTPERM;
		spin_unlock(&inode->i_lock);
	}
453
	return generic_permission(mnt_userns, inode, mask);
454 455
}

D
David Howells 已提交
456 457 458
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
459
 * @inode: Inode to check permission on
D
David Howells 已提交
460 461 462 463 464 465 466 467 468 469
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	if (unlikely(mask & MAY_WRITE)) {
		umode_t mode = inode->i_mode;

		/* Nobody gets write access to a read-only fs. */
470
		if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
D
David Howells 已提交
471 472 473 474 475 476 477
			return -EROFS;
	}
	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
478 479 480
 * @mnt_userns:	User namespace of the mount the inode was found from
 * @inode:	Inode to check permission on
 * @mask:	Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
D
David Howells 已提交
481 482 483 484 485 486 487
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
488 489
int inode_permission(struct user_namespace *mnt_userns,
		     struct inode *inode, int mask)
D
David Howells 已提交
490 491 492 493 494 495
{
	int retval;

	retval = sb_permission(inode->i_sb, inode, mask);
	if (retval)
		return retval;
496 497 498 499 500 501 502 503 504 505 506 507 508

	if (unlikely(mask & MAY_WRITE)) {
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EPERM;

		/*
		 * Updating mtime will likely cause i_uid and i_gid to be
		 * written back improperly if their true value is unknown
		 * to the vfs.
		 */
509
		if (HAS_UNMAPPED_ID(mnt_userns, inode))
510 511 512
			return -EACCES;
	}

513
	retval = do_inode_permission(mnt_userns, inode, mask);
514 515 516 517 518 519 520 521
	if (retval)
		return retval;

	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

	return security_inode_permission(inode, mask);
D
David Howells 已提交
522
}
523
EXPORT_SYMBOL(inode_permission);
D
David Howells 已提交
524

J
Jan Blunck 已提交
525 526 527 528 529 530
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
531
void path_get(const struct path *path)
J
Jan Blunck 已提交
532 533 534 535 536 537
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

J
Jan Blunck 已提交
538 539 540 541 542 543
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
544
void path_put(const struct path *path)
L
Linus Torvalds 已提交
545
{
J
Jan Blunck 已提交
546 547
	dput(path->dentry);
	mntput(path->mnt);
L
Linus Torvalds 已提交
548
}
J
Jan Blunck 已提交
549
EXPORT_SYMBOL(path_put);
L
Linus Torvalds 已提交
550

551
#define EMBEDDED_LEVELS 2
552 553
struct nameidata {
	struct path	path;
A
Al Viro 已提交
554
	struct qstr	last;
555 556
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
557
	unsigned int	flags, state;
558
	unsigned	seq, m_seq, r_seq;
559 560
	int		last_type;
	unsigned	depth;
561
	int		total_link_count;
562 563
	struct saved {
		struct path link;
564
		struct delayed_call done;
565
		const char *name;
566
		unsigned seq;
567
	} *stack, internal[EMBEDDED_LEVELS];
568 569 570 571
	struct filename	*name;
	struct nameidata *saved;
	unsigned	root_seq;
	int		dfd;
572 573
	kuid_t		dir_uid;
	umode_t		dir_mode;
574
} __randomize_layout;
575

576 577 578 579
/*
 * Bits for nameidata->state.
 *
 * ND_ROOT_PRESET:  nd->root was supplied by the caller of set_nameidata().
 * ND_ROOT_GRABBED: the walk marked nd->root as referenced (set_root(),
 *                  legitimize_root()); terminate_walk() puts it and clears
 *                  the bit.
 * ND_JUMPED:       set by nd_jump_root(); complete_walk() only considers
 *                  ->d_weak_revalidate() when it is set.
 */
#define ND_ROOT_PRESET 1
#define ND_ROOT_GRABBED 2
#define ND_JUMPED 4

580
static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
581
{
582 583
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;
584
	p->depth = 0;
585 586
	p->dfd = dfd;
	p->name = name;
587 588
	p->path.mnt = NULL;
	p->path.dentry = NULL;
589
	p->total_link_count = old ? old->total_link_count : 0;
590
	p->saved = old;
591
	current->nameidata = p;
592 593
}

594 595 596 597 598 599 600 601 602 603 604
/*
 * Like __set_nameidata(), and also initialise ->state.  A non-NULL @root is
 * copied into ->root and flagged as ND_ROOT_PRESET.
 */
static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name,
			  const struct path *root)
{
	__set_nameidata(p, dfd, name);

	if (likely(!root)) {
		p->state = 0;
	} else {
		p->state = ND_ROOT_PRESET;
		p->root = *root;
	}
}

605
static void restore_nameidata(void)
606
{
607
	struct nameidata *now = current->nameidata, *old = now->saved;
608 609 610 611

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
612
	if (now->stack != now->internal)
613
		kfree(now->stack);
614 615
}

616
static bool nd_alloc_stack(struct nameidata *nd)
617
{
A
Al Viro 已提交
618 619
	struct saved *p;

620 621 622 623
	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
			 nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
	if (unlikely(!p))
		return false;
624 625
	memcpy(p, nd->internal, sizeof(nd->internal));
	nd->stack = p;
626
	return true;
627 628
}

629
/**
630
 * path_connected - Verify that a dentry is below mnt.mnt_root
631 632 633 634
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 */
635
static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
636
{
637
	struct super_block *sb = mnt->mnt_sb;
638

639 640
	/* Bind mounts can have disconnected paths */
	if (mnt->mnt_root == sb->s_root)
641 642
		return true;

643
	return is_subdir(dentry, mnt->mnt_root);
644 645
}

646 647 648 649 650
/*
 * Run and then clear the delayed call of every entry on the link stack,
 * from the most recently pushed entry down to the first.
 */
static void drop_links(struct nameidata *nd)
{
	int remaining = nd->depth;

	while (remaining != 0) {
		struct saved *entry = &nd->stack[--remaining];

		do_delayed_call(&entry->done);
		clear_delayed_call(&entry->done);
	}
}

static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);
	if (!(nd->flags & LOOKUP_RCU)) {
		int i;
		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
664
		if (nd->state & ND_ROOT_GRABBED) {
665
			path_put(&nd->root);
666
			nd->state &= ~ND_ROOT_GRABBED;
667
		}
668 669 670 671 672
	} else {
		nd->flags &= ~LOOKUP_RCU;
		rcu_read_unlock();
	}
	nd->depth = 0;
673 674
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
675 676 677
}

/* path_put is needed afterwards regardless of success or failure */
678
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
679
{
680
	int res = __legitimize_mnt(path->mnt, mseq);
681 682 683 684 685 686 687 688 689 690 691 692 693
	if (unlikely(res)) {
		if (res > 0)
			path->mnt = NULL;
		path->dentry = NULL;
		return false;
	}
	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		path->dentry = NULL;
		return false;
	}
	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

694 695 696
static inline bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
A
Al Viro 已提交
697
	return __legitimize_path(path, seq, nd->m_seq);
698 699
}

700 701 702
static bool legitimize_links(struct nameidata *nd)
{
	int i;
703 704 705 706 707
	if (unlikely(nd->flags & LOOKUP_CACHED)) {
		drop_links(nd);
		nd->depth = 0;
		return false;
	}
708 709 710 711 712 713 714 715 716 717 718
	for (i = 0; i < nd->depth; i++) {
		struct saved *last = nd->stack + i;
		if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
			drop_links(nd);
			nd->depth = i + 1;
			return false;
		}
	}
	return true;
}

719 720
static bool legitimize_root(struct nameidata *nd)
{
721 722 723 724 725 726 727 728
	/*
	 * For scoped-lookups (where nd->root has been zeroed), we need to
	 * restart the whole lookup from scratch -- because set_root() is wrong
	 * for these lookups (nd->dfd is the root, not the filesystem root).
	 */
	if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED))
		return false;
	/* Nothing to do if nd->root is zero or is managed by the VFS user. */
729
	if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
730
		return true;
731
	nd->state |= ND_ROOT_GRABBED;
732 733 734
	return legitimize_path(nd, &nd->root, nd->root_seq);
}

A
Al Viro 已提交
735
/*
N
Nick Piggin 已提交
736
 * Path walking has 2 modes, rcu-walk and ref-walk (see
A
Al Viro 已提交
737 738
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
M
Mike Marshall 已提交
739
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
A
Al Viro 已提交
740 741 742 743
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
N
Nick Piggin 已提交
744 745 746
 */

/**
747
 * try_to_unlazy - try to switch to ref-walk mode.
A
Al Viro 已提交
748
 * @nd: nameidata pathwalk data
749
 * Returns: true on success, false on failure
N
Nick Piggin 已提交
750
 *
751
 * try_to_unlazy attempts to legitimize the current nd->path and nd->root
A
Al Viro 已提交
752 753
 * for ref-walk mode.
 * Must be called from rcu-walk context.
754
 * Nothing should touch nameidata between try_to_unlazy() failure and
755
 * terminate_walk().
N
Nick Piggin 已提交
756
 */
757
static bool try_to_unlazy(struct nameidata *nd)
N
Nick Piggin 已提交
758 759 760 761
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
762

A
Al Viro 已提交
763 764 765
	nd->flags &= ~LOOKUP_RCU;
	if (unlikely(!legitimize_links(nd)))
		goto out1;
766 767
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out;
768 769
	if (unlikely(!legitimize_root(nd)))
		goto out;
A
Al Viro 已提交
770 771
	rcu_read_unlock();
	BUG_ON(nd->inode != parent->d_inode);
772
	return true;
A
Al Viro 已提交
773

774
out1:
A
Al Viro 已提交
775 776 777 778
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out:
	rcu_read_unlock();
779
	return false;
A
Al Viro 已提交
780 781 782
}

/**
783
 * try_to_unlazy_next - try to switch to ref-walk mode.
A
Al Viro 已提交
784
 * @nd: nameidata pathwalk data
785 786 787
 * @dentry: next dentry to step into
 * @seq: seq number to check @dentry against
 * Returns: true on success, false on failure
A
Al Viro 已提交
788
 *
789 790 791 792
 * Similar to to try_to_unlazy(), but here we have the next dentry already
 * picked by rcu-walk and want to legitimize that in addition to the current
 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
A
Al Viro 已提交
793 794
 * terminate_walk().
 */
795
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
A
Al Viro 已提交
796 797 798
{
	BUG_ON(!(nd->flags & LOOKUP_RCU));

799
	nd->flags &= ~LOOKUP_RCU;
800 801 802 803
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
		goto out2;
A
Al Viro 已提交
804
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
805
		goto out1;
A
Al Viro 已提交
806

807
	/*
A
Al Viro 已提交
808 809 810 811 812
	 * We need to move both the parent and the dentry from the RCU domain
	 * to be properly refcounted. And the sequence number in the dentry
	 * validates *both* dentry counters, since we checked the sequence
	 * number of the parent after we got the child sequence number. So we
	 * know the parent must still be valid if the child sequence number is
813
	 */
A
Al Viro 已提交
814 815
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
816 817
	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
		goto out_dput;
818 819 820 821
	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
822 823
	if (unlikely(!legitimize_root(nd)))
		goto out_dput;
A
Al Viro 已提交
824
	rcu_read_unlock();
825
	return true;
A
Al Viro 已提交
826

827 828 829 830
out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
831
out:
A
Al Viro 已提交
832
	rcu_read_unlock();
833
	return false;
834 835 836
out_dput:
	rcu_read_unlock();
	dput(dentry);
837
	return false;
N
Nick Piggin 已提交
838 839
}

840
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
841
{
842 843 844 845
	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
		return dentry->d_op->d_revalidate(dentry, flags);
	else
		return 1;
846 847
}

848 849 850
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
851
 *
852 853 854 855 856
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
857
 */
858
static int complete_walk(struct nameidata *nd)
859
{
A
Al Viro 已提交
860
	struct dentry *dentry = nd->path.dentry;
861 862
	int status;

863
	if (nd->flags & LOOKUP_RCU) {
864 865 866 867
		/*
		 * We don't want to zero nd->root for scoped-lookups or
		 * externally-managed nd->root.
		 */
868 869 870
		if (!(nd->state & ND_ROOT_PRESET))
			if (!(nd->flags & LOOKUP_IS_SCOPED))
				nd->root.mnt = NULL;
J
Jens Axboe 已提交
871
		nd->flags &= ~LOOKUP_CACHED;
872
		if (!try_to_unlazy(nd))
873 874 875
			return -ECHILD;
	}

876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896
	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
		/*
		 * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
		 * ever step outside the root during lookup" and should already
		 * be guaranteed by the rest of namei, we want to avoid a namei
		 * BUG resulting in userspace being given a path that was not
		 * scoped within the root at some point during the lookup.
		 *
		 * So, do a final sanity-check to make sure that in the
		 * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
		 * we won't silently return an fd completely outside of the
		 * requested root to userspace.
		 *
		 * Userspace could move the path outside the root after this
		 * check, but as discussed elsewhere this is not a concern (the
		 * resolved file was inside the root at some point).
		 */
		if (!path_is_under(&nd->path, &nd->root))
			return -EXDEV;
	}

897
	if (likely(!(nd->state & ND_JUMPED)))
A
Al Viro 已提交
898 899
		return 0;

900
	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
901 902
		return 0;

903
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
904 905 906
	if (status > 0)
		return 0;

A
Al Viro 已提交
907
	if (!status)
908
		status = -ESTALE;
A
Al Viro 已提交
909

910 911 912
	return status;
}

913
static int set_root(struct nameidata *nd)
N
Nick Piggin 已提交
914
{
915
	struct fs_struct *fs = current->fs;
N
Nick Piggin 已提交
916

917 918 919 920 921 922 923 924
	/*
	 * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
	 * still have to ensure it doesn't happen because it will cause a breakout
	 * from the dirfd.
	 */
	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
		return -ENOTRECOVERABLE;

925 926 927 928 929 930 931 932 933 934
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
935
		nd->state |= ND_ROOT_GRABBED;
936
	}
937
	return 0;
N
Nick Piggin 已提交
938 939
}

940 941
static int nd_jump_root(struct nameidata *nd)
{
942 943
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return -EXDEV;
944 945 946 947 948
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		/* Absolute path arguments to path_init() are allowed. */
		if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
			return -EXDEV;
	}
949 950 951 952 953
	if (!nd->root.mnt) {
		int error = set_root(nd);
		if (error)
			return error;
	}
954 955 956 957 958 959 960 961 962 963 964 965 966 967
	if (nd->flags & LOOKUP_RCU) {
		struct dentry *d;
		nd->path = nd->root;
		d = nd->path.dentry;
		nd->inode = d->d_inode;
		nd->seq = nd->root_seq;
		if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
			return -ECHILD;
	} else {
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	}
968
	nd->state |= ND_JUMPED;
969 970 971
	return 0;
}

C
Christoph Hellwig 已提交
972
/*
973
 * Helper to directly jump to a known parsed path from ->get_link,
C
Christoph Hellwig 已提交
974 975
 * caller must have taken a reference to path beforehand.
 */
976
int nd_jump_link(struct path *path)
C
Christoph Hellwig 已提交
977
{
978
	int error = -ELOOP;
979
	struct nameidata *nd = current->nameidata;
C
Christoph Hellwig 已提交
980

981 982 983
	if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
		goto err;

984 985 986 987 988
	error = -EXDEV;
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		if (nd->path.mnt != path->mnt)
			goto err;
	}
989 990 991
	/* Not currently safe for scoped-lookups. */
	if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
		goto err;
992

993
	path_put(&nd->path);
C
Christoph Hellwig 已提交
994 995
	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
996
	nd->state |= ND_JUMPED;
997
	return 0;
998 999 1000 1001

err:
	path_put(path);
	return error;
C
Christoph Hellwig 已提交
1002 1003
}

1004
static inline void put_link(struct nameidata *nd)
1005
{
A
Al Viro 已提交
1006
	struct saved *last = nd->stack + --nd->depth;
1007
	do_delayed_call(&last->done);
A
Al Viro 已提交
1008 1009
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
1010 1011
}

1012 1013
/*
 * Link/creation hardening switches.  In this file they are read by
 * may_follow_link() (symlinks), may_linkat() (hardlinks) and
 * may_create_in_sticky() (fifos, regular); zero disables the check.
 */
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
int sysctl_protected_fifos __read_mostly;
int sysctl_protected_regular __read_mostly;
K
Kees Cook 已提交
1016 1017 1018

/**
 * may_follow_link - Check symlink following for unsafe situations
1019
 * @nd: nameidata pathwalk data
K
Kees Cook 已提交
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
1032
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
K
Kees Cook 已提交
1033
{
1034 1035 1036
	struct user_namespace *mnt_userns;
	kuid_t i_uid;

K
Kees Cook 已提交
1037 1038 1039
	if (!sysctl_protected_symlinks)
		return 0;

1040 1041
	mnt_userns = mnt_user_ns(nd->path.mnt);
	i_uid = i_uid_into_mnt(mnt_userns, inode);
K
Kees Cook 已提交
1042
	/* Allowed if owner and follower match. */
1043
	if (uid_eq(current_cred()->fsuid, i_uid))
K
Kees Cook 已提交
1044 1045 1046
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
1047
	if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
K
Kees Cook 已提交
1048 1049 1050
		return 0;

	/* Allowed if parent directory and link owner match. */
1051
	if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
K
Kees Cook 已提交
1052 1053
		return 0;

1054 1055 1056
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

1057
	audit_inode(nd->name, nd->stack[0].link.dentry, 0);
K
Kees Cook 已提交
1058
	audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
K
Kees Cook 已提交
1059 1060 1061 1062 1063
	return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
1064
 * @mnt_userns:	user namespace of the mount the inode was found from
K
Kees Cook 已提交
1065 1066 1067 1068 1069 1070 1071 1072 1073 1074
 * @inode: the source inode to hardlink from
 *
 * Return false if at least one of the following conditions:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
1075 1076
static bool safe_hardlink_source(struct user_namespace *mnt_userns,
				 struct inode *inode)
K
Kees Cook 已提交
1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092
{
	umode_t mode = inode->i_mode;

	/* Special files should not get pinned to the filesystem. */
	if (!S_ISREG(mode))
		return false;

	/* Setuid files should not get pinned to the filesystem. */
	if (mode & S_ISUID)
		return false;

	/* Executable setgid files should not get pinned to the filesystem. */
	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
		return false;

	/* Hardlinking to unreadable or unwritable sources is dangerous. */
1093
	if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE))
K
Kees Cook 已提交
1094 1095 1096 1097 1098 1099 1100
		return false;

	return true;
}

/**
 * may_linkat - Check permissions for creating a hardlink
1101
 * @mnt_userns:	user namespace of the mount the inode was found from
K
Kees Cook 已提交
1102 1103 1104 1105 1106 1107
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
1108
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
K
Kees Cook 已提交
1109
 *
1110 1111 1112 1113 1114 1115
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
 *
K
Kees Cook 已提交
1116 1117
 * Returns 0 if successful, -ve on error.
 */
1118
int may_linkat(struct user_namespace *mnt_userns, struct path *link)
K
Kees Cook 已提交
1119
{
1120 1121 1122
	struct inode *inode = link->dentry->d_inode;

	/* Inode writeback is not safe when the uid or gid are invalid. */
1123 1124
	if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
1125
		return -EOVERFLOW;
K
Kees Cook 已提交
1126 1127 1128 1129 1130 1131 1132

	if (!sysctl_protected_hardlinks)
		return 0;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
1133 1134
	if (safe_hardlink_source(mnt_userns, inode) ||
	    inode_owner_or_capable(mnt_userns, inode))
K
Kees Cook 已提交
1135 1136
		return 0;

K
Kees Cook 已提交
1137
	audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
K
Kees Cook 已提交
1138 1139 1140
	return -EPERM;
}

1141 1142 1143 1144
/**
 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
 *			  should be allowed, or not, on files that already
 *			  exist.
1145
 * @mnt_userns:	user namespace of the mount the inode was found from
1146
 * @nd: nameidata pathwalk data
1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159
 * @inode: the inode of the file to open
 *
 * Block an O_CREAT open of a FIFO (or a regular file) when:
 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
 *   - the file already exists
 *   - we are in a sticky directory
 *   - we don't own the file
 *   - the owner of the directory doesn't own the file
 *   - the directory is world writable
 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
 * the directory doesn't have to be world writable: being group writable will
 * be enough.
 *
1160 1161 1162 1163 1164 1165
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
 *
1166 1167
 * Returns 0 if the open is allowed, -ve on error.
 */
1168 1169
static int may_create_in_sticky(struct user_namespace *mnt_userns,
				struct nameidata *nd, struct inode *const inode)
1170
{
1171 1172 1173
	umode_t dir_mode = nd->dir_mode;
	kuid_t dir_uid = nd->dir_uid;

1174 1175
	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
	    (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
1176
	    likely(!(dir_mode & S_ISVTX)) ||
1177 1178
	    uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
	    uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
1179 1180
		return 0;

1181 1182
	if (likely(dir_mode & 0002) ||
	    (dir_mode & 0020 &&
1183 1184
	     ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
	      (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
K
Kees Cook 已提交
1185 1186 1187 1188
		const char *operation = S_ISFIFO(inode->i_mode) ?
					"sticky_create_fifo" :
					"sticky_create_regular";
		audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
1189 1190 1191 1192 1193
		return -EACCES;
	}
	return 0;
}

1194 1195 1196 1197 1198 1199 1200 1201 1202 1203
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
A
Al Viro 已提交
1204
int follow_up(struct path *path)
L
Linus Torvalds 已提交
1205
{
1206 1207
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
L
Linus Torvalds 已提交
1208
	struct dentry *mountpoint;
N
Nick Piggin 已提交
1209

A
Al Viro 已提交
1210
	read_seqlock_excl(&mount_lock);
1211
	parent = mnt->mnt_parent;
A
Al Viro 已提交
1212
	if (parent == mnt) {
A
Al Viro 已提交
1213
		read_sequnlock_excl(&mount_lock);
L
Linus Torvalds 已提交
1214 1215
		return 0;
	}
1216
	mntget(&parent->mnt);
1217
	mountpoint = dget(mnt->mnt_mountpoint);
A
Al Viro 已提交
1218
	read_sequnlock_excl(&mount_lock);
A
Al Viro 已提交
1219 1220 1221
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
1222
	path->mnt = &parent->mnt;
L
Linus Torvalds 已提交
1223 1224
	return 1;
}
1225
EXPORT_SYMBOL(follow_up);
L
Linus Torvalds 已提交
1226

A
Al Viro 已提交
1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246
/*
 * Walk up the mount tree from @m, one parent at a time, looking for the
 * mountpoint to step back onto.
 *
 * The walk stops (returning false) when @m has no parent or when the
 * mountpoint in the parent is exactly @root.  A parent whose mountpoint
 * dentry is that parent's own root is skipped and the walk continues upward.
 * Otherwise @path is filled with the parent mount and the mountpoint dentry,
 * a sample of that dentry's d_seq is stored in *@seqp, and true is returned.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
				  struct path *path, unsigned *seqp)
{
	while (mnt_has_parent(m)) {
		struct dentry *mountpoint = m->mnt_mountpoint;

		m = m->mnt_parent;
		/* Reached the root of this walk: do not go above it. */
		if (unlikely(root->dentry == mountpoint &&
			     root->mnt == &m->mnt))
			break;
		if (mountpoint != m->mnt.mnt_root) {
			path->mnt = &m->mnt;
			path->dentry = mountpoint;
			*seqp = read_seqcount_begin(&mountpoint->d_seq);
			return true;
		}
	}
	return false;
}

1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271
/*
 * Wrapper around choose_mountpoint_rcu() that runs it under
 * rcu_read_lock() and retries until the result is stable.
 *
 * Returns true with @path filled in once __legitimize_path() has succeeded
 * on it, or false if no mountpoint was found and mount_lock did not change
 * during the search.
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
			      struct path *path)
{
	bool found;

	rcu_read_lock();
	while (1) {
		unsigned seq, mseq = read_seqbegin(&mount_lock);

		found = choose_mountpoint_rcu(m, root, path, &seq);
		if (unlikely(!found)) {
			/* "Not found" is only trusted if mount_lock was stable. */
			if (!read_seqretry(&mount_lock, mseq))
				break;
		} else {
			if (likely(__legitimize_path(path, seq, mseq)))
				break;
			/*
			 * Legitimizing failed: path_put() is called outside
			 * the RCU read-side section, then the search restarts.
			 */
			rcu_read_unlock();
			path_put(path);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
	return found;
}

N
Nick Piggin 已提交
1272
/*
1273 1274 1275
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
L
Linus Torvalds 已提交
1276
 */
1277
static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
N
Nick Piggin 已提交
1278
{
1279
	struct dentry *dentry = path->dentry;
1280

1281 1282 1283 1284 1285 1286 1287 1288 1289 1290
	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
1291
	 */
1292
	if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1293
			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1294
	    dentry->d_inode)
1295
		return -EISDIR;
1296

1297
	if (count && (*count)++ >= MAXSYMLINKS)
1298 1299
		return -ELOOP;

1300
	return finish_automount(dentry->d_op->d_automount(path), path);
A
Al Viro 已提交
1301 1302
}

1303
/*
A
Al Viro 已提交
1304 1305 1306 1307
 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
 * dentries are pinned but not locked here, so negative dentry can go
 * positive right under us.  Use of smp_load_acquire() provides a barrier
 * sufficient for ->d_inode and ->d_flags consistency.
1308
 */
A
Al Viro 已提交
1309 1310
static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
			     int *count, unsigned lookup_flags)
L
Linus Torvalds 已提交
1311
{
A
Al Viro 已提交
1312
	struct vfsmount *mnt = path->mnt;
1313
	bool need_mntput = false;
1314
	int ret = 0;
1315

A
Al Viro 已提交
1316
	while (flags & DCACHE_MANAGED_DENTRY) {
1317 1318
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
1319
		if (flags & DCACHE_MANAGE_TRANSIT) {
1320
			ret = path->dentry->d_op->d_manage(path, false);
1321
			flags = smp_load_acquire(&path->dentry->d_flags);
1322
			if (ret < 0)
1323
				break;
1324 1325
		}

A
Al Viro 已提交
1326
		if (flags & DCACHE_MOUNTED) {	// something's mounted on it..
1327
			struct vfsmount *mounted = lookup_mnt(path);
A
Al Viro 已提交
1328
			if (mounted) {		// ... in our namespace
1329 1330 1331 1332 1333
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
A
Al Viro 已提交
1334 1335
				// here we know it's positive
				flags = path->dentry->d_flags;
1336 1337 1338 1339 1340
				need_mntput = true;
				continue;
			}
		}

A
Al Viro 已提交
1341 1342
		if (!(flags & DCACHE_NEED_AUTOMOUNT))
			break;
1343

A
Al Viro 已提交
1344 1345 1346 1347 1348
		// uncovered automount point
		ret = follow_automount(path, count, lookup_flags);
		flags = smp_load_acquire(&path->dentry->d_flags);
		if (ret < 0)
			break;
L
Linus Torvalds 已提交
1349
	}
1350

A
Al Viro 已提交
1351 1352 1353 1354 1355 1356
	if (ret == -EISDIR)
		ret = 0;
	// possible if you race with several mount --move
	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	if (!ret && unlikely(d_flags_negative(flags)))
1357
		ret = -ENOENT;
A
Al Viro 已提交
1358
	*jumped = need_mntput;
1359
	return ret;
L
Linus Torvalds 已提交
1360 1361
}

A
Al Viro 已提交
1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376
/*
 * Inline front end for __traverse_mounts(): samples the dentry's d_flags
 * once and only takes the out-of-line path for managed dentries.  For an
 * unmanaged dentry *@jumped is cleared and the result is -ENOENT if the
 * dentry is negative, 0 otherwise.
 */
static inline int traverse_mounts(struct path *path, bool *jumped,
				  int *count, unsigned lookup_flags)
{
	unsigned d_flags = smp_load_acquire(&path->dentry->d_flags);

	/* slow path: something manages this dentry */
	if (unlikely(d_flags & DCACHE_MANAGED_DENTRY))
		return __traverse_mounts(path, d_flags, jumped, count,
					 lookup_flags);

	*jumped = false;
	return unlikely(d_flags_negative(d_flags)) ? -ENOENT : 0;
}

1377
int follow_down_one(struct path *path)
L
Linus Torvalds 已提交
1378 1379 1380
{
	struct vfsmount *mounted;

A
Al Viro 已提交
1381
	mounted = lookup_mnt(path);
L
Linus Torvalds 已提交
1382
	if (mounted) {
A
Al Viro 已提交
1383 1384 1385 1386
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
L
Linus Torvalds 已提交
1387 1388 1389 1390
		return 1;
	}
	return 0;
}
1391
EXPORT_SYMBOL(follow_down_one);
L
Linus Torvalds 已提交
1392

A
Al Viro 已提交
1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409
/*
 * Descend from @path to the covering mount that userspace currently sees.
 * Along the way the filesystem owning each dentry may be asked whether the
 * caller is allowed to proceed.
 *
 * Returns the result of traverse_mounts(); if @path ended up on a different
 * vfsmount, the one we started on is released with mntput().
 */
int follow_down(struct path *path)
{
	struct vfsmount *start_mnt = path->mnt;
	bool crossed;
	int err;

	err = traverse_mounts(path, &crossed, NULL, 0);
	if (start_mnt != path->mnt)
		mntput(start_mnt);

	return err;
}
EXPORT_SYMBOL(follow_down);

1410
/*
1411 1412
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
1413 1414
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1415
			       struct inode **inode, unsigned *seqp)
1416
{
A
Al Viro 已提交
1417 1418 1419 1420 1421 1422 1423 1424 1425
	struct dentry *dentry = path->dentry;
	unsigned int flags = dentry->d_flags;

	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
		return true;

	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
		return false;

1426 1427 1428 1429 1430
	for (;;) {
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
A
Al Viro 已提交
1431 1432 1433 1434 1435
		if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
			int res = dentry->d_op->d_manage(path, true);
			if (res)
				return res == -EISDIR;
			flags = dentry->d_flags;
1436
		}
1437

A
Al Viro 已提交
1438 1439 1440 1441 1442
		if (flags & DCACHE_MOUNTED) {
			struct mount *mounted = __lookup_mnt(path->mnt, dentry);
			if (mounted) {
				path->mnt = &mounted->mnt;
				dentry = path->dentry = mounted->mnt.mnt_root;
1443
				nd->state |= ND_JUMPED;
A
Al Viro 已提交
1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458
				*seqp = read_seqcount_begin(&dentry->d_seq);
				*inode = dentry->d_inode;
				/*
				 * We don't need to re-check ->d_seq after this
				 * ->d_inode read - there will be an RCU delay
				 * between mount hash removal and ->mnt_root
				 * becoming unpinned.
				 */
				flags = dentry->d_flags;
				continue;
			}
			if (read_seqretry(&mount_lock, nd->m_seq))
				return false;
		}
		return !(flags & DCACHE_NEED_AUTOMOUNT);
1459
	}
1460 1461
}

1462 1463 1464
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
			  struct path *path, struct inode **inode,
			  unsigned int *seqp)
1465
{
A
Al Viro 已提交
1466
	bool jumped;
1467
	int ret;
1468

1469 1470
	path->mnt = nd->path.mnt;
	path->dentry = dentry;
1471 1472 1473 1474 1475
	if (nd->flags & LOOKUP_RCU) {
		unsigned int seq = *seqp;
		if (unlikely(!*inode))
			return -ENOENT;
		if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
A
Al Viro 已提交
1476
			return 0;
1477
		if (!try_to_unlazy_next(nd, dentry, seq))
1478 1479 1480 1481 1482
			return -ECHILD;
		// *path might've been clobbered by __follow_mount_rcu()
		path->mnt = nd->path.mnt;
		path->dentry = dentry;
	}
A
Al Viro 已提交
1483 1484 1485 1486 1487
	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
	if (jumped) {
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			ret = -EXDEV;
		else
1488
			nd->state |= ND_JUMPED;
A
Al Viro 已提交
1489 1490 1491 1492 1493 1494
	}
	if (unlikely(ret)) {
		dput(path->dentry);
		if (path->mnt != nd->path.mnt)
			mntput(path->mnt);
	} else {
1495 1496 1497 1498 1499 1500
		*inode = d_backing_inode(path->dentry);
		*seqp = 0; /* out of RCU mode, so the value doesn't matter */
	}
	return ret;
}

1501
/*
 * This looks up the name in dcache and possibly revalidates the found dentry.
 * NULL is returned if the dentry does not exist in the cache.
 *
 * If d_revalidate() returns 0 the dentry is invalidated, dropped, and
 * ERR_PTR(0) - i.e. NULL - is returned; a negative result is returned as
 * ERR_PTR() after dropping the dentry.
 */
static struct dentry *lookup_dcache(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
{
	struct dentry *dentry = d_lookup(dir, name);
	if (dentry) {
		int error = d_revalidate(dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error)
				d_invalidate(dentry);
			dput(dentry);
			return ERR_PTR(error);
		}
	}
	return dentry;
}

1522
/*
1523 1524 1525 1526 1527
 * Parent directory has inode locked exclusive.  This is one
 * and only case when ->lookup() gets called on non in-lookup
 * dentries - as the matter of fact, this only gets called
 * when directory is guaranteed to have no in-lookup children
 * at all.
1528
 */
1529
static struct dentry *__lookup_hash(const struct qstr *name,
1530
		struct dentry *base, unsigned int flags)
1531
{
1532
	struct dentry *dentry = lookup_dcache(name, base, flags);
1533 1534
	struct dentry *old;
	struct inode *dir = base->d_inode;
1535

1536
	if (dentry)
M
Miklos Szeredi 已提交
1537
		return dentry;
1538

1539 1540 1541 1542
	/* Don't create child dentry for a dead directory. */
	if (unlikely(IS_DEADDIR(dir)))
		return ERR_PTR(-ENOENT);

1543 1544 1545 1546
	dentry = d_alloc(base, name);
	if (unlikely(!dentry))
		return ERR_PTR(-ENOMEM);

1547 1548 1549 1550 1551 1552
	old = dir->i_op->lookup(dir, dentry, flags);
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
	return dentry;
1553 1554
}

1555 1556 1557
static struct dentry *lookup_fast(struct nameidata *nd,
				  struct inode **inode,
			          unsigned *seqp)
L
Linus Torvalds 已提交
1558
{
N
Nick Piggin 已提交
1559
	struct dentry *dentry, *parent = nd->path.dentry;
A
Al Viro 已提交
1560
	int status = 1;
1561

1562 1563
	/*
	 * Rename seqlock is not required here because in the off chance
A
Al Viro 已提交
1564 1565
	 * of a false negative due to a concurrent rename, the caller is
	 * going to fall back to non-racy lookup.
1566
	 */
N
Nick Piggin 已提交
1567 1568
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
1569
		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
A
Al Viro 已提交
1570
		if (unlikely(!dentry)) {
1571
			if (!try_to_unlazy(nd))
1572 1573
				return ERR_PTR(-ECHILD);
			return NULL;
A
Al Viro 已提交
1574
		}
A
Al Viro 已提交
1575

1576 1577 1578 1579
		/*
		 * This sequence count validates that the inode matches
		 * the dentry name information from lookup.
		 */
1580
		*inode = d_backing_inode(dentry);
A
Al Viro 已提交
1581
		if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
1582
			return ERR_PTR(-ECHILD);
1583 1584 1585 1586 1587 1588 1589 1590

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 *
		 * The memory barrier in read_seqcount_begin of child is
		 *  enough, we can use __read_seqcount_retry here.
		 */
A
Al Viro 已提交
1591
		if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
1592
			return ERR_PTR(-ECHILD);
A
Al Viro 已提交
1593

1594
		*seqp = seq;
1595
		status = d_revalidate(dentry, nd->flags);
1596
		if (likely(status > 0))
1597
			return dentry;
1598
		if (!try_to_unlazy_next(nd, dentry, seq))
1599
			return ERR_PTR(-ECHILD);
1600
		if (status == -ECHILD)
1601 1602
			/* we'd been told to redo it in non-rcu mode */
			status = d_revalidate(dentry, nd->flags);
A
Al Viro 已提交
1603
	} else {
A
Al Viro 已提交
1604
		dentry = __d_lookup(parent, &nd->last);
A
Al Viro 已提交
1605
		if (unlikely(!dentry))
1606
			return NULL;
1607
		status = d_revalidate(dentry, nd->flags);
1608
	}
A
Al Viro 已提交
1609
	if (unlikely(status <= 0)) {
1610
		if (!status)
A
Al Viro 已提交
1611
			d_invalidate(dentry);
1612
		dput(dentry);
1613
		return ERR_PTR(status);
1614
	}
1615
	return dentry;
M
Miklos Szeredi 已提交
1616 1617 1618
}

/* Fast lookup failed, do it the slow way */
A
Al Viro 已提交
1619 1620 1621
static struct dentry *__lookup_slow(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
M
Miklos Szeredi 已提交
1622
{
A
Al Viro 已提交
1623
	struct dentry *dentry, *old;
1624
	struct inode *inode = dir->d_inode;
1625
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1626 1627

	/* Don't go there if it's already dead */
A
Al Viro 已提交
1628
	if (unlikely(IS_DEADDIR(inode)))
A
Al Viro 已提交
1629
		return ERR_PTR(-ENOENT);
A
Al Viro 已提交
1630
again:
1631
	dentry = d_alloc_parallel(dir, name, &wq);
A
Al Viro 已提交
1632
	if (IS_ERR(dentry))
A
Al Viro 已提交
1633
		return dentry;
A
Al Viro 已提交
1634
	if (unlikely(!d_in_lookup(dentry))) {
1635 1636 1637 1638
		int error = d_revalidate(dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error) {
				d_invalidate(dentry);
1639
				dput(dentry);
1640
				goto again;
1641
			}
1642 1643
			dput(dentry);
			dentry = ERR_PTR(error);
1644
		}
A
Al Viro 已提交
1645 1646 1647 1648 1649 1650
	} else {
		old = inode->i_op->lookup(inode, dentry, flags);
		d_lookup_done(dentry);
		if (unlikely(old)) {
			dput(dentry);
			dentry = old;
1651 1652
		}
	}
1653
	return dentry;
L
Linus Torvalds 已提交
1654 1655
}

A
Al Viro 已提交
1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667
/*
 * Run __lookup_slow() with the directory's inode locked shared, and hand
 * back whatever it returns.
 */
static struct dentry *lookup_slow(const struct qstr *name,
				  struct dentry *dir,
				  unsigned int flags)
{
	struct inode *dir_inode = dir->d_inode;
	struct dentry *found;

	inode_lock_shared(dir_inode);
	found = __lookup_slow(name, dir, flags);
	inode_unlock_shared(dir_inode);

	return found;
}

1668 1669
static inline int may_lookup(struct user_namespace *mnt_userns,
			     struct nameidata *nd)
1670 1671
{
	if (nd->flags & LOOKUP_RCU) {
1672
		int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1673
		if (err != -ECHILD || !try_to_unlazy(nd))
1674 1675
			return err;
	}
1676
	return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
1677 1678
}

1679 1680 1681 1682
static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
{
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
		return -ELOOP;
1683 1684 1685 1686 1687

	if (likely(nd->depth != EMBEDDED_LEVELS))
		return 0;
	if (likely(nd->stack != nd->internal))
		return 0;
1688
	if (likely(nd_alloc_stack(nd)))
1689
		return 0;
1690 1691 1692 1693

	if (nd->flags & LOOKUP_RCU) {
		// we need to grab link before we do unlazy.  And we can't skip
		// unlazy even if we fail to grab the link - cleanup needs it
1694
		bool grabbed_link = legitimize_path(nd, link, seq);
1695

1696
		if (!try_to_unlazy(nd) != 0 || !grabbed_link)
1697 1698 1699 1700
			return -ECHILD;

		if (nd_alloc_stack(nd))
			return 0;
1701
	}
1702
	return -ENOMEM;
1703 1704
}

1705 1706
/*
 * Bit flags for the "flags" argument of step_into() and pick_link().
 *  WALK_TRAILING - step_into() follows a symlink only if LOOKUP_FOLLOW is
 *		    also set; pick_link() additionally calls may_follow_link().
 *  WALK_NOFOLLOW - step_into() never follows a symlink.
 *  WALK_MORE     - not used in this part of the file; see its users for
 *		    the meaning.
 */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

1707
static const char *pick_link(struct nameidata *nd, struct path *link,
1708
		     struct inode *inode, unsigned seq, int flags)
1709
{
A
Al Viro 已提交
1710
	struct saved *last;
1711
	const char *res;
1712
	int error = reserve_stack(nd, link, seq);
1713

1714
	if (unlikely(error)) {
1715
		if (!(nd->flags & LOOKUP_RCU))
A
Al Viro 已提交
1716
			path_put(link);
1717
		return ERR_PTR(error);
1718
	}
1719
	last = nd->stack + nd->depth++;
A
Al Viro 已提交
1720
	last->link = *link;
1721
	clear_delayed_call(&last->done);
1722
	last->seq = seq;
1723

1724
	if (flags & WALK_TRAILING) {
1725 1726 1727 1728 1729
		error = may_follow_link(nd, inode);
		if (unlikely(error))
			return ERR_PTR(error);
	}

1730 1731
	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
			unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
1732 1733 1734 1735 1736 1737
		return ERR_PTR(-ELOOP);

	if (!(nd->flags & LOOKUP_RCU)) {
		touch_atime(&last->link);
		cond_resched();
	} else if (atime_needs_update(&last->link, inode)) {
1738
		if (!try_to_unlazy(nd))
1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754
			return ERR_PTR(-ECHILD);
		touch_atime(&last->link);
	}

	error = security_inode_follow_link(link->dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
		return ERR_PTR(error);

	res = READ_ONCE(inode->i_link);
	if (!res) {
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
		if (nd->flags & LOOKUP_RCU) {
			res = get(NULL, inode, &last->done);
1755
			if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776
				res = get(link->dentry, inode, &last->done);
		} else {
			res = get(link->dentry, inode, &last->done);
		}
		if (!res)
			goto all_done;
		if (IS_ERR(res))
			return res;
	}
	if (*res == '/') {
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		while (unlikely(*++res == '/'))
			;
	}
	if (*res)
		return res;
all_done: // pure jump
	put_link(nd);
	return NULL;
1777 1778
}

1779 1780 1781 1782 1783 1784
/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 */
1785
static const char *step_into(struct nameidata *nd, int flags,
A
Al Viro 已提交
1786
		     struct dentry *dentry, struct inode *inode, unsigned seq)
1787
{
A
Al Viro 已提交
1788 1789 1790 1791
	struct path path;
	int err = handle_mounts(nd, dentry, &path, &inode, &seq);

	if (err < 0)
1792
		return ERR_PTR(err);
A
Al Viro 已提交
1793
	if (likely(!d_is_symlink(path.dentry)) ||
1794
	   ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
A
Al Viro 已提交
1795
	   (flags & WALK_NOFOLLOW)) {
1796
		/* not a symlink or should not follow */
1797 1798 1799 1800 1801 1802
		if (!(nd->flags & LOOKUP_RCU)) {
			dput(nd->path.dentry);
			if (nd->path.mnt != path.mnt)
				mntput(nd->path.mnt);
		}
		nd->path = path;
1803 1804
		nd->inode = inode;
		nd->seq = seq;
1805
		return NULL;
1806
	}
1807
	if (nd->flags & LOOKUP_RCU) {
1808
		/* make sure that d_is_symlink above matches inode */
A
Al Viro 已提交
1809
		if (read_seqcount_retry(&path.dentry->d_seq, seq))
1810
			return ERR_PTR(-ECHILD);
1811 1812 1813
	} else {
		if (path.mnt == nd->path.mnt)
			mntget(path.mnt);
1814
	}
1815
	return pick_link(nd, &path, inode, seq, flags);
1816 1817
}

1818 1819 1820
static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
					struct inode **inodep,
					unsigned *seqp)
1821
{
A
Al Viro 已提交
1822
	struct dentry *parent, *old;
1823

A
Al Viro 已提交
1824 1825 1826
	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
A
Al Viro 已提交
1827
		struct path path;
1828
		unsigned seq;
A
Al Viro 已提交
1829 1830 1831
		if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
					   &nd->root, &path, &seq))
			goto in_root;
1832 1833 1834 1835 1836 1837 1838 1839
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-ECHILD);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		nd->seq = seq;
		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
			return ERR_PTR(-ECHILD);
		/* we know that mountpoint was pinned */
1840
	}
A
Al Viro 已提交
1841 1842 1843 1844 1845 1846 1847 1848 1849 1850
	old = nd->path.dentry;
	parent = old->d_parent;
	*inodep = parent->d_inode;
	*seqp = read_seqcount_begin(&parent->d_seq);
	if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
		return ERR_PTR(-ECHILD);
	if (unlikely(!path_connected(nd->path.mnt, parent)))
		return ERR_PTR(-ECHILD);
	return parent;
in_root:
1851 1852
	if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
		return ERR_PTR(-ECHILD);
1853 1854 1855
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-ECHILD);
	return NULL;
1856 1857
}

1858 1859 1860
static struct dentry *follow_dotdot(struct nameidata *nd,
				 struct inode **inodep,
				 unsigned *seqp)
1861
{
A
Al Viro 已提交
1862 1863 1864 1865 1866
	struct dentry *parent;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
1867 1868 1869 1870 1871
		struct path path;

		if (!choose_mountpoint(real_mount(nd->path.mnt),
				       &nd->root, &path))
			goto in_root;
1872 1873
		path_put(&nd->path);
		nd->path = path;
1874
		nd->inode = path.dentry->d_inode;
1875 1876
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-EXDEV);
1877
	}
A
Al Viro 已提交
1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888
	/* rare case of legitimate dget_parent()... */
	parent = dget_parent(nd->path.dentry);
	if (unlikely(!path_connected(nd->path.mnt, parent))) {
		dput(parent);
		return ERR_PTR(-ENOENT);
	}
	*seqp = 0;
	*inodep = parent->d_inode;
	return parent;

in_root:
1889 1890 1891 1892
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-EXDEV);
	dget(nd->path.dentry);
	return NULL;
1893 1894
}

1895
static const char *handle_dots(struct nameidata *nd, int type)
1896 1897
{
	if (type == LAST_DOTDOT) {
1898
		const char *error = NULL;
1899 1900 1901
		struct dentry *parent;
		struct inode *inode;
		unsigned seq;
1902 1903

		if (!nd->root.mnt) {
1904
			error = ERR_PTR(set_root(nd));
1905 1906 1907 1908
			if (error)
				return error;
		}
		if (nd->flags & LOOKUP_RCU)
1909
			parent = follow_dotdot_rcu(nd, &inode, &seq);
1910
		else
1911 1912 1913 1914 1915 1916 1917 1918 1919 1920
			parent = follow_dotdot(nd, &inode, &seq);
		if (IS_ERR(parent))
			return ERR_CAST(parent);
		if (unlikely(!parent))
			error = step_into(nd, WALK_NOFOLLOW,
					 nd->path.dentry, nd->inode, nd->seq);
		else
			error = step_into(nd, WALK_NOFOLLOW,
					 parent, inode, seq);
		if (unlikely(error))
1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931
			return error;

		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
			/*
			 * If there was a racing rename or mount along our
			 * path, then we can't be sure that ".." hasn't jumped
			 * above nd->root (and so userspace should retry or use
			 * some fallback).
			 */
			smp_rmb();
			if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
1932
				return ERR_PTR(-EAGAIN);
1933
			if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
1934
				return ERR_PTR(-EAGAIN);
1935 1936
		}
	}
1937
	return NULL;
1938 1939
}

1940
static const char *walk_component(struct nameidata *nd, int flags)
1941
{
1942
	struct dentry *dentry;
1943
	struct inode *inode;
1944
	unsigned seq;
1945 1946 1947 1948 1949
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
1950
	if (unlikely(nd->last_type != LAST_NORM)) {
A
Al Viro 已提交
1951
		if (!(flags & WALK_MORE) && nd->depth)
1952
			put_link(nd);
1953
		return handle_dots(nd, nd->last_type);
1954
	}
1955 1956
	dentry = lookup_fast(nd, &inode, &seq);
	if (IS_ERR(dentry))
1957
		return ERR_CAST(dentry);
1958
	if (unlikely(!dentry)) {
1959 1960
		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
		if (IS_ERR(dentry))
1961
			return ERR_CAST(dentry);
1962
	}
1963 1964
	if (!(flags & WALK_MORE) && nd->depth)
		put_link(nd);
1965
	return step_into(nd, flags, dentry, inode, seq);
1966 1967
}

1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

1987
#include <asm/word-at-a-time.h>
1988

1989
#ifdef HASH_MIX
1990

1991
/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
1992

1993
#elif defined(CONFIG_64BIT)
1994
/*
1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
2022
 */
2023 2024 2025 2026 2027
/*
 * Fold the word @a into the two-word state (@x, @y).
 *
 * Expands to one parenthesised comma expression that assigns to x and y
 * in place, so both must be plain lvalues; each of them is named several
 * times in the expansion.  Uses 64-bit rotates by 12 and 45.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)
2028

2029
/*
2030 2031 2032
 * Fold two longs into one 32-bit hash value.  This must be fast, but
 * latency isn't quite as critical, as there is a fair bit of additional
 * work done before the hash value is used.
2033
 */
2034
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
2035
{
2036 2037 2038
	y ^= x * GOLDEN_RATIO_64;
	y *= GOLDEN_RATIO_64;
	return y >> 32;
2039 2040
}

2041 2042
#else	/* 32-bit case */

2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057
/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 */
/*
 * Fold the word @a into the two-word state (@x, @y).
 *
 * Expands to one parenthesised comma expression that assigns to x and y
 * in place, so both must be plain lvalues; each of them is named several
 * times in the expansion.  Uses 32-bit rotates by 7 and 20.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)
2058

2059
/* Fold the two state words into one hash value using __hash_32(). */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	const unsigned int hashed_x = __hash_32(x);

	return __hash_32(y ^ hashed_x);
}

2065 2066
#endif

2067 2068 2069 2070 2071 2072 2073
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 */
2074
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	const unsigned int step = sizeof(unsigned long);
	unsigned long word, x = 0, y = (unsigned long)salt;

	/* mix every complete word of the name */
	while (len >= step) {
		word = load_unaligned_zeropad(name);
		HASH_MIX(x, y, word);
		name += step;
		len -= step;
	}

	/* leftover bytes are masked into x only; no HASH_MIX round for them */
	if (len) {
		word = load_unaligned_zeropad(name);
		x ^= word & bytemask_from_count(len);
	}

	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);

2094
/* Return the "hash_len" (hash and length) of a null-terminated string */
2095
u64 hashlen_string(const void *salt, const char *name)
2096
{
2097 2098
	unsigned long a = 0, x = 0, y = (unsigned long)salt;
	unsigned long adata, mask, len;
2099 2100
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

2101 2102 2103
	len = 0;
	goto inside;

2104
	do {
2105
		HASH_MIX(x, y, a);
2106
		len += sizeof(unsigned long);
2107
inside:
2108 2109 2110 2111 2112
		a = load_unaligned_zeropad(name+len);
	} while (!has_zero(a, &adata, &constants));

	adata = prep_zero_mask(a, adata, &constants);
	mask = create_zero_mask(adata);
2113
	x ^= a & zero_bytemask(mask);
2114

2115
	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2116 2117 2118
}
EXPORT_SYMBOL(hashlen_string);

2119 2120
/*
 * Calculate the length and hash of the path component, and
2121
 * return the "hash_len" as the result.
2122
 */
2123
static inline u64 hash_name(const void *salt, const char *name)
2124
{
2125 2126
	unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
	unsigned long adata, bdata, mask, len;
2127
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2128

2129 2130 2131
	len = 0;
	goto inside;

2132
	do {
2133
		HASH_MIX(x, y, a);
2134
		len += sizeof(unsigned long);
2135
inside:
2136
		a = load_unaligned_zeropad(name+len);
2137 2138 2139 2140 2141 2142
		b = a ^ REPEAT_BYTE('/');
	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

	adata = prep_zero_mask(a, adata, &constants);
	bdata = prep_zero_mask(b, bdata, &constants);
	mask = create_zero_mask(adata | bdata);
2143
	x ^= a & zero_bytemask(mask);
2144

2145
	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2146 2147
}

2148
#else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2149

2150
/* Return the hash of a string of known length */
2151
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long acc = init_name_hash(salt);
	unsigned int i;

	/* feed each of the @len bytes, as unsigned char, into the hash */
	for (i = 0; i < len; i++)
		acc = partial_name_hash((unsigned char)name[i], acc);

	return end_name_hash(acc);
}
2158
EXPORT_SYMBOL(full_name_hash);
L
Linus Torvalds 已提交
2159

2160
/* Return the "hash_len" (hash and length) of a null-terminated string */
2161
u64 hashlen_string(const void *salt, const char *name)
{
	unsigned long acc = init_name_hash(salt);
	unsigned long len, ch;

	/* hash byte by byte up to, but not including, the terminating NUL */
	for (len = 0; (ch = (unsigned char)name[len]) != 0; len++)
		acc = partial_name_hash(ch, acc);

	return hashlen_create(end_name_hash(acc), len);
}
2174
EXPORT_SYMBOL(hashlen_string);
2175

2176 2177 2178 2179
/*
 * We know there's a real path component here of at least
 * one character.
 */
2180
static inline u64 hash_name(const void *salt, const char *name)
{
	unsigned long acc = init_name_hash(salt);
	unsigned long len = 0;
	unsigned long ch = (unsigned char)name[0];

	/*
	 * The first byte is always consumed, whatever it is; after that
	 * the scan stops at the first NUL or '/'.
	 */
	for (;;) {
		acc = partial_name_hash(ch, acc);
		ch = (unsigned char)name[++len];
		if (!ch || ch == '/')
			break;
	}

	return hashlen_create(end_name_hash(acc), len);
}

2194 2195
#endif

L
Linus Torvalds 已提交
2196 2197
/*
 * Name resolution.
2198 2199
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
L
Linus Torvalds 已提交
2200
 *
2201 2202
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
L
Linus Torvalds 已提交
2203
 */
2204
static int link_path_walk(const char *name, struct nameidata *nd)
L
Linus Torvalds 已提交
2205
{
2206
	int depth = 0; // depth <= nd->depth
L
Linus Torvalds 已提交
2207
	int err;
A
Al Viro 已提交
2208

2209
	nd->last_type = LAST_ROOT;
2210
	nd->flags |= LOOKUP_PARENT;
2211 2212
	if (IS_ERR(name))
		return PTR_ERR(name);
L
Linus Torvalds 已提交
2213 2214
	while (*name=='/')
		name++;
2215 2216
	if (!*name) {
		nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
2217
		return 0;
2218
	}
L
Linus Torvalds 已提交
2219 2220 2221

	/* At this point we know we have a real path component. */
	for(;;) {
2222
		struct user_namespace *mnt_userns;
2223
		const char *link;
2224
		u64 hash_len;
A
Al Viro 已提交
2225
		int type;
L
Linus Torvalds 已提交
2226

2227 2228
		mnt_userns = mnt_user_ns(nd->path.mnt);
		err = may_lookup(mnt_userns, nd);
2229
		if (err)
2230
			return err;
L
Linus Torvalds 已提交
2231

2232
		hash_len = hash_name(nd->path.dentry, name);
L
Linus Torvalds 已提交
2233

A
Al Viro 已提交
2234
		type = LAST_NORM;
2235
		if (name[0] == '.') switch (hashlen_len(hash_len)) {
A
Al Viro 已提交
2236
			case 2:
2237
				if (name[1] == '.') {
A
Al Viro 已提交
2238
					type = LAST_DOTDOT;
2239
					nd->state |= ND_JUMPED;
A
Al Viro 已提交
2240
				}
A
Al Viro 已提交
2241 2242 2243 2244
				break;
			case 1:
				type = LAST_DOT;
		}
2245 2246
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
2247
			nd->state &= ~ND_JUMPED;
2248
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
2249
				struct qstr this = { { .hash_len = hash_len }, .name = name };
2250
				err = parent->d_op->d_hash(parent, &this);
2251
				if (err < 0)
2252
					return err;
2253 2254
				hash_len = this.hash_len;
				name = this.name;
2255 2256
			}
		}
A
Al Viro 已提交
2257

2258 2259
		nd->last.hash_len = hash_len;
		nd->last.name = name;
2260 2261
		nd->last_type = type;

2262 2263
		name += hashlen_len(hash_len);
		if (!*name)
2264
			goto OK;
2265 2266 2267 2268 2269
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
2270 2271
			name++;
		} while (unlikely(*name == '/'));
2272 2273
		if (unlikely(!*name)) {
OK:
2274
			/* pathname or trailing symlink, done */
2275
			if (!depth) {
2276
				nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
2277
				nd->dir_mode = nd->inode->i_mode;
2278
				nd->flags &= ~LOOKUP_PARENT;
2279
				return 0;
2280
			}
2281
			/* last component of nested symlink */
2282
			name = nd->stack[--depth].name;
2283
			link = walk_component(nd, 0);
A
Al Viro 已提交
2284 2285
		} else {
			/* not the last component */
2286
			link = walk_component(nd, WALK_MORE);
2287
		}
2288 2289 2290 2291
		if (unlikely(link)) {
			if (IS_ERR(link))
				return PTR_ERR(link);
			/* a symlink to follow */
2292
			nd->stack[depth++].name = name;
2293 2294
			name = link;
			continue;
N
Nick Piggin 已提交
2295
		}
2296 2297
		if (unlikely(!d_can_lookup(nd->path.dentry))) {
			if (nd->flags & LOOKUP_RCU) {
2298
				if (!try_to_unlazy(nd))
2299 2300
					return -ECHILD;
			}
2301
			return -ENOTDIR;
2302
		}
L
Linus Torvalds 已提交
2303 2304 2305
	}
}

2306
/* must be paired with terminate_walk() */
2307
static const char *path_init(struct nameidata *nd, unsigned flags)
N
Nick Piggin 已提交
2308
{
2309
	int error;
2310
	const char *s = nd->name->name;
N
Nick Piggin 已提交
2311

J
Jens Axboe 已提交
2312 2313 2314 2315
	/* LOOKUP_CACHED requires RCU, ask caller to retry */
	if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
		return ERR_PTR(-EAGAIN);

2316 2317
	if (!*s)
		flags &= ~LOOKUP_RCU;
2318 2319
	if (flags & LOOKUP_RCU)
		rcu_read_lock();
2320

2321 2322
	nd->flags = flags;
	nd->state |= ND_JUMPED;
2323 2324 2325 2326 2327

	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
	smp_rmb();

2328
	if (nd->state & ND_ROOT_PRESET) {
2329 2330
		struct dentry *root = nd->root.dentry;
		struct inode *inode = root->d_inode;
2331 2332
		if (*s && unlikely(!d_can_lookup(root)))
			return ERR_PTR(-ENOTDIR);
2333 2334 2335
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
2336
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2337
			nd->root_seq = nd->seq;
2338 2339 2340
		} else {
			path_get(&nd->path);
		}
2341
		return s;
2342 2343
	}

N
Nick Piggin 已提交
2344 2345
	nd->root.mnt = NULL;

2346 2347
	/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
	if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
2348 2349 2350 2351
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		return s;
2352 2353 2354 2355
	}

	/* Relative pathname -- get the starting-point it is relative to. */
	if (nd->dfd == AT_FDCWD) {
A
Al Viro 已提交
2356 2357 2358
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;
N
Nick Piggin 已提交
2359

A
Al Viro 已提交
2360 2361 2362
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
2363
				nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2364 2365 2366 2367
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
2368
			nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2369
		}
N
Nick Piggin 已提交
2370
	} else {
2371
		/* Caller must check execute permissions on the starting path component */
2372
		struct fd f = fdget_raw(nd->dfd);
N
Nick Piggin 已提交
2373 2374
		struct dentry *dentry;

2375
		if (!f.file)
2376
			return ERR_PTR(-EBADF);
N
Nick Piggin 已提交
2377

2378
		dentry = f.file->f_path.dentry;
N
Nick Piggin 已提交
2379

2380 2381 2382
		if (*s && unlikely(!d_can_lookup(dentry))) {
			fdput(f);
			return ERR_PTR(-ENOTDIR);
A
Al Viro 已提交
2383
		}
N
Nick Piggin 已提交
2384

2385
		nd->path = f.file->f_path;
A
Al Viro 已提交
2386
		if (flags & LOOKUP_RCU) {
A
Al Viro 已提交
2387 2388
			nd->inode = nd->path.dentry->d_inode;
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
A
Al Viro 已提交
2389
		} else {
2390
			path_get(&nd->path);
A
Al Viro 已提交
2391
			nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2392
		}
A
Al Viro 已提交
2393
		fdput(f);
N
Nick Piggin 已提交
2394
	}
2395

2396 2397 2398 2399 2400 2401 2402
	/* For scoped-lookups we need to set the root to the dirfd as well. */
	if (flags & LOOKUP_IS_SCOPED) {
		nd->root = nd->path;
		if (flags & LOOKUP_RCU) {
			nd->root_seq = nd->seq;
		} else {
			path_get(&nd->root);
2403
			nd->state |= ND_ROOT_GRABBED;
2404 2405 2406
		}
	}
	return s;
2407 2408
}

2409
static inline const char *lookup_last(struct nameidata *nd)
2410 2411 2412 2413
{
	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

2414
	return walk_component(nd, WALK_TRAILING);
2415 2416
}

2417 2418
static int handle_lookup_down(struct nameidata *nd)
{
2419
	if (!(nd->flags & LOOKUP_RCU))
2420
		dget(nd->path.dentry);
2421 2422
	return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
			nd->path.dentry, nd->inode, nd->seq));
2423 2424
}

2425
/* Returns 0 and nd will be valid on success; returns error otherwise. */
2426
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
2427
{
2428
	const char *s = path_init(nd, flags);
2429
	int err;
N
Nick Piggin 已提交
2430

2431
	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
2432
		err = handle_lookup_down(nd);
2433 2434
		if (unlikely(err < 0))
			s = ERR_PTR(err);
2435 2436
	}

2437 2438 2439
	while (!(err = link_path_walk(s, nd)) &&
	       (s = lookup_last(nd)) != NULL)
		;
2440 2441
	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
		err = handle_lookup_down(nd);
2442
		nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
2443
	}
2444 2445
	if (!err)
		err = complete_walk(nd);
2446

2447 2448
	if (!err && nd->flags & LOOKUP_DIRECTORY)
		if (!d_can_lookup(nd->path.dentry))
A
Al Viro 已提交
2449
			err = -ENOTDIR;
2450 2451 2452 2453 2454 2455
	if (!err) {
		*path = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2456
	return err;
A
Al Viro 已提交
2457
}
N
Nick Piggin 已提交
2458

2459 2460
int filename_lookup(int dfd, struct filename *name, unsigned flags,
		    struct path *path, struct path *root)
A
Al Viro 已提交
2461
{
2462
	int retval;
2463
	struct nameidata nd;
2464 2465
	if (IS_ERR(name))
		return PTR_ERR(name);
2466
	set_nameidata(&nd, dfd, name, root);
2467
	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
A
Al Viro 已提交
2468
	if (unlikely(retval == -ECHILD))
2469
		retval = path_lookupat(&nd, flags, path);
A
Al Viro 已提交
2470
	if (unlikely(retval == -ESTALE))
2471
		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
N
Nick Piggin 已提交
2472

2473
	if (likely(!retval))
2474 2475
		audit_inode(name, path->dentry,
			    flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
2476
	restore_nameidata();
2477
	putname(name);
2478
	return retval;
L
Linus Torvalds 已提交
2479 2480
}

2481
/* Returns 0 and nd will be valid on success; returns error otherwise. */
2482
static int path_parentat(struct nameidata *nd, unsigned flags,
2483
				struct path *parent)
2484
{
2485
	const char *s = path_init(nd, flags);
2486
	int err = link_path_walk(s, nd);
2487 2488
	if (!err)
		err = complete_walk(nd);
2489 2490 2491 2492 2493 2494
	if (!err) {
		*parent = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2495 2496 2497
	return err;
}

2498
static struct filename *filename_parentat(int dfd, struct filename *name,
2499 2500
				unsigned int flags, struct path *parent,
				struct qstr *last, int *type)
2501 2502
{
	int retval;
2503
	struct nameidata nd;
2504

2505 2506
	if (IS_ERR(name))
		return name;
2507
	set_nameidata(&nd, dfd, name, NULL);
2508
	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2509
	if (unlikely(retval == -ECHILD))
2510
		retval = path_parentat(&nd, flags, parent);
2511
	if (unlikely(retval == -ESTALE))
2512
		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2513 2514 2515
	if (likely(!retval)) {
		*last = nd.last;
		*type = nd.last_type;
2516
		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
2517 2518 2519
	} else {
		putname(name);
		name = ERR_PTR(retval);
2520
	}
2521
	restore_nameidata();
2522
	return name;
2523 2524
}

A
Al Viro 已提交
2525 2526
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
2527
{
2528 2529
	struct filename *filename;
	struct dentry *d;
2530 2531
	struct qstr last;
	int type;
2532

2533 2534
	filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
				    &last, &type);
2535 2536
	if (IS_ERR(filename))
		return ERR_CAST(filename);
2537
	if (unlikely(type != LAST_NORM)) {
2538
		path_put(path);
2539 2540
		putname(filename);
		return ERR_PTR(-EINVAL);
A
Al Viro 已提交
2541
	}
A
Al Viro 已提交
2542
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
2543
	d = __lookup_hash(&last, path->dentry, 0);
A
Al Viro 已提交
2544
	if (IS_ERR(d)) {
A
Al Viro 已提交
2545
		inode_unlock(path->dentry->d_inode);
2546
		path_put(path);
A
Al Viro 已提交
2547
	}
2548
	putname(filename);
A
Al Viro 已提交
2549
	return d;
2550 2551
}

A
Al Viro 已提交
2552 2553
int kern_path(const char *name, unsigned int flags, struct path *path)
{
2554 2555
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags, path, NULL);
A
Al Viro 已提交
2556
}
2557
EXPORT_SYMBOL(kern_path);
A
Al Viro 已提交
2558

2559 2560 2561 2562 2563 2564
/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
2565
 * @path: pointer to struct path to fill
2566 2567 2568
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
		    const char *name, unsigned int flags,
2569
		    struct path *path)
2570
{
2571 2572
	struct path root = {.mnt = mnt, .dentry = dentry};
	/* the first argument of filename_lookup() is ignored with root */
2573 2574
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags , path, &root);
2575
}
2576
EXPORT_SYMBOL(vfs_path_lookup);
2577

2578 2579
static int lookup_one_len_common(const char *name, struct dentry *base,
				 int len, struct qstr *this)
2580
{
2581 2582 2583
	this->name = name;
	this->len = len;
	this->hash = full_name_hash(base, name, len);
A
Al Viro 已提交
2584
	if (!len)
2585
		return -EACCES;
A
Al Viro 已提交
2586

A
Al Viro 已提交
2587 2588
	if (unlikely(name[0] == '.')) {
		if (len < 2 || (len == 2 && name[1] == '.'))
2589
			return -EACCES;
A
Al Viro 已提交
2590 2591
	}

A
Al Viro 已提交
2592
	while (len--) {
2593
		unsigned int c = *(const unsigned char *)name++;
A
Al Viro 已提交
2594
		if (c == '/' || c == '\0')
2595
			return -EACCES;
A
Al Viro 已提交
2596
	}
2597 2598 2599 2600 2601
	/*
	 * See if the low-level filesystem might want
	 * to use its own hash..
	 */
	if (base->d_flags & DCACHE_OP_HASH) {
2602
		int err = base->d_op->d_hash(base, this);
2603
		if (err < 0)
2604
			return err;
2605
	}
2606

2607
	return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
2608 2609
}

2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638
/**
 * try_lookup_one_len - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Look up a dentry by name in the dcache, returning NULL if it does not
 * currently exist.  The function does not try to create a dentry.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
	struct qstr this;
	int err;

	WARN_ON_ONCE(!inode_is_locked(base->d_inode));

	err = lookup_one_len_common(name, base, len, &this);
	if (err)
		return ERR_PTR(err);

	return lookup_dcache(&this, base, 0);
}
EXPORT_SYMBOL(try_lookup_one_len);

2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651
/**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
2652
	struct dentry *dentry;
2653 2654 2655 2656 2657 2658
	struct qstr this;
	int err;

	WARN_ON_ONCE(!inode_is_locked(base->d_inode));

	err = lookup_one_len_common(name, base, len, &this);
2659 2660 2661
	if (err)
		return ERR_PTR(err);

2662 2663
	dentry = lookup_dcache(&this, base, 0);
	return dentry ? dentry : __lookup_slow(&this, base, 0);
2664
}
2665
EXPORT_SYMBOL(lookup_one_len);
2666

2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683
/**
 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_len_unlocked(const char *name,
				       struct dentry *base, int len)
{
	struct qstr this;
	int err;
2684
	struct dentry *ret;
2685

2686
	err = lookup_one_len_common(name, base, len, &this);
2687 2688 2689
	if (err)
		return ERR_PTR(err);

2690 2691 2692 2693
	ret = lookup_dcache(&this, base, 0);
	if (!ret)
		ret = lookup_slow(&this, base, 0);
	return ret;
2694 2695 2696
}
EXPORT_SYMBOL(lookup_one_len_unlocked);

A
Al Viro 已提交
2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708
/*
 * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
 * on negatives.  Returns known positive or ERR_PTR(); that's what
 * most of the users want.  Note that pinned negative with unlocked parent
 * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
 * need to be very careful; pinned positives have ->d_inode stable, so
 * this one avoids such problems.
 */
struct dentry *lookup_positive_unlocked(const char *name,
				       struct dentry *base, int len)
{
	struct dentry *ret = lookup_one_len_unlocked(name, base, len);
2709
	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
A
Al Viro 已提交
2710 2711 2712 2713 2714 2715 2716
		dput(ret);
		ret = ERR_PTR(-ENOENT);
	}
	return ret;
}
EXPORT_SYMBOL(lookup_positive_unlocked);

2717 2718 2719 2720 2721 2722
#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
	/* Find something mounted on "pts" in the same directory as
	 * the input path.
	 */
2723 2724
	struct dentry *parent = dget_parent(path->dentry);
	struct dentry *child;
A
Al Viro 已提交
2725
	struct qstr this = QSTR_INIT("pts", 3);
2726

2727 2728
	if (unlikely(!path_connected(path->mnt, parent))) {
		dput(parent);
2729
		return -ENOENT;
2730
	}
2731 2732
	dput(path->dentry);
	path->dentry = parent;
2733 2734 2735 2736 2737 2738
	child = d_hash_and_lookup(parent, &this);
	if (!child)
		return -ENOENT;

	path->dentry = child;
	dput(parent);
A
Al Viro 已提交
2739
	follow_down(path);
2740 2741 2742 2743
	return 0;
}
#endif

2744 2745
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
		 struct path *path, int *empty)
{
	struct filename *fname = getname_flags(name, flags, empty);

	/* filename_lookup() handles an ERR_PTR name and releases a valid one */
	return filename_lookup(dfd, fname, flags, path, NULL);
}
2750
EXPORT_SYMBOL(user_path_at_empty);
2751

2752 2753
int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
		   struct inode *inode)
L
Linus Torvalds 已提交
2754
{
2755
	kuid_t fsuid = current_fsuid();
2756

2757
	if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
L
Linus Torvalds 已提交
2758
		return 0;
2759
	if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
L
Linus Torvalds 已提交
2760
		return 0;
2761
	return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
L
Linus Torvalds 已提交
2762
}
M
Miklos Szeredi 已提交
2763
EXPORT_SYMBOL(__check_sticky);
L
Linus Torvalds 已提交
2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777

/*
 *	Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
 *     links pointing to it.
2778 2779 2780 2781 2782
 *  7. If the victim has an unknown uid or gid we can't change the inode.
 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 10. We can't remove a root or mountpoint.
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
L
Linus Torvalds 已提交
2783 2784
 *     nfs_async_unlink().
 */
2785 2786
static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
		      struct dentry *victim, bool isdir)
L
Linus Torvalds 已提交
2787
{
2788
	struct inode *inode = d_backing_inode(victim);
L
Linus Torvalds 已提交
2789 2790
	int error;

2791
	if (d_is_negative(victim))
L
Linus Torvalds 已提交
2792
		return -ENOENT;
2793
	BUG_ON(!inode);
L
Linus Torvalds 已提交
2794 2795

	BUG_ON(victim->d_parent->d_inode != dir);
2796 2797

	/* Inode writeback is not safe when the uid or gid are invalid. */
2798 2799
	if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
2800 2801
		return -EOVERFLOW;

2802
	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
L
Linus Torvalds 已提交
2803

2804
	error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2805 2806 2807 2808
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;
2809

2810 2811 2812
	if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) ||
	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
	    HAS_UNMAPPED_ID(mnt_userns, inode))
L
Linus Torvalds 已提交
2813 2814
		return -EPERM;
	if (isdir) {
M
Miklos Szeredi 已提交
2815
		if (!d_is_dir(victim))
L
Linus Torvalds 已提交
2816 2817 2818
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
M
Miklos Szeredi 已提交
2819
	} else if (d_is_dir(victim))
L
Linus Torvalds 已提交
2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}

/*	Check whether we can create an object with dentry child in directory
 *  dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
2833 2834 2835
 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
 *  4. We should have write and exec permissions on dir
 *  5. We can't do it if dir is immutable (done in permission())
L
Linus Torvalds 已提交
2836
 */
2837 2838
static inline int may_create(struct user_namespace *mnt_userns,
			     struct inode *dir, struct dentry *child)
L
Linus Torvalds 已提交
2839
{
2840
	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
L
Linus Torvalds 已提交
2841 2842 2843 2844
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
2845
	if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
2846
		return -EOVERFLOW;
2847

2848
	return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2849 2850 2851 2852 2853 2854 2855 2856 2857 2858
}

/*
 * p1 and p2 should be directories on the same fs.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p;

	if (p1 == p2) {
A
Al Viro 已提交
2859
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
L
Linus Torvalds 已提交
2860 2861 2862
		return NULL;
	}

2863
	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2864

2865 2866
	p = d_ancestor(p2, p1);
	if (p) {
A
Al Viro 已提交
2867 2868
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
2869
		return p;
L
Linus Torvalds 已提交
2870 2871
	}

2872 2873
	p = d_ancestor(p1, p2);
	if (p) {
A
Al Viro 已提交
2874 2875
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
2876
		return p;
L
Linus Torvalds 已提交
2877 2878
	}

A
Al Viro 已提交
2879 2880
	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
L
Linus Torvalds 已提交
2881 2882
	return NULL;
}
2883
EXPORT_SYMBOL(lock_rename);
L
Linus Torvalds 已提交
2884 2885 2886

void unlock_rename(struct dentry *p1, struct dentry *p2)
{
A
Al Viro 已提交
2887
	inode_unlock(p1->d_inode);
L
Linus Torvalds 已提交
2888
	if (p1 != p2) {
A
Al Viro 已提交
2889
		inode_unlock(p2->d_inode);
2890
		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2891 2892
	}
}
2893
EXPORT_SYMBOL(unlock_rename);
L
Linus Torvalds 已提交
2894

2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912
/**
 * vfs_create - create new file
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dir:	inode of the directory in which the file is created
 * @dentry:	dentry of the file to create; must not have an inode yet
 * @mode:	mode of the new file
 * @want_excl:	whether the file must not yet exist
 *
 * Create a new regular file.  Only the S_IALLUGO bits of @mode are kept and
 * S_IFREG is forced on before the filesystem's ->create() is called.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass init_user_ns.
 *
 * Return: 0 on success, -EACCES if the directory has no ->create() method,
 * otherwise the error from may_create(), security_inode_create() or
 * ->create().
 */
int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
	       struct dentry *dentry, umode_t mode, bool want_excl)
{
	int error = may_create(mnt_userns, dir, dentry);
	if (error)
		return error;

	if (!dir->i_op->create)
		return -EACCES;	/* shouldn't it be ENOSYS? */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_create);
L
Linus Torvalds 已提交
2931

A
Al Viro 已提交
2932 2933 2934 2935 2936
/*
 * Create a regular-file object for @dentry using the caller-supplied
 * constructor @f instead of the directory's ->create() method.
 *
 * Permission is checked with may_create() against init_user_ns, using the
 * inode of @dentry's parent as the directory.  @mode is reduced to its
 * S_IALLUGO bits with S_IFREG forced on, then @f(dentry, mode, arg) is called
 * once security_inode_create() has agreed.  fsnotify_create() is issued on
 * success.
 *
 * Returns 0 on success or the first error encountered.
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
		int (*f)(struct dentry *, umode_t, void *),
		void *arg)
{
	struct inode *dir = dentry->d_parent->d_inode;
	int error = may_create(&init_user_ns, dir, dentry);
	if (error)
		return error;

	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	error = f(dentry, mode, arg);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_mkobj);

2953 2954 2955 2956 2957 2958
/*
 * Returns true unless device access is disabled for @path, i.e. unless the
 * mount has MNT_NODEV set or its superblock has SB_I_NODEV set.
 */
bool may_open_dev(const struct path *path)
{
	return !(path->mnt->mnt_flags & MNT_NODEV) &&
		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}

2959 2960
static int may_open(struct user_namespace *mnt_userns, const struct path *path,
		    int acc_mode, int flag)
L
Linus Torvalds 已提交
2961
{
2962
	struct dentry *dentry = path->dentry;
L
Linus Torvalds 已提交
2963 2964 2965 2966 2967 2968
	struct inode *inode = dentry->d_inode;
	int error;

	if (!inode)
		return -ENOENT;

C
Christoph Hellwig 已提交
2969 2970
	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
L
Linus Torvalds 已提交
2971
		return -ELOOP;
C
Christoph Hellwig 已提交
2972
	case S_IFDIR:
2973
		if (acc_mode & MAY_WRITE)
C
Christoph Hellwig 已提交
2974
			return -EISDIR;
2975 2976
		if (acc_mode & MAY_EXEC)
			return -EACCES;
C
Christoph Hellwig 已提交
2977 2978 2979
		break;
	case S_IFBLK:
	case S_IFCHR:
2980
		if (!may_open_dev(path))
L
Linus Torvalds 已提交
2981
			return -EACCES;
K
Kees Cook 已提交
2982
		fallthrough;
C
Christoph Hellwig 已提交
2983 2984
	case S_IFIFO:
	case S_IFSOCK:
K
Kees Cook 已提交
2985 2986
		if (acc_mode & MAY_EXEC)
			return -EACCES;
L
Linus Torvalds 已提交
2987
		flag &= ~O_TRUNC;
C
Christoph Hellwig 已提交
2988
		break;
2989 2990 2991 2992
	case S_IFREG:
		if ((acc_mode & MAY_EXEC) && path_noexec(path))
			return -EACCES;
		break;
2993
	}
2994

2995
	error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode);
2996 2997
	if (error)
		return error;
M
Mimi Zohar 已提交
2998

L
Linus Torvalds 已提交
2999 3000 3001 3002
	/*
	 * An append-only file must be opened in append mode for writing.
	 */
	if (IS_APPEND(inode)) {
3003
		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
3004
			return -EPERM;
L
Linus Torvalds 已提交
3005
		if (flag & O_TRUNC)
3006
			return -EPERM;
L
Linus Torvalds 已提交
3007 3008 3009
	}

	/* O_NOATIME can only be set by the owner or superuser */
3010
	if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode))
3011
		return -EPERM;
L
Linus Torvalds 已提交
3012

3013
	return 0;
3014
}
L
Linus Torvalds 已提交
3015

3016
static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
3017
{
A
Al Viro 已提交
3018
	const struct path *path = &filp->f_path;
3019 3020 3021 3022 3023 3024 3025
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;
	/*
	 * Refuse to truncate files with mandatory locks held on them.
	 */
3026
	error = locks_verify_locked(filp);
3027
	if (!error)
3028
		error = security_path_truncate(path);
3029
	if (!error) {
3030
		error = do_truncate(mnt_userns, path->dentry, 0,
3031
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
3032
				    filp);
3033 3034
	}
	put_write_access(inode);
M
Mimi Zohar 已提交
3035
	return error;
L
Linus Torvalds 已提交
3036 3037
}

3038 3039
/*
 * Convert open(2) flags into the form handed to ->atomic_open(): an access
 * mode of 3 (both O_ACCMODE bits set) is decremented to 2; every other flag
 * value is returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	if ((flag & O_ACCMODE) == 3)
		flag--;
	return flag;
}

3045 3046 3047
/*
 * Permission check for creating @dentry in directory @dir on behalf of an
 * O_CREAT open.
 *
 * Checks, in order: security_path_mknod(), that the caller's fsuid/fsgid have
 * a mapping (else -EOVERFLOW), write+exec permission on the directory inode,
 * and security_inode_create().
 *
 * Returns 0 if creation is allowed, a negative errno otherwise.
 */
static int may_o_create(struct user_namespace *mnt_userns,
			const struct path *dir, struct dentry *dentry,
			umode_t mode)
{
	int error = security_path_mknod(dir, dentry, mode, 0);
	if (error)
		return error;

	if (!fsuidgid_has_mapping(dir->dentry->d_sb, mnt_userns))
		return -EOVERFLOW;

	error = inode_permission(mnt_userns, dir->dentry->d_inode,
				 MAY_WRITE | MAY_EXEC);
	if (error)
		return error;

	return security_inode_create(dir->dentry->d_inode, dentry, mode);
}

3064 3065 3066 3067 3068 3069 3070
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Returns 0 if successful.  The file will have been created and attached to
 * @file by the filesystem calling finish_open().
 *
3071 3072 3073
 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
 * be set.  The caller will need to perform the open themselves.  @path will
 * have been updated to point to the new dentry.  This may be negative.
3074 3075 3076
 *
 * Returns an error code otherwise.
 */
3077 3078 3079
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
				  struct file *file,
				  int open_flag, umode_t mode)
M
Miklos Szeredi 已提交
3080
{
3081
	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
M
Miklos Szeredi 已提交
3082 3083 3084 3085 3086 3087
	struct inode *dir =  nd->path.dentry->d_inode;
	int error;

	if (nd->flags & LOOKUP_DIRECTORY)
		open_flag |= O_DIRECTORY;

A
Al Viro 已提交
3088 3089
	file->f_path.dentry = DENTRY_NOT_SET;
	file->f_path.mnt = nd->path.mnt;
3090
	error = dir->i_op->atomic_open(dir, dentry, file,
3091
				       open_to_namei_flags(open_flag), mode);
3092
	d_lookup_done(dentry);
3093
	if (!error) {
3094
		if (file->f_mode & FMODE_OPENED) {
3095 3096 3097 3098
			if (unlikely(dentry != file->f_path.dentry)) {
				dput(dentry);
				dentry = dget(file->f_path.dentry);
			}
3099
		} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
3100
			error = -EIO;
3101
		} else {
3102 3103 3104
			if (file->f_path.dentry) {
				dput(dentry);
				dentry = file->f_path.dentry;
3105
			}
3106
			if (unlikely(d_is_negative(dentry)))
A
Al Viro 已提交
3107
				error = -ENOENT;
3108
		}
M
Miklos Szeredi 已提交
3109
	}
3110 3111 3112 3113 3114
	if (error) {
		dput(dentry);
		dentry = ERR_PTR(error);
	}
	return dentry;
M
Miklos Szeredi 已提交
3115 3116
}

M
Miklos Szeredi 已提交
3117
/*
3118
 * Look up and maybe create and open the last component.
M
Miklos Szeredi 已提交
3119
 *
3120
 * Must be called with parent locked (exclusive in O_CREAT case).
3121
 *
3122 3123 3124 3125 3126 3127 3128
 * Returns 0 on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These case are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case dentry returned in @path might be negative if O_CREAT
 * hadn't been specified.
3129
 *
3130
 * An error code is returned on failure.
M
Miklos Szeredi 已提交
3131
 */
3132 3133 3134
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
				  const struct open_flags *op,
				  bool got_write)
M
Miklos Szeredi 已提交
3135
{
3136
	struct user_namespace *mnt_userns;
M
Miklos Szeredi 已提交
3137
	struct dentry *dir = nd->path.dentry;
3138
	struct inode *dir_inode = dir->d_inode;
3139
	int open_flag = op->open_flag;
M
Miklos Szeredi 已提交
3140
	struct dentry *dentry;
3141 3142
	int error, create_error = 0;
	umode_t mode = op->mode;
3143
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
M
Miklos Szeredi 已提交
3144

3145
	if (unlikely(IS_DEADDIR(dir_inode)))
3146
		return ERR_PTR(-ENOENT);
M
Miklos Szeredi 已提交
3147

3148
	file->f_mode &= ~FMODE_CREATED;
3149 3150 3151 3152 3153
	dentry = d_lookup(dir, &nd->last);
	for (;;) {
		if (!dentry) {
			dentry = d_alloc_parallel(dir, &nd->last, &wq);
			if (IS_ERR(dentry))
3154
				return dentry;
3155 3156 3157
		}
		if (d_in_lookup(dentry))
			break;
M
Miklos Szeredi 已提交
3158

3159 3160 3161 3162 3163 3164 3165 3166 3167 3168
		error = d_revalidate(dentry, nd->flags);
		if (likely(error > 0))
			break;
		if (error)
			goto out_dput;
		d_invalidate(dentry);
		dput(dentry);
		dentry = NULL;
	}
	if (dentry->d_inode) {
3169
		/* Cached positive dentry: will open in f_op->open */
3170
		return dentry;
3171
	}
M
Miklos Szeredi 已提交
3172

3173 3174 3175 3176 3177 3178 3179 3180 3181
	/*
	 * Checking write permission is tricky, bacuse we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists.  But checking existence breaks atomicity.  The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returing the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
	 */
3182 3183
	if (unlikely(!got_write))
		open_flag &= ~O_TRUNC;
3184
	mnt_userns = mnt_user_ns(nd->path.mnt);
3185
	if (open_flag & O_CREAT) {
3186 3187
		if (open_flag & O_EXCL)
			open_flag &= ~O_TRUNC;
3188 3189
		if (!IS_POSIXACL(dir->d_inode))
			mode &= ~current_umask();
3190
		if (likely(got_write))
3191
			create_error = may_o_create(mnt_userns, &nd->path,
3192
						    dentry, mode);
3193 3194
		else
			create_error = -EROFS;
M
Miklos Szeredi 已提交
3195
	}
3196 3197
	if (create_error)
		open_flag &= ~O_CREAT;
3198
	if (dir_inode->i_op->atomic_open) {
3199
		dentry = atomic_open(nd, dentry, file, open_flag, mode);
3200 3201 3202
		if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
			dentry = ERR_PTR(create_error);
		return dentry;
M
Miklos Szeredi 已提交
3203
	}
3204

3205
	if (d_in_lookup(dentry)) {
3206 3207
		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
							     nd->flags);
3208
		d_lookup_done(dentry);
3209 3210 3211 3212 3213 3214 3215 3216
		if (unlikely(res)) {
			if (IS_ERR(res)) {
				error = PTR_ERR(res);
				goto out_dput;
			}
			dput(dentry);
			dentry = res;
		}
3217 3218
	}

M
Miklos Szeredi 已提交
3219
	/* Negative dentry, just create the file */
3220
	if (!dentry->d_inode && (open_flag & O_CREAT)) {
3221
		file->f_mode |= FMODE_CREATED;
3222 3223 3224
		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
		if (!dir_inode->i_op->create) {
			error = -EACCES;
M
Miklos Szeredi 已提交
3225
			goto out_dput;
3226
		}
3227 3228 3229

		error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry,
						mode, open_flag & O_EXCL);
M
Miklos Szeredi 已提交
3230 3231 3232
		if (error)
			goto out_dput;
	}
3233 3234 3235
	if (unlikely(create_error) && !dentry->d_inode) {
		error = create_error;
		goto out_dput;
M
Miklos Szeredi 已提交
3236
	}
3237
	return dentry;
M
Miklos Szeredi 已提交
3238 3239 3240

out_dput:
	dput(dentry);
3241
	return ERR_PTR(error);
M
Miklos Szeredi 已提交
3242 3243
}

3244
static const char *open_last_lookups(struct nameidata *nd,
3245
		   struct file *file, const struct open_flags *op)
3246
{
3247
	struct dentry *dir = nd->path.dentry;
3248
	int open_flag = op->open_flag;
3249
	bool got_write = false;
3250
	unsigned seq;
3251
	struct inode *inode;
3252
	struct dentry *dentry;
3253
	const char *res;
3254

3255 3256
	nd->flags |= op->intent;

3257
	if (nd->last_type != LAST_NORM) {
3258 3259
		if (nd->depth)
			put_link(nd);
3260
		return handle_dots(nd, nd->last_type);
3261
	}
3262

3263
	if (!(open_flag & O_CREAT)) {
3264 3265 3266
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		/* we _can_ be in RCU mode here */
3267 3268
		dentry = lookup_fast(nd, &inode, &seq);
		if (IS_ERR(dentry))
3269
			return ERR_CAST(dentry);
3270
		if (likely(dentry))
3271 3272
			goto finish_lookup;

A
Al Viro 已提交
3273
		BUG_ON(nd->flags & LOOKUP_RCU);
3274 3275
	} else {
		/* create side of things */
3276
		if (nd->flags & LOOKUP_RCU) {
3277 3278
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
3279
		}
3280
		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
3281
		/* trailing slashes? */
3282
		if (unlikely(nd->last.name[nd->last.len]))
3283
			return ERR_PTR(-EISDIR);
3284
	}
A
Al Viro 已提交
3285

3286
	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
3287
		got_write = !mnt_want_write(nd->path.mnt);
3288 3289 3290 3291 3292 3293
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
3294 3295 3296 3297
	if (open_flag & O_CREAT)
		inode_lock(dir->d_inode);
	else
		inode_lock_shared(dir->d_inode);
3298
	dentry = lookup_open(nd, file, op, got_write);
3299 3300
	if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
		fsnotify_create(dir->d_inode, dentry);
3301 3302 3303 3304
	if (open_flag & O_CREAT)
		inode_unlock(dir->d_inode);
	else
		inode_unlock_shared(dir->d_inode);
3305

3306
	if (got_write)
3307
		mnt_drop_write(nd->path.mnt);
M
Miklos Szeredi 已提交
3308

3309 3310 3311
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);

3312
	if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
3313 3314
		dput(nd->path.dentry);
		nd->path.dentry = dentry;
3315
		return NULL;
3316 3317
	}

3318
finish_lookup:
3319 3320
	if (nd->depth)
		put_link(nd);
3321
	res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
3322
	if (unlikely(res))
3323
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
3324
	return res;
3325 3326 3327 3328 3329
}

/*
 * Handle the last step of open()
 */
3330
static int do_open(struct nameidata *nd,
3331 3332
		   struct file *file, const struct open_flags *op)
{
3333
	struct user_namespace *mnt_userns;
3334 3335 3336 3337 3338
	int open_flag = op->open_flag;
	bool do_truncate;
	int acc_mode;
	int error;

3339 3340 3341 3342 3343
	if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
		error = complete_walk(nd);
		if (error)
			return error;
	}
3344 3345
	if (!(file->f_mode & FMODE_CREATED))
		audit_inode(nd->name, nd->path.dentry, 0);
3346
	mnt_userns = mnt_user_ns(nd->path.mnt);
3347
	if (open_flag & O_CREAT) {
3348 3349
		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
			return -EEXIST;
3350
		if (d_is_dir(nd->path.dentry))
3351
			return -EISDIR;
3352
		error = may_create_in_sticky(mnt_userns, nd,
3353 3354
					     d_backing_inode(nd->path.dentry));
		if (unlikely(error))
3355
			return error;
3356
	}
M
Miklos Szeredi 已提交
3357
	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3358
		return -ENOTDIR;
3359

3360 3361
	do_truncate = false;
	acc_mode = op->acc_mode;
3362 3363 3364 3365
	if (file->f_mode & FMODE_CREATED) {
		/* Don't check for write permission, don't truncate */
		open_flag &= ~O_TRUNC;
		acc_mode = 0;
3366
	} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
3367 3368
		error = mnt_want_write(nd->path.mnt);
		if (error)
3369
			return error;
3370
		do_truncate = true;
3371
	}
3372
	error = may_open(mnt_userns, &nd->path, acc_mode, open_flag);
3373
	if (!error && !(file->f_mode & FMODE_OPENED))
A
Al Viro 已提交
3374
		error = vfs_open(&nd->path, file);
3375 3376 3377
	if (!error)
		error = ima_file_check(file, op->acc_mode);
	if (!error && do_truncate)
3378
		error = handle_truncate(mnt_userns, file);
3379 3380 3381 3382
	if (unlikely(error > 0)) {
		WARN_ON(1);
		error = -EINVAL;
	}
3383
	if (do_truncate)
3384
		mnt_drop_write(nd->path.mnt);
3385
	return error;
3386 3387
}

3388 3389 3390 3391 3392
/**
 * vfs_tmpfile - create tmpfile
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dentry:	pointer to dentry of the base directory
 * @mode:	mode of the new tmpfile
 * @open_flag:	flags
 *
 * Create a temporary file.
 *
 * Unless O_EXCL is set in @open_flag, I_LINKABLE is set in the new inode's
 * i_state.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass init_user_ns.
 *
 * Return: the dentry of the new tmpfile on success, or an ERR_PTR():
 * -EOPNOTSUPP if the directory has no ->tmpfile() method, -ENOMEM if the
 * dentry cannot be allocated, -ENOENT if ->tmpfile() succeeded without
 * attaching an inode, otherwise the error from inode_permission() or
 * ->tmpfile().
 */
struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
			   struct dentry *dentry, umode_t mode, int open_flag)
{
	struct dentry *child = NULL;
	struct inode *dir = dentry->d_inode;
	struct inode *inode;
	int error;

	/* we want directory to be writable */
	error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
	if (error)
		goto out_err;
	error = -EOPNOTSUPP;
	if (!dir->i_op->tmpfile)
		goto out_err;
	error = -ENOMEM;
	child = d_alloc(dentry, &slash_name);
	if (unlikely(!child))
		goto out_err;
	error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
	if (error)
		goto out_err;
	error = -ENOENT;
	inode = child->d_inode;
	if (unlikely(!inode))
		goto out_err;
	if (!(open_flag & O_EXCL)) {
		spin_lock(&inode->i_lock);
		inode->i_state |= I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
	ima_post_create_tmpfile(mnt_userns, inode);
	return child;

out_err:
	/* child may still be NULL here */
	dput(child);
	return ERR_PTR(error);
}
EXPORT_SYMBOL(vfs_tmpfile);

3443
static int do_tmpfile(struct nameidata *nd, unsigned flags,
3444
		const struct open_flags *op,
3445
		struct file *file)
3446
{
3447
	struct user_namespace *mnt_userns;
3448 3449
	struct dentry *child;
	struct path path;
3450
	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
3451 3452
	if (unlikely(error))
		return error;
3453
	error = mnt_want_write(path.mnt);
3454 3455
	if (unlikely(error))
		goto out;
3456 3457
	mnt_userns = mnt_user_ns(path.mnt);
	child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
3458
	error = PTR_ERR(child);
3459
	if (IS_ERR(child))
3460
		goto out2;
3461 3462
	dput(path.dentry);
	path.dentry = child;
3463
	audit_inode(nd->name, child, 0);
3464
	/* Don't check for other permissions, the inode was just created */
3465
	error = may_open(mnt_userns, &path, 0, op->open_flag);
3466 3467
	if (!error)
		error = vfs_open(&path, file);
3468
out2:
3469
	mnt_drop_write(path.mnt);
3470
out:
3471
	path_put(&path);
3472 3473 3474
	return error;
}

3475 3476 3477 3478 3479 3480
/*
 * Open path used by path_openat() when O_PATH is set: look the path up with
 * path_lookupat() and, on success, open it into @file with vfs_open().  The
 * path reference is dropped before returning.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
	struct path path;
	int error = path_lookupat(nd, flags, &path);
	if (!error) {
		audit_inode(nd->name, path.dentry, 0);
		error = vfs_open(&path, file);
		path_put(&path);
	}
	return error;
}

3487 3488
static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
L
Linus Torvalds 已提交
3489
{
A
Al Viro 已提交
3490
	struct file *file;
3491
	int error;
N
Nick Piggin 已提交
3492

3493
	file = alloc_empty_file(op->open_flag, current_cred());
3494 3495
	if (IS_ERR(file))
		return file;
N
Nick Piggin 已提交
3496

A
Al Viro 已提交
3497
	if (unlikely(file->f_flags & __O_TMPFILE)) {
3498
		error = do_tmpfile(nd, flags, op, file);
3499
	} else if (unlikely(file->f_flags & O_PATH)) {
3500
		error = do_o_path(nd, flags, file);
3501 3502 3503
	} else {
		const char *s = path_init(nd, flags);
		while (!(error = link_path_walk(s, nd)) &&
3504
		       (s = open_last_lookups(nd, file, op)) != NULL)
3505
			;
3506 3507
		if (!error)
			error = do_open(nd, file, op);
3508
		terminate_walk(nd);
3509
	}
3510
	if (likely(!error)) {
3511
		if (likely(file->f_mode & FMODE_OPENED))
3512 3513 3514
			return file;
		WARN_ON(1);
		error = -EINVAL;
3515
	}
3516 3517 3518 3519 3520 3521
	fput(file);
	if (error == -EOPENSTALE) {
		if (flags & LOOKUP_RCU)
			error = -ECHILD;
		else
			error = -ESTALE;
3522
	}
3523
	return ERR_PTR(error);
L
Linus Torvalds 已提交
3524 3525
}

3526
struct file *do_filp_open(int dfd, struct filename *pathname,
3527
		const struct open_flags *op)
3528
{
3529
	struct nameidata nd;
3530
	int flags = op->lookup_flags;
3531 3532
	struct file *filp;

3533
	set_nameidata(&nd, dfd, pathname, NULL);
3534
	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
3535
	if (unlikely(filp == ERR_PTR(-ECHILD)))
3536
		filp = path_openat(&nd, op, flags);
3537
	if (unlikely(filp == ERR_PTR(-ESTALE)))
3538
		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
3539
	restore_nameidata();
3540 3541 3542
	return filp;
}

A
Al Viro 已提交
3543
struct file *do_file_open_root(const struct path *root,
3544
		const char *name, const struct open_flags *op)
A
Al Viro 已提交
3545
{
3546
	struct nameidata nd;
A
Al Viro 已提交
3547
	struct file *file;
3548
	struct filename *filename;
3549
	int flags = op->lookup_flags;
A
Al Viro 已提交
3550

A
Al Viro 已提交
3551
	if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN)
A
Al Viro 已提交
3552 3553
		return ERR_PTR(-ELOOP);

3554
	filename = getname_kernel(name);
3555
	if (IS_ERR(filename))
3556 3557
		return ERR_CAST(filename);

3558
	set_nameidata(&nd, -1, filename, root);
3559
	file = path_openat(&nd, op, flags | LOOKUP_RCU);
A
Al Viro 已提交
3560
	if (unlikely(file == ERR_PTR(-ECHILD)))
3561
		file = path_openat(&nd, op, flags);
A
Al Viro 已提交
3562
	if (unlikely(file == ERR_PTR(-ESTALE)))
3563
		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
3564
	restore_nameidata();
3565
	putname(filename);
A
Al Viro 已提交
3566 3567 3568
	return file;
}

3569
static struct dentry *filename_create(int dfd, struct filename *name,
3570
				struct path *path, unsigned int lookup_flags)
L
Linus Torvalds 已提交
3571
{
3572
	struct dentry *dentry = ERR_PTR(-EEXIST);
3573 3574
	struct qstr last;
	int type;
3575
	int err2;
3576 3577 3578 3579 3580 3581 3582 3583 3584
	int error;
	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);

	/*
	 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
	 * other flags passed in are ignored!
	 */
	lookup_flags &= LOOKUP_REVAL;

3585 3586 3587
	name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
	if (IS_ERR(name))
		return ERR_CAST(name);
L
Linus Torvalds 已提交
3588

3589 3590 3591 3592
	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
3593
	if (unlikely(type != LAST_NORM))
A
Al Viro 已提交
3594
		goto out;
3595

3596
	/* don't fail immediately if it's r/o, at least try to report other errors */
3597
	err2 = mnt_want_write(path->mnt);
3598 3599 3600
	/*
	 * Do the final lookup.
	 */
3601
	lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
A
Al Viro 已提交
3602
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
3603
	dentry = __lookup_hash(&last, path->dentry, lookup_flags);
L
Linus Torvalds 已提交
3604
	if (IS_ERR(dentry))
3605
		goto unlock;
3606

3607
	error = -EEXIST;
3608
	if (d_is_positive(dentry))
3609
		goto fail;
3610

3611 3612 3613 3614 3615 3616
	/*
	 * Special case - lookup gave negative, but... we had foo/bar/
	 * From the vfs_mknod() POV we just have a negative dentry -
	 * all is fine. Let's be bastards - you had / on the end, you've
	 * been asking for (non-existent) directory. -ENOENT for you.
	 */
3617
	if (unlikely(!is_dir && last.name[last.len])) {
3618
		error = -ENOENT;
A
Al Viro 已提交
3619
		goto fail;
3620
	}
3621 3622
	if (unlikely(err2)) {
		error = err2;
3623
		goto fail;
3624
	}
3625
	putname(name);
L
Linus Torvalds 已提交
3626 3627
	return dentry;
fail:
3628 3629 3630
	dput(dentry);
	dentry = ERR_PTR(error);
unlock:
A
Al Viro 已提交
3631
	inode_unlock(path->dentry->d_inode);
3632
	if (!err2)
3633
		mnt_drop_write(path->mnt);
A
Al Viro 已提交
3634
out:
3635
	path_put(path);
3636
	putname(name);
L
Linus Torvalds 已提交
3637 3638
	return dentry;
}
3639 3640 3641 3642

/*
 * filename_create() for a kernel-space pathname: @pathname is converted with
 * getname_kernel() and the result of filename_create() is returned unchanged.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	return filename_create(dfd, getname_kernel(pathname),
				path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);

A
Al Viro 已提交
3648 3649 3650
void done_path_create(struct path *path, struct dentry *dentry)
{
	dput(dentry);
A
Al Viro 已提交
3651
	inode_unlock(path->dentry->d_inode);
3652
	mnt_drop_write(path->mnt);
A
Al Viro 已提交
3653 3654 3655 3656
	path_put(path);
}
EXPORT_SYMBOL(done_path_create);

A
Al Viro 已提交
3657
/*
 * filename_create() for a user-space pathname: @pathname is fetched with
 * getname() and the result of filename_create() is returned unchanged.
 */
inline struct dentry *user_path_create(int dfd, const char __user *pathname,
				struct path *path, unsigned int lookup_flags)
{
	return filename_create(dfd, getname(pathname), path, lookup_flags);
}
EXPORT_SYMBOL(user_path_create);

3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681
/**
 * vfs_mknod - create device node or file
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dir:	inode of the directory in which the node is created
 * @dentry:	dentry of the node to create; must not have an inode yet
 * @mode:	mode of the new device node or file
 * @dev:	device number of device to create
 *
 * Create a device node or file.
 *
 * Creating a character or block device requires CAP_MKNOD, except for a
 * whiteout (character device with device number WHITEOUT_DEV).
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass init_user_ns.
 *
 * Return: 0 on success, -EPERM if the capability is missing or the directory
 * has no ->mknod() method, otherwise the error from may_create(),
 * devcgroup_inode_mknod(), security_inode_mknod() or ->mknod().
 */
int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
	      struct dentry *dentry, umode_t mode, dev_t dev)
{
	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
	int error = may_create(mnt_userns, dir, dentry);

	if (error)
		return error;

	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
	    !capable(CAP_MKNOD))
		return -EPERM;

	if (!dir->i_op->mknod)
		return -EPERM;

	error = devcgroup_inode_mknod(mode, dev);
	if (error)
		return error;

	error = security_inode_mknod(dir, dentry, mode, dev);
	if (error)
		return error;

	error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_mknod);
L
Linus Torvalds 已提交
3710

A
Al Viro 已提交
3711
/*
 * Validate the file type bits of @mode for mknod().
 *
 * Returns 0 for regular files, character/block devices, FIFOs, sockets and
 * a zero type, -EPERM for directories and -EINVAL for any other type.
 */
static int may_mknod(umode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
	case 0: /* zero mode translates to S_IFREG */
		return 0;
	case S_IFDIR:
		return -EPERM;
	default:
		return -EINVAL;
	}
}

3728
/*
 * Common implementation of mknod(2) and mknodat(2).
 *
 * Validates the requested type with may_mknod(), obtains the parent path and
 * the new dentry through user_path_create(), and then creates either a
 * regular file (vfs_create()) or a special node (vfs_mknod()).  Whether the
 * whole sequence is repeated with LOOKUP_REVAL is decided by retry_estale().
 */
static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
		unsigned int dev)
{
	unsigned int lookup_flags = 0;
	struct user_namespace *mnt_userns;
	struct inode *parent;
	struct dentry *dentry;
	struct path path;
	int error;

	error = may_mknod(mode);
	if (error)
		return error;

retry:
	dentry = user_path_create(dfd, filename, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	parent = path.dentry->d_inode;

	/* The umask is only applied here when the parent is not IS_POSIXACL(). */
	if (!IS_POSIXACL(parent))
		mode &= ~current_umask();

	error = security_path_mknod(&path, dentry, mode, dev);
	if (!error) {
		mnt_userns = mnt_user_ns(path.mnt);

		switch (mode & S_IFMT) {
		case 0:
		case S_IFREG:
			error = vfs_create(mnt_userns, parent, dentry, mode,
					   true);
			if (!error)
				ima_post_path_mknod(mnt_userns, dentry);
			break;
		case S_IFCHR:
		case S_IFBLK:
			/* Device nodes carry the decoded device number. */
			error = vfs_mknod(mnt_userns, parent, dentry, mode,
					  new_decode_dev(dev));
			break;
		case S_IFIFO:
		case S_IFSOCK:
			/* FIFOs and sockets have no device number. */
			error = vfs_mknod(mnt_userns, parent, dentry, mode, 0);
			break;
		}
	}

	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

3777 3778 3779 3780 3781 3782
/* mknodat(2) entry point: forwards all four arguments unchanged to do_mknodat(). */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned int, dev)
{
	return do_mknodat(dfd, filename, mode, dev);
}

A
Al Viro 已提交
3783
/* mknod(2) entry point: do_mknodat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode,
		unsigned int, dev)
{
	return do_mknodat(AT_FDCWD, filename, mode, dev);
}

3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804
/**
 * vfs_mkdir - create directory
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dir:	inode of the parent directory
 * @dentry:	dentry of the directory to create
 * @mode:	mode of the new directory
 *
 * Create a directory.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass init_user_ns.
 */
int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
	      struct dentry *dentry, umode_t mode)
{
	unsigned int link_limit;
	int err;

	err = may_create(mnt_userns, dir, dentry);
	if (err)
		return err;

	if (!dir->i_op->mkdir)
		return -EPERM;

	/* Only the permission bits and the sticky bit are passed on. */
	mode &= (S_IRWXUGO|S_ISVTX);
	err = security_inode_mkdir(dir, dentry, mode);
	if (err)
		return err;

	/* A zero s_max_links means the superblock sets no link limit. */
	link_limit = dir->i_sb->s_max_links;
	if (link_limit && dir->i_nlink >= link_limit)
		return -EMLINK;

	err = dir->i_op->mkdir(mnt_userns, dir, dentry, mode);
	if (err)
		return err;

	fsnotify_mkdir(dir, dentry);
	return 0;
}
EXPORT_SYMBOL(vfs_mkdir);
L
Linus Torvalds 已提交
3829

3830
/*
 * Common implementation of mkdir(2) and mkdirat(2).
 *
 * Each pass looks up the parent and the new dentry with user_path_create(),
 * runs the path-based security hook and then calls vfs_mkdir().  The pass is
 * repeated with LOOKUP_REVAL added for as long as retry_estale() asks for it.
 */
static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
{
	unsigned int lookup_flags = LOOKUP_DIRECTORY;
	struct dentry *dentry;
	struct path path;
	int error;

	for (;;) {
		dentry = user_path_create(dfd, pathname, &path, lookup_flags);
		if (IS_ERR(dentry))
			return PTR_ERR(dentry);

		/* The umask is only applied here when the parent is not IS_POSIXACL(). */
		if (!IS_POSIXACL(path.dentry->d_inode))
			mode &= ~current_umask();

		error = security_path_mkdir(&path, dentry, mode);
		if (!error)
			error = vfs_mkdir(mnt_user_ns(path.mnt),
					  path.dentry->d_inode, dentry, mode);

		done_path_create(&path, dentry);
		if (!retry_estale(error, lookup_flags))
			return error;

		lookup_flags |= LOOKUP_REVAL;
	}
}

3859 3860 3861 3862 3863
/* mkdirat(2) entry point: forwards its arguments unchanged to do_mkdirat(). */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(dfd, pathname, mode);
}

3864
/* mkdir(2) entry point: do_mkdirat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(AT_FDCWD, pathname, mode);
}

3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884
/**
 * vfs_rmdir - remove directory
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dir:	inode of @dentry
 * @dentry:	pointer to dentry of the base directory
 *
 * Remove a directory.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
 */
int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
		     struct dentry *dentry)
L
Linus Torvalds 已提交
3885
{
3886
	int error = may_delete(mnt_userns, dir, dentry, 1);
L
Linus Torvalds 已提交
3887 3888 3889 3890

	if (error)
		return error;

A
Al Viro 已提交
3891
	if (!dir->i_op->rmdir)
L
Linus Torvalds 已提交
3892 3893
		return -EPERM;

3894
	dget(dentry);
A
Al Viro 已提交
3895
	inode_lock(dentry->d_inode);
S
Sage Weil 已提交
3896 3897

	error = -EBUSY;
3898
	if (is_local_mountpoint(dentry))
S
Sage Weil 已提交
3899 3900 3901 3902 3903 3904 3905 3906 3907 3908
		goto out;

	error = security_inode_rmdir(dir, dentry);
	if (error)
		goto out;

	error = dir->i_op->rmdir(dir, dentry);
	if (error)
		goto out;

3909
	shrink_dcache_parent(dentry);
S
Sage Weil 已提交
3910 3911
	dentry->d_inode->i_flags |= S_DEAD;
	dont_mount(dentry);
3912
	detach_mounts(dentry);
3913
	fsnotify_rmdir(dir, dentry);
S
Sage Weil 已提交
3914 3915

out:
A
Al Viro 已提交
3916
	inode_unlock(dentry->d_inode);
3917
	dput(dentry);
S
Sage Weil 已提交
3918
	if (!error)
L
Linus Torvalds 已提交
3919 3920 3921
		d_delete(dentry);
	return error;
}
3922
EXPORT_SYMBOL(vfs_rmdir);
L
Linus Torvalds 已提交
3923

3924
long do_rmdir(int dfd, struct filename *name)
L
Linus Torvalds 已提交
3925
{
3926
	struct user_namespace *mnt_userns;
L
Linus Torvalds 已提交
3927 3928
	int error = 0;
	struct dentry *dentry;
3929 3930 3931
	struct path path;
	struct qstr last;
	int type;
3932 3933
	unsigned int lookup_flags = 0;
retry:
3934
	name = filename_parentat(dfd, name, lookup_flags,
A
Al Viro 已提交
3935
				&path, &last, &type);
3936 3937
	if (IS_ERR(name))
		return PTR_ERR(name);
L
Linus Torvalds 已提交
3938

3939
	switch (type) {
3940 3941 3942 3943 3944 3945 3946 3947 3948
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto exit1;
	case LAST_DOT:
		error = -EINVAL;
		goto exit1;
	case LAST_ROOT:
		error = -EBUSY;
		goto exit1;
L
Linus Torvalds 已提交
3949
	}
3950

3951
	error = mnt_want_write(path.mnt);
3952 3953
	if (error)
		goto exit1;
3954

A
Al Viro 已提交
3955
	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
3956
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
L
Linus Torvalds 已提交
3957
	error = PTR_ERR(dentry);
3958 3959
	if (IS_ERR(dentry))
		goto exit2;
3960 3961 3962 3963
	if (!dentry->d_inode) {
		error = -ENOENT;
		goto exit3;
	}
3964
	error = security_path_rmdir(&path, dentry);
3965
	if (error)
3966
		goto exit3;
3967 3968
	mnt_userns = mnt_user_ns(path.mnt);
	error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
3969
exit3:
3970 3971
	dput(dentry);
exit2:
A
Al Viro 已提交
3972
	inode_unlock(path.dentry->d_inode);
3973
	mnt_drop_write(path.mnt);
L
Linus Torvalds 已提交
3974
exit1:
3975
	path_put(&path);
3976 3977 3978 3979
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
A
Al Viro 已提交
3980
	putname(name);
L
Linus Torvalds 已提交
3981 3982 3983
	return error;
}

3984
/* rmdir(2) entry point: do_rmdir() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
	struct filename *name = getname(pathname);

	return do_rmdir(AT_FDCWD, name);
}

3989 3990
/**
 * vfs_unlink - unlink a filesystem object
3991
 * @mnt_userns:	user namespace of the mount the inode was found from
3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006
 * @dir:	parent directory
 * @dentry:	victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
4007 4008 4009 4010 4011 4012
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
4013
 */
4014 4015
int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
	       struct dentry *dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
4016
{
J
J. Bruce Fields 已提交
4017
	struct inode *target = dentry->d_inode;
4018
	int error = may_delete(mnt_userns, dir, dentry, 0);
L
Linus Torvalds 已提交
4019 4020 4021 4022

	if (error)
		return error;

A
Al Viro 已提交
4023
	if (!dir->i_op->unlink)
L
Linus Torvalds 已提交
4024 4025
		return -EPERM;

A
Al Viro 已提交
4026
	inode_lock(target);
4027
	if (is_local_mountpoint(dentry))
L
Linus Torvalds 已提交
4028 4029 4030
		error = -EBUSY;
	else {
		error = security_inode_unlink(dir, dentry);
4031
		if (!error) {
4032 4033
			error = try_break_deleg(target, delegated_inode);
			if (error)
4034
				goto out;
L
Linus Torvalds 已提交
4035
			error = dir->i_op->unlink(dir, dentry);
4036
			if (!error) {
4037
				dont_mount(dentry);
4038
				detach_mounts(dentry);
4039
				fsnotify_unlink(dir, dentry);
4040
			}
4041
		}
L
Linus Torvalds 已提交
4042
	}
4043
out:
A
Al Viro 已提交
4044
	inode_unlock(target);
L
Linus Torvalds 已提交
4045 4046 4047

	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
J
J. Bruce Fields 已提交
4048
		fsnotify_link_count(target);
J
John McCutchan 已提交
4049
		d_delete(dentry);
L
Linus Torvalds 已提交
4050
	}
R
Robert Love 已提交
4051

L
Linus Torvalds 已提交
4052 4053
	return error;
}
4054
EXPORT_SYMBOL(vfs_unlink);
L
Linus Torvalds 已提交
4055 4056 4057

/*
 * Make sure that the actual truncation of the file will occur outside its
4058
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
L
Linus Torvalds 已提交
4059 4060 4061
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 */
4062
long do_unlinkat(int dfd, struct filename *name)
L
Linus Torvalds 已提交
4063
{
4064
	int error;
L
Linus Torvalds 已提交
4065
	struct dentry *dentry;
4066 4067 4068
	struct path path;
	struct qstr last;
	int type;
L
Linus Torvalds 已提交
4069
	struct inode *inode = NULL;
4070
	struct inode *delegated_inode = NULL;
4071 4072
	unsigned int lookup_flags = 0;
retry:
4073
	name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
4074 4075
	if (IS_ERR(name))
		return PTR_ERR(name);
4076

L
Linus Torvalds 已提交
4077
	error = -EISDIR;
4078
	if (type != LAST_NORM)
L
Linus Torvalds 已提交
4079
		goto exit1;
4080

4081
	error = mnt_want_write(path.mnt);
4082 4083
	if (error)
		goto exit1;
4084
retry_deleg:
A
Al Viro 已提交
4085
	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
4086
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
L
Linus Torvalds 已提交
4087 4088
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
4089 4090
		struct user_namespace *mnt_userns;

L
Linus Torvalds 已提交
4091
		/* Why not before? Because we want correct error value */
4092
		if (last.name[last.len])
4093
			goto slashes;
L
Linus Torvalds 已提交
4094
		inode = dentry->d_inode;
4095
		if (d_is_negative(dentry))
4096 4097
			goto slashes;
		ihold(inode);
4098
		error = security_path_unlink(&path, dentry);
4099
		if (error)
4100
			goto exit2;
4101
		mnt_userns = mnt_user_ns(path.mnt);
4102 4103
		error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
				   &delegated_inode);
4104
exit2:
L
Linus Torvalds 已提交
4105 4106
		dput(dentry);
	}
A
Al Viro 已提交
4107
	inode_unlock(path.dentry->d_inode);
L
Linus Torvalds 已提交
4108 4109
	if (inode)
		iput(inode);	/* truncate the inode here */
4110 4111
	inode = NULL;
	if (delegated_inode) {
4112
		error = break_deleg_wait(&delegated_inode);
4113 4114 4115
		if (!error)
			goto retry_deleg;
	}
4116
	mnt_drop_write(path.mnt);
L
Linus Torvalds 已提交
4117
exit1:
4118
	path_put(&path);
4119 4120 4121 4122 4123
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		inode = NULL;
		goto retry;
	}
4124
	putname(name);
L
Linus Torvalds 已提交
4125 4126 4127
	return error;

slashes:
4128 4129
	if (d_is_negative(dentry))
		error = -ENOENT;
M
Miklos Szeredi 已提交
4130
	else if (d_is_dir(dentry))
4131 4132 4133
		error = -EISDIR;
	else
		error = -ENOTDIR;
L
Linus Torvalds 已提交
4134 4135 4136
	goto exit2;
}

4137
/*
 * unlinkat(2) entry point.  AT_REMOVEDIR is the only accepted flag; when it
 * is set the request goes to do_rmdir(), otherwise to do_unlinkat().
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
	if (flag & ~AT_REMOVEDIR)
		return -EINVAL;

	return (flag & AT_REMOVEDIR) ? do_rmdir(dfd, getname(pathname))
				     : do_unlinkat(dfd, getname(pathname));
}

4147
/* unlink(2) entry point: do_unlinkat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
	struct filename *name = getname(pathname);

	return do_unlinkat(AT_FDCWD, name);
}

4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168
/**
 * vfs_symlink - create symlink
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dir:	inode of @dentry
 * @dentry:	pointer to dentry of the base directory
 * @oldname:	name of the file to link to
 *
 * Create a symlink.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
 */
int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
		struct dentry *dentry, const char *oldname)
L
Linus Torvalds 已提交
4169
{
4170
	int error = may_create(mnt_userns, dir, dentry);
L
Linus Torvalds 已提交
4171 4172 4173 4174

	if (error)
		return error;

A
Al Viro 已提交
4175
	if (!dir->i_op->symlink)
L
Linus Torvalds 已提交
4176 4177 4178 4179 4180 4181
		return -EPERM;

	error = security_inode_symlink(dir, dentry, oldname);
	if (error)
		return error;

4182
	error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname);
4183
	if (!error)
4184
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
4185 4186
	return error;
}
4187
EXPORT_SYMBOL(vfs_symlink);
L
Linus Torvalds 已提交
4188

4189
/*
 * Common implementation of symlink(2) and symlinkat(2).
 *
 * The link target string is fetched once with getname() and released with
 * putname() on every exit after that.  The create sequence is repeated with
 * LOOKUP_REVAL added for as long as retry_estale() asks for it.
 */
static long do_symlinkat(const char __user *oldname, int newdfd,
		  const char __user *newname)
{
	unsigned int lookup_flags = 0;
	struct filename *target;
	struct dentry *dentry;
	struct path path;
	int error;

	target = getname(oldname);
	if (IS_ERR(target))
		return PTR_ERR(target);

	for (;;) {
		dentry = user_path_create(newdfd, newname, &path, lookup_flags);
		if (IS_ERR(dentry)) {
			error = PTR_ERR(dentry);
			break;
		}

		error = security_path_symlink(&path, dentry, target->name);
		if (!error)
			error = vfs_symlink(mnt_user_ns(path.mnt),
					    path.dentry->d_inode, dentry,
					    target->name);

		done_path_create(&path, dentry);
		if (!retry_estale(error, lookup_flags))
			break;

		lookup_flags |= LOOKUP_REVAL;
	}

	putname(target);
	return error;
}

4225 4226 4227 4228 4229 4230
/* symlinkat(2) entry point: forwards its arguments unchanged to do_symlinkat(). */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	return do_symlinkat(oldname, newdfd, newname);
}

4231
/* symlink(2) entry point: do_symlinkat() with AT_FDCWD as the new directory fd. */
SYSCALL_DEFINE2(symlink, const char __user *, oldname,
		const char __user *, newname)
{
	return do_symlinkat(oldname, AT_FDCWD, newname);
}

J
J. Bruce Fields 已提交
4236 4237 4238
/**
 * vfs_link - create a new link
 * @old_dentry:	object to be linked
4239
 * @mnt_userns:	the user namespace of the mount
J
J. Bruce Fields 已提交
4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254
 * @dir:	new parent
 * @new_dentry:	where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
4255 4256 4257 4258 4259 4260
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
J
J. Bruce Fields 已提交
4261
 */
4262 4263 4264
int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns,
	     struct inode *dir, struct dentry *new_dentry,
	     struct inode **delegated_inode)
L
Linus Torvalds 已提交
4265 4266
{
	struct inode *inode = old_dentry->d_inode;
4267
	unsigned max_links = dir->i_sb->s_max_links;
L
Linus Torvalds 已提交
4268 4269 4270 4271 4272
	int error;

	if (!inode)
		return -ENOENT;

4273
	error = may_create(mnt_userns, dir, new_dentry);
L
Linus Torvalds 已提交
4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284
	if (error)
		return error;

	if (dir->i_sb != inode->i_sb)
		return -EXDEV;

	/*
	 * A link to an append-only or immutable file cannot be created.
	 */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return -EPERM;
4285 4286 4287 4288 4289
	/*
	 * Updating the link count will likely cause i_uid and i_gid to
	 * be writen back improperly if their true value is unknown to
	 * the vfs.
	 */
4290
	if (HAS_UNMAPPED_ID(mnt_userns, inode))
4291
		return -EPERM;
A
Al Viro 已提交
4292
	if (!dir->i_op->link)
L
Linus Torvalds 已提交
4293
		return -EPERM;
4294
	if (S_ISDIR(inode->i_mode))
L
Linus Torvalds 已提交
4295 4296 4297 4298 4299 4300
		return -EPERM;

	error = security_inode_link(old_dentry, dir, new_dentry);
	if (error)
		return error;

A
Al Viro 已提交
4301
	inode_lock(inode);
4302
	/* Make sure we don't allow creating hardlink to an unlinked file */
4303
	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
4304
		error =  -ENOENT;
4305 4306
	else if (max_links && inode->i_nlink >= max_links)
		error = -EMLINK;
J
J. Bruce Fields 已提交
4307 4308 4309 4310 4311
	else {
		error = try_break_deleg(inode, delegated_inode);
		if (!error)
			error = dir->i_op->link(old_dentry, dir, new_dentry);
	}
4312 4313 4314 4315 4316 4317

	if (!error && (inode->i_state & I_LINKABLE)) {
		spin_lock(&inode->i_lock);
		inode->i_state &= ~I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
A
Al Viro 已提交
4318
	inode_unlock(inode);
4319
	if (!error)
4320
		fsnotify_link(dir, inode, new_dentry);
L
Linus Torvalds 已提交
4321 4322
	return error;
}
4323
EXPORT_SYMBOL(vfs_link);
L
Linus Torvalds 已提交
4324 4325 4326 4327 4328 4329 4330 4331 4332 4333

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 */
4334
static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
4335
	      const char __user *newname, int flags)
L
Linus Torvalds 已提交
4336
{
4337
	struct user_namespace *mnt_userns;
L
Linus Torvalds 已提交
4338
	struct dentry *new_dentry;
4339
	struct path old_path, new_path;
J
J. Bruce Fields 已提交
4340
	struct inode *delegated_inode = NULL;
4341
	int how = 0;
L
Linus Torvalds 已提交
4342 4343
	int error;

4344
	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
4345
		return -EINVAL;
4346
	/*
4347 4348 4349
	 * To use null names we require CAP_DAC_READ_SEARCH
	 * This ensures that not everyone will be able to create
	 * handlink using the passed filedescriptor.
4350
	 */
4351 4352 4353
	if (flags & AT_EMPTY_PATH) {
		if (!capable(CAP_DAC_READ_SEARCH))
			return -ENOENT;
4354
		how = LOOKUP_EMPTY;
4355
	}
4356 4357 4358

	if (flags & AT_SYMLINK_FOLLOW)
		how |= LOOKUP_FOLLOW;
4359
retry:
4360
	error = user_path_at(olddfd, oldname, how, &old_path);
L
Linus Torvalds 已提交
4361
	if (error)
4362 4363
		return error;

4364 4365
	new_dentry = user_path_create(newdfd, newname, &new_path,
					(how & LOOKUP_REVAL));
L
Linus Torvalds 已提交
4366
	error = PTR_ERR(new_dentry);
4367
	if (IS_ERR(new_dentry))
4368 4369 4370 4371 4372
		goto out;

	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto out_dput;
4373 4374
	mnt_userns = mnt_user_ns(new_path.mnt);
	error = may_linkat(mnt_userns, &old_path);
K
Kees Cook 已提交
4375 4376
	if (unlikely(error))
		goto out_dput;
4377
	error = security_path_link(old_path.dentry, &new_path, new_dentry);
4378
	if (error)
4379
		goto out_dput;
4380 4381
	error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
			 new_dentry, &delegated_inode);
4382
out_dput:
A
Al Viro 已提交
4383
	done_path_create(&new_path, new_dentry);
J
J. Bruce Fields 已提交
4384 4385
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
4386 4387
		if (!error) {
			path_put(&old_path);
J
J. Bruce Fields 已提交
4388
			goto retry;
4389
		}
J
J. Bruce Fields 已提交
4390
	}
4391
	if (retry_estale(error, how)) {
4392
		path_put(&old_path);
4393 4394 4395
		how |= LOOKUP_REVAL;
		goto retry;
	}
L
Linus Torvalds 已提交
4396
out:
4397
	path_put(&old_path);
L
Linus Torvalds 已提交
4398 4399 4400 4401

	return error;
}

4402 4403 4404 4405 4406 4407
/* linkat(2) entry point: forwards its arguments unchanged to do_linkat(). */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, int, flags)
{
	return do_linkat(olddfd, oldname, newdfd, newname, flags);
}

4408
/* link(2) entry point: do_linkat() with AT_FDCWD for both directory fds and no flags. */
SYSCALL_DEFINE2(link, const char __user *, oldname,
		const char __user *, newname)
{
	return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}

4413 4414
/**
 * vfs_rename - rename a filesystem object
4415
 * @rd:		pointer to &struct renamedata info
4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429
 *
 * The caller must hold multiple mutexes--see lock_rename()).
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
L
Linus Torvalds 已提交
4430 4431 4432
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
4433
 *
4434
 *	a) we can get into loop creation.
L
Linus Torvalds 已提交
4435 4436
 *	b) race potential - two innocent renames can create a loop together.
 *	   That's where 4.4 screws up. Current fix: serialization on
4437
 *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
L
Linus Torvalds 已提交
4438
 *	   story.
4439 4440
 *	c) we have to lock _four_ objects - parents and victim (if it exists),
 *	   and source (if it is not a directory).
4441
 *	   And that - after we got ->i_mutex on parents (until then we don't know
L
Linus Torvalds 已提交
4442 4443
 *	   whether the target exists).  Solution: try to be smart with locking
 *	   order for inodes.  We rely on the fact that tree topology may change
4444
 *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
L
Linus Torvalds 已提交
4445 4446 4447
 *	   move will be locked.  Thus we can rank directories by the tree
 *	   (ancestors first) and rank all non-directories after them.
 *	   That works since everybody except rename does "lock parent, lookup,
4448
 *	   lock child" and rename is under ->s_vfs_rename_mutex.
L
Linus Torvalds 已提交
4449 4450 4451
 *	   HOWEVER, it relies on the assumption that any object with ->lookup()
 *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *	   we'd better make sure that there's no link(2) for them.
4452
 *	d) conversion from fhandle to dentry may come in the wrong moment - when
4453
 *	   we are removing the target. Solution: we will have to grab ->i_mutex
L
Linus Torvalds 已提交
4454
 *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4455
 *	   ->i_mutex on parents, which works but leads to some truly excessive
L
Linus Torvalds 已提交
4456 4457
 *	   locking].
 */
4458
int vfs_rename(struct renamedata *rd)
L
Linus Torvalds 已提交
4459
{
4460
	int error;
4461 4462 4463 4464 4465
	struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
	struct dentry *old_dentry = rd->old_dentry;
	struct dentry *new_dentry = rd->new_dentry;
	struct inode **delegated_inode = rd->delegated_inode;
	unsigned int flags = rd->flags;
4466 4467
	bool is_dir = d_is_dir(old_dentry);
	struct inode *source = old_dentry->d_inode;
S
Sage Weil 已提交
4468
	struct inode *target = new_dentry->d_inode;
M
Miklos Szeredi 已提交
4469 4470
	bool new_is_dir = false;
	unsigned max_links = new_dir->i_sb->s_max_links;
A
Al Viro 已提交
4471
	struct name_snapshot old_name;
4472

4473
	if (source == target)
4474 4475
		return 0;

4476
	error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir);
4477 4478 4479
	if (error)
		return error;

M
Miklos Szeredi 已提交
4480
	if (!target) {
4481
		error = may_create(rd->new_mnt_userns, new_dir, new_dentry);
M
Miklos Szeredi 已提交
4482 4483 4484 4485
	} else {
		new_is_dir = d_is_dir(new_dentry);

		if (!(flags & RENAME_EXCHANGE))
4486 4487
			error = may_delete(rd->new_mnt_userns, new_dir,
					   new_dentry, is_dir);
M
Miklos Szeredi 已提交
4488
		else
4489 4490
			error = may_delete(rd->new_mnt_userns, new_dir,
					   new_dentry, new_is_dir);
M
Miklos Szeredi 已提交
4491
	}
4492 4493 4494
	if (error)
		return error;

4495
	if (!old_dir->i_op->rename)
4496
		return -EPERM;
L
Linus Torvalds 已提交
4497 4498 4499 4500 4501

	/*
	 * If we are going to change the parent - check write permissions,
	 * we'll need to flip '..'.
	 */
M
Miklos Szeredi 已提交
4502 4503
	if (new_dir != old_dir) {
		if (is_dir) {
4504
			error = inode_permission(rd->old_mnt_userns, source,
4505
						 MAY_WRITE);
M
Miklos Szeredi 已提交
4506 4507 4508 4509
			if (error)
				return error;
		}
		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
4510
			error = inode_permission(rd->new_mnt_userns, target,
4511
						 MAY_WRITE);
M
Miklos Szeredi 已提交
4512 4513 4514
			if (error)
				return error;
		}
L
Linus Torvalds 已提交
4515 4516
	}

4517 4518
	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
				      flags);
L
Linus Torvalds 已提交
4519 4520 4521
	if (error)
		return error;

A
Al Viro 已提交
4522
	take_dentry_name_snapshot(&old_name, old_dentry);
4523
	dget(new_dentry);
M
Miklos Szeredi 已提交
4524
	if (!is_dir || (flags & RENAME_EXCHANGE))
4525 4526
		lock_two_nondirectories(source, target);
	else if (target)
A
Al Viro 已提交
4527
		inode_lock(target);
S
Sage Weil 已提交
4528 4529

	error = -EBUSY;
4530
	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
S
Sage Weil 已提交
4531 4532
		goto out;

M
Miklos Szeredi 已提交
4533
	if (max_links && new_dir != old_dir) {
4534
		error = -EMLINK;
M
Miklos Szeredi 已提交
4535
		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4536
			goto out;
M
Miklos Szeredi 已提交
4537 4538 4539 4540 4541
		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
		    old_dir->i_nlink >= max_links)
			goto out;
	}
	if (!is_dir) {
4542
		error = try_break_deleg(source, delegated_inode);
4543 4544
		if (error)
			goto out;
M
Miklos Szeredi 已提交
4545 4546 4547 4548 4549
	}
	if (target && !new_is_dir) {
		error = try_break_deleg(target, delegated_inode);
		if (error)
			goto out;
4550
	}
4551 4552
	error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry,
				      new_dir, new_dentry, flags);
S
Sage Weil 已提交
4553 4554 4555
	if (error)
		goto out;

M
Miklos Szeredi 已提交
4556
	if (!(flags & RENAME_EXCHANGE) && target) {
4557 4558
		if (is_dir) {
			shrink_dcache_parent(new_dentry);
4559
			target->i_flags |= S_DEAD;
4560
		}
S
Sage Weil 已提交
4561
		dont_mount(new_dentry);
4562
		detach_mounts(new_dentry);
4563
	}
M
Miklos Szeredi 已提交
4564 4565 4566 4567 4568 4569
	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
		if (!(flags & RENAME_EXCHANGE))
			d_move(old_dentry, new_dentry);
		else
			d_exchange(old_dentry, new_dentry);
	}
S
Sage Weil 已提交
4570
out:
M
Miklos Szeredi 已提交
4571
	if (!is_dir || (flags & RENAME_EXCHANGE))
4572 4573
		unlock_two_nondirectories(source, target);
	else if (target)
A
Al Viro 已提交
4574
		inode_unlock(target);
L
Linus Torvalds 已提交
4575
	dput(new_dentry);
M
Miklos Szeredi 已提交
4576
	if (!error) {
4577
		fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
M
Miklos Szeredi 已提交
4578 4579
			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
		if (flags & RENAME_EXCHANGE) {
4580
			fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
M
Miklos Szeredi 已提交
4581 4582 4583
				      new_is_dir, NULL, new_dentry);
		}
	}
A
Al Viro 已提交
4584
	release_dentry_name_snapshot(&old_name);
R
Robert Love 已提交
4585

L
Linus Torvalds 已提交
4586 4587
	return error;
}
4588
EXPORT_SYMBOL(vfs_rename);
L
Linus Torvalds 已提交
4589

4590 4591
int do_renameat2(int olddfd, struct filename *from, int newdfd,
		 struct filename *to, unsigned int flags)
L
Linus Torvalds 已提交
4592
{
4593
	struct renamedata rd;
4594 4595
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
4596 4597 4598
	struct path old_path, new_path;
	struct qstr old_last, new_last;
	int old_type, new_type;
4599
	struct inode *delegated_inode = NULL;
4600
	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
4601
	bool should_retry = false;
4602
	int error = -EINVAL;
M
Miklos Szeredi 已提交
4603

M
Miklos Szeredi 已提交
4604
	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
4605
		goto put_both;
M
Miklos Szeredi 已提交
4606

M
Miklos Szeredi 已提交
4607 4608
	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
	    (flags & RENAME_EXCHANGE))
4609
		goto put_both;
M
Miklos Szeredi 已提交
4610

4611 4612 4613
	if (flags & RENAME_EXCHANGE)
		target_flags = 0;

4614
retry:
4615 4616
	from = filename_parentat(olddfd, from, lookup_flags, &old_path,
					&old_last, &old_type);
4617 4618
	if (IS_ERR(from)) {
		error = PTR_ERR(from);
4619
		goto put_new;
4620
	}
L
Linus Torvalds 已提交
4621

4622 4623
	to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
				&new_type);
4624 4625
	if (IS_ERR(to)) {
		error = PTR_ERR(to);
L
Linus Torvalds 已提交
4626
		goto exit1;
4627
	}
L
Linus Torvalds 已提交
4628 4629

	error = -EXDEV;
4630
	if (old_path.mnt != new_path.mnt)
L
Linus Torvalds 已提交
4631 4632 4633
		goto exit2;

	error = -EBUSY;
4634
	if (old_type != LAST_NORM)
L
Linus Torvalds 已提交
4635 4636
		goto exit2;

M
Miklos Szeredi 已提交
4637 4638
	if (flags & RENAME_NOREPLACE)
		error = -EEXIST;
4639
	if (new_type != LAST_NORM)
L
Linus Torvalds 已提交
4640 4641
		goto exit2;

4642
	error = mnt_want_write(old_path.mnt);
4643 4644 4645
	if (error)
		goto exit2;

4646
retry_deleg:
4647
	trap = lock_rename(new_path.dentry, old_path.dentry);
L
Linus Torvalds 已提交
4648

4649
	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
L
Linus Torvalds 已提交
4650 4651 4652 4653 4654
	error = PTR_ERR(old_dentry);
	if (IS_ERR(old_dentry))
		goto exit3;
	/* source must exist */
	error = -ENOENT;
4655
	if (d_is_negative(old_dentry))
L
Linus Torvalds 已提交
4656
		goto exit4;
4657
	new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
M
Miklos Szeredi 已提交
4658 4659 4660 4661 4662 4663
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto exit4;
	error = -EEXIST;
	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
		goto exit5;
M
Miklos Szeredi 已提交
4664 4665 4666 4667 4668 4669 4670
	if (flags & RENAME_EXCHANGE) {
		error = -ENOENT;
		if (d_is_negative(new_dentry))
			goto exit5;

		if (!d_is_dir(new_dentry)) {
			error = -ENOTDIR;
4671
			if (new_last.name[new_last.len])
M
Miklos Szeredi 已提交
4672 4673 4674
				goto exit5;
		}
	}
L
Linus Torvalds 已提交
4675
	/* unless the source is a directory trailing slashes give -ENOTDIR */
M
Miklos Szeredi 已提交
4676
	if (!d_is_dir(old_dentry)) {
L
Linus Torvalds 已提交
4677
		error = -ENOTDIR;
4678
		if (old_last.name[old_last.len])
M
Miklos Szeredi 已提交
4679
			goto exit5;
4680
		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
M
Miklos Szeredi 已提交
4681
			goto exit5;
L
Linus Torvalds 已提交
4682 4683 4684 4685
	}
	/* source should not be ancestor of target */
	error = -EINVAL;
	if (old_dentry == trap)
M
Miklos Szeredi 已提交
4686
		goto exit5;
L
Linus Torvalds 已提交
4687
	/* target should not be an ancestor of source */
M
Miklos Szeredi 已提交
4688 4689
	if (!(flags & RENAME_EXCHANGE))
		error = -ENOTEMPTY;
L
Linus Torvalds 已提交
4690 4691 4692
	if (new_dentry == trap)
		goto exit5;

4693 4694
	error = security_path_rename(&old_path, old_dentry,
				     &new_path, new_dentry, flags);
4695
	if (error)
4696
		goto exit5;
4697 4698 4699

	rd.old_dir	   = old_path.dentry->d_inode;
	rd.old_dentry	   = old_dentry;
4700
	rd.old_mnt_userns  = mnt_user_ns(old_path.mnt);
4701 4702
	rd.new_dir	   = new_path.dentry->d_inode;
	rd.new_dentry	   = new_dentry;
4703
	rd.new_mnt_userns  = mnt_user_ns(new_path.mnt);
4704 4705 4706
	rd.delegated_inode = &delegated_inode;
	rd.flags	   = flags;
	error = vfs_rename(&rd);
L
Linus Torvalds 已提交
4707 4708 4709 4710 4711
exit5:
	dput(new_dentry);
exit4:
	dput(old_dentry);
exit3:
4712
	unlock_rename(new_path.dentry, old_path.dentry);
4713 4714 4715 4716 4717
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
4718
	mnt_drop_write(old_path.mnt);
L
Linus Torvalds 已提交
4719
exit2:
4720 4721
	if (retry_estale(error, lookup_flags))
		should_retry = true;
4722
	path_put(&new_path);
L
Linus Torvalds 已提交
4723
exit1:
4724
	path_put(&old_path);
4725 4726 4727 4728 4729
	if (should_retry) {
		should_retry = false;
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
4730 4731 4732 4733 4734 4735
put_both:
	if (!IS_ERR(from))
		putname(from);
put_new:
	if (!IS_ERR(to))
		putname(to);
L
Linus Torvalds 已提交
4736 4737 4738
	return error;
}

4739 4740 4741
/*
 * renameat2(2): copy both user pathnames in with getname() and pass them,
 * together with the caller's flags, to do_renameat2().
 */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, unsigned int, flags)
{
	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
				flags);
}

M
Miklos Szeredi 已提交
4746 4747 4748
/*
 * renameat(2): same as renameat2() with no flags set.
 */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
				0);
}

4753
/*
 * rename(2): both names are resolved relative to AT_FDCWD and no flags
 * are passed.
 */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
	return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
				getname(newname), 0);
}

A
Al Viro 已提交
4759
/*
 * readlink_copy - copy a symlink body to a user buffer
 * @buffer: user memory to copy into
 * @buflen: size of @buffer
 * @link:   NUL-terminated link body, or an ERR_PTR
 *
 * If @link is an error pointer its error code is returned unchanged.
 * Otherwise at most @buflen bytes of the body are copied; the terminating
 * NUL is never copied.
 *
 * Returns the number of bytes copied, or -EFAULT if copy_to_user() fails.
 */
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
	int len;

	if (IS_ERR(link))
		return PTR_ERR(link);

	len = strlen(link);
	/* truncate to the caller's buffer; comparison is done unsigned */
	if (len > (unsigned) buflen)
		len = buflen;
	if (copy_to_user(buffer, link, len))
		return -EFAULT;
	return len;
}

4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786
/**
 * vfs_readlink - copy symlink body into userspace buffer
 * @dentry: dentry on which to get symbolic link
 * @buffer: user memory pointer
 * @buflen: size of buffer
 *
 * Does not touch atime.  That's up to the caller if necessary
 *
 * Does not call security hook.
 *
 * Returns the value of the inode's own ->readlink() when it has one,
 * -EINVAL when @dentry is not a symlink, the error from ->get_link(),
 * or otherwise whatever readlink_copy() returns.
 */
int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	struct inode *inode = d_inode(dentry);
	DEFINE_DELAYED_CALL(done);
	const char *link;
	int res;

	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
		/* an inode-specific ->readlink() takes over completely */
		if (unlikely(inode->i_op->readlink))
			return inode->i_op->readlink(dentry, buffer, buflen);

		if (!d_is_symlink(dentry))
			return -EINVAL;

		/* remember the outcome so later calls skip these checks */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_DEFAULT_READLINK;
		spin_unlock(&inode->i_lock);
	}

	/* use the body stored in ->i_link if set, else ask ->get_link() */
	link = READ_ONCE(inode->i_link);
	if (!link) {
		link = inode->i_op->get_link(dentry, inode, &done);
		if (IS_ERR(link))
			return PTR_ERR(link);
	}
	res = readlink_copy(buffer, buflen, link);
	/* run whatever cleanup ->get_link() registered in @done */
	do_delayed_call(&done);
	return res;
}
EXPORT_SYMBOL(vfs_readlink);
L
Linus Torvalds 已提交
4814

M
Miklos Szeredi 已提交
4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839
/**
 * vfs_get_link - get symlink body
 * @dentry: dentry on which to get symbolic link
 * @done: caller needs to free returned data with this
 *
 * Runs the security_inode_readlink() hook and, if it allows the access,
 * i_op->get_link() on the inode behind @dentry.
 *
 * atime is left alone; updating it is the caller's business.
 *
 * Does not work on "special" symlinks like /proc/$$/fd/N
 *
 * Returns ERR_PTR(-EINVAL) when @dentry is not a symlink, the security
 * hook's error as an ERR_PTR, or the result of ->get_link().
 */
const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
{
	struct inode *inode = d_inode(dentry);
	int err;

	if (!d_is_symlink(dentry))
		return ERR_PTR(-EINVAL);

	err = security_inode_readlink(dentry);
	if (err)
		return ERR_PTR(err);

	return inode->i_op->get_link(dentry, inode, done);
}
EXPORT_SYMBOL(vfs_get_link);

L
Linus Torvalds 已提交
4840
/* get the link contents into pagecache */
4841
const char *page_get_link(struct dentry *dentry, struct inode *inode,
4842
			  struct delayed_call *callback)
L
Linus Torvalds 已提交
4843
{
4844 4845
	char *kaddr;
	struct page *page;
4846 4847
	struct address_space *mapping = inode->i_mapping;

4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860
	if (!dentry) {
		page = find_get_page(mapping, 0);
		if (!page)
			return ERR_PTR(-ECHILD);
		if (!PageUptodate(page)) {
			put_page(page);
			return ERR_PTR(-ECHILD);
		}
	} else {
		page = read_mapping_page(mapping, 0, NULL);
		if (IS_ERR(page))
			return (char*)page;
	}
4861
	set_delayed_call(callback, page_put_link, page);
4862 4863
	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
	kaddr = page_address(page);
4864
	nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
4865
	return kaddr;
L
Linus Torvalds 已提交
4866 4867
}

4868
EXPORT_SYMBOL(page_get_link);
L
Linus Torvalds 已提交
4869

4870
/*
 * Delayed-call callback registered by page_get_link(): drops the page
 * reference passed as @arg.
 */
void page_put_link(void *arg)
{
	put_page(arg);
}
EXPORT_SYMBOL(page_put_link);
L
Linus Torvalds 已提交
4875

4876 4877
/*
 * page_readlink - readlink for symlinks whose body is fetched by
 * page_get_link()
 *
 * Fetches the body, copies it to @buffer with readlink_copy() (which also
 * passes through an ERR_PTR from page_get_link()), then runs the delayed
 * call to release what page_get_link() took.
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	DEFINE_DELAYED_CALL(done);
	const char *link = page_get_link(dentry, d_inode(dentry), &done);
	int res = readlink_copy(buffer, buflen, link);

	do_delayed_call(&done);
	return res;
}
EXPORT_SYMBOL(page_readlink);

4887 4888 4889 4890
/*
 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
 */
int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
L
Linus Torvalds 已提交
4891 4892
{
	struct address_space *mapping = inode->i_mapping;
4893
	struct page *page;
4894
	void *fsdata;
4895
	int err;
4896
	unsigned int flags = 0;
4897 4898
	if (nofs)
		flags |= AOP_FLAG_NOFS;
L
Linus Torvalds 已提交
4899

4900
retry:
4901
	err = pagecache_write_begin(NULL, mapping, 0, len-1,
4902
				flags, &page, &fsdata);
L
Linus Torvalds 已提交
4903
	if (err)
4904 4905
		goto fail;

4906
	memcpy(page_address(page), symname, len-1);
4907 4908 4909

	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
							page, fsdata);
L
Linus Torvalds 已提交
4910 4911
	if (err < 0)
		goto fail;
4912 4913 4914
	if (err < len-1)
		goto retry;

L
Linus Torvalds 已提交
4915 4916 4917 4918 4919
	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}
4920
EXPORT_SYMBOL(__page_symlink);
L
Linus Torvalds 已提交
4921

4922 4923 4924
int page_symlink(struct inode *inode, const char *symname, int len)
{
	return __page_symlink(inode, symname, len,
4925
			!mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
4926
}
4927
EXPORT_SYMBOL(page_symlink);
4928

4929
const struct inode_operations page_symlink_inode_operations = {
4930
	.get_link	= page_get_link,
L
Linus Torvalds 已提交
4931 4932
};
EXPORT_SYMBOL(page_symlink_inode_operations);