// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
19
#include <linux/export.h>
20
#include <linux/kernel.h>
L
Linus Torvalds 已提交
21 22 23 24
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
R
Robert Love 已提交
25
#include <linux/fsnotify.h>
L
Linus Torvalds 已提交
26 27
#include <linux/personality.h>
#include <linux/security.h>
M
Mimi Zohar 已提交
28
#include <linux/ima.h>
L
Linus Torvalds 已提交
29 30 31
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
32
#include <linux/capability.h>
33
#include <linux/file.h>
34
#include <linux/fcntl.h>
35
#include <linux/device_cgroup.h>
36
#include <linux/fs_struct.h>
37
#include <linux/posix_acl.h>
38
#include <linux/hash.h>
39
#include <linux/bitops.h>
40
#include <linux/init_task.h>
41
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
42

43
#include "internal.h"
44
#include "mount.h"
45

/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOs, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */

/*
 * Number of pathname bytes that fit in struct filename's iname[] when the
 * struct itself is placed at the start of a PATH_MAX-sized buffer.
 */
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))

struct filename *
128 129
getname_flags(const char __user *filename, int flags, int *empty)
{
A
Al Viro 已提交
130
	struct filename *result;
131
	char *kname;
A
Al Viro 已提交
132
	int len;
133

134 135 136 137
	result = audit_reusename(filename);
	if (result)
		return result;

138
	result = __getname();
139
	if (unlikely(!result))
140 141
		return ERR_PTR(-ENOMEM);

142 143 144 145
	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
A
Al Viro 已提交
146
	kname = (char *)result->iname;
147
	result->name = kname;
148

A
Al Viro 已提交
149
	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
150
	if (unlikely(len < 0)) {
A
Al Viro 已提交
151 152
		__putname(result);
		return ERR_PTR(len);
153
	}
154

155 156 157 158 159 160
	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
A
Al Viro 已提交
161
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
A
Al Viro 已提交
162
		const size_t size = offsetof(struct filename, iname[1]);
163 164
		kname = (char *)result;

A
Al Viro 已提交
165 166 167 168 169 170
		/*
		 * size is chosen that way we to guarantee that
		 * result->iname[0] is within the same object and that
		 * kname can't be equal to result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
A
Al Viro 已提交
171 172 173
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
174 175
		}
		result->name = kname;
A
Al Viro 已提交
176 177 178 179 180 181 182 183 184 185 186
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
187 188
	}

A
Al Viro 已提交
189
	result->refcnt = 1;
190 191 192
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
193
			*empty = 1;
A
Al Viro 已提交
194 195 196 197
		if (!(flags & LOOKUP_EMPTY)) {
			putname(result);
			return ERR_PTR(-ENOENT);
		}
L
Linus Torvalds 已提交
198
	}
199

200
	result->uptr = filename;
201
	result->aname = NULL;
202 203
	audit_getname(result);
	return result;
L
Linus Torvalds 已提交
204 205
}

206 207
/**
 * getname - copy a pathname from userspace with default behaviour
 * @filename:	userspace pointer to the pathname
 *
 * Shorthand for getname_flags() with no flags and no @empty report, so an
 * empty pathname yields ERR_PTR(-ENOENT).
 */
struct filename *
getname(const char __user * filename)
{
	return getname_flags(filename, 0, NULL);
}

struct filename *
getname_kernel(const char * filename)
{
	struct filename *result;
216
	int len = strlen(filename) + 1;
217 218 219 220 221

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

222
	if (len <= EMBEDDED_NAME_MAX) {
A
Al Viro 已提交
223
		result->name = (char *)result->iname;
224
	} else if (len <= PATH_MAX) {
225
		const size_t size = offsetof(struct filename, iname[1]);
226 227
		struct filename *tmp;

228
		tmp = kmalloc(size, GFP_KERNEL);
229 230 231 232 233 234 235 236 237 238 239
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
240 241
	result->uptr = NULL;
	result->aname = NULL;
242
	result->refcnt = 1;
243
	audit_getname(result);
244 245 246 247

	return result;
}

248
void putname(struct filename *name)
L
Linus Torvalds 已提交
249
{
250 251 252 253 254
	BUG_ON(name->refcnt <= 0);

	if (--name->refcnt > 0)
		return;

A
Al Viro 已提交
255
	if (name->name != name->iname) {
256 257 258 259
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
L
Linus Torvalds 已提交
260 261
}

262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
/**
 * check_acl - perform ACL permission checking
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the ACL permission checking. Since this function
 * retrieve POSIX acls it needs to know whether it is called from a blocking or
 * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
 */
static int check_acl(struct user_namespace *mnt_userns,
		     struct inode *inode, int mask)
280
{
281
#ifdef CONFIG_FS_POSIX_ACL
282 283 284
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
285 286
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
287
	                return -EAGAIN;
288
		/* no ->get_acl() calls in RCU mode... */
289
		if (is_uncached_acl(acl))
290
			return -ECHILD;
291
	        return posix_acl_permission(mnt_userns, inode, acl, mask);
292 293
	}

C
Christoph Hellwig 已提交
294 295 296
	acl = get_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
297
	if (acl) {
298
	        int error = posix_acl_permission(mnt_userns, inode, acl, mask);
299 300 301
	        posix_acl_release(acl);
	        return error;
	}
302
#endif
303 304 305 306

	return -EAGAIN;
}

307 308 309 310 311 312 313 314 315
/**
 * acl_permission_check - perform basic UNIX permission checking
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the basic UNIX permission checking. Since this
 * function may retrieve POSIX acls it needs to know whether it is called from a
 * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
316
 *
317 318 319 320 321
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
L
Linus Torvalds 已提交
322
 */
323 324
static int acl_permission_check(struct user_namespace *mnt_userns,
				struct inode *inode, int mask)
L
Linus Torvalds 已提交
325
{
326
	unsigned int mode = inode->i_mode;
327
	kuid_t i_uid;
L
Linus Torvalds 已提交
328

329
	/* Are we the owner? If so, ACL's don't matter */
330 331
	i_uid = i_uid_into_mnt(mnt_userns, inode);
	if (likely(uid_eq(current_fsuid(), i_uid))) {
332
		mask &= 7;
L
Linus Torvalds 已提交
333
		mode >>= 6;
334 335
		return (mask & ~mode) ? -EACCES : 0;
	}
L
Linus Torvalds 已提交
336

337 338
	/* Do we have ACL's? */
	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
339
		int error = check_acl(mnt_userns, inode, mask);
340 341
		if (error != -EAGAIN)
			return error;
L
Linus Torvalds 已提交
342 343
	}

344 345 346
	/* Only RWX matters for group/other mode bits */
	mask &= 7;

L
Linus Torvalds 已提交
347
	/*
348 349 350
	 * Are the group permissions different from
	 * the other permissions in the bits we care
	 * about? Need to check group ownership if so.
L
Linus Torvalds 已提交
351
	 */
352
	if (mask & (mode ^ (mode >> 3))) {
353 354
		kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
		if (in_group_p(kgid))
355 356 357 358 359
			mode >>= 3;
	}

	/* Bits in 'mode' clear that we require? */
	return (mask & ~mode) ? -EACCES : 0;
360 361 362
}

/**
363
 * generic_permission -  check for access rights on a Posix-like filesystem
364
 * @mnt_userns:	user namespace of the mount the inode was found from
365
 * @inode:	inode to check access rights for
366 367
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 *		%MAY_NOT_BLOCK ...)
368 369 370 371
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
372 373 374 375 376
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
377 378 379 380 381 382
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
383
 */
384 385
int generic_permission(struct user_namespace *mnt_userns, struct inode *inode,
		       int mask)
386 387 388 389
{
	int ret;

	/*
390
	 * Do the basic permission checks.
391
	 */
392
	ret = acl_permission_check(mnt_userns, inode, mask);
393 394
	if (ret != -EACCES)
		return ret;
L
Linus Torvalds 已提交
395

396 397 398
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (!(mask & MAY_WRITE))
399
			if (capable_wrt_inode_uidgid(mnt_userns, inode,
400
						     CAP_DAC_READ_SEARCH))
401
				return 0;
402
		if (capable_wrt_inode_uidgid(mnt_userns, inode,
403
					     CAP_DAC_OVERRIDE))
L
Linus Torvalds 已提交
404
			return 0;
405 406
		return -EACCES;
	}
L
Linus Torvalds 已提交
407 408 409 410

	/*
	 * Searching includes executable on directories, else just read.
	 */
411
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
412
	if (mask == MAY_READ)
413
		if (capable_wrt_inode_uidgid(mnt_userns, inode,
414
					     CAP_DAC_READ_SEARCH))
L
Linus Torvalds 已提交
415
			return 0;
416 417 418 419 420 421
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
422
		if (capable_wrt_inode_uidgid(mnt_userns, inode,
423
					     CAP_DAC_OVERRIDE))
424
			return 0;
L
Linus Torvalds 已提交
425 426 427

	return -EACCES;
}
428
EXPORT_SYMBOL(generic_permission);
L
Linus Torvalds 已提交
429

430 431 432 433 434 435
/**
 * do_inode_permission - UNIX permission checking
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
436 437 438 439 440
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has not special
 * permission function, use the fast case".
 */
441 442
static inline int do_inode_permission(struct user_namespace *mnt_userns,
				      struct inode *inode, int mask)
443 444 445 446 447 448 449 450 451 452
{
	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
		if (likely(inode->i_op->permission))
			return inode->i_op->permission(inode, mask);

		/* This gets set once for the inode lifetime */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_FASTPERM;
		spin_unlock(&inode->i_lock);
	}
453
	return generic_permission(mnt_userns, inode, mask);
454 455
}

D
David Howells 已提交
456 457 458
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
459
 * @inode: Inode to check permission on
D
David Howells 已提交
460 461 462 463 464 465 466 467 468 469
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	if (unlikely(mask & MAY_WRITE)) {
		umode_t mode = inode->i_mode;

		/* Nobody gets write access to a read-only fs. */
470
		if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
D
David Howells 已提交
471 472 473 474 475 476 477
			return -EROFS;
	}
	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
478 479 480
 * @mnt_userns:	User namespace of the mount the inode was found from
 * @inode:	Inode to check permission on
 * @mask:	Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
D
David Howells 已提交
481 482 483 484 485 486 487
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
488 489
int inode_permission(struct user_namespace *mnt_userns,
		     struct inode *inode, int mask)
D
David Howells 已提交
490 491 492 493 494 495
{
	int retval;

	retval = sb_permission(inode->i_sb, inode, mask);
	if (retval)
		return retval;
496 497 498 499 500 501 502 503 504 505 506 507 508

	if (unlikely(mask & MAY_WRITE)) {
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EPERM;

		/*
		 * Updating mtime will likely cause i_uid and i_gid to be
		 * written back improperly if their true value is unknown
		 * to the vfs.
		 */
509
		if (HAS_UNMAPPED_ID(mnt_userns, inode))
510 511 512
			return -EACCES;
	}

513
	retval = do_inode_permission(mnt_userns, inode, mask);
514 515 516 517 518 519 520 521
	if (retval)
		return retval;

	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

	return security_inode_permission(inode, mask);
D
David Howells 已提交
522
}
523
EXPORT_SYMBOL(inode_permission);
D
David Howells 已提交
524

J
Jan Blunck 已提交
525 526 527 528 529 530
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
531
void path_get(const struct path *path)
J
Jan Blunck 已提交
532 533 534 535 536 537
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

J
Jan Blunck 已提交
538 539 540 541 542 543
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
544
void path_put(const struct path *path)
L
Linus Torvalds 已提交
545
{
J
Jan Blunck 已提交
546 547
	dput(path->dentry);
	mntput(path->mnt);
L
Linus Torvalds 已提交
548
}
J
Jan Blunck 已提交
549
EXPORT_SYMBOL(path_put);
L
Linus Torvalds 已提交
550

551
#define EMBEDDED_LEVELS 2
552 553
struct nameidata {
	struct path	path;
A
Al Viro 已提交
554
	struct qstr	last;
555 556 557
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags;
558
	unsigned	seq, m_seq, r_seq;
559 560
	int		last_type;
	unsigned	depth;
561
	int		total_link_count;
562 563
	struct saved {
		struct path link;
564
		struct delayed_call done;
565
		const char *name;
566
		unsigned seq;
567
	} *stack, internal[EMBEDDED_LEVELS];
568 569 570 571
	struct filename	*name;
	struct nameidata *saved;
	unsigned	root_seq;
	int		dfd;
572 573
	kuid_t		dir_uid;
	umode_t		dir_mode;
574
} __randomize_layout;
575

576
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
577
{
578 579
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;
580 581
	p->dfd = dfd;
	p->name = name;
582
	p->total_link_count = old ? old->total_link_count : 0;
583
	p->saved = old;
584
	current->nameidata = p;
585 586
}

587
static void restore_nameidata(void)
588
{
589
	struct nameidata *now = current->nameidata, *old = now->saved;
590 591 592 593

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
594
	if (now->stack != now->internal)
595
		kfree(now->stack);
596 597
}

598
static bool nd_alloc_stack(struct nameidata *nd)
599
{
A
Al Viro 已提交
600 601
	struct saved *p;

602 603 604 605
	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
			 nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
	if (unlikely(!p))
		return false;
606 607
	memcpy(p, nd->internal, sizeof(nd->internal));
	nd->stack = p;
608
	return true;
609 610
}

611
/**
612
 * path_connected - Verify that a dentry is below mnt.mnt_root
613 614 615 616
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 */
617
static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
618
{
619
	struct super_block *sb = mnt->mnt_sb;
620

621 622
	/* Bind mounts can have disconnected paths */
	if (mnt->mnt_root == sb->s_root)
623 624
		return true;

625
	return is_subdir(dentry, mnt->mnt_root);
626 627
}

628 629 630 631 632
/*
 * Run and then clear the delayed call of every saved symlink, from the
 * most recently pushed entry down to the first.  nd->depth is not changed.
 */
static void drop_links(struct nameidata *nd)
{
	int i = nd->depth;
	while (i--) {
		struct saved *last = nd->stack + i;
		do_delayed_call(&last->done);
		clear_delayed_call(&last->done);
	}
}

static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);
	if (!(nd->flags & LOOKUP_RCU)) {
		int i;
		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
646
		if (nd->flags & LOOKUP_ROOT_GRABBED) {
647
			path_put(&nd->root);
648
			nd->flags &= ~LOOKUP_ROOT_GRABBED;
649
		}
650 651 652 653 654 655 656 657
	} else {
		nd->flags &= ~LOOKUP_RCU;
		rcu_read_unlock();
	}
	nd->depth = 0;
}

/* path_put is needed afterwards regardless of success or failure */
658
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
659
{
660
	int res = __legitimize_mnt(path->mnt, mseq);
661 662 663 664 665 666 667 668 669 670 671 672 673
	if (unlikely(res)) {
		if (res > 0)
			path->mnt = NULL;
		path->dentry = NULL;
		return false;
	}
	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		path->dentry = NULL;
		return false;
	}
	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

674 675 676
static inline bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
A
Al Viro 已提交
677
	return __legitimize_path(path, seq, nd->m_seq);
678 679
}

680 681 682 683 684 685 686 687 688 689 690 691 692 693
/*
 * Try to legitimize every symlink saved on nd->stack, oldest first.
 * On the first entry that fails, the delayed calls of all saved links are
 * run and nd->depth is trimmed so that it covers the entries visited so
 * far (the failing one included); false is returned in that case.
 */
static bool legitimize_links(struct nameidata *nd)
{
	int idx;

	for (idx = 0; idx < nd->depth; idx++) {
		struct saved *entry = &nd->stack[idx];
		bool ok = legitimize_path(nd, &entry->link, entry->seq);

		if (likely(ok))
			continue;

		drop_links(nd);
		nd->depth = idx + 1;
		return false;
	}

	return true;
}

694 695
static bool legitimize_root(struct nameidata *nd)
{
696 697 698 699 700 701 702 703
	/*
	 * For scoped-lookups (where nd->root has been zeroed), we need to
	 * restart the whole lookup from scratch -- because set_root() is wrong
	 * for these lookups (nd->dfd is the root, not the filesystem root).
	 */
	if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED))
		return false;
	/* Nothing to do if nd->root is zero or is managed by the VFS user. */
704 705
	if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT))
		return true;
706
	nd->flags |= LOOKUP_ROOT_GRABBED;
707 708 709
	return legitimize_path(nd, &nd->root, nd->root_seq);
}

A
Al Viro 已提交
710
/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
A
Al Viro 已提交
722 723
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
724
 * Returns: 0 on success, -ECHILD on failure
N
Nick Piggin 已提交
725
 *
A
Al Viro 已提交
726 727 728
 * unlazy_walk attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.
 * Must be called from rcu-walk context.
729 730
 * Nothing should touch nameidata between unlazy_walk() failure and
 * terminate_walk().
N
Nick Piggin 已提交
731
 */
A
Al Viro 已提交
732
static int unlazy_walk(struct nameidata *nd)
N
Nick Piggin 已提交
733 734 735 736
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
737

A
Al Viro 已提交
738 739 740
	nd->flags &= ~LOOKUP_RCU;
	if (unlikely(!legitimize_links(nd)))
		goto out1;
741 742
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out;
743 744
	if (unlikely(!legitimize_root(nd)))
		goto out;
A
Al Viro 已提交
745 746 747 748
	rcu_read_unlock();
	BUG_ON(nd->inode != parent->d_inode);
	return 0;

749
out1:
A
Al Viro 已提交
750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out:
	rcu_read_unlock();
	return -ECHILD;
}

/**
 * unlazy_child - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry
 * @seq: seq number to check dentry against
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between unlazy_child() failure and
 * terminate_walk().
 */
static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
	BUG_ON(!(nd->flags & LOOKUP_RCU));

774
	nd->flags &= ~LOOKUP_RCU;
775 776 777 778
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
		goto out2;
A
Al Viro 已提交
779
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
780
		goto out1;
A
Al Viro 已提交
781

782
	/*
A
Al Viro 已提交
783 784 785 786 787
	 * We need to move both the parent and the dentry from the RCU domain
	 * to be properly refcounted. And the sequence number in the dentry
	 * validates *both* dentry counters, since we checked the sequence
	 * number of the parent after we got the child sequence number. So we
	 * know the parent must still be valid if the child sequence number is
788
	 */
A
Al Viro 已提交
789 790
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
791 792
	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
		goto out_dput;
793 794 795 796
	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
797 798
	if (unlikely(!legitimize_root(nd)))
		goto out_dput;
A
Al Viro 已提交
799
	rcu_read_unlock();
N
Nick Piggin 已提交
800
	return 0;
A
Al Viro 已提交
801

802 803 804 805
out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
806
out:
A
Al Viro 已提交
807
	rcu_read_unlock();
808 809 810 811
	return -ECHILD;
out_dput:
	rcu_read_unlock();
	dput(dentry);
N
Nick Piggin 已提交
812 813 814
	return -ECHILD;
}

815
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
816
{
817 818 819 820
	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
		return dentry->d_op->d_revalidate(dentry, flags);
	else
		return 1;
821 822
}

823 824 825
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
826
 *
827 828 829 830 831
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
832
 */
833
static int complete_walk(struct nameidata *nd)
834
{
A
Al Viro 已提交
835
	struct dentry *dentry = nd->path.dentry;
836 837
	int status;

838
	if (nd->flags & LOOKUP_RCU) {
839 840 841 842 843
		/*
		 * We don't want to zero nd->root for scoped-lookups or
		 * externally-managed nd->root.
		 */
		if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
844
			nd->root.mnt = NULL;
A
Al Viro 已提交
845
		if (unlikely(unlazy_walk(nd)))
846 847 848
			return -ECHILD;
	}

849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869
	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
		/*
		 * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
		 * ever step outside the root during lookup" and should already
		 * be guaranteed by the rest of namei, we want to avoid a namei
		 * BUG resulting in userspace being given a path that was not
		 * scoped within the root at some point during the lookup.
		 *
		 * So, do a final sanity-check to make sure that in the
		 * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
		 * we won't silently return an fd completely outside of the
		 * requested root to userspace.
		 *
		 * Userspace could move the path outside the root after this
		 * check, but as discussed elsewhere this is not a concern (the
		 * resolved file was inside the root at some point).
		 */
		if (!path_is_under(&nd->path, &nd->root))
			return -EXDEV;
	}

A
Al Viro 已提交
870 871 872
	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

873
	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
874 875
		return 0;

876
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
877 878 879
	if (status > 0)
		return 0;

A
Al Viro 已提交
880
	if (!status)
881
		status = -ESTALE;
A
Al Viro 已提交
882

883 884 885
	return status;
}

886
static int set_root(struct nameidata *nd)
N
Nick Piggin 已提交
887
{
888
	struct fs_struct *fs = current->fs;
N
Nick Piggin 已提交
889

890 891 892 893 894 895 896 897
	/*
	 * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
	 * still have to ensure it doesn't happen because it will cause a breakout
	 * from the dirfd.
	 */
	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
		return -ENOTRECOVERABLE;

898 899 900 901 902 903 904 905 906 907
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
908
		nd->flags |= LOOKUP_ROOT_GRABBED;
909
	}
910
	return 0;
N
Nick Piggin 已提交
911 912
}

913 914
static int nd_jump_root(struct nameidata *nd)
{
915 916
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return -EXDEV;
917 918 919 920 921
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		/* Absolute path arguments to path_init() are allowed. */
		if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
			return -EXDEV;
	}
922 923 924 925 926
	if (!nd->root.mnt) {
		int error = set_root(nd);
		if (error)
			return error;
	}
927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944
	if (nd->flags & LOOKUP_RCU) {
		struct dentry *d;
		nd->path = nd->root;
		d = nd->path.dentry;
		nd->inode = d->d_inode;
		nd->seq = nd->root_seq;
		if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
			return -ECHILD;
	} else {
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	}
	nd->flags |= LOOKUP_JUMPED;
	return 0;
}

C
Christoph Hellwig 已提交
945
/*
946
 * Helper to directly jump to a known parsed path from ->get_link,
C
Christoph Hellwig 已提交
947 948
 * caller must have taken a reference to path beforehand.
 */
949
int nd_jump_link(struct path *path)
C
Christoph Hellwig 已提交
950
{
951
	int error = -ELOOP;
952
	struct nameidata *nd = current->nameidata;
C
Christoph Hellwig 已提交
953

954 955 956
	if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
		goto err;

957 958 959 960 961
	error = -EXDEV;
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		if (nd->path.mnt != path->mnt)
			goto err;
	}
962 963 964
	/* Not currently safe for scoped-lookups. */
	if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
		goto err;
965

966
	path_put(&nd->path);
C
Christoph Hellwig 已提交
967 968 969
	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
	nd->flags |= LOOKUP_JUMPED;
970
	return 0;
971 972 973 974

err:
	path_put(path);
	return error;
C
Christoph Hellwig 已提交
975 976
}

977
static inline void put_link(struct nameidata *nd)
978
{
A
Al Viro 已提交
979
	struct saved *last = nd->stack + --nd->depth;
980
	do_delayed_call(&last->done);
A
Al Viro 已提交
981 982
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
983 984
}

985 986
/*
 * Protection knobs; a value of zero disables the corresponding check
 * (see may_follow_link(), may_linkat() and may_create_in_sticky()).
 */
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
int sysctl_protected_fifos __read_mostly;
int sysctl_protected_regular __read_mostly;
K
Kees Cook 已提交
989 990 991

/**
 * may_follow_link - Check symlink following for unsafe situations
992
 * @nd: nameidata pathwalk data
K
Kees Cook 已提交
993 994 995 996 997 998 999 1000 1001 1002 1003 1004
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
1005
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
K
Kees Cook 已提交
1006
{
1007 1008 1009
	struct user_namespace *mnt_userns;
	kuid_t i_uid;

K
Kees Cook 已提交
1010 1011 1012
	if (!sysctl_protected_symlinks)
		return 0;

1013 1014
	mnt_userns = mnt_user_ns(nd->path.mnt);
	i_uid = i_uid_into_mnt(mnt_userns, inode);
K
Kees Cook 已提交
1015
	/* Allowed if owner and follower match. */
1016
	if (uid_eq(current_cred()->fsuid, i_uid))
K
Kees Cook 已提交
1017 1018 1019
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
1020
	if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
K
Kees Cook 已提交
1021 1022 1023
		return 0;

	/* Allowed if parent directory and link owner match. */
1024
	if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
K
Kees Cook 已提交
1025 1026
		return 0;

1027 1028 1029
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

1030
	audit_inode(nd->name, nd->stack[0].link.dentry, 0);
K
Kees Cook 已提交
1031
	audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
K
Kees Cook 已提交
1032 1033 1034 1035 1036
	return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
1037
 * @mnt_userns:	user namespace of the mount the inode was found from
K
Kees Cook 已提交
1038 1039 1040 1041 1042 1043 1044 1045 1046 1047
 * @inode: the source inode to hardlink from
 *
 * Return false if at least one of the following conditions:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
1048 1049
static bool safe_hardlink_source(struct user_namespace *mnt_userns,
				 struct inode *inode)
K
Kees Cook 已提交
1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065
{
	umode_t mode = inode->i_mode;

	/* Special files should not get pinned to the filesystem. */
	if (!S_ISREG(mode))
		return false;

	/* Setuid files should not get pinned to the filesystem. */
	if (mode & S_ISUID)
		return false;

	/* Executable setgid files should not get pinned to the filesystem. */
	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
		return false;

	/* Hardlinking to unreadable or unwritable sources is dangerous. */
1066
	if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE))
K
Kees Cook 已提交
1067 1068 1069 1070 1071 1072 1073
		return false;

	return true;
}

/**
 * may_linkat - Check permissions for creating a hardlink
1074
 * @mnt_userns:	user namespace of the mount the inode was found from
K
Kees Cook 已提交
1075 1076 1077 1078 1079 1080
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
1081
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
K
Kees Cook 已提交
1082
 *
1083 1084 1085 1086 1087 1088
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
 *
K
Kees Cook 已提交
1089 1090
 * Returns 0 if successful, -ve on error.
 */
1091
int may_linkat(struct user_namespace *mnt_userns, struct path *link)
K
Kees Cook 已提交
1092
{
1093 1094 1095
	struct inode *inode = link->dentry->d_inode;

	/* Inode writeback is not safe when the uid or gid are invalid. */
1096 1097
	if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
1098
		return -EOVERFLOW;
K
Kees Cook 已提交
1099 1100 1101 1102 1103 1104 1105

	if (!sysctl_protected_hardlinks)
		return 0;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
1106 1107
	if (safe_hardlink_source(mnt_userns, inode) ||
	    inode_owner_or_capable(mnt_userns, inode))
K
Kees Cook 已提交
1108 1109
		return 0;

K
Kees Cook 已提交
1110
	audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
K
Kees Cook 已提交
1111 1112 1113
	return -EPERM;
}

1114 1115 1116 1117
/**
 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
 *			  should be allowed, or not, on files that already
 *			  exist.
1118
 * @mnt_userns:	user namespace of the mount the inode was found from
1119 1120
 * @dir_mode: mode bits of directory
 * @dir_uid: owner of directory
1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133
 * @inode: the inode of the file to open
 *
 * Block an O_CREAT open of a FIFO (or a regular file) when:
 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
 *   - the file already exists
 *   - we are in a sticky directory
 *   - we don't own the file
 *   - the owner of the directory doesn't own the file
 *   - the directory is world writable
 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
 * the directory doesn't have to be world writable: being group writable will
 * be enough.
 *
1134 1135 1136 1137 1138 1139
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
 *
1140 1141
 * Returns 0 if the open is allowed, -ve on error.
 */
1142 1143
static int may_create_in_sticky(struct user_namespace *mnt_userns,
				struct nameidata *nd, struct inode *const inode)
1144
{
1145 1146 1147
	umode_t dir_mode = nd->dir_mode;
	kuid_t dir_uid = nd->dir_uid;

1148 1149
	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
	    (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
1150
	    likely(!(dir_mode & S_ISVTX)) ||
1151 1152
	    uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
	    uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
1153 1154
		return 0;

1155 1156
	if (likely(dir_mode & 0002) ||
	    (dir_mode & 0020 &&
1157 1158
	     ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
	      (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
K
Kees Cook 已提交
1159 1160 1161 1162
		const char *operation = S_ISFIFO(inode->i_mode) ?
					"sticky_create_fifo" :
					"sticky_create_regular";
		audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
1163 1164 1165 1166 1167
		return -EACCES;
	}
	return 0;
}

1168 1169 1170 1171 1172 1173 1174 1175 1176 1177
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
A
Al Viro 已提交
1178
int follow_up(struct path *path)
L
Linus Torvalds 已提交
1179
{
1180 1181
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
L
Linus Torvalds 已提交
1182
	struct dentry *mountpoint;
N
Nick Piggin 已提交
1183

A
Al Viro 已提交
1184
	read_seqlock_excl(&mount_lock);
1185
	parent = mnt->mnt_parent;
A
Al Viro 已提交
1186
	if (parent == mnt) {
A
Al Viro 已提交
1187
		read_sequnlock_excl(&mount_lock);
L
Linus Torvalds 已提交
1188 1189
		return 0;
	}
1190
	mntget(&parent->mnt);
1191
	mountpoint = dget(mnt->mnt_mountpoint);
A
Al Viro 已提交
1192
	read_sequnlock_excl(&mount_lock);
A
Al Viro 已提交
1193 1194 1195
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
1196
	path->mnt = &parent->mnt;
L
Linus Torvalds 已提交
1197 1198
	return 1;
}
1199
EXPORT_SYMBOL(follow_up);
L
Linus Torvalds 已提交
1200

A
Al Viro 已提交
1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220
/*
 * Walk up the mount tree from @m looking for the mountpoint to continue
 * ".." from.  Mounts whose mountpoint is the root of their parent mount
 * are skipped over.  The walk stops without a result when @root is
 * reached or when there is no parent left.
 *
 * On success *@path is set to the mountpoint in the parent mount and
 * *@seqp to a sample of that dentry's d_seq; nothing is pinned here.
 *
 * Returns true if a mountpoint was found, false otherwise.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
				  struct path *path, unsigned *seqp)
{
	while (mnt_has_parent(m)) {
		struct dentry *mountpoint = m->mnt_mountpoint;

		m = m->mnt_parent;
		/* Reached the lookup root: do not climb past it. */
		if (unlikely(root->dentry == mountpoint &&
			     root->mnt == &m->mnt))
			break;
		/* Mounted on the parent's own root: keep climbing. */
		if (mountpoint != m->mnt.mnt_root) {
			path->mnt = &m->mnt;
			path->dentry = mountpoint;
			*seqp = read_seqcount_begin(&mountpoint->d_seq);
			return true;
		}
	}
	return false;
}

1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245
/*
 * Ref-walk wrapper around choose_mountpoint_rcu(): repeat the lockless
 * search under rcu_read_lock() until either a negative answer is
 * confirmed stable against mount_lock, or the found path has been
 * successfully legitimized via __legitimize_path().
 *
 * Returns true with *@path filled in, or false if no mountpoint was found.
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
			      struct path *path)
{
	bool found;

	rcu_read_lock();
	while (1) {
		unsigned seq, mseq = read_seqbegin(&mount_lock);

		found = choose_mountpoint_rcu(m, root, path, &seq);
		if (unlikely(!found)) {
			/* "Not found" only counts if mount_lock did not change. */
			if (!read_seqretry(&mount_lock, mseq))
				break;
		} else {
			if (likely(__legitimize_path(path, seq, mseq)))
				break;
			/* Legitimization failed: release @path outside RCU, then retry. */
			rcu_read_unlock();
			path_put(path);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
	return found;
}

N
Nick Piggin 已提交
1246
/*
1247 1248 1249
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
L
Linus Torvalds 已提交
1250
 */
1251
static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
N
Nick Piggin 已提交
1252
{
1253
	struct dentry *dentry = path->dentry;
1254

1255 1256 1257 1258 1259 1260 1261 1262 1263 1264
	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
1265
	 */
1266
	if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1267
			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1268
	    dentry->d_inode)
1269
		return -EISDIR;
1270

1271
	if (count && (*count)++ >= MAXSYMLINKS)
1272 1273
		return -ELOOP;

1274
	return finish_automount(dentry->d_op->d_automount(path), path);
A
Al Viro 已提交
1275 1276
}

1277
/*
A
Al Viro 已提交
1278 1279 1280 1281
 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
 * dentries are pinned but not locked here, so negative dentry can go
 * positive right under us.  Use of smp_load_acquire() provides a barrier
 * sufficient for ->d_inode and ->d_flags consistency.
1282
 */
A
Al Viro 已提交
1283 1284
static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
			     int *count, unsigned lookup_flags)
L
Linus Torvalds 已提交
1285
{
A
Al Viro 已提交
1286
	struct vfsmount *mnt = path->mnt;
1287
	bool need_mntput = false;
1288
	int ret = 0;
1289

A
Al Viro 已提交
1290
	while (flags & DCACHE_MANAGED_DENTRY) {
1291 1292
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
1293
		if (flags & DCACHE_MANAGE_TRANSIT) {
1294
			ret = path->dentry->d_op->d_manage(path, false);
1295
			flags = smp_load_acquire(&path->dentry->d_flags);
1296
			if (ret < 0)
1297
				break;
1298 1299
		}

A
Al Viro 已提交
1300
		if (flags & DCACHE_MOUNTED) {	// something's mounted on it..
1301
			struct vfsmount *mounted = lookup_mnt(path);
A
Al Viro 已提交
1302
			if (mounted) {		// ... in our namespace
1303 1304 1305 1306 1307
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
A
Al Viro 已提交
1308 1309
				// here we know it's positive
				flags = path->dentry->d_flags;
1310 1311 1312 1313 1314
				need_mntput = true;
				continue;
			}
		}

A
Al Viro 已提交
1315 1316
		if (!(flags & DCACHE_NEED_AUTOMOUNT))
			break;
1317

A
Al Viro 已提交
1318 1319 1320 1321 1322
		// uncovered automount point
		ret = follow_automount(path, count, lookup_flags);
		flags = smp_load_acquire(&path->dentry->d_flags);
		if (ret < 0)
			break;
L
Linus Torvalds 已提交
1323
	}
1324

A
Al Viro 已提交
1325 1326 1327 1328 1329 1330
	if (ret == -EISDIR)
		ret = 0;
	// possible if you race with several mount --move
	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	if (!ret && unlikely(d_flags_negative(flags)))
1331
		ret = -ENOENT;
A
Al Viro 已提交
1332
	*jumped = need_mntput;
1333
	return ret;
L
Linus Torvalds 已提交
1334 1335
}

A
Al Viro 已提交
1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350
/*
 * Inline front end for mount traversal.  Dentries that are not "managed"
 * are dealt with right here; everything else is handed to
 * __traverse_mounts() together with the ->d_flags value already sampled.
 *
 * Returns 0, -ENOENT for a negative dentry, or the slow path's result.
 */
static inline int traverse_mounts(struct path *path, bool *jumped,
				  int *count, unsigned lookup_flags)
{
	unsigned d_flags = smp_load_acquire(&path->dentry->d_flags);

	/* slow path: mounts, automounts or ->d_manage() are involved */
	if (unlikely(d_flags & DCACHE_MANAGED_DENTRY))
		return __traverse_mounts(path, d_flags, jumped, count,
					 lookup_flags);

	/* fast path: nothing to cross, only negativity left to report */
	*jumped = false;
	return unlikely(d_flags_negative(d_flags)) ? -ENOENT : 0;
}

1351
int follow_down_one(struct path *path)
L
Linus Torvalds 已提交
1352 1353 1354
{
	struct vfsmount *mounted;

A
Al Viro 已提交
1355
	mounted = lookup_mnt(path);
L
Linus Torvalds 已提交
1356
	if (mounted) {
A
Al Viro 已提交
1357 1358 1359 1360
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
L
Linus Torvalds 已提交
1361 1362 1363 1364
		return 1;
	}
	return 0;
}
1365
EXPORT_SYMBOL(follow_down_one);
L
Linus Torvalds 已提交
1366

A
Al Viro 已提交
1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383
/*
 * follow_down - descend from @path to the topmost mount covering it, as
 * currently visible to userspace.  The filesystem owning each dentry on
 * the way may be asked whether the caller is allowed to proceed.
 *
 * Returns the result of traverse_mounts(); if @path ended up on another
 * vfsmount, the reference on the one it started with is dropped.
 */
int follow_down(struct path *path)
{
	struct vfsmount *start_mnt = path->mnt;
	bool crossed;
	int err;

	err = traverse_mounts(path, &crossed, NULL, 0);
	if (start_mnt != path->mnt)
		mntput(start_mnt);

	return err;
}
EXPORT_SYMBOL(follow_down);

1384
/*
1385 1386
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
1387 1388
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1389
			       struct inode **inode, unsigned *seqp)
1390
{
A
Al Viro 已提交
1391 1392 1393 1394 1395 1396 1397 1398 1399
	struct dentry *dentry = path->dentry;
	unsigned int flags = dentry->d_flags;

	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
		return true;

	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
		return false;

1400 1401 1402 1403 1404
	for (;;) {
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
A
Al Viro 已提交
1405 1406 1407 1408 1409
		if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
			int res = dentry->d_op->d_manage(path, true);
			if (res)
				return res == -EISDIR;
			flags = dentry->d_flags;
1410
		}
1411

A
Al Viro 已提交
1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432
		if (flags & DCACHE_MOUNTED) {
			struct mount *mounted = __lookup_mnt(path->mnt, dentry);
			if (mounted) {
				path->mnt = &mounted->mnt;
				dentry = path->dentry = mounted->mnt.mnt_root;
				nd->flags |= LOOKUP_JUMPED;
				*seqp = read_seqcount_begin(&dentry->d_seq);
				*inode = dentry->d_inode;
				/*
				 * We don't need to re-check ->d_seq after this
				 * ->d_inode read - there will be an RCU delay
				 * between mount hash removal and ->mnt_root
				 * becoming unpinned.
				 */
				flags = dentry->d_flags;
				continue;
			}
			if (read_seqretry(&mount_lock, nd->m_seq))
				return false;
		}
		return !(flags & DCACHE_NEED_AUTOMOUNT);
1433
	}
1434 1435
}

1436 1437 1438
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
			  struct path *path, struct inode **inode,
			  unsigned int *seqp)
1439
{
A
Al Viro 已提交
1440
	bool jumped;
1441
	int ret;
1442

1443 1444
	path->mnt = nd->path.mnt;
	path->dentry = dentry;
1445 1446 1447 1448 1449
	if (nd->flags & LOOKUP_RCU) {
		unsigned int seq = *seqp;
		if (unlikely(!*inode))
			return -ENOENT;
		if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
A
Al Viro 已提交
1450
			return 0;
1451 1452 1453 1454 1455 1456
		if (unlazy_child(nd, dentry, seq))
			return -ECHILD;
		// *path might've been clobbered by __follow_mount_rcu()
		path->mnt = nd->path.mnt;
		path->dentry = dentry;
	}
A
Al Viro 已提交
1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468
	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
	if (jumped) {
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			ret = -EXDEV;
		else
			nd->flags |= LOOKUP_JUMPED;
	}
	if (unlikely(ret)) {
		dput(path->dentry);
		if (path->mnt != nd->path.mnt)
			mntput(path->mnt);
	} else {
1469 1470 1471 1472 1473 1474
		*inode = d_backing_inode(path->dentry);
		*seqp = 0; /* out of RCU mode, so the value doesn't matter */
	}
	return ret;
}

1475
/*
 * This looks up the name in dcache and possibly revalidates the found dentry.
 * NULL is returned if the dentry does not exist in the cache.
 *
 * If d_revalidate() returns zero the dentry is invalidated and dropped,
 * and ERR_PTR(0) is returned; a negative result is returned as ERR_PTR()
 * after dropping the dentry.
 */
static struct dentry *lookup_dcache(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
{
	struct dentry *dentry = d_lookup(dir, name);
	if (dentry) {
		int error = d_revalidate(dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error)
				d_invalidate(dentry);
			dput(dentry);
			return ERR_PTR(error);
		}
	}
	return dentry;
}

1496
/*
1497 1498 1499 1500 1501
 * Parent directory has inode locked exclusive.  This is one
 * and only case when ->lookup() gets called on non in-lookup
 * dentries - as the matter of fact, this only gets called
 * when directory is guaranteed to have no in-lookup children
 * at all.
1502
 */
1503
static struct dentry *__lookup_hash(const struct qstr *name,
1504
		struct dentry *base, unsigned int flags)
1505
{
1506
	struct dentry *dentry = lookup_dcache(name, base, flags);
1507 1508
	struct dentry *old;
	struct inode *dir = base->d_inode;
1509

1510
	if (dentry)
M
Miklos Szeredi 已提交
1511
		return dentry;
1512

1513 1514 1515 1516
	/* Don't create child dentry for a dead directory. */
	if (unlikely(IS_DEADDIR(dir)))
		return ERR_PTR(-ENOENT);

1517 1518 1519 1520
	dentry = d_alloc(base, name);
	if (unlikely(!dentry))
		return ERR_PTR(-ENOMEM);

1521 1522 1523 1524 1525 1526
	old = dir->i_op->lookup(dir, dentry, flags);
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
	return dentry;
1527 1528
}

1529 1530 1531
static struct dentry *lookup_fast(struct nameidata *nd,
				  struct inode **inode,
			          unsigned *seqp)
L
Linus Torvalds 已提交
1532
{
N
Nick Piggin 已提交
1533
	struct dentry *dentry, *parent = nd->path.dentry;
A
Al Viro 已提交
1534
	int status = 1;
1535

1536 1537
	/*
	 * Rename seqlock is not required here because in the off chance
A
Al Viro 已提交
1538 1539
	 * of a false negative due to a concurrent rename, the caller is
	 * going to fall back to non-racy lookup.
1540
	 */
N
Nick Piggin 已提交
1541 1542
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
1543
		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
A
Al Viro 已提交
1544
		if (unlikely(!dentry)) {
A
Al Viro 已提交
1545
			if (unlazy_walk(nd))
1546 1547
				return ERR_PTR(-ECHILD);
			return NULL;
A
Al Viro 已提交
1548
		}
A
Al Viro 已提交
1549

1550 1551 1552 1553
		/*
		 * This sequence count validates that the inode matches
		 * the dentry name information from lookup.
		 */
1554
		*inode = d_backing_inode(dentry);
A
Al Viro 已提交
1555
		if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
1556
			return ERR_PTR(-ECHILD);
1557 1558 1559 1560 1561 1562 1563 1564

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 *
		 * The memory barrier in read_seqcount_begin of child is
		 *  enough, we can use __read_seqcount_retry here.
		 */
A
Al Viro 已提交
1565
		if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
1566
			return ERR_PTR(-ECHILD);
A
Al Viro 已提交
1567

1568
		*seqp = seq;
1569
		status = d_revalidate(dentry, nd->flags);
1570
		if (likely(status > 0))
1571
			return dentry;
A
Al Viro 已提交
1572
		if (unlazy_child(nd, dentry, seq))
1573
			return ERR_PTR(-ECHILD);
1574 1575 1576
		if (unlikely(status == -ECHILD))
			/* we'd been told to redo it in non-rcu mode */
			status = d_revalidate(dentry, nd->flags);
A
Al Viro 已提交
1577
	} else {
A
Al Viro 已提交
1578
		dentry = __d_lookup(parent, &nd->last);
A
Al Viro 已提交
1579
		if (unlikely(!dentry))
1580
			return NULL;
1581
		status = d_revalidate(dentry, nd->flags);
1582
	}
A
Al Viro 已提交
1583
	if (unlikely(status <= 0)) {
1584
		if (!status)
A
Al Viro 已提交
1585
			d_invalidate(dentry);
1586
		dput(dentry);
1587
		return ERR_PTR(status);
1588
	}
1589
	return dentry;
M
Miklos Szeredi 已提交
1590 1591 1592
}

/* Fast lookup failed, do it the slow way */
A
Al Viro 已提交
1593 1594 1595
static struct dentry *__lookup_slow(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
M
Miklos Szeredi 已提交
1596
{
A
Al Viro 已提交
1597
	struct dentry *dentry, *old;
1598
	struct inode *inode = dir->d_inode;
1599
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1600 1601

	/* Don't go there if it's already dead */
A
Al Viro 已提交
1602
	if (unlikely(IS_DEADDIR(inode)))
A
Al Viro 已提交
1603
		return ERR_PTR(-ENOENT);
A
Al Viro 已提交
1604
again:
1605
	dentry = d_alloc_parallel(dir, name, &wq);
A
Al Viro 已提交
1606
	if (IS_ERR(dentry))
A
Al Viro 已提交
1607
		return dentry;
A
Al Viro 已提交
1608
	if (unlikely(!d_in_lookup(dentry))) {
1609 1610 1611 1612
		int error = d_revalidate(dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error) {
				d_invalidate(dentry);
1613
				dput(dentry);
1614
				goto again;
1615
			}
1616 1617
			dput(dentry);
			dentry = ERR_PTR(error);
1618
		}
A
Al Viro 已提交
1619 1620 1621 1622 1623 1624
	} else {
		old = inode->i_op->lookup(inode, dentry, flags);
		d_lookup_done(dentry);
		if (unlikely(old)) {
			dput(dentry);
			dentry = old;
1625 1626
		}
	}
1627
	return dentry;
L
Linus Torvalds 已提交
1628 1629
}

A
Al Viro 已提交
1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641
/*
 * Locked wrapper: run __lookup_slow() with the directory's inode lock
 * held shared and hand its result straight back.
 */
static struct dentry *lookup_slow(const struct qstr *name,
				  struct dentry *dir,
				  unsigned int flags)
{
	struct inode *dir_inode = dir->d_inode;
	struct dentry *found;

	inode_lock_shared(dir_inode);
	found = __lookup_slow(name, dir, flags);
	inode_unlock_shared(dir_inode);

	return found;
}

1642 1643
static inline int may_lookup(struct user_namespace *mnt_userns,
			     struct nameidata *nd)
1644 1645
{
	if (nd->flags & LOOKUP_RCU) {
1646
		int err = inode_permission(mnt_userns, nd->inode,
1647
					   MAY_EXEC | MAY_NOT_BLOCK);
1648 1649
		if (err != -ECHILD)
			return err;
A
Al Viro 已提交
1650
		if (unlazy_walk(nd))
1651 1652
			return -ECHILD;
	}
1653
	return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
1654 1655
}

1656 1657 1658 1659
static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
{
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
		return -ELOOP;
1660 1661 1662 1663 1664

	if (likely(nd->depth != EMBEDDED_LEVELS))
		return 0;
	if (likely(nd->stack != nd->internal))
		return 0;
1665
	if (likely(nd_alloc_stack(nd)))
1666
		return 0;
1667 1668 1669 1670

	if (nd->flags & LOOKUP_RCU) {
		// we need to grab link before we do unlazy.  And we can't skip
		// unlazy even if we fail to grab the link - cleanup needs it
1671
		bool grabbed_link = legitimize_path(nd, link, seq);
1672 1673 1674 1675 1676 1677

		if (unlazy_walk(nd) != 0 || !grabbed_link)
			return -ECHILD;

		if (nd_alloc_stack(nd))
			return 0;
1678
	}
1679
	return -ENOMEM;
1680 1681
}

1682 1683
/*
 * Flags for step_into() / pick_link():
 *   WALK_TRAILING - a symlink found here is followed only if LOOKUP_FOLLOW
 *                   is set, and following is subject to may_follow_link().
 *   WALK_MORE     - NOTE(review): not referenced in the nearby code;
 *                   confirm its meaning against the callers.
 *   WALK_NOFOLLOW - never follow a symlink found at this step.
 */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

1684
static const char *pick_link(struct nameidata *nd, struct path *link,
1685
		     struct inode *inode, unsigned seq, int flags)
1686
{
A
Al Viro 已提交
1687
	struct saved *last;
1688
	const char *res;
1689
	int error = reserve_stack(nd, link, seq);
1690

1691
	if (unlikely(error)) {
1692
		if (!(nd->flags & LOOKUP_RCU))
A
Al Viro 已提交
1693
			path_put(link);
1694
		return ERR_PTR(error);
1695
	}
1696
	last = nd->stack + nd->depth++;
A
Al Viro 已提交
1697
	last->link = *link;
1698
	clear_delayed_call(&last->done);
1699
	last->seq = seq;
1700

1701
	if (flags & WALK_TRAILING) {
1702 1703 1704 1705 1706
		error = may_follow_link(nd, inode);
		if (unlikely(error))
			return ERR_PTR(error);
	}

1707 1708
	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
			unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756
		return ERR_PTR(-ELOOP);

	if (!(nd->flags & LOOKUP_RCU)) {
		touch_atime(&last->link);
		cond_resched();
	} else if (atime_needs_update(&last->link, inode)) {
		if (unlikely(unlazy_walk(nd)))
			return ERR_PTR(-ECHILD);
		touch_atime(&last->link);
	}

	error = security_inode_follow_link(link->dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
		return ERR_PTR(error);

	res = READ_ONCE(inode->i_link);
	if (!res) {
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
		if (nd->flags & LOOKUP_RCU) {
			res = get(NULL, inode, &last->done);
			if (res == ERR_PTR(-ECHILD)) {
				if (unlikely(unlazy_walk(nd)))
					return ERR_PTR(-ECHILD);
				res = get(link->dentry, inode, &last->done);
			}
		} else {
			res = get(link->dentry, inode, &last->done);
		}
		if (!res)
			goto all_done;
		if (IS_ERR(res))
			return res;
	}
	if (*res == '/') {
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		while (unlikely(*++res == '/'))
			;
	}
	if (*res)
		return res;
all_done: // pure jump
	put_link(nd);
	return NULL;
1757 1758
}

1759 1760 1761 1762 1763 1764
/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 */
1765
static const char *step_into(struct nameidata *nd, int flags,
A
Al Viro 已提交
1766
		     struct dentry *dentry, struct inode *inode, unsigned seq)
1767
{
A
Al Viro 已提交
1768 1769 1770 1771
	struct path path;
	int err = handle_mounts(nd, dentry, &path, &inode, &seq);

	if (err < 0)
1772
		return ERR_PTR(err);
A
Al Viro 已提交
1773
	if (likely(!d_is_symlink(path.dentry)) ||
1774
	   ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
A
Al Viro 已提交
1775
	   (flags & WALK_NOFOLLOW)) {
1776
		/* not a symlink or should not follow */
1777 1778 1779 1780 1781 1782
		if (!(nd->flags & LOOKUP_RCU)) {
			dput(nd->path.dentry);
			if (nd->path.mnt != path.mnt)
				mntput(nd->path.mnt);
		}
		nd->path = path;
1783 1784
		nd->inode = inode;
		nd->seq = seq;
1785
		return NULL;
1786
	}
1787
	if (nd->flags & LOOKUP_RCU) {
1788
		/* make sure that d_is_symlink above matches inode */
A
Al Viro 已提交
1789
		if (read_seqcount_retry(&path.dentry->d_seq, seq))
1790
			return ERR_PTR(-ECHILD);
1791 1792 1793
	} else {
		if (path.mnt == nd->path.mnt)
			mntget(path.mnt);
1794
	}
1795
	return pick_link(nd, &path, inode, seq, flags);
1796 1797
}

1798 1799 1800
static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
					struct inode **inodep,
					unsigned *seqp)
1801
{
A
Al Viro 已提交
1802
	struct dentry *parent, *old;
1803

A
Al Viro 已提交
1804 1805 1806
	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
A
Al Viro 已提交
1807
		struct path path;
1808
		unsigned seq;
A
Al Viro 已提交
1809 1810 1811
		if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
					   &nd->root, &path, &seq))
			goto in_root;
1812 1813 1814 1815 1816 1817 1818 1819
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-ECHILD);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		nd->seq = seq;
		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
			return ERR_PTR(-ECHILD);
		/* we know that mountpoint was pinned */
1820
	}
A
Al Viro 已提交
1821 1822 1823 1824 1825 1826 1827 1828 1829 1830
	old = nd->path.dentry;
	parent = old->d_parent;
	*inodep = parent->d_inode;
	*seqp = read_seqcount_begin(&parent->d_seq);
	if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
		return ERR_PTR(-ECHILD);
	if (unlikely(!path_connected(nd->path.mnt, parent)))
		return ERR_PTR(-ECHILD);
	return parent;
in_root:
1831 1832
	if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
		return ERR_PTR(-ECHILD);
1833 1834 1835
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-ECHILD);
	return NULL;
1836 1837
}

1838 1839 1840
static struct dentry *follow_dotdot(struct nameidata *nd,
				 struct inode **inodep,
				 unsigned *seqp)
1841
{
A
Al Viro 已提交
1842 1843 1844 1845 1846
	struct dentry *parent;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
1847 1848 1849 1850 1851
		struct path path;

		if (!choose_mountpoint(real_mount(nd->path.mnt),
				       &nd->root, &path))
			goto in_root;
1852 1853
		path_put(&nd->path);
		nd->path = path;
1854
		nd->inode = path.dentry->d_inode;
1855 1856
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-EXDEV);
1857
	}
A
Al Viro 已提交
1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868
	/* rare case of legitimate dget_parent()... */
	parent = dget_parent(nd->path.dentry);
	if (unlikely(!path_connected(nd->path.mnt, parent))) {
		dput(parent);
		return ERR_PTR(-ENOENT);
	}
	*seqp = 0;
	*inodep = parent->d_inode;
	return parent;

in_root:
1869 1870 1871 1872
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-EXDEV);
	dget(nd->path.dentry);
	return NULL;
1873 1874
}

1875
static const char *handle_dots(struct nameidata *nd, int type)
1876 1877
{
	if (type == LAST_DOTDOT) {
1878
		const char *error = NULL;
1879 1880 1881
		struct dentry *parent;
		struct inode *inode;
		unsigned seq;
1882 1883

		if (!nd->root.mnt) {
1884
			error = ERR_PTR(set_root(nd));
1885 1886 1887 1888
			if (error)
				return error;
		}
		if (nd->flags & LOOKUP_RCU)
1889
			parent = follow_dotdot_rcu(nd, &inode, &seq);
1890
		else
1891 1892 1893 1894 1895 1896 1897 1898 1899 1900
			parent = follow_dotdot(nd, &inode, &seq);
		if (IS_ERR(parent))
			return ERR_CAST(parent);
		if (unlikely(!parent))
			error = step_into(nd, WALK_NOFOLLOW,
					 nd->path.dentry, nd->inode, nd->seq);
		else
			error = step_into(nd, WALK_NOFOLLOW,
					 parent, inode, seq);
		if (unlikely(error))
1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911
			return error;

		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
			/*
			 * If there was a racing rename or mount along our
			 * path, then we can't be sure that ".." hasn't jumped
			 * above nd->root (and so userspace should retry or use
			 * some fallback).
			 */
			smp_rmb();
			if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
1912
				return ERR_PTR(-EAGAIN);
1913
			if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
1914
				return ERR_PTR(-EAGAIN);
1915 1916
		}
	}
1917
	return NULL;
1918 1919
}

1920
static const char *walk_component(struct nameidata *nd, int flags)
1921
{
1922
	struct dentry *dentry;
1923
	struct inode *inode;
1924
	unsigned seq;
1925 1926 1927 1928 1929
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
1930
	if (unlikely(nd->last_type != LAST_NORM)) {
A
Al Viro 已提交
1931
		if (!(flags & WALK_MORE) && nd->depth)
1932
			put_link(nd);
1933
		return handle_dots(nd, nd->last_type);
1934
	}
1935 1936
	dentry = lookup_fast(nd, &inode, &seq);
	if (IS_ERR(dentry))
1937
		return ERR_CAST(dentry);
1938
	if (unlikely(!dentry)) {
1939 1940
		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
		if (IS_ERR(dentry))
1941
			return ERR_CAST(dentry);
1942
	}
1943 1944
	if (!(flags & WALK_MORE) && nd->depth)
		put_link(nd);
1945
	return step_into(nd, flags, dentry, inode, seq);
1946 1947
}

1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a
 *   page-crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

1967
#include <asm/word-at-a-time.h>
1968

1969
#ifdef HASH_MIX
1970

1971
/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
1972

1973
#elif defined(CONFIG_64BIT)
1974
/*
1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
2002
 */
2003 2004 2005 2006 2007
/*
 * One mixing step (64-bit variant): fold the word @a into the two-word
 * state (@x, @y).  @x and @y are assigned to and appear several times in
 * the expansion, so they must be plain lvalues without side effects; @a is
 * expanded exactly once.  The steps are sequenced by comma operators.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)
2008

2009
/*
2010 2011 2012
 * Fold two longs into one 32-bit hash value.  This must be fast, but
 * latency isn't quite as critical, as there is a fair bit of additional
 * work done before the hash value is used.
2013
 */
2014
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
2015
{
2016 2017 2018
	y ^= x * GOLDEN_RATIO_64;
	y *= GOLDEN_RATIO_64;
	return y >> 32;
2019 2020
}

2021 2022
#else	/* 32-bit case */

2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037
/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 */
/*
 * One mixing step (32-bit variant): fold the word @a into the two-word
 * state (@x, @y).  @x and @y are assigned to and appear several times in
 * the expansion, so they must be plain lvalues without side effects; @a is
 * expanded exactly once.  The steps are sequenced by comma operators.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)
2038

2039
/* Fold the two-word state into one 32-bit value with two __hash_32() passes. */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	unsigned long inner = __hash_32(x);

	return __hash_32(y ^ inner);
}

2045 2046
#endif

2047 2048 2049 2050 2051 2052 2053
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long word, x = 0, y = (unsigned long)salt;

	/* whole words go through the mixing function */
	while (len >= sizeof(unsigned long)) {
		word = load_unaligned_zeropad(name);
		HASH_MIX(x, y, word);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}

	/* a partial final word is masked down to its payload and xor-ed in */
	if (len) {
		word = load_unaligned_zeropad(name);
		x ^= word & bytemask_from_count(len);
	}

	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);

2074
/* Return the "hash_len" (hash and length) of a null-terminated string */
2075
u64 hashlen_string(const void *salt, const char *name)
2076
{
2077 2078
	unsigned long a = 0, x = 0, y = (unsigned long)salt;
	unsigned long adata, mask, len;
2079 2080
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

2081 2082 2083
	len = 0;
	goto inside;

2084
	do {
2085
		HASH_MIX(x, y, a);
2086
		len += sizeof(unsigned long);
2087
inside:
2088 2089 2090 2091 2092
		a = load_unaligned_zeropad(name+len);
	} while (!has_zero(a, &adata, &constants));

	adata = prep_zero_mask(a, adata, &constants);
	mask = create_zero_mask(adata);
2093
	x ^= a & zero_bytemask(mask);
2094

2095
	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2096 2097 2098
}
EXPORT_SYMBOL(hashlen_string);

2099 2100
/*
 * Calculate the length and hash of the path component, and
2101
 * return the "hash_len" as the result.
2102
 */
2103
static inline u64 hash_name(const void *salt, const char *name)
2104
{
2105 2106
	unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
	unsigned long adata, bdata, mask, len;
2107
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2108

2109 2110 2111
	len = 0;
	goto inside;

2112
	do {
2113
		HASH_MIX(x, y, a);
2114
		len += sizeof(unsigned long);
2115
inside:
2116
		a = load_unaligned_zeropad(name+len);
2117 2118 2119 2120 2121 2122
		b = a ^ REPEAT_BYTE('/');
	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

	adata = prep_zero_mask(a, adata, &constants);
	bdata = prep_zero_mask(b, bdata, &constants);
	mask = create_zero_mask(adata | bdata);
2123
	x ^= a & zero_bytemask(mask);
2124

2125
	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2126 2127
}

2128
#else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2129

2130
/* Return the hash of a string of known length, one byte at a time */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);
	unsigned int i;

	for (i = 0; i < len; i++)
		hash = partial_name_hash((unsigned char)name[i], hash);

	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
L
Linus Torvalds 已提交
2139

2140
/* Return the "hash_len" (hash and length) of a null-terminated string */
2141
u64 hashlen_string(const void *salt, const char *name)
2142
{
2143
	unsigned long hash = init_name_hash(salt);
2144 2145 2146
	unsigned long len = 0, c;

	c = (unsigned char)*name;
2147
	while (c) {
2148 2149 2150
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
2151
	}
2152 2153
	return hashlen_create(end_name_hash(hash), len);
}
2154
EXPORT_SYMBOL(hashlen_string);
2155

2156 2157 2158 2159
/*
 * We know there's a real path component here of at least
 * one character.
 */
2160
static inline u64 hash_name(const void *salt, const char *name)
2161
{
2162
	unsigned long hash = init_name_hash(salt);
2163 2164 2165 2166 2167 2168 2169 2170
	unsigned long len = 0, c;

	c = (unsigned char)*name;
	do {
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
	} while (c && c != '/');
2171
	return hashlen_create(end_name_hash(hash), len);
2172 2173
}

2174 2175
#endif

L
Linus Torvalds 已提交
2176 2177
/*
 * Name resolution.
2178 2179
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
L
Linus Torvalds 已提交
2180
 *
2181 2182
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
L
Linus Torvalds 已提交
2183
 */
2184
static int link_path_walk(const char *name, struct nameidata *nd)
L
Linus Torvalds 已提交
2185
{
2186
	int depth = 0; // depth <= nd->depth
L
Linus Torvalds 已提交
2187
	int err;
A
Al Viro 已提交
2188

2189
	nd->last_type = LAST_ROOT;
2190
	nd->flags |= LOOKUP_PARENT;
2191 2192
	if (IS_ERR(name))
		return PTR_ERR(name);
L
Linus Torvalds 已提交
2193 2194
	while (*name=='/')
		name++;
2195 2196
	if (!*name) {
		nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
2197
		return 0;
2198
	}
L
Linus Torvalds 已提交
2199 2200 2201

	/* At this point we know we have a real path component. */
	for(;;) {
2202
		const char *link;
2203
		u64 hash_len;
A
Al Viro 已提交
2204
		int type;
L
Linus Torvalds 已提交
2205

2206
		err = may_lookup(&init_user_ns, nd);
2207
		if (err)
2208
			return err;
L
Linus Torvalds 已提交
2209

2210
		hash_len = hash_name(nd->path.dentry, name);
L
Linus Torvalds 已提交
2211

A
Al Viro 已提交
2212
		type = LAST_NORM;
2213
		if (name[0] == '.') switch (hashlen_len(hash_len)) {
A
Al Viro 已提交
2214
			case 2:
2215
				if (name[1] == '.') {
A
Al Viro 已提交
2216
					type = LAST_DOTDOT;
A
Al Viro 已提交
2217 2218
					nd->flags |= LOOKUP_JUMPED;
				}
A
Al Viro 已提交
2219 2220 2221 2222
				break;
			case 1:
				type = LAST_DOT;
		}
2223 2224
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
A
Al Viro 已提交
2225
			nd->flags &= ~LOOKUP_JUMPED;
2226
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
2227
				struct qstr this = { { .hash_len = hash_len }, .name = name };
2228
				err = parent->d_op->d_hash(parent, &this);
2229
				if (err < 0)
2230
					return err;
2231 2232
				hash_len = this.hash_len;
				name = this.name;
2233 2234
			}
		}
A
Al Viro 已提交
2235

2236 2237
		nd->last.hash_len = hash_len;
		nd->last.name = name;
2238 2239
		nd->last_type = type;

2240 2241
		name += hashlen_len(hash_len);
		if (!*name)
2242
			goto OK;
2243 2244 2245 2246 2247
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
2248 2249
			name++;
		} while (unlikely(*name == '/'));
2250 2251
		if (unlikely(!*name)) {
OK:
2252
			/* pathname or trailing symlink, done */
2253
			if (!depth) {
2254
				nd->dir_uid = i_uid_into_mnt(&init_user_ns, nd->inode);
2255
				nd->dir_mode = nd->inode->i_mode;
2256
				nd->flags &= ~LOOKUP_PARENT;
2257
				return 0;
2258
			}
2259
			/* last component of nested symlink */
2260
			name = nd->stack[--depth].name;
2261
			link = walk_component(nd, 0);
A
Al Viro 已提交
2262 2263
		} else {
			/* not the last component */
2264
			link = walk_component(nd, WALK_MORE);
2265
		}
2266 2267 2268 2269
		if (unlikely(link)) {
			if (IS_ERR(link))
				return PTR_ERR(link);
			/* a symlink to follow */
2270
			nd->stack[depth++].name = name;
2271 2272
			name = link;
			continue;
N
Nick Piggin 已提交
2273
		}
2274 2275
		if (unlikely(!d_can_lookup(nd->path.dentry))) {
			if (nd->flags & LOOKUP_RCU) {
A
Al Viro 已提交
2276
				if (unlazy_walk(nd))
2277 2278
					return -ECHILD;
			}
2279
			return -ENOTDIR;
2280
		}
L
Linus Torvalds 已提交
2281 2282 2283
	}
}

2284
/* must be paired with terminate_walk() */
2285
static const char *path_init(struct nameidata *nd, unsigned flags)
N
Nick Piggin 已提交
2286
{
2287
	int error;
2288
	const char *s = nd->name->name;
N
Nick Piggin 已提交
2289

2290 2291
	if (!*s)
		flags &= ~LOOKUP_RCU;
2292 2293
	if (flags & LOOKUP_RCU)
		rcu_read_lock();
2294

2295
	nd->flags = flags | LOOKUP_JUMPED;
N
Nick Piggin 已提交
2296
	nd->depth = 0;
2297 2298 2299 2300 2301

	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
	smp_rmb();

2302
	if (flags & LOOKUP_ROOT) {
2303 2304
		struct dentry *root = nd->root.dentry;
		struct inode *inode = root->d_inode;
2305 2306
		if (*s && unlikely(!d_can_lookup(root)))
			return ERR_PTR(-ENOTDIR);
2307 2308 2309
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
2310
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2311
			nd->root_seq = nd->seq;
2312 2313 2314
		} else {
			path_get(&nd->path);
		}
2315
		return s;
2316 2317
	}

N
Nick Piggin 已提交
2318
	nd->root.mnt = NULL;
2319 2320
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
N
Nick Piggin 已提交
2321

2322 2323
	/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
	if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
2324 2325 2326 2327
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		return s;
2328 2329 2330 2331
	}

	/* Relative pathname -- get the starting-point it is relative to. */
	if (nd->dfd == AT_FDCWD) {
A
Al Viro 已提交
2332 2333 2334
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;
N
Nick Piggin 已提交
2335

A
Al Viro 已提交
2336 2337 2338
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
2339
				nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2340 2341 2342 2343
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
2344
			nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2345
		}
N
Nick Piggin 已提交
2346
	} else {
2347
		/* Caller must check execute permissions on the starting path component */
2348
		struct fd f = fdget_raw(nd->dfd);
N
Nick Piggin 已提交
2349 2350
		struct dentry *dentry;

2351
		if (!f.file)
2352
			return ERR_PTR(-EBADF);
N
Nick Piggin 已提交
2353

2354
		dentry = f.file->f_path.dentry;
N
Nick Piggin 已提交
2355

2356 2357 2358
		if (*s && unlikely(!d_can_lookup(dentry))) {
			fdput(f);
			return ERR_PTR(-ENOTDIR);
A
Al Viro 已提交
2359
		}
N
Nick Piggin 已提交
2360

2361
		nd->path = f.file->f_path;
A
Al Viro 已提交
2362
		if (flags & LOOKUP_RCU) {
A
Al Viro 已提交
2363 2364
			nd->inode = nd->path.dentry->d_inode;
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
A
Al Viro 已提交
2365
		} else {
2366
			path_get(&nd->path);
A
Al Viro 已提交
2367
			nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2368
		}
A
Al Viro 已提交
2369
		fdput(f);
N
Nick Piggin 已提交
2370
	}
2371

2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382
	/* For scoped-lookups we need to set the root to the dirfd as well. */
	if (flags & LOOKUP_IS_SCOPED) {
		nd->root = nd->path;
		if (flags & LOOKUP_RCU) {
			nd->root_seq = nd->seq;
		} else {
			path_get(&nd->root);
			nd->flags |= LOOKUP_ROOT_GRABBED;
		}
	}
	return s;
2383 2384
}

2385
static inline const char *lookup_last(struct nameidata *nd)
2386 2387 2388 2389
{
	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

2390
	return walk_component(nd, WALK_TRAILING);
2391 2392
}

2393 2394
static int handle_lookup_down(struct nameidata *nd)
{
2395
	if (!(nd->flags & LOOKUP_RCU))
2396
		dget(nd->path.dentry);
2397 2398
	return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
			nd->path.dentry, nd->inode, nd->seq));
2399 2400
}

2401
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2402
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
2403
{
2404
	const char *s = path_init(nd, flags);
2405
	int err;
N
Nick Piggin 已提交
2406

2407
	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
2408
		err = handle_lookup_down(nd);
2409 2410
		if (unlikely(err < 0))
			s = ERR_PTR(err);
2411 2412
	}

2413 2414 2415
	while (!(err = link_path_walk(s, nd)) &&
	       (s = lookup_last(nd)) != NULL)
		;
2416 2417
	if (!err)
		err = complete_walk(nd);
2418

2419 2420
	if (!err && nd->flags & LOOKUP_DIRECTORY)
		if (!d_can_lookup(nd->path.dentry))
A
Al Viro 已提交
2421
			err = -ENOTDIR;
2422 2423 2424 2425
	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
		err = handle_lookup_down(nd);
		nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please...
	}
2426 2427 2428 2429 2430 2431
	if (!err) {
		*path = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2432
	return err;
A
Al Viro 已提交
2433
}
N
Nick Piggin 已提交
2434

2435 2436
int filename_lookup(int dfd, struct filename *name, unsigned flags,
		    struct path *path, struct path *root)
A
Al Viro 已提交
2437
{
2438
	int retval;
2439
	struct nameidata nd;
2440 2441
	if (IS_ERR(name))
		return PTR_ERR(name);
2442 2443 2444 2445
	if (unlikely(root)) {
		nd.root = *root;
		flags |= LOOKUP_ROOT;
	}
2446
	set_nameidata(&nd, dfd, name);
2447
	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
A
Al Viro 已提交
2448
	if (unlikely(retval == -ECHILD))
2449
		retval = path_lookupat(&nd, flags, path);
A
Al Viro 已提交
2450
	if (unlikely(retval == -ESTALE))
2451
		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
N
Nick Piggin 已提交
2452

2453
	if (likely(!retval))
2454 2455
		audit_inode(name, path->dentry,
			    flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
2456
	restore_nameidata();
2457
	putname(name);
2458
	return retval;
L
Linus Torvalds 已提交
2459 2460
}

2461
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2462
static int path_parentat(struct nameidata *nd, unsigned flags,
2463
				struct path *parent)
2464
{
2465
	const char *s = path_init(nd, flags);
2466
	int err = link_path_walk(s, nd);
2467 2468
	if (!err)
		err = complete_walk(nd);
2469 2470 2471 2472 2473 2474
	if (!err) {
		*parent = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2475 2476 2477
	return err;
}

2478
static struct filename *filename_parentat(int dfd, struct filename *name,
2479 2480
				unsigned int flags, struct path *parent,
				struct qstr *last, int *type)
2481 2482
{
	int retval;
2483
	struct nameidata nd;
2484

2485 2486
	if (IS_ERR(name))
		return name;
2487
	set_nameidata(&nd, dfd, name);
2488
	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2489
	if (unlikely(retval == -ECHILD))
2490
		retval = path_parentat(&nd, flags, parent);
2491
	if (unlikely(retval == -ESTALE))
2492
		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2493 2494 2495
	if (likely(!retval)) {
		*last = nd.last;
		*type = nd.last_type;
2496
		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
2497 2498 2499
	} else {
		putname(name);
		name = ERR_PTR(retval);
2500
	}
2501
	restore_nameidata();
2502
	return name;
2503 2504
}

A
Al Viro 已提交
2505 2506
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
2507
{
2508 2509
	struct filename *filename;
	struct dentry *d;
2510 2511
	struct qstr last;
	int type;
2512

2513 2514
	filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
				    &last, &type);
2515 2516
	if (IS_ERR(filename))
		return ERR_CAST(filename);
2517
	if (unlikely(type != LAST_NORM)) {
2518
		path_put(path);
2519 2520
		putname(filename);
		return ERR_PTR(-EINVAL);
A
Al Viro 已提交
2521
	}
A
Al Viro 已提交
2522
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
2523
	d = __lookup_hash(&last, path->dentry, 0);
A
Al Viro 已提交
2524
	if (IS_ERR(d)) {
A
Al Viro 已提交
2525
		inode_unlock(path->dentry->d_inode);
2526
		path_put(path);
A
Al Viro 已提交
2527
	}
2528
	putname(filename);
A
Al Viro 已提交
2529
	return d;
2530 2531
}

A
Al Viro 已提交
2532 2533
int kern_path(const char *name, unsigned int flags, struct path *path)
{
2534 2535
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags, path, NULL);
A
Al Viro 已提交
2536
}
2537
EXPORT_SYMBOL(kern_path);
A
Al Viro 已提交
2538

2539 2540 2541 2542 2543 2544
/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
2545
 * @path: pointer to struct path to fill
2546 2547 2548
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
		    const char *name, unsigned int flags,
2549
		    struct path *path)
2550
{
2551 2552
	struct path root = {.mnt = mnt, .dentry = dentry};
	/* the first argument of filename_lookup() is ignored with root */
2553 2554
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags , path, &root);
2555
}
2556
EXPORT_SYMBOL(vfs_path_lookup);
2557

2558 2559
static int lookup_one_len_common(const char *name, struct dentry *base,
				 int len, struct qstr *this)
2560
{
2561 2562 2563
	this->name = name;
	this->len = len;
	this->hash = full_name_hash(base, name, len);
A
Al Viro 已提交
2564
	if (!len)
2565
		return -EACCES;
A
Al Viro 已提交
2566

A
Al Viro 已提交
2567 2568
	if (unlikely(name[0] == '.')) {
		if (len < 2 || (len == 2 && name[1] == '.'))
2569
			return -EACCES;
A
Al Viro 已提交
2570 2571
	}

A
Al Viro 已提交
2572
	while (len--) {
2573
		unsigned int c = *(const unsigned char *)name++;
A
Al Viro 已提交
2574
		if (c == '/' || c == '\0')
2575
			return -EACCES;
A
Al Viro 已提交
2576
	}
2577 2578 2579 2580 2581
	/*
	 * See if the low-level filesystem might want
	 * to use its own hash..
	 */
	if (base->d_flags & DCACHE_OP_HASH) {
2582
		int err = base->d_op->d_hash(base, this);
2583
		if (err < 0)
2584
			return err;
2585
	}
2586

2587
	return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
2588 2589
}

2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618
/**
 * try_lookup_one_len - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	number of bytes of @name that make up the component
 *
 * Look up a dentry by name in the dcache, returning NULL if it does not
 * currently exist.  The function does not try to create a dentry.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
	struct qstr this;
	int err;

	WARN_ON_ONCE(!inode_is_locked(base->d_inode));

	err = lookup_one_len_common(name, base, len, &this);
	return err ? ERR_PTR(err) : lookup_dcache(&this, base, 0);
}
EXPORT_SYMBOL(try_lookup_one_len);

2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631
/**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
2632
	struct dentry *dentry;
2633 2634 2635 2636 2637 2638
	struct qstr this;
	int err;

	WARN_ON_ONCE(!inode_is_locked(base->d_inode));

	err = lookup_one_len_common(name, base, len, &this);
2639 2640 2641
	if (err)
		return ERR_PTR(err);

2642 2643
	dentry = lookup_dcache(&this, base, 0);
	return dentry ? dentry : __lookup_slow(&this, base, 0);
2644
}
2645
EXPORT_SYMBOL(lookup_one_len);
2646

2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663
/**
 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	number of bytes of @name that make up the component
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_len_unlocked(const char *name,
				       struct dentry *base, int len)
{
	struct dentry *found;
	struct qstr this;
	int err;

	err = lookup_one_len_common(name, base, len, &this);
	if (err)
		return ERR_PTR(err);

	/* dcache first, slow lookup only on a miss */
	found = lookup_dcache(&this, base, 0);
	return found ? found : lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one_len_unlocked);

A
Al Viro 已提交
2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688
/*
 * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
 * on negatives.  Returns known positive or ERR_PTR(); that's what
 * most of the users want.  Note that pinned negative with unlocked parent
 * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
 * need to be very careful; pinned positives have ->d_inode stable, so
 * this one avoids such problems.
 */
struct dentry *lookup_positive_unlocked(const char *name,
				       struct dentry *base, int len)
{
	struct dentry *ret = lookup_one_len_unlocked(name, base, len);
2689
	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
A
Al Viro 已提交
2690 2691 2692 2693 2694 2695 2696
		dput(ret);
		ret = ERR_PTR(-ENOENT);
	}
	return ret;
}
EXPORT_SYMBOL(lookup_positive_unlocked);

2697 2698 2699 2700 2701 2702
#ifdef CONFIG_UNIX98_PTYS
/*
 * Find something mounted on "pts" in the same directory as the input
 * path and redirect @path to it.
 *
 * Returns 0 on success.  Returns -ENOENT when the parent is not connected
 * to the mount (@path is left untouched) or when no usable "pts" entry is
 * found (@path->dentry has then already been replaced by the parent).
 */
int path_pts(struct path *path)
{
	struct dentry *parent = dget_parent(path->dentry);
	struct dentry *child;
	struct qstr this = QSTR_INIT("pts", 3);

	if (unlikely(!path_connected(path->mnt, parent))) {
		dput(parent);
		return -ENOENT;
	}
	dput(path->dentry);
	path->dentry = parent;
	child = d_hash_and_lookup(parent, &this);
	/*
	 * d_hash_and_lookup() reports a miss as NULL but is also documented
	 * to return an ERR_PTR() (from the filesystem's ->d_hash()).  Neither
	 * may end up in path->dentry, where follow_down() would use it.
	 */
	if (IS_ERR_OR_NULL(child))
		return -ENOENT;

	path->dentry = child;
	dput(parent);
	follow_down(path);
	return 0;
}
#endif

2724 2725
/*
 * Look up the user-space pathname @name relative to @dfd and store the
 * result in *@path.  @flags and @empty are handed to getname_flags().
 */
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
		 struct path *path, int *empty)
{
	struct filename *fname = getname_flags(name, flags, empty);

	return filename_lookup(dfd, fname, flags, path, NULL);
}
EXPORT_SYMBOL(user_path_at_empty);
2731

2732 2733
int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
		   struct inode *inode)
L
Linus Torvalds 已提交
2734
{
2735
	kuid_t fsuid = current_fsuid();
2736

2737
	if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
L
Linus Torvalds 已提交
2738
		return 0;
2739
	if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
L
Linus Torvalds 已提交
2740
		return 0;
2741
	return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
L
Linus Torvalds 已提交
2742
}
M
Miklos Szeredi 已提交
2743
EXPORT_SYMBOL(__check_sticky);
L
Linus Torvalds 已提交
2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757

/*
 *	Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do antyhing with
 *     links pointing to it.
2758 2759 2760 2761 2762
 *  7. If the victim has an unknown uid or gid we can't change the inode.
 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 10. We can't remove a root or mountpoint.
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
L
Linus Torvalds 已提交
2763 2764
 *     nfs_async_unlink().
 */
2765 2766
static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
		      struct dentry *victim, bool isdir)
L
Linus Torvalds 已提交
2767
{
2768
	struct inode *inode = d_backing_inode(victim);
L
Linus Torvalds 已提交
2769 2770
	int error;

2771
	if (d_is_negative(victim))
L
Linus Torvalds 已提交
2772
		return -ENOENT;
2773
	BUG_ON(!inode);
L
Linus Torvalds 已提交
2774 2775

	BUG_ON(victim->d_parent->d_inode != dir);
2776 2777

	/* Inode writeback is not safe when the uid or gid are invalid. */
2778 2779
	if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
2780 2781
		return -EOVERFLOW;

2782
	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
L
Linus Torvalds 已提交
2783

2784
	error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2785 2786 2787 2788
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;
2789

2790 2791 2792
	if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) ||
	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
	    HAS_UNMAPPED_ID(mnt_userns, inode))
L
Linus Torvalds 已提交
2793 2794
		return -EPERM;
	if (isdir) {
M
Miklos Szeredi 已提交
2795
		if (!d_is_dir(victim))
L
Linus Torvalds 已提交
2796 2797 2798
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
M
Miklos Szeredi 已提交
2799
	} else if (d_is_dir(victim))
L
Linus Torvalds 已提交
2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}

/*	Check whether we can create an object with dentry child in directory
 *  dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
2813 2814 2815
 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
 *  4. We should have write and exec permissions on dir
 *  5. We can't do it if dir is immutable (done in permission())
L
Linus Torvalds 已提交
2816
 */
2817 2818
static inline int may_create(struct user_namespace *mnt_userns,
			     struct inode *dir, struct dentry *child)
L
Linus Torvalds 已提交
2819
{
2820
	struct user_namespace *s_user_ns;
2821
	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
L
Linus Torvalds 已提交
2822 2823 2824 2825
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
2826
	s_user_ns = dir->i_sb->s_user_ns;
2827 2828
	if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
	    !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
2829
		return -EOVERFLOW;
2830
	return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2831 2832 2833 2834 2835 2836 2837 2838 2839 2840
}

/*
 * p1 and p2 should be directories on the same fs.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p;

	if (p1 == p2) {
A
Al Viro 已提交
2841
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
L
Linus Torvalds 已提交
2842 2843 2844
		return NULL;
	}

2845
	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2846

2847 2848
	p = d_ancestor(p2, p1);
	if (p) {
A
Al Viro 已提交
2849 2850
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
2851
		return p;
L
Linus Torvalds 已提交
2852 2853
	}

2854 2855
	p = d_ancestor(p1, p2);
	if (p) {
A
Al Viro 已提交
2856 2857
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
2858
		return p;
L
Linus Torvalds 已提交
2859 2860
	}

A
Al Viro 已提交
2861 2862
	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
L
Linus Torvalds 已提交
2863 2864
	return NULL;
}
2865
EXPORT_SYMBOL(lock_rename);
L
Linus Torvalds 已提交
2866 2867 2868

void unlock_rename(struct dentry *p1, struct dentry *p2)
{
A
Al Viro 已提交
2869
	inode_unlock(p1->d_inode);
L
Linus Torvalds 已提交
2870
	if (p1 != p2) {
A
Al Viro 已提交
2871
		inode_unlock(p2->d_inode);
2872
		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2873 2874
	}
}
2875
EXPORT_SYMBOL(unlock_rename);
L
Linus Torvalds 已提交
2876

2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894
/**
 * vfs_create - create new file
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dir:	inode of the directory in which the file is created
 * @dentry:	dentry of the file to create; must not have an inode yet
 * @mode:	mode of the new file
 * @want_excl:	whether the file must not yet exist
 *
 * Create a new file.  Only the permission bits in S_IALLUGO are kept from
 * @mode and S_IFREG is forced on.  fsnotify_create() is called when the
 * filesystem's ->create() succeeds.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass init_user_ns.
 *
 * Return: 0 on success; the may_create() or security_inode_create() error;
 * -EACCES if @dir has no ->create() method; otherwise the ->create() error.
 */
int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
	       struct dentry *dentry, umode_t mode, bool want_excl)
{
	int error = may_create(mnt_userns, dir, dentry);
	if (error)
		return error;

	if (!dir->i_op->create)
		return -EACCES;	/* shouldn't it be ENOSYS? */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	error = dir->i_op->create(dir, dentry, mode, want_excl);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_create);
L
Linus Torvalds 已提交
2913

A
Al Viro 已提交
2914 2915 2916 2917 2918
/*
 * vfs_mkobj - create a regular-file object through a caller-supplied callback
 * @dentry:	dentry of the object to create; its parent's inode is the
 *		directory the checks are made against
 * @mode:	requested mode; reduced to S_IALLUGO with S_IFREG forced on
 * @f:		callback that performs the creation
 * @arg:	opaque argument handed to @f
 *
 * Runs the may_create() check (always with init_user_ns) and
 * security_inode_create(), then calls @f(@dentry, mode, @arg).
 * fsnotify_create() is called when @f returns 0.
 *
 * Returns 0 on success or the first error encountered.
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
		int (*f)(struct dentry *, umode_t, void *),
		void *arg)
{
	struct inode *dir = dentry->d_parent->d_inode;
	int error = may_create(&init_user_ns, dir, dentry);
	if (error)
		return error;

	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	error = f(dentry, mode, arg);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_mkobj);

2935 2936 2937 2938 2939 2940
bool may_open_dev(const struct path *path)
{
	return !(path->mnt->mnt_flags & MNT_NODEV) &&
		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}

2941 2942
static int may_open(struct user_namespace *mnt_userns, const struct path *path,
		    int acc_mode, int flag)
L
Linus Torvalds 已提交
2943
{
2944
	struct dentry *dentry = path->dentry;
L
Linus Torvalds 已提交
2945 2946 2947 2948 2949 2950
	struct inode *inode = dentry->d_inode;
	int error;

	if (!inode)
		return -ENOENT;

C
Christoph Hellwig 已提交
2951 2952
	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
L
Linus Torvalds 已提交
2953
		return -ELOOP;
C
Christoph Hellwig 已提交
2954
	case S_IFDIR:
2955
		if (acc_mode & MAY_WRITE)
C
Christoph Hellwig 已提交
2956
			return -EISDIR;
2957 2958
		if (acc_mode & MAY_EXEC)
			return -EACCES;
C
Christoph Hellwig 已提交
2959 2960 2961
		break;
	case S_IFBLK:
	case S_IFCHR:
2962
		if (!may_open_dev(path))
L
Linus Torvalds 已提交
2963
			return -EACCES;
K
Kees Cook 已提交
2964
		fallthrough;
C
Christoph Hellwig 已提交
2965 2966
	case S_IFIFO:
	case S_IFSOCK:
K
Kees Cook 已提交
2967 2968
		if (acc_mode & MAY_EXEC)
			return -EACCES;
L
Linus Torvalds 已提交
2969
		flag &= ~O_TRUNC;
C
Christoph Hellwig 已提交
2970
		break;
2971 2972 2973 2974
	case S_IFREG:
		if ((acc_mode & MAY_EXEC) && path_noexec(path))
			return -EACCES;
		break;
2975
	}
2976

2977
	error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode);
2978 2979
	if (error)
		return error;
M
Mimi Zohar 已提交
2980

L
Linus Torvalds 已提交
2981 2982 2983 2984
	/*
	 * An append-only file must be opened in append mode for writing.
	 */
	if (IS_APPEND(inode)) {
2985
		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2986
			return -EPERM;
L
Linus Torvalds 已提交
2987
		if (flag & O_TRUNC)
2988
			return -EPERM;
L
Linus Torvalds 已提交
2989 2990 2991
	}

	/* O_NOATIME can only be set by the owner or superuser */
2992
	if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode))
2993
		return -EPERM;
L
Linus Torvalds 已提交
2994

2995
	return 0;
2996
}
L
Linus Torvalds 已提交
2997

2998
static int handle_truncate(struct file *filp)
2999
{
A
Al Viro 已提交
3000
	const struct path *path = &filp->f_path;
3001 3002 3003 3004 3005 3006 3007
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;
	/*
	 * Refuse to truncate files with mandatory locks held on them.
	 */
3008
	error = locks_verify_locked(filp);
3009
	if (!error)
3010
		error = security_path_truncate(path);
3011
	if (!error) {
3012
		error = do_truncate(&init_user_ns, path->dentry, 0,
3013
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
3014
				    filp);
3015 3016
	}
	put_write_access(inode);
M
Mimi Zohar 已提交
3017
	return error;
L
Linus Torvalds 已提交
3018 3019
}

3020 3021
/*
 * open_to_namei_flags - normalise the access mode of a set of open flags
 * @flag:	open flags (O_*)
 *
 * If both access-mode bits are set, i.e. (flag & O_ACCMODE) == 3, the value
 * is decremented.  That turns the access mode into 2 and, because the two
 * low bits were both set, changes no other bit.  Every other value is
 * returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	if ((flag & O_ACCMODE) == 3)
		flag--;
	return flag;
}

3027 3028 3029
/*
 * may_o_create - may an O_CREAT open create @dentry in @dir?
 * @mnt_userns:	user namespace used to map the caller's fsuid/fsgid and for
 *		the permission check
 * @dir:	path of the parent directory
 * @dentry:	dentry of the file that would be created
 * @mode:	mode of the file that would be created
 *
 * Checks, in order: security_path_mknod(); that the caller's fsuid and
 * fsgid (mapped through @mnt_userns) have a mapping in the superblock's user
 * namespace (-EOVERFLOW otherwise); write and exec permission on the
 * directory inode; security_inode_create().
 *
 * Returns 0 if creation is allowed or the first error encountered.
 */
static int may_o_create(struct user_namespace *mnt_userns,
			const struct path *dir, struct dentry *dentry,
			umode_t mode)
{
	struct user_namespace *s_user_ns;
	int error = security_path_mknod(dir, dentry, mode, 0);
	if (error)
		return error;

	s_user_ns = dir->dentry->d_sb->s_user_ns;
	if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
	    !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
		return -EOVERFLOW;

	error = inode_permission(mnt_userns, dir->dentry->d_inode,
				 MAY_WRITE | MAY_EXEC);
	if (error)
		return error;

	return security_inode_create(dir->dentry->d_inode, dentry, mode);
}

3049 3050 3051 3052 3053 3054 3055
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Returns 0 if successful.  The file will have been created and attached to
 * @file by the filesystem calling finish_open().
 *
3056 3057 3058
 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
 * be set.  The caller will need to perform the open themselves.  @path will
 * have been updated to point to the new dentry.  This may be negative.
3059 3060 3061
 *
 * Returns an error code otherwise.
 */
3062 3063 3064
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
				  struct file *file,
				  int open_flag, umode_t mode)
M
Miklos Szeredi 已提交
3065
{
3066
	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
M
Miklos Szeredi 已提交
3067 3068 3069 3070 3071 3072
	struct inode *dir =  nd->path.dentry->d_inode;
	int error;

	if (nd->flags & LOOKUP_DIRECTORY)
		open_flag |= O_DIRECTORY;

A
Al Viro 已提交
3073 3074
	file->f_path.dentry = DENTRY_NOT_SET;
	file->f_path.mnt = nd->path.mnt;
3075
	error = dir->i_op->atomic_open(dir, dentry, file,
3076
				       open_to_namei_flags(open_flag), mode);
3077
	d_lookup_done(dentry);
3078
	if (!error) {
3079
		if (file->f_mode & FMODE_OPENED) {
3080 3081 3082 3083
			if (unlikely(dentry != file->f_path.dentry)) {
				dput(dentry);
				dentry = dget(file->f_path.dentry);
			}
3084
		} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
3085
			error = -EIO;
3086
		} else {
3087 3088 3089
			if (file->f_path.dentry) {
				dput(dentry);
				dentry = file->f_path.dentry;
3090
			}
3091
			if (unlikely(d_is_negative(dentry)))
A
Al Viro 已提交
3092
				error = -ENOENT;
3093
		}
M
Miklos Szeredi 已提交
3094
	}
3095 3096 3097 3098 3099
	if (error) {
		dput(dentry);
		dentry = ERR_PTR(error);
	}
	return dentry;
M
Miklos Szeredi 已提交
3100 3101
}

M
Miklos Szeredi 已提交
3102
/*
3103
 * Look up and maybe create and open the last component.
M
Miklos Szeredi 已提交
3104
 *
3105
 * Must be called with parent locked (exclusive in O_CREAT case).
3106
 *
3107 3108 3109 3110 3111 3112 3113
 * Returns 0 on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These case are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case dentry returned in @path might be negative if O_CREAT
 * hadn't been specified.
3114
 *
3115
 * An error code is returned on failure.
M
Miklos Szeredi 已提交
3116
 */
3117 3118 3119
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
				  const struct open_flags *op,
				  bool got_write)
M
Miklos Szeredi 已提交
3120 3121
{
	struct dentry *dir = nd->path.dentry;
3122
	struct inode *dir_inode = dir->d_inode;
3123
	int open_flag = op->open_flag;
M
Miklos Szeredi 已提交
3124
	struct dentry *dentry;
3125 3126
	int error, create_error = 0;
	umode_t mode = op->mode;
3127
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
M
Miklos Szeredi 已提交
3128

3129
	if (unlikely(IS_DEADDIR(dir_inode)))
3130
		return ERR_PTR(-ENOENT);
M
Miklos Szeredi 已提交
3131

3132
	file->f_mode &= ~FMODE_CREATED;
3133 3134 3135 3136 3137
	dentry = d_lookup(dir, &nd->last);
	for (;;) {
		if (!dentry) {
			dentry = d_alloc_parallel(dir, &nd->last, &wq);
			if (IS_ERR(dentry))
3138
				return dentry;
3139 3140 3141
		}
		if (d_in_lookup(dentry))
			break;
M
Miklos Szeredi 已提交
3142

3143 3144 3145 3146 3147 3148 3149 3150 3151 3152
		error = d_revalidate(dentry, nd->flags);
		if (likely(error > 0))
			break;
		if (error)
			goto out_dput;
		d_invalidate(dentry);
		dput(dentry);
		dentry = NULL;
	}
	if (dentry->d_inode) {
3153
		/* Cached positive dentry: will open in f_op->open */
3154
		return dentry;
3155
	}
M
Miklos Szeredi 已提交
3156

3157 3158 3159 3160 3161 3162 3163 3164 3165
	/*
	 * Checking write permission is tricky, bacuse we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists.  But checking existence breaks atomicity.  The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returing the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
	 */
3166 3167
	if (unlikely(!got_write))
		open_flag &= ~O_TRUNC;
3168
	if (open_flag & O_CREAT) {
3169 3170
		if (open_flag & O_EXCL)
			open_flag &= ~O_TRUNC;
3171 3172
		if (!IS_POSIXACL(dir->d_inode))
			mode &= ~current_umask();
3173
		if (likely(got_write))
3174 3175
			create_error = may_o_create(&init_user_ns, &nd->path,
						    dentry, mode);
3176 3177
		else
			create_error = -EROFS;
M
Miklos Szeredi 已提交
3178
	}
3179 3180
	if (create_error)
		open_flag &= ~O_CREAT;
3181
	if (dir_inode->i_op->atomic_open) {
3182
		dentry = atomic_open(nd, dentry, file, open_flag, mode);
3183 3184 3185
		if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
			dentry = ERR_PTR(create_error);
		return dentry;
M
Miklos Szeredi 已提交
3186
	}
3187

3188
	if (d_in_lookup(dentry)) {
3189 3190
		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
							     nd->flags);
3191
		d_lookup_done(dentry);
3192 3193 3194 3195 3196 3197 3198 3199
		if (unlikely(res)) {
			if (IS_ERR(res)) {
				error = PTR_ERR(res);
				goto out_dput;
			}
			dput(dentry);
			dentry = res;
		}
3200 3201
	}

M
Miklos Szeredi 已提交
3202
	/* Negative dentry, just create the file */
3203
	if (!dentry->d_inode && (open_flag & O_CREAT)) {
3204
		file->f_mode |= FMODE_CREATED;
3205 3206 3207
		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
		if (!dir_inode->i_op->create) {
			error = -EACCES;
M
Miklos Szeredi 已提交
3208
			goto out_dput;
3209 3210
		}
		error = dir_inode->i_op->create(dir_inode, dentry, mode,
3211
						open_flag & O_EXCL);
M
Miklos Szeredi 已提交
3212 3213 3214
		if (error)
			goto out_dput;
	}
3215 3216 3217
	if (unlikely(create_error) && !dentry->d_inode) {
		error = create_error;
		goto out_dput;
M
Miklos Szeredi 已提交
3218
	}
3219
	return dentry;
M
Miklos Szeredi 已提交
3220 3221 3222

out_dput:
	dput(dentry);
3223
	return ERR_PTR(error);
M
Miklos Szeredi 已提交
3224 3225
}

3226
static const char *open_last_lookups(struct nameidata *nd,
3227
		   struct file *file, const struct open_flags *op)
3228
{
3229
	struct dentry *dir = nd->path.dentry;
3230
	int open_flag = op->open_flag;
3231
	bool got_write = false;
3232
	unsigned seq;
3233
	struct inode *inode;
3234
	struct dentry *dentry;
3235
	const char *res;
A
Al Viro 已提交
3236
	int error;
3237

3238 3239
	nd->flags |= op->intent;

3240
	if (nd->last_type != LAST_NORM) {
3241 3242
		if (nd->depth)
			put_link(nd);
3243
		return handle_dots(nd, nd->last_type);
3244
	}
3245

3246
	if (!(open_flag & O_CREAT)) {
3247 3248 3249
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		/* we _can_ be in RCU mode here */
3250 3251
		dentry = lookup_fast(nd, &inode, &seq);
		if (IS_ERR(dentry))
3252
			return ERR_CAST(dentry);
3253
		if (likely(dentry))
3254 3255
			goto finish_lookup;

A
Al Viro 已提交
3256
		BUG_ON(nd->flags & LOOKUP_RCU);
3257 3258
	} else {
		/* create side of things */
3259 3260 3261 3262 3263
		if (nd->flags & LOOKUP_RCU) {
			error = unlazy_walk(nd);
			if (unlikely(error))
				return ERR_PTR(error);
		}
3264
		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
3265
		/* trailing slashes? */
3266
		if (unlikely(nd->last.name[nd->last.len]))
3267
			return ERR_PTR(-EISDIR);
3268
	}
A
Al Viro 已提交
3269

3270
	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
3271 3272 3273 3274 3275 3276 3277 3278 3279
		error = mnt_want_write(nd->path.mnt);
		if (!error)
			got_write = true;
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
3280 3281 3282 3283
	if (open_flag & O_CREAT)
		inode_lock(dir->d_inode);
	else
		inode_lock_shared(dir->d_inode);
3284
	dentry = lookup_open(nd, file, op, got_write);
3285 3286
	if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
		fsnotify_create(dir->d_inode, dentry);
3287 3288 3289 3290
	if (open_flag & O_CREAT)
		inode_unlock(dir->d_inode);
	else
		inode_unlock_shared(dir->d_inode);
3291

3292
	if (got_write)
3293
		mnt_drop_write(nd->path.mnt);
M
Miklos Szeredi 已提交
3294

3295 3296 3297
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);

3298
	if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
3299 3300
		dput(nd->path.dentry);
		nd->path.dentry = dentry;
3301
		return NULL;
3302 3303
	}

3304
finish_lookup:
3305 3306
	if (nd->depth)
		put_link(nd);
3307
	res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
3308
	if (unlikely(res))
3309
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
3310
	return res;
3311 3312 3313 3314 3315
}

/*
 * Handle the last step of open()
 */
3316
static int do_open(struct nameidata *nd,
3317 3318 3319 3320 3321 3322 3323
		   struct file *file, const struct open_flags *op)
{
	int open_flag = op->open_flag;
	bool do_truncate;
	int acc_mode;
	int error;

3324 3325 3326 3327 3328
	if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
		error = complete_walk(nd);
		if (error)
			return error;
	}
3329 3330
	if (!(file->f_mode & FMODE_CREATED))
		audit_inode(nd->name, nd->path.dentry, 0);
3331
	if (open_flag & O_CREAT) {
3332 3333
		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
			return -EEXIST;
3334
		if (d_is_dir(nd->path.dentry))
3335
			return -EISDIR;
3336
		error = may_create_in_sticky(&init_user_ns, nd,
3337 3338
					     d_backing_inode(nd->path.dentry));
		if (unlikely(error))
3339
			return error;
3340
	}
M
Miklos Szeredi 已提交
3341
	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3342
		return -ENOTDIR;
3343

3344 3345
	do_truncate = false;
	acc_mode = op->acc_mode;
3346 3347 3348 3349
	if (file->f_mode & FMODE_CREATED) {
		/* Don't check for write permission, don't truncate */
		open_flag &= ~O_TRUNC;
		acc_mode = 0;
3350
	} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
3351 3352
		error = mnt_want_write(nd->path.mnt);
		if (error)
3353
			return error;
3354
		do_truncate = true;
3355
	}
3356
	error = may_open(&init_user_ns, &nd->path, acc_mode, open_flag);
3357
	if (!error && !(file->f_mode & FMODE_OPENED))
A
Al Viro 已提交
3358
		error = vfs_open(&nd->path, file);
3359 3360 3361
	if (!error)
		error = ima_file_check(file, op->acc_mode);
	if (!error && do_truncate)
3362
		error = handle_truncate(file);
3363 3364 3365 3366
	if (unlikely(error > 0)) {
		WARN_ON(1);
		error = -EINVAL;
	}
3367
	if (do_truncate)
3368
		mnt_drop_write(nd->path.mnt);
3369
	return error;
3370 3371
}

3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388
/**
 * vfs_tmpfile - create tmpfile
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dentry:	pointer to dentry of the base directory
 * @mode:	mode of the new tmpfile
 * @open_flag:	open flags; only O_EXCL is looked at here
 *
 * Create a temporary file.  A new dentry is allocated under @dentry and the
 * directory's ->tmpfile() method is asked to instantiate it.  Unless O_EXCL
 * is set in @open_flag the new inode is marked I_LINKABLE.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass init_user_ns.
 *
 * Return: the new dentry on success, otherwise an ERR_PTR(): the
 * inode_permission() error, -EOPNOTSUPP if the directory has no ->tmpfile(),
 * -ENOMEM if no dentry could be allocated, the ->tmpfile() error, or -ENOENT
 * if ->tmpfile() succeeded without attaching an inode.
 */
struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
			   struct dentry *dentry, umode_t mode, int open_flag)
{
	struct dentry *child = NULL;
	struct inode *dir = dentry->d_inode;
	struct inode *inode;
	int error;

	/* we want directory to be writable */
	error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
	if (error)
		goto out_err;
	error = -EOPNOTSUPP;
	if (!dir->i_op->tmpfile)
		goto out_err;
	error = -ENOMEM;
	child = d_alloc(dentry, &slash_name);
	if (unlikely(!child))
		goto out_err;
	error = dir->i_op->tmpfile(dir, child, mode);
	if (error)
		goto out_err;
	error = -ENOENT;
	inode = child->d_inode;
	if (unlikely(!inode))
		goto out_err;
	if (!(open_flag & O_EXCL)) {
		spin_lock(&inode->i_lock);
		inode->i_state |= I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
	ima_post_create_tmpfile(inode);
	return child;

out_err:
	/* child may still be NULL here */
	dput(child);
	return ERR_PTR(error);
}
EXPORT_SYMBOL(vfs_tmpfile);

3427
static int do_tmpfile(struct nameidata *nd, unsigned flags,
3428
		const struct open_flags *op,
3429
		struct file *file)
3430
{
3431
	struct user_namespace *mnt_userns;
3432 3433
	struct dentry *child;
	struct path path;
3434
	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
3435 3436
	if (unlikely(error))
		return error;
3437
	error = mnt_want_write(path.mnt);
3438 3439
	if (unlikely(error))
		goto out;
3440 3441
	mnt_userns = mnt_user_ns(path.mnt);
	child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
3442
	error = PTR_ERR(child);
3443
	if (IS_ERR(child))
3444
		goto out2;
3445 3446
	dput(path.dentry);
	path.dentry = child;
3447
	audit_inode(nd->name, child, 0);
3448
	/* Don't check for other permissions, the inode was just created */
3449
	error = may_open(&init_user_ns, &path, 0, op->open_flag);
3450 3451
	if (error)
		goto out2;
3452
	file->f_path.mnt = path.mnt;
3453
	error = finish_open(file, child, NULL);
3454
out2:
3455
	mnt_drop_write(path.mnt);
3456
out:
3457
	path_put(&path);
3458 3459 3460
	return error;
}

3461 3462 3463 3464 3465 3466
/*
 * do_o_path - open path for the O_PATH case
 * @nd:		lookup state
 * @flags:	lookup flags
 * @file:	file being opened
 *
 * Looks the path up, audits it and opens it with vfs_open(); the path
 * reference obtained by the lookup is dropped afterwards.
 *
 * Returns 0 on success, otherwise the path_lookupat() or vfs_open() error.
 */
static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
	struct path path;
	int error = path_lookupat(nd, flags, &path);
	if (!error) {
		audit_inode(nd->name, path.dentry, 0);
		error = vfs_open(&path, file);
		path_put(&path);
	}
	return error;
}

3473 3474
static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
L
Linus Torvalds 已提交
3475
{
A
Al Viro 已提交
3476
	struct file *file;
3477
	int error;
N
Nick Piggin 已提交
3478

3479
	file = alloc_empty_file(op->open_flag, current_cred());
3480 3481
	if (IS_ERR(file))
		return file;
N
Nick Piggin 已提交
3482

A
Al Viro 已提交
3483
	if (unlikely(file->f_flags & __O_TMPFILE)) {
3484
		error = do_tmpfile(nd, flags, op, file);
3485
	} else if (unlikely(file->f_flags & O_PATH)) {
3486
		error = do_o_path(nd, flags, file);
3487 3488 3489
	} else {
		const char *s = path_init(nd, flags);
		while (!(error = link_path_walk(s, nd)) &&
3490
		       (s = open_last_lookups(nd, file, op)) != NULL)
3491
			;
3492 3493
		if (!error)
			error = do_open(nd, file, op);
3494
		terminate_walk(nd);
3495
	}
3496
	if (likely(!error)) {
3497
		if (likely(file->f_mode & FMODE_OPENED))
3498 3499 3500
			return file;
		WARN_ON(1);
		error = -EINVAL;
3501
	}
3502 3503 3504 3505 3506 3507
	fput(file);
	if (error == -EOPENSTALE) {
		if (flags & LOOKUP_RCU)
			error = -ECHILD;
		else
			error = -ESTALE;
3508
	}
3509
	return ERR_PTR(error);
L
Linus Torvalds 已提交
3510 3511
}

3512
struct file *do_filp_open(int dfd, struct filename *pathname,
3513
		const struct open_flags *op)
3514
{
3515
	struct nameidata nd;
3516
	int flags = op->lookup_flags;
3517 3518
	struct file *filp;

3519
	set_nameidata(&nd, dfd, pathname);
3520
	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
3521
	if (unlikely(filp == ERR_PTR(-ECHILD)))
3522
		filp = path_openat(&nd, op, flags);
3523
	if (unlikely(filp == ERR_PTR(-ESTALE)))
3524
		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
3525
	restore_nameidata();
3526 3527 3528
	return filp;
}

A
Al Viro 已提交
3529
struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3530
		const char *name, const struct open_flags *op)
A
Al Viro 已提交
3531
{
3532
	struct nameidata nd;
A
Al Viro 已提交
3533
	struct file *file;
3534
	struct filename *filename;
3535
	int flags = op->lookup_flags | LOOKUP_ROOT;
A
Al Viro 已提交
3536 3537 3538 3539

	nd.root.mnt = mnt;
	nd.root.dentry = dentry;

3540
	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
A
Al Viro 已提交
3541 3542
		return ERR_PTR(-ELOOP);

3543
	filename = getname_kernel(name);
3544
	if (IS_ERR(filename))
3545 3546
		return ERR_CAST(filename);

3547
	set_nameidata(&nd, -1, filename);
3548
	file = path_openat(&nd, op, flags | LOOKUP_RCU);
A
Al Viro 已提交
3549
	if (unlikely(file == ERR_PTR(-ECHILD)))
3550
		file = path_openat(&nd, op, flags);
A
Al Viro 已提交
3551
	if (unlikely(file == ERR_PTR(-ESTALE)))
3552
		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
3553
	restore_nameidata();
3554
	putname(filename);
A
Al Viro 已提交
3555 3556 3557
	return file;
}

3558
static struct dentry *filename_create(int dfd, struct filename *name,
3559
				struct path *path, unsigned int lookup_flags)
L
Linus Torvalds 已提交
3560
{
3561
	struct dentry *dentry = ERR_PTR(-EEXIST);
3562 3563
	struct qstr last;
	int type;
3564
	int err2;
3565 3566 3567 3568 3569 3570 3571 3572 3573
	int error;
	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);

	/*
	 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
	 * other flags passed in are ignored!
	 */
	lookup_flags &= LOOKUP_REVAL;

3574 3575 3576
	name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
	if (IS_ERR(name))
		return ERR_CAST(name);
L
Linus Torvalds 已提交
3577

3578 3579 3580 3581
	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
3582
	if (unlikely(type != LAST_NORM))
A
Al Viro 已提交
3583
		goto out;
3584

3585
	/* don't fail immediately if it's r/o, at least try to report other errors */
3586
	err2 = mnt_want_write(path->mnt);
3587 3588 3589
	/*
	 * Do the final lookup.
	 */
3590
	lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
A
Al Viro 已提交
3591
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
3592
	dentry = __lookup_hash(&last, path->dentry, lookup_flags);
L
Linus Torvalds 已提交
3593
	if (IS_ERR(dentry))
3594
		goto unlock;
3595

3596
	error = -EEXIST;
3597
	if (d_is_positive(dentry))
3598
		goto fail;
3599

3600 3601 3602 3603 3604 3605
	/*
	 * Special case - lookup gave negative, but... we had foo/bar/
	 * From the vfs_mknod() POV we just have a negative dentry -
	 * all is fine. Let's be bastards - you had / on the end, you've
	 * been asking for (non-existent) directory. -ENOENT for you.
	 */
3606
	if (unlikely(!is_dir && last.name[last.len])) {
3607
		error = -ENOENT;
A
Al Viro 已提交
3608
		goto fail;
3609
	}
3610 3611
	if (unlikely(err2)) {
		error = err2;
3612
		goto fail;
3613
	}
3614
	putname(name);
L
Linus Torvalds 已提交
3615 3616
	return dentry;
fail:
3617 3618 3619
	dput(dentry);
	dentry = ERR_PTR(error);
unlock:
A
Al Viro 已提交
3620
	inode_unlock(path->dentry->d_inode);
3621
	if (!err2)
3622
		mnt_drop_write(path->mnt);
A
Al Viro 已提交
3623
out:
3624
	path_put(path);
3625
	putname(name);
L
Linus Torvalds 已提交
3626 3627
	return dentry;
}
3628 3629 3630 3631

/*
 * kern_path_create - filename_create() for a kernel-space pathname
 *
 * Wraps @pathname with getname_kernel() and passes everything else through
 * to filename_create() unchanged; returns whatever filename_create()
 * returns.
 *
 * NOTE(review): a failed getname_kernel() is not checked here; presumably
 * the callee chain copes with an error pointer - confirm.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	return filename_create(dfd, getname_kernel(pathname),
				path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);

A
Al Viro 已提交
3637 3638 3639
void done_path_create(struct path *path, struct dentry *dentry)
{
	dput(dentry);
A
Al Viro 已提交
3640
	inode_unlock(path->dentry->d_inode);
3641
	mnt_drop_write(path->mnt);
A
Al Viro 已提交
3642 3643 3644 3645
	path_put(path);
}
EXPORT_SYMBOL(done_path_create);

A
Al Viro 已提交
3646
/*
 * user_path_create - filename_create() for a user-space pathname
 *
 * Fetches @pathname with getname() and passes everything else through to
 * filename_create() unchanged; returns whatever filename_create() returns.
 *
 * NOTE(review): a failed getname() is not checked here; presumably the
 * callee chain copes with an error pointer - confirm.
 */
inline struct dentry *user_path_create(int dfd, const char __user *pathname,
				struct path *path, unsigned int lookup_flags)
{
	return filename_create(dfd, getname(pathname), path, lookup_flags);
}
EXPORT_SYMBOL(user_path_create);

3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670
/**
 * vfs_mknod - create device node or file
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dir:	inode of the directory in which the node is created
 * @dentry:	dentry of the node to create; must not have an inode yet
 * @mode:	mode of the new device node or file
 * @dev:	device number of device to create
 *
 * Create a device node or file.  Character and block devices require
 * CAP_MKNOD, except for a character device whose number is WHITEOUT_DEV.
 * fsnotify_create() is called when the filesystem's ->mknod() succeeds.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass init_user_ns.
 *
 * Return: 0 on success; the may_create() error; -EPERM if the capability is
 * missing or @dir has no ->mknod() method; the devcgroup_inode_mknod() or
 * security_inode_mknod() error; otherwise the ->mknod() error.
 */
int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
	      struct dentry *dentry, umode_t mode, dev_t dev)
{
	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
	int error = may_create(mnt_userns, dir, dentry);

	if (error)
		return error;

	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
	    !capable(CAP_MKNOD))
		return -EPERM;

	if (!dir->i_op->mknod)
		return -EPERM;

	error = devcgroup_inode_mknod(mode, dev);
	if (error)
		return error;

	error = security_inode_mknod(dir, dentry, mode, dev);
	if (error)
		return error;

	error = dir->i_op->mknod(dir, dentry, mode, dev);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_mknod);
L
Linus Torvalds 已提交
3699

A
Al Viro 已提交
3700
/*
 * may_mknod - validate the file type requested from mknod
 * @mode:	mode as passed by the caller
 *
 * Returns 0 for regular files, character and block devices, FIFOs, sockets
 * and for a zero type field; -EPERM for directories; -EINVAL for any other
 * file type.
 */
static int may_mknod(umode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
	case 0: /* zero mode translates to S_IFREG */
		return 0;
	case S_IFDIR:
		return -EPERM;
	default:
		return -EINVAL;
	}
}

3717
/*
 * do_mknodat - common implementation of the mknod and mknodat syscalls
 * @dfd:	directory file descriptor @filename is resolved against
 * @filename:	user-space pathname of the node to create
 * @mode:	file type and permission bits of the new node
 * @dev:	device number as supplied by user space; it is decoded with
 *		new_decode_dev() for character and block devices and ignored
 *		for FIFOs and sockets
 *
 * Returns 0 on success or a negative errno.  When retry_estale() asks for
 * it, the whole lookup and creation is repeated with LOOKUP_REVAL set.
 */
static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
		unsigned int dev)
{
	struct user_namespace *mnt_userns;
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = 0;

	error = may_mknod(mode);
	if (error)
		return error;
retry:
	dentry = user_path_create(dfd, filename, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	/* The umask is only applied here when IS_POSIXACL() is false. */
	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = security_path_mknod(&path, dentry, mode, dev);
	if (error)
		goto out;

	mnt_userns = mnt_user_ns(path.mnt);
	/* may_mknod() has already rejected every file type not listed here. */
	switch (mode & S_IFMT) {
	case 0:
	case S_IFREG:
		error = vfs_create(mnt_userns, path.dentry->d_inode,
				   dentry, mode, true);
		if (!error)
			ima_post_path_mknod(dentry);
		break;
	case S_IFCHR:
	case S_IFBLK:
		error = vfs_mknod(mnt_userns, path.dentry->d_inode,
				  dentry, mode, new_decode_dev(dev));
		break;
	case S_IFIFO:
	case S_IFSOCK:
		error = vfs_mknod(mnt_userns, path.dentry->d_inode,
				  dentry, mode, 0);
		break;
	}
out:
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

/* mknodat syscall entry point: forwards all arguments to do_mknodat(). */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned int, dev)
{
	return do_mknodat(dfd, filename, mode, dev);
}

/* mknod syscall entry point: do_mknodat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
	return do_mknodat(AT_FDCWD, filename, mode, dev);
}

/**
 * vfs_mkdir - create directory
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dir:	inode of the directory the new directory is created in
 * @dentry:	dentry of the directory to create
 * @mode:	mode of the new directory
 *
 * Create a directory.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass init_user_ns.
 *
 * Return: 0 on success or a negative errno: the error from may_create() or
 * from the security hook, -EPERM if @dir has no ->mkdir() method, -EMLINK if
 * the superblock sets a link limit and @dir has already reached it, or the
 * error returned by the filesystem's ->mkdir().
 */
int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
	      struct dentry *dentry, umode_t mode)
{
	int error = may_create(mnt_userns, dir, dentry);
	unsigned max_links = dir->i_sb->s_max_links;

	if (error)
		return error;

	if (!dir->i_op->mkdir)
		return -EPERM;

	/* Only the permission bits and the sticky bit are passed on. */
	mode &= (S_IRWXUGO|S_ISVTX);
	error = security_inode_mkdir(dir, dentry, mode);
	if (error)
		return error;

	/* A max_links of zero means no limit is enforced here. */
	if (max_links && dir->i_nlink >= max_links)
		return -EMLINK;

	error = dir->i_op->mkdir(dir, dentry, mode);
	if (!error)
		fsnotify_mkdir(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_mkdir);

/*
 * do_mkdirat - common implementation of the mkdir and mkdirat syscalls
 * @dfd:	directory file descriptor @pathname is resolved against
 * @pathname:	user-space pathname of the directory to create
 * @mode:	permission bits requested for the new directory
 *
 * Returns 0 on success or a negative errno.  When retry_estale() asks for
 * it, the whole lookup and creation is repeated with LOOKUP_REVAL set.
 */
static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = LOOKUP_DIRECTORY;

retry:
	dentry = user_path_create(dfd, pathname, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	/* The umask is only applied here when IS_POSIXACL() is false. */
	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = security_path_mkdir(&path, dentry, mode);
	if (!error) {
		struct user_namespace *mnt_userns;
		mnt_userns = mnt_user_ns(path.mnt);
		error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry, mode);
	}
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

/* mkdirat syscall entry point: forwards all arguments to do_mkdirat(). */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(dfd, pathname, mode);
}

/* mkdir syscall entry point: do_mkdirat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(AT_FDCWD, pathname, mode);
}

/**
 * vfs_rmdir - remove directory
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dir:	inode of @dentry
 * @dentry:	pointer to dentry of the base directory
 *
 * Remove a directory.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
 */
int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
		     struct dentry *dentry)
L
Linus Torvalds 已提交
3873
{
3874
	int error = may_delete(mnt_userns, dir, dentry, 1);
L
Linus Torvalds 已提交
3875 3876 3877 3878

	if (error)
		return error;

A
Al Viro 已提交
3879
	if (!dir->i_op->rmdir)
L
Linus Torvalds 已提交
3880 3881
		return -EPERM;

3882
	dget(dentry);
A
Al Viro 已提交
3883
	inode_lock(dentry->d_inode);
S
Sage Weil 已提交
3884 3885

	error = -EBUSY;
3886
	if (is_local_mountpoint(dentry))
S
Sage Weil 已提交
3887 3888 3889 3890 3891 3892 3893 3894 3895 3896
		goto out;

	error = security_inode_rmdir(dir, dentry);
	if (error)
		goto out;

	error = dir->i_op->rmdir(dir, dentry);
	if (error)
		goto out;

3897
	shrink_dcache_parent(dentry);
S
Sage Weil 已提交
3898 3899
	dentry->d_inode->i_flags |= S_DEAD;
	dont_mount(dentry);
3900
	detach_mounts(dentry);
3901
	fsnotify_rmdir(dir, dentry);
S
Sage Weil 已提交
3902 3903

out:
A
Al Viro 已提交
3904
	inode_unlock(dentry->d_inode);
3905
	dput(dentry);
S
Sage Weil 已提交
3906
	if (!error)
L
Linus Torvalds 已提交
3907 3908 3909
		d_delete(dentry);
	return error;
}
3910
EXPORT_SYMBOL(vfs_rmdir);
L
Linus Torvalds 已提交
3911

3912
long do_rmdir(int dfd, struct filename *name)
L
Linus Torvalds 已提交
3913
{
3914
	struct user_namespace *mnt_userns;
L
Linus Torvalds 已提交
3915 3916
	int error = 0;
	struct dentry *dentry;
3917 3918 3919
	struct path path;
	struct qstr last;
	int type;
3920 3921
	unsigned int lookup_flags = 0;
retry:
3922
	name = filename_parentat(dfd, name, lookup_flags,
A
Al Viro 已提交
3923
				&path, &last, &type);
3924 3925
	if (IS_ERR(name))
		return PTR_ERR(name);
L
Linus Torvalds 已提交
3926

3927
	switch (type) {
3928 3929 3930 3931 3932 3933 3934 3935 3936
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto exit1;
	case LAST_DOT:
		error = -EINVAL;
		goto exit1;
	case LAST_ROOT:
		error = -EBUSY;
		goto exit1;
L
Linus Torvalds 已提交
3937
	}
3938

3939
	error = mnt_want_write(path.mnt);
3940 3941
	if (error)
		goto exit1;
3942

A
Al Viro 已提交
3943
	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
3944
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
L
Linus Torvalds 已提交
3945
	error = PTR_ERR(dentry);
3946 3947
	if (IS_ERR(dentry))
		goto exit2;
3948 3949 3950 3951
	if (!dentry->d_inode) {
		error = -ENOENT;
		goto exit3;
	}
3952
	error = security_path_rmdir(&path, dentry);
3953
	if (error)
3954
		goto exit3;
3955 3956
	mnt_userns = mnt_user_ns(path.mnt);
	error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
3957
exit3:
3958 3959
	dput(dentry);
exit2:
A
Al Viro 已提交
3960
	inode_unlock(path.dentry->d_inode);
3961
	mnt_drop_write(path.mnt);
L
Linus Torvalds 已提交
3962
exit1:
3963
	path_put(&path);
3964 3965 3966 3967
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
A
Al Viro 已提交
3968
	putname(name);
L
Linus Torvalds 已提交
3969 3970 3971
	return error;
}

3972
/* rmdir syscall entry point: do_rmdir() relative to AT_FDCWD. */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
	return do_rmdir(AT_FDCWD, getname(pathname));
}

/**
 * vfs_unlink - unlink a filesystem object
3979
 * @mnt_userns:	user namespace of the mount the inode was found from
3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994
 * @dir:	parent directory
 * @dentry:	victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
3995 3996 3997 3998 3999 4000
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
4001
 */
4002 4003
int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
	       struct dentry *dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
4004
{
J
J. Bruce Fields 已提交
4005
	struct inode *target = dentry->d_inode;
4006
	int error = may_delete(mnt_userns, dir, dentry, 0);
L
Linus Torvalds 已提交
4007 4008 4009 4010

	if (error)
		return error;

A
Al Viro 已提交
4011
	if (!dir->i_op->unlink)
L
Linus Torvalds 已提交
4012 4013
		return -EPERM;

A
Al Viro 已提交
4014
	inode_lock(target);
4015
	if (is_local_mountpoint(dentry))
L
Linus Torvalds 已提交
4016 4017 4018
		error = -EBUSY;
	else {
		error = security_inode_unlink(dir, dentry);
4019
		if (!error) {
4020 4021
			error = try_break_deleg(target, delegated_inode);
			if (error)
4022
				goto out;
L
Linus Torvalds 已提交
4023
			error = dir->i_op->unlink(dir, dentry);
4024
			if (!error) {
4025
				dont_mount(dentry);
4026
				detach_mounts(dentry);
4027
				fsnotify_unlink(dir, dentry);
4028
			}
4029
		}
L
Linus Torvalds 已提交
4030
	}
4031
out:
A
Al Viro 已提交
4032
	inode_unlock(target);
L
Linus Torvalds 已提交
4033 4034 4035

	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
J
J. Bruce Fields 已提交
4036
		fsnotify_link_count(target);
J
John McCutchan 已提交
4037
		d_delete(dentry);
L
Linus Torvalds 已提交
4038
	}
R
Robert Love 已提交
4039

L
Linus Torvalds 已提交
4040 4041
	return error;
}
4042
EXPORT_SYMBOL(vfs_unlink);
L
Linus Torvalds 已提交
4043 4044 4045

/*
 * Make sure that the actual truncation of the file will occur outside its
4046
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
L
Linus Torvalds 已提交
4047 4048 4049
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 */
4050
long do_unlinkat(int dfd, struct filename *name)
L
Linus Torvalds 已提交
4051
{
4052
	int error;
L
Linus Torvalds 已提交
4053
	struct dentry *dentry;
4054 4055 4056
	struct path path;
	struct qstr last;
	int type;
L
Linus Torvalds 已提交
4057
	struct inode *inode = NULL;
4058
	struct inode *delegated_inode = NULL;
4059 4060
	unsigned int lookup_flags = 0;
retry:
4061
	name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
4062 4063
	if (IS_ERR(name))
		return PTR_ERR(name);
4064

L
Linus Torvalds 已提交
4065
	error = -EISDIR;
4066
	if (type != LAST_NORM)
L
Linus Torvalds 已提交
4067
		goto exit1;
4068

4069
	error = mnt_want_write(path.mnt);
4070 4071
	if (error)
		goto exit1;
4072
retry_deleg:
A
Al Viro 已提交
4073
	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
4074
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
L
Linus Torvalds 已提交
4075 4076
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
4077 4078
		struct user_namespace *mnt_userns;

L
Linus Torvalds 已提交
4079
		/* Why not before? Because we want correct error value */
4080
		if (last.name[last.len])
4081
			goto slashes;
L
Linus Torvalds 已提交
4082
		inode = dentry->d_inode;
4083
		if (d_is_negative(dentry))
4084 4085
			goto slashes;
		ihold(inode);
4086
		error = security_path_unlink(&path, dentry);
4087
		if (error)
4088
			goto exit2;
4089 4090
		mnt_userns = mnt_user_ns(path.mnt);
		error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry, &delegated_inode);
4091
exit2:
L
Linus Torvalds 已提交
4092 4093
		dput(dentry);
	}
A
Al Viro 已提交
4094
	inode_unlock(path.dentry->d_inode);
L
Linus Torvalds 已提交
4095 4096
	if (inode)
		iput(inode);	/* truncate the inode here */
4097 4098
	inode = NULL;
	if (delegated_inode) {
4099
		error = break_deleg_wait(&delegated_inode);
4100 4101 4102
		if (!error)
			goto retry_deleg;
	}
4103
	mnt_drop_write(path.mnt);
L
Linus Torvalds 已提交
4104
exit1:
4105
	path_put(&path);
4106 4107 4108 4109 4110
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		inode = NULL;
		goto retry;
	}
4111
	putname(name);
L
Linus Torvalds 已提交
4112 4113 4114
	return error;

slashes:
4115 4116
	if (d_is_negative(dentry))
		error = -ENOENT;
M
Miklos Szeredi 已提交
4117
	else if (d_is_dir(dentry))
4118 4119 4120
		error = -EISDIR;
	else
		error = -ENOTDIR;
L
Linus Torvalds 已提交
4121 4122 4123
	goto exit2;
}

4124
/*
 * unlinkat syscall entry point.  AT_REMOVEDIR is the only flag accepted
 * (anything else yields -EINVAL); it selects do_rmdir() over do_unlinkat().
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
	if ((flag & ~AT_REMOVEDIR) != 0)
		return -EINVAL;

	if (flag & AT_REMOVEDIR)
		return do_rmdir(dfd, getname(pathname));
	return do_unlinkat(dfd, getname(pathname));
}

/* unlink syscall entry point: do_unlinkat() relative to AT_FDCWD. */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
	return do_unlinkat(AT_FDCWD, getname(pathname));
}

/**
 * vfs_symlink - create symlink
 * @mnt_userns:	user namespace of the mount the inode was found from
 * @dir:	inode of @dentry
 * @dentry:	pointer to dentry of the base directory
 * @oldname:	name of the file to link to
 *
 * Create a symlink.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
 */
int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
		struct dentry *dentry, const char *oldname)
L
Linus Torvalds 已提交
4156
{
4157
	int error = may_create(mnt_userns, dir, dentry);
L
Linus Torvalds 已提交
4158 4159 4160 4161

	if (error)
		return error;

A
Al Viro 已提交
4162
	if (!dir->i_op->symlink)
L
Linus Torvalds 已提交
4163 4164 4165 4166 4167 4168 4169
		return -EPERM;

	error = security_inode_symlink(dir, dentry, oldname);
	if (error)
		return error;

	error = dir->i_op->symlink(dir, dentry, oldname);
4170
	if (!error)
4171
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
4172 4173
	return error;
}
4174
EXPORT_SYMBOL(vfs_symlink);
L
Linus Torvalds 已提交
4175

4176
/*
 * do_symlinkat - common implementation of the symlink and symlinkat syscalls
 * @oldname:	user-space string the new symlink will point to
 * @newdfd:	directory file descriptor @newname is resolved against
 * @newname:	user-space pathname of the symlink to create
 *
 * Returns 0 on success or a negative errno.  The copy of @oldname obtained
 * with getname() is released on every path after it was obtained.  When
 * retry_estale() asks for it, creation is repeated with LOOKUP_REVAL set.
 */
static long do_symlinkat(const char __user *oldname, int newdfd,
		  const char __user *newname)
{
	int error;
	struct filename *from;
	struct dentry *dentry;
	struct path path;
	unsigned int lookup_flags = 0;

	from = getname(oldname);
	if (IS_ERR(from))
		return PTR_ERR(from);
retry:
	dentry = user_path_create(newdfd, newname, &path, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_putname;

	error = security_path_symlink(&path, dentry, from->name);
	if (!error) {
		struct user_namespace *mnt_userns;

		mnt_userns = mnt_user_ns(path.mnt);
		error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry,
				    from->name);
	}
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out_putname:
	putname(from);
	return error;
}

/* symlinkat syscall entry point: forwards all arguments to do_symlinkat(). */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	return do_symlinkat(oldname, newdfd, newname);
}

/* symlink syscall entry point: do_symlinkat() with AT_FDCWD for the new name. */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
	return do_symlinkat(oldname, AT_FDCWD, newname);
}

/**
 * vfs_link - create a new link
 * @old_dentry:	object to be linked
4226
 * @mnt_userns:	the user namespace of the mount
J
J. Bruce Fields 已提交
4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241
 * @dir:	new parent
 * @new_dentry:	where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
4242 4243 4244 4245 4246 4247
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply passs init_user_ns.
J
J. Bruce Fields 已提交
4248
 */
4249 4250 4251
int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns,
	     struct inode *dir, struct dentry *new_dentry,
	     struct inode **delegated_inode)
L
Linus Torvalds 已提交
4252 4253
{
	struct inode *inode = old_dentry->d_inode;
4254
	unsigned max_links = dir->i_sb->s_max_links;
L
Linus Torvalds 已提交
4255 4256 4257 4258 4259
	int error;

	if (!inode)
		return -ENOENT;

4260
	error = may_create(mnt_userns, dir, new_dentry);
L
Linus Torvalds 已提交
4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271
	if (error)
		return error;

	if (dir->i_sb != inode->i_sb)
		return -EXDEV;

	/*
	 * A link to an append-only or immutable file cannot be created.
	 */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return -EPERM;
4272 4273 4274 4275 4276
	/*
	 * Updating the link count will likely cause i_uid and i_gid to
	 * be writen back improperly if their true value is unknown to
	 * the vfs.
	 */
4277
	if (HAS_UNMAPPED_ID(mnt_userns, inode))
4278
		return -EPERM;
A
Al Viro 已提交
4279
	if (!dir->i_op->link)
L
Linus Torvalds 已提交
4280
		return -EPERM;
4281
	if (S_ISDIR(inode->i_mode))
L
Linus Torvalds 已提交
4282 4283 4284 4285 4286 4287
		return -EPERM;

	error = security_inode_link(old_dentry, dir, new_dentry);
	if (error)
		return error;

A
Al Viro 已提交
4288
	inode_lock(inode);
4289
	/* Make sure we don't allow creating hardlink to an unlinked file */
4290
	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
4291
		error =  -ENOENT;
4292 4293
	else if (max_links && inode->i_nlink >= max_links)
		error = -EMLINK;
J
J. Bruce Fields 已提交
4294 4295 4296 4297 4298
	else {
		error = try_break_deleg(inode, delegated_inode);
		if (!error)
			error = dir->i_op->link(old_dentry, dir, new_dentry);
	}
4299 4300 4301 4302 4303 4304

	if (!error && (inode->i_state & I_LINKABLE)) {
		spin_lock(&inode->i_lock);
		inode->i_state &= ~I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
A
Al Viro 已提交
4305
	inode_unlock(inode);
4306
	if (!error)
4307
		fsnotify_link(dir, inode, new_dentry);
L
Linus Torvalds 已提交
4308 4309
	return error;
}
4310
EXPORT_SYMBOL(vfs_link);
L
Linus Torvalds 已提交
4311 4312 4313 4314 4315 4316 4317 4318 4319 4320

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 */
4321
static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
4322
	      const char __user *newname, int flags)
L
Linus Torvalds 已提交
4323
{
4324
	struct user_namespace *mnt_userns;
L
Linus Torvalds 已提交
4325
	struct dentry *new_dentry;
4326
	struct path old_path, new_path;
J
J. Bruce Fields 已提交
4327
	struct inode *delegated_inode = NULL;
4328
	int how = 0;
L
Linus Torvalds 已提交
4329 4330
	int error;

4331
	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
4332
		return -EINVAL;
4333
	/*
4334 4335 4336
	 * To use null names we require CAP_DAC_READ_SEARCH
	 * This ensures that not everyone will be able to create
	 * handlink using the passed filedescriptor.
4337
	 */
4338 4339 4340
	if (flags & AT_EMPTY_PATH) {
		if (!capable(CAP_DAC_READ_SEARCH))
			return -ENOENT;
4341
		how = LOOKUP_EMPTY;
4342
	}
4343 4344 4345

	if (flags & AT_SYMLINK_FOLLOW)
		how |= LOOKUP_FOLLOW;
4346
retry:
4347
	error = user_path_at(olddfd, oldname, how, &old_path);
L
Linus Torvalds 已提交
4348
	if (error)
4349 4350
		return error;

4351 4352
	new_dentry = user_path_create(newdfd, newname, &new_path,
					(how & LOOKUP_REVAL));
L
Linus Torvalds 已提交
4353
	error = PTR_ERR(new_dentry);
4354
	if (IS_ERR(new_dentry))
4355 4356 4357 4358 4359
		goto out;

	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto out_dput;
4360
	error = may_linkat(&init_user_ns, &old_path);
K
Kees Cook 已提交
4361 4362
	if (unlikely(error))
		goto out_dput;
4363
	error = security_path_link(old_path.dentry, &new_path, new_dentry);
4364
	if (error)
4365
		goto out_dput;
4366 4367 4368
	mnt_userns = mnt_user_ns(new_path.mnt);
	error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
			 new_dentry, &delegated_inode);
4369
out_dput:
A
Al Viro 已提交
4370
	done_path_create(&new_path, new_dentry);
J
J. Bruce Fields 已提交
4371 4372
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
4373 4374
		if (!error) {
			path_put(&old_path);
J
J. Bruce Fields 已提交
4375
			goto retry;
4376
		}
J
J. Bruce Fields 已提交
4377
	}
4378
	if (retry_estale(error, how)) {
4379
		path_put(&old_path);
4380 4381 4382
		how |= LOOKUP_REVAL;
		goto retry;
	}
L
Linus Torvalds 已提交
4383
out:
4384
	path_put(&old_path);
L
Linus Torvalds 已提交
4385 4386 4387 4388

	return error;
}

4389 4390 4391 4392 4393 4394
/* linkat syscall entry point: forwards all arguments to do_linkat(). */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, int, flags)
{
	return do_linkat(olddfd, oldname, newdfd, newname, flags);
}

/* link syscall entry point: do_linkat() relative to AT_FDCWD with no flags. */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
	return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}

/**
 * vfs_rename - rename a filesystem object
4402 4403 4404 4405 4406 4407 4408 4409
 * @old_mnt_userns:	old user namespace of the mount the inode was found from
 * @old_dir:		parent of source
 * @old_dentry:		source
 * @new_mnt_userns:	new user namespace of the mount the inode was found from
 * @new_dir:		parent of destination
 * @new_dentry:		destination
 * @delegated_inode:	returns an inode needing a delegation break
 * @flags:		rename flags
4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423
 *
 * The caller must hold multiple mutexes--see lock_rename()).
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
L
Linus Torvalds 已提交
4424 4425 4426
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
4427
 *
4428
 *	a) we can get into loop creation.
L
Linus Torvalds 已提交
4429 4430
 *	b) race potential - two innocent renames can create a loop together.
 *	   That's where 4.4 screws up. Current fix: serialization on
4431
 *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
L
Linus Torvalds 已提交
4432
 *	   story.
4433 4434
 *	c) we have to lock _four_ objects - parents and victim (if it exists),
 *	   and source (if it is not a directory).
4435
 *	   And that - after we got ->i_mutex on parents (until then we don't know
L
Linus Torvalds 已提交
4436 4437
 *	   whether the target exists).  Solution: try to be smart with locking
 *	   order for inodes.  We rely on the fact that tree topology may change
4438
 *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
L
Linus Torvalds 已提交
4439 4440 4441
 *	   move will be locked.  Thus we can rank directories by the tree
 *	   (ancestors first) and rank all non-directories after them.
 *	   That works since everybody except rename does "lock parent, lookup,
4442
 *	   lock child" and rename is under ->s_vfs_rename_mutex.
L
Linus Torvalds 已提交
4443 4444 4445
 *	   HOWEVER, it relies on the assumption that any object with ->lookup()
 *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *	   we'd better make sure that there's no link(2) for them.
4446
 *	d) conversion from fhandle to dentry may come in the wrong moment - when
4447
 *	   we are removing the target. Solution: we will have to grab ->i_mutex
L
Linus Torvalds 已提交
4448
 *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4449
 *	   ->i_mutex on parents, which works but leads to some truly excessive
L
Linus Torvalds 已提交
4450 4451
 *	   locking].
 */
4452
int vfs_rename(struct renamedata *rd)
L
Linus Torvalds 已提交
4453
{
4454
	int error;
4455 4456 4457 4458 4459
	struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
	struct dentry *old_dentry = rd->old_dentry;
	struct dentry *new_dentry = rd->new_dentry;
	struct inode **delegated_inode = rd->delegated_inode;
	unsigned int flags = rd->flags;
4460 4461
	bool is_dir = d_is_dir(old_dentry);
	struct inode *source = old_dentry->d_inode;
S
Sage Weil 已提交
4462
	struct inode *target = new_dentry->d_inode;
M
Miklos Szeredi 已提交
4463 4464
	bool new_is_dir = false;
	unsigned max_links = new_dir->i_sb->s_max_links;
A
Al Viro 已提交
4465
	struct name_snapshot old_name;
4466

4467
	if (source == target)
4468 4469
		return 0;

4470
	error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir);
4471 4472 4473
	if (error)
		return error;

M
Miklos Szeredi 已提交
4474
	if (!target) {
4475
		error = may_create(rd->new_mnt_userns, new_dir, new_dentry);
M
Miklos Szeredi 已提交
4476 4477 4478 4479
	} else {
		new_is_dir = d_is_dir(new_dentry);

		if (!(flags & RENAME_EXCHANGE))
4480 4481
			error = may_delete(rd->new_mnt_userns, new_dir,
					   new_dentry, is_dir);
M
Miklos Szeredi 已提交
4482
		else
4483 4484
			error = may_delete(rd->new_mnt_userns, new_dir,
					   new_dentry, new_is_dir);
M
Miklos Szeredi 已提交
4485
	}
4486 4487 4488
	if (error)
		return error;

4489
	if (!old_dir->i_op->rename)
4490
		return -EPERM;
L
Linus Torvalds 已提交
4491 4492 4493 4494 4495

	/*
	 * If we are going to change the parent - check write permissions,
	 * we'll need to flip '..'.
	 */
M
Miklos Szeredi 已提交
4496 4497
	if (new_dir != old_dir) {
		if (is_dir) {
4498
			error = inode_permission(rd->old_mnt_userns, source,
4499
						 MAY_WRITE);
M
Miklos Szeredi 已提交
4500 4501 4502 4503
			if (error)
				return error;
		}
		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
4504
			error = inode_permission(rd->new_mnt_userns, target,
4505
						 MAY_WRITE);
M
Miklos Szeredi 已提交
4506 4507 4508
			if (error)
				return error;
		}
L
Linus Torvalds 已提交
4509 4510
	}

4511 4512
	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
				      flags);
L
Linus Torvalds 已提交
4513 4514 4515
	if (error)
		return error;

A
Al Viro 已提交
4516
	take_dentry_name_snapshot(&old_name, old_dentry);
4517
	dget(new_dentry);
M
Miklos Szeredi 已提交
4518
	if (!is_dir || (flags & RENAME_EXCHANGE))
4519 4520
		lock_two_nondirectories(source, target);
	else if (target)
A
Al Viro 已提交
4521
		inode_lock(target);
S
Sage Weil 已提交
4522 4523

	error = -EBUSY;
4524
	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
S
Sage Weil 已提交
4525 4526
		goto out;

M
Miklos Szeredi 已提交
4527
	if (max_links && new_dir != old_dir) {
4528
		error = -EMLINK;
M
Miklos Szeredi 已提交
4529
		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4530
			goto out;
M
Miklos Szeredi 已提交
4531 4532 4533 4534 4535
		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
		    old_dir->i_nlink >= max_links)
			goto out;
	}
	if (!is_dir) {
4536
		error = try_break_deleg(source, delegated_inode);
4537 4538
		if (error)
			goto out;
M
Miklos Szeredi 已提交
4539 4540 4541 4542 4543
	}
	if (target && !new_is_dir) {
		error = try_break_deleg(target, delegated_inode);
		if (error)
			goto out;
4544
	}
4545
	error = old_dir->i_op->rename(old_dir, old_dentry,
M
Miklos Szeredi 已提交
4546
				       new_dir, new_dentry, flags);
S
Sage Weil 已提交
4547 4548 4549
	if (error)
		goto out;

M
Miklos Szeredi 已提交
4550
	if (!(flags & RENAME_EXCHANGE) && target) {
4551 4552
		if (is_dir) {
			shrink_dcache_parent(new_dentry);
4553
			target->i_flags |= S_DEAD;
4554
		}
S
Sage Weil 已提交
4555
		dont_mount(new_dentry);
4556
		detach_mounts(new_dentry);
4557
	}
M
Miklos Szeredi 已提交
4558 4559 4560 4561 4562 4563
	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
		if (!(flags & RENAME_EXCHANGE))
			d_move(old_dentry, new_dentry);
		else
			d_exchange(old_dentry, new_dentry);
	}
S
Sage Weil 已提交
4564
out:
M
Miklos Szeredi 已提交
4565
	if (!is_dir || (flags & RENAME_EXCHANGE))
4566 4567
		unlock_two_nondirectories(source, target);
	else if (target)
A
Al Viro 已提交
4568
		inode_unlock(target);
L
Linus Torvalds 已提交
4569
	dput(new_dentry);
M
Miklos Szeredi 已提交
4570
	if (!error) {
4571
		fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
M
Miklos Szeredi 已提交
4572 4573
			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
		if (flags & RENAME_EXCHANGE) {
4574
			fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
M
Miklos Szeredi 已提交
4575 4576 4577
				      new_is_dir, NULL, new_dentry);
		}
	}
A
Al Viro 已提交
4578
	release_dentry_name_snapshot(&old_name);
R
Robert Love 已提交
4579

L
Linus Torvalds 已提交
4580 4581
	return error;
}
4582
EXPORT_SYMBOL(vfs_rename);
L
Linus Torvalds 已提交
4583

4584 4585
int do_renameat2(int olddfd, struct filename *from, int newdfd,
		 struct filename *to, unsigned int flags)
L
Linus Torvalds 已提交
4586
{
4587
	struct renamedata rd;
4588 4589
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
4590 4591 4592
	struct path old_path, new_path;
	struct qstr old_last, new_last;
	int old_type, new_type;
4593
	struct inode *delegated_inode = NULL;
4594
	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
4595
	bool should_retry = false;
4596
	int error = -EINVAL;
M
Miklos Szeredi 已提交
4597

M
Miklos Szeredi 已提交
4598
	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
4599
		goto put_both;
M
Miklos Szeredi 已提交
4600

M
Miklos Szeredi 已提交
4601 4602
	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
	    (flags & RENAME_EXCHANGE))
4603
		goto put_both;
M
Miklos Szeredi 已提交
4604

4605 4606 4607
	if (flags & RENAME_EXCHANGE)
		target_flags = 0;

4608
retry:
4609 4610
	from = filename_parentat(olddfd, from, lookup_flags, &old_path,
					&old_last, &old_type);
4611 4612
	if (IS_ERR(from)) {
		error = PTR_ERR(from);
4613
		goto put_new;
4614
	}
L
Linus Torvalds 已提交
4615

4616 4617
	to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
				&new_type);
4618 4619
	if (IS_ERR(to)) {
		error = PTR_ERR(to);
L
Linus Torvalds 已提交
4620
		goto exit1;
4621
	}
L
Linus Torvalds 已提交
4622 4623

	error = -EXDEV;
4624
	if (old_path.mnt != new_path.mnt)
L
Linus Torvalds 已提交
4625 4626 4627
		goto exit2;

	error = -EBUSY;
4628
	if (old_type != LAST_NORM)
L
Linus Torvalds 已提交
4629 4630
		goto exit2;

M
Miklos Szeredi 已提交
4631 4632
	if (flags & RENAME_NOREPLACE)
		error = -EEXIST;
4633
	if (new_type != LAST_NORM)
L
Linus Torvalds 已提交
4634 4635
		goto exit2;

4636
	error = mnt_want_write(old_path.mnt);
4637 4638 4639
	if (error)
		goto exit2;

4640
retry_deleg:
4641
	trap = lock_rename(new_path.dentry, old_path.dentry);
L
Linus Torvalds 已提交
4642

4643
	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
L
Linus Torvalds 已提交
4644 4645 4646 4647 4648
	error = PTR_ERR(old_dentry);
	if (IS_ERR(old_dentry))
		goto exit3;
	/* source must exist */
	error = -ENOENT;
4649
	if (d_is_negative(old_dentry))
L
Linus Torvalds 已提交
4650
		goto exit4;
4651
	new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
M
Miklos Szeredi 已提交
4652 4653 4654 4655 4656 4657
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto exit4;
	error = -EEXIST;
	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
		goto exit5;
M
Miklos Szeredi 已提交
4658 4659 4660 4661 4662 4663 4664
	if (flags & RENAME_EXCHANGE) {
		error = -ENOENT;
		if (d_is_negative(new_dentry))
			goto exit5;

		if (!d_is_dir(new_dentry)) {
			error = -ENOTDIR;
4665
			if (new_last.name[new_last.len])
M
Miklos Szeredi 已提交
4666 4667 4668
				goto exit5;
		}
	}
L
Linus Torvalds 已提交
4669
	/* unless the source is a directory trailing slashes give -ENOTDIR */
M
Miklos Szeredi 已提交
4670
	if (!d_is_dir(old_dentry)) {
L
Linus Torvalds 已提交
4671
		error = -ENOTDIR;
4672
		if (old_last.name[old_last.len])
M
Miklos Szeredi 已提交
4673
			goto exit5;
4674
		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
M
Miklos Szeredi 已提交
4675
			goto exit5;
L
Linus Torvalds 已提交
4676 4677 4678 4679
	}
	/* source should not be ancestor of target */
	error = -EINVAL;
	if (old_dentry == trap)
M
Miklos Szeredi 已提交
4680
		goto exit5;
L
Linus Torvalds 已提交
4681
	/* target should not be an ancestor of source */
M
Miklos Szeredi 已提交
4682 4683
	if (!(flags & RENAME_EXCHANGE))
		error = -ENOTEMPTY;
L
Linus Torvalds 已提交
4684 4685 4686
	if (new_dentry == trap)
		goto exit5;

4687 4688
	error = security_path_rename(&old_path, old_dentry,
				     &new_path, new_dentry, flags);
4689
	if (error)
4690
		goto exit5;
4691 4692 4693

	rd.old_dir	   = old_path.dentry->d_inode;
	rd.old_dentry	   = old_dentry;
4694
	rd.old_mnt_userns  = mnt_user_ns(old_path.mnt);
4695 4696
	rd.new_dir	   = new_path.dentry->d_inode;
	rd.new_dentry	   = new_dentry;
4697
	rd.new_mnt_userns  = mnt_user_ns(new_path.mnt);
4698 4699 4700
	rd.delegated_inode = &delegated_inode;
	rd.flags	   = flags;
	error = vfs_rename(&rd);
L
Linus Torvalds 已提交
4701 4702 4703 4704 4705
exit5:
	dput(new_dentry);
exit4:
	dput(old_dentry);
exit3:
4706
	unlock_rename(new_path.dentry, old_path.dentry);
4707 4708 4709 4710 4711
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
4712
	mnt_drop_write(old_path.mnt);
L
Linus Torvalds 已提交
4713
exit2:
4714 4715
	if (retry_estale(error, lookup_flags))
		should_retry = true;
4716
	path_put(&new_path);
L
Linus Torvalds 已提交
4717
exit1:
4718
	path_put(&old_path);
4719 4720 4721 4722 4723
	if (should_retry) {
		should_retry = false;
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
4724 4725 4726 4727 4728 4729
put_both:
	if (!IS_ERR(from))
		putname(from);
put_new:
	if (!IS_ERR(to))
		putname(to);
L
Linus Torvalds 已提交
4730 4731 4732
	return error;
}

4733 4734 4735
/*
 * renameat2() system call: hand both user-supplied names and @flags to
 * do_renameat2().  The getname() results are passed on unchecked;
 * do_renameat2() tests them with IS_ERR() before releasing them.
 */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, unsigned int, flags)
{
	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
				flags);
}

M
Miklos Szeredi 已提交
4740 4741 4742
/*
 * renameat() system call: same as renameat2() with no flags set.
 */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
				0);
}

4747
/*
 * rename() system call: both names are resolved against AT_FDCWD and no
 * flags are passed.
 */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
	return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
				getname(newname), 0);
}

A
Al Viro 已提交
4753
/*
 * readlink_copy - copy a symlink body to a user buffer
 * @buffer: destination in user memory
 * @buflen: capacity of @buffer
 * @link:   NUL-terminated link body, or an ERR_PTR which is passed through
 *
 * At most @buflen bytes are copied and no terminating NUL is written.
 *
 * Returns the number of bytes copied, the error encoded in @link, or
 * -EFAULT if the copy to user space fails.
 */
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
	int len;

	if (IS_ERR(link))
		return PTR_ERR(link);

	len = strlen(link);
	/* truncate silently when the body does not fit */
	if (len > (unsigned) buflen)
		len = buflen;
	if (copy_to_user(buffer, link, len))
		return -EFAULT;
	return len;
}

4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780
/**
 * vfs_readlink - copy symlink body into userspace buffer
 * @dentry: dentry on which to get symbolic link
 * @buffer: user memory pointer
 * @buflen: size of buffer
 *
 * Does not touch atime.  That's up to the caller if necessary
 *
 * Does not call security hook.
 *
 * Return: the result of the inode's own ->readlink() when it provides one;
 * otherwise the value of readlink_copy() on the link body, -EINVAL if
 * @dentry is not a symlink, or the error returned by ->get_link().
 */
int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	struct inode *inode = d_inode(dentry);
	DEFINE_DELAYED_CALL(done);
	const char *link;
	int res;

	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
		/* a filesystem-specific ->readlink() takes precedence */
		if (unlikely(inode->i_op->readlink))
			return inode->i_op->readlink(dentry, buffer, buflen);

		if (!d_is_symlink(dentry))
			return -EINVAL;

		/*
		 * Record on the inode that the default path applies so the
		 * checks above are skipped on later calls.
		 */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_DEFAULT_READLINK;
		spin_unlock(&inode->i_lock);
	}

	/* use the body stored in ->i_link if set, else ask ->get_link() */
	link = READ_ONCE(inode->i_link);
	if (!link) {
		link = inode->i_op->get_link(dentry, inode, &done);
		if (IS_ERR(link))
			return PTR_ERR(link);
	}
	res = readlink_copy(buffer, buflen, link);
	do_delayed_call(&done);
	return res;
}
EXPORT_SYMBOL(vfs_readlink);
L
Linus Torvalds 已提交
4808

M
Miklos Szeredi 已提交
4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833
/**
 * vfs_get_link - get symlink body
 * @dentry: dentry on which to get symbolic link
 * @done: caller needs to free returned data with this
 *
 * Runs the security_inode_readlink() hook and, if it allows the access,
 * i_op->get_link() on the inode behind @dentry.
 *
 * It does not touch atime.  That's up to the caller if necessary.
 *
 * Does not work on "special" symlinks like /proc/$$/fd/N
 *
 * Return: the link body from ->get_link(), ERR_PTR(-EINVAL) if @dentry is
 * not a symlink, or the security hook's error as an ERR_PTR.
 */
const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
{
	struct inode *inode = d_inode(dentry);
	int err;

	if (!d_is_symlink(dentry))
		return ERR_PTR(-EINVAL);

	err = security_inode_readlink(dentry);
	if (err)
		return ERR_PTR(err);

	return inode->i_op->get_link(dentry, inode, done);
}
EXPORT_SYMBOL(vfs_get_link);

L
Linus Torvalds 已提交
4834
/* get the link contents into pagecache */
4835
const char *page_get_link(struct dentry *dentry, struct inode *inode,
4836
			  struct delayed_call *callback)
L
Linus Torvalds 已提交
4837
{
4838 4839
	char *kaddr;
	struct page *page;
4840 4841
	struct address_space *mapping = inode->i_mapping;

4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854
	if (!dentry) {
		page = find_get_page(mapping, 0);
		if (!page)
			return ERR_PTR(-ECHILD);
		if (!PageUptodate(page)) {
			put_page(page);
			return ERR_PTR(-ECHILD);
		}
	} else {
		page = read_mapping_page(mapping, 0, NULL);
		if (IS_ERR(page))
			return (char*)page;
	}
4855
	set_delayed_call(callback, page_put_link, page);
4856 4857
	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
	kaddr = page_address(page);
4858
	nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
4859
	return kaddr;
L
Linus Torvalds 已提交
4860 4861
}

4862
EXPORT_SYMBOL(page_get_link);
L
Linus Torvalds 已提交
4863

4864
/*
 * page_put_link - delayed-call callback installed by page_get_link()
 * @arg: the page whose reference is to be dropped
 */
void page_put_link(void *arg)
{
	put_page(arg);
}
EXPORT_SYMBOL(page_put_link);
L
Linus Torvalds 已提交
4869

4870 4871
/*
 * page_readlink - copy a pagecache-backed symlink body to user space
 * @dentry: dentry of the symlink
 * @buffer: destination in user memory
 * @buflen: capacity of @buffer
 *
 * Fetches the body with page_get_link() and hands it to readlink_copy(),
 * which also turns an ERR_PTR from page_get_link() into the return value.
 * The delayed call is always run afterwards.
 *
 * Returns the value of readlink_copy().
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	DEFINE_DELAYED_CALL(done);
	const char *link = page_get_link(dentry, d_inode(dentry), &done);
	int res = readlink_copy(buffer, buflen, link);

	do_delayed_call(&done);
	return res;
}
EXPORT_SYMBOL(page_readlink);

4881 4882 4883 4884
/*
 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
 */
int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
L
Linus Torvalds 已提交
4885 4886
{
	struct address_space *mapping = inode->i_mapping;
4887
	struct page *page;
4888
	void *fsdata;
4889
	int err;
4890
	unsigned int flags = 0;
4891 4892
	if (nofs)
		flags |= AOP_FLAG_NOFS;
L
Linus Torvalds 已提交
4893

4894
retry:
4895
	err = pagecache_write_begin(NULL, mapping, 0, len-1,
4896
				flags, &page, &fsdata);
L
Linus Torvalds 已提交
4897
	if (err)
4898 4899
		goto fail;

4900
	memcpy(page_address(page), symname, len-1);
4901 4902 4903

	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
							page, fsdata);
L
Linus Torvalds 已提交
4904 4905
	if (err < 0)
		goto fail;
4906 4907 4908
	if (err < len-1)
		goto retry;

L
Linus Torvalds 已提交
4909 4910 4911 4912 4913
	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}
4914
EXPORT_SYMBOL(__page_symlink);
L
Linus Torvalds 已提交
4915

4916 4917 4918
int page_symlink(struct inode *inode, const char *symname, int len)
{
	return __page_symlink(inode, symname, len,
4919
			!mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
4920
}
4921
EXPORT_SYMBOL(page_symlink);
4922

4923
const struct inode_operations page_symlink_inode_operations = {
4924
	.get_link	= page_get_link,
L
Linus Torvalds 已提交
4925 4926
};
EXPORT_SYMBOL(page_symlink_inode_operations);