namei.c 120.1 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
L
Linus Torvalds 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
19
#include <linux/export.h>
20
#include <linux/kernel.h>
L
Linus Torvalds 已提交
21 22 23 24
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
R
Robert Love 已提交
25
#include <linux/fsnotify.h>
L
Linus Torvalds 已提交
26 27
#include <linux/personality.h>
#include <linux/security.h>
M
Mimi Zohar 已提交
28
#include <linux/ima.h>
L
Linus Torvalds 已提交
29 30 31
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
32
#include <linux/capability.h>
33
#include <linux/file.h>
34
#include <linux/fcntl.h>
35
#include <linux/device_cgroup.h>
36
#include <linux/fs_struct.h>
37
#include <linux/posix_acl.h>
38
#include <linux/hash.h>
39
#include <linux/bitops.h>
40
#include <linux/init_task.h>
41
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
42

43
#include "internal.h"
44
#include "mount.h"
45

L
Linus Torvalds 已提交
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
L
Linus Torvalds 已提交
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOs, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
124

A
Al Viro 已提交
125
/*
 * Bytes left for the pathname when it is stored in ->iname, i.e. behind the
 * struct filename header: PATH_MAX minus the offset of iname.
 */
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
126

127
struct filename *
128 129
getname_flags(const char __user *filename, int flags, int *empty)
{
A
Al Viro 已提交
130
	struct filename *result;
131
	char *kname;
A
Al Viro 已提交
132
	int len;
133

134 135 136 137
	result = audit_reusename(filename);
	if (result)
		return result;

138
	result = __getname();
139
	if (unlikely(!result))
140 141
		return ERR_PTR(-ENOMEM);

142 143 144 145
	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
A
Al Viro 已提交
146
	kname = (char *)result->iname;
147
	result->name = kname;
148

A
Al Viro 已提交
149
	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
150
	if (unlikely(len < 0)) {
A
Al Viro 已提交
151 152
		__putname(result);
		return ERR_PTR(len);
153
	}
154

155 156 157 158 159 160
	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
A
Al Viro 已提交
161
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
A
Al Viro 已提交
162
		const size_t size = offsetof(struct filename, iname[1]);
163 164
		kname = (char *)result;

A
Al Viro 已提交
165 166 167 168 169 170
		/*
		 * size is chosen that way we to guarantee that
		 * result->iname[0] is within the same object and that
		 * kname can't be equal to result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
A
Al Viro 已提交
171 172 173
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
174 175
		}
		result->name = kname;
A
Al Viro 已提交
176 177 178 179 180 181 182 183 184 185 186
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
187 188
	}

A
Al Viro 已提交
189
	result->refcnt = 1;
190 191 192
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
193
			*empty = 1;
A
Al Viro 已提交
194 195 196 197
		if (!(flags & LOOKUP_EMPTY)) {
			putname(result);
			return ERR_PTR(-ENOENT);
		}
L
Linus Torvalds 已提交
198
	}
199

200
	result->uptr = filename;
201
	result->aname = NULL;
202 203
	audit_getname(result);
	return result;
L
Linus Torvalds 已提交
204 205
}

206 207
/*
 * getname - getname_flags() with no flags and no "empty" reporting.
 * An empty pathname therefore fails with -ENOENT.
 */
struct filename *
getname(const char __user *filename)
{
	return getname_flags(filename, 0, NULL);
}

212 213 214 215
struct filename *
getname_kernel(const char * filename)
{
	struct filename *result;
216
	int len = strlen(filename) + 1;
217 218 219 220 221

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

222
	if (len <= EMBEDDED_NAME_MAX) {
A
Al Viro 已提交
223
		result->name = (char *)result->iname;
224
	} else if (len <= PATH_MAX) {
225
		const size_t size = offsetof(struct filename, iname[1]);
226 227
		struct filename *tmp;

228
		tmp = kmalloc(size, GFP_KERNEL);
229 230 231 232 233 234 235 236 237 238 239
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
240 241
	result->uptr = NULL;
	result->aname = NULL;
242
	result->refcnt = 1;
243
	audit_getname(result);
244 245 246 247

	return result;
}

248
void putname(struct filename *name)
L
Linus Torvalds 已提交
249
{
250 251 252 253 254
	BUG_ON(name->refcnt <= 0);

	if (--name->refcnt > 0)
		return;

A
Al Viro 已提交
255
	if (name->name != name->iname) {
256 257 258 259
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
L
Linus Torvalds 已提交
260 261
}

262 263
static int check_acl(struct inode *inode, int mask)
{
264
#ifdef CONFIG_FS_POSIX_ACL
265 266 267
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
268 269
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
270
	                return -EAGAIN;
271
		/* no ->get_acl() calls in RCU mode... */
272
		if (is_uncached_acl(acl))
273
			return -ECHILD;
274
	        return posix_acl_permission(inode, acl, mask);
275 276
	}

C
Christoph Hellwig 已提交
277 278 279
	acl = get_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
280 281 282 283 284
	if (acl) {
	        int error = posix_acl_permission(inode, acl, mask);
	        posix_acl_release(acl);
	        return error;
	}
285
#endif
286 287 288 289

	return -EAGAIN;
}

290
/*
291 292 293 294
 * This does the basic UNIX permission checking.
 *
 * Note that the POSIX ACL check cares about the MAY_NOT_BLOCK bit,
 * for RCU walking.
L
Linus Torvalds 已提交
295
 */
296
static int acl_permission_check(struct inode *inode, int mask)
L
Linus Torvalds 已提交
297
{
298
	unsigned int mode = inode->i_mode;
L
Linus Torvalds 已提交
299

300 301 302
	/* Are we the owner? If so, ACL's don't matter */
	if (likely(uid_eq(current_fsuid(), inode->i_uid))) {
		mask &= 7;
L
Linus Torvalds 已提交
303
		mode >>= 6;
304 305
		return (mask & ~mode) ? -EACCES : 0;
	}
L
Linus Torvalds 已提交
306

307 308 309 310 311
	/* Do we have ACL's? */
	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
		int error = check_acl(inode, mask);
		if (error != -EAGAIN)
			return error;
L
Linus Torvalds 已提交
312 313
	}

314 315 316
	/* Only RWX matters for group/other mode bits */
	mask &= 7;

L
Linus Torvalds 已提交
317
	/*
318 319 320
	 * Are the group permissions different from
	 * the other permissions in the bits we care
	 * about? Need to check group ownership if so.
L
Linus Torvalds 已提交
321
	 */
322 323 324 325 326 327 328
	if (mask & (mode ^ (mode >> 3))) {
		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/* Bits in 'mode' clear that we require? */
	return (mask & ~mode) ? -EACCES : 0;
329 330 331
}

/**
332
 * generic_permission -  check for access rights on a Posix-like filesystem
333
 * @inode:	inode to check access rights for
334 335
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 *		%MAY_NOT_BLOCK ...)
336 337 338 339
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
340 341 342 343 344
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
345
 */
346
int generic_permission(struct inode *inode, int mask)
347 348 349 350
{
	int ret;

	/*
351
	 * Do the basic permission checks.
352
	 */
353
	ret = acl_permission_check(inode, mask);
354 355
	if (ret != -EACCES)
		return ret;
L
Linus Torvalds 已提交
356

357 358 359
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (!(mask & MAY_WRITE))
360 361
			if (capable_wrt_inode_uidgid(inode,
						     CAP_DAC_READ_SEARCH))
362
				return 0;
363
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
L
Linus Torvalds 已提交
364
			return 0;
365 366
		return -EACCES;
	}
L
Linus Torvalds 已提交
367 368 369 370

	/*
	 * Searching includes executable on directories, else just read.
	 */
371
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
372
	if (mask == MAY_READ)
373
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
L
Linus Torvalds 已提交
374
			return 0;
375 376 377 378 379 380 381 382
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
			return 0;
L
Linus Torvalds 已提交
383 384 385

	return -EACCES;
}
386
EXPORT_SYMBOL(generic_permission);
L
Linus Torvalds 已提交
387

388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407
/*
 * Fast path for the common case of an inode without its own ->permission
 * method: IOP_FASTPERM in inode->i_opflags caches "no special permission
 * function, go straight to generic_permission()", so inode->i_op does not
 * have to be looked at again.
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	if (likely(inode->i_opflags & IOP_FASTPERM))
		return generic_permission(inode, mask);

	if (likely(inode->i_op->permission))
		return inode->i_op->permission(inode, mask);

	/* No method: remember that.  This gets set once for the inode lifetime */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_FASTPERM;
	spin_unlock(&inode->i_lock);

	return generic_permission(inode, mask);
}

D
David Howells 已提交
408 409 410
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
411
 * @inode: Inode to check permission on
D
David Howells 已提交
412 413 414 415 416 417 418 419 420 421
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	if (unlikely(mask & MAY_WRITE)) {
		umode_t mode = inode->i_mode;

		/* Nobody gets write access to a read-only fs. */
422
		if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
D
David Howells 已提交
423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445
			return -EROFS;
	}
	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct inode *inode, int mask)
{
	int retval;

	retval = sb_permission(inode->i_sb, inode, mask);
	if (retval)
		return retval;
446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471

	if (unlikely(mask & MAY_WRITE)) {
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EPERM;

		/*
		 * Updating mtime will likely cause i_uid and i_gid to be
		 * written back improperly if their true value is unknown
		 * to the vfs.
		 */
		if (HAS_UNMAPPED_ID(inode))
			return -EACCES;
	}

	retval = do_inode_permission(inode, mask);
	if (retval)
		return retval;

	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

	return security_inode_permission(inode, mask);
D
David Howells 已提交
472
}
473
EXPORT_SYMBOL(inode_permission);
D
David Howells 已提交
474

J
Jan Blunck 已提交
475 476 477 478 479 480
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
481
void path_get(const struct path *path)
J
Jan Blunck 已提交
482 483 484 485 486 487
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

J
Jan Blunck 已提交
488 489 490 491 492 493
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
494
void path_put(const struct path *path)
L
Linus Torvalds 已提交
495
{
J
Jan Blunck 已提交
496 497
	dput(path->dentry);
	mntput(path->mnt);
L
Linus Torvalds 已提交
498
}
J
Jan Blunck 已提交
499
EXPORT_SYMBOL(path_put);
L
Linus Torvalds 已提交
500

501
/* Number of symlink-stack slots embedded in struct nameidata (internal[]). */
#define EMBEDDED_LEVELS 2
/*
 * State of one pathname walk.  The instance in use is reachable through
 * current->nameidata; nested walks are chained through ->saved.
 */
struct nameidata {
	struct path	path;		/* current position of the walk */
	struct qstr	last;
	struct path	root;		/* root used by the walk; filled by set_root() when unset */
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags;		/* LOOKUP_* bits */
	/*
	 * seq is the d_seq sample that path.dentry is validated against and
	 * m_seq is the value handed to the mount legitimizing helpers.
	 */
	unsigned	seq, m_seq, r_seq;
	int		last_type;
	unsigned	depth;		/* number of stack[] entries in use */
	int		total_link_count; /* carried over from / back to the enclosing walk */
	struct saved {
		struct path link;	/* legitimized against ->seq when leaving rcu-walk */
		struct delayed_call done; /* run when the entry is dropped */
		const char *name;
		unsigned seq;
	/* stack points at internal[] until nd_alloc_stack() swaps in a MAXSYMLINKS array */
	} *stack, internal[EMBEDDED_LEVELS];
	struct filename	*name;		/* as passed to set_nameidata() */
	struct nameidata *saved;	/* previous current->nameidata */
	unsigned	root_seq;	/* d_seq sample taken for root.dentry */
	int		dfd;		/* as passed to set_nameidata() */
	kuid_t		dir_uid;	/* parent directory owner, per may_follow_link() */
	umode_t		dir_mode;	/* parent directory mode, per may_follow_link() */
} __randomize_layout;
525

526
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
527
{
528 529
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;
530 531
	p->dfd = dfd;
	p->name = name;
532
	p->total_link_count = old ? old->total_link_count : 0;
533
	p->saved = old;
534
	current->nameidata = p;
535 536
}

537
static void restore_nameidata(void)
538
{
539
	struct nameidata *now = current->nameidata, *old = now->saved;
540 541 542 543

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
544
	if (now->stack != now->internal)
545
		kfree(now->stack);
546 547
}

548
static bool nd_alloc_stack(struct nameidata *nd)
549
{
A
Al Viro 已提交
550 551
	struct saved *p;

552 553 554 555
	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
			 nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
	if (unlikely(!p))
		return false;
556 557
	memcpy(p, nd->internal, sizeof(nd->internal));
	nd->stack = p;
558
	return true;
559 560
}

561
/**
562
 * path_connected - Verify that a dentry is below mnt.mnt_root
563 564 565 566
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 */
567
static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
568
{
569
	struct super_block *sb = mnt->mnt_sb;
570

571 572
	/* Bind mounts can have disconnected paths */
	if (mnt->mnt_root == sb->s_root)
573 574
		return true;

575
	return is_subdir(dentry, mnt->mnt_root);
576 577
}

578 579 580 581 582
/*
 * Run and clear the delayed call of every symlink-stack entry in use,
 * from the top of the stack down to entry 0.  nd->depth is left alone.
 */
static void drop_links(struct nameidata *nd)
{
	int remaining = nd->depth;

	while (remaining) {
		struct saved *entry = &nd->stack[--remaining];

		do_delayed_call(&entry->done);
		clear_delayed_call(&entry->done);
	}
}

static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);
	if (!(nd->flags & LOOKUP_RCU)) {
		int i;
		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
596
		if (nd->flags & LOOKUP_ROOT_GRABBED) {
597
			path_put(&nd->root);
598
			nd->flags &= ~LOOKUP_ROOT_GRABBED;
599
		}
600 601 602 603 604 605 606 607
	} else {
		nd->flags &= ~LOOKUP_RCU;
		rcu_read_unlock();
	}
	nd->depth = 0;
}

/* path_put is needed afterwards regardless of success or failure */
608
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
609
{
610
	int res = __legitimize_mnt(path->mnt, mseq);
611 612 613 614 615 616 617 618 619 620 621 622 623
	if (unlikely(res)) {
		if (res > 0)
			path->mnt = NULL;
		path->dentry = NULL;
		return false;
	}
	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		path->dentry = NULL;
		return false;
	}
	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

624 625 626
static inline bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
A
Al Viro 已提交
627
	return __legitimize_path(path, seq, nd->m_seq);
628 629
}

630 631 632
static bool legitimize_links(struct nameidata *nd)
{
	int i;
633 634 635 636 637
	if (unlikely(nd->flags & LOOKUP_CACHED)) {
		drop_links(nd);
		nd->depth = 0;
		return false;
	}
638 639 640 641 642 643 644 645 646 647 648
	for (i = 0; i < nd->depth; i++) {
		struct saved *last = nd->stack + i;
		if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
			drop_links(nd);
			nd->depth = i + 1;
			return false;
		}
	}
	return true;
}

649 650
static bool legitimize_root(struct nameidata *nd)
{
651 652 653 654 655 656 657 658
	/*
	 * For scoped-lookups (where nd->root has been zeroed), we need to
	 * restart the whole lookup from scratch -- because set_root() is wrong
	 * for these lookups (nd->dfd is the root, not the filesystem root).
	 */
	if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED))
		return false;
	/* Nothing to do if nd->root is zero or is managed by the VFS user. */
659 660
	if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT))
		return true;
661
	nd->flags |= LOOKUP_ROOT_GRABBED;
662 663 664
	return legitimize_path(nd, &nd->root, nd->root_seq);
}

A
Al Viro 已提交
665
/*
N
Nick Piggin 已提交
666
 * Path walking has 2 modes, rcu-walk and ref-walk (see
A
Al Viro 已提交
667 668
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
M
Mike Marshall 已提交
669
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
A
Al Viro 已提交
670 671 672 673
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
N
Nick Piggin 已提交
674 675 676
 */

/**
677
 * try_to_unlazy - try to switch to ref-walk mode.
A
Al Viro 已提交
678
 * @nd: nameidata pathwalk data
679
 * Returns: true on success, false on failure
N
Nick Piggin 已提交
680
 *
681
 * try_to_unlazy attempts to legitimize the current nd->path and nd->root
A
Al Viro 已提交
682 683
 * for ref-walk mode.
 * Must be called from rcu-walk context.
684
 * Nothing should touch nameidata between try_to_unlazy() failure and
685
 * terminate_walk().
N
Nick Piggin 已提交
686
 */
687
static bool try_to_unlazy(struct nameidata *nd)
N
Nick Piggin 已提交
688 689 690 691
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
692

A
Al Viro 已提交
693 694 695
	nd->flags &= ~LOOKUP_RCU;
	if (unlikely(!legitimize_links(nd)))
		goto out1;
696 697
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out;
698 699
	if (unlikely(!legitimize_root(nd)))
		goto out;
A
Al Viro 已提交
700 701
	rcu_read_unlock();
	BUG_ON(nd->inode != parent->d_inode);
702
	return true;
A
Al Viro 已提交
703

704
out1:
A
Al Viro 已提交
705 706 707 708
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out:
	rcu_read_unlock();
709
	return false;
A
Al Viro 已提交
710 711 712
}

/**
713
 * try_to_unlazy_next - try to switch to ref-walk mode.
A
Al Viro 已提交
714
 * @nd: nameidata pathwalk data
715 716 717
 * @dentry: next dentry to step into
 * @seq: seq number to check @dentry against
 * Returns: true on success, false on failure
A
Al Viro 已提交
718
 *
719 720 721 722
 * Similar to to try_to_unlazy(), but here we have the next dentry already
 * picked by rcu-walk and want to legitimize that in addition to the current
 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
A
Al Viro 已提交
723 724
 * terminate_walk().
 */
725
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
A
Al Viro 已提交
726 727 728
{
	BUG_ON(!(nd->flags & LOOKUP_RCU));

729
	nd->flags &= ~LOOKUP_RCU;
730 731 732 733
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
		goto out2;
A
Al Viro 已提交
734
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
735
		goto out1;
A
Al Viro 已提交
736

737
	/*
A
Al Viro 已提交
738 739 740 741 742
	 * We need to move both the parent and the dentry from the RCU domain
	 * to be properly refcounted. And the sequence number in the dentry
	 * validates *both* dentry counters, since we checked the sequence
	 * number of the parent after we got the child sequence number. So we
	 * know the parent must still be valid if the child sequence number is
743
	 */
A
Al Viro 已提交
744 745
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
746 747
	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
		goto out_dput;
748 749 750 751
	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
752 753
	if (unlikely(!legitimize_root(nd)))
		goto out_dput;
A
Al Viro 已提交
754
	rcu_read_unlock();
755
	return true;
A
Al Viro 已提交
756

757 758 759 760
out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
761
out:
A
Al Viro 已提交
762
	rcu_read_unlock();
763
	return false;
764 765 766
out_dput:
	rcu_read_unlock();
	dput(dentry);
767
	return false;
N
Nick Piggin 已提交
768 769
}

770
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
771
{
772 773 774 775
	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
		return dentry->d_op->d_revalidate(dentry, flags);
	else
		return 1;
776 777
}

778 779 780
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
781
 *
782 783 784 785 786
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
787
 */
788
static int complete_walk(struct nameidata *nd)
789
{
A
Al Viro 已提交
790
	struct dentry *dentry = nd->path.dentry;
791 792
	int status;

793
	if (nd->flags & LOOKUP_RCU) {
794 795 796 797 798
		/*
		 * We don't want to zero nd->root for scoped-lookups or
		 * externally-managed nd->root.
		 */
		if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
799
			nd->root.mnt = NULL;
J
Jens Axboe 已提交
800
		nd->flags &= ~LOOKUP_CACHED;
801
		if (!try_to_unlazy(nd))
802 803 804
			return -ECHILD;
	}

805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825
	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
		/*
		 * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
		 * ever step outside the root during lookup" and should already
		 * be guaranteed by the rest of namei, we want to avoid a namei
		 * BUG resulting in userspace being given a path that was not
		 * scoped within the root at some point during the lookup.
		 *
		 * So, do a final sanity-check to make sure that in the
		 * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
		 * we won't silently return an fd completely outside of the
		 * requested root to userspace.
		 *
		 * Userspace could move the path outside the root after this
		 * check, but as discussed elsewhere this is not a concern (the
		 * resolved file was inside the root at some point).
		 */
		if (!path_is_under(&nd->path, &nd->root))
			return -EXDEV;
	}

A
Al Viro 已提交
826 827 828
	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

829
	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
830 831
		return 0;

832
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
833 834 835
	if (status > 0)
		return 0;

A
Al Viro 已提交
836
	if (!status)
837
		status = -ESTALE;
A
Al Viro 已提交
838

839 840 841
	return status;
}

842
static int set_root(struct nameidata *nd)
N
Nick Piggin 已提交
843
{
844
	struct fs_struct *fs = current->fs;
N
Nick Piggin 已提交
845

846 847 848 849 850 851 852 853
	/*
	 * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
	 * still have to ensure it doesn't happen because it will cause a breakout
	 * from the dirfd.
	 */
	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
		return -ENOTRECOVERABLE;

854 855 856 857 858 859 860 861 862 863
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
864
		nd->flags |= LOOKUP_ROOT_GRABBED;
865
	}
866
	return 0;
N
Nick Piggin 已提交
867 868
}

869 870
static int nd_jump_root(struct nameidata *nd)
{
871 872
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return -EXDEV;
873 874 875 876 877
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		/* Absolute path arguments to path_init() are allowed. */
		if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
			return -EXDEV;
	}
878 879 880 881 882
	if (!nd->root.mnt) {
		int error = set_root(nd);
		if (error)
			return error;
	}
883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900
	if (nd->flags & LOOKUP_RCU) {
		struct dentry *d;
		nd->path = nd->root;
		d = nd->path.dentry;
		nd->inode = d->d_inode;
		nd->seq = nd->root_seq;
		if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
			return -ECHILD;
	} else {
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	}
	nd->flags |= LOOKUP_JUMPED;
	return 0;
}

C
Christoph Hellwig 已提交
901
/*
902
 * Helper to directly jump to a known parsed path from ->get_link,
C
Christoph Hellwig 已提交
903 904
 * caller must have taken a reference to path beforehand.
 */
905
int nd_jump_link(struct path *path)
C
Christoph Hellwig 已提交
906
{
907
	int error = -ELOOP;
908
	struct nameidata *nd = current->nameidata;
C
Christoph Hellwig 已提交
909

910 911 912
	if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
		goto err;

913 914 915 916 917
	error = -EXDEV;
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		if (nd->path.mnt != path->mnt)
			goto err;
	}
918 919 920
	/* Not currently safe for scoped-lookups. */
	if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
		goto err;
921

922
	path_put(&nd->path);
C
Christoph Hellwig 已提交
923 924 925
	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
	nd->flags |= LOOKUP_JUMPED;
926
	return 0;
927 928 929 930

err:
	path_put(path);
	return error;
C
Christoph Hellwig 已提交
931 932
}

933
static inline void put_link(struct nameidata *nd)
934
{
A
Al Viro 已提交
935
	struct saved *last = nd->stack + --nd->depth;
936
	do_delayed_call(&last->done);
A
Al Viro 已提交
937 938
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
939 940
}

941 942
/*
 * Link/file protection switches; all four start out as 0.
 * sysctl_protected_symlinks gates the checks in may_follow_link().
 * NOTE(review): the other three are presumably consulted by the matching
 * hardlink/FIFO/regular-file checks -- confirm at their users.
 */
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
int sysctl_protected_fifos __read_mostly;
int sysctl_protected_regular __read_mostly;
K
Kees Cook 已提交
945 946 947

/**
 * may_follow_link - Check symlink following for unsafe situations
948
 * @nd: nameidata pathwalk data
K
Kees Cook 已提交
949 950 951 952 953 954 955 956 957 958 959 960
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
961
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
K
Kees Cook 已提交
962 963 964 965 966
{
	if (!sysctl_protected_symlinks)
		return 0;

	/* Allowed if owner and follower match. */
967
	if (uid_eq(current_cred()->fsuid, inode->i_uid))
K
Kees Cook 已提交
968 969 970
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
971
	if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
K
Kees Cook 已提交
972 973 974
		return 0;

	/* Allowed if parent directory and link owner match. */
975
	if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid))
K
Kees Cook 已提交
976 977
		return 0;

978 979 980
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

981
	audit_inode(nd->name, nd->stack[0].link.dentry, 0);
K
Kees Cook 已提交
982
	audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
K
Kees Cook 已提交
983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028
	return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @inode: the source inode to hardlink from
 *
 * A source is unsafe, and false is returned, if any of these is true:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - inode_permission() refuses combined read and write access
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct inode *inode)
{
	const umode_t sgid_exec = S_ISGID | S_IXGRP;
	umode_t mode = inode->i_mode;

	/*
	 * Special files, setuid files and executable setgid files should
	 * not get pinned to the filesystem.
	 */
	if (!S_ISREG(mode) || (mode & S_ISUID) ||
	    (mode & sgid_exec) == sgid_exec)
		return false;

	/* Hardlinking to unreadable or unwritable sources is dangerous. */
	return !inode_permission(inode, MAY_READ | MAY_WRITE);
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
1029
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
K
Kees Cook 已提交
1030 1031 1032
 *
 * Returns 0 if successful, -ve on error.
 */
1033
int may_linkat(struct path *link)
K
Kees Cook 已提交
1034
{
1035 1036 1037 1038 1039
	struct inode *inode = link->dentry->d_inode;

	/* Inode writeback is not safe when the uid or gid are invalid. */
	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
		return -EOVERFLOW;
K
Kees Cook 已提交
1040 1041 1042 1043 1044 1045 1046

	if (!sysctl_protected_hardlinks)
		return 0;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
1047
	if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
K
Kees Cook 已提交
1048 1049
		return 0;

K
Kees Cook 已提交
1050
	audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
K
Kees Cook 已提交
1051 1052 1053
	return -EPERM;
}

1054 1055 1056 1057
/**
 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
 *			  should be allowed, or not, on files that already
 *			  exist.
1058 1059
 * @dir_mode: mode bits of directory
 * @dir_uid: owner of directory
1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074
 * @inode: the inode of the file to open
 *
 * Block an O_CREAT open of a FIFO (or a regular file) when:
 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
 *   - the file already exists
 *   - we are in a sticky directory
 *   - we don't own the file
 *   - the owner of the directory doesn't own the file
 *   - the directory is world writable
 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
 * the directory doesn't have to be world writable: being group writable will
 * be enough.
 *
 * Returns 0 if the open is allowed, -ve on error.
 */
1075
static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
1076 1077 1078 1079
				struct inode * const inode)
{
	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
	    (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
1080 1081
	    likely(!(dir_mode & S_ISVTX)) ||
	    uid_eq(inode->i_uid, dir_uid) ||
1082 1083 1084
	    uid_eq(current_fsuid(), inode->i_uid))
		return 0;

1085 1086
	if (likely(dir_mode & 0002) ||
	    (dir_mode & 0020 &&
1087 1088
	     ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
	      (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
K
Kees Cook 已提交
1089 1090 1091 1092
		const char *operation = S_ISFIFO(inode->i_mode) ?
					"sticky_create_fifo" :
					"sticky_create_regular";
		audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
1093 1094 1095 1096 1097
		return -EACCES;
	}
	return 0;
}

1098 1099 1100 1101 1102 1103 1104 1105 1106 1107
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
A
Al Viro 已提交
1108
int follow_up(struct path *path)
L
Linus Torvalds 已提交
1109
{
1110 1111
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
L
Linus Torvalds 已提交
1112
	struct dentry *mountpoint;
N
Nick Piggin 已提交
1113

A
Al Viro 已提交
1114
	read_seqlock_excl(&mount_lock);
1115
	parent = mnt->mnt_parent;
A
Al Viro 已提交
1116
	if (parent == mnt) {
A
Al Viro 已提交
1117
		read_sequnlock_excl(&mount_lock);
L
Linus Torvalds 已提交
1118 1119
		return 0;
	}
1120
	mntget(&parent->mnt);
1121
	mountpoint = dget(mnt->mnt_mountpoint);
A
Al Viro 已提交
1122
	read_sequnlock_excl(&mount_lock);
A
Al Viro 已提交
1123 1124 1125
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
1126
	path->mnt = &parent->mnt;
L
Linus Torvalds 已提交
1127 1128
	return 1;
}
1129
EXPORT_SYMBOL(follow_up);
L
Linus Torvalds 已提交
1130

A
Al Viro 已提交
1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150
/*
 * Walk up the chain of parent mounts starting at @m, looking for the
 * place ".." should continue from.
 *
 * Each step takes the mountpoint dentry of the current mount and moves
 * to its parent mount.  The walk stops with false if that location is
 * @root, or if a mount without a parent is reached.  A mountpoint that
 * is itself the root dentry of the parent mount is skipped and the walk
 * goes one level further up.  Otherwise @path is filled with the parent
 * mount and the mountpoint dentry, *@seqp receives a fresh sample of
 * that dentry's d_seq, and true is returned.
 *
 * No references are taken here; see choose_mountpoint() for the
 * reference-taking variant.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
				  struct path *path, unsigned *seqp)
{
	while (mnt_has_parent(m)) {
		struct dentry *mountpoint = m->mnt_mountpoint;

		m = m->mnt_parent;
		/* reached the lookup root - do not go above it */
		if (unlikely(root->dentry == mountpoint &&
			     root->mnt == &m->mnt))
			break;
		/* mounted on the parent's root: keep climbing */
		if (mountpoint != m->mnt.mnt_root) {
			path->mnt = &m->mnt;
			path->dentry = mountpoint;
			*seqp = read_seqcount_begin(&mountpoint->d_seq);
			return true;
		}
	}
	return false;
}

1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175
/*
 * Variant of choose_mountpoint_rcu() that hands back a legitimized
 * @path on success.
 *
 * The RCU walk is run under rcu_read_lock() against a sample of
 * mount_lock.  A negative answer is accepted only if mount_lock did not
 * change in the meantime; a positive answer is accepted only if
 * __legitimize_path() succeeds for the found path.  In every other case
 * the attempt is repeated; after a failed legitimization @path is
 * released with path_put() outside of the RCU read-side section first.
 *
 * Returns true if a mountpoint was found (and stored in @path), false
 * otherwise.
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
			      struct path *path)
{
	bool found;

	rcu_read_lock();
	while (1) {
		unsigned seq, mseq = read_seqbegin(&mount_lock);

		found = choose_mountpoint_rcu(m, root, path, &seq);
		if (unlikely(!found)) {
			/* "not found" is only trustworthy if nothing moved */
			if (!read_seqretry(&mount_lock, mseq))
				break;
		} else {
			if (likely(__legitimize_path(path, seq, mseq)))
				break;
			/* drop what we may have grabbed and try again */
			rcu_read_unlock();
			path_put(path);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
	return found;
}

N
Nick Piggin 已提交
1176
/*
1177 1178 1179
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
L
Linus Torvalds 已提交
1180
 */
1181
static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
N
Nick Piggin 已提交
1182
{
1183
	struct dentry *dentry = path->dentry;
1184

1185 1186 1187 1188 1189 1190 1191 1192 1193 1194
	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
1195
	 */
1196
	if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1197
			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1198
	    dentry->d_inode)
1199
		return -EISDIR;
1200

1201
	if (count && (*count)++ >= MAXSYMLINKS)
1202 1203
		return -ELOOP;

1204
	return finish_automount(dentry->d_op->d_automount(path), path);
A
Al Viro 已提交
1205 1206
}

1207
/*
A
Al Viro 已提交
1208 1209 1210 1211
 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
 * dentries are pinned but not locked here, so negative dentry can go
 * positive right under us.  Use of smp_load_acquire() provides a barrier
 * sufficient for ->d_inode and ->d_flags consistency.
1212
 */
A
Al Viro 已提交
1213 1214
static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
			     int *count, unsigned lookup_flags)
L
Linus Torvalds 已提交
1215
{
A
Al Viro 已提交
1216
	struct vfsmount *mnt = path->mnt;
1217
	bool need_mntput = false;
1218
	int ret = 0;
1219

A
Al Viro 已提交
1220
	while (flags & DCACHE_MANAGED_DENTRY) {
1221 1222
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
1223
		if (flags & DCACHE_MANAGE_TRANSIT) {
1224
			ret = path->dentry->d_op->d_manage(path, false);
1225
			flags = smp_load_acquire(&path->dentry->d_flags);
1226
			if (ret < 0)
1227
				break;
1228 1229
		}

A
Al Viro 已提交
1230
		if (flags & DCACHE_MOUNTED) {	// something's mounted on it..
1231
			struct vfsmount *mounted = lookup_mnt(path);
A
Al Viro 已提交
1232
			if (mounted) {		// ... in our namespace
1233 1234 1235 1236 1237
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
A
Al Viro 已提交
1238 1239
				// here we know it's positive
				flags = path->dentry->d_flags;
1240 1241 1242 1243 1244
				need_mntput = true;
				continue;
			}
		}

A
Al Viro 已提交
1245 1246
		if (!(flags & DCACHE_NEED_AUTOMOUNT))
			break;
1247

A
Al Viro 已提交
1248 1249 1250 1251 1252
		// uncovered automount point
		ret = follow_automount(path, count, lookup_flags);
		flags = smp_load_acquire(&path->dentry->d_flags);
		if (ret < 0)
			break;
L
Linus Torvalds 已提交
1253
	}
1254

A
Al Viro 已提交
1255 1256 1257 1258 1259 1260
	if (ret == -EISDIR)
		ret = 0;
	// possible if you race with several mount --move
	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	if (!ret && unlikely(d_flags_negative(flags)))
1261
		ret = -ENOENT;
A
Al Viro 已提交
1262
	*jumped = need_mntput;
1263
	return ret;
L
Linus Torvalds 已提交
1264 1265
}

A
Al Viro 已提交
1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280
/*
 * Inline front end of __traverse_mounts(): dentries that are not managed
 * are handled right here, everything else goes out of line.
 * Sets *@jumped; returns 0, -ENOENT for a negative dentry, or whatever
 * __traverse_mounts() returns.
 */
static inline int traverse_mounts(struct path *path, bool *jumped,
				  int *count, unsigned lookup_flags)
{
	unsigned d_flags = smp_load_acquire(&path->dentry->d_flags);

	if (unlikely(d_flags & DCACHE_MANAGED_DENTRY))
		return __traverse_mounts(path, d_flags, jumped, count,
					 lookup_flags);

	/* fastpath */
	*jumped = false;
	return unlikely(d_flags_negative(d_flags)) ? -ENOENT : 0;
}

1281
int follow_down_one(struct path *path)
L
Linus Torvalds 已提交
1282 1283 1284
{
	struct vfsmount *mounted;

A
Al Viro 已提交
1285
	mounted = lookup_mnt(path);
L
Linus Torvalds 已提交
1286
	if (mounted) {
A
Al Viro 已提交
1287 1288 1289 1290
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
L
Linus Torvalds 已提交
1291 1292 1293 1294
		return 1;
	}
	return 0;
}
1295
EXPORT_SYMBOL(follow_down_one);
L
Linus Torvalds 已提交
1296

A
Al Viro 已提交
1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313
/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 *
 * If @path ends up on a different mount, the reference on the mount it
 * started with is dropped.  Returns the result of traverse_mounts().
 */
int follow_down(struct path *path)
{
	struct vfsmount *orig_mnt = path->mnt;
	bool crossed;
	int err;

	err = traverse_mounts(path, &crossed, NULL, 0);
	if (orig_mnt != path->mnt)
		mntput(orig_mnt);
	return err;
}
EXPORT_SYMBOL(follow_down);

1314
/*
1315 1316
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
1317 1318
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1319
			       struct inode **inode, unsigned *seqp)
1320
{
A
Al Viro 已提交
1321 1322 1323 1324 1325 1326 1327 1328 1329
	struct dentry *dentry = path->dentry;
	unsigned int flags = dentry->d_flags;

	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
		return true;

	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
		return false;

1330 1331 1332 1333 1334
	for (;;) {
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
A
Al Viro 已提交
1335 1336 1337 1338 1339
		if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
			int res = dentry->d_op->d_manage(path, true);
			if (res)
				return res == -EISDIR;
			flags = dentry->d_flags;
1340
		}
1341

A
Al Viro 已提交
1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362
		if (flags & DCACHE_MOUNTED) {
			struct mount *mounted = __lookup_mnt(path->mnt, dentry);
			if (mounted) {
				path->mnt = &mounted->mnt;
				dentry = path->dentry = mounted->mnt.mnt_root;
				nd->flags |= LOOKUP_JUMPED;
				*seqp = read_seqcount_begin(&dentry->d_seq);
				*inode = dentry->d_inode;
				/*
				 * We don't need to re-check ->d_seq after this
				 * ->d_inode read - there will be an RCU delay
				 * between mount hash removal and ->mnt_root
				 * becoming unpinned.
				 */
				flags = dentry->d_flags;
				continue;
			}
			if (read_seqretry(&mount_lock, nd->m_seq))
				return false;
		}
		return !(flags & DCACHE_NEED_AUTOMOUNT);
1363
	}
1364 1365
}

1366 1367 1368
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
			  struct path *path, struct inode **inode,
			  unsigned int *seqp)
1369
{
A
Al Viro 已提交
1370
	bool jumped;
1371
	int ret;
1372

1373 1374
	path->mnt = nd->path.mnt;
	path->dentry = dentry;
1375 1376 1377 1378 1379
	if (nd->flags & LOOKUP_RCU) {
		unsigned int seq = *seqp;
		if (unlikely(!*inode))
			return -ENOENT;
		if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
A
Al Viro 已提交
1380
			return 0;
1381
		if (!try_to_unlazy_next(nd, dentry, seq))
1382 1383 1384 1385 1386
			return -ECHILD;
		// *path might've been clobbered by __follow_mount_rcu()
		path->mnt = nd->path.mnt;
		path->dentry = dentry;
	}
A
Al Viro 已提交
1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398
	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
	if (jumped) {
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			ret = -EXDEV;
		else
			nd->flags |= LOOKUP_JUMPED;
	}
	if (unlikely(ret)) {
		dput(path->dentry);
		if (path->mnt != nd->path.mnt)
			mntput(path->mnt);
	} else {
1399 1400 1401 1402 1403 1404
		*inode = d_backing_inode(path->dentry);
		*seqp = 0; /* out of RCU mode, so the value doesn't matter */
	}
	return ret;
}

1405
/*
1406 1407
 * This looks up the name in dcache and possibly revalidates the found dentry.
 * NULL is returned if the dentry does not exist in the cache.
1408
 */
1409 1410
static struct dentry *lookup_dcache(const struct qstr *name,
				    struct dentry *dir,
1411
				    unsigned int flags)
1412
{
1413
	struct dentry *dentry = d_lookup(dir, name);
M
Miklos Szeredi 已提交
1414
	if (dentry) {
1415 1416 1417 1418 1419 1420
		int error = d_revalidate(dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error)
				d_invalidate(dentry);
			dput(dentry);
			return ERR_PTR(error);
M
Miklos Szeredi 已提交
1421 1422
		}
	}
1423 1424 1425
	return dentry;
}

1426
/*
1427 1428 1429 1430 1431
 * Parent directory has inode locked exclusive.  This is one
 * and only case when ->lookup() gets called on non in-lookup
 * dentries - as the matter of fact, this only gets called
 * when directory is guaranteed to have no in-lookup children
 * at all.
1432
 */
1433
static struct dentry *__lookup_hash(const struct qstr *name,
1434
		struct dentry *base, unsigned int flags)
1435
{
1436
	struct dentry *dentry = lookup_dcache(name, base, flags);
1437 1438
	struct dentry *old;
	struct inode *dir = base->d_inode;
1439

1440
	if (dentry)
M
Miklos Szeredi 已提交
1441
		return dentry;
1442

1443 1444 1445 1446
	/* Don't create child dentry for a dead directory. */
	if (unlikely(IS_DEADDIR(dir)))
		return ERR_PTR(-ENOENT);

1447 1448 1449 1450
	dentry = d_alloc(base, name);
	if (unlikely(!dentry))
		return ERR_PTR(-ENOMEM);

1451 1452 1453 1454 1455 1456
	old = dir->i_op->lookup(dir, dentry, flags);
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
	return dentry;
1457 1458
}

1459 1460 1461
static struct dentry *lookup_fast(struct nameidata *nd,
				  struct inode **inode,
			          unsigned *seqp)
L
Linus Torvalds 已提交
1462
{
N
Nick Piggin 已提交
1463
	struct dentry *dentry, *parent = nd->path.dentry;
A
Al Viro 已提交
1464
	int status = 1;
1465

1466 1467
	/*
	 * Rename seqlock is not required here because in the off chance
A
Al Viro 已提交
1468 1469
	 * of a false negative due to a concurrent rename, the caller is
	 * going to fall back to non-racy lookup.
1470
	 */
N
Nick Piggin 已提交
1471 1472
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
1473
		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
A
Al Viro 已提交
1474
		if (unlikely(!dentry)) {
1475
			if (!try_to_unlazy(nd))
1476 1477
				return ERR_PTR(-ECHILD);
			return NULL;
A
Al Viro 已提交
1478
		}
A
Al Viro 已提交
1479

1480 1481 1482 1483
		/*
		 * This sequence count validates that the inode matches
		 * the dentry name information from lookup.
		 */
1484
		*inode = d_backing_inode(dentry);
A
Al Viro 已提交
1485
		if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
1486
			return ERR_PTR(-ECHILD);
1487 1488 1489 1490 1491 1492 1493 1494

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 *
		 * The memory barrier in read_seqcount_begin of child is
		 *  enough, we can use __read_seqcount_retry here.
		 */
A
Al Viro 已提交
1495
		if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
1496
			return ERR_PTR(-ECHILD);
A
Al Viro 已提交
1497

1498
		*seqp = seq;
1499
		status = d_revalidate(dentry, nd->flags);
1500
		if (likely(status > 0))
1501
			return dentry;
1502
		if (!try_to_unlazy_next(nd, dentry, seq))
1503
			return ERR_PTR(-ECHILD);
1504
		if (status == -ECHILD)
1505 1506
			/* we'd been told to redo it in non-rcu mode */
			status = d_revalidate(dentry, nd->flags);
A
Al Viro 已提交
1507
	} else {
A
Al Viro 已提交
1508
		dentry = __d_lookup(parent, &nd->last);
A
Al Viro 已提交
1509
		if (unlikely(!dentry))
1510
			return NULL;
1511
		status = d_revalidate(dentry, nd->flags);
1512
	}
A
Al Viro 已提交
1513
	if (unlikely(status <= 0)) {
1514
		if (!status)
A
Al Viro 已提交
1515
			d_invalidate(dentry);
1516
		dput(dentry);
1517
		return ERR_PTR(status);
1518
	}
1519
	return dentry;
M
Miklos Szeredi 已提交
1520 1521 1522
}

/* Fast lookup failed, do it the slow way */
A
Al Viro 已提交
1523 1524 1525
static struct dentry *__lookup_slow(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
M
Miklos Szeredi 已提交
1526
{
A
Al Viro 已提交
1527
	struct dentry *dentry, *old;
1528
	struct inode *inode = dir->d_inode;
1529
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1530 1531

	/* Don't go there if it's already dead */
A
Al Viro 已提交
1532
	if (unlikely(IS_DEADDIR(inode)))
A
Al Viro 已提交
1533
		return ERR_PTR(-ENOENT);
A
Al Viro 已提交
1534
again:
1535
	dentry = d_alloc_parallel(dir, name, &wq);
A
Al Viro 已提交
1536
	if (IS_ERR(dentry))
A
Al Viro 已提交
1537
		return dentry;
A
Al Viro 已提交
1538
	if (unlikely(!d_in_lookup(dentry))) {
1539 1540 1541 1542
		int error = d_revalidate(dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error) {
				d_invalidate(dentry);
1543
				dput(dentry);
1544
				goto again;
1545
			}
1546 1547
			dput(dentry);
			dentry = ERR_PTR(error);
1548
		}
A
Al Viro 已提交
1549 1550 1551 1552 1553 1554
	} else {
		old = inode->i_op->lookup(inode, dentry, flags);
		d_lookup_done(dentry);
		if (unlikely(old)) {
			dput(dentry);
			dentry = old;
1555 1556
		}
	}
1557
	return dentry;
L
Linus Torvalds 已提交
1558 1559
}

A
Al Viro 已提交
1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571
/*
 * __lookup_slow() wrapped in a shared lock on the directory's inode.
 * Returns whatever __lookup_slow() returns.
 */
static struct dentry *lookup_slow(const struct qstr *name,
				  struct dentry *dir,
				  unsigned int flags)
{
	struct inode *dir_inode = dir->d_inode;
	struct dentry *found;

	inode_lock_shared(dir_inode);
	found = __lookup_slow(name, dir, flags);
	inode_unlock_shared(dir_inode);

	return found;
}

1572 1573 1574
static inline int may_lookup(struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU) {
1575
		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1576
		if (err != -ECHILD || !try_to_unlazy(nd))
1577 1578
			return err;
	}
1579
	return inode_permission(nd->inode, MAY_EXEC);
1580 1581
}

1582 1583 1584 1585
static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
{
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
		return -ELOOP;
1586 1587 1588 1589 1590

	if (likely(nd->depth != EMBEDDED_LEVELS))
		return 0;
	if (likely(nd->stack != nd->internal))
		return 0;
1591
	if (likely(nd_alloc_stack(nd)))
1592
		return 0;
1593 1594 1595 1596

	if (nd->flags & LOOKUP_RCU) {
		// we need to grab link before we do unlazy.  And we can't skip
		// unlazy even if we fail to grab the link - cleanup needs it
1597
		bool grabbed_link = legitimize_path(nd, link, seq);
1598

1599
		if (!try_to_unlazy(nd) != 0 || !grabbed_link)
1600 1601 1602 1603
			return -ECHILD;

		if (nd_alloc_stack(nd))
			return 0;
1604
	}
1605
	return -ENOMEM;
1606 1607
}

1608 1609
/*
 * Flag bits for the 'flags' argument of walk_component(), step_into()
 * and pick_link():
 *  WALK_TRAILING - step_into() leaves a symlink unfollowed when this is
 *		    set and LOOKUP_FOLLOW is not; pick_link() calls
 *		    may_follow_link() only when this is set.
 *  WALK_MORE	  - walk_component() does not put_link() the current
 *		    stack entry when this is set.
 *  WALK_NOFOLLOW - step_into() never follows a symlink when this is set.
 */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

1610
static const char *pick_link(struct nameidata *nd, struct path *link,
1611
		     struct inode *inode, unsigned seq, int flags)
1612
{
A
Al Viro 已提交
1613
	struct saved *last;
1614
	const char *res;
1615
	int error = reserve_stack(nd, link, seq);
1616

1617
	if (unlikely(error)) {
1618
		if (!(nd->flags & LOOKUP_RCU))
A
Al Viro 已提交
1619
			path_put(link);
1620
		return ERR_PTR(error);
1621
	}
1622
	last = nd->stack + nd->depth++;
A
Al Viro 已提交
1623
	last->link = *link;
1624
	clear_delayed_call(&last->done);
1625
	last->seq = seq;
1626

1627
	if (flags & WALK_TRAILING) {
1628 1629 1630 1631 1632
		error = may_follow_link(nd, inode);
		if (unlikely(error))
			return ERR_PTR(error);
	}

1633 1634
	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
			unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
1635 1636 1637 1638 1639 1640
		return ERR_PTR(-ELOOP);

	if (!(nd->flags & LOOKUP_RCU)) {
		touch_atime(&last->link);
		cond_resched();
	} else if (atime_needs_update(&last->link, inode)) {
1641
		if (!try_to_unlazy(nd))
1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657
			return ERR_PTR(-ECHILD);
		touch_atime(&last->link);
	}

	error = security_inode_follow_link(link->dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
		return ERR_PTR(error);

	res = READ_ONCE(inode->i_link);
	if (!res) {
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
		if (nd->flags & LOOKUP_RCU) {
			res = get(NULL, inode, &last->done);
1658
			if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679
				res = get(link->dentry, inode, &last->done);
		} else {
			res = get(link->dentry, inode, &last->done);
		}
		if (!res)
			goto all_done;
		if (IS_ERR(res))
			return res;
	}
	if (*res == '/') {
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		while (unlikely(*++res == '/'))
			;
	}
	if (*res)
		return res;
all_done: // pure jump
	put_link(nd);
	return NULL;
1680 1681
}

1682 1683 1684 1685 1686 1687
/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 */
1688
static const char *step_into(struct nameidata *nd, int flags,
A
Al Viro 已提交
1689
		     struct dentry *dentry, struct inode *inode, unsigned seq)
1690
{
A
Al Viro 已提交
1691 1692 1693 1694
	struct path path;
	int err = handle_mounts(nd, dentry, &path, &inode, &seq);

	if (err < 0)
1695
		return ERR_PTR(err);
A
Al Viro 已提交
1696
	if (likely(!d_is_symlink(path.dentry)) ||
1697
	   ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
A
Al Viro 已提交
1698
	   (flags & WALK_NOFOLLOW)) {
1699
		/* not a symlink or should not follow */
1700 1701 1702 1703 1704 1705
		if (!(nd->flags & LOOKUP_RCU)) {
			dput(nd->path.dentry);
			if (nd->path.mnt != path.mnt)
				mntput(nd->path.mnt);
		}
		nd->path = path;
1706 1707
		nd->inode = inode;
		nd->seq = seq;
1708
		return NULL;
1709
	}
1710
	if (nd->flags & LOOKUP_RCU) {
1711
		/* make sure that d_is_symlink above matches inode */
A
Al Viro 已提交
1712
		if (read_seqcount_retry(&path.dentry->d_seq, seq))
1713
			return ERR_PTR(-ECHILD);
1714 1715 1716
	} else {
		if (path.mnt == nd->path.mnt)
			mntget(path.mnt);
1717
	}
1718
	return pick_link(nd, &path, inode, seq, flags);
1719 1720
}

1721 1722 1723
static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
					struct inode **inodep,
					unsigned *seqp)
1724
{
A
Al Viro 已提交
1725
	struct dentry *parent, *old;
1726

A
Al Viro 已提交
1727 1728 1729
	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
A
Al Viro 已提交
1730
		struct path path;
1731
		unsigned seq;
A
Al Viro 已提交
1732 1733 1734
		if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
					   &nd->root, &path, &seq))
			goto in_root;
1735 1736 1737 1738 1739 1740 1741 1742
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-ECHILD);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		nd->seq = seq;
		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
			return ERR_PTR(-ECHILD);
		/* we know that mountpoint was pinned */
1743
	}
A
Al Viro 已提交
1744 1745 1746 1747 1748 1749 1750 1751 1752 1753
	old = nd->path.dentry;
	parent = old->d_parent;
	*inodep = parent->d_inode;
	*seqp = read_seqcount_begin(&parent->d_seq);
	if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
		return ERR_PTR(-ECHILD);
	if (unlikely(!path_connected(nd->path.mnt, parent)))
		return ERR_PTR(-ECHILD);
	return parent;
in_root:
1754 1755
	if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
		return ERR_PTR(-ECHILD);
1756 1757 1758
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-ECHILD);
	return NULL;
1759 1760
}

1761 1762 1763
static struct dentry *follow_dotdot(struct nameidata *nd,
				 struct inode **inodep,
				 unsigned *seqp)
1764
{
A
Al Viro 已提交
1765 1766 1767 1768 1769
	struct dentry *parent;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
1770 1771 1772 1773 1774
		struct path path;

		if (!choose_mountpoint(real_mount(nd->path.mnt),
				       &nd->root, &path))
			goto in_root;
1775 1776
		path_put(&nd->path);
		nd->path = path;
1777
		nd->inode = path.dentry->d_inode;
1778 1779
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-EXDEV);
1780
	}
A
Al Viro 已提交
1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791
	/* rare case of legitimate dget_parent()... */
	parent = dget_parent(nd->path.dentry);
	if (unlikely(!path_connected(nd->path.mnt, parent))) {
		dput(parent);
		return ERR_PTR(-ENOENT);
	}
	*seqp = 0;
	*inodep = parent->d_inode;
	return parent;

in_root:
1792 1793 1794 1795
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-EXDEV);
	dget(nd->path.dentry);
	return NULL;
1796 1797
}

1798
static const char *handle_dots(struct nameidata *nd, int type)
1799 1800
{
	if (type == LAST_DOTDOT) {
1801
		const char *error = NULL;
1802 1803 1804
		struct dentry *parent;
		struct inode *inode;
		unsigned seq;
1805 1806

		if (!nd->root.mnt) {
1807
			error = ERR_PTR(set_root(nd));
1808 1809 1810 1811
			if (error)
				return error;
		}
		if (nd->flags & LOOKUP_RCU)
1812
			parent = follow_dotdot_rcu(nd, &inode, &seq);
1813
		else
1814 1815 1816 1817 1818 1819 1820 1821 1822 1823
			parent = follow_dotdot(nd, &inode, &seq);
		if (IS_ERR(parent))
			return ERR_CAST(parent);
		if (unlikely(!parent))
			error = step_into(nd, WALK_NOFOLLOW,
					 nd->path.dentry, nd->inode, nd->seq);
		else
			error = step_into(nd, WALK_NOFOLLOW,
					 parent, inode, seq);
		if (unlikely(error))
1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834
			return error;

		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
			/*
			 * If there was a racing rename or mount along our
			 * path, then we can't be sure that ".." hasn't jumped
			 * above nd->root (and so userspace should retry or use
			 * some fallback).
			 */
			smp_rmb();
			if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
1835
				return ERR_PTR(-EAGAIN);
1836
			if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
1837
				return ERR_PTR(-EAGAIN);
1838 1839
		}
	}
1840
	return NULL;
1841 1842
}

1843
static const char *walk_component(struct nameidata *nd, int flags)
1844
{
1845
	struct dentry *dentry;
1846
	struct inode *inode;
1847
	unsigned seq;
1848 1849 1850 1851 1852
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
1853
	if (unlikely(nd->last_type != LAST_NORM)) {
A
Al Viro 已提交
1854
		if (!(flags & WALK_MORE) && nd->depth)
1855
			put_link(nd);
1856
		return handle_dots(nd, nd->last_type);
1857
	}
1858 1859
	dentry = lookup_fast(nd, &inode, &seq);
	if (IS_ERR(dentry))
1860
		return ERR_CAST(dentry);
1861
	if (unlikely(!dentry)) {
1862 1863
		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
		if (IS_ERR(dentry))
1864
			return ERR_CAST(dentry);
1865
	}
1866 1867
	if (!(flags & WALK_MORE) && nd->depth)
		put_link(nd);
1868
	return step_into(nd, flags, dentry, inode, seq);
1869 1870
}

1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

1890
#include <asm/word-at-a-time.h>
1891

1892
#ifdef HASH_MIX
1893

1894
/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
1895

1896
#elif defined(CONFIG_64BIT)
1897
/*
1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
1925
 */
1926 1927 1928 1929 1930
/*
 * One mixing round: fold the input word @a into the two-word state (@x, @y).
 * @x and @y are assigned to and appear several times in the expansion, so
 * they must be plain lvalues without side effects.  The expansion is a single
 * parenthesized comma expression using 64-bit rotates by 12 and 45.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)
1931

1932
/*
1933 1934 1935
 * Fold two longs into one 32-bit hash value.  This must be fast, but
 * latency isn't quite as critical, as there is a fair bit of additional
 * work done before the hash value is used.
1936
 */
1937
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
1938
{
1939 1940 1941
	y ^= x * GOLDEN_RATIO_64;
	y *= GOLDEN_RATIO_64;
	return y >> 32;
1942 1943
}

1944 1945
#else	/* 32-bit case */

1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960
/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 */
/*
 * One mixing round: fold the input word @a into the two-word state (@x, @y).
 * @x and @y are assigned to and appear several times in the expansion, so
 * they must be plain lvalues without side effects.  The expansion is a single
 * parenthesized comma expression using 32-bit rotates by 7 and 20.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)
1961

1962
/*
 * Collapse the two-word mixing state into a 32-bit hash by nesting two
 * __hash_32() applications (which lets an arch-optimized multiply be used
 * if one exists).
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	unsigned int hashed_x = __hash_32(x);

	return __hash_32(y ^ hashed_x);
}

1968 1969
#endif

1970 1971 1972 1973 1974 1975 1976
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 */
1977
/*
 * Hash @len bytes at @name, seeded with @salt.
 *
 * Whole words are mixed with HASH_MIX(); a trailing partial word (fewer than
 * sizeof(unsigned long) bytes) is masked with bytemask_from_count() and
 * xor-ed into the state without a further mixing round.  A zero remaining
 * length causes no load at all.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long word, x = 0, y = (unsigned long)salt;

	/* full words */
	while (len >= sizeof(unsigned long)) {
		word = load_unaligned_zeropad(name);
		HASH_MIX(x, y, word);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}

	/* leftover bytes, if any */
	if (len) {
		word = load_unaligned_zeropad(name);
		x ^= word & bytemask_from_count(len);
	}

	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);

1997
/* Return the "hash_len" (hash and length) of a null-terminated string */
1998
u64 hashlen_string(const void *salt, const char *name)
1999
{
2000 2001
	unsigned long a = 0, x = 0, y = (unsigned long)salt;
	unsigned long adata, mask, len;
2002 2003
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

2004 2005 2006
	len = 0;
	goto inside;

2007
	do {
2008
		HASH_MIX(x, y, a);
2009
		len += sizeof(unsigned long);
2010
inside:
2011 2012 2013 2014 2015
		a = load_unaligned_zeropad(name+len);
	} while (!has_zero(a, &adata, &constants));

	adata = prep_zero_mask(a, adata, &constants);
	mask = create_zero_mask(adata);
2016
	x ^= a & zero_bytemask(mask);
2017

2018
	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2019 2020 2021
}
EXPORT_SYMBOL(hashlen_string);

2022 2023
/*
 * Calculate the length and hash of the path component, and
2024
 * return the "hash_len" as the result.
2025
 */
2026
static inline u64 hash_name(const void *salt, const char *name)
2027
{
2028 2029
	unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
	unsigned long adata, bdata, mask, len;
2030
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2031

2032 2033 2034
	len = 0;
	goto inside;

2035
	do {
2036
		HASH_MIX(x, y, a);
2037
		len += sizeof(unsigned long);
2038
inside:
2039
		a = load_unaligned_zeropad(name+len);
2040 2041 2042 2043 2044 2045
		b = a ^ REPEAT_BYTE('/');
	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

	adata = prep_zero_mask(a, adata, &constants);
	bdata = prep_zero_mask(b, bdata, &constants);
	mask = create_zero_mask(adata | bdata);
2046
	x ^= a & zero_bytemask(mask);
2047

2048
	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2049 2050
}

2051
#else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2052

2053
/* Return the hash of a string of known length */
2054
/*
 * Hash @len bytes at @name, seeded with @salt, feeding one byte at a time
 * (as an unsigned char) into partial_name_hash().
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);
	unsigned int i;

	for (i = 0; i < len; i++)
		hash = partial_name_hash((unsigned char)name[i], hash);

	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
L
Linus Torvalds 已提交
2062

2063
/* Return the "hash_len" (hash and length) of a null-terminated string */
2064
/*
 * Compute the combined "hash_len" of the NUL-terminated string @name,
 * seeded with @salt, one byte at a time.  The terminating NUL is neither
 * hashed nor counted.
 */
u64 hashlen_string(const void *salt, const char *name)
{
	unsigned long hash = init_name_hash(salt);
	unsigned long len;

	for (len = 0; name[len]; len++)
		hash = partial_name_hash((unsigned char)name[len], hash);

	return hashlen_create(end_name_hash(hash), len);
}
EXPORT_SYMBOL(hashlen_string);
2078

2079 2080 2081 2082
/*
 * We know there's a real path component here of at least
 * one character.
 */
2083
/*
 * Compute the combined "hash_len" of the path component starting at @name,
 * seeded with @salt, one byte at a time.
 *
 * The first byte is hashed unconditionally (the caller guarantees a
 * component of at least one character); scanning then stops at the first
 * NUL or '/'.
 */
static inline u64 hash_name(const void *salt, const char *name)
{
	unsigned long hash = init_name_hash(salt);
	unsigned long len = 0;
	unsigned long ch = (unsigned char)name[0];

	for (;;) {
		hash = partial_name_hash(ch, hash);
		ch = (unsigned char)name[++len];
		if (!ch || ch == '/')
			break;
	}

	return hashlen_create(end_name_hash(hash), len);
}

2097 2098
#endif

L
Linus Torvalds 已提交
2099 2100
/*
 * Name resolution.
2101 2102
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
L
Linus Torvalds 已提交
2103
 *
2104 2105
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
L
Linus Torvalds 已提交
2106
 */
2107
static int link_path_walk(const char *name, struct nameidata *nd)
L
Linus Torvalds 已提交
2108
{
2109
	int depth = 0; // depth <= nd->depth
L
Linus Torvalds 已提交
2110
	int err;
A
Al Viro 已提交
2111

2112
	nd->last_type = LAST_ROOT;
2113
	nd->flags |= LOOKUP_PARENT;
2114 2115
	if (IS_ERR(name))
		return PTR_ERR(name);
L
Linus Torvalds 已提交
2116 2117
	while (*name=='/')
		name++;
2118 2119
	if (!*name) {
		nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
2120
		return 0;
2121
	}
L
Linus Torvalds 已提交
2122 2123 2124

	/* At this point we know we have a real path component. */
	for(;;) {
2125
		const char *link;
2126
		u64 hash_len;
A
Al Viro 已提交
2127
		int type;
L
Linus Torvalds 已提交
2128

2129
		err = may_lookup(nd);
2130
		if (err)
2131
			return err;
L
Linus Torvalds 已提交
2132

2133
		hash_len = hash_name(nd->path.dentry, name);
L
Linus Torvalds 已提交
2134

A
Al Viro 已提交
2135
		type = LAST_NORM;
2136
		if (name[0] == '.') switch (hashlen_len(hash_len)) {
A
Al Viro 已提交
2137
			case 2:
2138
				if (name[1] == '.') {
A
Al Viro 已提交
2139
					type = LAST_DOTDOT;
A
Al Viro 已提交
2140 2141
					nd->flags |= LOOKUP_JUMPED;
				}
A
Al Viro 已提交
2142 2143 2144 2145
				break;
			case 1:
				type = LAST_DOT;
		}
2146 2147
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
A
Al Viro 已提交
2148
			nd->flags &= ~LOOKUP_JUMPED;
2149
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
2150
				struct qstr this = { { .hash_len = hash_len }, .name = name };
2151
				err = parent->d_op->d_hash(parent, &this);
2152
				if (err < 0)
2153
					return err;
2154 2155
				hash_len = this.hash_len;
				name = this.name;
2156 2157
			}
		}
A
Al Viro 已提交
2158

2159 2160
		nd->last.hash_len = hash_len;
		nd->last.name = name;
2161 2162
		nd->last_type = type;

2163 2164
		name += hashlen_len(hash_len);
		if (!*name)
2165
			goto OK;
2166 2167 2168 2169 2170
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
2171 2172
			name++;
		} while (unlikely(*name == '/'));
2173 2174
		if (unlikely(!*name)) {
OK:
2175
			/* pathname or trailing symlink, done */
2176
			if (!depth) {
2177 2178
				nd->dir_uid = nd->inode->i_uid;
				nd->dir_mode = nd->inode->i_mode;
2179
				nd->flags &= ~LOOKUP_PARENT;
2180
				return 0;
2181
			}
2182
			/* last component of nested symlink */
2183
			name = nd->stack[--depth].name;
2184
			link = walk_component(nd, 0);
A
Al Viro 已提交
2185 2186
		} else {
			/* not the last component */
2187
			link = walk_component(nd, WALK_MORE);
2188
		}
2189 2190 2191 2192
		if (unlikely(link)) {
			if (IS_ERR(link))
				return PTR_ERR(link);
			/* a symlink to follow */
2193
			nd->stack[depth++].name = name;
2194 2195
			name = link;
			continue;
N
Nick Piggin 已提交
2196
		}
2197 2198
		if (unlikely(!d_can_lookup(nd->path.dentry))) {
			if (nd->flags & LOOKUP_RCU) {
2199
				if (!try_to_unlazy(nd))
2200 2201
					return -ECHILD;
			}
2202
			return -ENOTDIR;
2203
		}
L
Linus Torvalds 已提交
2204 2205 2206
	}
}

2207
/* must be paired with terminate_walk() */
2208
static const char *path_init(struct nameidata *nd, unsigned flags)
N
Nick Piggin 已提交
2209
{
2210
	int error;
2211
	const char *s = nd->name->name;
N
Nick Piggin 已提交
2212

J
Jens Axboe 已提交
2213 2214 2215 2216
	/* LOOKUP_CACHED requires RCU, ask caller to retry */
	if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
		return ERR_PTR(-EAGAIN);

2217 2218
	if (!*s)
		flags &= ~LOOKUP_RCU;
2219 2220
	if (flags & LOOKUP_RCU)
		rcu_read_lock();
2221

2222
	nd->flags = flags | LOOKUP_JUMPED;
N
Nick Piggin 已提交
2223
	nd->depth = 0;
2224 2225 2226 2227 2228

	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
	smp_rmb();

2229
	if (flags & LOOKUP_ROOT) {
2230 2231
		struct dentry *root = nd->root.dentry;
		struct inode *inode = root->d_inode;
2232 2233
		if (*s && unlikely(!d_can_lookup(root)))
			return ERR_PTR(-ENOTDIR);
2234 2235 2236
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
2237
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2238
			nd->root_seq = nd->seq;
2239 2240 2241
		} else {
			path_get(&nd->path);
		}
2242
		return s;
2243 2244
	}

N
Nick Piggin 已提交
2245
	nd->root.mnt = NULL;
2246 2247
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
N
Nick Piggin 已提交
2248

2249 2250
	/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
	if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
2251 2252 2253 2254
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		return s;
2255 2256 2257 2258
	}

	/* Relative pathname -- get the starting-point it is relative to. */
	if (nd->dfd == AT_FDCWD) {
A
Al Viro 已提交
2259 2260 2261
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;
N
Nick Piggin 已提交
2262

A
Al Viro 已提交
2263 2264 2265
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
2266
				nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2267 2268 2269 2270
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
2271
			nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2272
		}
N
Nick Piggin 已提交
2273
	} else {
2274
		/* Caller must check execute permissions on the starting path component */
2275
		struct fd f = fdget_raw(nd->dfd);
N
Nick Piggin 已提交
2276 2277
		struct dentry *dentry;

2278
		if (!f.file)
2279
			return ERR_PTR(-EBADF);
N
Nick Piggin 已提交
2280

2281
		dentry = f.file->f_path.dentry;
N
Nick Piggin 已提交
2282

2283 2284 2285
		if (*s && unlikely(!d_can_lookup(dentry))) {
			fdput(f);
			return ERR_PTR(-ENOTDIR);
A
Al Viro 已提交
2286
		}
N
Nick Piggin 已提交
2287

2288
		nd->path = f.file->f_path;
A
Al Viro 已提交
2289
		if (flags & LOOKUP_RCU) {
A
Al Viro 已提交
2290 2291
			nd->inode = nd->path.dentry->d_inode;
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
A
Al Viro 已提交
2292
		} else {
2293
			path_get(&nd->path);
A
Al Viro 已提交
2294
			nd->inode = nd->path.dentry->d_inode;
A
Al Viro 已提交
2295
		}
A
Al Viro 已提交
2296
		fdput(f);
N
Nick Piggin 已提交
2297
	}
2298

2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309
	/* For scoped-lookups we need to set the root to the dirfd as well. */
	if (flags & LOOKUP_IS_SCOPED) {
		nd->root = nd->path;
		if (flags & LOOKUP_RCU) {
			nd->root_seq = nd->seq;
		} else {
			path_get(&nd->root);
			nd->flags |= LOOKUP_ROOT_GRABBED;
		}
	}
	return s;
2310 2311
}

2312
static inline const char *lookup_last(struct nameidata *nd)
2313 2314 2315 2316
{
	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

2317
	return walk_component(nd, WALK_TRAILING);
2318 2319
}

2320 2321
static int handle_lookup_down(struct nameidata *nd)
{
2322
	if (!(nd->flags & LOOKUP_RCU))
2323
		dget(nd->path.dentry);
2324 2325
	return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
			nd->path.dentry, nd->inode, nd->seq));
2326 2327
}

2328
/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
2329
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
2330
{
2331
	const char *s = path_init(nd, flags);
2332
	int err;
N
Nick Piggin 已提交
2333

2334
	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
2335
		err = handle_lookup_down(nd);
2336 2337
		if (unlikely(err < 0))
			s = ERR_PTR(err);
2338 2339
	}

2340 2341 2342
	while (!(err = link_path_walk(s, nd)) &&
	       (s = lookup_last(nd)) != NULL)
		;
2343 2344
	if (!err)
		err = complete_walk(nd);
2345

2346 2347
	if (!err && nd->flags & LOOKUP_DIRECTORY)
		if (!d_can_lookup(nd->path.dentry))
A
Al Viro 已提交
2348
			err = -ENOTDIR;
2349 2350 2351 2352
	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
		err = handle_lookup_down(nd);
		nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please...
	}
2353 2354 2355 2356 2357 2358
	if (!err) {
		*path = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2359
	return err;
A
Al Viro 已提交
2360
}
N
Nick Piggin 已提交
2361

2362 2363
int filename_lookup(int dfd, struct filename *name, unsigned flags,
		    struct path *path, struct path *root)
A
Al Viro 已提交
2364
{
2365
	int retval;
2366
	struct nameidata nd;
2367 2368
	if (IS_ERR(name))
		return PTR_ERR(name);
2369 2370 2371 2372
	if (unlikely(root)) {
		nd.root = *root;
		flags |= LOOKUP_ROOT;
	}
2373
	set_nameidata(&nd, dfd, name);
2374
	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
A
Al Viro 已提交
2375
	if (unlikely(retval == -ECHILD))
2376
		retval = path_lookupat(&nd, flags, path);
A
Al Viro 已提交
2377
	if (unlikely(retval == -ESTALE))
2378
		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
N
Nick Piggin 已提交
2379

2380
	if (likely(!retval))
2381 2382
		audit_inode(name, path->dentry,
			    flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
2383
	restore_nameidata();
2384
	putname(name);
2385
	return retval;
L
Linus Torvalds 已提交
2386 2387
}

2388
/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
2389
static int path_parentat(struct nameidata *nd, unsigned flags,
2390
				struct path *parent)
2391
{
2392
	const char *s = path_init(nd, flags);
2393
	int err = link_path_walk(s, nd);
2394 2395
	if (!err)
		err = complete_walk(nd);
2396 2397 2398 2399 2400 2401
	if (!err) {
		*parent = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
2402 2403 2404
	return err;
}

2405
static struct filename *filename_parentat(int dfd, struct filename *name,
2406 2407
				unsigned int flags, struct path *parent,
				struct qstr *last, int *type)
2408 2409
{
	int retval;
2410
	struct nameidata nd;
2411

2412 2413
	if (IS_ERR(name))
		return name;
2414
	set_nameidata(&nd, dfd, name);
2415
	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2416
	if (unlikely(retval == -ECHILD))
2417
		retval = path_parentat(&nd, flags, parent);
2418
	if (unlikely(retval == -ESTALE))
2419
		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2420 2421 2422
	if (likely(!retval)) {
		*last = nd.last;
		*type = nd.last_type;
2423
		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
2424 2425 2426
	} else {
		putname(name);
		name = ERR_PTR(retval);
2427
	}
2428
	restore_nameidata();
2429
	return name;
2430 2431
}

A
Al Viro 已提交
2432 2433
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
2434
{
2435 2436
	struct filename *filename;
	struct dentry *d;
2437 2438
	struct qstr last;
	int type;
2439

2440 2441
	filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
				    &last, &type);
2442 2443
	if (IS_ERR(filename))
		return ERR_CAST(filename);
2444
	if (unlikely(type != LAST_NORM)) {
2445
		path_put(path);
2446 2447
		putname(filename);
		return ERR_PTR(-EINVAL);
A
Al Viro 已提交
2448
	}
A
Al Viro 已提交
2449
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
2450
	d = __lookup_hash(&last, path->dentry, 0);
A
Al Viro 已提交
2451
	if (IS_ERR(d)) {
A
Al Viro 已提交
2452
		inode_unlock(path->dentry->d_inode);
2453
		path_put(path);
A
Al Viro 已提交
2454
	}
2455
	putname(filename);
A
Al Viro 已提交
2456
	return d;
2457 2458
}

A
Al Viro 已提交
2459 2460
int kern_path(const char *name, unsigned int flags, struct path *path)
{
2461 2462
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags, path, NULL);
A
Al Viro 已提交
2463
}
2464
EXPORT_SYMBOL(kern_path);
A
Al Viro 已提交
2465

2466 2467 2468 2469 2470 2471
/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
2472
 * @path: pointer to struct path to fill
2473 2474 2475
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
		    const char *name, unsigned int flags,
2476
		    struct path *path)
2477
{
2478 2479
	struct path root = {.mnt = mnt, .dentry = dentry};
	/* the first argument of filename_lookup() is ignored with root */
2480 2481
	return filename_lookup(AT_FDCWD, getname_kernel(name),
			       flags , path, &root);
2482
}
2483
EXPORT_SYMBOL(vfs_path_lookup);
2484

2485 2486
static int lookup_one_len_common(const char *name, struct dentry *base,
				 int len, struct qstr *this)
2487
{
2488 2489 2490
	this->name = name;
	this->len = len;
	this->hash = full_name_hash(base, name, len);
A
Al Viro 已提交
2491
	if (!len)
2492
		return -EACCES;
A
Al Viro 已提交
2493

A
Al Viro 已提交
2494 2495
	if (unlikely(name[0] == '.')) {
		if (len < 2 || (len == 2 && name[1] == '.'))
2496
			return -EACCES;
A
Al Viro 已提交
2497 2498
	}

A
Al Viro 已提交
2499
	while (len--) {
2500
		unsigned int c = *(const unsigned char *)name++;
A
Al Viro 已提交
2501
		if (c == '/' || c == '\0')
2502
			return -EACCES;
A
Al Viro 已提交
2503
	}
2504 2505 2506 2507 2508
	/*
	 * See if the low-level filesystem might want
	 * to use its own hash..
	 */
	if (base->d_flags & DCACHE_OP_HASH) {
2509
		int err = base->d_op->d_hash(base, this);
2510
		if (err < 0)
2511
			return err;
2512
	}
2513

2514 2515 2516
	return inode_permission(base->d_inode, MAY_EXEC);
}

2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545
/**
 * try_lookup_one_len - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Look up a dentry by name in the dcache, returning NULL if it does not
 * currently exist.  The function does not try to create a dentry.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
	struct qstr this;
	int err;

	WARN_ON_ONCE(!inode_is_locked(base->d_inode));

	err = lookup_one_len_common(name, base, len, &this);
	if (err)
		return ERR_PTR(err);

	return lookup_dcache(&this, base, 0);
}
EXPORT_SYMBOL(try_lookup_one_len);

2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558
/**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
2559
	struct dentry *dentry;
2560 2561 2562 2563 2564 2565
	struct qstr this;
	int err;

	WARN_ON_ONCE(!inode_is_locked(base->d_inode));

	err = lookup_one_len_common(name, base, len, &this);
2566 2567 2568
	if (err)
		return ERR_PTR(err);

2569 2570
	dentry = lookup_dcache(&this, base, 0);
	return dentry ? dentry : __lookup_slow(&this, base, 0);
2571
}
2572
EXPORT_SYMBOL(lookup_one_len);
2573

2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590
/**
 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_len_unlocked(const char *name,
				       struct dentry *base, int len)
{
	struct qstr this;
	int err;
2591
	struct dentry *ret;
2592

2593
	err = lookup_one_len_common(name, base, len, &this);
2594 2595 2596
	if (err)
		return ERR_PTR(err);

2597 2598 2599 2600
	ret = lookup_dcache(&this, base, 0);
	if (!ret)
		ret = lookup_slow(&this, base, 0);
	return ret;
2601 2602 2603
}
EXPORT_SYMBOL(lookup_one_len_unlocked);

A
Al Viro 已提交
2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615
/*
 * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
 * on negatives.  Returns known positive or ERR_PTR(); that's what
 * most of the users want.  Note that pinned negative with unlocked parent
 * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
 * need to be very careful; pinned positives have ->d_inode stable, so
 * this one avoids such problems.
 */
struct dentry *lookup_positive_unlocked(const char *name,
				       struct dentry *base, int len)
{
	struct dentry *ret = lookup_one_len_unlocked(name, base, len);
2616
	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
A
Al Viro 已提交
2617 2618 2619 2620 2621 2622 2623
		dput(ret);
		ret = ERR_PTR(-ENOENT);
	}
	return ret;
}
EXPORT_SYMBOL(lookup_positive_unlocked);

2624 2625 2626 2627 2628 2629
#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
	/* Find something mounted on "pts" in the same directory as
	 * the input path.
	 */
2630 2631
	struct dentry *parent = dget_parent(path->dentry);
	struct dentry *child;
A
Al Viro 已提交
2632
	struct qstr this = QSTR_INIT("pts", 3);
2633

2634 2635
	if (unlikely(!path_connected(path->mnt, parent))) {
		dput(parent);
2636
		return -ENOENT;
2637
	}
2638 2639
	dput(path->dentry);
	path->dentry = parent;
2640 2641 2642 2643 2644 2645
	child = d_hash_and_lookup(parent, &this);
	if (!child)
		return -ENOENT;

	path->dentry = child;
	dput(parent);
A
Al Viro 已提交
2646
	follow_down(path);
2647 2648 2649 2650
	return 0;
}
#endif

2651 2652
/*
 * Look up the user-space pathname @name relative to @dfd and store the
 * result in @path.  @flags and @empty are forwarded to getname_flags();
 * the return value is that of filename_lookup().
 */
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
		 struct path *path, int *empty)
{
	struct filename *fname = getname_flags(name, flags, empty);

	return filename_lookup(dfd, fname, flags, path, NULL);
}
EXPORT_SYMBOL(user_path_at_empty);
2658

M
Miklos Szeredi 已提交
2659
int __check_sticky(struct inode *dir, struct inode *inode)
L
Linus Torvalds 已提交
2660
{
2661
	kuid_t fsuid = current_fsuid();
2662

2663
	if (uid_eq(inode->i_uid, fsuid))
L
Linus Torvalds 已提交
2664
		return 0;
2665
	if (uid_eq(dir->i_uid, fsuid))
L
Linus Torvalds 已提交
2666
		return 0;
2667
	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
L
Linus Torvalds 已提交
2668
}
M
Miklos Szeredi 已提交
2669
EXPORT_SYMBOL(__check_sticky);
L
Linus Torvalds 已提交
2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683

/*
 *	Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do antyhing with
 *     links pointing to it.
2684 2685 2686 2687 2688
 *  7. If the victim has an unknown uid or gid we can't change the inode.
 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 10. We can't remove a root or mountpoint.
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
L
Linus Torvalds 已提交
2689 2690
 *     nfs_async_unlink().
 */
2691
static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
L
Linus Torvalds 已提交
2692
{
2693
	struct inode *inode = d_backing_inode(victim);
L
Linus Torvalds 已提交
2694 2695
	int error;

2696
	if (d_is_negative(victim))
L
Linus Torvalds 已提交
2697
		return -ENOENT;
2698
	BUG_ON(!inode);
L
Linus Torvalds 已提交
2699 2700

	BUG_ON(victim->d_parent->d_inode != dir);
2701 2702 2703 2704 2705

	/* Inode writeback is not safe when the uid or gid are invalid. */
	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
		return -EOVERFLOW;

2706
	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
L
Linus Torvalds 已提交
2707

2708
	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2709 2710 2711 2712
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;
2713 2714

	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
2715
	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
L
Linus Torvalds 已提交
2716 2717
		return -EPERM;
	if (isdir) {
M
Miklos Szeredi 已提交
2718
		if (!d_is_dir(victim))
L
Linus Torvalds 已提交
2719 2720 2721
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
M
Miklos Szeredi 已提交
2722
	} else if (d_is_dir(victim))
L
Linus Torvalds 已提交
2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}

/*	Check whether we can create an object with dentry child in directory
 *  dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
2736 2737 2738
 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
 *  4. We should have write and exec permissions on dir
 *  5. We can't do it if dir is immutable (done in permission())
L
Linus Torvalds 已提交
2739
 */
2740
static inline int may_create(struct inode *dir, struct dentry *child)
L
Linus Torvalds 已提交
2741
{
2742
	struct user_namespace *s_user_ns;
2743
	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
L
Linus Torvalds 已提交
2744 2745 2746 2747
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
2748 2749 2750 2751
	s_user_ns = dir->i_sb->s_user_ns;
	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
	    !kgid_has_mapping(s_user_ns, current_fsgid()))
		return -EOVERFLOW;
2752
	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
L
Linus Torvalds 已提交
2753 2754 2755 2756 2757 2758 2759 2760 2761 2762
}

/*
 * p1 and p2 should be directories on the same fs.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p;

	if (p1 == p2) {
A
Al Viro 已提交
2763
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
L
Linus Torvalds 已提交
2764 2765 2766
		return NULL;
	}

2767
	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2768

2769 2770
	p = d_ancestor(p2, p1);
	if (p) {
A
Al Viro 已提交
2771 2772
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
2773
		return p;
L
Linus Torvalds 已提交
2774 2775
	}

2776 2777
	p = d_ancestor(p1, p2);
	if (p) {
A
Al Viro 已提交
2778 2779
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
2780
		return p;
L
Linus Torvalds 已提交
2781 2782
	}

A
Al Viro 已提交
2783 2784
	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
L
Linus Torvalds 已提交
2785 2786
	return NULL;
}
2787
EXPORT_SYMBOL(lock_rename);
L
Linus Torvalds 已提交
2788 2789 2790

void unlock_rename(struct dentry *p1, struct dentry *p2)
{
A
Al Viro 已提交
2791
	inode_unlock(p1->d_inode);
L
Linus Torvalds 已提交
2792
	if (p1 != p2) {
A
Al Viro 已提交
2793
		inode_unlock(p2->d_inode);
2794
		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
L
Linus Torvalds 已提交
2795 2796
	}
}
2797
EXPORT_SYMBOL(unlock_rename);
L
Linus Torvalds 已提交
2798

A
Al Viro 已提交
2799
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
A
Al Viro 已提交
2800
		bool want_excl)
L
Linus Torvalds 已提交
2801
{
2802
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
2803 2804 2805
	if (error)
		return error;

A
Al Viro 已提交
2806
	if (!dir->i_op->create)
L
Linus Torvalds 已提交
2807 2808 2809 2810 2811 2812
		return -EACCES;	/* shouldn't it be ENOSYS? */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
A
Al Viro 已提交
2813
	error = dir->i_op->create(dir, dentry, mode, want_excl);
2814
	if (!error)
2815
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
2816 2817
	return error;
}
2818
EXPORT_SYMBOL(vfs_create);
L
Linus Torvalds 已提交
2819

A
Al Viro 已提交
2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840
/**
 * vfs_mkobj - create a regular-file object through a caller-supplied callback
 * @dentry:	dentry of the object to create; its parent supplies the directory
 * @mode:	requested mode; reduced to S_IALLUGO bits and forced to S_IFREG
 * @f:		callback that performs the actual creation
 * @arg:	opaque argument handed to @f
 *
 * Performs the same permission and security checks as a regular create,
 * then calls @f(@dentry, mode, @arg).  fsnotify_create() is called only
 * when @f returns 0.
 *
 * Returns 0 on success or the error from the first failing step.
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
		int (*f)(struct dentry *, umode_t, void *),
		void *arg)
{
	struct inode *parent = dentry->d_parent->d_inode;
	int rc;

	rc = may_create(parent, dentry);
	if (rc)
		return rc;

	mode = (mode & S_IALLUGO) | S_IFREG;
	rc = security_inode_create(parent, dentry, mode);
	if (rc)
		return rc;

	rc = f(dentry, mode, arg);
	if (rc)
		return rc;

	fsnotify_create(parent, dentry);
	return 0;
}
EXPORT_SYMBOL(vfs_mkobj);

2841 2842 2843 2844 2845 2846
bool may_open_dev(const struct path *path)
{
	return !(path->mnt->mnt_flags & MNT_NODEV) &&
		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}

A
Al Viro 已提交
2847
static int may_open(const struct path *path, int acc_mode, int flag)
L
Linus Torvalds 已提交
2848
{
2849
	struct dentry *dentry = path->dentry;
L
Linus Torvalds 已提交
2850 2851 2852 2853 2854 2855
	struct inode *inode = dentry->d_inode;
	int error;

	if (!inode)
		return -ENOENT;

C
Christoph Hellwig 已提交
2856 2857
	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
L
Linus Torvalds 已提交
2858
		return -ELOOP;
C
Christoph Hellwig 已提交
2859
	case S_IFDIR:
2860
		if (acc_mode & MAY_WRITE)
C
Christoph Hellwig 已提交
2861
			return -EISDIR;
2862 2863
		if (acc_mode & MAY_EXEC)
			return -EACCES;
C
Christoph Hellwig 已提交
2864 2865 2866
		break;
	case S_IFBLK:
	case S_IFCHR:
2867
		if (!may_open_dev(path))
L
Linus Torvalds 已提交
2868
			return -EACCES;
K
Kees Cook 已提交
2869
		fallthrough;
C
Christoph Hellwig 已提交
2870 2871
	case S_IFIFO:
	case S_IFSOCK:
K
Kees Cook 已提交
2872 2873
		if (acc_mode & MAY_EXEC)
			return -EACCES;
L
Linus Torvalds 已提交
2874
		flag &= ~O_TRUNC;
C
Christoph Hellwig 已提交
2875
		break;
2876 2877 2878 2879
	case S_IFREG:
		if ((acc_mode & MAY_EXEC) && path_noexec(path))
			return -EACCES;
		break;
2880
	}
2881

A
Al Viro 已提交
2882
	error = inode_permission(inode, MAY_OPEN | acc_mode);
2883 2884
	if (error)
		return error;
M
Mimi Zohar 已提交
2885

L
Linus Torvalds 已提交
2886 2887 2888 2889
	/*
	 * An append-only file must be opened in append mode for writing.
	 */
	if (IS_APPEND(inode)) {
2890
		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2891
			return -EPERM;
L
Linus Torvalds 已提交
2892
		if (flag & O_TRUNC)
2893
			return -EPERM;
L
Linus Torvalds 已提交
2894 2895 2896
	}

	/* O_NOATIME can only be set by the owner or superuser */
2897
	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2898
		return -EPERM;
L
Linus Torvalds 已提交
2899

2900
	return 0;
2901
}
L
Linus Torvalds 已提交
2902

2903
static int handle_truncate(struct file *filp)
2904
{
A
Al Viro 已提交
2905
	const struct path *path = &filp->f_path;
2906 2907 2908 2909 2910 2911 2912
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;
	/*
	 * Refuse to truncate files with mandatory locks held on them.
	 */
2913
	error = locks_verify_locked(filp);
2914
	if (!error)
2915
		error = security_path_truncate(path);
2916 2917 2918
	if (!error) {
		error = do_truncate(path->dentry, 0,
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
2919
				    filp);
2920 2921
	}
	put_write_access(inode);
M
Mimi Zohar 已提交
2922
	return error;
L
Linus Torvalds 已提交
2923 2924
}

2925 2926
/*
 * Convert open(2)-style flags to the form handed to ->atomic_open().
 *
 * The only adjustment is to the access-mode field: the value 3 (both
 * O_ACCMODE bits set) is decremented to 2.  All other bits, and all other
 * access modes, are returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	if ((flag & O_ACCMODE) == 3)
		flag--;
	return flag;
}

2932
/*
 * Permission checks for creating @dentry with @mode inside directory @dir
 * on the O_CREAT path.
 *
 * In order: the path-based mknod security hook; a check that the caller's
 * fsuid and fsgid have mappings in the superblock's user namespace
 * (-EOVERFLOW if not); write+exec permission on the directory inode; and
 * finally the inode-based create security hook, whose result is returned.
 */
static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
{
	struct user_namespace *s_user_ns;
	int error = security_path_mknod(dir, dentry, mode, 0);
	if (error)
		return error;

	s_user_ns = dir->dentry->d_sb->s_user_ns;
	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
	    !kgid_has_mapping(s_user_ns, current_fsgid()))
		return -EOVERFLOW;

	error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;

	return security_inode_create(dir->dentry->d_inode, dentry, mode);
}

2951 2952 2953 2954 2955 2956 2957
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Returns 0 if successful.  The file will have been created and attached to
 * @file by the filesystem calling finish_open().
 *
2958 2959 2960
 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
 * be set.  The caller will need to perform the open themselves.  @path will
 * have been updated to point to the new dentry.  This may be negative.
2961 2962 2963
 *
 * Returns an error code otherwise.
 */
2964 2965 2966
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
				  struct file *file,
				  int open_flag, umode_t mode)
M
Miklos Szeredi 已提交
2967
{
2968
	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
M
Miklos Szeredi 已提交
2969 2970 2971 2972 2973 2974
	struct inode *dir =  nd->path.dentry->d_inode;
	int error;

	if (nd->flags & LOOKUP_DIRECTORY)
		open_flag |= O_DIRECTORY;

A
Al Viro 已提交
2975 2976
	file->f_path.dentry = DENTRY_NOT_SET;
	file->f_path.mnt = nd->path.mnt;
2977
	error = dir->i_op->atomic_open(dir, dentry, file,
2978
				       open_to_namei_flags(open_flag), mode);
2979
	d_lookup_done(dentry);
2980
	if (!error) {
2981
		if (file->f_mode & FMODE_OPENED) {
2982 2983 2984 2985
			if (unlikely(dentry != file->f_path.dentry)) {
				dput(dentry);
				dentry = dget(file->f_path.dentry);
			}
2986
		} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
2987
			error = -EIO;
2988
		} else {
2989 2990 2991
			if (file->f_path.dentry) {
				dput(dentry);
				dentry = file->f_path.dentry;
2992
			}
2993
			if (unlikely(d_is_negative(dentry)))
A
Al Viro 已提交
2994
				error = -ENOENT;
2995
		}
M
Miklos Szeredi 已提交
2996
	}
2997 2998 2999 3000 3001
	if (error) {
		dput(dentry);
		dentry = ERR_PTR(error);
	}
	return dentry;
M
Miklos Szeredi 已提交
3002 3003
}

M
Miklos Szeredi 已提交
3004
/*
3005
 * Look up and maybe create and open the last component.
M
Miklos Szeredi 已提交
3006
 *
3007
 * Must be called with parent locked (exclusive in O_CREAT case).
3008
 *
3009 3010 3011 3012 3013 3014 3015
 * Returns 0 on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These case are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case dentry returned in @path might be negative if O_CREAT
 * hadn't been specified.
3016
 *
3017
 * An error code is returned on failure.
M
Miklos Szeredi 已提交
3018
 */
3019 3020 3021
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
				  const struct open_flags *op,
				  bool got_write)
M
Miklos Szeredi 已提交
3022 3023
{
	struct dentry *dir = nd->path.dentry;
3024
	struct inode *dir_inode = dir->d_inode;
3025
	int open_flag = op->open_flag;
M
Miklos Szeredi 已提交
3026
	struct dentry *dentry;
3027 3028
	int error, create_error = 0;
	umode_t mode = op->mode;
3029
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
M
Miklos Szeredi 已提交
3030

3031
	if (unlikely(IS_DEADDIR(dir_inode)))
3032
		return ERR_PTR(-ENOENT);
M
Miklos Szeredi 已提交
3033

3034
	file->f_mode &= ~FMODE_CREATED;
3035 3036 3037 3038 3039
	dentry = d_lookup(dir, &nd->last);
	for (;;) {
		if (!dentry) {
			dentry = d_alloc_parallel(dir, &nd->last, &wq);
			if (IS_ERR(dentry))
3040
				return dentry;
3041 3042 3043
		}
		if (d_in_lookup(dentry))
			break;
M
Miklos Szeredi 已提交
3044

3045 3046 3047 3048 3049 3050 3051 3052 3053 3054
		error = d_revalidate(dentry, nd->flags);
		if (likely(error > 0))
			break;
		if (error)
			goto out_dput;
		d_invalidate(dentry);
		dput(dentry);
		dentry = NULL;
	}
	if (dentry->d_inode) {
3055
		/* Cached positive dentry: will open in f_op->open */
3056
		return dentry;
3057
	}
M
Miklos Szeredi 已提交
3058

3059 3060 3061 3062 3063 3064 3065 3066 3067
	/*
	 * Checking write permission is tricky, bacuse we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists.  But checking existence breaks atomicity.  The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returing the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
	 */
3068 3069
	if (unlikely(!got_write))
		open_flag &= ~O_TRUNC;
3070
	if (open_flag & O_CREAT) {
3071 3072
		if (open_flag & O_EXCL)
			open_flag &= ~O_TRUNC;
3073 3074
		if (!IS_POSIXACL(dir->d_inode))
			mode &= ~current_umask();
3075
		if (likely(got_write))
3076
			create_error = may_o_create(&nd->path, dentry, mode);
3077 3078
		else
			create_error = -EROFS;
M
Miklos Szeredi 已提交
3079
	}
3080 3081
	if (create_error)
		open_flag &= ~O_CREAT;
3082
	if (dir_inode->i_op->atomic_open) {
3083
		dentry = atomic_open(nd, dentry, file, open_flag, mode);
3084 3085 3086
		if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
			dentry = ERR_PTR(create_error);
		return dentry;
M
Miklos Szeredi 已提交
3087
	}
3088

3089
	if (d_in_lookup(dentry)) {
3090 3091
		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
							     nd->flags);
3092
		d_lookup_done(dentry);
3093 3094 3095 3096 3097 3098 3099 3100
		if (unlikely(res)) {
			if (IS_ERR(res)) {
				error = PTR_ERR(res);
				goto out_dput;
			}
			dput(dentry);
			dentry = res;
		}
3101 3102
	}

M
Miklos Szeredi 已提交
3103
	/* Negative dentry, just create the file */
3104
	if (!dentry->d_inode && (open_flag & O_CREAT)) {
3105
		file->f_mode |= FMODE_CREATED;
3106 3107 3108
		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
		if (!dir_inode->i_op->create) {
			error = -EACCES;
M
Miklos Szeredi 已提交
3109
			goto out_dput;
3110 3111
		}
		error = dir_inode->i_op->create(dir_inode, dentry, mode,
3112
						open_flag & O_EXCL);
M
Miklos Szeredi 已提交
3113 3114 3115
		if (error)
			goto out_dput;
	}
3116 3117 3118
	if (unlikely(create_error) && !dentry->d_inode) {
		error = create_error;
		goto out_dput;
M
Miklos Szeredi 已提交
3119
	}
3120
	return dentry;
M
Miklos Szeredi 已提交
3121 3122 3123

out_dput:
	dput(dentry);
3124
	return ERR_PTR(error);
M
Miklos Szeredi 已提交
3125 3126
}

3127
static const char *open_last_lookups(struct nameidata *nd,
3128
		   struct file *file, const struct open_flags *op)
3129
{
3130
	struct dentry *dir = nd->path.dentry;
3131
	int open_flag = op->open_flag;
3132
	bool got_write = false;
3133
	unsigned seq;
3134
	struct inode *inode;
3135
	struct dentry *dentry;
3136
	const char *res;
3137

3138 3139
	nd->flags |= op->intent;

3140
	if (nd->last_type != LAST_NORM) {
3141 3142
		if (nd->depth)
			put_link(nd);
3143
		return handle_dots(nd, nd->last_type);
3144
	}
3145

3146
	if (!(open_flag & O_CREAT)) {
3147 3148 3149
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		/* we _can_ be in RCU mode here */
3150 3151
		dentry = lookup_fast(nd, &inode, &seq);
		if (IS_ERR(dentry))
3152
			return ERR_CAST(dentry);
3153
		if (likely(dentry))
3154 3155
			goto finish_lookup;

A
Al Viro 已提交
3156
		BUG_ON(nd->flags & LOOKUP_RCU);
3157 3158
	} else {
		/* create side of things */
3159
		if (nd->flags & LOOKUP_RCU) {
3160 3161
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
3162
		}
3163
		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
3164
		/* trailing slashes? */
3165
		if (unlikely(nd->last.name[nd->last.len]))
3166
			return ERR_PTR(-EISDIR);
3167
	}
A
Al Viro 已提交
3168

3169
	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
3170
		got_write = !mnt_want_write(nd->path.mnt);
3171 3172 3173 3174 3175 3176
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
3177 3178 3179 3180
	if (open_flag & O_CREAT)
		inode_lock(dir->d_inode);
	else
		inode_lock_shared(dir->d_inode);
3181
	dentry = lookup_open(nd, file, op, got_write);
3182 3183
	if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
		fsnotify_create(dir->d_inode, dentry);
3184 3185 3186 3187
	if (open_flag & O_CREAT)
		inode_unlock(dir->d_inode);
	else
		inode_unlock_shared(dir->d_inode);
3188

3189
	if (got_write)
3190
		mnt_drop_write(nd->path.mnt);
M
Miklos Szeredi 已提交
3191

3192 3193 3194
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);

3195
	if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
3196 3197
		dput(nd->path.dentry);
		nd->path.dentry = dentry;
3198
		return NULL;
3199 3200
	}

3201
finish_lookup:
3202 3203
	if (nd->depth)
		put_link(nd);
3204
	res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
3205
	if (unlikely(res))
3206
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
3207
	return res;
3208 3209 3210 3211 3212
}

/*
 * Handle the last step of open()
 */
3213
static int do_open(struct nameidata *nd,
3214 3215 3216 3217 3218 3219 3220
		   struct file *file, const struct open_flags *op)
{
	int open_flag = op->open_flag;
	bool do_truncate;
	int acc_mode;
	int error;

3221 3222 3223 3224 3225
	if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
		error = complete_walk(nd);
		if (error)
			return error;
	}
3226 3227
	if (!(file->f_mode & FMODE_CREATED))
		audit_inode(nd->name, nd->path.dentry, 0);
3228
	if (open_flag & O_CREAT) {
3229 3230
		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
			return -EEXIST;
3231
		if (d_is_dir(nd->path.dentry))
3232
			return -EISDIR;
3233
		error = may_create_in_sticky(nd->dir_mode, nd->dir_uid,
3234 3235
					     d_backing_inode(nd->path.dentry));
		if (unlikely(error))
3236
			return error;
3237
	}
M
Miklos Szeredi 已提交
3238
	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3239
		return -ENOTDIR;
3240

3241 3242
	do_truncate = false;
	acc_mode = op->acc_mode;
3243 3244 3245 3246
	if (file->f_mode & FMODE_CREATED) {
		/* Don't check for write permission, don't truncate */
		open_flag &= ~O_TRUNC;
		acc_mode = 0;
3247
	} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
3248 3249
		error = mnt_want_write(nd->path.mnt);
		if (error)
3250
			return error;
3251
		do_truncate = true;
3252
	}
3253
	error = may_open(&nd->path, acc_mode, open_flag);
3254
	if (!error && !(file->f_mode & FMODE_OPENED))
A
Al Viro 已提交
3255
		error = vfs_open(&nd->path, file);
3256 3257 3258
	if (!error)
		error = ima_file_check(file, op->acc_mode);
	if (!error && do_truncate)
3259
		error = handle_truncate(file);
3260 3261 3262 3263
	if (unlikely(error > 0)) {
		WARN_ON(1);
		error = -EINVAL;
	}
3264
	if (do_truncate)
3265
		mnt_drop_write(nd->path.mnt);
3266
	return error;
3267 3268
}

3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283
/**
 * vfs_tmpfile - create an unnamed temporary file in a directory
 * @dentry:	dentry of the directory to create the file in
 * @mode:	mode handed to the filesystem's ->tmpfile()
 * @open_flag:	open flags; without O_EXCL the new inode is marked I_LINKABLE
 *
 * Returns the new child dentry on success.  On failure returns an
 * ERR_PTR(): the inode_permission() error, -EOPNOTSUPP if the directory
 * has no ->tmpfile(), -ENOMEM if the dentry cannot be allocated, the
 * ->tmpfile() error, or -ENOENT if ->tmpfile() left the dentry negative.
 */
struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
{
	struct dentry *child = NULL;
	struct inode *dir = dentry->d_inode;
	struct inode *inode;
	int error;

	/* we want directory to be writable */
	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
	if (error)
		goto out_err;
	error = -EOPNOTSUPP;
	if (!dir->i_op->tmpfile)
		goto out_err;
	error = -ENOMEM;
	child = d_alloc(dentry, &slash_name);
	if (unlikely(!child))
		goto out_err;
	error = dir->i_op->tmpfile(dir, child, mode);
	if (error)
		goto out_err;
	error = -ENOENT;
	inode = child->d_inode;
	if (unlikely(!inode))
		goto out_err;
	if (!(open_flag & O_EXCL)) {
		spin_lock(&inode->i_lock);
		inode->i_state |= I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
	ima_post_create_tmpfile(inode);
	return child;

out_err:
	/* child is still NULL if we failed before d_alloc() */
	dput(child);
	return ERR_PTR(error);
}
EXPORT_SYMBOL(vfs_tmpfile);

3308
static int do_tmpfile(struct nameidata *nd, unsigned flags,
3309
		const struct open_flags *op,
3310
		struct file *file)
3311
{
3312 3313
	struct dentry *child;
	struct path path;
3314
	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
3315 3316
	if (unlikely(error))
		return error;
3317
	error = mnt_want_write(path.mnt);
3318 3319
	if (unlikely(error))
		goto out;
3320 3321
	child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
	error = PTR_ERR(child);
3322
	if (IS_ERR(child))
3323
		goto out2;
3324 3325
	dput(path.dentry);
	path.dentry = child;
3326
	audit_inode(nd->name, child, 0);
3327
	/* Don't check for other permissions, the inode was just created */
A
Al Viro 已提交
3328
	error = may_open(&path, 0, op->open_flag);
3329 3330
	if (!error)
		error = vfs_open(&path, file);
3331
out2:
3332
	mnt_drop_write(path.mnt);
3333
out:
3334
	path_put(&path);
3335 3336 3337
	return error;
}

3338 3339 3340 3341 3342 3343
/*
 * Open path for a file that has O_PATH set: look the path up and hand the
 * result to vfs_open().  No may_open() check is made here.  The path
 * reference taken by the lookup is dropped before returning.
 *
 * Returns 0 on success, or the error from the lookup or from vfs_open().
 */
static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
	struct path path;
	int error = path_lookupat(nd, flags, &path);
	if (!error) {
		audit_inode(nd->name, path.dentry, 0);
		error = vfs_open(&path, file);
		path_put(&path);
	}
	return error;
}

3350 3351
static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
L
Linus Torvalds 已提交
3352
{
A
Al Viro 已提交
3353
	struct file *file;
3354
	int error;
N
Nick Piggin 已提交
3355

3356
	file = alloc_empty_file(op->open_flag, current_cred());
3357 3358
	if (IS_ERR(file))
		return file;
N
Nick Piggin 已提交
3359

A
Al Viro 已提交
3360
	if (unlikely(file->f_flags & __O_TMPFILE)) {
3361
		error = do_tmpfile(nd, flags, op, file);
3362
	} else if (unlikely(file->f_flags & O_PATH)) {
3363
		error = do_o_path(nd, flags, file);
3364 3365 3366
	} else {
		const char *s = path_init(nd, flags);
		while (!(error = link_path_walk(s, nd)) &&
3367
		       (s = open_last_lookups(nd, file, op)) != NULL)
3368
			;
3369 3370
		if (!error)
			error = do_open(nd, file, op);
3371
		terminate_walk(nd);
3372
	}
3373
	if (likely(!error)) {
3374
		if (likely(file->f_mode & FMODE_OPENED))
3375 3376 3377
			return file;
		WARN_ON(1);
		error = -EINVAL;
3378
	}
3379 3380 3381 3382 3383 3384
	fput(file);
	if (error == -EOPENSTALE) {
		if (flags & LOOKUP_RCU)
			error = -ECHILD;
		else
			error = -ESTALE;
3385
	}
3386
	return ERR_PTR(error);
L
Linus Torvalds 已提交
3387 3388
}

3389
struct file *do_filp_open(int dfd, struct filename *pathname,
3390
		const struct open_flags *op)
3391
{
3392
	struct nameidata nd;
3393
	int flags = op->lookup_flags;
3394 3395
	struct file *filp;

3396
	set_nameidata(&nd, dfd, pathname);
3397
	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
3398
	if (unlikely(filp == ERR_PTR(-ECHILD)))
3399
		filp = path_openat(&nd, op, flags);
3400
	if (unlikely(filp == ERR_PTR(-ESTALE)))
3401
		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
3402
	restore_nameidata();
3403 3404 3405
	return filp;
}

A
Al Viro 已提交
3406
struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3407
		const char *name, const struct open_flags *op)
A
Al Viro 已提交
3408
{
3409
	struct nameidata nd;
A
Al Viro 已提交
3410
	struct file *file;
3411
	struct filename *filename;
3412
	int flags = op->lookup_flags | LOOKUP_ROOT;
A
Al Viro 已提交
3413 3414 3415 3416

	nd.root.mnt = mnt;
	nd.root.dentry = dentry;

3417
	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
A
Al Viro 已提交
3418 3419
		return ERR_PTR(-ELOOP);

3420
	filename = getname_kernel(name);
3421
	if (IS_ERR(filename))
3422 3423
		return ERR_CAST(filename);

3424
	set_nameidata(&nd, -1, filename);
3425
	file = path_openat(&nd, op, flags | LOOKUP_RCU);
A
Al Viro 已提交
3426
	if (unlikely(file == ERR_PTR(-ECHILD)))
3427
		file = path_openat(&nd, op, flags);
A
Al Viro 已提交
3428
	if (unlikely(file == ERR_PTR(-ESTALE)))
3429
		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
3430
	restore_nameidata();
3431
	putname(filename);
A
Al Viro 已提交
3432 3433 3434
	return file;
}

3435
static struct dentry *filename_create(int dfd, struct filename *name,
3436
				struct path *path, unsigned int lookup_flags)
L
Linus Torvalds 已提交
3437
{
3438
	struct dentry *dentry = ERR_PTR(-EEXIST);
3439 3440
	struct qstr last;
	int type;
3441
	int err2;
3442 3443 3444 3445 3446 3447 3448 3449 3450
	int error;
	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);

	/*
	 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
	 * other flags passed in are ignored!
	 */
	lookup_flags &= LOOKUP_REVAL;

3451 3452 3453
	name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
	if (IS_ERR(name))
		return ERR_CAST(name);
L
Linus Torvalds 已提交
3454

3455 3456 3457 3458
	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
3459
	if (unlikely(type != LAST_NORM))
A
Al Viro 已提交
3460
		goto out;
3461

3462
	/* don't fail immediately if it's r/o, at least try to report other errors */
3463
	err2 = mnt_want_write(path->mnt);
3464 3465 3466
	/*
	 * Do the final lookup.
	 */
3467
	lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
A
Al Viro 已提交
3468
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
3469
	dentry = __lookup_hash(&last, path->dentry, lookup_flags);
L
Linus Torvalds 已提交
3470
	if (IS_ERR(dentry))
3471
		goto unlock;
3472

3473
	error = -EEXIST;
3474
	if (d_is_positive(dentry))
3475
		goto fail;
3476

3477 3478 3479 3480 3481 3482
	/*
	 * Special case - lookup gave negative, but... we had foo/bar/
	 * From the vfs_mknod() POV we just have a negative dentry -
	 * all is fine. Let's be bastards - you had / on the end, you've
	 * been asking for (non-existent) directory. -ENOENT for you.
	 */
3483
	if (unlikely(!is_dir && last.name[last.len])) {
3484
		error = -ENOENT;
A
Al Viro 已提交
3485
		goto fail;
3486
	}
3487 3488
	if (unlikely(err2)) {
		error = err2;
3489
		goto fail;
3490
	}
3491
	putname(name);
L
Linus Torvalds 已提交
3492 3493
	return dentry;
fail:
3494 3495 3496
	dput(dentry);
	dentry = ERR_PTR(error);
unlock:
A
Al Viro 已提交
3497
	inode_unlock(path->dentry->d_inode);
3498
	if (!err2)
3499
		mnt_drop_write(path->mnt);
A
Al Viro 已提交
3500
out:
3501
	path_put(path);
3502
	putname(name);
L
Linus Torvalds 已提交
3503 3504
	return dentry;
}
3505 3506 3507 3508

/*
 * Kernel-space-pathname front end to filename_create(): wraps @pathname
 * with getname_kernel() and forwards the remaining arguments unchanged.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	return filename_create(dfd, getname_kernel(pathname),
				path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);

A
Al Viro 已提交
3514 3515 3516
void done_path_create(struct path *path, struct dentry *dentry)
{
	dput(dentry);
A
Al Viro 已提交
3517
	inode_unlock(path->dentry->d_inode);
3518
	mnt_drop_write(path->mnt);
A
Al Viro 已提交
3519 3520 3521 3522
	path_put(path);
}
EXPORT_SYMBOL(done_path_create);

A
Al Viro 已提交
3523
/*
 * User-space-pathname front end to filename_create(): wraps @pathname
 * with getname() and forwards the remaining arguments unchanged.
 */
inline struct dentry *user_path_create(int dfd, const char __user *pathname,
				struct path *path, unsigned int lookup_flags)
{
	return filename_create(dfd, getname(pathname), path, lookup_flags);
}
EXPORT_SYMBOL(user_path_create);

A
Al Viro 已提交
3530
/**
 * vfs_mknod - create a device node, FIFO or socket
 * @dir:	inode of the parent directory
 * @dentry:	dentry of the node to create
 * @mode:	file type and permission bits of the new node
 * @dev:	device number of the new node
 *
 * Character and block devices require CAP_MKNOD, except for a whiteout
 * (a character device whose number is WHITEOUT_DEV).
 *
 * Returns 0 on success, -EPERM if the capability is missing or the
 * directory has no ->mknod(), or the error from may_create(), the device
 * cgroup check, the security hook or ->mknod().  fsnotify_create() is
 * called only on success.
 */
int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
	int error = may_create(dir, dentry);

	if (error)
		return error;

	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
	    !capable(CAP_MKNOD))
		return -EPERM;

	if (!dir->i_op->mknod)
		return -EPERM;

	error = devcgroup_inode_mknod(mode, dev);
	if (error)
		return error;

	error = security_inode_mknod(dir, dentry, mode, dev);
	if (error)
		return error;

	error = dir->i_op->mknod(dir, dentry, mode, dev);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_mknod);
L
Linus Torvalds 已提交
3559

A
Al Viro 已提交
3560
/*
 * Check that the file type encoded in @mode is one mknod may create.
 *
 * Returns 0 for regular files, character and block devices, FIFOs,
 * sockets and a zero type field; -EPERM for directories; -EINVAL for
 * any other type.
 */
static int may_mknod(umode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
	case 0: /* zero mode translates to S_IFREG */
		return 0;
	case S_IFDIR:
		return -EPERM;
	default:
		return -EINVAL;
	}
}

3577
/*
 * Common implementation of mknod(2) and mknodat(2).
 *
 * Validates the file type, prepares a negative dentry with
 * user_path_create(), applies the umask unless the parent uses POSIX
 * ACLs, runs the path-based security hook and then dispatches on the
 * file type to vfs_create() or vfs_mknod().
 *
 * If retry_estale() asks for it, the whole operation is retried with
 * LOOKUP_REVAL added.  Returns 0 on success or a negative error.
 */
static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
		unsigned int dev)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = 0;

	error = may_mknod(mode);
	if (error)
		return error;
retry:
	dentry = user_path_create(dfd, filename, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = security_path_mknod(&path, dentry, mode, dev);
	if (error)
		goto out;
	/* types not listed here were already rejected by may_mknod() */
	switch (mode & S_IFMT) {
	case 0:
	case S_IFREG:
		error = vfs_create(path.dentry->d_inode, dentry, mode, true);
		if (!error)
			ima_post_path_mknod(dentry);
		break;
	case S_IFCHR:
	case S_IFBLK:
		error = vfs_mknod(path.dentry->d_inode, dentry, mode,
				  new_decode_dev(dev));
		break;
	case S_IFIFO:
	case S_IFSOCK:
		error = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
		break;
	}
out:
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

3621 3622 3623 3624 3625 3626
/* mknodat system call: forwards all four arguments unchanged to do_mknodat(). */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned int, dev)
{
	return do_mknodat(dfd, filename, mode, dev);
}

A
Al Viro 已提交
3627
/* mknod system call: do_mknodat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
	return do_mknodat(AT_FDCWD, filename, mode, dev);
}

3632
/**
 * vfs_mkdir - create a directory
 * @dir:	inode of the parent directory
 * @dentry:	dentry of the directory to create
 * @mode:	requested mode; reduced to the S_IRWXUGO and S_ISVTX bits
 *
 * Returns 0 on success, -EPERM if the parent has no ->mkdir(), -EMLINK
 * if the superblock sets a link limit (s_max_links != 0) and the parent
 * has already reached it, or the error from may_create(), the security
 * hook or ->mkdir().  fsnotify_mkdir() is called only on success.
 */
int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int error = may_create(dir, dentry);
	unsigned max_links = dir->i_sb->s_max_links;

	if (error)
		return error;

	if (!dir->i_op->mkdir)
		return -EPERM;

	mode &= (S_IRWXUGO|S_ISVTX);
	error = security_inode_mkdir(dir, dentry, mode);
	if (error)
		return error;

	if (max_links && dir->i_nlink >= max_links)
		return -EMLINK;

	error = dir->i_op->mkdir(dir, dentry, mode);
	if (!error)
		fsnotify_mkdir(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_mkdir);
L
Linus Torvalds 已提交
3657

3658
/*
 * Common implementation of mkdir(2) and mkdirat(2).
 *
 * Prepares a negative dentry with user_path_create() (LOOKUP_DIRECTORY,
 * so a trailing slash is accepted), applies the umask unless the parent
 * uses POSIX ACLs, runs the path-based security hook and calls
 * vfs_mkdir().
 *
 * If retry_estale() asks for it, the whole operation is retried with
 * LOOKUP_REVAL added.  Returns 0 on success or a negative error.
 */
static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = LOOKUP_DIRECTORY;

retry:
	dentry = user_path_create(dfd, pathname, &path, lookup_flags);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = security_path_mkdir(&path, dentry, mode);
	if (!error)
		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

3683 3684 3685 3686 3687
/* mkdirat system call: forwards all three arguments unchanged to do_mkdirat(). */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(dfd, pathname, mode);
}

3688
/* mkdir system call: do_mkdirat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(AT_FDCWD, pathname, mode);
}

L
Linus Torvalds 已提交
3693 3694 3695 3696 3697 3698 3699
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	int error = may_delete(dir, dentry, 1);

	if (error)
		return error;

A
Al Viro 已提交
3700
	if (!dir->i_op->rmdir)
L
Linus Torvalds 已提交
3701 3702
		return -EPERM;

3703
	dget(dentry);
A
Al Viro 已提交
3704
	inode_lock(dentry->d_inode);
S
Sage Weil 已提交
3705 3706

	error = -EBUSY;
3707
	if (is_local_mountpoint(dentry))
S
Sage Weil 已提交
3708 3709 3710 3711 3712 3713 3714 3715 3716 3717
		goto out;

	error = security_inode_rmdir(dir, dentry);
	if (error)
		goto out;

	error = dir->i_op->rmdir(dir, dentry);
	if (error)
		goto out;

3718
	shrink_dcache_parent(dentry);
S
Sage Weil 已提交
3719 3720
	dentry->d_inode->i_flags |= S_DEAD;
	dont_mount(dentry);
3721
	detach_mounts(dentry);
3722
	fsnotify_rmdir(dir, dentry);
S
Sage Weil 已提交
3723 3724

out:
A
Al Viro 已提交
3725
	inode_unlock(dentry->d_inode);
3726
	dput(dentry);
S
Sage Weil 已提交
3727
	if (!error)
L
Linus Torvalds 已提交
3728 3729 3730
		d_delete(dentry);
	return error;
}
3731
EXPORT_SYMBOL(vfs_rmdir);
L
Linus Torvalds 已提交
3732

3733
long do_rmdir(int dfd, struct filename *name)
L
Linus Torvalds 已提交
3734 3735 3736
{
	int error = 0;
	struct dentry *dentry;
3737 3738 3739
	struct path path;
	struct qstr last;
	int type;
3740 3741
	unsigned int lookup_flags = 0;
retry:
3742
	name = filename_parentat(dfd, name, lookup_flags,
A
Al Viro 已提交
3743
				&path, &last, &type);
3744 3745
	if (IS_ERR(name))
		return PTR_ERR(name);
L
Linus Torvalds 已提交
3746

3747
	switch (type) {
3748 3749 3750 3751 3752 3753 3754 3755 3756
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto exit1;
	case LAST_DOT:
		error = -EINVAL;
		goto exit1;
	case LAST_ROOT:
		error = -EBUSY;
		goto exit1;
L
Linus Torvalds 已提交
3757
	}
3758

3759
	error = mnt_want_write(path.mnt);
3760 3761
	if (error)
		goto exit1;
3762

A
Al Viro 已提交
3763
	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
3764
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
L
Linus Torvalds 已提交
3765
	error = PTR_ERR(dentry);
3766 3767
	if (IS_ERR(dentry))
		goto exit2;
3768 3769 3770 3771
	if (!dentry->d_inode) {
		error = -ENOENT;
		goto exit3;
	}
3772
	error = security_path_rmdir(&path, dentry);
3773
	if (error)
3774
		goto exit3;
3775
	error = vfs_rmdir(path.dentry->d_inode, dentry);
3776
exit3:
3777 3778
	dput(dentry);
exit2:
A
Al Viro 已提交
3779
	inode_unlock(path.dentry->d_inode);
3780
	mnt_drop_write(path.mnt);
L
Linus Torvalds 已提交
3781
exit1:
3782
	path_put(&path);
3783 3784 3785 3786
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
A
Al Viro 已提交
3787
	putname(name);
L
Linus Torvalds 已提交
3788 3789 3790
	return error;
}

3791
/* rmdir system call: do_rmdir() on the user-supplied name, relative to AT_FDCWD. */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
	return do_rmdir(AT_FDCWD, getname(pathname));
}

3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814
/**
 * vfs_unlink - unlink a filesystem object
 * @dir:	parent directory
 * @dentry:	victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 */
int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
3815
{
J
J. Bruce Fields 已提交
3816
	struct inode *target = dentry->d_inode;
L
Linus Torvalds 已提交
3817 3818 3819 3820 3821
	int error = may_delete(dir, dentry, 0);

	if (error)
		return error;

A
Al Viro 已提交
3822
	if (!dir->i_op->unlink)
L
Linus Torvalds 已提交
3823 3824
		return -EPERM;

A
Al Viro 已提交
3825
	inode_lock(target);
3826
	if (is_local_mountpoint(dentry))
L
Linus Torvalds 已提交
3827 3828 3829
		error = -EBUSY;
	else {
		error = security_inode_unlink(dir, dentry);
3830
		if (!error) {
3831 3832
			error = try_break_deleg(target, delegated_inode);
			if (error)
3833
				goto out;
L
Linus Torvalds 已提交
3834
			error = dir->i_op->unlink(dir, dentry);
3835
			if (!error) {
3836
				dont_mount(dentry);
3837
				detach_mounts(dentry);
3838
				fsnotify_unlink(dir, dentry);
3839
			}
3840
		}
L
Linus Torvalds 已提交
3841
	}
3842
out:
A
Al Viro 已提交
3843
	inode_unlock(target);
L
Linus Torvalds 已提交
3844 3845 3846

	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
J
J. Bruce Fields 已提交
3847
		fsnotify_link_count(target);
J
John McCutchan 已提交
3848
		d_delete(dentry);
L
Linus Torvalds 已提交
3849
	}
R
Robert Love 已提交
3850

L
Linus Torvalds 已提交
3851 3852
	return error;
}
3853
EXPORT_SYMBOL(vfs_unlink);
L
Linus Torvalds 已提交
3854 3855 3856

/*
 * Make sure that the actual truncation of the file will occur outside its
3857
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
L
Linus Torvalds 已提交
3858 3859 3860
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 */
3861
long do_unlinkat(int dfd, struct filename *name)
L
Linus Torvalds 已提交
3862
{
3863
	int error;
L
Linus Torvalds 已提交
3864
	struct dentry *dentry;
3865 3866 3867
	struct path path;
	struct qstr last;
	int type;
L
Linus Torvalds 已提交
3868
	struct inode *inode = NULL;
3869
	struct inode *delegated_inode = NULL;
3870 3871
	unsigned int lookup_flags = 0;
retry:
3872
	name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
3873 3874
	if (IS_ERR(name))
		return PTR_ERR(name);
3875

L
Linus Torvalds 已提交
3876
	error = -EISDIR;
3877
	if (type != LAST_NORM)
L
Linus Torvalds 已提交
3878
		goto exit1;
3879

3880
	error = mnt_want_write(path.mnt);
3881 3882
	if (error)
		goto exit1;
3883
retry_deleg:
A
Al Viro 已提交
3884
	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
3885
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
L
Linus Torvalds 已提交
3886 3887 3888
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
		/* Why not before? Because we want correct error value */
3889
		if (last.name[last.len])
3890
			goto slashes;
L
Linus Torvalds 已提交
3891
		inode = dentry->d_inode;
3892
		if (d_is_negative(dentry))
3893 3894
			goto slashes;
		ihold(inode);
3895
		error = security_path_unlink(&path, dentry);
3896
		if (error)
3897
			goto exit2;
3898
		error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
3899
exit2:
L
Linus Torvalds 已提交
3900 3901
		dput(dentry);
	}
A
Al Viro 已提交
3902
	inode_unlock(path.dentry->d_inode);
L
Linus Torvalds 已提交
3903 3904
	if (inode)
		iput(inode);	/* truncate the inode here */
3905 3906
	inode = NULL;
	if (delegated_inode) {
3907
		error = break_deleg_wait(&delegated_inode);
3908 3909 3910
		if (!error)
			goto retry_deleg;
	}
3911
	mnt_drop_write(path.mnt);
L
Linus Torvalds 已提交
3912
exit1:
3913
	path_put(&path);
3914 3915 3916 3917 3918
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		inode = NULL;
		goto retry;
	}
3919
	putname(name);
L
Linus Torvalds 已提交
3920 3921 3922
	return error;

slashes:
3923 3924
	if (d_is_negative(dentry))
		error = -ENOENT;
M
Miklos Szeredi 已提交
3925
	else if (d_is_dir(dentry))
3926 3927 3928
		error = -EISDIR;
	else
		error = -ENOTDIR;
L
Linus Torvalds 已提交
3929 3930 3931
	goto exit2;
}

3932
/*
 * unlinkat(2): AT_REMOVEDIR is the only flag accepted; it selects
 * do_rmdir() instead of do_unlinkat().
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
	if (flag & ~AT_REMOVEDIR)
		return -EINVAL;

	if (!(flag & AT_REMOVEDIR))
		return do_unlinkat(dfd, getname(pathname));

	return do_rmdir(dfd, getname(pathname));
}

3942
/* unlink(2): hand the user-supplied path to do_unlinkat() with AT_FDCWD as base. */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
	struct filename *name = getname(pathname);

	return do_unlinkat(AT_FDCWD, name);
}

3947
int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
L
Linus Torvalds 已提交
3948
{
3949
	int error = may_create(dir, dentry);
L
Linus Torvalds 已提交
3950 3951 3952 3953

	if (error)
		return error;

A
Al Viro 已提交
3954
	if (!dir->i_op->symlink)
L
Linus Torvalds 已提交
3955 3956 3957 3958 3959 3960 3961
		return -EPERM;

	error = security_inode_symlink(dir, dentry, oldname);
	if (error)
		return error;

	error = dir->i_op->symlink(dir, dentry, oldname);
3962
	if (!error)
3963
		fsnotify_create(dir, dentry);
L
Linus Torvalds 已提交
3964 3965
	return error;
}
3966
EXPORT_SYMBOL(vfs_symlink);
L
Linus Torvalds 已提交
3967

3968
/*
 * Common body of symlink(2) and symlinkat(2): create @newname (relative
 * to @newdfd) as a symlink pointing at @oldname.  The creation is
 * repeated once with LOOKUP_REVAL when retry_estale() asks for it.
 */
static long do_symlinkat(const char __user *oldname, int newdfd,
		  const char __user *newname)
{
	unsigned int lookup_flags = 0;
	struct filename *target;
	struct dentry *link;
	struct path parent;
	int err;

	target = getname(oldname);
	if (IS_ERR(target))
		return PTR_ERR(target);

	for (;;) {
		link = user_path_create(newdfd, newname, &parent, lookup_flags);
		if (IS_ERR(link)) {
			err = PTR_ERR(link);
			break;
		}

		err = security_path_symlink(&parent, link, target->name);
		if (!err)
			err = vfs_symlink(parent.dentry->d_inode, link,
					  target->name);
		done_path_create(&parent, link);

		if (!retry_estale(err, lookup_flags))
			break;
		lookup_flags |= LOOKUP_REVAL;
	}

	putname(target);
	return err;
}

3999 4000 4001 4002 4003 4004
/* symlinkat(2): thin entry point, all work is done by do_symlinkat(). */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	long ret = do_symlinkat(oldname, newdfd, newname);

	return ret;
}

4005
/* symlink(2): do_symlinkat() with AT_FDCWD as the base of @newname. */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
	long ret = do_symlinkat(oldname, AT_FDCWD, newname);

	return ret;
}

J
J. Bruce Fields 已提交
4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029
/**
 * vfs_link - create a new link
 * @old_dentry:	object to be linked
 * @dir:	new parent
 * @new_dentry:	where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 */
int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
L
Linus Torvalds 已提交
4030 4031
{
	struct inode *inode = old_dentry->d_inode;
4032
	unsigned max_links = dir->i_sb->s_max_links;
L
Linus Torvalds 已提交
4033 4034 4035 4036 4037
	int error;

	if (!inode)
		return -ENOENT;

4038
	error = may_create(dir, new_dentry);
L
Linus Torvalds 已提交
4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049
	if (error)
		return error;

	if (dir->i_sb != inode->i_sb)
		return -EXDEV;

	/*
	 * A link to an append-only or immutable file cannot be created.
	 */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return -EPERM;
4050 4051 4052 4053 4054 4055 4056
	/*
	 * Updating the link count will likely cause i_uid and i_gid to
	 * be writen back improperly if their true value is unknown to
	 * the vfs.
	 */
	if (HAS_UNMAPPED_ID(inode))
		return -EPERM;
A
Al Viro 已提交
4057
	if (!dir->i_op->link)
L
Linus Torvalds 已提交
4058
		return -EPERM;
4059
	if (S_ISDIR(inode->i_mode))
L
Linus Torvalds 已提交
4060 4061 4062 4063 4064 4065
		return -EPERM;

	error = security_inode_link(old_dentry, dir, new_dentry);
	if (error)
		return error;

A
Al Viro 已提交
4066
	inode_lock(inode);
4067
	/* Make sure we don't allow creating hardlink to an unlinked file */
4068
	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
4069
		error =  -ENOENT;
4070 4071
	else if (max_links && inode->i_nlink >= max_links)
		error = -EMLINK;
J
J. Bruce Fields 已提交
4072 4073 4074 4075 4076
	else {
		error = try_break_deleg(inode, delegated_inode);
		if (!error)
			error = dir->i_op->link(old_dentry, dir, new_dentry);
	}
4077 4078 4079 4080 4081 4082

	if (!error && (inode->i_state & I_LINKABLE)) {
		spin_lock(&inode->i_lock);
		inode->i_state &= ~I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
A
Al Viro 已提交
4083
	inode_unlock(inode);
4084
	if (!error)
4085
		fsnotify_link(dir, inode, new_dentry);
L
Linus Torvalds 已提交
4086 4087
	return error;
}
4088
EXPORT_SYMBOL(vfs_link);
L
Linus Torvalds 已提交
4089 4090 4091 4092 4093 4094 4095 4096 4097 4098

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 */
4099
static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
4100
	      const char __user *newname, int flags)
L
Linus Torvalds 已提交
4101 4102
{
	struct dentry *new_dentry;
4103
	struct path old_path, new_path;
J
J. Bruce Fields 已提交
4104
	struct inode *delegated_inode = NULL;
4105
	int how = 0;
L
Linus Torvalds 已提交
4106 4107
	int error;

4108
	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
4109
		return -EINVAL;
4110
	/*
4111 4112 4113
	 * To use null names we require CAP_DAC_READ_SEARCH
	 * This ensures that not everyone will be able to create
	 * handlink using the passed filedescriptor.
4114
	 */
4115 4116 4117
	if (flags & AT_EMPTY_PATH) {
		if (!capable(CAP_DAC_READ_SEARCH))
			return -ENOENT;
4118
		how = LOOKUP_EMPTY;
4119
	}
4120 4121 4122

	if (flags & AT_SYMLINK_FOLLOW)
		how |= LOOKUP_FOLLOW;
4123
retry:
4124
	error = user_path_at(olddfd, oldname, how, &old_path);
L
Linus Torvalds 已提交
4125
	if (error)
4126 4127
		return error;

4128 4129
	new_dentry = user_path_create(newdfd, newname, &new_path,
					(how & LOOKUP_REVAL));
L
Linus Torvalds 已提交
4130
	error = PTR_ERR(new_dentry);
4131
	if (IS_ERR(new_dentry))
4132 4133 4134 4135 4136
		goto out;

	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto out_dput;
K
Kees Cook 已提交
4137 4138 4139
	error = may_linkat(&old_path);
	if (unlikely(error))
		goto out_dput;
4140
	error = security_path_link(old_path.dentry, &new_path, new_dentry);
4141
	if (error)
4142
		goto out_dput;
J
J. Bruce Fields 已提交
4143
	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
4144
out_dput:
A
Al Viro 已提交
4145
	done_path_create(&new_path, new_dentry);
J
J. Bruce Fields 已提交
4146 4147
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
4148 4149
		if (!error) {
			path_put(&old_path);
J
J. Bruce Fields 已提交
4150
			goto retry;
4151
		}
J
J. Bruce Fields 已提交
4152
	}
4153
	if (retry_estale(error, how)) {
4154
		path_put(&old_path);
4155 4156 4157
		how |= LOOKUP_REVAL;
		goto retry;
	}
L
Linus Torvalds 已提交
4158
out:
4159
	path_put(&old_path);
L
Linus Torvalds 已提交
4160 4161 4162 4163

	return error;
}

4164 4165 4166 4167 4168 4169
/* linkat(2): thin entry point, all work is done by do_linkat(). */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, int, flags)
{
	int ret = do_linkat(olddfd, oldname, newdfd, newname, flags);

	return ret;
}

4170
/* link(2): do_linkat() with AT_FDCWD for both names and no flags. */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
	int ret = do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);

	return ret;
}

4175 4176 4177 4178 4179 4180 4181
/**
 * vfs_rename - rename a filesystem object
 * @old_dir:	parent of source
 * @old_dentry:	source
 * @new_dir:	parent of destination
 * @new_dentry:	destination
 * @delegated_inode: returns an inode needing a delegation break
M
Miklos Szeredi 已提交
4182
 * @flags:	rename flags
4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196
 *
 * The caller must hold multiple mutexes--see lock_rename()).
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
L
Linus Torvalds 已提交
4197 4198 4199
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
4200
 *
4201
 *	a) we can get into loop creation.
L
Linus Torvalds 已提交
4202 4203
 *	b) race potential - two innocent renames can create a loop together.
 *	   That's where 4.4 screws up. Current fix: serialization on
4204
 *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
L
Linus Torvalds 已提交
4205
 *	   story.
4206 4207
 *	c) we have to lock _four_ objects - parents and victim (if it exists),
 *	   and source (if it is not a directory).
4208
 *	   And that - after we got ->i_mutex on parents (until then we don't know
L
Linus Torvalds 已提交
4209 4210
 *	   whether the target exists).  Solution: try to be smart with locking
 *	   order for inodes.  We rely on the fact that tree topology may change
4211
 *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
L
Linus Torvalds 已提交
4212 4213 4214
 *	   move will be locked.  Thus we can rank directories by the tree
 *	   (ancestors first) and rank all non-directories after them.
 *	   That works since everybody except rename does "lock parent, lookup,
4215
 *	   lock child" and rename is under ->s_vfs_rename_mutex.
L
Linus Torvalds 已提交
4216 4217 4218
 *	   HOWEVER, it relies on the assumption that any object with ->lookup()
 *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *	   we'd better make sure that there's no link(2) for them.
4219
 *	d) conversion from fhandle to dentry may come in the wrong moment - when
4220
 *	   we are removing the target. Solution: we will have to grab ->i_mutex
L
Linus Torvalds 已提交
4221
 *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4222
 *	   ->i_mutex on parents, which works but leads to some truly excessive
L
Linus Torvalds 已提交
4223 4224
 *	   locking].
 */
4225 4226
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
	       struct inode *new_dir, struct dentry *new_dentry,
M
Miklos Szeredi 已提交
4227
	       struct inode **delegated_inode, unsigned int flags)
L
Linus Torvalds 已提交
4228
{
4229 4230 4231
	int error;
	bool is_dir = d_is_dir(old_dentry);
	struct inode *source = old_dentry->d_inode;
S
Sage Weil 已提交
4232
	struct inode *target = new_dentry->d_inode;
M
Miklos Szeredi 已提交
4233 4234
	bool new_is_dir = false;
	unsigned max_links = new_dir->i_sb->s_max_links;
A
Al Viro 已提交
4235
	struct name_snapshot old_name;
4236

4237
	if (source == target)
4238 4239 4240 4241 4242 4243
		return 0;

	error = may_delete(old_dir, old_dentry, is_dir);
	if (error)
		return error;

M
Miklos Szeredi 已提交
4244
	if (!target) {
4245
		error = may_create(new_dir, new_dentry);
M
Miklos Szeredi 已提交
4246 4247 4248 4249 4250 4251 4252 4253
	} else {
		new_is_dir = d_is_dir(new_dentry);

		if (!(flags & RENAME_EXCHANGE))
			error = may_delete(new_dir, new_dentry, is_dir);
		else
			error = may_delete(new_dir, new_dentry, new_is_dir);
	}
4254 4255 4256
	if (error)
		return error;

4257
	if (!old_dir->i_op->rename)
4258
		return -EPERM;
L
Linus Torvalds 已提交
4259 4260 4261 4262 4263

	/*
	 * If we are going to change the parent - check write permissions,
	 * we'll need to flip '..'.
	 */
M
Miklos Szeredi 已提交
4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274
	if (new_dir != old_dir) {
		if (is_dir) {
			error = inode_permission(source, MAY_WRITE);
			if (error)
				return error;
		}
		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
			error = inode_permission(target, MAY_WRITE);
			if (error)
				return error;
		}
L
Linus Torvalds 已提交
4275 4276
	}

4277 4278
	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
				      flags);
L
Linus Torvalds 已提交
4279 4280 4281
	if (error)
		return error;

A
Al Viro 已提交
4282
	take_dentry_name_snapshot(&old_name, old_dentry);
4283
	dget(new_dentry);
M
Miklos Szeredi 已提交
4284
	if (!is_dir || (flags & RENAME_EXCHANGE))
4285 4286
		lock_two_nondirectories(source, target);
	else if (target)
A
Al Viro 已提交
4287
		inode_lock(target);
S
Sage Weil 已提交
4288 4289

	error = -EBUSY;
4290
	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
S
Sage Weil 已提交
4291 4292
		goto out;

M
Miklos Szeredi 已提交
4293
	if (max_links && new_dir != old_dir) {
4294
		error = -EMLINK;
M
Miklos Szeredi 已提交
4295
		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4296
			goto out;
M
Miklos Szeredi 已提交
4297 4298 4299 4300 4301
		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
		    old_dir->i_nlink >= max_links)
			goto out;
	}
	if (!is_dir) {
4302
		error = try_break_deleg(source, delegated_inode);
4303 4304
		if (error)
			goto out;
M
Miklos Szeredi 已提交
4305 4306 4307 4308 4309
	}
	if (target && !new_is_dir) {
		error = try_break_deleg(target, delegated_inode);
		if (error)
			goto out;
4310
	}
4311
	error = old_dir->i_op->rename(old_dir, old_dentry,
M
Miklos Szeredi 已提交
4312
				       new_dir, new_dentry, flags);
S
Sage Weil 已提交
4313 4314 4315
	if (error)
		goto out;

M
Miklos Szeredi 已提交
4316
	if (!(flags & RENAME_EXCHANGE) && target) {
4317 4318
		if (is_dir) {
			shrink_dcache_parent(new_dentry);
4319
			target->i_flags |= S_DEAD;
4320
		}
S
Sage Weil 已提交
4321
		dont_mount(new_dentry);
4322
		detach_mounts(new_dentry);
4323
	}
M
Miklos Szeredi 已提交
4324 4325 4326 4327 4328 4329
	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
		if (!(flags & RENAME_EXCHANGE))
			d_move(old_dentry, new_dentry);
		else
			d_exchange(old_dentry, new_dentry);
	}
S
Sage Weil 已提交
4330
out:
M
Miklos Szeredi 已提交
4331
	if (!is_dir || (flags & RENAME_EXCHANGE))
4332 4333
		unlock_two_nondirectories(source, target);
	else if (target)
A
Al Viro 已提交
4334
		inode_unlock(target);
L
Linus Torvalds 已提交
4335
	dput(new_dentry);
M
Miklos Szeredi 已提交
4336
	if (!error) {
4337
		fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
M
Miklos Szeredi 已提交
4338 4339
			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
		if (flags & RENAME_EXCHANGE) {
4340
			fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
M
Miklos Szeredi 已提交
4341 4342 4343
				      new_is_dir, NULL, new_dentry);
		}
	}
A
Al Viro 已提交
4344
	release_dentry_name_snapshot(&old_name);
R
Robert Love 已提交
4345

L
Linus Torvalds 已提交
4346 4347
	return error;
}
4348
EXPORT_SYMBOL(vfs_rename);
L
Linus Torvalds 已提交
4349

4350 4351
int do_renameat2(int olddfd, struct filename *from, int newdfd,
		 struct filename *to, unsigned int flags)
L
Linus Torvalds 已提交
4352
{
4353 4354
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
4355 4356 4357
	struct path old_path, new_path;
	struct qstr old_last, new_last;
	int old_type, new_type;
4358
	struct inode *delegated_inode = NULL;
4359
	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
4360
	bool should_retry = false;
4361
	int error = -EINVAL;
M
Miklos Szeredi 已提交
4362

M
Miklos Szeredi 已提交
4363
	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
4364
		goto put_both;
M
Miklos Szeredi 已提交
4365

M
Miklos Szeredi 已提交
4366 4367
	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
	    (flags & RENAME_EXCHANGE))
4368
		goto put_both;
M
Miklos Szeredi 已提交
4369

4370 4371 4372
	if (flags & RENAME_EXCHANGE)
		target_flags = 0;

4373
retry:
4374 4375
	from = filename_parentat(olddfd, from, lookup_flags, &old_path,
					&old_last, &old_type);
4376 4377
	if (IS_ERR(from)) {
		error = PTR_ERR(from);
4378
		goto put_new;
4379
	}
L
Linus Torvalds 已提交
4380

4381 4382
	to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
				&new_type);
4383 4384
	if (IS_ERR(to)) {
		error = PTR_ERR(to);
L
Linus Torvalds 已提交
4385
		goto exit1;
4386
	}
L
Linus Torvalds 已提交
4387 4388

	error = -EXDEV;
4389
	if (old_path.mnt != new_path.mnt)
L
Linus Torvalds 已提交
4390 4391 4392
		goto exit2;

	error = -EBUSY;
4393
	if (old_type != LAST_NORM)
L
Linus Torvalds 已提交
4394 4395
		goto exit2;

M
Miklos Szeredi 已提交
4396 4397
	if (flags & RENAME_NOREPLACE)
		error = -EEXIST;
4398
	if (new_type != LAST_NORM)
L
Linus Torvalds 已提交
4399 4400
		goto exit2;

4401
	error = mnt_want_write(old_path.mnt);
4402 4403 4404
	if (error)
		goto exit2;

4405
retry_deleg:
4406
	trap = lock_rename(new_path.dentry, old_path.dentry);
L
Linus Torvalds 已提交
4407

4408
	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
L
Linus Torvalds 已提交
4409 4410 4411 4412 4413
	error = PTR_ERR(old_dentry);
	if (IS_ERR(old_dentry))
		goto exit3;
	/* source must exist */
	error = -ENOENT;
4414
	if (d_is_negative(old_dentry))
L
Linus Torvalds 已提交
4415
		goto exit4;
4416
	new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
M
Miklos Szeredi 已提交
4417 4418 4419 4420 4421 4422
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto exit4;
	error = -EEXIST;
	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
		goto exit5;
M
Miklos Szeredi 已提交
4423 4424 4425 4426 4427 4428 4429
	if (flags & RENAME_EXCHANGE) {
		error = -ENOENT;
		if (d_is_negative(new_dentry))
			goto exit5;

		if (!d_is_dir(new_dentry)) {
			error = -ENOTDIR;
4430
			if (new_last.name[new_last.len])
M
Miklos Szeredi 已提交
4431 4432 4433
				goto exit5;
		}
	}
L
Linus Torvalds 已提交
4434
	/* unless the source is a directory trailing slashes give -ENOTDIR */
M
Miklos Szeredi 已提交
4435
	if (!d_is_dir(old_dentry)) {
L
Linus Torvalds 已提交
4436
		error = -ENOTDIR;
4437
		if (old_last.name[old_last.len])
M
Miklos Szeredi 已提交
4438
			goto exit5;
4439
		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
M
Miklos Szeredi 已提交
4440
			goto exit5;
L
Linus Torvalds 已提交
4441 4442 4443 4444
	}
	/* source should not be ancestor of target */
	error = -EINVAL;
	if (old_dentry == trap)
M
Miklos Szeredi 已提交
4445
		goto exit5;
L
Linus Torvalds 已提交
4446
	/* target should not be an ancestor of source */
M
Miklos Szeredi 已提交
4447 4448
	if (!(flags & RENAME_EXCHANGE))
		error = -ENOTEMPTY;
L
Linus Torvalds 已提交
4449 4450 4451
	if (new_dentry == trap)
		goto exit5;

4452 4453
	error = security_path_rename(&old_path, old_dentry,
				     &new_path, new_dentry, flags);
4454
	if (error)
4455
		goto exit5;
4456 4457
	error = vfs_rename(old_path.dentry->d_inode, old_dentry,
			   new_path.dentry->d_inode, new_dentry,
M
Miklos Szeredi 已提交
4458
			   &delegated_inode, flags);
L
Linus Torvalds 已提交
4459 4460 4461 4462 4463
exit5:
	dput(new_dentry);
exit4:
	dput(old_dentry);
exit3:
4464
	unlock_rename(new_path.dentry, old_path.dentry);
4465 4466 4467 4468 4469
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
4470
	mnt_drop_write(old_path.mnt);
L
Linus Torvalds 已提交
4471
exit2:
4472 4473
	if (retry_estale(error, lookup_flags))
		should_retry = true;
4474
	path_put(&new_path);
L
Linus Torvalds 已提交
4475
exit1:
4476
	path_put(&old_path);
4477 4478 4479 4480 4481
	if (should_retry) {
		should_retry = false;
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
4482 4483 4484 4485 4486 4487
put_both:
	if (!IS_ERR(from))
		putname(from);
put_new:
	if (!IS_ERR(to))
		putname(to);
L
Linus Torvalds 已提交
4488 4489 4490
	return error;
}

4491 4492 4493
/* renameat2(2): both names are wrapped by getname() and passed to do_renameat2(). */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, unsigned int, flags)
{
	int ret;

	ret = do_renameat2(olddfd, getname(oldname),
			   newdfd, getname(newname), flags);
	return ret;
}

M
Miklos Szeredi 已提交
4498 4499 4500
/* renameat(2): do_renameat2() with no flags. */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	int ret;

	ret = do_renameat2(olddfd, getname(oldname),
			   newdfd, getname(newname), 0);
	return ret;
}

4505
/* rename(2): do_renameat2() with AT_FDCWD for both names and no flags. */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
	int ret;

	ret = do_renameat2(AT_FDCWD, getname(oldname),
			   AT_FDCWD, getname(newname), 0);
	return ret;
}

A
Al Viro 已提交
4511
/*
 * Copy the symlink body @link to the user buffer @buffer, truncated to
 * @buflen bytes.  No terminating NUL is copied.
 *
 * @link may be an ERR_PTR, in which case that error is passed through.
 * Returns the number of bytes copied, or -EFAULT if copy_to_user() fails.
 */
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
	int len;

	if (IS_ERR(link))
		return PTR_ERR(link);

	len = strlen(link);
	if (len > (unsigned) buflen)
		len = buflen;

	return copy_to_user(buffer, link, len) ? -EFAULT : len;
}

4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538
/**
 * vfs_readlink - copy symlink body into userspace buffer
 * @dentry: dentry of the symbolic link
 * @buffer: user memory pointer
 * @buflen: size of buffer
 *
 * atime is not touched here; that is left to the caller.
 *
 * The security hook is not called.
 *
 * Returns what ->readlink() or readlink_copy() return, -EINVAL if
 * @dentry is not a symlink, or the error from ->get_link().
 */
int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	struct inode *inode = d_inode(dentry);
	DEFINE_DELAYED_CALL(done);
	const char *body;
	int copied;

	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
		/* a filesystem-provided ->readlink() handles the request itself */
		if (unlikely(inode->i_op->readlink))
			return inode->i_op->readlink(dentry, buffer, buflen);

		if (!d_is_symlink(dentry))
			return -EINVAL;

		/* record the result so later calls skip the checks above */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_DEFAULT_READLINK;
		spin_unlock(&inode->i_lock);
	}

	/* use the body stored in the inode if there is one */
	body = READ_ONCE(inode->i_link);
	if (body == NULL) {
		body = inode->i_op->get_link(dentry, inode, &done);
		if (IS_ERR(body))
			return PTR_ERR(body);
	}

	copied = readlink_copy(buffer, buflen, body);
	do_delayed_call(&done);
	return copied;
}
EXPORT_SYMBOL(vfs_readlink);
L
Linus Torvalds 已提交
4566

M
Miklos Szeredi 已提交
4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591
/**
 * vfs_get_link - get symlink body
 * @dentry: dentry of the symbolic link
 * @done: delayed call the caller must run to release the returned data
 *
 * Runs the security hook and then i_op->get_link() on the inode of
 * @dentry.  Returns ERR_PTR(-EINVAL) if @dentry is not a symlink and
 * the hook's error as an ERR_PTR if the hook refuses.
 *
 * atime is not touched here; that is left to the caller.
 *
 * Does not work on "special" symlinks like /proc/$$/fd/N
 */
const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
{
	struct inode *inode = d_inode(dentry);
	int err;

	if (!d_is_symlink(dentry))
		return ERR_PTR(-EINVAL);

	err = security_inode_readlink(dentry);
	if (err)
		return ERR_PTR(err);

	return inode->i_op->get_link(dentry, inode, done);
}
EXPORT_SYMBOL(vfs_get_link);

L
Linus Torvalds 已提交
4592
/* get the link contents into pagecache */
4593
const char *page_get_link(struct dentry *dentry, struct inode *inode,
4594
			  struct delayed_call *callback)
L
Linus Torvalds 已提交
4595
{
4596 4597
	char *kaddr;
	struct page *page;
4598 4599
	struct address_space *mapping = inode->i_mapping;

4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612
	if (!dentry) {
		page = find_get_page(mapping, 0);
		if (!page)
			return ERR_PTR(-ECHILD);
		if (!PageUptodate(page)) {
			put_page(page);
			return ERR_PTR(-ECHILD);
		}
	} else {
		page = read_mapping_page(mapping, 0, NULL);
		if (IS_ERR(page))
			return (char*)page;
	}
4613
	set_delayed_call(callback, page_put_link, page);
4614 4615
	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
	kaddr = page_address(page);
4616
	nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
4617
	return kaddr;
L
Linus Torvalds 已提交
4618 4619
}

4620
EXPORT_SYMBOL(page_get_link);
L
Linus Torvalds 已提交
4621

4622
/* Delayed-call callback armed by page_get_link(): drop the page reference. */
void page_put_link(void *arg)
{
	struct page *page = arg;

	put_page(page);
}
EXPORT_SYMBOL(page_put_link);
L
Linus Torvalds 已提交
4627

4628 4629
/*
 * Read a pagecache-backed symlink into the user buffer: fetch the body
 * with page_get_link(), copy it out with readlink_copy() (which also
 * passes through an ERR_PTR body), then run the delayed call.
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	DEFINE_DELAYED_CALL(done);
	const char *body;
	int copied;

	body = page_get_link(dentry, d_inode(dentry), &done);
	copied = readlink_copy(buffer, buflen, body);
	do_delayed_call(&done);
	return copied;
}
EXPORT_SYMBOL(page_readlink);

4639 4640 4641 4642
/*
 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
 */
int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
L
Linus Torvalds 已提交
4643 4644
{
	struct address_space *mapping = inode->i_mapping;
4645
	struct page *page;
4646
	void *fsdata;
4647
	int err;
4648
	unsigned int flags = 0;
4649 4650
	if (nofs)
		flags |= AOP_FLAG_NOFS;
L
Linus Torvalds 已提交
4651

4652
retry:
4653
	err = pagecache_write_begin(NULL, mapping, 0, len-1,
4654
				flags, &page, &fsdata);
L
Linus Torvalds 已提交
4655
	if (err)
4656 4657
		goto fail;

4658
	memcpy(page_address(page), symname, len-1);
4659 4660 4661

	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
							page, fsdata);
L
Linus Torvalds 已提交
4662 4663
	if (err < 0)
		goto fail;
4664 4665 4666
	if (err < len-1)
		goto retry;

L
Linus Torvalds 已提交
4667 4668 4669 4670 4671
	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}
4672
EXPORT_SYMBOL(__page_symlink);
L
Linus Torvalds 已提交
4673

4674 4675 4676
int page_symlink(struct inode *inode, const char *symname, int len)
{
	return __page_symlink(inode, symname, len,
4677
			!mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
4678
}
4679
EXPORT_SYMBOL(page_symlink);
4680

4681
const struct inode_operations page_symlink_inode_operations = {
4682
	.get_link	= page_get_link,
L
Linus Torvalds 已提交
4683 4684
};
EXPORT_SYMBOL(page_symlink_inode_operations);