dir.c 35.7 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 * fs/kernfs/dir.c - kernfs directory implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 */
10

11
#include <linux/sched.h>
12 13 14 15 16 17 18 19 20
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/hash.h>

#include "kernfs-internal.h"

21
DEFINE_MUTEX(kernfs_mutex);
22 23
static DEFINE_SPINLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by rename_lock */
24

25
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
26

T
Tejun Heo 已提交
27 28 29 30 31 32
/*
 * Report whether @kn is active, i.e. whether its active count is
 * non-negative.  Must be called with kernfs_mutex held.
 */
static bool kernfs_active(struct kernfs_node *kn)
{
	int active_cnt;

	lockdep_assert_held(&kernfs_mutex);

	active_cnt = atomic_read(&kn->active);
	return active_cnt >= 0;
}

33 34 35 36 37 38 39 40 41
/*
 * Tell whether lockdep annotations should be issued for @kn.  Without
 * CONFIG_DEBUG_LOCK_ALLOC this is always false; otherwise the
 * KERNFS_LOCKDEP bit of @kn->flags decides.
 */
static bool kernfs_lockdep(struct kernfs_node *kn)
{
#ifndef CONFIG_DEBUG_LOCK_ALLOC
	return false;
#else
	return kn->flags & KERNFS_LOCKDEP;
#endif
}

42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
/*
 * Copy @kn's name into @buf with strlcpy() and return strlcpy()'s result.
 * A node without a parent is reported as "/".  The callers in this file
 * invoke it with kernfs_rename_lock held.
 */
static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
{
	const char *src = "/";

	if (kn->parent)
		src = kn->name;

	return strlcpy(buf, src, buflen);
}

/*
 * Build the path of @kn backwards from the end of @buf: each step prepends
 * "/<name>" and moves on to the parent, stopping once the next node has no
 * parent.  Returns a pointer to the start of the path inside @buf, or NULL
 * (with @buf[0] set to '\0') when a component does not fit.  The callers in
 * this file invoke it with kernfs_rename_lock held.
 */
static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
					      size_t buflen)
{
	char *cursor = buf + buflen - 1;

	*cursor = '\0';

	for (;;) {
		int name_len = strlen(kn->name);

		/* need room for the name plus its leading '/' */
		if (cursor - buf < name_len + 1) {
			buf[0] = '\0';
			return NULL;
		}

		cursor -= name_len;
		memcpy(cursor, kn->name, name_len);
		*--cursor = '/';

		kn = kn->parent;
		if (!kn || !kn->parent)
			return cursor;
	}
}

/**
 * kernfs_name - obtain the name of a given node
 * @kn: kernfs_node of interest
 * @buf: buffer to copy @kn's name into
 * @buflen: size of @buf
 *
 * Copies the name of @kn into @buf of @buflen bytes.  The behavior is
 * similar to strlcpy().  It returns the length of @kn's name and if @buf
 * isn't long enough, it's filled up to @buflen-1 and nul terminated.
 *
 * This function can be called from any context.
 */
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{
	unsigned long irq_flags;
	int name_len;

	/* kernfs_rename_lock guards kn->parent and kn->name */
	spin_lock_irqsave(&kernfs_rename_lock, irq_flags);
	name_len = kernfs_name_locked(kn, buf, buflen);
	spin_unlock_irqrestore(&kernfs_rename_lock, irq_flags);

	return name_len;
}

/**
 * kernfs_path - build full path of a given node
 * @kn: kernfs_node of interest
 * @buf: buffer to build @kn's path in
 * @buflen: size of @buf
 *
 * Builds and returns the full path of @kn in @buf of @buflen bytes.  The
 * path is built from the end of @buf so the returned pointer usually
 * doesn't match @buf.  If @buf isn't long enough, @buf is nul terminated
 * and %NULL is returned.
 */
char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
{
	unsigned long irq_flags;
	char *path;

	/* kernfs_rename_lock guards kn->parent and kn->name */
	spin_lock_irqsave(&kernfs_rename_lock, irq_flags);
	path = kernfs_path_locked(kn, buf, buflen);
	spin_unlock_irqrestore(&kernfs_rename_lock, irq_flags);

	return path;
}

/**
 * pr_cont_kernfs_name - pr_cont name of a kernfs_node
 * @kn: kernfs_node of interest
 *
 * This function can be called from any context.
 */
void pr_cont_kernfs_name(struct kernfs_node *kn)
{
	unsigned long irq_flags;

	/* the lock also protects the shared kernfs_pr_cont_buf */
	spin_lock_irqsave(&kernfs_rename_lock, irq_flags);
	kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
	pr_cont("%s", kernfs_pr_cont_buf);
	spin_unlock_irqrestore(&kernfs_rename_lock, irq_flags);
}

/**
 * pr_cont_kernfs_path - pr_cont path of a kernfs_node
 * @kn: kernfs_node of interest
 *
 * This function can be called from any context.
 */
void pr_cont_kernfs_path(struct kernfs_node *kn)
{
	unsigned long irq_flags;
	char *path;

	/* the lock also protects the shared kernfs_pr_cont_buf */
	spin_lock_irqsave(&kernfs_rename_lock, irq_flags);

	path = kernfs_path_locked(kn, kernfs_pr_cont_buf,
				  sizeof(kernfs_pr_cont_buf));
	if (!path)
		pr_cont("<name too long>");
	else
		pr_cont("%s", path);

	spin_unlock_irqrestore(&kernfs_rename_lock, irq_flags);
}

/**
 * kernfs_get_parent - determine the parent node and pin it
 * @kn: kernfs_node of interest
 *
 * Determines @kn's parent, pins and returns it.  This function can be
 * called from any context.
 */
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
	struct kernfs_node *pinned;
	unsigned long irq_flags;

	/* read and pin ->parent under the lock that guards it */
	spin_lock_irqsave(&kernfs_rename_lock, irq_flags);
	pinned = kn->parent;
	kernfs_get(pinned);	/* kernfs_get() ignores NULL */
	spin_unlock_irqrestore(&kernfs_rename_lock, irq_flags);

	return pinned;
}

177
/**
 *	kernfs_name_hash
 *	@name: Null terminated string to hash
 *	@ns:   Namespace tag to hash
 *
 *	Returns 31 bit hash of ns + name (so it fits in an off_t ).  The
 *	values 0, 1 and INT_MAX are never returned; they are reserved for
 *	magic directory entries.
 */
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
	unsigned long hash = init_name_hash();
	unsigned int len = strlen(name);

	while (len--)
		hash = partial_name_hash(*name++, hash);
	hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
	hash &= 0x7fffffffU;

	/*
	 * Reserve hash numbers 0, 1 and INT_MAX for magic directory entries.
	 * The lower bound must be "< 2": testing "< 1" would let a name hash
	 * to the reserved value 1.
	 */
	if (hash < 2)
		hash += 2;
	if (hash >= INT_MAX)
		hash = INT_MAX - 1;
	return hash;
}

200 201
static int kernfs_name_compare(unsigned int hash, const char *name,
			       const void *ns, const struct kernfs_node *kn)
202
{
203 204 205 206 207
	if (hash != kn->hash)
		return hash - kn->hash;
	if (ns != kn->ns)
		return ns - kn->ns;
	return strcmp(name, kn->name);
208 209
}

210 211
static int kernfs_sd_compare(const struct kernfs_node *left,
			     const struct kernfs_node *right)
212
{
213
	return kernfs_name_compare(left->hash, left->name, left->ns, right);
214 215 216
}

/**
217
 *	kernfs_link_sibling - link kernfs_node into sibling rbtree
218
 *	@kn: kernfs_node of interest
219
 *
220
 *	Link @kn into its sibling rbtree which starts from
221
 *	@kn->parent->dir.children.
222 223
 *
 *	Locking:
224
 *	mutex_lock(kernfs_mutex)
225 226 227 228
 *
 *	RETURNS:
 *	0 on susccess -EEXIST on failure.
 */
229
static int kernfs_link_sibling(struct kernfs_node *kn)
230
{
231
	struct rb_node **node = &kn->parent->dir.children.rb_node;
232 233
	struct rb_node *parent = NULL;

T
Tejun Heo 已提交
234
	if (kernfs_type(kn) == KERNFS_DIR)
235
		kn->parent->dir.subdirs++;
236 237

	while (*node) {
238
		struct kernfs_node *pos;
239 240
		int result;

241
		pos = rb_to_kn(*node);
242
		parent = *node;
243
		result = kernfs_sd_compare(kn, pos);
244
		if (result < 0)
245
			node = &pos->rb.rb_left;
246
		else if (result > 0)
247
			node = &pos->rb.rb_right;
248 249 250 251
		else
			return -EEXIST;
	}
	/* add new node and rebalance the tree */
252 253
	rb_link_node(&kn->rb, parent, node);
	rb_insert_color(&kn->rb, &kn->parent->dir.children);
254 255 256 257
	return 0;
}

/**
258
 *	kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
259
 *	@kn: kernfs_node of interest
260
 *
261 262 263
 *	Try to unlink @kn from its sibling rbtree which starts from
 *	kn->parent->dir.children.  Returns %true if @kn was actually
 *	removed, %false if @kn wasn't on the rbtree.
264 265
 *
 *	Locking:
266
 *	mutex_lock(kernfs_mutex)
267
 */
268
static bool kernfs_unlink_sibling(struct kernfs_node *kn)
269
{
270 271 272
	if (RB_EMPTY_NODE(&kn->rb))
		return false;

T
Tejun Heo 已提交
273
	if (kernfs_type(kn) == KERNFS_DIR)
274
		kn->parent->dir.subdirs--;
275

276
	rb_erase(&kn->rb, &kn->parent->dir.children);
277 278
	RB_CLEAR_NODE(&kn->rb);
	return true;
279 280 281
}

/**
282
 *	kernfs_get_active - get an active reference to kernfs_node
283
 *	@kn: kernfs_node to get an active reference to
284
 *
285
 *	Get an active reference of @kn.  This function is noop if @kn
286 287 288
 *	is NULL.
 *
 *	RETURNS:
289
 *	Pointer to @kn on success, NULL on failure.
290
 */
291
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
292
{
293
	if (unlikely(!kn))
294 295
		return NULL;

296 297
	if (!atomic_inc_unless_negative(&kn->active))
		return NULL;
298

299
	if (kernfs_lockdep(kn))
300 301
		rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
	return kn;
302 303 304
}

/**
305
 *	kernfs_put_active - put an active reference to kernfs_node
306
 *	@kn: kernfs_node to put an active reference to
307
 *
308
 *	Put an active reference to @kn.  This function is noop if @kn
309 310
 *	is NULL.
 */
311
void kernfs_put_active(struct kernfs_node *kn)
312
{
313
	struct kernfs_root *root = kernfs_root(kn);
314 315
	int v;

316
	if (unlikely(!kn))
317 318
		return;

319
	if (kernfs_lockdep(kn))
320
		rwsem_release(&kn->dep_map, 1, _RET_IP_);
321
	v = atomic_dec_return(&kn->active);
T
Tejun Heo 已提交
322
	if (likely(v != KN_DEACTIVATED_BIAS))
323 324
		return;

325
	wake_up_all(&root->deactivate_waitq);
326 327 328
}

/**
T
Tejun Heo 已提交
329 330
 * kernfs_drain - drain kernfs_node
 * @kn: kernfs_node to drain
331
 *
T
Tejun Heo 已提交
332 333 334
 * Drain existing usages and nuke all existing mmaps of @kn.  Mutiple
 * removers may invoke this function concurrently on @kn and all will
 * return after draining is complete.
335
 */
T
Tejun Heo 已提交
336
static void kernfs_drain(struct kernfs_node *kn)
337
	__releases(&kernfs_mutex) __acquires(&kernfs_mutex)
338
{
339
	struct kernfs_root *root = kernfs_root(kn);
340

341
	lockdep_assert_held(&kernfs_mutex);
T
Tejun Heo 已提交
342
	WARN_ON_ONCE(kernfs_active(kn));
343

344
	mutex_unlock(&kernfs_mutex);
345

346
	if (kernfs_lockdep(kn)) {
347 348 349 350
		rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
		if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
			lock_contended(&kn->dep_map, _RET_IP_);
	}
351

352
	/* but everyone should wait for draining */
353 354
	wait_event(root->deactivate_waitq,
		   atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
355

356
	if (kernfs_lockdep(kn)) {
357 358 359
		lock_acquired(&kn->dep_map, _RET_IP_);
		rwsem_release(&kn->dep_map, 1, _RET_IP_);
	}
360

361 362
	kernfs_unmap_bin_file(kn);

363
	mutex_lock(&kernfs_mutex);
364 365 366
}

/**
367 368
 * kernfs_get - get a reference count on a kernfs_node
 * @kn: the target kernfs_node
369
 */
370
void kernfs_get(struct kernfs_node *kn)
371
{
372
	if (kn) {
373 374
		WARN_ON(!atomic_read(&kn->count));
		atomic_inc(&kn->count);
375 376 377 378 379
	}
}
EXPORT_SYMBOL_GPL(kernfs_get);

/**
380 381
 * kernfs_put - put a reference count on a kernfs_node
 * @kn: the target kernfs_node
382
 *
383
 * Put a reference count of @kn and destroy it if it reached zero.
384
 */
385
void kernfs_put(struct kernfs_node *kn)
386
{
387
	struct kernfs_node *parent;
388
	struct kernfs_root *root;
389

390
	if (!kn || !atomic_dec_and_test(&kn->count))
391
		return;
392
	root = kernfs_root(kn);
393
 repeat:
T
Tejun Heo 已提交
394 395
	/*
	 * Moving/renaming is always done while holding reference.
396
	 * kn->parent won't change beneath us.
397
	 */
398
	parent = kn->parent;
399

T
Tejun Heo 已提交
400 401 402
	WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
		  "kernfs_put: %s/%s: released with incorrect active_ref %d\n",
		  parent ? parent->name : "", kn->name, atomic_read(&kn->active));
403

T
Tejun Heo 已提交
404
	if (kernfs_type(kn) == KERNFS_LINK)
405
		kernfs_put(kn->symlink.target_kn);
406
	if (!(kn->flags & KERNFS_STATIC_NAME))
407 408 409 410 411 412
		kfree(kn->name);
	if (kn->iattr) {
		if (kn->iattr->ia_secdata)
			security_release_secctx(kn->iattr->ia_secdata,
						kn->iattr->ia_secdata_len);
		simple_xattrs_free(&kn->iattr->xattrs);
413
	}
414 415
	kfree(kn->iattr);
	ida_simple_remove(&root->ino_ida, kn->ino);
416
	kmem_cache_free(kernfs_node_cache, kn);
417

418 419
	kn = parent;
	if (kn) {
420
		if (atomic_dec_and_test(&kn->count))
421 422
			goto repeat;
	} else {
423
		/* just released the root kn, free @root too */
424
		ida_destroy(&root->ino_ida);
425 426
		kfree(root);
	}
427 428 429
}
EXPORT_SYMBOL_GPL(kernfs_put);

430
static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
431
{
432
	struct kernfs_node *kn;
433 434 435 436

	if (flags & LOOKUP_RCU)
		return -ECHILD;

T
Tejun Heo 已提交
437 438 439 440
	/* Always perform fresh lookup for negatives */
	if (!dentry->d_inode)
		goto out_bad_unlocked;

441
	kn = dentry->d_fsdata;
442
	mutex_lock(&kernfs_mutex);
443

T
Tejun Heo 已提交
444 445
	/* The kernfs node has been deactivated */
	if (!kernfs_active(kn))
446 447
		goto out_bad;

448
	/* The kernfs node has been moved? */
449
	if (dentry->d_parent->d_fsdata != kn->parent)
450 451
		goto out_bad;

452
	/* The kernfs node has been renamed */
453
	if (strcmp(dentry->d_name.name, kn->name) != 0)
454 455
		goto out_bad;

456
	/* The kernfs node has been moved to a different namespace */
457
	if (kn->parent && kernfs_ns_enabled(kn->parent) &&
458
	    kernfs_info(dentry->d_sb)->ns != kn->ns)
459 460
		goto out_bad;

461
	mutex_unlock(&kernfs_mutex);
462 463 464
out_valid:
	return 1;
out_bad:
465
	mutex_unlock(&kernfs_mutex);
T
Tejun Heo 已提交
466 467 468 469 470 471 472
out_bad_unlocked:
	/*
	 * @dentry doesn't match the underlying kernfs node, drop the
	 * dentry and force lookup.  If we have submounts we must allow the
	 * vfs caches to lie about the state of the filesystem to prevent
	 * leaks and other nasty things, so use check_submounts_and_drop()
	 * instead of d_drop().
473 474 475 476 477 478 479
	 */
	if (check_submounts_and_drop(dentry) != 0)
		goto out_valid;

	return 0;
}

480
static void kernfs_dop_release(struct dentry *dentry)
481 482 483 484
{
	kernfs_put(dentry->d_fsdata);
}

485
const struct dentry_operations kernfs_dops = {
486 487
	.d_revalidate	= kernfs_dop_revalidate,
	.d_release	= kernfs_dop_release,
488 489
};

490 491 492 493 494 495 496 497 498 499 500 501 502
/**
 * kernfs_node_from_dentry - determine kernfs_node associated with a dentry
 * @dentry: the dentry in question
 *
 * Return the kernfs_node associated with @dentry.  If @dentry is not a
 * kernfs one, %NULL is returned.
 *
 * While the returned kernfs_node will stay accessible as long as @dentry
 * is accessible, the returned node can be in any state and the caller is
 * fully responsible for determining what's accessible.
 */
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
{
L
Li Zefan 已提交
503
	if (dentry->d_sb->s_op == &kernfs_sops)
504 505 506 507
		return dentry->d_fsdata;
	return NULL;
}

508 509 510
/*
 * Allocate and initialise a node belonging to @root.  Unless
 * KERNFS_STATIC_NAME is set in @flags, @name is duplicated.  The new node
 * starts with a reference count of 1, an active count of
 * KN_DEACTIVATED_BIAS and is not linked into any rbtree.  Returns NULL when
 * an allocation or the inode number reservation fails.
 */
static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
					     const char *name, umode_t mode,
					     unsigned flags)
{
	char *owned_name = NULL;
	struct kernfs_node *kn;
	int ino;

	if (!(flags & KERNFS_STATIC_NAME)) {
		owned_name = kstrdup(name, GFP_KERNEL);
		if (!owned_name)
			return NULL;
		name = owned_name;
	}

	kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
	if (!kn)
		goto free_name;

	ino = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
	if (ino < 0)
		goto free_node;

	kn->ino = ino;
	kn->name = name;
	kn->mode = mode;
	kn->flags = flags;

	RB_CLEAR_NODE(&kn->rb);
	atomic_set(&kn->count, 1);
	atomic_set(&kn->active, KN_DEACTIVATED_BIAS);

	return kn;

 free_node:
	kmem_cache_free(kernfs_node_cache, kn);
 free_name:
	kfree(owned_name);
	return NULL;
}

548 549 550 551 552 553 554 555 556 557 558 559 560 561
/*
 * Allocate a node in @parent's hierarchy and point it at @parent, taking a
 * reference on @parent.  The node is not linked into the sibling tree
 * here.  Returns NULL on allocation failure.
 */
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
				    const char *name, umode_t mode,
				    unsigned flags)
{
	struct kernfs_node *kn;

	kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
	if (!kn)
		return NULL;

	kernfs_get(parent);
	kn->parent = parent;
	return kn;
}

562
/**
563
 *	kernfs_add_one - add kernfs_node to parent without warning
564
 *	@kn: kernfs_node to be added
565
 *
566 567 568
 *	The caller must already have initialized @kn->parent.  This
 *	function increments nlink of the parent's inode if @kn is a
 *	directory and link into the children list of the parent.
569 570 571 572 573
 *
 *	RETURNS:
 *	0 on success, -EEXIST if entry with the given name already
 *	exists.
 */
T
Tejun Heo 已提交
574
int kernfs_add_one(struct kernfs_node *kn)
575
{
576
	struct kernfs_node *parent = kn->parent;
577
	struct kernfs_iattrs *ps_iattr;
T
Tejun Heo 已提交
578
	bool has_ns;
579 580
	int ret;

T
Tejun Heo 已提交
581 582 583 584 585 586 587
	mutex_lock(&kernfs_mutex);

	ret = -EINVAL;
	has_ns = kernfs_ns_enabled(parent);
	if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
		 has_ns ? "required" : "invalid", parent->name, kn->name))
		goto out_unlock;
588

T
Tejun Heo 已提交
589
	if (kernfs_type(parent) != KERNFS_DIR)
T
Tejun Heo 已提交
590
		goto out_unlock;
591

T
Tejun Heo 已提交
592
	ret = -ENOENT;
593
	if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
T
Tejun Heo 已提交
594
		goto out_unlock;
595

596
	kn->hash = kernfs_name_hash(kn->name, kn->ns);
597

598
	ret = kernfs_link_sibling(kn);
599
	if (ret)
T
Tejun Heo 已提交
600
		goto out_unlock;
601 602

	/* Update timestamps on the parent */
603
	ps_iattr = parent->iattr;
604 605 606 607 608
	if (ps_iattr) {
		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
	}

609 610 611 612 613 614 615 616 617 618 619 620 621
	mutex_unlock(&kernfs_mutex);

	/*
	 * Activate the new node unless CREATE_DEACTIVATED is requested.
	 * If not activated here, the kernfs user is responsible for
	 * activating the node with kernfs_activate().  A node which hasn't
	 * been activated is not visible to userland and its removal won't
	 * trigger deactivation.
	 */
	if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
		kernfs_activate(kn);
	return 0;

T
Tejun Heo 已提交
622
out_unlock:
623
	mutex_unlock(&kernfs_mutex);
T
Tejun Heo 已提交
624
	return ret;
625 626 627
}

/**
628 629
 * kernfs_find_ns - find kernfs_node with the given name
 * @parent: kernfs_node to search under
630 631 632
 * @name: name to look for
 * @ns: the namespace tag to use
 *
633 634
 * Look for kernfs_node with name @name under @parent.  Returns pointer to
 * the found kernfs_node on success, %NULL on failure.
635
 */
636 637 638
static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
					  const unsigned char *name,
					  const void *ns)
639
{
640
	struct rb_node *node = parent->dir.children.rb_node;
641
	bool has_ns = kernfs_ns_enabled(parent);
642 643
	unsigned int hash;

644
	lockdep_assert_held(&kernfs_mutex);
645 646

	if (has_ns != (bool)ns) {
647
		WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
648
		     has_ns ? "required" : "invalid", parent->name, name);
649 650 651
		return NULL;
	}

652
	hash = kernfs_name_hash(name, ns);
653
	while (node) {
654
		struct kernfs_node *kn;
655 656
		int result;

657
		kn = rb_to_kn(node);
658
		result = kernfs_name_compare(hash, name, ns, kn);
659 660 661 662 663
		if (result < 0)
			node = node->rb_left;
		else if (result > 0)
			node = node->rb_right;
		else
664
			return kn;
665 666 667 668 669
	}
	return NULL;
}

/**
670 671
 * kernfs_find_and_get_ns - find and get kernfs_node with the given name
 * @parent: kernfs_node to search under
672 673 674
 * @name: name to look for
 * @ns: the namespace tag to use
 *
675
 * Look for kernfs_node with name @name under @parent and get a reference
676
 * if found.  This function may sleep and returns pointer to the found
677
 * kernfs_node on success, %NULL on failure.
678
 */
679 680
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
					   const char *name, const void *ns)
681
{
682
	struct kernfs_node *kn;
683

684
	mutex_lock(&kernfs_mutex);
685 686
	kn = kernfs_find_ns(parent, name, ns);
	kernfs_get(kn);
687
	mutex_unlock(&kernfs_mutex);
688

689
	return kn;
690 691 692
}
EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);

693 694
/**
 * kernfs_create_root - create a new kernfs hierarchy
695
 * @scops: optional syscall operations for the hierarchy
696
 * @flags: KERNFS_ROOT_* flags
697 698 699 700 701
 * @priv: opaque data associated with the new directory
 *
 * Returns the root of the new hierarchy on success, ERR_PTR() value on
 * failure.
 */
702
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
703
				       unsigned int flags, void *priv)
704 705
{
	struct kernfs_root *root;
706
	struct kernfs_node *kn;
707 708 709 710 711

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root)
		return ERR_PTR(-ENOMEM);

712 713
	ida_init(&root->ino_ida);

714 715
	kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
			       KERNFS_DIR);
716
	if (!kn) {
717
		ida_destroy(&root->ino_ida);
718 719 720 721
		kfree(root);
		return ERR_PTR(-ENOMEM);
	}

722
	kn->priv = priv;
723
	kn->dir.root = root;
724

725
	root->syscall_ops = scops;
726
	root->flags = flags;
727
	root->kn = kn;
728
	init_waitqueue_head(&root->deactivate_waitq);
729

730 731 732
	if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
		kernfs_activate(kn);

733 734 735 736 737 738 739 740 741 742 743 744
	return root;
}

/**
 * kernfs_destroy_root - destroy a kernfs hierarchy
 * @root: root of the hierarchy to destroy
 *
 * Destroy the hierarchy anchored at @root by removing all existing
 * directories and destroying @root.
 */
void kernfs_destroy_root(struct kernfs_root *root)
{
745
	kernfs_remove(root->kn);	/* will also free @root */
746 747
}

748 749 750 751
/**
 * kernfs_create_dir_ns - create a directory
 * @parent: parent in which to create a new directory
 * @name: name of the new directory
752
 * @mode: mode of the new directory
753 754 755 756 757
 * @priv: opaque data associated with the new directory
 * @ns: optional namespace tag of the directory
 *
 * Returns the created node on success, ERR_PTR() value on failure.
 */
758
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
759 760
					 const char *name, umode_t mode,
					 void *priv, const void *ns)
761
{
762
	struct kernfs_node *kn;
763 764 765
	int rc;

	/* allocate */
766
	kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
767
	if (!kn)
768 769
		return ERR_PTR(-ENOMEM);

770 771
	kn->dir.root = parent->dir.root;
	kn->ns = ns;
772
	kn->priv = priv;
773 774

	/* link in */
T
Tejun Heo 已提交
775
	rc = kernfs_add_one(kn);
776
	if (!rc)
777
		return kn;
778

779
	kernfs_put(kn);
780 781 782
	return ERR_PTR(rc);
}

783 784 785
static struct dentry *kernfs_iop_lookup(struct inode *dir,
					struct dentry *dentry,
					unsigned int flags)
786
{
T
Tejun Heo 已提交
787
	struct dentry *ret;
788 789
	struct kernfs_node *parent = dentry->d_parent->d_fsdata;
	struct kernfs_node *kn;
790 791 792
	struct inode *inode;
	const void *ns = NULL;

793
	mutex_lock(&kernfs_mutex);
794

795
	if (kernfs_ns_enabled(parent))
796
		ns = kernfs_info(dir->i_sb)->ns;
797

798
	kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
799 800

	/* no such entry */
801
	if (!kn || !kernfs_active(kn)) {
T
Tejun Heo 已提交
802
		ret = NULL;
803 804
		goto out_unlock;
	}
805 806
	kernfs_get(kn);
	dentry->d_fsdata = kn;
807 808

	/* attach dentry and inode */
809
	inode = kernfs_get_inode(dir->i_sb, kn);
810 811 812 813 814 815 816 817
	if (!inode) {
		ret = ERR_PTR(-ENOMEM);
		goto out_unlock;
	}

	/* instantiate and hash dentry */
	ret = d_materialise_unique(dentry, inode);
 out_unlock:
818
	mutex_unlock(&kernfs_mutex);
819 820 821
	return ret;
}

T
Tejun Heo 已提交
822 823 824 825
/*
 * mkdir inode operation: forward to the hierarchy's ->mkdir() syscall op
 * while holding an active reference on the parent.  Returns -EPERM when no
 * such op is registered and -ENODEV when the parent cannot be activated.
 */
static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
			    umode_t mode)
{
	struct kernfs_node *parent = dir->i_private;
	struct kernfs_syscall_ops *ops = kernfs_root(parent)->syscall_ops;
	int err;

	if (!(ops && ops->mkdir))
		return -EPERM;

	if (!kernfs_get_active(parent))
		return -ENODEV;

	err = ops->mkdir(parent, dentry->d_name.name, mode);
	kernfs_put_active(parent);

	return err;
}

static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
{
	struct kernfs_node *kn  = dentry->d_fsdata;
844
	struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
845
	int ret;
T
Tejun Heo 已提交
846

847
	if (!scops || !scops->rmdir)
T
Tejun Heo 已提交
848 849
		return -EPERM;

850 851 852
	if (!kernfs_get_active(kn))
		return -ENODEV;

853
	ret = scops->rmdir(kn);
854 855 856

	kernfs_put_active(kn);
	return ret;
T
Tejun Heo 已提交
857 858 859 860 861 862 863
}

static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
			     struct inode *new_dir, struct dentry *new_dentry)
{
	struct kernfs_node *kn  = old_dentry->d_fsdata;
	struct kernfs_node *new_parent = new_dir->i_private;
864
	struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
865
	int ret;
T
Tejun Heo 已提交
866

867
	if (!scops || !scops->rename)
T
Tejun Heo 已提交
868 869
		return -EPERM;

870 871 872 873 874 875 876 877
	if (!kernfs_get_active(kn))
		return -ENODEV;

	if (!kernfs_get_active(new_parent)) {
		kernfs_put_active(kn);
		return -ENODEV;
	}

878
	ret = scops->rename(kn, new_parent, new_dentry->d_name.name);
879 880 881 882

	kernfs_put_active(new_parent);
	kernfs_put_active(kn);
	return ret;
T
Tejun Heo 已提交
883 884
}

885
const struct inode_operations kernfs_dir_iops = {
886 887 888 889 890 891 892 893
	.lookup		= kernfs_iop_lookup,
	.permission	= kernfs_iop_permission,
	.setattr	= kernfs_iop_setattr,
	.getattr	= kernfs_iop_getattr,
	.setxattr	= kernfs_iop_setxattr,
	.removexattr	= kernfs_iop_removexattr,
	.getxattr	= kernfs_iop_getxattr,
	.listxattr	= kernfs_iop_listxattr,
T
Tejun Heo 已提交
894 895 896 897

	.mkdir		= kernfs_iop_mkdir,
	.rmdir		= kernfs_iop_rmdir,
	.rename		= kernfs_iop_rename,
898 899
};

900
static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
901
{
902
	struct kernfs_node *last;
903 904 905 906 907 908

	while (true) {
		struct rb_node *rbn;

		last = pos;

T
Tejun Heo 已提交
909
		if (kernfs_type(pos) != KERNFS_DIR)
910 911
			break;

912
		rbn = rb_first(&pos->dir.children);
913 914 915
		if (!rbn)
			break;

916
		pos = rb_to_kn(rbn);
917 918 919 920 921 922
	}

	return last;
}

/**
923
 * kernfs_next_descendant_post - find the next descendant for post-order walk
924
 * @pos: the current position (%NULL to initiate traversal)
925
 * @root: kernfs_node whose descendants to walk
926 927 928 929 930
 *
 * Find the next descendant to visit for post-order traversal of @root's
 * descendants.  @root is included in the iteration and the last node to be
 * visited.
 */
931 932
static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
						       struct kernfs_node *root)
933 934 935
{
	struct rb_node *rbn;

936
	lockdep_assert_held(&kernfs_mutex);
937 938 939

	/* if first iteration, visit leftmost descendant which may be root */
	if (!pos)
940
		return kernfs_leftmost_descendant(root);
941 942 943 944 945 946

	/* if we visited @root, we're done */
	if (pos == root)
		return NULL;

	/* if there's an unvisited sibling, visit its leftmost descendant */
947
	rbn = rb_next(&pos->rb);
948
	if (rbn)
949
		return kernfs_leftmost_descendant(rb_to_kn(rbn));
950 951

	/* no sibling left, visit parent */
952
	return pos->parent;
953 954
}

955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988
/**
 * kernfs_activate - activate a node which started deactivated
 * @kn: kernfs_node whose subtree is to be activated
 *
 * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node
 * needs to be explicitly activated.  A node which hasn't been activated
 * isn't visible to userland and deactivation is skipped during its
 * removal.  This is useful to construct atomic init sequences where
 * creation of multiple nodes should either succeed or fail atomically.
 *
 * The caller is responsible for ensuring that this function is not called
 * after kernfs_remove*() is invoked on @kn.
 */
void kernfs_activate(struct kernfs_node *kn)
{
	struct kernfs_node *pos;

	mutex_lock(&kernfs_mutex);

	/* post-order walk over @kn's subtree, @kn itself included */
	for (pos = kernfs_next_descendant_post(NULL, kn); pos;
	     pos = kernfs_next_descendant_post(pos, kn)) {
		/*
		 * Nodes that were activated before are skipped.  @pos is
		 * never NULL inside the loop, so no NULL test is needed.
		 */
		if (pos->flags & KERNFS_ACTIVATED)
			continue;

		/* a node being activated must be linked and still fully biased */
		WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
		WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);

		/* remove the deactivation bias taken at allocation time */
		atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
		pos->flags |= KERNFS_ACTIVATED;
	}

	mutex_unlock(&kernfs_mutex);
}

T
Tejun Heo 已提交
989
static void __kernfs_remove(struct kernfs_node *kn)
990
{
991 992 993
	struct kernfs_node *pos;

	lockdep_assert_held(&kernfs_mutex);
994

995 996 997 998 999 1000
	/*
	 * Short-circuit if non-root @kn has already finished removal.
	 * This is for kernfs_remove_self() which plays with active ref
	 * after removal.
	 */
	if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb)))
1001 1002
		return;

1003
	pr_debug("kernfs %s: removing\n", kn->name);
1004

T
Tejun Heo 已提交
1005
	/* prevent any new usage under @kn by deactivating all nodes */
1006 1007
	pos = NULL;
	while ((pos = kernfs_next_descendant_post(pos, kn)))
T
Tejun Heo 已提交
1008 1009
		if (kernfs_active(pos))
			atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
1010 1011

	/* deactivate and unlink the subtree node-by-node */
1012
	do {
1013 1014 1015
		pos = kernfs_leftmost_descendant(kn);

		/*
T
Tejun Heo 已提交
1016 1017 1018 1019
		 * kernfs_drain() drops kernfs_mutex temporarily and @pos's
		 * base ref could have been put by someone else by the time
		 * the function returns.  Make sure it doesn't go away
		 * underneath us.
1020 1021 1022
		 */
		kernfs_get(pos);

1023 1024 1025 1026 1027 1028 1029 1030 1031 1032
		/*
		 * Drain iff @kn was activated.  This avoids draining and
		 * its lockdep annotations for nodes which have never been
		 * activated and allows embedding kernfs_remove() in create
		 * error paths without worrying about draining.
		 */
		if (kn->flags & KERNFS_ACTIVATED)
			kernfs_drain(pos);
		else
			WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047

		/*
		 * kernfs_unlink_sibling() succeeds once per node.  Use it
		 * to decide who's responsible for cleanups.
		 */
		if (!pos->parent || kernfs_unlink_sibling(pos)) {
			struct kernfs_iattrs *ps_iattr =
				pos->parent ? pos->parent->iattr : NULL;

			/* update timestamps on the parent */
			if (ps_iattr) {
				ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
				ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
			}

T
Tejun Heo 已提交
1048
			kernfs_put(pos);
1049 1050 1051 1052
		}

		kernfs_put(pos);
	} while (pos != kn);
1053 1054 1055
}

/**
1056 1057
 * kernfs_remove - remove a kernfs_node recursively
 * @kn: the kernfs_node to remove
1058
 *
1059
 * Remove @kn along with all its subdirectories and files.
1060
 */
1061
void kernfs_remove(struct kernfs_node *kn)
1062
{
T
Tejun Heo 已提交
1063 1064 1065
	mutex_lock(&kernfs_mutex);
	__kernfs_remove(kn);
	mutex_unlock(&kernfs_mutex);
1066 1067
}

1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198
/**
 * kernfs_break_active_protection - break out of active protection
 * @kn: the self kernfs_node
 *
 * The caller must be running off of a kernfs operation which is invoked
 * with an active reference - e.g. one of kernfs_ops.  Each invocation of
 * this function must also be matched with an invocation of
 * kernfs_unbreak_active_protection().
 *
 * This function releases the active reference of @kn the caller is
 * holding.  Once this function is called, @kn may be removed at any point
 * and the caller is solely responsible for ensuring that the objects it
 * dereferences are accessible.
 */
void kernfs_break_active_protection(struct kernfs_node *kn)
{
	/*
	 * Take ourselves out of the active ref dependency chain.  If
	 * we're called without an active ref, lockdep will complain.
	 */
	kernfs_put_active(kn);
}

/**
 * kernfs_unbreak_active_protection - undo kernfs_break_active_protection()
 * @kn: the self kernfs_node
 *
 * Every kernfs_break_active_protection() call must be paired with a call
 * to this function before the enclosing kernfs operation finishes.  The
 * active reference is restored, but the active protection is not and
 * cannot be: @kn may already be removed or be in the process of being
 * removed.  Once kernfs_break_active_protection() has been invoked, the
 * protection is irreversibly gone for that kernfs operation instance.
 *
 * This function may be called at any point after
 * kernfs_break_active_protection(); the most useful place is right before
 * the enclosing kernfs operation returns.
 */
void kernfs_unbreak_active_protection(struct kernfs_node *kn)
{
	/*
	 * @kn->active may be in any state at this point.  The bump below is
	 * temporary: it is undone as soon as the enclosing kernfs operation
	 * finishes, so it can't break anything.
	 *
	 *  - @kn alive: nothing changes.
	 *  - @kn being deactivated: the put which soon follows either
	 *    finishes deactivation or restores the deactivated state.
	 *  - @kn already removed: the bump is guaranteed to be gone before
	 *    @kn is released.
	 */
	atomic_inc(&kn->active);

	/* the lockdep annotation is only wanted for nodes which opted in */
	if (!kernfs_lockdep(kn))
		return;

	rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
}

/**
 * kernfs_remove_self - remove a kernfs_node from its own method
 * @kn: the self kernfs_node to remove
 *
 * The caller must be running off of a kernfs operation which is invoked
 * with an active reference - e.g. one of kernfs_ops.  This can be used to
 * implement a file operation which deletes itself.
 *
 * For example, the "delete" file for a sysfs device directory can be
 * implemented by invoking kernfs_remove_self() on the "delete" file
 * itself.  This function breaks the circular dependency of trying to
 * deactivate self while holding an active ref itself.  It isn't necessary
 * to modify the usual removal path to use kernfs_remove_self().  The
 * "delete" implementation can simply invoke kernfs_remove_self() on self
 * before proceeding with the usual removal path.  kernfs will ignore later
 * kernfs_remove() on self.
 *
 * kernfs_remove_self() can be called multiple times concurrently on the
 * same kernfs_node.  Only the first one actually performs removal and
 * returns %true.  All others will wait until the kernfs operation which
 * won self-removal finishes and return %false.  Note that the losers wait
 * for the completion of not only the winning kernfs_remove_self() but also
 * the whole kernfs_ops which won the arbitration.  This can be used to
 * guarantee, for example, all concurrent writes to a "delete" file to
 * finish only after the whole operation is complete.
 */
bool kernfs_remove_self(struct kernfs_node *kn)
{
	bool ret;

	mutex_lock(&kernfs_mutex);
	/* drop our own active ref so that removing @kn doesn't wait on us */
	kernfs_break_active_protection(kn);

	/*
	 * SUICIDAL is used to arbitrate among competing invocations.  Only
	 * the first one will actually perform removal.  When the removal
	 * is complete, SUICIDED is set and the active ref is restored
	 * while holding kernfs_mutex.  The ones which lost arbitration
	 * wait for SUICIDED && drained which can happen only after the
	 * enclosing kernfs operation which executed the winning instance
	 * of kernfs_remove_self() finished.
	 */
	if (!(kn->flags & KERNFS_SUICIDAL)) {
		/* winner: claim the removal, perform it, then mark it done */
		kn->flags |= KERNFS_SUICIDAL;
		__kernfs_remove(kn);
		kn->flags |= KERNFS_SUICIDED;
		ret = true;
	} else {
		/* loser: sleep on the root's deactivate_waitq until done */
		wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq;
		DEFINE_WAIT(wait);

		while (true) {
			prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);

			if ((kn->flags & KERNFS_SUICIDED) &&
			    atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
				break;

			/*
			 * kernfs_mutex is dropped across schedule() and
			 * re-taken before the condition is checked again.
			 */
			mutex_unlock(&kernfs_mutex);
			schedule();
			mutex_lock(&kernfs_mutex);
		}
		finish_wait(waitq, &wait);
		/* by now @kn is expected to be off its parent's rbtree */
		WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
		ret = false;
	}

	/*
	 * This must be done while holding kernfs_mutex; otherwise, waiting
	 * for SUICIDED && deactivated could finish prematurely.
	 */
	kernfs_unbreak_active_protection(kn);

	mutex_unlock(&kernfs_mutex);
	return ret;
}

1199
/**
1200 1201 1202 1203
 * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
 * @parent: parent of the target
 * @name: name of the kernfs_node to remove
 * @ns: namespace tag of the kernfs_node to remove
1204
 *
1205 1206
 * Look for the kernfs_node with @name and @ns under @parent and remove it.
 * Returns 0 on success, -ENOENT if such entry doesn't exist.
1207
 */
1208
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
1209 1210
			     const void *ns)
{
1211
	struct kernfs_node *kn;
1212

1213
	if (!parent) {
1214
		WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
1215 1216 1217 1218
			name);
		return -ENOENT;
	}

T
Tejun Heo 已提交
1219
	mutex_lock(&kernfs_mutex);
1220

1221 1222
	kn = kernfs_find_ns(parent, name, ns);
	if (kn)
T
Tejun Heo 已提交
1223
		__kernfs_remove(kn);
1224

T
Tejun Heo 已提交
1225
	mutex_unlock(&kernfs_mutex);
1226

1227
	if (kn)
1228 1229 1230 1231 1232 1233 1234
		return 0;
	else
		return -ENOENT;
}

/**
 * kernfs_rename_ns - move and rename a kernfs_node
1235
 * @kn: target node
1236 1237 1238 1239
 * @new_parent: new parent to put @sd under
 * @new_name: new name
 * @new_ns: new namespace tag
 */
1240
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1241 1242
		     const char *new_name, const void *new_ns)
{
1243 1244
	struct kernfs_node *old_parent;
	const char *old_name = NULL;
1245 1246
	int error;

1247 1248 1249 1250
	/* can't move or rename root */
	if (!kn->parent)
		return -EINVAL;

1251 1252
	mutex_lock(&kernfs_mutex);

1253
	error = -ENOENT;
T
Tejun Heo 已提交
1254
	if (!kernfs_active(kn) || !kernfs_active(new_parent))
1255 1256
		goto out;

1257
	error = 0;
1258 1259
	if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
	    (strcmp(kn->name, new_name) == 0))
1260
		goto out;	/* nothing to rename */
1261 1262 1263

	error = -EEXIST;
	if (kernfs_find_ns(new_parent, new_name, new_ns))
1264
		goto out;
1265

1266
	/* rename kernfs_node */
1267
	if (strcmp(kn->name, new_name) != 0) {
1268 1269 1270
		error = -ENOMEM;
		new_name = kstrdup(new_name, GFP_KERNEL);
		if (!new_name)
1271
			goto out;
1272 1273
	} else {
		new_name = NULL;
1274 1275 1276 1277 1278
	}

	/*
	 * Move to the appropriate place in the appropriate directories rbtree.
	 */
1279
	kernfs_unlink_sibling(kn);
1280
	kernfs_get(new_parent);
1281 1282 1283 1284 1285

	/* rename_lock protects ->parent and ->name accessors */
	spin_lock_irq(&kernfs_rename_lock);

	old_parent = kn->parent;
1286
	kn->parent = new_parent;
1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297

	kn->ns = new_ns;
	if (new_name) {
		if (!(kn->flags & KERNFS_STATIC_NAME))
			old_name = kn->name;
		kn->flags &= ~KERNFS_STATIC_NAME;
		kn->name = new_name;
	}

	spin_unlock_irq(&kernfs_rename_lock);

1298
	kn->hash = kernfs_name_hash(kn->name, kn->ns);
1299
	kernfs_link_sibling(kn);
1300

1301 1302 1303
	kernfs_put(old_parent);
	kfree(old_name);

1304
	error = 0;
1305
 out:
1306
	mutex_unlock(&kernfs_mutex);
1307 1308 1309 1310
	return error;
}

/* Relationship between s_mode and the DT_xxx types */
1311
static inline unsigned char dt_type(struct kernfs_node *kn)
1312
{
1313
	return (kn->mode >> 12) & 15;
1314 1315
}

1316
static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
1317 1318 1319 1320 1321
{
	kernfs_put(filp->private_data);
	return 0;
}

1322
/*
 * kernfs_dir_pos - find the node at which a directory walk should resume
 * @ns: namespace tag the walk is restricted to
 * @parent: directory being walked
 * @hash: hash offset to resume from
 * @pos: node remembered from the previous step, or %NULL
 *
 * The reference on @pos is dropped here.  @pos is reused only if it is
 * still active, still a child of @parent and still has hash @hash.
 * Otherwise, for @hash in (1, INT_MAX), @parent's rbtree is searched by
 * hash; if no node matches exactly, the walk continues from the last node
 * visited during the descent.  Entries which are not active or belong to
 * a different namespace are then skipped.
 *
 * Must be called with kernfs_mutex held (kernfs_active() asserts it).
 * Returns the node to emit next, or %NULL if there is none.  No reference
 * is taken on the returned node.
 */
static struct kernfs_node *kernfs_dir_pos(const void *ns,
	struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
{
	if (pos) {
		bool valid = kernfs_active(pos) &&
			pos->parent == parent && hash == pos->hash;

		kernfs_put(pos);
		if (!valid)
			pos = NULL;
	}
	if (!pos && (hash > 1) && (hash < INT_MAX)) {
		struct rb_node *node = parent->dir.children.rb_node;

		while (node) {
			pos = rb_to_kn(node);

			if (hash < pos->hash)
				node = node->rb_left;
			else if (hash > pos->hash)
				node = node->rb_right;
			else
				break;
		}
	}
	/* Skip over entries which are dying/dead or in the wrong namespace */
	while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
		struct rb_node *node = rb_next(&pos->rb);

		if (!node)
			pos = NULL;
		else
			pos = rb_to_kn(node);
	}
	return pos;
}

1356
/*
 * kernfs_dir_next_pos - advance a directory walk by one entry
 * @ns: namespace tag the walk is restricted to
 * @parent: directory being walked
 * @ino: hash offset of the current position (despite the name, this is
 *	 passed to kernfs_dir_pos() as the hash; the caller hands in
 *	 ctx->pos)
 * @pos: node emitted by the previous step
 *
 * Re-validates the current position through kernfs_dir_pos(), then steps
 * to the following rbtree entry, skipping entries which are not active or
 * belong to a different namespace.  Returns the next node, or %NULL when
 * the directory is exhausted.
 */
static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
	struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
{
	pos = kernfs_dir_pos(ns, parent, ino, pos);
	if (pos) {
		do {
			struct rb_node *node = rb_next(&pos->rb);

			if (!node)
				pos = NULL;
			else
				pos = rb_to_kn(node);
		} while (pos && (!kernfs_active(pos) || pos->ns != ns));
	}
	return pos;
}

1372
static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
1373 1374
{
	struct dentry *dentry = file->f_path.dentry;
1375 1376
	struct kernfs_node *parent = dentry->d_fsdata;
	struct kernfs_node *pos = file->private_data;
1377 1378 1379 1380
	const void *ns = NULL;

	if (!dir_emit_dots(file, ctx))
		return 0;
1381
	mutex_lock(&kernfs_mutex);
1382

1383
	if (kernfs_ns_enabled(parent))
1384
		ns = kernfs_info(dentry->d_sb)->ns;
1385

1386
	for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
1387
	     pos;
1388
	     pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
1389
		const char *name = pos->name;
1390 1391
		unsigned int type = dt_type(pos);
		int len = strlen(name);
1392
		ino_t ino = pos->ino;
1393

1394
		ctx->pos = pos->hash;
1395 1396 1397
		file->private_data = pos;
		kernfs_get(pos);

1398
		mutex_unlock(&kernfs_mutex);
1399 1400
		if (!dir_emit(ctx, name, len, ino, type))
			return 0;
1401
		mutex_lock(&kernfs_mutex);
1402
	}
1403
	mutex_unlock(&kernfs_mutex);
1404 1405 1406 1407 1408
	file->private_data = NULL;
	ctx->pos = INT_MAX;
	return 0;
}

1409 1410
/*
 * llseek method for kernfs directories: generic_file_llseek() performed
 * with the directory inode's i_mutex held.  Returns whatever
 * generic_file_llseek() returns.
 */
static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
				    int whence)
{
	struct inode *inode = file_inode(file);
	loff_t ret;

	mutex_lock(&inode->i_mutex);
	ret = generic_file_llseek(file, offset, whence);
	mutex_unlock(&inode->i_mutex);

	return ret;
}

1422
const struct file_operations kernfs_dir_fops = {
1423
	.read		= generic_read_dir,
1424 1425 1426
	.iterate	= kernfs_fop_readdir,
	.release	= kernfs_dir_fop_release,
	.llseek		= kernfs_dir_fop_llseek,
1427
};