/*
 * fs/kernfs/dir.c - kernfs directory implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 */

#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/hash.h>

#include "kernfs-internal.h"

DEFINE_MUTEX(kernfs_mutex);
static DEFINE_SPINLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by rename_lock */

#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)

static bool kernfs_active(struct kernfs_node *kn)
{
	lockdep_assert_held(&kernfs_mutex);
	return atomic_read(&kn->active) >= 0;
}

static bool kernfs_lockdep(struct kernfs_node *kn)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	return kn->flags & KERNFS_LOCKDEP;
#else
	return false;
#endif
}

static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
{
	return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
}

static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
					      size_t buflen)
{
	char *p = buf + buflen;
	int len;

	*--p = '\0';

	do {
		len = strlen(kn->name);
		if (p - buf < len + 1) {
			buf[0] = '\0';
			p = NULL;
			break;
		}
		p -= len;
		memcpy(p, kn->name, len);
		*--p = '/';
		kn = kn->parent;
	} while (kn && kn->parent);

	return p;
}

/**
 * kernfs_name - obtain the name of a given node
 * @kn: kernfs_node of interest
 * @buf: buffer to copy @kn's name into
 * @buflen: size of @buf
 *
 * Copies the name of @kn into @buf of @buflen bytes.  The behavior is
 * similar to strlcpy().  It returns the length of @kn's name and if @buf
 * isn't long enough, it's filled up to @buflen-1 and nul terminated.
 *
 * This function can be called from any context.
 */
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&kernfs_rename_lock, flags);
	ret = kernfs_name_locked(kn, buf, buflen);
	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
	return ret;
}

/**
 * kernfs_path - build full path of a given node
 * @kn: kernfs_node of interest
 * @buf: buffer to copy @kn's name into
 * @buflen: size of @buf
 *
 * Builds and returns the full path of @kn in @buf of @buflen bytes.  The
 * path is built from the end of @buf so the returned pointer usually
 * doesn't match @buf.  If @buf isn't long enough, @buf is nul terminated
 * and %NULL is returned.
 */
char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
{
	unsigned long flags;
	char *p;

	spin_lock_irqsave(&kernfs_rename_lock, flags);
	p = kernfs_path_locked(kn, buf, buflen);
	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
	return p;
}
EXPORT_SYMBOL_GPL(kernfs_path);
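
/*
 * Illustrative usage sketch (not part of this file): since the path is
 * assembled from the end of the buffer, callers should use the returned
 * pointer rather than the buffer start, e.g.:
 *
 *	char buf[PATH_MAX];
 *	char *p = kernfs_path(kn, buf, sizeof(buf));
 *
 *	if (p)
 *		pr_info("node at %s\n", p);
 *	else
 *		pr_info("path did not fit in %zu bytes\n", sizeof(buf));
 */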

/**
 * pr_cont_kernfs_name - pr_cont name of a kernfs_node
 * @kn: kernfs_node of interest
 *
 * This function can be called from any context.
 */
void pr_cont_kernfs_name(struct kernfs_node *kn)
{
	unsigned long flags;

	spin_lock_irqsave(&kernfs_rename_lock, flags);

	kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
	pr_cont("%s", kernfs_pr_cont_buf);

	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
}

/**
 * pr_cont_kernfs_path - pr_cont path of a kernfs_node
 * @kn: kernfs_node of interest
 *
 * This function can be called from any context.
 */
void pr_cont_kernfs_path(struct kernfs_node *kn)
{
	unsigned long flags;
	char *p;

	spin_lock_irqsave(&kernfs_rename_lock, flags);

	p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
			       sizeof(kernfs_pr_cont_buf));
	if (p)
		pr_cont("%s", p);
	else
		pr_cont("<name too long>");

	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
}

/**
 * kernfs_get_parent - determine the parent node and pin it
 * @kn: kernfs_node of interest
 *
 * Determines @kn's parent, pins and returns it.  This function can be
 * called from any context.
 */
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
	struct kernfs_node *parent;
	unsigned long flags;

	spin_lock_irqsave(&kernfs_rename_lock, flags);
	parent = kn->parent;
	kernfs_get(parent);
	spin_unlock_irqrestore(&kernfs_rename_lock, flags);

	return parent;
}

/**
 *	kernfs_name_hash
 *	@name: Null terminated string to hash
 *	@ns:   Namespace tag to hash
 *
 *	Returns 31 bit hash of ns + name (so it fits in an off_t)
 */
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
	unsigned long hash = init_name_hash();
	unsigned int len = strlen(name);
	while (len--)
		hash = partial_name_hash(*name++, hash);
	hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
	hash &= 0x7fffffffU;
	/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
	if (hash < 2)
		hash += 2;
	if (hash >= INT_MAX)
		hash = INT_MAX - 1;
	return hash;
}

static int kernfs_name_compare(unsigned int hash, const char *name,
			       const void *ns, const struct kernfs_node *kn)
{
	if (hash < kn->hash)
		return -1;
	if (hash > kn->hash)
		return 1;
	if (ns < kn->ns)
		return -1;
	if (ns > kn->ns)
		return 1;
	return strcmp(name, kn->name);
}

static int kernfs_sd_compare(const struct kernfs_node *left,
			     const struct kernfs_node *right)
{
	return kernfs_name_compare(left->hash, left->name, left->ns, right);
}

/**
 *	kernfs_link_sibling - link kernfs_node into sibling rbtree
 *	@kn: kernfs_node of interest
 *
 *	Link @kn into its sibling rbtree which starts from
 *	@kn->parent->dir.children.
 *
 *	Locking:
 *	mutex_lock(kernfs_mutex)
 *
 *	RETURNS:
 *	0 on success, -EEXIST on failure.
 */
static int kernfs_link_sibling(struct kernfs_node *kn)
{
	struct rb_node **node = &kn->parent->dir.children.rb_node;
	struct rb_node *parent = NULL;

	while (*node) {
		struct kernfs_node *pos;
		int result;

		pos = rb_to_kn(*node);
		parent = *node;
		result = kernfs_sd_compare(kn, pos);
		if (result < 0)
			node = &pos->rb.rb_left;
		else if (result > 0)
			node = &pos->rb.rb_right;
		else
			return -EEXIST;
	}

	/* add new node and rebalance the tree */
	rb_link_node(&kn->rb, parent, node);
	rb_insert_color(&kn->rb, &kn->parent->dir.children);

	/* successfully added, account subdir number */
	if (kernfs_type(kn) == KERNFS_DIR)
		kn->parent->dir.subdirs++;

	return 0;
}

/**
 *	kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
 *	@kn: kernfs_node of interest
 *
 *	Try to unlink @kn from its sibling rbtree which starts from
 *	kn->parent->dir.children.  Returns %true if @kn was actually
 *	removed, %false if @kn wasn't on the rbtree.
 *
 *	Locking:
 *	mutex_lock(kernfs_mutex)
 */
static bool kernfs_unlink_sibling(struct kernfs_node *kn)
{
	if (RB_EMPTY_NODE(&kn->rb))
		return false;

	if (kernfs_type(kn) == KERNFS_DIR)
		kn->parent->dir.subdirs--;

	rb_erase(&kn->rb, &kn->parent->dir.children);
	RB_CLEAR_NODE(&kn->rb);
	return true;
}

/**
 *	kernfs_get_active - get an active reference to kernfs_node
 *	@kn: kernfs_node to get an active reference to
 *
 *	Get an active reference of @kn.  This function is a noop if @kn
 *	is NULL.
 *
 *	RETURNS:
 *	Pointer to @kn on success, NULL on failure.
 */
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
{
	if (unlikely(!kn))
		return NULL;

	if (!atomic_inc_unless_negative(&kn->active))
		return NULL;

	if (kernfs_lockdep(kn))
		rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
	return kn;
}

/**
 *	kernfs_put_active - put an active reference to kernfs_node
 *	@kn: kernfs_node to put an active reference to
 *
 *	Put an active reference to @kn.  This function is a noop if @kn
 *	is NULL.
 */
void kernfs_put_active(struct kernfs_node *kn)
{
	struct kernfs_root *root;
	int v;

	if (unlikely(!kn))
		return;

	root = kernfs_root(kn);

	if (kernfs_lockdep(kn))
		rwsem_release(&kn->dep_map, 1, _RET_IP_);
	v = atomic_dec_return(&kn->active);
	if (likely(v != KN_DEACTIVATED_BIAS))
		return;

	wake_up_all(&root->deactivate_waitq);
}

/**
 * kernfs_drain - drain kernfs_node
 * @kn: kernfs_node to drain
 *
 * Drain existing usages and nuke all existing mmaps of @kn.  Multiple
 * removers may invoke this function concurrently on @kn and all will
 * return after draining is complete.
 */
static void kernfs_drain(struct kernfs_node *kn)
	__releases(&kernfs_mutex) __acquires(&kernfs_mutex)
{
	struct kernfs_root *root = kernfs_root(kn);

	lockdep_assert_held(&kernfs_mutex);
	WARN_ON_ONCE(kernfs_active(kn));

	mutex_unlock(&kernfs_mutex);

	if (kernfs_lockdep(kn)) {
		rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
		if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
			lock_contended(&kn->dep_map, _RET_IP_);
	}

	/* but everyone should wait for draining */
	wait_event(root->deactivate_waitq,
		   atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);

	if (kernfs_lockdep(kn)) {
		lock_acquired(&kn->dep_map, _RET_IP_);
		rwsem_release(&kn->dep_map, 1, _RET_IP_);
	}

	kernfs_unmap_bin_file(kn);

	mutex_lock(&kernfs_mutex);
}

/**
 * kernfs_get - get a reference count on a kernfs_node
 * @kn: the target kernfs_node
 */
void kernfs_get(struct kernfs_node *kn)
{
	if (kn) {
		WARN_ON(!atomic_read(&kn->count));
		atomic_inc(&kn->count);
	}
}
EXPORT_SYMBOL_GPL(kernfs_get);

/**
 * kernfs_put - put a reference count on a kernfs_node
 * @kn: the target kernfs_node
 *
 * Put a reference count of @kn and destroy it if it reached zero.
 */
void kernfs_put(struct kernfs_node *kn)
{
	struct kernfs_node *parent;
	struct kernfs_root *root;

	if (!kn || !atomic_dec_and_test(&kn->count))
		return;
	root = kernfs_root(kn);
 repeat:
	/*
	 * Moving/renaming is always done while holding reference.
	 * kn->parent won't change beneath us.
	 */
	parent = kn->parent;

	WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
		  "kernfs_put: %s/%s: released with incorrect active_ref %d\n",
		  parent ? parent->name : "", kn->name, atomic_read(&kn->active));

	if (kernfs_type(kn) == KERNFS_LINK)
		kernfs_put(kn->symlink.target_kn);

	kfree_const(kn->name);

	if (kn->iattr) {
		if (kn->iattr->ia_secdata)
			security_release_secctx(kn->iattr->ia_secdata,
						kn->iattr->ia_secdata_len);
		simple_xattrs_free(&kn->iattr->xattrs);
	}
	kfree(kn->iattr);
	ida_simple_remove(&root->ino_ida, kn->ino);
	kmem_cache_free(kernfs_node_cache, kn);

	kn = parent;
	if (kn) {
		if (atomic_dec_and_test(&kn->count))
			goto repeat;
	} else {
		/* just released the root kn, free @root too */
		ida_destroy(&root->ino_ida);
		kfree(root);
	}
}
EXPORT_SYMBOL_GPL(kernfs_put);

static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
{
	struct kernfs_node *kn;

	if (flags & LOOKUP_RCU)
		return -ECHILD;

	/* Always perform fresh lookup for negatives */
	if (!dentry->d_inode)
		goto out_bad_unlocked;

	kn = dentry->d_fsdata;
	mutex_lock(&kernfs_mutex);

	/* The kernfs node has been deactivated */
	if (!kernfs_active(kn))
		goto out_bad;

	/* The kernfs node has been moved? */
	if (dentry->d_parent->d_fsdata != kn->parent)
		goto out_bad;

	/* The kernfs node has been renamed */
	if (strcmp(dentry->d_name.name, kn->name) != 0)
		goto out_bad;

	/* The kernfs node has been moved to a different namespace */
	if (kn->parent && kernfs_ns_enabled(kn->parent) &&
	    kernfs_info(dentry->d_sb)->ns != kn->ns)
		goto out_bad;

	mutex_unlock(&kernfs_mutex);
	return 1;
out_bad:
	mutex_unlock(&kernfs_mutex);
out_bad_unlocked:
	return 0;
}

static void kernfs_dop_release(struct dentry *dentry)
{
	kernfs_put(dentry->d_fsdata);
}

const struct dentry_operations kernfs_dops = {
	.d_revalidate	= kernfs_dop_revalidate,
	.d_release	= kernfs_dop_release,
};

/**
 * kernfs_node_from_dentry - determine kernfs_node associated with a dentry
 * @dentry: the dentry in question
 *
 * Return the kernfs_node associated with @dentry.  If @dentry is not a
 * kernfs one, %NULL is returned.
 *
 * While the returned kernfs_node will stay accessible as long as @dentry
 * is accessible, the returned node can be in any state and the caller is
 * fully responsible for determining what's accessible.
 */
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
{
	if (dentry->d_sb->s_op == &kernfs_sops)
		return dentry->d_fsdata;
	return NULL;
}

static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
					     const char *name, umode_t mode,
					     unsigned flags)
{
	struct kernfs_node *kn;
	int ret;

	name = kstrdup_const(name, GFP_KERNEL);
	if (!name)
		return NULL;

	kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
	if (!kn)
		goto err_out1;

	ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
	if (ret < 0)
		goto err_out2;
	kn->ino = ret;

	atomic_set(&kn->count, 1);
	atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
	RB_CLEAR_NODE(&kn->rb);

	kn->name = name;
	kn->mode = mode;
	kn->flags = flags;

	return kn;

 err_out2:
	kmem_cache_free(kernfs_node_cache, kn);
 err_out1:
	kfree_const(name);
	return NULL;
}

struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
				    const char *name, umode_t mode,
				    unsigned flags)
{
	struct kernfs_node *kn;

	kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
	if (kn) {
		kernfs_get(parent);
		kn->parent = parent;
	}
	return kn;
}

/**
 *	kernfs_add_one - add kernfs_node to parent without warning
 *	@kn: kernfs_node to be added
 *
 *	The caller must already have initialized @kn->parent.  This
 *	function increments nlink of the parent's inode if @kn is a
 *	directory and links @kn into the children list of the parent.
 *
 *	RETURNS:
 *	0 on success, -EEXIST if entry with the given name already
 *	exists.
 */
int kernfs_add_one(struct kernfs_node *kn)
{
	struct kernfs_node *parent = kn->parent;
	struct kernfs_iattrs *ps_iattr;
	bool has_ns;
	int ret;

	mutex_lock(&kernfs_mutex);

	ret = -EINVAL;
	has_ns = kernfs_ns_enabled(parent);
	if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
		 has_ns ? "required" : "invalid", parent->name, kn->name))
		goto out_unlock;

	if (kernfs_type(parent) != KERNFS_DIR)
		goto out_unlock;

	ret = -ENOENT;
	if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
		goto out_unlock;

	kn->hash = kernfs_name_hash(kn->name, kn->ns);

	ret = kernfs_link_sibling(kn);
	if (ret)
		goto out_unlock;

	/* Update timestamps on the parent */
	ps_iattr = parent->iattr;
	if (ps_iattr) {
		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
	}

	mutex_unlock(&kernfs_mutex);

	/*
	 * Activate the new node unless CREATE_DEACTIVATED is requested.
	 * If not activated here, the kernfs user is responsible for
	 * activating the node with kernfs_activate().  A node which hasn't
	 * been activated is not visible to userland and its removal won't
	 * trigger deactivation.
	 */
	if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
		kernfs_activate(kn);
	return 0;

out_unlock:
	mutex_unlock(&kernfs_mutex);
	return ret;
}

/**
 * kernfs_find_ns - find kernfs_node with the given name
 * @parent: kernfs_node to search under
 * @name: name to look for
 * @ns: the namespace tag to use
 *
 * Look for kernfs_node with name @name under @parent.  Returns pointer to
 * the found kernfs_node on success, %NULL on failure.
 */
static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
					  const unsigned char *name,
					  const void *ns)
{
	struct rb_node *node = parent->dir.children.rb_node;
	bool has_ns = kernfs_ns_enabled(parent);
	unsigned int hash;

	lockdep_assert_held(&kernfs_mutex);

	if (has_ns != (bool)ns) {
		WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
		     has_ns ? "required" : "invalid", parent->name, name);
		return NULL;
	}

	hash = kernfs_name_hash(name, ns);
	while (node) {
		struct kernfs_node *kn;
		int result;

		kn = rb_to_kn(node);
		result = kernfs_name_compare(hash, name, ns, kn);
		if (result < 0)
			node = node->rb_left;
		else if (result > 0)
			node = node->rb_right;
		else
			return kn;
	}
	return NULL;
}

/**
 * kernfs_find_and_get_ns - find and get kernfs_node with the given name
 * @parent: kernfs_node to search under
 * @name: name to look for
 * @ns: the namespace tag to use
 *
 * Look for kernfs_node with name @name under @parent and get a reference
 * if found.  This function may sleep and returns pointer to the found
 * kernfs_node on success, %NULL on failure.
 */
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
					   const char *name, const void *ns)
{
	struct kernfs_node *kn;

	mutex_lock(&kernfs_mutex);
	kn = kernfs_find_ns(parent, name, ns);
	kernfs_get(kn);
	mutex_unlock(&kernfs_mutex);

	return kn;
}
EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
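
/*
 * Illustrative usage sketch (the "example" name is hypothetical): the
 * lookup pins the node, so pair it with kernfs_put() when done, e.g.:
 *
 *	struct kernfs_node *kn;
 *
 *	kn = kernfs_find_and_get_ns(parent, "example", NULL);
 *	if (kn) {
 *		pr_debug("found %s\n", kn->name);
 *		kernfs_put(kn);
 *	}
 */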

/**
 * kernfs_create_root - create a new kernfs hierarchy
 * @scops: optional syscall operations for the hierarchy
 * @flags: KERNFS_ROOT_* flags
 * @priv: opaque data associated with the new directory
 *
 * Returns the root of the new hierarchy on success, ERR_PTR() value on
 * failure.
 */
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
				       unsigned int flags, void *priv)
{
	struct kernfs_root *root;
	struct kernfs_node *kn;

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root)
		return ERR_PTR(-ENOMEM);

	ida_init(&root->ino_ida);
	INIT_LIST_HEAD(&root->supers);

	kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
			       KERNFS_DIR);
	if (!kn) {
		ida_destroy(&root->ino_ida);
		kfree(root);
		return ERR_PTR(-ENOMEM);
	}

	kn->priv = priv;
	kn->dir.root = root;

	root->syscall_ops = scops;
	root->flags = flags;
	root->kn = kn;
	init_waitqueue_head(&root->deactivate_waitq);

	if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
		kernfs_activate(kn);

	return root;
}

/**
 * kernfs_destroy_root - destroy a kernfs hierarchy
 * @root: root of the hierarchy to destroy
 *
 * Destroy the hierarchy anchored at @root by removing all existing
 * directories and destroying @root.
 */
void kernfs_destroy_root(struct kernfs_root *root)
{
	kernfs_remove(root->kn);	/* will also free @root */
}

/**
 * kernfs_create_dir_ns - create a directory
 * @parent: parent in which to create a new directory
 * @name: name of the new directory
 * @mode: mode of the new directory
 * @priv: opaque data associated with the new directory
 * @ns: optional namespace tag of the directory
 *
 * Returns the created node on success, ERR_PTR() value on failure.
 */
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
					 const char *name, umode_t mode,
					 void *priv, const void *ns)
{
	struct kernfs_node *kn;
	int rc;

	/* allocate */
	kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
	if (!kn)
		return ERR_PTR(-ENOMEM);

	kn->dir.root = parent->dir.root;
	kn->ns = ns;
	kn->priv = priv;

	/* link in */
	rc = kernfs_add_one(kn);
	if (!rc)
		return kn;

	kernfs_put(kn);
	return ERR_PTR(rc);
}
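
/*
 * Illustrative usage sketch (the "example" name and example_priv pointer
 * are hypothetical): a caller typically pairs kernfs_create_root() with
 * kernfs_create_dir_ns() along these lines.
 *
 *	struct kernfs_root *root;
 *	struct kernfs_node *dir;
 *
 *	root = kernfs_create_root(NULL, 0, example_priv);
 *	if (IS_ERR(root))
 *		return PTR_ERR(root);
 *
 *	dir = kernfs_create_dir_ns(root->kn, "example", 0755,
 *				   example_priv, NULL);
 *	if (IS_ERR(dir)) {
 *		kernfs_destroy_root(root);
 *		return PTR_ERR(dir);
 *	}
 */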

static struct dentry *kernfs_iop_lookup(struct inode *dir,
					struct dentry *dentry,
					unsigned int flags)
{
	struct dentry *ret;
	struct kernfs_node *parent = dentry->d_parent->d_fsdata;
	struct kernfs_node *kn;
	struct inode *inode;
	const void *ns = NULL;

	mutex_lock(&kernfs_mutex);

	if (kernfs_ns_enabled(parent))
		ns = kernfs_info(dir->i_sb)->ns;

	kn = kernfs_find_ns(parent, dentry->d_name.name, ns);

	/* no such entry */
	if (!kn || !kernfs_active(kn)) {
		ret = NULL;
		goto out_unlock;
	}
	kernfs_get(kn);
	dentry->d_fsdata = kn;

	/* attach dentry and inode */
	inode = kernfs_get_inode(dir->i_sb, kn);
	if (!inode) {
		ret = ERR_PTR(-ENOMEM);
		goto out_unlock;
	}

	/* instantiate and hash dentry */
	ret = d_splice_alias(inode, dentry);
 out_unlock:
	mutex_unlock(&kernfs_mutex);
	return ret;
}

static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
			    umode_t mode)
{
	struct kernfs_node *parent = dir->i_private;
	struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
	int ret;

	if (!scops || !scops->mkdir)
		return -EPERM;

	if (!kernfs_get_active(parent))
		return -ENODEV;

	ret = scops->mkdir(parent, dentry->d_name.name, mode);

	kernfs_put_active(parent);
	return ret;
}

static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
{
	struct kernfs_node *kn  = dentry->d_fsdata;
	struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
	int ret;

	if (!scops || !scops->rmdir)
		return -EPERM;

	if (!kernfs_get_active(kn))
		return -ENODEV;

	ret = scops->rmdir(kn);

	kernfs_put_active(kn);
	return ret;
}

static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
			     struct inode *new_dir, struct dentry *new_dentry)
{
	struct kernfs_node *kn  = old_dentry->d_fsdata;
	struct kernfs_node *new_parent = new_dir->i_private;
	struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
	int ret;

	if (!scops || !scops->rename)
		return -EPERM;

	if (!kernfs_get_active(kn))
		return -ENODEV;

	if (!kernfs_get_active(new_parent)) {
		kernfs_put_active(kn);
		return -ENODEV;
	}

	ret = scops->rename(kn, new_parent, new_dentry->d_name.name);

	kernfs_put_active(new_parent);
	kernfs_put_active(kn);
	return ret;
}

const struct inode_operations kernfs_dir_iops = {
	.lookup		= kernfs_iop_lookup,
	.permission	= kernfs_iop_permission,
	.setattr	= kernfs_iop_setattr,
	.getattr	= kernfs_iop_getattr,
	.setxattr	= kernfs_iop_setxattr,
	.removexattr	= kernfs_iop_removexattr,
	.getxattr	= kernfs_iop_getxattr,
	.listxattr	= kernfs_iop_listxattr,

	.mkdir		= kernfs_iop_mkdir,
	.rmdir		= kernfs_iop_rmdir,
	.rename		= kernfs_iop_rename,
};

static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
{
	struct kernfs_node *last;

	while (true) {
		struct rb_node *rbn;

		last = pos;

		if (kernfs_type(pos) != KERNFS_DIR)
			break;

		rbn = rb_first(&pos->dir.children);
		if (!rbn)
			break;

		pos = rb_to_kn(rbn);
	}

	return last;
}

/**
 * kernfs_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: kernfs_node whose descendants to walk
 *
 * Find the next descendant to visit for post-order traversal of @root's
 * descendants.  @root is included in the iteration and the last node to be
 * visited.
 */
static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
						       struct kernfs_node *root)
{
	struct rb_node *rbn;

	lockdep_assert_held(&kernfs_mutex);

	/* if first iteration, visit leftmost descendant which may be root */
	if (!pos)
		return kernfs_leftmost_descendant(root);

	/* if we visited @root, we're done */
	if (pos == root)
		return NULL;

	/* if there's an unvisited sibling, visit its leftmost descendant */
	rbn = rb_next(&pos->rb);
	if (rbn)
		return kernfs_leftmost_descendant(rb_to_kn(rbn));

	/* no sibling left, visit parent */
	return pos->parent;
}

/**
 * kernfs_activate - activate a node which started deactivated
 * @kn: kernfs_node whose subtree is to be activated
 *
 * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node
 * needs to be explicitly activated.  A node which hasn't been activated
 * isn't visible to userland and deactivation is skipped during its
 * removal.  This is useful to construct atomic init sequences where
 * creation of multiple nodes should either succeed or fail atomically.
 *
 * The caller is responsible for ensuring that this function is not called
 * after kernfs_remove*() is invoked on @kn.
 */
void kernfs_activate(struct kernfs_node *kn)
{
	struct kernfs_node *pos;

	mutex_lock(&kernfs_mutex);

	pos = NULL;
	while ((pos = kernfs_next_descendant_post(pos, kn))) {
		if (!pos || (pos->flags & KERNFS_ACTIVATED))
			continue;

		WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
		WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);

		atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
		pos->flags |= KERNFS_ACTIVATED;
	}

	mutex_unlock(&kernfs_mutex);
}

static void __kernfs_remove(struct kernfs_node *kn)
{
	struct kernfs_node *pos;

	lockdep_assert_held(&kernfs_mutex);

	/*
	 * Short-circuit if non-root @kn has already finished removal.
	 * This is for kernfs_remove_self() which plays with active ref
	 * after removal.
	 */
	if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb)))
		return;

	pr_debug("kernfs %s: removing\n", kn->name);

	/* prevent any new usage under @kn by deactivating all nodes */
	pos = NULL;
	while ((pos = kernfs_next_descendant_post(pos, kn)))
		if (kernfs_active(pos))
			atomic_add(KN_DEACTIVATED_BIAS, &pos->active);

	/* deactivate and unlink the subtree node-by-node */
	do {
		pos = kernfs_leftmost_descendant(kn);

		/*
		 * kernfs_drain() drops kernfs_mutex temporarily and @pos's
		 * base ref could have been put by someone else by the time
		 * the function returns.  Make sure it doesn't go away
		 * underneath us.
		 */
		kernfs_get(pos);

		/*
		 * Drain iff @kn was activated.  This avoids draining and
		 * its lockdep annotations for nodes which have never been
		 * activated and allows embedding kernfs_remove() in create
		 * error paths without worrying about draining.
		 */
		if (kn->flags & KERNFS_ACTIVATED)
			kernfs_drain(pos);
		else
			WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);

		/*
		 * kernfs_unlink_sibling() succeeds once per node.  Use it
		 * to decide who's responsible for cleanups.
		 */
		if (!pos->parent || kernfs_unlink_sibling(pos)) {
			struct kernfs_iattrs *ps_iattr =
				pos->parent ? pos->parent->iattr : NULL;

			/* update timestamps on the parent */
			if (ps_iattr) {
				ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
				ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
			}

			kernfs_put(pos);
		}

		kernfs_put(pos);
	} while (pos != kn);
}

/**
 * kernfs_remove - remove a kernfs_node recursively
 * @kn: the kernfs_node to remove
 *
 * Remove @kn along with all its subdirectories and files.
 */
void kernfs_remove(struct kernfs_node *kn)
{
	mutex_lock(&kernfs_mutex);
	__kernfs_remove(kn);
	mutex_unlock(&kernfs_mutex);
}

/**
 * kernfs_break_active_protection - break out of active protection
 * @kn: the self kernfs_node
 *
 * The caller must be running off of a kernfs operation which is invoked
 * with an active reference - e.g. one of kernfs_ops.  Each invocation of
 * this function must also be matched with an invocation of
 * kernfs_unbreak_active_protection().
 *
 * This function releases the active reference of @kn the caller is
 * holding.  Once this function is called, @kn may be removed at any point
 * and the caller is solely responsible for ensuring that the objects it
 * dereferences are accessible.
 */
void kernfs_break_active_protection(struct kernfs_node *kn)
{
	/*
	 * Take ourself out of the active ref dependency chain.  If
	 * we're called without an active ref, lockdep will complain.
	 */
	kernfs_put_active(kn);
}

/**
 * kernfs_unbreak_active_protection - undo kernfs_break_active_protection()
 * @kn: the self kernfs_node
 *
 * If kernfs_break_active_protection() was called, this function must be
 * invoked before finishing the kernfs operation.  Note that while this
 * function restores the active reference, it doesn't and can't actually
 * restore the active protection - @kn may already or be in the process of
 * being removed.  Once kernfs_break_active_protection() is invoked, that
 * protection is irreversibly gone for the kernfs operation instance.
 *
 * While this function may be called at any point after
 * kernfs_break_active_protection() is invoked, its most useful location
 * would be right before the enclosing kernfs operation returns.
 */
void kernfs_unbreak_active_protection(struct kernfs_node *kn)
{
	/*
	 * @kn->active could be in any state; however, the increment we do
	 * here will be undone as soon as the enclosing kernfs operation
	 * finishes and this temporary bump can't break anything.  If @kn
	 * is alive, nothing changes.  If @kn is being deactivated, the
	 * soon-to-follow put will either finish deactivation or restore
	 * deactivated state.  If @kn is already removed, the temporary
	 * bump is guaranteed to be gone before @kn is released.
	 */
	atomic_inc(&kn->active);
	if (kernfs_lockdep(kn))
		rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
}

/**
 * kernfs_remove_self - remove a kernfs_node from its own method
 * @kn: the self kernfs_node to remove
 *
 * The caller must be running off of a kernfs operation which is invoked
 * with an active reference - e.g. one of kernfs_ops.  This can be used to
 * implement a file operation which deletes itself.
 *
 * For example, the "delete" file for a sysfs device directory can be
 * implemented by invoking kernfs_remove_self() on the "delete" file
 * itself.  This function breaks the circular dependency of trying to
 * deactivate self while holding an active ref itself.  It isn't necessary
 * to modify the usual removal path to use kernfs_remove_self().  The
 * "delete" implementation can simply invoke kernfs_remove_self() on self
 * before proceeding with the usual removal path.  kernfs will ignore later
 * kernfs_remove() on self.
 *
 * kernfs_remove_self() can be called multiple times concurrently on the
 * same kernfs_node.  Only the first one actually performs removal and
 * returns %true.  All others will wait until the kernfs operation which
 * won self-removal finishes and return %false.  Note that the losers wait
 * for the completion of not only the winning kernfs_remove_self() but also
 * the whole kernfs_ops which won the arbitration.  This can be used to
 * guarantee, for example, all concurrent writes to a "delete" file to
 * finish only after the whole operation is complete.
 */
bool kernfs_remove_self(struct kernfs_node *kn)
{
	bool ret;

	mutex_lock(&kernfs_mutex);
	kernfs_break_active_protection(kn);

	/*
	 * SUICIDAL is used to arbitrate among competing invocations.  Only
	 * the first one will actually perform removal.  When the removal
	 * is complete, SUICIDED is set and the active ref is restored
	 * while holding kernfs_mutex.  The ones which lost arbitration
	 * waits for SUICIDED && drained which can happen only after the
	 * enclosing kernfs operation which executed the winning instance
	 * of kernfs_remove_self() finished.
	 */
	if (!(kn->flags & KERNFS_SUICIDAL)) {
		kn->flags |= KERNFS_SUICIDAL;
		__kernfs_remove(kn);
		kn->flags |= KERNFS_SUICIDED;
		ret = true;
	} else {
		wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq;
		DEFINE_WAIT(wait);

		while (true) {
			prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);

			if ((kn->flags & KERNFS_SUICIDED) &&
			    atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
				break;

			mutex_unlock(&kernfs_mutex);
			schedule();
			mutex_lock(&kernfs_mutex);
		}
		finish_wait(waitq, &wait);
		WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
		ret = false;
	}

	/*
	 * This must be done while holding kernfs_mutex; otherwise, waiting
	 * for SUICIDED && deactivated could finish prematurely.
	 */
	kernfs_unbreak_active_protection(kn);

	mutex_unlock(&kernfs_mutex);
	return ret;
}
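
/*
 * Illustrative usage sketch (the "delete" attribute and delete_write()
 * handler are hypothetical, not defined in kernfs): a kernfs_ops->write
 * callback can make its own file go away, e.g.:
 *
 *	static ssize_t delete_write(struct kernfs_open_file *of, char *buf,
 *				    size_t bytes, loff_t off)
 *	{
 *		if (kernfs_remove_self(of->kn))
 *			pr_debug("this writer won the removal\n");
 *		return bytes;
 *	}
 */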

/**
 * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
 * @parent: parent of the target
 * @name: name of the kernfs_node to remove
 * @ns: namespace tag of the kernfs_node to remove
 *
 * Look for the kernfs_node with @name and @ns under @parent and remove it.
 * Returns 0 on success, -ENOENT if such entry doesn't exist.
 */
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
			     const void *ns)
{
	struct kernfs_node *kn;

	if (!parent) {
		WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
			name);
		return -ENOENT;
	}

	mutex_lock(&kernfs_mutex);

	kn = kernfs_find_ns(parent, name, ns);
	if (kn)
		__kernfs_remove(kn);

	mutex_unlock(&kernfs_mutex);

	if (kn)
		return 0;
	else
		return -ENOENT;
}
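
/*
 * Illustrative usage sketch (hypothetical names): removal by name
 * mirrors creation, e.g. dropping the "example" directory shown above:
 *
 *	kernfs_remove_by_name_ns(root->kn, "example", NULL);
 */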

/**
 * kernfs_rename_ns - move and rename a kernfs_node
 * @kn: target node
 * @new_parent: new parent to put @kn under
 * @new_name: new name
 * @new_ns: new namespace tag
 */
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
		     const char *new_name, const void *new_ns)
{
	struct kernfs_node *old_parent;
	const char *old_name = NULL;
	int error;

	/* can't move or rename root */
	if (!kn->parent)
		return -EINVAL;

	mutex_lock(&kernfs_mutex);

	error = -ENOENT;
	if (!kernfs_active(kn) || !kernfs_active(new_parent))
		goto out;

	error = 0;
	if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
	    (strcmp(kn->name, new_name) == 0))
		goto out;	/* nothing to rename */

	error = -EEXIST;
	if (kernfs_find_ns(new_parent, new_name, new_ns))
		goto out;

	/* rename kernfs_node */
	if (strcmp(kn->name, new_name) != 0) {
		error = -ENOMEM;
		new_name = kstrdup_const(new_name, GFP_KERNEL);
		if (!new_name)
			goto out;
	} else {
		new_name = NULL;
	}

	/*
	 * Move to the appropriate place in the appropriate directories rbtree.
	 */
	kernfs_unlink_sibling(kn);
	kernfs_get(new_parent);

	/* rename_lock protects ->parent and ->name accessors */
	spin_lock_irq(&kernfs_rename_lock);

	old_parent = kn->parent;
	kn->parent = new_parent;

	kn->ns = new_ns;
	if (new_name) {
		old_name = kn->name;
		kn->name = new_name;
	}

	spin_unlock_irq(&kernfs_rename_lock);

	kn->hash = kernfs_name_hash(kn->name, kn->ns);
	kernfs_link_sibling(kn);

	kernfs_put(old_parent);
	kfree_const(old_name);

	error = 0;
 out:
	mutex_unlock(&kernfs_mutex);
	return error;
}

/* Relationship between s_mode and the DT_xxx types */
static inline unsigned char dt_type(struct kernfs_node *kn)
{
	return (kn->mode >> 12) & 15;
}

static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
{
	kernfs_put(filp->private_data);
	return 0;
}

static struct kernfs_node *kernfs_dir_pos(const void *ns,
	struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
{
	if (pos) {
		int valid = kernfs_active(pos) &&
			pos->parent == parent && hash == pos->hash;
		kernfs_put(pos);
		if (!valid)
			pos = NULL;
	}
	if (!pos && (hash > 1) && (hash < INT_MAX)) {
		struct rb_node *node = parent->dir.children.rb_node;
		while (node) {
			pos = rb_to_kn(node);

			if (hash < pos->hash)
				node = node->rb_left;
			else if (hash > pos->hash)
				node = node->rb_right;
			else
				break;
		}
	}
	/* Skip over entries which are dying/dead or in the wrong namespace */
	while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
		struct rb_node *node = rb_next(&pos->rb);
		if (!node)
			pos = NULL;
		else
			pos = rb_to_kn(node);
	}
	return pos;
}

static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
	struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
{
	pos = kernfs_dir_pos(ns, parent, ino, pos);
	if (pos) {
		do {
			struct rb_node *node = rb_next(&pos->rb);
			if (!node)
				pos = NULL;
			else
				pos = rb_to_kn(node);
		} while (pos && (!kernfs_active(pos) || pos->ns != ns));
	}
	return pos;
}

static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
{
	struct dentry *dentry = file->f_path.dentry;
	struct kernfs_node *parent = dentry->d_fsdata;
	struct kernfs_node *pos = file->private_data;
	const void *ns = NULL;

	if (!dir_emit_dots(file, ctx))
		return 0;
	mutex_lock(&kernfs_mutex);

	if (kernfs_ns_enabled(parent))
		ns = kernfs_info(dentry->d_sb)->ns;

	for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
	     pos;
	     pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
		const char *name = pos->name;
		unsigned int type = dt_type(pos);
		int len = strlen(name);
		ino_t ino = pos->ino;

		ctx->pos = pos->hash;
		file->private_data = pos;
		kernfs_get(pos);

		mutex_unlock(&kernfs_mutex);
		if (!dir_emit(ctx, name, len, ino, type))
			return 0;
		mutex_lock(&kernfs_mutex);
	}
	mutex_unlock(&kernfs_mutex);
	file->private_data = NULL;
	ctx->pos = INT_MAX;
	return 0;
}

static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
				    int whence)
{
	struct inode *inode = file_inode(file);
	loff_t ret;

	mutex_lock(&inode->i_mutex);
	ret = generic_file_llseek(file, offset, whence);
	mutex_unlock(&inode->i_mutex);

	return ret;
}

const struct file_operations kernfs_dir_fops = {
	.read		= generic_read_dir,
	.iterate	= kernfs_fop_readdir,
	.release	= kernfs_dir_fop_release,
	.llseek		= kernfs_dir_fop_llseek,
};