dir.c 35.7 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 * fs/kernfs/dir.c - kernfs directory implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 */
10

11
#include <linux/sched.h>
12 13 14 15 16 17 18 19 20
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/hash.h>

#include "kernfs-internal.h"

21
DEFINE_MUTEX(kernfs_mutex);
22 23
static DEFINE_SPINLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by rename_lock */
24

25
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
26

T
Tejun Heo 已提交
27 28 29 30 31 32
/*
 * Report whether @kn is active, i.e. whether its ->active counter is
 * currently non-negative.  The caller must hold kernfs_mutex.
 */
static bool kernfs_active(struct kernfs_node *kn)
{
	int nr_active;

	lockdep_assert_held(&kernfs_mutex);

	nr_active = atomic_read(&kn->active);
	return nr_active >= 0;
}

33 34 35 36 37 38 39 40 41
/*
 * Tell whether @kn has KERNFS_LOCKDEP set in its flags.  The answer is
 * always false unless the kernel is built with CONFIG_DEBUG_LOCK_ALLOC.
 */
static bool kernfs_lockdep(struct kernfs_node *kn)
{
	bool annotated = false;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	annotated = kn->flags & KERNFS_LOCKDEP;
#endif
	return annotated;
}

42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
/*
 * Copy the name of @kn into @buf using strlcpy() and return its result.
 * A node without a parent is reported as "/".  The callers in this file
 * hold kernfs_rename_lock around the call.
 */
static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
{
	const char *name = "/";

	if (kn->parent)
		name = kn->name;

	return strlcpy(buf, name, buflen);
}

/*
 * Build the full path of @kn at the tail of @buf.
 *
 * The path is assembled backwards: starting from a terminating nul in the
 * last byte of @buf, each component name is prepended together with a
 * leading '/', walking ->parent links until a node without a parent is
 * reached.  The returned pointer is the start of the path inside @buf.
 *
 * If @buf cannot hold the path, @buf[0] is set to nul and %NULL is
 * returned.  A zero-sized buffer cannot even hold the terminator, so it is
 * left untouched and %NULL is returned.
 *
 * The callers in this file hold kernfs_rename_lock around the call.
 */
static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
					      size_t buflen)
{
	char *p;

	/* never step back before the start of @buf */
	if (!buflen)
		return NULL;

	p = buf + buflen;
	*--p = '\0';

	do {
		size_t len = strlen(kn->name);

		/* need room for the name plus its leading '/' */
		if ((size_t)(p - buf) < len + 1) {
			buf[0] = '\0';
			p = NULL;
			break;
		}
		p -= len;
		memcpy(p, kn->name, len);
		*--p = '/';
		kn = kn->parent;
	} while (kn && kn->parent);

	return p;
}

/**
 * kernfs_name - obtain the name of a given node
 * @kn: kernfs_node of interest
 * @buf: buffer to copy @kn's name into
 * @buflen: size of @buf
 *
 * Copies the name of @kn into @buf of @buflen bytes.  The behavior is
 * similar to strlcpy(): the length of @kn's name is returned and, if @buf
 * isn't long enough, it is filled up to @buflen-1 bytes and nul
 * terminated.
 *
 * This function can be called from any context.
 */
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{
	unsigned long irqflags;
	int len;

	spin_lock_irqsave(&kernfs_rename_lock, irqflags);
	len = kernfs_name_locked(kn, buf, buflen);
	spin_unlock_irqrestore(&kernfs_rename_lock, irqflags);

	return len;
}

/**
 * kernfs_path - build full path of a given node
 * @kn: kernfs_node of interest
 * @buf: buffer to build @kn's path in
 * @buflen: size of @buf
 *
 * Builds the full path of @kn in @buf of @buflen bytes and returns a
 * pointer to it.  The path is built from the end of @buf so the returned
 * pointer usually doesn't match @buf.  If @buf isn't long enough, @buf is
 * nul terminated and %NULL is returned.
 */
char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
{
	unsigned long irqflags;
	char *path;

	spin_lock_irqsave(&kernfs_rename_lock, irqflags);
	path = kernfs_path_locked(kn, buf, buflen);
	spin_unlock_irqrestore(&kernfs_rename_lock, irqflags);

	return path;
}
T
Tejun Heo 已提交
115
EXPORT_SYMBOL_GPL(kernfs_path);
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177

/**
 * pr_cont_kernfs_name - pr_cont name of a kernfs_node
 * @kn: kernfs_node of interest
 *
 * The name is staged in the shared kernfs_pr_cont_buf, which is why both
 * the copy and the print happen under kernfs_rename_lock.
 *
 * This function can be called from any context.
 */
void pr_cont_kernfs_name(struct kernfs_node *kn)
{
	unsigned long irqflags;

	spin_lock_irqsave(&kernfs_rename_lock, irqflags);

	/* the return value (name length) isn't needed here */
	kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
	pr_cont("%s", kernfs_pr_cont_buf);

	spin_unlock_irqrestore(&kernfs_rename_lock, irqflags);
}

/**
 * pr_cont_kernfs_path - pr_cont path of a kernfs_node
 * @kn: kernfs_node of interest
 *
 * The path is built in the shared kernfs_pr_cont_buf under
 * kernfs_rename_lock.  If it doesn't fit, "<name too long>" is printed
 * instead.
 *
 * This function can be called from any context.
 */
void pr_cont_kernfs_path(struct kernfs_node *kn)
{
	unsigned long irqflags;
	char *path;

	spin_lock_irqsave(&kernfs_rename_lock, irqflags);

	path = kernfs_path_locked(kn, kernfs_pr_cont_buf,
				  sizeof(kernfs_pr_cont_buf));
	if (!path)
		pr_cont("<name too long>");
	else
		pr_cont("%s", path);

	spin_unlock_irqrestore(&kernfs_rename_lock, irqflags);
}

/**
 * kernfs_get_parent - determine the parent node and pin it
 * @kn: kernfs_node of interest
 *
 * Reads @kn->parent under kernfs_rename_lock, takes a reference on it
 * and returns it.  This function can be called from any context.
 */
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
	struct kernfs_node *pinned;
	unsigned long irqflags;

	spin_lock_irqsave(&kernfs_rename_lock, irqflags);

	pinned = kn->parent;
	kernfs_get(pinned);

	spin_unlock_irqrestore(&kernfs_rename_lock, irqflags);

	return pinned;
}

178
/**
 *	kernfs_name_hash - hash a name together with its namespace tag
 *	@name: Null terminated string to hash
 *	@ns:   Namespace tag to hash
 *
 *	Returns 31 bit hash of ns + name (so it fits in an off_t )
 */
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
	unsigned long hash = init_name_hash();
	const char *c;

	/* fold in every character up to, but excluding, the terminator */
	for (c = name; *c; c++)
		hash = partial_name_hash(*c, hash);

	hash = end_name_hash(hash) ^ hash_ptr((void *)ns, 31);
	hash &= 0x7fffffffU;

	/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
	if (hash < 2)
		hash += 2;
	if (hash >= INT_MAX)
		hash = INT_MAX - 1;

	return hash;
}

201 202
static int kernfs_name_compare(unsigned int hash, const char *name,
			       const void *ns, const struct kernfs_node *kn)
203
{
204 205 206 207 208
	if (hash != kn->hash)
		return hash - kn->hash;
	if (ns != kn->ns)
		return ns - kn->ns;
	return strcmp(name, kn->name);
209 210
}

211 212
static int kernfs_sd_compare(const struct kernfs_node *left,
			     const struct kernfs_node *right)
213
{
214
	return kernfs_name_compare(left->hash, left->name, left->ns, right);
215 216 217
}

/**
218
 *	kernfs_link_sibling - link kernfs_node into sibling rbtree
219
 *	@kn: kernfs_node of interest
220
 *
221
 *	Link @kn into its sibling rbtree which starts from
222
 *	@kn->parent->dir.children.
223 224
 *
 *	Locking:
225
 *	mutex_lock(kernfs_mutex)
226 227 228 229
 *
 *	RETURNS:
 *	0 on susccess -EEXIST on failure.
 */
230
static int kernfs_link_sibling(struct kernfs_node *kn)
231
{
232
	struct rb_node **node = &kn->parent->dir.children.rb_node;
233 234 235
	struct rb_node *parent = NULL;

	while (*node) {
236
		struct kernfs_node *pos;
237 238
		int result;

239
		pos = rb_to_kn(*node);
240
		parent = *node;
241
		result = kernfs_sd_compare(kn, pos);
242
		if (result < 0)
243
			node = &pos->rb.rb_left;
244
		else if (result > 0)
245
			node = &pos->rb.rb_right;
246 247 248
		else
			return -EEXIST;
	}
J
Jianyu Zhan 已提交
249

250
	/* add new node and rebalance the tree */
251 252
	rb_link_node(&kn->rb, parent, node);
	rb_insert_color(&kn->rb, &kn->parent->dir.children);
J
Jianyu Zhan 已提交
253 254 255 256 257

	/* successfully added, account subdir number */
	if (kernfs_type(kn) == KERNFS_DIR)
		kn->parent->dir.subdirs++;

258 259 260 261
	return 0;
}

/**
262
 *	kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
263
 *	@kn: kernfs_node of interest
264
 *
265 266 267
 *	Try to unlink @kn from its sibling rbtree which starts from
 *	kn->parent->dir.children.  Returns %true if @kn was actually
 *	removed, %false if @kn wasn't on the rbtree.
268 269
 *
 *	Locking:
270
 *	mutex_lock(kernfs_mutex)
271
 */
272
static bool kernfs_unlink_sibling(struct kernfs_node *kn)
273
{
274 275 276
	if (RB_EMPTY_NODE(&kn->rb))
		return false;

T
Tejun Heo 已提交
277
	if (kernfs_type(kn) == KERNFS_DIR)
278
		kn->parent->dir.subdirs--;
279

280
	rb_erase(&kn->rb, &kn->parent->dir.children);
281 282
	RB_CLEAR_NODE(&kn->rb);
	return true;
283 284 285
}

/**
286
 *	kernfs_get_active - get an active reference to kernfs_node
287
 *	@kn: kernfs_node to get an active reference to
288
 *
289
 *	Get an active reference of @kn.  This function is noop if @kn
290 291 292
 *	is NULL.
 *
 *	RETURNS:
293
 *	Pointer to @kn on success, NULL on failure.
294
 */
295
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
296
{
297
	if (unlikely(!kn))
298 299
		return NULL;

300 301
	if (!atomic_inc_unless_negative(&kn->active))
		return NULL;
302

303
	if (kernfs_lockdep(kn))
304 305
		rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
	return kn;
306 307 308
}

/**
309
 *	kernfs_put_active - put an active reference to kernfs_node
310
 *	@kn: kernfs_node to put an active reference to
311
 *
312
 *	Put an active reference to @kn.  This function is noop if @kn
313 314
 *	is NULL.
 */
315
void kernfs_put_active(struct kernfs_node *kn)
316
{
317
	struct kernfs_root *root = kernfs_root(kn);
318 319
	int v;

320
	if (unlikely(!kn))
321 322
		return;

323
	if (kernfs_lockdep(kn))
324
		rwsem_release(&kn->dep_map, 1, _RET_IP_);
325
	v = atomic_dec_return(&kn->active);
T
Tejun Heo 已提交
326
	if (likely(v != KN_DEACTIVATED_BIAS))
327 328
		return;

329
	wake_up_all(&root->deactivate_waitq);
330 331 332
}

/**
T
Tejun Heo 已提交
333 334
 * kernfs_drain - drain kernfs_node
 * @kn: kernfs_node to drain
335
 *
T
Tejun Heo 已提交
336 337 338
 * Drain existing usages and nuke all existing mmaps of @kn.  Mutiple
 * removers may invoke this function concurrently on @kn and all will
 * return after draining is complete.
339
 */
T
Tejun Heo 已提交
340
static void kernfs_drain(struct kernfs_node *kn)
341
	__releases(&kernfs_mutex) __acquires(&kernfs_mutex)
342
{
343
	struct kernfs_root *root = kernfs_root(kn);
344

345
	lockdep_assert_held(&kernfs_mutex);
T
Tejun Heo 已提交
346
	WARN_ON_ONCE(kernfs_active(kn));
347

348
	mutex_unlock(&kernfs_mutex);
349

350
	if (kernfs_lockdep(kn)) {
351 352 353 354
		rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
		if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
			lock_contended(&kn->dep_map, _RET_IP_);
	}
355

356
	/* but everyone should wait for draining */
357 358
	wait_event(root->deactivate_waitq,
		   atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
359

360
	if (kernfs_lockdep(kn)) {
361 362 363
		lock_acquired(&kn->dep_map, _RET_IP_);
		rwsem_release(&kn->dep_map, 1, _RET_IP_);
	}
364

365 366
	kernfs_unmap_bin_file(kn);

367
	mutex_lock(&kernfs_mutex);
368 369 370
}

/**
371 372
 * kernfs_get - get a reference count on a kernfs_node
 * @kn: the target kernfs_node
373
 */
374
void kernfs_get(struct kernfs_node *kn)
375
{
376
	if (kn) {
377 378
		WARN_ON(!atomic_read(&kn->count));
		atomic_inc(&kn->count);
379 380 381 382 383
	}
}
EXPORT_SYMBOL_GPL(kernfs_get);

/**
384 385
 * kernfs_put - put a reference count on a kernfs_node
 * @kn: the target kernfs_node
386
 *
387
 * Put a reference count of @kn and destroy it if it reached zero.
388
 */
389
void kernfs_put(struct kernfs_node *kn)
390
{
391
	struct kernfs_node *parent;
392
	struct kernfs_root *root;
393

394
	if (!kn || !atomic_dec_and_test(&kn->count))
395
		return;
396
	root = kernfs_root(kn);
397
 repeat:
T
Tejun Heo 已提交
398 399
	/*
	 * Moving/renaming is always done while holding reference.
400
	 * kn->parent won't change beneath us.
401
	 */
402
	parent = kn->parent;
403

T
Tejun Heo 已提交
404 405 406
	WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
		  "kernfs_put: %s/%s: released with incorrect active_ref %d\n",
		  parent ? parent->name : "", kn->name, atomic_read(&kn->active));
407

T
Tejun Heo 已提交
408
	if (kernfs_type(kn) == KERNFS_LINK)
409
		kernfs_put(kn->symlink.target_kn);
410
	if (!(kn->flags & KERNFS_STATIC_NAME))
411 412 413 414 415 416
		kfree(kn->name);
	if (kn->iattr) {
		if (kn->iattr->ia_secdata)
			security_release_secctx(kn->iattr->ia_secdata,
						kn->iattr->ia_secdata_len);
		simple_xattrs_free(&kn->iattr->xattrs);
417
	}
418 419
	kfree(kn->iattr);
	ida_simple_remove(&root->ino_ida, kn->ino);
420
	kmem_cache_free(kernfs_node_cache, kn);
421

422 423
	kn = parent;
	if (kn) {
424
		if (atomic_dec_and_test(&kn->count))
425 426
			goto repeat;
	} else {
427
		/* just released the root kn, free @root too */
428
		ida_destroy(&root->ino_ida);
429 430
		kfree(root);
	}
431 432 433
}
EXPORT_SYMBOL_GPL(kernfs_put);

434
static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
435
{
436
	struct kernfs_node *kn;
437 438 439 440

	if (flags & LOOKUP_RCU)
		return -ECHILD;

T
Tejun Heo 已提交
441 442 443 444
	/* Always perform fresh lookup for negatives */
	if (!dentry->d_inode)
		goto out_bad_unlocked;

445
	kn = dentry->d_fsdata;
446
	mutex_lock(&kernfs_mutex);
447

T
Tejun Heo 已提交
448 449
	/* The kernfs node has been deactivated */
	if (!kernfs_active(kn))
450 451
		goto out_bad;

452
	/* The kernfs node has been moved? */
453
	if (dentry->d_parent->d_fsdata != kn->parent)
454 455
		goto out_bad;

456
	/* The kernfs node has been renamed */
457
	if (strcmp(dentry->d_name.name, kn->name) != 0)
458 459
		goto out_bad;

460
	/* The kernfs node has been moved to a different namespace */
461
	if (kn->parent && kernfs_ns_enabled(kn->parent) &&
462
	    kernfs_info(dentry->d_sb)->ns != kn->ns)
463 464
		goto out_bad;

465
	mutex_unlock(&kernfs_mutex);
466 467 468
out_valid:
	return 1;
out_bad:
469
	mutex_unlock(&kernfs_mutex);
T
Tejun Heo 已提交
470 471 472 473 474 475 476
out_bad_unlocked:
	/*
	 * @dentry doesn't match the underlying kernfs node, drop the
	 * dentry and force lookup.  If we have submounts we must allow the
	 * vfs caches to lie about the state of the filesystem to prevent
	 * leaks and other nasty things, so use check_submounts_and_drop()
	 * instead of d_drop().
477 478 479 480 481 482 483
	 */
	if (check_submounts_and_drop(dentry) != 0)
		goto out_valid;

	return 0;
}

484
static void kernfs_dop_release(struct dentry *dentry)
485 486 487 488
{
	kernfs_put(dentry->d_fsdata);
}

489
const struct dentry_operations kernfs_dops = {
490 491
	.d_revalidate	= kernfs_dop_revalidate,
	.d_release	= kernfs_dop_release,
492 493
};

494 495 496 497 498 499 500 501 502 503 504 505 506
/**
 * kernfs_node_from_dentry - determine kernfs_node associated with a dentry
 * @dentry: the dentry in question
 *
 * Return the kernfs_node associated with @dentry.  If @dentry is not a
 * kernfs one, %NULL is returned.
 *
 * While the returned kernfs_node will stay accessible as long as @dentry
 * is accessible, the returned node can be in any state and the caller is
 * fully responsible for determining what's accessible.
 */
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
{
L
Li Zefan 已提交
507
	if (dentry->d_sb->s_op == &kernfs_sops)
508 509 510 511
		return dentry->d_fsdata;
	return NULL;
}

512 513 514
/*
 * Allocate and initialize a kernfs_node under @root.
 *
 * Unless KERNFS_STATIC_NAME is set in @flags, @name is duplicated and the
 * node keeps the copy.  The new node gets an inode number from
 * @root->ino_ida, a reference count of one, starts out with ->active at
 * KN_DEACTIVATED_BIAS and isn't linked into any rbtree.
 *
 * Returns the new node, or NULL if any allocation fails.
 */
static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
					     const char *name, umode_t mode,
					     unsigned flags)
{
	char *dup_name = NULL;
	struct kernfs_node *kn;
	int ino;

	if (!(flags & KERNFS_STATIC_NAME)) {
		dup_name = kstrdup(name, GFP_KERNEL);
		if (!dup_name)
			return NULL;
		name = dup_name;
	}

	kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
	if (!kn)
		goto err_free_name;

	ino = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
	if (ino < 0)
		goto err_free_node;

	kn->ino = ino;
	kn->name = name;
	kn->mode = mode;
	kn->flags = flags;

	atomic_set(&kn->count, 1);
	atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
	RB_CLEAR_NODE(&kn->rb);

	return kn;

 err_free_node:
	kmem_cache_free(kernfs_node_cache, kn);
 err_free_name:
	kfree(dup_name);
	return NULL;
}

552 553 554 555 556 557 558 559 560 561 562 563 564 565
/*
 * Allocate a new node in @parent's hierarchy and point its ->parent at
 * @parent, taking a reference on @parent for that link.  The node is not
 * linked into @parent's children here.  Returns NULL on allocation
 * failure.
 */
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
				    const char *name, umode_t mode,
				    unsigned flags)
{
	struct kernfs_node *kn;

	kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
	if (!kn)
		return NULL;

	kernfs_get(parent);
	kn->parent = parent;

	return kn;
}

566
/**
567
 *	kernfs_add_one - add kernfs_node to parent without warning
568
 *	@kn: kernfs_node to be added
569
 *
570 571 572
 *	The caller must already have initialized @kn->parent.  This
 *	function increments nlink of the parent's inode if @kn is a
 *	directory and link into the children list of the parent.
573 574 575 576 577
 *
 *	RETURNS:
 *	0 on success, -EEXIST if entry with the given name already
 *	exists.
 */
T
Tejun Heo 已提交
578
int kernfs_add_one(struct kernfs_node *kn)
579
{
580
	struct kernfs_node *parent = kn->parent;
581
	struct kernfs_iattrs *ps_iattr;
T
Tejun Heo 已提交
582
	bool has_ns;
583 584
	int ret;

T
Tejun Heo 已提交
585 586 587 588 589 590 591
	mutex_lock(&kernfs_mutex);

	ret = -EINVAL;
	has_ns = kernfs_ns_enabled(parent);
	if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
		 has_ns ? "required" : "invalid", parent->name, kn->name))
		goto out_unlock;
592

T
Tejun Heo 已提交
593
	if (kernfs_type(parent) != KERNFS_DIR)
T
Tejun Heo 已提交
594
		goto out_unlock;
595

T
Tejun Heo 已提交
596
	ret = -ENOENT;
597
	if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
T
Tejun Heo 已提交
598
		goto out_unlock;
599

600
	kn->hash = kernfs_name_hash(kn->name, kn->ns);
601

602
	ret = kernfs_link_sibling(kn);
603
	if (ret)
T
Tejun Heo 已提交
604
		goto out_unlock;
605 606

	/* Update timestamps on the parent */
607
	ps_iattr = parent->iattr;
608 609 610 611 612
	if (ps_iattr) {
		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
	}

613 614 615 616 617 618 619 620 621 622 623 624 625
	mutex_unlock(&kernfs_mutex);

	/*
	 * Activate the new node unless CREATE_DEACTIVATED is requested.
	 * If not activated here, the kernfs user is responsible for
	 * activating the node with kernfs_activate().  A node which hasn't
	 * been activated is not visible to userland and its removal won't
	 * trigger deactivation.
	 */
	if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
		kernfs_activate(kn);
	return 0;

T
Tejun Heo 已提交
626
out_unlock:
627
	mutex_unlock(&kernfs_mutex);
T
Tejun Heo 已提交
628
	return ret;
629 630 631
}

/**
632 633
 * kernfs_find_ns - find kernfs_node with the given name
 * @parent: kernfs_node to search under
634 635 636
 * @name: name to look for
 * @ns: the namespace tag to use
 *
637 638
 * Look for kernfs_node with name @name under @parent.  Returns pointer to
 * the found kernfs_node on success, %NULL on failure.
639
 */
640 641 642
static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
					  const unsigned char *name,
					  const void *ns)
643
{
644
	struct rb_node *node = parent->dir.children.rb_node;
645
	bool has_ns = kernfs_ns_enabled(parent);
646 647
	unsigned int hash;

648
	lockdep_assert_held(&kernfs_mutex);
649 650

	if (has_ns != (bool)ns) {
651
		WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
652
		     has_ns ? "required" : "invalid", parent->name, name);
653 654 655
		return NULL;
	}

656
	hash = kernfs_name_hash(name, ns);
657
	while (node) {
658
		struct kernfs_node *kn;
659 660
		int result;

661
		kn = rb_to_kn(node);
662
		result = kernfs_name_compare(hash, name, ns, kn);
663 664 665 666 667
		if (result < 0)
			node = node->rb_left;
		else if (result > 0)
			node = node->rb_right;
		else
668
			return kn;
669 670 671 672 673
	}
	return NULL;
}

/**
674 675
 * kernfs_find_and_get_ns - find and get kernfs_node with the given name
 * @parent: kernfs_node to search under
676 677 678
 * @name: name to look for
 * @ns: the namespace tag to use
 *
679
 * Look for kernfs_node with name @name under @parent and get a reference
680
 * if found.  This function may sleep and returns pointer to the found
681
 * kernfs_node on success, %NULL on failure.
682
 */
683 684
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
					   const char *name, const void *ns)
685
{
686
	struct kernfs_node *kn;
687

688
	mutex_lock(&kernfs_mutex);
689 690
	kn = kernfs_find_ns(parent, name, ns);
	kernfs_get(kn);
691
	mutex_unlock(&kernfs_mutex);
692

693
	return kn;
694 695 696
}
EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);

697 698
/**
 * kernfs_create_root - create a new kernfs hierarchy
699
 * @scops: optional syscall operations for the hierarchy
700
 * @flags: KERNFS_ROOT_* flags
701 702 703 704 705
 * @priv: opaque data associated with the new directory
 *
 * Returns the root of the new hierarchy on success, ERR_PTR() value on
 * failure.
 */
706
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
707
				       unsigned int flags, void *priv)
708 709
{
	struct kernfs_root *root;
710
	struct kernfs_node *kn;
711 712 713 714 715

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root)
		return ERR_PTR(-ENOMEM);

716 717
	ida_init(&root->ino_ida);

718 719
	kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
			       KERNFS_DIR);
720
	if (!kn) {
721
		ida_destroy(&root->ino_ida);
722 723 724 725
		kfree(root);
		return ERR_PTR(-ENOMEM);
	}

726
	kn->priv = priv;
727
	kn->dir.root = root;
728

729
	root->syscall_ops = scops;
730
	root->flags = flags;
731
	root->kn = kn;
732
	init_waitqueue_head(&root->deactivate_waitq);
733

734 735 736
	if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
		kernfs_activate(kn);

737 738 739 740 741 742 743 744 745 746 747 748
	return root;
}

/**
 * kernfs_destroy_root - destroy a kernfs hierarchy
 * @root: root of the hierarchy to destroy
 *
 * Destroy the hierarchy anchored at @root by removing all existing
 * directories and destroying @root.
 */
void kernfs_destroy_root(struct kernfs_root *root)
{
749
	kernfs_remove(root->kn);	/* will also free @root */
750 751
}

752 753 754 755
/**
 * kernfs_create_dir_ns - create a directory
 * @parent: parent in which to create a new directory
 * @name: name of the new directory
756
 * @mode: mode of the new directory
757 758 759 760 761
 * @priv: opaque data associated with the new directory
 * @ns: optional namespace tag of the directory
 *
 * Returns the created node on success, ERR_PTR() value on failure.
 */
762
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
763 764
					 const char *name, umode_t mode,
					 void *priv, const void *ns)
765
{
766
	struct kernfs_node *kn;
767 768 769
	int rc;

	/* allocate */
770
	kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
771
	if (!kn)
772 773
		return ERR_PTR(-ENOMEM);

774 775
	kn->dir.root = parent->dir.root;
	kn->ns = ns;
776
	kn->priv = priv;
777 778

	/* link in */
T
Tejun Heo 已提交
779
	rc = kernfs_add_one(kn);
780
	if (!rc)
781
		return kn;
782

783
	kernfs_put(kn);
784 785 786
	return ERR_PTR(rc);
}

787 788 789
static struct dentry *kernfs_iop_lookup(struct inode *dir,
					struct dentry *dentry,
					unsigned int flags)
790
{
T
Tejun Heo 已提交
791
	struct dentry *ret;
792 793
	struct kernfs_node *parent = dentry->d_parent->d_fsdata;
	struct kernfs_node *kn;
794 795 796
	struct inode *inode;
	const void *ns = NULL;

797
	mutex_lock(&kernfs_mutex);
798

799
	if (kernfs_ns_enabled(parent))
800
		ns = kernfs_info(dir->i_sb)->ns;
801

802
	kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
803 804

	/* no such entry */
805
	if (!kn || !kernfs_active(kn)) {
T
Tejun Heo 已提交
806
		ret = NULL;
807 808
		goto out_unlock;
	}
809 810
	kernfs_get(kn);
	dentry->d_fsdata = kn;
811 812

	/* attach dentry and inode */
813
	inode = kernfs_get_inode(dir->i_sb, kn);
814 815 816 817 818 819 820 821
	if (!inode) {
		ret = ERR_PTR(-ENOMEM);
		goto out_unlock;
	}

	/* instantiate and hash dentry */
	ret = d_materialise_unique(dentry, inode);
 out_unlock:
822
	mutex_unlock(&kernfs_mutex);
823 824 825
	return ret;
}

T
Tejun Heo 已提交
826 827 828 829
/*
 * ->mkdir: forward the request to the hierarchy's mkdir syscall op while
 * holding an active reference on the parent node.  Returns -EPERM if the
 * hierarchy provides no mkdir op and -ENODEV if the parent can't be
 * activated.
 */
static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
			    umode_t mode)
{
	struct kernfs_node *parent = dir->i_private;
	struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
	int ret;

	if (!(scops && scops->mkdir))
		return -EPERM;

	if (!kernfs_get_active(parent))
		return -ENODEV;

	ret = scops->mkdir(parent, dentry->d_name.name, mode);
	kernfs_put_active(parent);

	return ret;
}

static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
{
	struct kernfs_node *kn  = dentry->d_fsdata;
848
	struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
849
	int ret;
T
Tejun Heo 已提交
850

851
	if (!scops || !scops->rmdir)
T
Tejun Heo 已提交
852 853
		return -EPERM;

854 855 856
	if (!kernfs_get_active(kn))
		return -ENODEV;

857
	ret = scops->rmdir(kn);
858 859 860

	kernfs_put_active(kn);
	return ret;
T
Tejun Heo 已提交
861 862 863 864 865 866 867
}

static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
			     struct inode *new_dir, struct dentry *new_dentry)
{
	struct kernfs_node *kn  = old_dentry->d_fsdata;
	struct kernfs_node *new_parent = new_dir->i_private;
868
	struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
869
	int ret;
T
Tejun Heo 已提交
870

871
	if (!scops || !scops->rename)
T
Tejun Heo 已提交
872 873
		return -EPERM;

874 875 876 877 878 879 880 881
	if (!kernfs_get_active(kn))
		return -ENODEV;

	if (!kernfs_get_active(new_parent)) {
		kernfs_put_active(kn);
		return -ENODEV;
	}

882
	ret = scops->rename(kn, new_parent, new_dentry->d_name.name);
883 884 885 886

	kernfs_put_active(new_parent);
	kernfs_put_active(kn);
	return ret;
T
Tejun Heo 已提交
887 888
}

889
const struct inode_operations kernfs_dir_iops = {
890 891 892 893 894 895 896 897
	.lookup		= kernfs_iop_lookup,
	.permission	= kernfs_iop_permission,
	.setattr	= kernfs_iop_setattr,
	.getattr	= kernfs_iop_getattr,
	.setxattr	= kernfs_iop_setxattr,
	.removexattr	= kernfs_iop_removexattr,
	.getxattr	= kernfs_iop_getxattr,
	.listxattr	= kernfs_iop_listxattr,
T
Tejun Heo 已提交
898 899 900 901

	.mkdir		= kernfs_iop_mkdir,
	.rmdir		= kernfs_iop_rmdir,
	.rename		= kernfs_iop_rename,
902 903
};

904
static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
905
{
906
	struct kernfs_node *last;
907 908 909 910 911 912

	while (true) {
		struct rb_node *rbn;

		last = pos;

T
Tejun Heo 已提交
913
		if (kernfs_type(pos) != KERNFS_DIR)
914 915
			break;

916
		rbn = rb_first(&pos->dir.children);
917 918 919
		if (!rbn)
			break;

920
		pos = rb_to_kn(rbn);
921 922 923 924 925 926
	}

	return last;
}

/**
927
 * kernfs_next_descendant_post - find the next descendant for post-order walk
928
 * @pos: the current position (%NULL to initiate traversal)
929
 * @root: kernfs_node whose descendants to walk
930 931 932 933 934
 *
 * Find the next descendant to visit for post-order traversal of @root's
 * descendants.  @root is included in the iteration and the last node to be
 * visited.
 */
935 936
static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
						       struct kernfs_node *root)
937 938 939
{
	struct rb_node *rbn;

940
	lockdep_assert_held(&kernfs_mutex);
941 942 943

	/* if first iteration, visit leftmost descendant which may be root */
	if (!pos)
944
		return kernfs_leftmost_descendant(root);
945 946 947 948 949 950

	/* if we visited @root, we're done */
	if (pos == root)
		return NULL;

	/* if there's an unvisited sibling, visit its leftmost descendant */
951
	rbn = rb_next(&pos->rb);
952
	if (rbn)
953
		return kernfs_leftmost_descendant(rb_to_kn(rbn));
954 955

	/* no sibling left, visit parent */
956
	return pos->parent;
957 958
}

959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992
/**
 * kernfs_activate - activate a node which started deactivated
 * @kn: kernfs_node whose subtree is to be activated
 *
 * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node
 * needs to be explicitly activated.  A node which hasn't been activated
 * isn't visible to userland and deactivation is skipped during its
 * removal.  This is useful to construct atomic init sequences where
 * creation of multiple nodes should either succeed or fail atomically.
 *
 * Nodes in the subtree which already have KERNFS_ACTIVATED set are left
 * alone.
 *
 * The caller is responsible for ensuring that this function is not called
 * after kernfs_remove*() is invoked on @kn.
 */
void kernfs_activate(struct kernfs_node *kn)
{
	struct kernfs_node *pos;

	mutex_lock(&kernfs_mutex);

	for (pos = kernfs_next_descendant_post(NULL, kn); pos;
	     pos = kernfs_next_descendant_post(pos, kn)) {
		if (pos->flags & KERNFS_ACTIVATED)
			continue;

		WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
		WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);

		/* lift the deactivation bias and remember that we did */
		atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
		pos->flags |= KERNFS_ACTIVATED;
	}

	mutex_unlock(&kernfs_mutex);
}

T
Tejun Heo 已提交
993
static void __kernfs_remove(struct kernfs_node *kn)
994
{
995 996 997
	struct kernfs_node *pos;

	lockdep_assert_held(&kernfs_mutex);
998

999 1000 1001 1002 1003 1004
	/*
	 * Short-circuit if non-root @kn has already finished removal.
	 * This is for kernfs_remove_self() which plays with active ref
	 * after removal.
	 */
	if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb)))
1005 1006
		return;

1007
	pr_debug("kernfs %s: removing\n", kn->name);
1008

T
Tejun Heo 已提交
1009
	/* prevent any new usage under @kn by deactivating all nodes */
1010 1011
	pos = NULL;
	while ((pos = kernfs_next_descendant_post(pos, kn)))
T
Tejun Heo 已提交
1012 1013
		if (kernfs_active(pos))
			atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
1014 1015

	/* deactivate and unlink the subtree node-by-node */
1016
	do {
1017 1018 1019
		pos = kernfs_leftmost_descendant(kn);

		/*
T
Tejun Heo 已提交
1020 1021 1022 1023
		 * kernfs_drain() drops kernfs_mutex temporarily and @pos's
		 * base ref could have been put by someone else by the time
		 * the function returns.  Make sure it doesn't go away
		 * underneath us.
1024 1025 1026
		 */
		kernfs_get(pos);

1027 1028 1029 1030 1031 1032 1033 1034 1035 1036
		/*
		 * Drain iff @kn was activated.  This avoids draining and
		 * its lockdep annotations for nodes which have never been
		 * activated and allows embedding kernfs_remove() in create
		 * error paths without worrying about draining.
		 */
		if (kn->flags & KERNFS_ACTIVATED)
			kernfs_drain(pos);
		else
			WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051

		/*
		 * kernfs_unlink_sibling() succeeds once per node.  Use it
		 * to decide who's responsible for cleanups.
		 */
		if (!pos->parent || kernfs_unlink_sibling(pos)) {
			struct kernfs_iattrs *ps_iattr =
				pos->parent ? pos->parent->iattr : NULL;

			/* update timestamps on the parent */
			if (ps_iattr) {
				ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
				ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
			}

T
Tejun Heo 已提交
1052
			kernfs_put(pos);
1053 1054 1055 1056
		}

		kernfs_put(pos);
	} while (pos != kn);
1057 1058 1059
}

/**
1060 1061
 * kernfs_remove - remove a kernfs_node recursively
 * @kn: the kernfs_node to remove
1062
 *
1063
 * Remove @kn along with all its subdirectories and files.
1064
 */
1065
void kernfs_remove(struct kernfs_node *kn)
1066
{
T
Tejun Heo 已提交
1067 1068 1069
	mutex_lock(&kernfs_mutex);
	__kernfs_remove(kn);
	mutex_unlock(&kernfs_mutex);
1070 1071
}

1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202
/**
 * kernfs_break_active_protection - break out of active protection
 * @kn: the self kernfs_node
 *
 * The caller must be running off of a kernfs operation which is invoked
 * with an active reference - e.g. one of kernfs_ops.  Each invocation of
 * this function must also be matched with an invocation of
 * kernfs_unbreak_active_protection().
 *
 * This function releases the active reference of @kn the caller is
 * holding.  Once this function is called, @kn may be removed at any point
 * and the caller is solely responsible for ensuring that the objects it
 * dereferences are accessible.
 */
void kernfs_break_active_protection(struct kernfs_node *kn)
{
	/*
	 * Take ourselves out of the active ref dependency chain.  If
	 * we're called without an active ref, lockdep will complain.
	 */
	kernfs_put_active(kn);
}

/**
 * kernfs_unbreak_active_protection - undo kernfs_break_active_protection()
 * @kn: the self kernfs_node
 *
 * If kernfs_break_active_protection() was called, this function must be
 * invoked before finishing the kernfs operation.  Note that while this
 * function restores the active reference, it doesn't and can't actually
 * restore the active protection - @kn may already have been removed or
 * be in the process of being removed.  Once
 * kernfs_break_active_protection() is invoked, that protection is
 * irreversibly gone for the kernfs operation instance.
 *
 * While this function may be called at any point after
 * kernfs_break_active_protection() is invoked, its most useful location
 * would be right before the enclosing kernfs operation returns.
 */
void kernfs_unbreak_active_protection(struct kernfs_node *kn)
{
	/*
	 * @kn->active could be in any state; however, the increment we do
	 * here will be undone as soon as the enclosing kernfs operation
	 * finishes and this temporary bump can't break anything.  If @kn
	 * is alive, nothing changes.  If @kn is being deactivated, the
	 * soon-to-follow put will either finish deactivation or restore
	 * deactivated state.  If @kn is already removed, the temporary
	 * bump is guaranteed to be gone before @kn is released.
	 */
	atomic_inc(&kn->active);
	/* re-establish the lockdep annotation for nodes that are tracked */
	if (kernfs_lockdep(kn))
		rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
}

/**
 * kernfs_remove_self - remove a kernfs_node from its own method
 * @kn: the self kernfs_node to remove
 *
 * The caller must be running off of a kernfs operation which is invoked
 * with an active reference - e.g. one of kernfs_ops.  This can be used to
 * implement a file operation which deletes itself.
 *
 * For example, the "delete" file for a sysfs device directory can be
 * implemented by invoking kernfs_remove_self() on the "delete" file
 * itself.  This function breaks the circular dependency of trying to
 * deactivate self while holding an active ref itself.  It isn't necessary
 * to modify the usual removal path to use kernfs_remove_self().  The
 * "delete" implementation can simply invoke kernfs_remove_self() on self
 * before proceeding with the usual removal path.  kernfs will ignore later
 * kernfs_remove() on self.
 *
 * kernfs_remove_self() can be called multiple times concurrently on the
 * same kernfs_node.  Only the first one actually performs removal and
 * returns %true.  All others will wait until the kernfs operation which
 * won self-removal finishes and return %false.  Note that the losers wait
 * for the completion of not only the winning kernfs_remove_self() but also
 * the whole kernfs_ops which won the arbitration.  This can be used to
 * guarantee, for example, all concurrent writes to a "delete" file to
 * finish only after the whole operation is complete.
 */
bool kernfs_remove_self(struct kernfs_node *kn)
{
	bool ret;

	mutex_lock(&kernfs_mutex);
	/* drop our own active ref so that removal doesn't wait on ourselves */
	kernfs_break_active_protection(kn);

	/*
	 * SUICIDAL is used to arbitrate among competing invocations.  Only
	 * the first one will actually perform removal.  When the removal
	 * is complete, SUICIDED is set and the active ref is restored
	 * while holding kernfs_mutex.  The ones which lost arbitration
	 * wait for SUICIDED && drained which can happen only after the
	 * enclosing kernfs operation which executed the winning instance
	 * of kernfs_remove_self() finished.
	 */
	if (!(kn->flags & KERNFS_SUICIDAL)) {
		/* winner: claim the removal, perform it, then mark it done */
		kn->flags |= KERNFS_SUICIDAL;
		__kernfs_remove(kn);
		kn->flags |= KERNFS_SUICIDED;
		ret = true;
	} else {
		/* loser: sleep on the root's deactivate_waitq until done */
		wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq;
		DEFINE_WAIT(wait);

		while (true) {
			/* queue ourselves before testing the condition */
			prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);

			if ((kn->flags & KERNFS_SUICIDED) &&
			    atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
				break;

			/* kernfs_mutex is not held across the sleep */
			mutex_unlock(&kernfs_mutex);
			schedule();
			mutex_lock(&kernfs_mutex);
		}
		finish_wait(waitq, &wait);
		/* by now @kn is expected to be unlinked from its parent */
		WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
		ret = false;
	}

	/*
	 * This must be done while holding kernfs_mutex; otherwise, waiting
	 * for SUICIDED && deactivated could finish prematurely.
	 */
	kernfs_unbreak_active_protection(kn);

	mutex_unlock(&kernfs_mutex);
	return ret;
}

1203
/**
1204 1205 1206 1207
 * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
 * @parent: parent of the target
 * @name: name of the kernfs_node to remove
 * @ns: namespace tag of the kernfs_node to remove
1208
 *
1209 1210
 * Look for the kernfs_node with @name and @ns under @parent and remove it.
 * Returns 0 on success, -ENOENT if such entry doesn't exist.
1211
 */
1212
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
1213 1214
			     const void *ns)
{
1215
	struct kernfs_node *kn;
1216

1217
	if (!parent) {
1218
		WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
1219 1220 1221 1222
			name);
		return -ENOENT;
	}

T
Tejun Heo 已提交
1223
	mutex_lock(&kernfs_mutex);
1224

1225 1226
	kn = kernfs_find_ns(parent, name, ns);
	if (kn)
T
Tejun Heo 已提交
1227
		__kernfs_remove(kn);
1228

T
Tejun Heo 已提交
1229
	mutex_unlock(&kernfs_mutex);
1230

1231
	if (kn)
1232 1233 1234 1235 1236 1237 1238
		return 0;
	else
		return -ENOENT;
}

/**
 * kernfs_rename_ns - move and rename a kernfs_node
1239
 * @kn: target node
1240 1241 1242 1243
 * @new_parent: new parent to put @sd under
 * @new_name: new name
 * @new_ns: new namespace tag
 */
1244
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1245 1246
		     const char *new_name, const void *new_ns)
{
1247 1248
	struct kernfs_node *old_parent;
	const char *old_name = NULL;
1249 1250
	int error;

1251 1252 1253 1254
	/* can't move or rename root */
	if (!kn->parent)
		return -EINVAL;

1255 1256
	mutex_lock(&kernfs_mutex);

1257
	error = -ENOENT;
T
Tejun Heo 已提交
1258
	if (!kernfs_active(kn) || !kernfs_active(new_parent))
1259 1260
		goto out;

1261
	error = 0;
1262 1263
	if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
	    (strcmp(kn->name, new_name) == 0))
1264
		goto out;	/* nothing to rename */
1265 1266 1267

	error = -EEXIST;
	if (kernfs_find_ns(new_parent, new_name, new_ns))
1268
		goto out;
1269

1270
	/* rename kernfs_node */
1271
	if (strcmp(kn->name, new_name) != 0) {
1272 1273 1274
		error = -ENOMEM;
		new_name = kstrdup(new_name, GFP_KERNEL);
		if (!new_name)
1275
			goto out;
1276 1277
	} else {
		new_name = NULL;
1278 1279 1280 1281 1282
	}

	/*
	 * Move to the appropriate place in the appropriate directories rbtree.
	 */
1283
	kernfs_unlink_sibling(kn);
1284
	kernfs_get(new_parent);
1285 1286 1287 1288 1289

	/* rename_lock protects ->parent and ->name accessors */
	spin_lock_irq(&kernfs_rename_lock);

	old_parent = kn->parent;
1290
	kn->parent = new_parent;
1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301

	kn->ns = new_ns;
	if (new_name) {
		if (!(kn->flags & KERNFS_STATIC_NAME))
			old_name = kn->name;
		kn->flags &= ~KERNFS_STATIC_NAME;
		kn->name = new_name;
	}

	spin_unlock_irq(&kernfs_rename_lock);

1302
	kn->hash = kernfs_name_hash(kn->name, kn->ns);
1303
	kernfs_link_sibling(kn);
1304

1305 1306 1307
	kernfs_put(old_parent);
	kfree(old_name);

1308
	error = 0;
1309
 out:
1310
	mutex_unlock(&kernfs_mutex);
1311 1312 1313 1314
	return error;
}

/* Relationship between s_mode and the DT_xxx types */
1315
static inline unsigned char dt_type(struct kernfs_node *kn)
1316
{
1317
	return (kn->mode >> 12) & 15;
1318 1319
}

1320
static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
1321 1322 1323 1324 1325
{
	kernfs_put(filp->private_data);
	return 0;
}

1326
static struct kernfs_node *kernfs_dir_pos(const void *ns,
1327
	struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
1328 1329
{
	if (pos) {
T
Tejun Heo 已提交
1330
		int valid = kernfs_active(pos) &&
1331
			pos->parent == parent && hash == pos->hash;
1332 1333 1334 1335 1336
		kernfs_put(pos);
		if (!valid)
			pos = NULL;
	}
	if (!pos && (hash > 1) && (hash < INT_MAX)) {
1337
		struct rb_node *node = parent->dir.children.rb_node;
1338
		while (node) {
1339
			pos = rb_to_kn(node);
1340

1341
			if (hash < pos->hash)
1342
				node = node->rb_left;
1343
			else if (hash > pos->hash)
1344 1345 1346 1347 1348
				node = node->rb_right;
			else
				break;
		}
	}
1349 1350
	/* Skip over entries which are dying/dead or in the wrong namespace */
	while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
1351
		struct rb_node *node = rb_next(&pos->rb);
1352 1353 1354
		if (!node)
			pos = NULL;
		else
1355
			pos = rb_to_kn(node);
1356 1357 1358 1359
	}
	return pos;
}

1360
static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
1361
	struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
1362
{
1363
	pos = kernfs_dir_pos(ns, parent, ino, pos);
1364
	if (pos) {
1365
		do {
1366
			struct rb_node *node = rb_next(&pos->rb);
1367 1368 1369
			if (!node)
				pos = NULL;
			else
1370
				pos = rb_to_kn(node);
1371 1372
		} while (pos && (!kernfs_active(pos) || pos->ns != ns));
	}
1373 1374 1375
	return pos;
}

1376
static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
1377 1378
{
	struct dentry *dentry = file->f_path.dentry;
1379 1380
	struct kernfs_node *parent = dentry->d_fsdata;
	struct kernfs_node *pos = file->private_data;
1381 1382 1383 1384
	const void *ns = NULL;

	if (!dir_emit_dots(file, ctx))
		return 0;
1385
	mutex_lock(&kernfs_mutex);
1386

1387
	if (kernfs_ns_enabled(parent))
1388
		ns = kernfs_info(dentry->d_sb)->ns;
1389

1390
	for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
1391
	     pos;
1392
	     pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
1393
		const char *name = pos->name;
1394 1395
		unsigned int type = dt_type(pos);
		int len = strlen(name);
1396
		ino_t ino = pos->ino;
1397

1398
		ctx->pos = pos->hash;
1399 1400 1401
		file->private_data = pos;
		kernfs_get(pos);

1402
		mutex_unlock(&kernfs_mutex);
1403 1404
		if (!dir_emit(ctx, name, len, ino, type))
			return 0;
1405
		mutex_lock(&kernfs_mutex);
1406
	}
1407
	mutex_unlock(&kernfs_mutex);
1408 1409 1410 1411 1412
	file->private_data = NULL;
	ctx->pos = INT_MAX;
	return 0;
}

1413 1414
static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
				    int whence)
1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425
{
	struct inode *inode = file_inode(file);
	loff_t ret;

	mutex_lock(&inode->i_mutex);
	ret = generic_file_llseek(file, offset, whence);
	mutex_unlock(&inode->i_mutex);

	return ret;
}

1426
const struct file_operations kernfs_dir_fops = {
1427
	.read		= generic_read_dir,
1428 1429 1430
	.iterate	= kernfs_fop_readdir,
	.release	= kernfs_dir_fop_release,
	.llseek		= kernfs_dir_fop_llseek,
1431
};