dir.c 26.9 KB
Newer Older
L
Linus Torvalds 已提交
1
/*
T
Tejun Heo 已提交
2 3 4 5 6 7 8 9 10
 * fs/sysfs/dir.c - sysfs core and dir operation implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
 *
 * This file is released under the GPLv2.
 *
 * Please see Documentation/filesystems/sysfs.txt for more information.
L
Linus Torvalds 已提交
11 12 13 14 15 16 17 18
 */

#undef DEBUG

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/module.h>
#include <linux/kobject.h>
19
#include <linux/namei.h>
20
#include <linux/idr.h>
21
#include <linux/completion.h>
D
Dave Young 已提交
22
#include <linux/mutex.h>
23
#include <linux/slab.h>
24
#include <linux/security.h>
25
#include <linux/hash.h>
L
Linus Torvalds 已提交
26 27
#include "sysfs.h"

28
DEFINE_MUTEX(sysfs_mutex);
29
DEFINE_SPINLOCK(sysfs_symlink_target_lock);
L
Linus Torvalds 已提交
30

31
#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb)
32

R
Roel Kluin 已提交
33
static DEFINE_SPINLOCK(sysfs_ino_lock);
34 35
static DEFINE_IDA(sysfs_ino_ida);

36
/**
 *	sysfs_name_hash
 *	@name: Null terminated string to hash
 *	@ns:   Namespace tag to hash
 *
 *	Returns 31 bit hash of ns + name (so it fits in an off_t ).  The
 *	values 0, 1 and INT_MAX are reserved for magic directory entries
 *	and are never returned.
 */
static unsigned int sysfs_name_hash(const char *name, const void *ns)
{
	unsigned long hash = init_name_hash();
	unsigned int len = strlen(name);
	while (len--)
		hash = partial_name_hash(*name++, hash);
	hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
	hash &= 0x7fffffffU;
	/*
	 * Reserve hash numbers 0, 1 and INT_MAX for magic directory entries.
	 * Both 0 and 1 have to be moved out of the way: sysfs_dir_pos() only
	 * looks up positions greater than 1, so an entry hashed to 1 could
	 * not be found again by position.
	 */
	if (hash < 2)
		hash += 2;
	if (hash >= INT_MAX)
		hash = INT_MAX - 1;
	return hash;
}

T
Tejun Heo 已提交
59 60
static int sysfs_name_compare(unsigned int hash, const char *name,
			      const void *ns, const struct sysfs_dirent *sd)
61 62 63 64 65 66 67 68 69 70 71
{
	if (hash != sd->s_hash)
		return hash - sd->s_hash;
	if (ns != sd->s_ns)
		return ns - sd->s_ns;
	return strcmp(name, sd->s_name);
}

static int sysfs_sd_compare(const struct sysfs_dirent *left,
			    const struct sysfs_dirent *right)
{
T
Tejun Heo 已提交
72
	return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns,
73 74 75 76
				  right);
}

/**
77
 *	sysfs_link_sibling - link sysfs_dirent into sibling rbtree
78 79
 *	@sd: sysfs_dirent of interest
 *
80
 *	Link @sd into its sibling rbtree which starts from
81
 *	sd->s_parent->s_dir.children.
82 83
 *
 *	Locking:
84
 *	mutex_lock(sysfs_mutex)
85 86 87
 *
 *	RETURNS:
 *	0 on susccess -EEXIST on failure.
88
 */
89
static int sysfs_link_sibling(struct sysfs_dirent *sd)
90
{
91 92
	struct rb_node **node = &sd->s_parent->s_dir.children.rb_node;
	struct rb_node *parent = NULL;
93

94 95 96
	if (sysfs_type(sd) == SYSFS_DIR)
		sd->s_parent->s_dir.subdirs++;

97 98 99 100 101 102 103 104 105 106 107 108 109
	while (*node) {
		struct sysfs_dirent *pos;
		int result;

		pos = to_sysfs_dirent(*node);
		parent = *node;
		result = sysfs_sd_compare(sd, pos);
		if (result < 0)
			node = &pos->s_rb.rb_left;
		else if (result > 0)
			node = &pos->s_rb.rb_right;
		else
			return -EEXIST;
110
	}
111 112 113 114
	/* add new node and rebalance the tree */
	rb_link_node(&sd->s_rb, parent, node);
	rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children);
	return 0;
115 116 117
}

/**
118
 *	sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree
119 120
 *	@sd: sysfs_dirent of interest
 *
121
 *	Unlink @sd from its sibling rbtree which starts from
122
 *	sd->s_parent->s_dir.children.
123 124
 *
 *	Locking:
125
 *	mutex_lock(sysfs_mutex)
126
 */
127
static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
128
{
129 130 131
	if (sysfs_type(sd) == SYSFS_DIR)
		sd->s_parent->s_dir.subdirs--;

132
	rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
133 134
}

135 136 137 138 139 140 141 142 143 144
/**
 *	sysfs_get_active - get an active reference to sysfs_dirent
 *	@sd: sysfs_dirent to get an active reference to
 *
 *	Get an active reference of @sd.  This function is noop if @sd
 *	is NULL.
 *
 *	RETURNS:
 *	Pointer to @sd on success, NULL on failure.
 */
145
struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
146
{
147 148 149
	if (unlikely(!sd))
		return NULL;

150 151
	if (!atomic_inc_unless_negative(&sd->s_active))
		return NULL;
152

153
	if (sd->s_flags & SYSFS_FLAG_LOCKDEP)
154 155
		rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
	return sd;
156 157 158 159 160 161 162 163 164
}

/**
 *	sysfs_put_active - put an active reference to sysfs_dirent
 *	@sd: sysfs_dirent to put an active reference to
 *
 *	Put an active reference to @sd.  This function is noop if @sd
 *	is NULL.
 */
165
void sysfs_put_active(struct sysfs_dirent *sd)
166
{
167 168 169 170 171
	int v;

	if (unlikely(!sd))
		return;

172
	if (sd->s_flags & SYSFS_FLAG_LOCKDEP)
173
		rwsem_release(&sd->dep_map, 1, _RET_IP_);
174 175 176 177 178
	v = atomic_dec_return(&sd->s_active);
	if (likely(v != SD_DEACTIVATED_BIAS))
		return;

	/* atomic_dec_return() is a mb(), we'll always see the updated
M
Mikulas Patocka 已提交
179
	 * sd->u.completion.
180
	 */
M
Mikulas Patocka 已提交
181
	complete(sd->u.completion);
182 183 184 185 186 187
}

/**
 *	sysfs_deactivate - deactivate sysfs_dirent
 *	@sd: sysfs_dirent to deactivate
 *
188
 *	Deny new active references and drain existing ones.
189
 */
190
static void sysfs_deactivate(struct sysfs_dirent *sd)
191
{
192 193
	DECLARE_COMPLETION_ONSTACK(wait);
	int v;
194

M
Mikulas Patocka 已提交
195
	BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));
196 197 198 199

	if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
		return;

M
Mikulas Patocka 已提交
200
	sd->u.completion = (void *)&wait;
201

202
	rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
203
	/* atomic_add_return() is a mb(), put_active() will always see
M
Mikulas Patocka 已提交
204
	 * the updated sd->u.completion.
205
	 */
206 207
	v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);

208 209
	if (v != SD_DEACTIVATED_BIAS) {
		lock_contended(&sd->dep_map, _RET_IP_);
210
		wait_for_completion(&wait);
211
	}
212

213 214
	lock_acquired(&sd->dep_map, _RET_IP_);
	rwsem_release(&sd->dep_map, 1, _RET_IP_);
215 216
}

217
static int sysfs_alloc_ino(unsigned int *pino)
218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235
{
	int ino, rc;

 retry:
	spin_lock(&sysfs_ino_lock);
	rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino);
	spin_unlock(&sysfs_ino_lock);

	if (rc == -EAGAIN) {
		if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL))
			goto retry;
		rc = -ENOMEM;
	}

	*pino = ino;
	return rc;
}

236
static void sysfs_free_ino(unsigned int ino)
237 238 239 240 241 242
{
	spin_lock(&sysfs_ino_lock);
	ida_remove(&sysfs_ino_ida, ino);
	spin_unlock(&sysfs_ino_lock);
}

243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262
/**
 * kernfs_get - get a reference count on a sysfs_dirent
 * @sd: the target sysfs_dirent
 *
 * Increment @sd->s_count.  A NULL @sd is ignored.  A warning is triggered
 * if the count is already zero when the reference is taken.
 */
void kernfs_get(struct sysfs_dirent *sd)
{
	if (!sd)
		return;

	WARN_ON(!atomic_read(&sd->s_count));
	atomic_inc(&sd->s_count);
}
EXPORT_SYMBOL_GPL(kernfs_get);

/**
 * kernfs_put - put a reference count on a sysfs_dirent
 * @sd: the target sysfs_dirent
 *
 * Put a reference count of @sd and destroy it if it reached zero.
 */
void kernfs_put(struct sysfs_dirent *sd)
263
{
T
Tejun Heo 已提交
264 265
	struct sysfs_dirent *parent_sd;

266 267
	if (!sd || !atomic_dec_and_test(&sd->s_count))
		return;
T
Tejun Heo 已提交
268
 repeat:
269 270 271
	/* Moving/renaming is always done while holding reference.
	 * sd->s_parent won't change beneath us.
	 */
T
Tejun Heo 已提交
272 273
	parent_sd = sd->s_parent;

274 275 276 277
	WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED),
		"sysfs: free using entry: %s/%s\n",
		parent_sd ? parent_sd->s_name : "", sd->s_name);

278
	if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
279
		kernfs_put(sd->s_symlink.target_sd);
280
	if (sysfs_type(sd) & SYSFS_COPY_NAME)
T
Tejun Heo 已提交
281
		kfree(sd->s_name);
282 283 284
	if (sd->s_iattr && sd->s_iattr->ia_secdata)
		security_release_secctx(sd->s_iattr->ia_secdata,
					sd->s_iattr->ia_secdata_len);
285
	kfree(sd->s_iattr);
286
	sysfs_free_ino(sd->s_ino);
287
	kmem_cache_free(sysfs_dir_cachep, sd);
T
Tejun Heo 已提交
288 289 290 291

	sd = parent_sd;
	if (sd && atomic_dec_and_test(&sd->s_count))
		goto repeat;
292
}
293
EXPORT_SYMBOL_GPL(kernfs_put);
294

N
Nick Piggin 已提交
295
static int sysfs_dentry_delete(const struct dentry *dentry)
296 297
{
	struct sysfs_dirent *sd = dentry->d_fsdata;
298
	return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED));
299 300
}

301
static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
302
{
303
	struct sysfs_dirent *sd;
304

305
	if (flags & LOOKUP_RCU)
306 307 308
		return -ECHILD;

	sd = dentry->d_fsdata;
309 310 311 312 313 314
	mutex_lock(&sysfs_mutex);

	/* The sysfs dirent has been deleted */
	if (sd->s_flags & SYSFS_FLAG_REMOVED)
		goto out_bad;

315 316 317 318 319 320 321 322
	/* The sysfs dirent has been moved? */
	if (dentry->d_parent->d_fsdata != sd->s_parent)
		goto out_bad;

	/* The sysfs dirent has been renamed */
	if (strcmp(dentry->d_name.name, sd->s_name) != 0)
		goto out_bad;

323
	/* The sysfs dirent has been moved to a different namespace */
324 325 326
	if (sd->s_parent && (sd->s_parent->s_flags & SYSFS_FLAG_NS) &&
	    sysfs_info(dentry->d_sb)->ns != sd->s_ns)
		goto out_bad;
327

328 329 330 331 332 333 334
	mutex_unlock(&sysfs_mutex);
out_valid:
	return 1;
out_bad:
	/* Remove the dentry from the dcache hashes.
	 * If this is a deleted dentry we use d_drop instead of d_delete
	 * so sysfs doesn't need to cope with negative dentries.
335 336 337 338 339 340
	 *
	 * If this is a dentry that has simply been renamed we
	 * use d_drop to remove it from the dcache lookup on its
	 * old parent.  If this dentry persists later when a lookup
	 * is performed at its new name the dentry will be readded
	 * to the dcache hashes.
341 342
	 */
	mutex_unlock(&sysfs_mutex);
343 344 345 346 347 348 349 350

	/* If we have submounts we must allow the vfs caches
	 * to lie about the state of the filesystem to prevent
	 * leaks and other nasty things.
	 */
	if (check_submounts_and_drop(dentry) != 0)
		goto out_valid;

351 352 353
	return 0;
}

354
static void sysfs_dentry_release(struct dentry *dentry)
L
Linus Torvalds 已提交
355
{
356
	kernfs_put(dentry->d_fsdata);
L
Linus Torvalds 已提交
357 358
}

359
const struct dentry_operations sysfs_dentry_ops = {
360 361
	.d_revalidate	= sysfs_dentry_revalidate,
	.d_delete	= sysfs_dentry_delete,
362
	.d_release	= sysfs_dentry_release,
L
Linus Torvalds 已提交
363 364
};

365
/*
 * Allocate and initialize a new sysfs_dirent.
 *
 * @name is duplicated when @type has SYSFS_COPY_NAME set, otherwise the
 * caller's pointer is stored as is.  The new dirent starts with one
 * reference, no active references and SYSFS_FLAG_REMOVED set.
 *
 * Returns the new sysfs_dirent, or NULL if any allocation failed.
 */
struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
{
	struct sysfs_dirent *sd;
	char *dup_name = NULL;

	if (type & SYSFS_COPY_NAME) {
		dup_name = kstrdup(name, GFP_KERNEL);
		if (!dup_name)
			return NULL;
		name = dup_name;
	}

	sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL);
	if (!sd)
		goto err_free_name;

	if (sysfs_alloc_ino(&sd->s_ino))
		goto err_free_sd;

	atomic_set(&sd->s_count, 1);
	atomic_set(&sd->s_active, 0);

	sd->s_name = name;
	sd->s_mode = mode;
	sd->s_flags = type | SYSFS_FLAG_REMOVED;

	return sd;

 err_free_sd:
	kmem_cache_free(sysfs_dir_cachep, sd);
 err_free_name:
	kfree(dup_name);
	return NULL;
}

399
/**
400 401
 *	sysfs_addrm_start - prepare for sysfs_dirent add/remove
 *	@acxt: pointer to sysfs_addrm_cxt to be used
402
 *
403 404 405
 *	This function is called when the caller is about to add or remove
 *	sysfs_dirent.  This function acquires sysfs_mutex.  @acxt is used
 *	to keep and pass context to other addrm functions.
406 407
 *
 *	LOCKING:
408
 *	Kernel thread context (may sleep).  sysfs_mutex is locked on
409
 *	return.
410
 */
411 412
void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt)
	__acquires(sysfs_mutex)
413
{
414 415 416 417 418 419
	memset(acxt, 0, sizeof(*acxt));

	mutex_lock(&sysfs_mutex);
}

/**
T
Tejun Heo 已提交
420
 *	sysfs_add_one - add sysfs_dirent to parent without warning
421 422
 *	@acxt: addrm context to use
 *	@sd: sysfs_dirent to be added
423
 *	@parent_sd: the parent sysfs_dirent to add @sd to
424
 *
425 426 427
 *	Get @parent_sd and set @sd->s_parent to it and increment nlink of
 *	the parent inode if @sd is a directory and link into the children
 *	list of the parent.
428 429 430 431 432 433 434
 *
 *	This function should be called between calls to
 *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
 *	passed the same @acxt as passed to sysfs_addrm_start().
 *
 *	LOCKING:
 *	Determined by sysfs_addrm_start().
435 436 437 438
 *
 *	RETURNS:
 *	0 on success, -EEXIST if entry with the given name already
 *	exists.
439
 */
T
Tejun Heo 已提交
440 441
int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
		  struct sysfs_dirent *parent_sd)
442
{
443
	bool has_ns = parent_sd->s_flags & SYSFS_FLAG_NS;
444
	struct sysfs_inode_attrs *ps_iattr;
445
	int ret;
446

447
	if (has_ns != (bool)sd->s_ns) {
448
		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
449 450
		     has_ns ? "required" : "invalid",
		     parent_sd->s_name, sd->s_name);
451 452 453
		return -EINVAL;
	}

454 455 456
	if (sysfs_type(parent_sd) != SYSFS_DIR)
		return -EINVAL;

T
Tejun Heo 已提交
457
	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
458 459
	sd->s_parent = parent_sd;
	kernfs_get(parent_sd);
460

461 462 463
	ret = sysfs_link_sibling(sd);
	if (ret)
		return ret;
464

465
	/* Update timestamps on the parent */
466
	ps_iattr = parent_sd->s_iattr;
467 468 469 470 471
	if (ps_iattr) {
		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
	}

472 473 474
	/* Mark the entry added into directory tree */
	sd->s_flags &= ~SYSFS_FLAG_REMOVED;

475
	return 0;
476 477
}

478 479 480
/**
 *	sysfs_pathname - return full path to sysfs dirent
 *	@sd: sysfs_dirent whose path we want
481
 *	@path: caller allocated buffer of size PATH_MAX
482 483 484 485 486 487 488 489
 *
 *	Gives the name "/" to the sysfs_root entry; any path returned
 *	is relative to wherever sysfs is mounted.
 */
static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
{
	if (sd->s_parent) {
		sysfs_pathname(sd->s_parent, path);
490
		strlcat(path, "/", PATH_MAX);
491
	}
492
	strlcat(path, sd->s_name, PATH_MAX);
493 494 495
	return path;
}

496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512
/*
 * Warn about an attempt to create duplicate entry @name under @parent.
 * The full path is printed when a scratch buffer can be allocated,
 * otherwise only @name is shown.
 */
void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name)
{
	char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
	const char *shown = name;

	if (buf) {
		sysfs_pathname(parent, buf);
		strlcat(buf, "/", PATH_MAX);
		strlcat(buf, name, PATH_MAX);
		shown = buf;
	}

	WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s'\n",
	     shown);

	kfree(buf);
}

513 514 515
/**
 *	sysfs_remove_one - remove sysfs_dirent from parent
 *	@acxt: addrm context to use
516
 *	@sd: sysfs_dirent to be removed
517 518
 *
 *	Mark @sd removed and drop nlink of parent inode if @sd is a
519
 *	directory.  @sd is unlinked from the children list.
520 521 522 523 524 525 526 527
 *
 *	This function should be called between calls to
 *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
 *	passed the same @acxt as passed to sysfs_addrm_start().
 *
 *	LOCKING:
 *	Determined by sysfs_addrm_start().
 */
T
Tejun Heo 已提交
528 529
static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
			     struct sysfs_dirent *sd)
530
{
531 532
	struct sysfs_inode_attrs *ps_iattr;

533 534 535 536 537 538
	/*
	 * Removal can be called multiple times on the same node.  Only the
	 * first invocation is effective and puts the base ref.
	 */
	if (sd->s_flags & SYSFS_FLAG_REMOVED)
		return;
539 540

	sysfs_unlink_sibling(sd);
541

542
	/* Update timestamps on the parent */
543
	ps_iattr = sd->s_parent->s_iattr;
544 545 546 547 548
	if (ps_iattr) {
		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
	}

549
	sd->s_flags |= SYSFS_FLAG_REMOVED;
M
Mikulas Patocka 已提交
550
	sd->u.removed_list = acxt->removed;
551
	acxt->removed = sd;
552 553
}

554 555 556 557 558 559
/**
 *	sysfs_addrm_finish - finish up sysfs_dirent add/remove
 *	@acxt: addrm context to finish up
 *
 *	Finish up sysfs_dirent add/remove.  Resources acquired by
 *	sysfs_addrm_start() are released and removed sysfs_dirents are
560
 *	cleaned up.
561 562
 *
 *	LOCKING:
563
 *	sysfs_mutex is released.
564
 */
565
void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
566
	__releases(sysfs_mutex)
567 568 569 570 571 572 573 574
{
	/* release resources acquired by sysfs_addrm_start() */
	mutex_unlock(&sysfs_mutex);

	/* kill removed sysfs_dirents */
	while (acxt->removed) {
		struct sysfs_dirent *sd = acxt->removed;

M
Mikulas Patocka 已提交
575
		acxt->removed = sd->u.removed_list;
576 577

		sysfs_deactivate(sd);
578
		sysfs_unmap_bin_file(sd);
579
		kernfs_put(sd);
T
Tejun Heo 已提交
580
	}
581 582
}

583
/**
584 585 586 587
 * kernfs_find_ns - find sysfs_dirent with the given name
 * @parent: sysfs_dirent to search under
 * @name: name to look for
 * @ns: the namespace tag to use
588
 *
589 590
 * Look for sysfs_dirent with name @name under @parent.  Returns pointer to
 * the found sysfs_dirent on success, %NULL on failure.
591
 */
592 593 594
static struct sysfs_dirent *kernfs_find_ns(struct sysfs_dirent *parent,
					   const unsigned char *name,
					   const void *ns)
595
{
596 597
	struct rb_node *node = parent->s_dir.children.rb_node;
	bool has_ns = parent->s_flags & SYSFS_FLAG_NS;
598
	unsigned int hash;
599

600 601
	lockdep_assert_held(&sysfs_mutex);

602
	if (has_ns != (bool)ns) {
603
		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
604
		     has_ns ? "required" : "invalid",
605
		     parent->s_name, name);
606 607 608
		return NULL;
	}

T
Tejun Heo 已提交
609
	hash = sysfs_name_hash(name, ns);
610 611 612 613 614
	while (node) {
		struct sysfs_dirent *sd;
		int result;

		sd = to_sysfs_dirent(node);
T
Tejun Heo 已提交
615
		result = sysfs_name_compare(hash, name, ns, sd);
616 617 618 619 620 621
		if (result < 0)
			node = node->rb_left;
		else if (result > 0)
			node = node->rb_right;
		else
			return sd;
622
	}
623
	return NULL;
624
}
625

626
/**
627 628 629 630
 * kernfs_find_and_get_ns - find and get sysfs_dirent with the given name
 * @parent: sysfs_dirent to search under
 * @name: name to look for
 * @ns: the namespace tag to use
631
 *
632 633 634
 * Look for sysfs_dirent with name @name under @parent and get a reference
 * if found.  This function may sleep and returns pointer to the found
 * sysfs_dirent on success, %NULL on failure.
635
 */
636 637
struct sysfs_dirent *kernfs_find_and_get_ns(struct sysfs_dirent *parent,
					    const char *name, const void *ns)
638 639 640
{
	struct sysfs_dirent *sd;

641
	mutex_lock(&sysfs_mutex);
642 643
	sd = kernfs_find_ns(parent, name, ns);
	kernfs_get(sd);
644
	mutex_unlock(&sysfs_mutex);
645 646

	return sd;
647
}
648
EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
649

650 651 652 653 654 655 656 657 658 659 660 661
/**
 * kernfs_create_dir_ns - create a directory
 * @parent: parent in which to create a new directory
 * @name: name of the new directory
 * @priv: opaque data associated with the new directory
 * @ns: optional namespace tag of the directory
 *
 * Returns the created node on success, ERR_PTR() value on failure.
 */
struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent,
					  const char *name, void *priv,
					  const void *ns)
L
Linus Torvalds 已提交
662
{
663
	umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
664
	struct sysfs_addrm_cxt acxt;
665
	struct sysfs_dirent *sd;
666
	int rc;
L
Linus Torvalds 已提交
667

668
	/* allocate */
669
	sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
670
	if (!sd)
671
		return ERR_PTR(-ENOMEM);
672 673

	sd->s_ns = ns;
674
	sd->priv = priv;
675

676
	/* link in */
677
	sysfs_addrm_start(&acxt);
T
Tejun Heo 已提交
678
	rc = sysfs_add_one(&acxt, sd, parent);
679
	sysfs_addrm_finish(&acxt);
680

681 682
	if (!rc)
		return sd;
683

684
	kernfs_put(sd);
685
	return ERR_PTR(rc);
L
Linus Torvalds 已提交
686 687 688
}

/**
689 690 691
 * sysfs_create_dir_ns - create a directory for an object with a namespace tag
 * @kobj: object we're creating directory for
 * @ns: the namespace tag to use
L
Linus Torvalds 已提交
692
 */
693
int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
L
Linus Torvalds 已提交
694
{
695
	struct sysfs_dirent *parent_sd, *sd;
L
Linus Torvalds 已提交
696 697 698

	BUG_ON(!kobj);

699
	if (kobj->parent)
700
		parent_sd = kobj->parent->sd;
L
Linus Torvalds 已提交
701
	else
E
Eric W. Biederman 已提交
702
		parent_sd = &sysfs_root;
L
Linus Torvalds 已提交
703

704 705 706
	if (!parent_sd)
		return -ENOENT;

707 708 709 710 711 712 713 714 715
	sd = kernfs_create_dir_ns(parent_sd, kobject_name(kobj), kobj, ns);
	if (IS_ERR(sd)) {
		if (PTR_ERR(sd) == -EEXIST)
			sysfs_warn_dup(parent_sd, kobject_name(kobj));
		return PTR_ERR(sd);
	}

	kobj->sd = sd;
	return 0;
L
Linus Torvalds 已提交
716 717
}

718 719
static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
				   unsigned int flags)
L
Linus Torvalds 已提交
720
{
721
	struct dentry *ret = NULL;
722 723
	struct dentry *parent = dentry->d_parent;
	struct sysfs_dirent *parent_sd = parent->d_fsdata;
724
	struct sysfs_dirent *sd;
725
	struct inode *inode;
726
	const void *ns = NULL;
L
Linus Torvalds 已提交
727

728 729
	mutex_lock(&sysfs_mutex);

730 731
	if (parent_sd->s_flags & SYSFS_FLAG_NS)
		ns = sysfs_info(dir->i_sb)->ns;
732

733
	sd = kernfs_find_ns(parent_sd, dentry->d_name.name, ns);
L
Linus Torvalds 已提交
734

735
	/* no such entry */
736 737
	if (!sd) {
		ret = ERR_PTR(-ENOENT);
738
		goto out_unlock;
739
	}
740 741
	kernfs_get(sd);
	dentry->d_fsdata = sd;
742 743

	/* attach dentry and inode */
744
	inode = sysfs_get_inode(dir->i_sb, sd);
745 746 747 748
	if (!inode) {
		ret = ERR_PTR(-ENOMEM);
		goto out_unlock;
	}
749

T
Tejun Heo 已提交
750
	/* instantiate and hash dentry */
A
Al Viro 已提交
751
	ret = d_materialise_unique(dentry, inode);
752
 out_unlock:
753
	mutex_unlock(&sysfs_mutex);
754
	return ret;
L
Linus Torvalds 已提交
755 756
}

757
const struct inode_operations sysfs_dir_inode_operations = {
L
Linus Torvalds 已提交
758
	.lookup		= sysfs_lookup,
759
	.permission	= sysfs_permission,
760
	.setattr	= sysfs_setattr,
761
	.getattr	= sysfs_getattr,
762
	.setxattr	= sysfs_setxattr,
L
Linus Torvalds 已提交
763 764
};

765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818
/*
 * Starting at @pos, keep descending into the first child for as long as
 * the current node is a directory with children.  Returns the node where
 * the descent stops, which is @pos itself if it cannot be descended into.
 */
static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos)
{
	for (;;) {
		struct rb_node *first;

		if (sysfs_type(pos) != SYSFS_DIR)
			return pos;

		first = rb_first(&pos->s_dir.children);
		if (!first)
			return pos;

		pos = to_sysfs_dirent(first);
	}
}

/**
 * sysfs_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: sysfs_dirent whose descendants to walk
 *
 * Return the node following @pos in a post-order traversal of the subtree
 * rooted at @root.  @root itself is part of the walk and is visited last;
 * %NULL is returned once @root has been visited.  Must be called with
 * sysfs_mutex held.
 */
static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos,
						       struct sysfs_dirent *root)
{
	struct rb_node *sibling;

	lockdep_assert_held(&sysfs_mutex);

	/* if first iteration, visit leftmost descendant which may be root */
	if (!pos)
		return sysfs_leftmost_descendant(root);

	/* if we visited @root, we're done */
	if (pos == root)
		return NULL;

	/* no sibling left, visit parent */
	sibling = rb_next(&pos->s_rb);
	if (!sibling)
		return pos->s_parent;

	/* there's an unvisited sibling, visit its leftmost descendant */
	return sysfs_leftmost_descendant(to_sysfs_dirent(sibling));
}
L
Linus Torvalds 已提交
819

820 821
static void __kernfs_remove(struct sysfs_addrm_cxt *acxt,
			    struct sysfs_dirent *sd)
L
Linus Torvalds 已提交
822
{
823
	struct sysfs_dirent *pos, *next;
L
Linus Torvalds 已提交
824

T
Tejun Heo 已提交
825
	if (!sd)
L
Linus Torvalds 已提交
826 827
		return;

T
Tejun Heo 已提交
828
	pr_debug("sysfs %s: removing\n", sd->s_name);
829

830 831 832
	next = NULL;
	do {
		pos = next;
T
Tejun Heo 已提交
833
		next = sysfs_next_descendant_post(pos, sd);
834
		if (pos)
T
Tejun Heo 已提交
835
			sysfs_remove_one(acxt, pos);
836
	} while (next);
T
Tejun Heo 已提交
837
}
838

T
Tejun Heo 已提交
839
/**
840
 * kernfs_remove - remove a sysfs_dirent recursively
T
Tejun Heo 已提交
841 842 843 844
 * @sd: the sysfs_dirent to remove
 *
 * Remove @sd along with all its subdirectories and files.
 */
845
void kernfs_remove(struct sysfs_dirent *sd)
T
Tejun Heo 已提交
846 847 848 849
{
	struct sysfs_addrm_cxt acxt;

	sysfs_addrm_start(&acxt);
850
	__kernfs_remove(&acxt, sd);
851
	sysfs_addrm_finish(&acxt);
852 853
}

854
/**
855
 * kernfs_remove_by_name_ns - find a sysfs_dirent by name and remove it
856 857 858 859 860 861 862
 * @dir_sd: parent of the target
 * @name: name of the sysfs_dirent to remove
 * @ns: namespace tag of the sysfs_dirent to remove
 *
 * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove
 * it.  Returns 0 on success, -ENOENT if such entry doesn't exist.
 */
863 864
int kernfs_remove_by_name_ns(struct sysfs_dirent *dir_sd, const char *name,
			     const void *ns)
865 866 867 868 869 870 871 872 873 874 875 876
{
	struct sysfs_addrm_cxt acxt;
	struct sysfs_dirent *sd;

	if (!dir_sd) {
		WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
			name);
		return -ENOENT;
	}

	sysfs_addrm_start(&acxt);

877
	sd = kernfs_find_ns(dir_sd, name, ns);
878
	if (sd)
879
		__kernfs_remove(&acxt, sd);
880 881 882 883 884 885 886 887 888

	sysfs_addrm_finish(&acxt);

	if (sd)
		return 0;
	else
		return -ENOENT;
}

889 890 891 892 893 894 895 896
/**
 *	sysfs_remove_dir - remove an object's directory.
 *	@kobj:	object.
 *
 *	The only thing special about this is that we remove any files in
 *	the directory before we remove the directory, and we've inlined
 *	what used to be sysfs_rmdir() below, instead of calling separately.
 */
897
void sysfs_remove_dir(struct kobject *kobj)
898
{
899
	struct sysfs_dirent *sd = kobj->sd;
900

901 902 903 904 905 906 907 908 909 910 911 912 913
	/*
	 * In general, kboject owner is responsible for ensuring removal
	 * doesn't race with other operations and sysfs doesn't provide any
	 * protection; however, when @kobj is used as a symlink target, the
	 * symlinking entity usually doesn't own @kobj and thus has no
	 * control over removal.  @kobj->sd may be removed anytime and
	 * symlink code may end up dereferencing an already freed sd.
	 *
	 * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation
	 * against symlink operations so that symlink code can safely
	 * dereference @kobj->sd.
	 */
	spin_lock(&sysfs_symlink_target_lock);
914
	kobj->sd = NULL;
915
	spin_unlock(&sysfs_symlink_target_lock);
916

T
Tejun Heo 已提交
917 918
	if (sd) {
		WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR);
919
		kernfs_remove(sd);
T
Tejun Heo 已提交
920
	}
L
Linus Torvalds 已提交
921 922
}

923 924 925 926 927 928 929 930 931
/**
 * kernfs_rename_ns - move and rename a kernfs_node
 * @sd: target node
 * @new_parent: new parent to put @sd under
 * @new_name: new name
 * @new_ns: new namespace tag
 */
int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent,
		     const char *new_name, const void *new_ns)
L
Linus Torvalds 已提交
932
{
933
	int error;
L
Linus Torvalds 已提交
934

935
	mutex_lock(&sysfs_mutex);
936

937
	error = 0;
938
	if ((sd->s_parent == new_parent) && (sd->s_ns == new_ns) &&
939
	    (strcmp(sd->s_name, new_name) == 0))
940 941 942
		goto out;	/* nothing to rename */

	error = -EEXIST;
943
	if (kernfs_find_ns(new_parent, new_name, new_ns))
944
		goto out;
945

946
	/* rename sysfs_dirent */
947 948
	if (strcmp(sd->s_name, new_name) != 0) {
		error = -ENOMEM;
949
		new_name = kstrdup(new_name, GFP_KERNEL);
950 951 952
		if (!new_name)
			goto out;

953
		kfree(sd->s_name);
954 955
		sd->s_name = new_name;
	}
T
Tejun Heo 已提交
956

957 958 959
	/*
	 * Move to the appropriate place in the appropriate directories rbtree.
	 */
960
	sysfs_unlink_sibling(sd);
961 962
	kernfs_get(new_parent);
	kernfs_put(sd->s_parent);
963
	sd->s_ns = new_ns;
T
Tejun Heo 已提交
964
	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
965
	sd->s_parent = new_parent;
966
	sysfs_link_sibling(sd);
T
Tejun Heo 已提交
967

968
	error = 0;
969
 out:
970
	mutex_unlock(&sysfs_mutex);
L
Linus Torvalds 已提交
971 972 973
	return error;
}

974 975
int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
			const void *new_ns)
976
{
977 978
	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;

979
	return kernfs_rename_ns(kobj->sd, parent_sd, new_name, new_ns);
980 981
}

982 983
int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
		      const void *new_ns)
984
{
985 986
	struct sysfs_dirent *sd = kobj->sd;
	struct sysfs_dirent *new_parent_sd;
987

988
	BUG_ON(!sd->s_parent);
989
	new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
990
		new_parent_kobj->sd : &sysfs_root;
991

992
	return kernfs_rename_ns(sd, new_parent_sd, sd->s_name, new_ns);
993 994
}

995
/**
996
 * kernfs_enable_ns - enable namespace under a directory
997 998 999 1000 1001 1002
 * @sd: directory of interest, should be empty
 *
 * This is to be called right after @sd is created to enable namespace
 * under it.  All children of @sd must have non-NULL namespace tags and
 * only the ones which match the super_block's tag will be visible.
 */
1003
void kernfs_enable_ns(struct sysfs_dirent *sd)
1004 1005 1006 1007 1008 1009
{
	WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR);
	WARN_ON_ONCE(!RB_EMPTY_ROOT(&sd->s_dir.children));
	sd->s_flags |= SYSFS_FLAG_NS;
}

L
Linus Torvalds 已提交
1010 1011 1012 1013 1014 1015
/*
 * Relationship between s_mode and the DT_xxx types: the type is the
 * 4-bit field starting at bit 12 of s_mode.
 */
static inline unsigned char dt_type(struct sysfs_dirent *sd)
{
	return (sd->s_mode >> 12) & 0xf;
}

1016 1017
static int sysfs_dir_release(struct inode *inode, struct file *filp)
{
1018
	kernfs_put(filp->private_data);
1019 1020 1021
	return 0;
}

1022
/*
 * Find the entry of @parent_sd to resume directory iteration at.
 *
 * @pos is the cached entry from the previous call (may be NULL) and @hash
 * the saved directory offset.  The reference held on @pos is dropped here.
 * @pos is reused if it is still linked under @parent_sd with hash @hash;
 * otherwise the children rbtree is searched by hash.  Entries whose
 * namespace tag differs from @ns are then skipped.  Returns the entry
 * found or NULL.
 */
static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
	struct sysfs_dirent *parent_sd,	loff_t hash, struct sysfs_dirent *pos)
{
	if (pos) {
		int still_valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
			pos->s_parent == parent_sd &&
			hash == pos->s_hash;

		kernfs_put(pos);
		if (!still_valid)
			pos = NULL;
	}

	/* offsets 0, 1 and INT_MAX never belong to a real entry */
	if (!pos && (hash > 1) && (hash < INT_MAX)) {
		struct rb_node *cur = parent_sd->s_dir.children.rb_node;

		/*
		 * Without an exact match @pos is left at the last node
		 * visited by the search.
		 */
		while (cur) {
			pos = to_sysfs_dirent(cur);

			if (hash == pos->s_hash)
				break;

			cur = hash < pos->s_hash ? cur->rb_left : cur->rb_right;
		}
	}

	/* Skip over entries in the wrong namespace */
	while (pos && pos->s_ns != ns) {
		struct rb_node *next = rb_next(&pos->s_rb);

		pos = next ? to_sysfs_dirent(next) : NULL;
	}

	return pos;
}

1057 1058
/*
 * Revalidate the current position through sysfs_dir_pos() and step to the
 * next entry after it whose namespace tag equals @ns.  Returns that entry
 * or NULL when the directory is exhausted.
 */
static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
	struct sysfs_dirent *parent_sd,	ino_t ino, struct sysfs_dirent *pos)
{
	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
	if (!pos)
		return NULL;

	/* advance at least once, then keep going past foreign namespaces */
	do {
		struct rb_node *next = rb_next(&pos->s_rb);

		pos = next ? to_sysfs_dirent(next) : NULL;
	} while (pos && pos->s_ns != ns);

	return pos;
}

A
Al Viro 已提交
1072
static int sysfs_readdir(struct file *file, struct dir_context *ctx)
L
Linus Torvalds 已提交
1073
{
A
Al Viro 已提交
1074
	struct dentry *dentry = file->f_path.dentry;
1075
	struct sysfs_dirent *parent_sd = dentry->d_fsdata;
A
Al Viro 已提交
1076
	struct sysfs_dirent *pos = file->private_data;
1077
	const void *ns = NULL;
1078

A
Al Viro 已提交
1079 1080
	if (!dir_emit_dots(file, ctx))
		return 0;
1081
	mutex_lock(&sysfs_mutex);
1082 1083 1084 1085

	if (parent_sd->s_flags & SYSFS_FLAG_NS)
		ns = sysfs_info(dentry->d_sb)->ns;

A
Al Viro 已提交
1086
	for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
1087
	     pos;
A
Al Viro 已提交
1088 1089 1090 1091 1092
	     pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
		const char *name = pos->s_name;
		unsigned int type = dt_type(pos);
		int len = strlen(name);
		ino_t ino = pos->s_ino;
1093

A
Al Viro 已提交
1094
		ctx->pos = pos->s_hash;
1095 1096
		file->private_data = pos;
		kernfs_get(pos);
L
Linus Torvalds 已提交
1097

1098
		mutex_unlock(&sysfs_mutex);
A
Al Viro 已提交
1099 1100
		if (!dir_emit(ctx, name, len, ino, type))
			return 0;
1101 1102 1103
		mutex_lock(&sysfs_mutex);
	}
	mutex_unlock(&sysfs_mutex);
A
Al Viro 已提交
1104 1105
	file->private_data = NULL;
	ctx->pos = INT_MAX;
E
Eric W. Biederman 已提交
1106
	return 0;
L
Linus Torvalds 已提交
1107 1108
}

1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119
/*
 * ->llseek for directories: generic_file_llseek() executed with the
 * inode's i_mutex held.  Returns whatever generic_file_llseek() returns.
 */
static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct mutex *lock = &file_inode(file)->i_mutex;
	loff_t new_pos;

	mutex_lock(lock);
	new_pos = generic_file_llseek(file, offset, whence);
	mutex_unlock(lock);

	return new_pos;
}
E
Eric W. Biederman 已提交
1120

1121
const struct file_operations sysfs_dir_operations = {
L
Linus Torvalds 已提交
1122
	.read		= generic_read_dir,
A
Al Viro 已提交
1123
	.iterate	= sysfs_readdir,
1124
	.release	= sysfs_dir_release,
1125
	.llseek		= sysfs_dir_llseek,
L
Linus Torvalds 已提交
1126
};