/*
 * fs/sysfs/dir.c - sysfs core and dir operation implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
 *
 * This file is released under the GPLv2.
 *
 * Please see Documentation/filesystems/sysfs.txt for more information.
 */

#undef DEBUG

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/module.h>
#include <linux/kobject.h>
19
#include <linux/namei.h>
20
#include <linux/idr.h>
21
#include <linux/completion.h>
D
Dave Young 已提交
22
#include <linux/mutex.h>
23
#include <linux/slab.h>
24
#include <linux/security.h>
25
#include <linux/hash.h>
L
Linus Torvalds 已提交
26 27
#include "sysfs.h"

28
DEFINE_MUTEX(sysfs_mutex);
29
DEFINE_SPINLOCK(sysfs_symlink_target_lock);
L
Linus Torvalds 已提交
30

31
#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb)
32

R
Roel Kluin 已提交
33
static DEFINE_SPINLOCK(sysfs_ino_lock);
34 35
static DEFINE_IDA(sysfs_ino_ida);

36
/**
 *	sysfs_name_hash - hash a (namespace tag, name) pair
 *	@name: Null terminated string to hash
 *	@ns:   Namespace tag to hash
 *
 *	Returns 31 bit hash of ns + name (so it fits in an off_t).  The
 *	values 0, 1 and INT_MAX are reserved for magic directory entries
 *	and are never returned.
 */
static unsigned int sysfs_name_hash(const char *name, const void *ns)
{
	unsigned long hash = init_name_hash();
	unsigned int len = strlen(name);

	while (len--)
		hash = partial_name_hash(*name++, hash);
	hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
	hash &= 0x7fffffffU;
	/*
	 * Reserve hash numbers 0, 1 and INT_MAX for magic directory
	 * entries.  Both 0 and 1 have to be remapped, hence "< 2".
	 */
	if (hash < 2)
		hash += 2;
	if (hash >= INT_MAX)
		hash = INT_MAX - 1;
	return hash;
}

T
Tejun Heo 已提交
59 60
static int sysfs_name_compare(unsigned int hash, const char *name,
			      const void *ns, const struct sysfs_dirent *sd)
61 62 63 64 65 66 67 68 69 70 71
{
	if (hash != sd->s_hash)
		return hash - sd->s_hash;
	if (ns != sd->s_ns)
		return ns - sd->s_ns;
	return strcmp(name, sd->s_name);
}

static int sysfs_sd_compare(const struct sysfs_dirent *left,
			    const struct sysfs_dirent *right)
{
T
Tejun Heo 已提交
72
	return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns,
73 74 75 76
				  right);
}

/**
77
 *	sysfs_link_sibling - link sysfs_dirent into sibling rbtree
78 79
 *	@sd: sysfs_dirent of interest
 *
80
 *	Link @sd into its sibling rbtree which starts from
81
 *	sd->s_parent->s_dir.children.
82 83
 *
 *	Locking:
84
 *	mutex_lock(sysfs_mutex)
85 86 87
 *
 *	RETURNS:
 *	0 on susccess -EEXIST on failure.
88
 */
89
static int sysfs_link_sibling(struct sysfs_dirent *sd)
90
{
91 92
	struct rb_node **node = &sd->s_parent->s_dir.children.rb_node;
	struct rb_node *parent = NULL;
93

94 95 96
	if (sysfs_type(sd) == SYSFS_DIR)
		sd->s_parent->s_dir.subdirs++;

97 98 99 100 101 102 103 104 105 106 107 108 109
	while (*node) {
		struct sysfs_dirent *pos;
		int result;

		pos = to_sysfs_dirent(*node);
		parent = *node;
		result = sysfs_sd_compare(sd, pos);
		if (result < 0)
			node = &pos->s_rb.rb_left;
		else if (result > 0)
			node = &pos->s_rb.rb_right;
		else
			return -EEXIST;
110
	}
111 112 113 114
	/* add new node and rebalance the tree */
	rb_link_node(&sd->s_rb, parent, node);
	rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children);
	return 0;
115 116 117
}

/**
118
 *	sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree
119 120
 *	@sd: sysfs_dirent of interest
 *
121
 *	Unlink @sd from its sibling rbtree which starts from
122
 *	sd->s_parent->s_dir.children.
123 124
 *
 *	Locking:
125
 *	mutex_lock(sysfs_mutex)
126
 */
127
static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
128
{
129 130 131
	if (sysfs_type(sd) == SYSFS_DIR)
		sd->s_parent->s_dir.subdirs--;

132
	rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
133 134
}

135 136 137 138 139 140 141 142 143 144
/**
 *	sysfs_get_active - get an active reference to sysfs_dirent
 *	@sd: sysfs_dirent to get an active reference to
 *
 *	Get an active reference of @sd.  This function is noop if @sd
 *	is NULL.
 *
 *	RETURNS:
 *	Pointer to @sd on success, NULL on failure.
 */
145
struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
146
{
147 148 149
	if (unlikely(!sd))
		return NULL;

150 151
	if (!atomic_inc_unless_negative(&sd->s_active))
		return NULL;
152

153
	if (likely(!sysfs_ignore_lockdep(sd)))
154 155
		rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
	return sd;
156 157 158 159 160 161 162 163 164
}

/**
 *	sysfs_put_active - put an active reference to sysfs_dirent
 *	@sd: sysfs_dirent to put an active reference to
 *
 *	Put an active reference to @sd.  This function is noop if @sd
 *	is NULL.
 */
165
void sysfs_put_active(struct sysfs_dirent *sd)
166
{
167 168 169 170 171
	int v;

	if (unlikely(!sd))
		return;

172
	if (likely(!sysfs_ignore_lockdep(sd)))
173
		rwsem_release(&sd->dep_map, 1, _RET_IP_);
174 175 176 177 178
	v = atomic_dec_return(&sd->s_active);
	if (likely(v != SD_DEACTIVATED_BIAS))
		return;

	/* atomic_dec_return() is a mb(), we'll always see the updated
M
Mikulas Patocka 已提交
179
	 * sd->u.completion.
180
	 */
M
Mikulas Patocka 已提交
181
	complete(sd->u.completion);
182 183 184 185 186 187
}

/**
 *	sysfs_deactivate - deactivate sysfs_dirent
 *	@sd: sysfs_dirent to deactivate
 *
188
 *	Deny new active references and drain existing ones.
189
 */
190
static void sysfs_deactivate(struct sysfs_dirent *sd)
191
{
192 193
	DECLARE_COMPLETION_ONSTACK(wait);
	int v;
194

M
Mikulas Patocka 已提交
195
	BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));
196 197 198 199

	if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
		return;

M
Mikulas Patocka 已提交
200
	sd->u.completion = (void *)&wait;
201

202
	rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
203
	/* atomic_add_return() is a mb(), put_active() will always see
M
Mikulas Patocka 已提交
204
	 * the updated sd->u.completion.
205
	 */
206 207
	v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);

208 209
	if (v != SD_DEACTIVATED_BIAS) {
		lock_contended(&sd->dep_map, _RET_IP_);
210
		wait_for_completion(&wait);
211
	}
212

213 214
	lock_acquired(&sd->dep_map, _RET_IP_);
	rwsem_release(&sd->dep_map, 1, _RET_IP_);
215 216
}

217
static int sysfs_alloc_ino(unsigned int *pino)
218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235
{
	int ino, rc;

 retry:
	spin_lock(&sysfs_ino_lock);
	rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino);
	spin_unlock(&sysfs_ino_lock);

	if (rc == -EAGAIN) {
		if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL))
			goto retry;
		rc = -ENOMEM;
	}

	*pino = ino;
	return rc;
}

236
static void sysfs_free_ino(unsigned int ino)
237 238 239 240 241 242
{
	spin_lock(&sysfs_ino_lock);
	ida_remove(&sysfs_ino_ida, ino);
	spin_unlock(&sysfs_ino_lock);
}

243
void release_sysfs_dirent(struct sysfs_dirent *sd)
244
{
T
Tejun Heo 已提交
245 246 247
	struct sysfs_dirent *parent_sd;

 repeat:
248 249 250
	/* Moving/renaming is always done while holding reference.
	 * sd->s_parent won't change beneath us.
	 */
T
Tejun Heo 已提交
251 252
	parent_sd = sd->s_parent;

253 254 255 256
	WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED),
		"sysfs: free using entry: %s/%s\n",
		parent_sd ? parent_sd->s_name : "", sd->s_name);

257
	if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
T
Tejun Heo 已提交
258
		sysfs_put(sd->s_symlink.target_sd);
259
	if (sysfs_type(sd) & SYSFS_COPY_NAME)
T
Tejun Heo 已提交
260
		kfree(sd->s_name);
261 262 263
	if (sd->s_iattr && sd->s_iattr->ia_secdata)
		security_release_secctx(sd->s_iattr->ia_secdata,
					sd->s_iattr->ia_secdata_len);
264
	kfree(sd->s_iattr);
265
	sysfs_free_ino(sd->s_ino);
266
	kmem_cache_free(sysfs_dir_cachep, sd);
T
Tejun Heo 已提交
267 268 269 270

	sd = parent_sd;
	if (sd && atomic_dec_and_test(&sd->s_count))
		goto repeat;
271 272
}

N
Nick Piggin 已提交
273
static int sysfs_dentry_delete(const struct dentry *dentry)
274 275
{
	struct sysfs_dirent *sd = dentry->d_fsdata;
276
	return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED));
277 278
}

279
static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
280
{
281
	struct sysfs_dirent *sd;
282

283
	if (flags & LOOKUP_RCU)
284 285 286
		return -ECHILD;

	sd = dentry->d_fsdata;
287 288 289 290 291 292
	mutex_lock(&sysfs_mutex);

	/* The sysfs dirent has been deleted */
	if (sd->s_flags & SYSFS_FLAG_REMOVED)
		goto out_bad;

293 294 295 296 297 298 299 300
	/* The sysfs dirent has been moved? */
	if (dentry->d_parent->d_fsdata != sd->s_parent)
		goto out_bad;

	/* The sysfs dirent has been renamed */
	if (strcmp(dentry->d_name.name, sd->s_name) != 0)
		goto out_bad;

301
	/* The sysfs dirent has been moved to a different namespace */
302 303 304
	if (sd->s_parent && (sd->s_parent->s_flags & SYSFS_FLAG_NS) &&
	    sysfs_info(dentry->d_sb)->ns != sd->s_ns)
		goto out_bad;
305

306 307 308 309 310 311 312
	mutex_unlock(&sysfs_mutex);
out_valid:
	return 1;
out_bad:
	/* Remove the dentry from the dcache hashes.
	 * If this is a deleted dentry we use d_drop instead of d_delete
	 * so sysfs doesn't need to cope with negative dentries.
313 314 315 316 317 318
	 *
	 * If this is a dentry that has simply been renamed we
	 * use d_drop to remove it from the dcache lookup on its
	 * old parent.  If this dentry persists later when a lookup
	 * is performed at its new name the dentry will be readded
	 * to the dcache hashes.
319 320
	 */
	mutex_unlock(&sysfs_mutex);
321 322 323 324 325 326 327 328

	/* If we have submounts we must allow the vfs caches
	 * to lie about the state of the filesystem to prevent
	 * leaks and other nasty things.
	 */
	if (check_submounts_and_drop(dentry) != 0)
		goto out_valid;

329 330 331
	return 0;
}

332
static void sysfs_dentry_release(struct dentry *dentry)
L
Linus Torvalds 已提交
333
{
334
	sysfs_put(dentry->d_fsdata);
L
Linus Torvalds 已提交
335 336
}

337
const struct dentry_operations sysfs_dentry_ops = {
338 339
	.d_revalidate	= sysfs_dentry_revalidate,
	.d_delete	= sysfs_dentry_delete,
340
	.d_release	= sysfs_dentry_release,
L
Linus Torvalds 已提交
341 342
};

343
/*
 * Allocate and initialize a sysfs_dirent.
 *
 * @name: entry name; duplicated when @type has SYSFS_COPY_NAME set,
 *	  otherwise the pointer itself is stored
 * @mode: value for ->s_mode
 * @type: type bits stored in ->s_flags
 *
 * The new entry starts with a reference count of one, no active
 * references, and SYSFS_FLAG_REMOVED set.
 *
 * Returns the new sysfs_dirent, or NULL if any allocation fails.
 */
struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
{
	struct sysfs_dirent *sd;
	char *name_copy = NULL;

	if (type & SYSFS_COPY_NAME) {
		name_copy = kstrdup(name, GFP_KERNEL);
		if (!name_copy)
			return NULL;
		name = name_copy;
	}

	sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL);
	if (!sd)
		goto free_name;

	if (sysfs_alloc_ino(&sd->s_ino))
		goto free_sd;

	atomic_set(&sd->s_count, 1);
	atomic_set(&sd->s_active, 0);

	sd->s_name = name;
	sd->s_mode = mode;
	sd->s_flags = type | SYSFS_FLAG_REMOVED;

	return sd;

 free_sd:
	kmem_cache_free(sysfs_dir_cachep, sd);
 free_name:
	kfree(name_copy);
	return NULL;
}

377
/**
378 379
 *	sysfs_addrm_start - prepare for sysfs_dirent add/remove
 *	@acxt: pointer to sysfs_addrm_cxt to be used
380
 *
381 382 383
 *	This function is called when the caller is about to add or remove
 *	sysfs_dirent.  This function acquires sysfs_mutex.  @acxt is used
 *	to keep and pass context to other addrm functions.
384 385
 *
 *	LOCKING:
386
 *	Kernel thread context (may sleep).  sysfs_mutex is locked on
387
 *	return.
388
 */
389 390
void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt)
	__acquires(sysfs_mutex)
391
{
392 393 394 395 396 397
	memset(acxt, 0, sizeof(*acxt));

	mutex_lock(&sysfs_mutex);
}

/**
398
 *	__sysfs_add_one - add sysfs_dirent to parent without warning
399 400
 *	@acxt: addrm context to use
 *	@sd: sysfs_dirent to be added
401
 *	@parent_sd: the parent sysfs_dirent to add @sd to
402
 *
403 404 405
 *	Get @parent_sd and set @sd->s_parent to it and increment nlink of
 *	the parent inode if @sd is a directory and link into the children
 *	list of the parent.
406 407 408 409 410 411 412
 *
 *	This function should be called between calls to
 *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
 *	passed the same @acxt as passed to sysfs_addrm_start().
 *
 *	LOCKING:
 *	Determined by sysfs_addrm_start().
413 414 415 416
 *
 *	RETURNS:
 *	0 on success, -EEXIST if entry with the given name already
 *	exists.
417
 */
418 419
int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
		    struct sysfs_dirent *parent_sd)
420
{
421
	bool has_ns = parent_sd->s_flags & SYSFS_FLAG_NS;
422
	struct sysfs_inode_attrs *ps_iattr;
423
	int ret;
424

425
	if (has_ns != (bool)sd->s_ns) {
426
		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
427 428
		     has_ns ? "required" : "invalid",
		     parent_sd->s_name, sd->s_name);
429 430 431
		return -EINVAL;
	}

432 433 434
	if (sysfs_type(parent_sd) != SYSFS_DIR)
		return -EINVAL;

T
Tejun Heo 已提交
435
	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
436
	sd->s_parent = sysfs_get(parent_sd);
437

438 439 440
	ret = sysfs_link_sibling(sd);
	if (ret)
		return ret;
441

442
	/* Update timestamps on the parent */
443
	ps_iattr = parent_sd->s_iattr;
444 445 446 447 448
	if (ps_iattr) {
		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
	}

449 450 451
	/* Mark the entry added into directory tree */
	sd->s_flags &= ~SYSFS_FLAG_REMOVED;

452
	return 0;
453 454
}

455 456 457
/**
 *	sysfs_pathname - return full path to sysfs dirent
 *	@sd: sysfs_dirent whose path we want
458
 *	@path: caller allocated buffer of size PATH_MAX
459 460 461 462 463 464 465 466
 *
 *	Gives the name "/" to the sysfs_root entry; any path returned
 *	is relative to wherever sysfs is mounted.
 */
static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
{
	if (sd->s_parent) {
		sysfs_pathname(sd->s_parent, path);
467
		strlcat(path, "/", PATH_MAX);
468
	}
469
	strlcat(path, sd->s_name, PATH_MAX);
470 471 472
	return path;
}

473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489
/*
 * Emit a warning that @name already exists under @parent.  The full path
 * is printed when a scratch buffer can be allocated; otherwise only
 * @name is shown.
 */
void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name)
{
	char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
	const char *shown = name;

	if (buf) {
		sysfs_pathname(parent, buf);
		strlcat(buf, "/", PATH_MAX);
		strlcat(buf, name, PATH_MAX);
		shown = buf;
	}

	WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s'\n",
	     shown);

	kfree(buf);
}

490 491 492 493
/**
 *	sysfs_add_one - add sysfs_dirent to parent
 *	@acxt: addrm context to use
 *	@sd: sysfs_dirent to be added
494
 *	@parent_sd: the parent sysfs_dirent to add @sd to
495
 *
496 497 498
 *	Get @parent_sd and set @sd->s_parent to it and increment nlink of
 *	the parent inode if @sd is a directory and link into the children
 *	list of the parent.
499 500 501 502 503 504 505 506 507 508 509 510
 *
 *	This function should be called between calls to
 *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
 *	passed the same @acxt as passed to sysfs_addrm_start().
 *
 *	LOCKING:
 *	Determined by sysfs_addrm_start().
 *
 *	RETURNS:
 *	0 on success, -EEXIST if entry with the given name already
 *	exists.
 */
511 512
int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
		  struct sysfs_dirent *parent_sd)
513 514 515
{
	int ret;

516
	ret = __sysfs_add_one(acxt, sd, parent_sd);
517

518 519
	if (ret == -EEXIST)
		sysfs_warn_dup(parent_sd, sd->s_name);
520 521 522
	return ret;
}

523 524 525
/**
 *	sysfs_remove_one - remove sysfs_dirent from parent
 *	@acxt: addrm context to use
526
 *	@sd: sysfs_dirent to be removed
527 528
 *
 *	Mark @sd removed and drop nlink of parent inode if @sd is a
529
 *	directory.  @sd is unlinked from the children list.
530 531 532 533 534 535 536 537
 *
 *	This function should be called between calls to
 *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
 *	passed the same @acxt as passed to sysfs_addrm_start().
 *
 *	LOCKING:
 *	Determined by sysfs_addrm_start().
 */
T
Tejun Heo 已提交
538 539
static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
			     struct sysfs_dirent *sd)
540
{
541 542
	struct sysfs_inode_attrs *ps_iattr;

543 544 545 546 547 548
	/*
	 * Removal can be called multiple times on the same node.  Only the
	 * first invocation is effective and puts the base ref.
	 */
	if (sd->s_flags & SYSFS_FLAG_REMOVED)
		return;
549 550

	sysfs_unlink_sibling(sd);
551

552
	/* Update timestamps on the parent */
553
	ps_iattr = sd->s_parent->s_iattr;
554 555 556 557 558
	if (ps_iattr) {
		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
	}

559
	sd->s_flags |= SYSFS_FLAG_REMOVED;
M
Mikulas Patocka 已提交
560
	sd->u.removed_list = acxt->removed;
561
	acxt->removed = sd;
562 563
}

564 565 566 567 568 569
/**
 *	sysfs_addrm_finish - finish up sysfs_dirent add/remove
 *	@acxt: addrm context to finish up
 *
 *	Finish up sysfs_dirent add/remove.  Resources acquired by
 *	sysfs_addrm_start() are released and removed sysfs_dirents are
570
 *	cleaned up.
571 572
 *
 *	LOCKING:
573
 *	sysfs_mutex is released.
574
 */
575
void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
576
	__releases(sysfs_mutex)
577 578 579 580 581 582 583 584
{
	/* release resources acquired by sysfs_addrm_start() */
	mutex_unlock(&sysfs_mutex);

	/* kill removed sysfs_dirents */
	while (acxt->removed) {
		struct sysfs_dirent *sd = acxt->removed;

M
Mikulas Patocka 已提交
585
		acxt->removed = sd->u.removed_list;
586 587

		sysfs_deactivate(sd);
588
		sysfs_unmap_bin_file(sd);
589
		sysfs_put(sd);
T
Tejun Heo 已提交
590
	}
591 592
}

593 594 595 596
/**
 *	sysfs_find_dirent - find sysfs_dirent with the given name
 *	@parent_sd: sysfs_dirent to search under
 *	@name: name to look for
T
Tejun Heo 已提交
597
 *	@ns: the namespace tag to use
598 599
 *
 *	Look for sysfs_dirent with name @name under @parent_sd.
600
 *
601
 *	LOCKING:
602
 *	mutex_lock(sysfs_mutex)
603
 *
604 605
 *	RETURNS:
 *	Pointer to sysfs_dirent if found, NULL if not.
606
 */
607
struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
T
Tejun Heo 已提交
608 609
				       const unsigned char *name,
				       const void *ns)
610
{
611
	struct rb_node *node = parent_sd->s_dir.children.rb_node;
612
	bool has_ns = parent_sd->s_flags & SYSFS_FLAG_NS;
613
	unsigned int hash;
614

615
	if (has_ns != (bool)ns) {
616
		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
617 618
		     has_ns ? "required" : "invalid",
		     parent_sd->s_name, name);
619 620 621
		return NULL;
	}

T
Tejun Heo 已提交
622
	hash = sysfs_name_hash(name, ns);
623 624 625 626 627
	while (node) {
		struct sysfs_dirent *sd;
		int result;

		sd = to_sysfs_dirent(node);
T
Tejun Heo 已提交
628
		result = sysfs_name_compare(hash, name, ns, sd);
629 630 631 632 633 634
		if (result < 0)
			node = node->rb_left;
		else if (result > 0)
			node = node->rb_right;
		else
			return sd;
635
	}
636
	return NULL;
637
}
638

639
/**
T
Tejun Heo 已提交
640
 *	sysfs_get_dirent_ns - find and get sysfs_dirent with the given name
641 642
 *	@parent_sd: sysfs_dirent to search under
 *	@name: name to look for
T
Tejun Heo 已提交
643
 *	@ns: the namespace tag to use
644 645 646 647 648
 *
 *	Look for sysfs_dirent with name @name under @parent_sd and get
 *	it if found.
 *
 *	LOCKING:
649
 *	Kernel thread context (may sleep).  Grabs sysfs_mutex.
650 651 652 653
 *
 *	RETURNS:
 *	Pointer to sysfs_dirent if found, NULL if not.
 */
T
Tejun Heo 已提交
654 655 656
struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
					 const unsigned char *name,
					 const void *ns)
657 658 659
{
	struct sysfs_dirent *sd;

660
	mutex_lock(&sysfs_mutex);
T
Tejun Heo 已提交
661
	sd = sysfs_find_dirent(parent_sd, name, ns);
662
	sysfs_get(sd);
663
	mutex_unlock(&sysfs_mutex);
664 665

	return sd;
666
}
T
Tejun Heo 已提交
667
EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns);
668

669
static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
T
Tejun Heo 已提交
670 671
		      const char *name, const void *ns,
		      struct sysfs_dirent **p_sd)
L
Linus Torvalds 已提交
672
{
673
	umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
674
	struct sysfs_addrm_cxt acxt;
675
	struct sysfs_dirent *sd;
676
	int rc;
L
Linus Torvalds 已提交
677

678
	/* allocate */
679
	sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
680
	if (!sd)
681
		return -ENOMEM;
682 683

	sd->s_ns = ns;
684
	sd->priv = kobj;
685

686
	/* link in */
687 688
	sysfs_addrm_start(&acxt);
	rc = sysfs_add_one(&acxt, sd, parent_sd);
689
	sysfs_addrm_finish(&acxt);
690

691 692 693
	if (rc == 0)
		*p_sd = sd;
	else
694
		sysfs_put(sd);
695

696
	return rc;
L
Linus Torvalds 已提交
697 698
}

699 700
int sysfs_create_subdir(struct kobject *kobj, const char *name,
			struct sysfs_dirent **p_sd)
L
Linus Torvalds 已提交
701
{
702
	return create_dir(kobj, kobj->sd, name, NULL, p_sd);
L
Linus Torvalds 已提交
703 704 705
}

/**
706 707 708
 * sysfs_create_dir_ns - create a directory for an object with a namespace tag
 * @kobj: object we're creating directory for
 * @ns: the namespace tag to use
L
Linus Torvalds 已提交
709
 */
710
int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
L
Linus Torvalds 已提交
711
{
712
	struct sysfs_dirent *parent_sd, *sd;
L
Linus Torvalds 已提交
713 714 715 716
	int error = 0;

	BUG_ON(!kobj);

717
	if (kobj->parent)
718
		parent_sd = kobj->parent->sd;
L
Linus Torvalds 已提交
719
	else
E
Eric W. Biederman 已提交
720
		parent_sd = &sysfs_root;
L
Linus Torvalds 已提交
721

722 723 724
	if (!parent_sd)
		return -ENOENT;

725
	error = create_dir(kobj, parent_sd, kobject_name(kobj), ns, &sd);
L
Linus Torvalds 已提交
726
	if (!error)
727
		kobj->sd = sd;
L
Linus Torvalds 已提交
728 729 730
	return error;
}

731 732
static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
				   unsigned int flags)
L
Linus Torvalds 已提交
733
{
734
	struct dentry *ret = NULL;
735 736
	struct dentry *parent = dentry->d_parent;
	struct sysfs_dirent *parent_sd = parent->d_fsdata;
737
	struct sysfs_dirent *sd;
738
	struct inode *inode;
739
	const void *ns = NULL;
L
Linus Torvalds 已提交
740

741 742
	mutex_lock(&sysfs_mutex);

743 744
	if (parent_sd->s_flags & SYSFS_FLAG_NS)
		ns = sysfs_info(dir->i_sb)->ns;
745

T
Tejun Heo 已提交
746
	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns);
L
Linus Torvalds 已提交
747

748
	/* no such entry */
749 750
	if (!sd) {
		ret = ERR_PTR(-ENOENT);
751
		goto out_unlock;
752
	}
753
	dentry->d_fsdata = sysfs_get(sd);
754 755

	/* attach dentry and inode */
756
	inode = sysfs_get_inode(dir->i_sb, sd);
757 758 759 760
	if (!inode) {
		ret = ERR_PTR(-ENOMEM);
		goto out_unlock;
	}
761

T
Tejun Heo 已提交
762
	/* instantiate and hash dentry */
A
Al Viro 已提交
763
	ret = d_materialise_unique(dentry, inode);
764
 out_unlock:
765
	mutex_unlock(&sysfs_mutex);
766
	return ret;
L
Linus Torvalds 已提交
767 768
}

769
const struct inode_operations sysfs_dir_inode_operations = {
L
Linus Torvalds 已提交
770
	.lookup		= sysfs_lookup,
771
	.permission	= sysfs_permission,
772
	.setattr	= sysfs_setattr,
773
	.getattr	= sysfs_getattr,
774
	.setxattr	= sysfs_setxattr,
L
Linus Torvalds 已提交
775 776
};

777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830
/*
 * Starting at @pos, keep stepping into the first child for as long as
 * the current node is a directory with children.  Returns the node where
 * the descent stops, which is @pos itself if it has nothing below it.
 */
static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos)
{
	for (;;) {
		struct rb_node *first;

		if (sysfs_type(pos) != SYSFS_DIR)
			return pos;

		first = rb_first(&pos->s_dir.children);
		if (!first)
			return pos;

		pos = to_sysfs_dirent(first);
	}
}

/**
 * sysfs_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: sysfs_dirent whose descendants to walk
 *
 * Return the node to visit after @pos in a post-order walk of the
 * subtree rooted at @root.  @root itself is visited last; %NULL is
 * returned once @root has been visited.  Must be called with
 * sysfs_mutex held.
 */
static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos,
						       struct sysfs_dirent *root)
{
	struct rb_node *sibling;

	lockdep_assert_held(&sysfs_mutex);

	/* if first iteration, visit leftmost descendant which may be root */
	if (!pos)
		return sysfs_leftmost_descendant(root);

	/* if we visited @root, we're done */
	if (pos == root)
		return NULL;

	/* no sibling left, visit parent */
	sibling = rb_next(&pos->s_rb);
	if (!sibling)
		return pos->s_parent;

	/* there's an unvisited sibling, visit its leftmost descendant */
	return sysfs_leftmost_descendant(to_sysfs_dirent(sibling));
}
L
Linus Torvalds 已提交
831

832 833
static void __kernfs_remove(struct sysfs_addrm_cxt *acxt,
			    struct sysfs_dirent *sd)
L
Linus Torvalds 已提交
834
{
835
	struct sysfs_dirent *pos, *next;
L
Linus Torvalds 已提交
836

T
Tejun Heo 已提交
837
	if (!sd)
L
Linus Torvalds 已提交
838 839
		return;

T
Tejun Heo 已提交
840
	pr_debug("sysfs %s: removing\n", sd->s_name);
841

842 843 844
	next = NULL;
	do {
		pos = next;
T
Tejun Heo 已提交
845
		next = sysfs_next_descendant_post(pos, sd);
846
		if (pos)
T
Tejun Heo 已提交
847
			sysfs_remove_one(acxt, pos);
848
	} while (next);
T
Tejun Heo 已提交
849
}
850

T
Tejun Heo 已提交
851
/**
852
 * kernfs_remove - remove a sysfs_dirent recursively
T
Tejun Heo 已提交
853 854 855 856
 * @sd: the sysfs_dirent to remove
 *
 * Remove @sd along with all its subdirectories and files.
 */
857
void kernfs_remove(struct sysfs_dirent *sd)
T
Tejun Heo 已提交
858 859 860 861
{
	struct sysfs_addrm_cxt acxt;

	sysfs_addrm_start(&acxt);
862
	__kernfs_remove(&acxt, sd);
863
	sysfs_addrm_finish(&acxt);
864 865
}

866
/**
867
 * kernfs_remove_by_name_ns - find a sysfs_dirent by name and remove it
868 869 870 871 872 873 874
 * @dir_sd: parent of the target
 * @name: name of the sysfs_dirent to remove
 * @ns: namespace tag of the sysfs_dirent to remove
 *
 * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove
 * it.  Returns 0 on success, -ENOENT if such entry doesn't exist.
 */
875 876
int kernfs_remove_by_name_ns(struct sysfs_dirent *dir_sd, const char *name,
			     const void *ns)
877 878 879 880 881 882 883 884 885 886 887 888 889 890
{
	struct sysfs_addrm_cxt acxt;
	struct sysfs_dirent *sd;

	if (!dir_sd) {
		WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
			name);
		return -ENOENT;
	}

	sysfs_addrm_start(&acxt);

	sd = sysfs_find_dirent(dir_sd, name, ns);
	if (sd)
891
		__kernfs_remove(&acxt, sd);
892 893 894 895 896 897 898 899 900

	sysfs_addrm_finish(&acxt);

	if (sd)
		return 0;
	else
		return -ENOENT;
}

901 902 903 904 905 906 907 908
/**
 *	sysfs_remove_dir - remove an object's directory.
 *	@kobj:	object.
 *
 *	The only thing special about this is that we remove any files in
 *	the directory before we remove the directory, and we've inlined
 *	what used to be sysfs_rmdir() below, instead of calling separately.
 */
909
void sysfs_remove_dir(struct kobject *kobj)
910
{
911
	struct sysfs_dirent *sd = kobj->sd;
912

913 914 915 916 917 918 919 920 921 922 923 924 925
	/*
	 * In general, kboject owner is responsible for ensuring removal
	 * doesn't race with other operations and sysfs doesn't provide any
	 * protection; however, when @kobj is used as a symlink target, the
	 * symlinking entity usually doesn't own @kobj and thus has no
	 * control over removal.  @kobj->sd may be removed anytime and
	 * symlink code may end up dereferencing an already freed sd.
	 *
	 * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation
	 * against symlink operations so that symlink code can safely
	 * dereference @kobj->sd.
	 */
	spin_lock(&sysfs_symlink_target_lock);
926
	kobj->sd = NULL;
927
	spin_unlock(&sysfs_symlink_target_lock);
928

T
Tejun Heo 已提交
929 930
	if (sd) {
		WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR);
931
		kernfs_remove(sd);
T
Tejun Heo 已提交
932
	}
L
Linus Torvalds 已提交
933 934
}

935 936 937 938 939 940 941 942 943
/**
 * kernfs_rename_ns - move and rename a kernfs_node
 * @sd: target node
 * @new_parent: new parent to put @sd under
 * @new_name: new name
 * @new_ns: new namespace tag
 */
int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent,
		     const char *new_name, const void *new_ns)
L
Linus Torvalds 已提交
944
{
945
	int error;
L
Linus Torvalds 已提交
946

947
	mutex_lock(&sysfs_mutex);
948

949
	error = 0;
950
	if ((sd->s_parent == new_parent) && (sd->s_ns == new_ns) &&
951
	    (strcmp(sd->s_name, new_name) == 0))
952 953 954
		goto out;	/* nothing to rename */

	error = -EEXIST;
955
	if (sysfs_find_dirent(new_parent, new_name, new_ns))
956
		goto out;
957

958
	/* rename sysfs_dirent */
959 960
	if (strcmp(sd->s_name, new_name) != 0) {
		error = -ENOMEM;
961
		new_name = kstrdup(new_name, GFP_KERNEL);
962 963 964
		if (!new_name)
			goto out;

965
		kfree(sd->s_name);
966 967
		sd->s_name = new_name;
	}
T
Tejun Heo 已提交
968

969 970 971
	/*
	 * Move to the appropriate place in the appropriate directories rbtree.
	 */
972
	sysfs_unlink_sibling(sd);
973
	sysfs_get(new_parent);
974
	sysfs_put(sd->s_parent);
975
	sd->s_ns = new_ns;
T
Tejun Heo 已提交
976
	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
977
	sd->s_parent = new_parent;
978
	sysfs_link_sibling(sd);
T
Tejun Heo 已提交
979

980
	error = 0;
981
 out:
982
	mutex_unlock(&sysfs_mutex);
L
Linus Torvalds 已提交
983 984 985
	return error;
}

986 987
int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
			const void *new_ns)
988
{
989 990
	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;

991
	return kernfs_rename_ns(kobj->sd, parent_sd, new_name, new_ns);
992 993
}

994 995
int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
		      const void *new_ns)
996
{
997 998
	struct sysfs_dirent *sd = kobj->sd;
	struct sysfs_dirent *new_parent_sd;
999

1000
	BUG_ON(!sd->s_parent);
1001
	new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
1002
		new_parent_kobj->sd : &sysfs_root;
1003

1004
	return kernfs_rename_ns(sd, new_parent_sd, sd->s_name, new_ns);
1005 1006
}

1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021
/**
 * sysfs_enable_ns - enable namespace under a directory
 * @sd: directory of interest, should be empty
 *
 * Meant to be called right after @sd is created.  Once enabled, all
 * children of @sd must carry a non-NULL namespace tag and only those
 * matching the super_block's tag will be visible.  A one-time warning
 * is issued if @sd is not a directory or already has children.
 */
void sysfs_enable_ns(struct sysfs_dirent *sd)
{
	WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR);
	WARN_ON_ONCE(!RB_EMPTY_ROOT(&sd->s_dir.children));

	sd->s_flags |= SYSFS_FLAG_NS;
}

L
Linus Torvalds 已提交
1022 1023 1024 1025 1026 1027
/*
 * Relationship between s_mode and the DT_xxx types: the type is the
 * four bits of s_mode starting at bit 12.
 */
static inline unsigned char dt_type(struct sysfs_dirent *sd)
{
	return (sd->s_mode >> 12) & 0xf;
}

1028 1029 1030 1031 1032 1033
/*
 * ->release for sysfs directories: drop the reference on the readdir
 * cursor kept in ->private_data.  Always returns 0.
 */
static int sysfs_dir_release(struct inode *inode, struct file *filp)
{
	struct sysfs_dirent *cursor = filp->private_data;

	sysfs_put(cursor);
	return 0;
}

1034
/*
 * Work out which child of @parent_sd a readdir at offset @hash refers to.
 *
 * @pos is the cached cursor from the previous call, or NULL.  Its
 * reference is always dropped; it is reused only if it is still linked
 * under @parent_sd with the same hash.  Otherwise the children tree is
 * searched by hash.  Entries whose namespace tag differs from @ns are
 * then skipped.
 *
 * Returns the entry to emit next, or NULL if there is none.
 */
static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
	struct sysfs_dirent *parent_sd,	loff_t hash, struct sysfs_dirent *pos)
{
	struct rb_node *node;

	if (pos) {
		bool stale = (pos->s_flags & SYSFS_FLAG_REMOVED) ||
			     pos->s_parent != parent_sd ||
			     hash != pos->s_hash;

		sysfs_put(pos);
		if (stale)
			pos = NULL;
	}

	/* offsets 0, 1 and INT_MAX never name a real entry */
	if (!pos && (hash > 1) && (hash < INT_MAX)) {
		node = parent_sd->s_dir.children.rb_node;
		while (node) {
			pos = to_sysfs_dirent(node);

			if (hash == pos->s_hash)
				break;

			node = hash < pos->s_hash ? node->rb_left
						  : node->rb_right;
		}
	}

	/* Skip over entries in the wrong namespace */
	while (pos && pos->s_ns != ns) {
		node = rb_next(&pos->s_rb);
		pos = node ? to_sysfs_dirent(node) : NULL;
	}

	return pos;
}

1069 1070
/*
 * Re-validate the cursor @pos with sysfs_dir_pos() and then advance to
 * the following entry whose namespace tag equals @ns.  Returns that
 * entry, or NULL when the directory is exhausted.
 */
static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
	struct sysfs_dirent *parent_sd,	ino_t ino, struct sysfs_dirent *pos)
{
	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
	if (!pos)
		return NULL;

	do {
		struct rb_node *node = rb_next(&pos->s_rb);

		pos = node ? to_sysfs_dirent(node) : NULL;
	} while (pos && pos->s_ns != ns);

	return pos;
}

A
Al Viro 已提交
1084
static int sysfs_readdir(struct file *file, struct dir_context *ctx)
L
Linus Torvalds 已提交
1085
{
A
Al Viro 已提交
1086
	struct dentry *dentry = file->f_path.dentry;
1087
	struct sysfs_dirent *parent_sd = dentry->d_fsdata;
A
Al Viro 已提交
1088
	struct sysfs_dirent *pos = file->private_data;
1089
	const void *ns = NULL;
1090

A
Al Viro 已提交
1091 1092
	if (!dir_emit_dots(file, ctx))
		return 0;
1093
	mutex_lock(&sysfs_mutex);
1094 1095 1096 1097

	if (parent_sd->s_flags & SYSFS_FLAG_NS)
		ns = sysfs_info(dentry->d_sb)->ns;

A
Al Viro 已提交
1098
	for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
1099
	     pos;
A
Al Viro 已提交
1100 1101 1102 1103 1104 1105 1106
	     pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
		const char *name = pos->s_name;
		unsigned int type = dt_type(pos);
		int len = strlen(name);
		ino_t ino = pos->s_ino;
		ctx->pos = pos->s_hash;
		file->private_data = sysfs_get(pos);
L
Linus Torvalds 已提交
1107

1108
		mutex_unlock(&sysfs_mutex);
A
Al Viro 已提交
1109 1110
		if (!dir_emit(ctx, name, len, ino, type))
			return 0;
1111 1112 1113
		mutex_lock(&sysfs_mutex);
	}
	mutex_unlock(&sysfs_mutex);
A
Al Viro 已提交
1114 1115
	file->private_data = NULL;
	ctx->pos = INT_MAX;
E
Eric W. Biederman 已提交
1116
	return 0;
L
Linus Torvalds 已提交
1117 1118
}

1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129
/*
 * ->llseek for sysfs directories: generic_file_llseek() run with the
 * inode's i_mutex held.
 */
static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *dir = file_inode(file);
	loff_t new_pos;

	mutex_lock(&dir->i_mutex);
	new_pos = generic_file_llseek(file, offset, whence);
	mutex_unlock(&dir->i_mutex);

	return new_pos;
}
E
Eric W. Biederman 已提交
1130

1131
const struct file_operations sysfs_dir_operations = {
L
Linus Torvalds 已提交
1132
	.read		= generic_read_dir,
A
Al Viro 已提交
1133
	.iterate	= sysfs_readdir,
1134
	.release	= sysfs_dir_release,
1135
	.llseek		= sysfs_dir_llseek,
L
Linus Torvalds 已提交
1136
};