cgroup.c 130.3 KB
Newer Older
1 2 3 4 5 6
/*
 *  Generic process-grouping system.
 *
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
7 8 9 10
 *  Notifications support
 *  Copyright (C) 2009 Nokia Corporation
 *  Author: Kirill A. Shutemov
 *
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  ---------------------------------------------------
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cgroup.h>
30
#include <linux/cred.h>
31
#include <linux/ctype.h>
32
#include <linux/errno.h>
33
#include <linux/init_task.h>
34 35 36 37 38 39
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
40
#include <linux/proc_fs.h>
41 42 43 44 45
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>
46
#include <linux/sort.h>
47
#include <linux/kmod.h>
B
Balbir Singh 已提交
48 49
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
50
#include <linux/hashtable.h>
L
Li Zefan 已提交
51
#include <linux/pid_namespace.h>
52
#include <linux/idr.h>
53
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
54
#include <linux/flex_array.h> /* used in cgroup_attach_task */
55
#include <linux/kthread.h>
B
Balbir Singh 已提交
56

A
Arun Sharma 已提交
57
#include <linux/atomic.h>
58

59 60 61 62 63 64 65 66
/*
 * pidlists linger the following amount before being destroyed.  The goal
 * is avoiding frequent destruction in the middle of consecutive read calls
 * Expiring in the middle is a performance problem not a correctness one.
 * 1 sec should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY	HZ

T
Tejun Heo 已提交
67 68 69
#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)

T
Tejun Heo 已提交
70 71 72 73 74 75 76 77 78
/*
 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
 * creation/removal and hierarchy changing operations including cgroup
 * creation, removal, css association and controller rebinding.  This outer
 * lock is needed mainly to resolve the circular dependency between kernfs
 * active ref and cgroup_mutex.  cgroup_tree_mutex nests above both.
 */
static DEFINE_MUTEX(cgroup_tree_mutex);

T
Tejun Heo 已提交
79 80 81 82
/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 */
T
Tejun Heo 已提交
83 84
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
85
EXPORT_SYMBOL_GPL(cgroup_mutex);	/* only for lockdep */
T
Tejun Heo 已提交
86
#else
87
static DEFINE_MUTEX(cgroup_mutex);
T
Tejun Heo 已提交
88 89
#endif

90 91 92 93 94 95
/*
 * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
 * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
 */
static DEFINE_SPINLOCK(release_agent_path_lock);

T
Tejun Heo 已提交
96
#define cgroup_assert_mutexes_or_rcu_locked()				\
97
	rcu_lockdep_assert(rcu_read_lock_held() ||			\
T
Tejun Heo 已提交
98
			   lockdep_is_held(&cgroup_tree_mutex) ||	\
99
			   lockdep_is_held(&cgroup_mutex),		\
T
Tejun Heo 已提交
100
			   "cgroup_[tree_]mutex or RCU read lock required");
101

102 103 104 105 106 107 108 109
/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 */
static struct workqueue_struct *cgroup_destroy_wq;

110 111 112 113 114 115
/*
 * pidlist destructions need to be flushed on cgroup destruction.  Use a
 * separate workqueue as flush domain.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;

T
Tejun Heo 已提交
116
/* generate an array of cgroup subsystem pointers */
117
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
T
Tejun Heo 已提交
118
static struct cgroup_subsys *cgroup_subsys[] = {
119 120
#include <linux/cgroup_subsys.h>
};
121 122 123 124 125 126 127 128
#undef SUBSYS

/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
129 130

/*
131 132 133
 * The dummy hierarchy, reserved for the subsystems that are otherwise
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
134
 */
135 136 137 138
static struct cgroupfs_root cgroup_dummy_root;

/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
139 140 141

/* The list of hierarchy roots */

142 143
static LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
144

T
Tejun Heo 已提交
145
/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
146
static DEFINE_IDR(cgroup_hierarchy_idr);
147

148 149 150 151 152
/*
 * Assign a monotonically increasing serial number to cgroups.  It
 * guarantees cgroups with bigger numbers are newer than those with smaller
 * numbers.  Also, as cgroups are always appended to the parent's
 * ->children list, it guarantees that sibling cgroups are always sorted in
153 154
 * the ascending serial number order on the list.  Protected by
 * cgroup_mutex.
155
 */
156
static u64 cgroup_serial_nr_next = 1;
157

158
/* This flag indicates whether tasks in the fork and exit paths should
L
Li Zefan 已提交
159 160 161
 * check for fork/exit handlers to call. This avoids us having to do
 * extra work in the fork/exit path if none of the subsystems need to
 * be called.
162
 */
163
static int need_forkexit_callback __read_mostly;
164

165 166
static struct cftype cgroup_base_files[];

167
static void cgroup_put(struct cgroup *cgrp);
168 169
static int rebind_subsystems(struct cgroupfs_root *root,
			     unsigned long added_mask, unsigned removed_mask);
170
static void cgroup_destroy_css_killed(struct cgroup *cgrp);
171
static int cgroup_destroy_locked(struct cgroup *cgrp);
172 173
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);
174
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
175

T
Tejun Heo 已提交
176 177 178
/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
179
 * @ss: the subsystem of interest (%NULL returns the dummy_css)
T
Tejun Heo 已提交
180
 *
181 182 183 184 185
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @subsys_id enabled.
T
Tejun Heo 已提交
186 187
 */
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
188
					      struct cgroup_subsys *ss)
T
Tejun Heo 已提交
189
{
190
	if (ss)
191
		return rcu_dereference_check(cgrp->subsys[ss->id],
T
Tejun Heo 已提交
192 193
					lockdep_is_held(&cgroup_tree_mutex) ||
					lockdep_is_held(&cgroup_mutex));
194 195
	else
		return &cgrp->dummy_css;
T
Tejun Heo 已提交
196
}
197

198
/* convenient tests for these bits */
199
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
200
{
201
	return test_bit(CGRP_DEAD, &cgrp->flags);
202 203
}

204 205
struct cgroup_subsys_state *seq_css(struct seq_file *seq)
{
T
Tejun Heo 已提交
206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221
	struct kernfs_open_file *of = seq->private;
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = seq_cft(seq);

	/*
	 * This is open and unprotected implementation of cgroup_css().
	 * seq_css() is only called from a kernfs file operation which has
	 * an active reference on the file.  Because all the subsystem
	 * files are drained before a css is disassociated with a cgroup,
	 * the matching css from the cgroup's subsys table is guaranteed to
	 * be and stay valid until the enclosing operation is complete.
	 */
	if (cft->ss)
		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
	else
		return &cgrp->dummy_css;
222 223 224
}
EXPORT_SYMBOL_GPL(seq_css);

225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
/**
 * cgroup_is_descendant - test ancestry
 * @cgrp: the cgroup to be tested
 * @ancestor: possible ancestor of @cgrp
 *
 * Test whether @cgrp is a descendant of @ancestor.  It also returns %true
 * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp
 * and @ancestor are accessible.
 */
bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
{
	while (cgrp) {
		if (cgrp == ancestor)
			return true;
		cgrp = cgrp->parent;
	}
	return false;
}
EXPORT_SYMBOL_GPL(cgroup_is_descendant);
244

245
static int cgroup_is_releasable(const struct cgroup *cgrp)
246 247
{
	const int bits =
248 249 250
		(1 << CGRP_RELEASABLE) |
		(1 << CGRP_NOTIFY_ON_RELEASE);
	return (cgrp->flags & bits) == bits;
251 252
}

253
static int notify_on_release(const struct cgroup *cgrp)
254
{
255
	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
256 257
}

T
Tejun Heo 已提交
258 259 260 261 262 263 264 265 266 267 268 269
/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
T
Tejun Heo 已提交
270
				lockdep_is_held(&cgroup_tree_mutex) ||	\
T
Tejun Heo 已提交
271 272 273
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else

274
/**
T
Tejun Heo 已提交
275
 * for_each_subsys - iterate all enabled cgroup subsystems
276
 * @ss: the iteration cursor
277
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
278
 */
279
#define for_each_subsys(ss, ssid)					\
T
Tejun Heo 已提交
280 281
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
282

283 284 285
/* iterate across the active hierarchies */
#define for_each_active_root(root)					\
	list_for_each_entry((root), &cgroup_roots, root_list)
286

287 288 289 290
/**
 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
 * @cgrp: the cgroup to be checked for liveness
 *
T
Tejun Heo 已提交
291 292
 * On success, returns true; the mutex should be later unlocked.  On
 * failure returns false with no lock held.
293
 */
294
static bool cgroup_lock_live_group(struct cgroup *cgrp)
295 296
{
	mutex_lock(&cgroup_mutex);
297
	if (cgroup_is_dead(cgrp)) {
298 299 300 301 302 303
		mutex_unlock(&cgroup_mutex);
		return false;
	}
	return true;
}

304 305 306
/* the list of cgroups eligible for automatic release. Protected by
 * release_list_lock */
static LIST_HEAD(release_list);
307
static DEFINE_RAW_SPINLOCK(release_list_lock);
308 309
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
310
static void check_for_release(struct cgroup *cgrp);
311

312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
/*
 * A cgroup can be associated with multiple css_sets as different tasks may
 * belong to different cgroups on different hierarchies.  In the other
 * direction, a css_set is naturally associated with multiple cgroups.
 * This M:N relationship is represented by the following link structure
 * which exists for each association and allows traversing the associations
 * from both sides.
 */
struct cgrp_cset_link {
	/* the cgroup and css_set this link associates */
	struct cgroup		*cgrp;
	struct css_set		*cset;

	/* list of cgrp_cset_links anchored at cgrp->cset_links */
	struct list_head	cset_link;

	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
	struct list_head	cgrp_link;
330 331 332 333 334 335 336 337 338 339
};

/* The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */

static struct css_set init_css_set;
340
static struct cgrp_cset_link init_cgrp_cset_link;
341

342 343 344
/*
 * css_set_lock protects the list of css_set objects, and the chain of
 * tasks off each css_set.  Nests outside task->alloc_lock due to
345
 * css_task_iter_start().
346
 */
347 348 349
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;

350 351 352 353 354
/*
 * hash table for cgroup groups. This improves the performance to find
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 */
355
#define CSS_SET_HASH_BITS	7
356
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
357

358
static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
359
{
360
	unsigned long key = 0UL;
361 362
	struct cgroup_subsys *ss;
	int i;
363

364
	for_each_subsys(ss, i)
365 366
		key += (unsigned long)css[i];
	key = (key >> 16) ^ key;
367

368
	return key;
369 370
}

371 372
/*
 * We don't maintain the lists running through each css_set to its task
373 374 375
 * until after the first call to css_task_iter_start().  This reduces the
 * fork()/exit() overhead for people who have cgroups compiled into their
 * kernel but not actually in use.
376
 */
377
static int use_task_css_set_links __read_mostly;
378

379
static void __put_css_set(struct css_set *cset, int taskexit)
380
{
381
	struct cgrp_cset_link *link, *tmp_link;
382

383 384 385 386 387
	/*
	 * Ensure that the refcount doesn't hit zero while any readers
	 * can see it. Similar to atomic_dec_and_lock(), but for an
	 * rwlock
	 */
388
	if (atomic_add_unless(&cset->refcount, -1, 1))
389 390
		return;
	write_lock(&css_set_lock);
391
	if (!atomic_dec_and_test(&cset->refcount)) {
392 393 394
		write_unlock(&css_set_lock);
		return;
	}
395

396
	/* This css_set is dead. unlink it and release cgroup refcounts */
397
	hash_del(&cset->hlist);
398 399
	css_set_count--;

400
	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
401
		struct cgroup *cgrp = link->cgrp;
402

403 404
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
L
Li Zefan 已提交
405

406
		/* @cgrp can't go away while we're holding css_set_lock */
T
Tejun Heo 已提交
407
		if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
408
			if (taskexit)
409 410
				set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
411
		}
412 413

		kfree(link);
414
	}
415 416

	write_unlock(&css_set_lock);
417
	kfree_rcu(cset, rcu_head);
418 419
}

420 421 422
/*
 * refcounted get/put for css_set objects
 */
423
static inline void get_css_set(struct css_set *cset)
424
{
425
	atomic_inc(&cset->refcount);
426 427
}

428
static inline void put_css_set(struct css_set *cset)
429
{
430
	__put_css_set(cset, 0);
431 432
}

433
static inline void put_css_set_taskexit(struct css_set *cset)
434
{
435
	__put_css_set(cset, 1);
436 437
}

438
/**
439
 * compare_css_sets - helper function for find_existing_css_set().
440 441
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
442 443 444
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
L
Li Zefan 已提交
445
 * Returns true if "cset" matches "old_cset" except for the hierarchy
446 447
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
448 449
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
450 451 452 453 454
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct list_head *l1, *l2;

455
	if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
456 457 458 459 460 461 462 463 464 465 466 467 468
		/* Not all subsystems matched */
		return false;
	}

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in heirarchies with no subsystems. We
	 * could get by with just this check alone (and skip the
	 * memcmp above) but on most setups the memcmp check will
	 * avoid the need for this more expensive check on almost all
	 * candidates.
	 */

469 470
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
471
	while (1) {
472
		struct cgrp_cset_link *link1, *link2;
473
		struct cgroup *cgrp1, *cgrp2;
474 475 476 477

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length. */
478 479
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
480 481
			break;
		} else {
482
			BUG_ON(l2 == &old_cset->cgrp_links);
483 484
		}
		/* Locate the cgroups associated with these links. */
485 486 487 488
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
489
		/* Hierarchies should be linked in the same order. */
490
		BUG_ON(cgrp1->root != cgrp2->root);
491 492 493 494 495 496 497 498

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
499 500
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
501 502
				return false;
		} else {
503
			if (cgrp1 != cgrp2)
504 505 506 507 508 509
				return false;
		}
	}
	return true;
}

510 511 512 513 514
/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
515
 */
516 517 518
static struct css_set *find_existing_css_set(struct css_set *old_cset,
					struct cgroup *cgrp,
					struct cgroup_subsys_state *template[])
519
{
520
	struct cgroupfs_root *root = cgrp->root;
521
	struct cgroup_subsys *ss;
522
	struct css_set *cset;
523
	unsigned long key;
524
	int i;
525

B
Ben Blum 已提交
526 527 528 529 530
	/*
	 * Build the set of subsystem state objects that we want to see in the
	 * new css_set. while subsystems can change globally, the entries here
	 * won't change, so no need for locking.
	 */
531
	for_each_subsys(ss, i) {
532
		if (root->subsys_mask & (1UL << i)) {
533 534 535
			/* Subsystem is in this hierarchy. So we want
			 * the subsystem state from the new
			 * cgroup */
536
			template[i] = cgroup_css(cgrp, ss);
537 538 539
		} else {
			/* Subsystem is not in this hierarchy, so we
			 * don't want to change the subsystem state */
540
			template[i] = old_cset->subsys[i];
541 542 543
		}
	}

544
	key = css_set_hash(template);
545 546
	hash_for_each_possible(css_set_table, cset, hlist, key) {
		if (!compare_css_sets(cset, old_cset, cgrp, template))
547 548 549
			continue;

		/* This css_set matches what we need */
550
		return cset;
551
	}
552 553 554 555 556

	/* No existing cgroup group matched */
	return NULL;
}

557
static void free_cgrp_cset_links(struct list_head *links_to_free)
558
{
559
	struct cgrp_cset_link *link, *tmp_link;
560

561 562
	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
		list_del(&link->cset_link);
563 564 565 566
		kfree(link);
	}
}

567 568 569 570 571 572 573
/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link.  Returns 0 on success or -errno.
574
 */
575
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
576
{
577
	struct cgrp_cset_link *link;
578
	int i;
579 580 581

	INIT_LIST_HEAD(tmp_links);

582
	for (i = 0; i < count; i++) {
583
		link = kzalloc(sizeof(*link), GFP_KERNEL);
584
		if (!link) {
585
			free_cgrp_cset_links(tmp_links);
586 587
			return -ENOMEM;
		}
588
		list_add(&link->cset_link, tmp_links);
589 590 591 592
	}
	return 0;
}

593 594
/**
 * link_css_set - a helper function to link a css_set to a cgroup
595
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
596
 * @cset: the css_set to be linked
597 598
 * @cgrp: the destination cgroup
 */
599 600
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
601
{
602
	struct cgrp_cset_link *link;
603

604 605 606
	BUG_ON(list_empty(tmp_links));
	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
607
	link->cgrp = cgrp;
608
	list_move(&link->cset_link, &cgrp->cset_links);
609 610 611 612
	/*
	 * Always add links to the tail of the list so that the list
	 * is sorted by order of hierarchy creation
	 */
613
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
614 615
}

616 617 618 619 620 621 622
/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
623
 */
624 625
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
626
{
627
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
628
	struct css_set *cset;
629 630
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
631
	unsigned long key;
632

633 634
	lockdep_assert_held(&cgroup_mutex);

635 636
	/* First see if we already have a cgroup group that matches
	 * the desired set */
637
	read_lock(&css_set_lock);
638 639 640
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
641
	read_unlock(&css_set_lock);
642

643 644
	if (cset)
		return cset;
645

646
	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
647
	if (!cset)
648 649
		return NULL;

650
	/* Allocate all the cgrp_cset_link objects that we'll need */
651
	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
652
		kfree(cset);
653 654 655
		return NULL;
	}

656
	atomic_set(&cset->refcount, 1);
657
	INIT_LIST_HEAD(&cset->cgrp_links);
658 659
	INIT_LIST_HEAD(&cset->tasks);
	INIT_HLIST_NODE(&cset->hlist);
660 661 662

	/* Copy the set of subsystem state objects generated in
	 * find_existing_css_set() */
663
	memcpy(cset->subsys, template, sizeof(cset->subsys));
664 665 666

	write_lock(&css_set_lock);
	/* Add reference counts and links from the new css_set. */
667
	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
668
		struct cgroup *c = link->cgrp;
669

670 671
		if (c->root == cgrp->root)
			c = cgrp;
672
		link_css_set(&tmp_links, cset, c);
673
	}
674

675
	BUG_ON(!list_empty(&tmp_links));
676 677

	css_set_count++;
678 679

	/* Add this cgroup group to the hash table */
680 681
	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);
682

683 684
	write_unlock(&css_set_lock);

685
	return cset;
686 687
}

T
Tejun Heo 已提交
688 689 690 691 692 693 694
static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
	struct cgroup *top_cgrp = kf_root->kn->priv;

	return top_cgrp->root;
}

695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730
static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
{
	int id;

	lockdep_assert_held(&cgroup_mutex);

	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
			      GFP_KERNEL);
	if (id < 0)
		return id;

	root->hierarchy_id = id;
	return 0;
}

static void cgroup_exit_root_id(struct cgroupfs_root *root)
{
	lockdep_assert_held(&cgroup_mutex);

	if (root->hierarchy_id) {
		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
		root->hierarchy_id = 0;
	}
}

static void cgroup_free_root(struct cgroupfs_root *root)
{
	if (root) {
		/* hierarhcy ID shoulid already have been released */
		WARN_ON_ONCE(root->hierarchy_id);

		idr_destroy(&root->cgroup_idr);
		kfree(root);
	}
}

731 732
static void cgroup_get_root(struct cgroupfs_root *root)
{
T
Tejun Heo 已提交
733 734 735 736 737 738 739 740
	/*
	 * The caller must ensure that @root is alive, which can be
	 * achieved by holding a ref on one of the member cgroups or
	 * following a registered reference to @root while holding
	 * cgroup_tree_mutex.
	 */
	WARN_ON_ONCE(atomic_read(&root->refcnt) <= 0);
	atomic_inc(&root->refcnt);
741 742 743 744
}

static void cgroup_put_root(struct cgroupfs_root *root)
{
745 746 747 748
	struct cgroup *cgrp = &root->top_cgroup;
	struct cgrp_cset_link *link, *tmp_link;
	int ret;

T
Tejun Heo 已提交
749 750 751 752 753 754 755 756 757 758 759
	/*
	 * @root's refcnt reaching zero and its deregistration should be
	 * atomic w.r.t. cgroup_tree_mutex.  This ensures that
	 * cgroup_get_root() is safe to invoke if @root is registered.
	 */
	mutex_lock(&cgroup_tree_mutex);
	if (!atomic_dec_and_test(&root->refcnt)) {
		mutex_unlock(&cgroup_tree_mutex);
		return;
	}
	mutex_lock(&cgroup_mutex);
760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793

	BUG_ON(root->number_of_cgroups != 1);
	BUG_ON(!list_empty(&cgrp->children));

	/* Rebind all subsystems back to the default hierarchy */
	if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
		ret = rebind_subsystems(root, 0, root->subsys_mask);
		/* Shouldn't be able to fail ... */
		BUG_ON(ret);
	}

	/*
	 * Release all the links from cset_links to this hierarchy's
	 * root cgroup
	 */
	write_lock(&css_set_lock);

	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		kfree(link);
	}
	write_unlock(&css_set_lock);

	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		cgroup_root_count--;
	}

	cgroup_exit_root_id(root);

	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgroup_tree_mutex);

T
Tejun Heo 已提交
794
	kernfs_destroy_root(root->kf_root);
795 796 797
	cgroup_free_root(root);
}

798 799 800 801 802 803 804
/*
 * Return the cgroup for "task" from the given hierarchy. Must be
 * called with cgroup_mutex held.
 */
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
					    struct cgroupfs_root *root)
{
805
	struct css_set *cset;
806 807 808 809 810 811 812 813 814
	struct cgroup *res = NULL;

	BUG_ON(!mutex_is_locked(&cgroup_mutex));
	read_lock(&css_set_lock);
	/*
	 * No need to lock the task - since we hold cgroup_mutex the
	 * task can't change groups, so the only thing that can happen
	 * is that it exits and its css is set back to init_css_set.
	 */
815
	cset = task_css_set(task);
816
	if (cset == &init_css_set) {
817 818
		res = &root->top_cgroup;
	} else {
819 820 821
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
822
			struct cgroup *c = link->cgrp;
823

824 825 826 827 828 829 830 831 832 833 834
			if (c->root == root) {
				res = c;
				break;
			}
		}
	}
	read_unlock(&css_set_lock);
	BUG_ON(!res);
	return res;
}

835 836 837 838 839 840 841 842 843 844
/*
 * There is one global cgroup mutex. We also require taking
 * task_lock() when dereferencing a task's cgroup subsys pointers.
 * See "The task_lock() exception", at the end of this comment.
 *
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing.  However, if the count goes to zero, then only
845
 * cgroup_attach_task() can increment it again.  Because a count of zero
846 847 848 849 850 851 852 853 854 855 856 857 858
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count).  So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero. Similarly, if
 * a task holds cgroup_mutex on a cgroup with zero count, it
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
 * (usually) take cgroup_mutex.  These are the two most performance
 * critical pieces of code here.  The exception occurs on cgroup_exit(),
 * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
 * is taken, and if the cgroup count is zero, a usermode call made
L
Li Zefan 已提交
859 860
 * to the release agent with the name of the cgroup (path relative to
 * the root of cgroup file system) as the argument.
861 862 863 864 865 866 867 868 869 870 871
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty.  Since all
 * tasks in the system use _some_ cgroup, and since there is always at
 * least one task in the system (init, pid == 1), therefore, top_cgroup
 * always has either children cgroups and/or using tasks.  So we don't
 * need a special hack to ensure that top_cgroup cannot be deleted.
 *
 *	The task_lock() exception
 *
 * The need for this exception arises from the action of
872
 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
L
Li Zefan 已提交
873
 * another.  It does so using cgroup_mutex, however there are
874 875 876
 * several performance critical places that need to reference
 * task->cgroup without the expense of grabbing a system global
 * mutex.  Therefore except as noted below, when dereferencing or, as
877
 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
878 879 880 881
 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
 * the task_struct routinely used for such matters.
 *
 * P.S.  One more locking exception.  RCU is used to guard the
882
 * update of a tasks cgroup pointer by cgroup_attach_task()
883 884
 */

885
static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
T
Tejun Heo 已提交
886
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
887
static const struct file_operations proc_cgroupstats_operations;
888

T
Tejun Heo 已提交
889 890 891 892 893 894 895 896 897 898 899 900
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
			      char *buf)
{
	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
			 cft->ss->name, cft->name);
	else
		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
	return buf;
}

901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926
/**
 * cgroup_file_mode - deduce file mode of a control file
 * @cft: the control file in question
 *
 * returns cft->mode if ->mode is not 0
 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
 * returns S_IRUGO if it has only a read handler
 * returns S_IWUSR if it has only a write hander
 */
static umode_t cgroup_file_mode(const struct cftype *cft)
{
	umode_t mode = 0;

	if (cft->mode)
		return cft->mode;

	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
		mode |= S_IRUGO;

	if (cft->write_u64 || cft->write_s64 || cft->write_string ||
	    cft->trigger)
		mode |= S_IWUSR;

	return mode;
}

927 928
static void cgroup_free_fn(struct work_struct *work)
{
929
	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
930 931 932 933 934

	mutex_lock(&cgroup_mutex);
	cgrp->root->number_of_cgroups--;
	mutex_unlock(&cgroup_mutex);

935
	/*
936 937 938
	 * We get a ref to the parent, and put the ref when this cgroup is
	 * being freed, so it's guaranteed that the parent won't be
	 * destroyed before its children.
939
	 */
940
	cgroup_put(cgrp->parent);
941

942 943
	/* put the root reference that we took when we created the cgroup */
	cgroup_put_root(cgrp->root);
944

945
	cgroup_pidlist_destroy_all(cgrp);
946

947
	kernfs_put(cgrp->kn);
948 949 950 951 952 953 954
	kfree(cgrp);
}

static void cgroup_free_rcu(struct rcu_head *head)
{
	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);

955
	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
956
	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
957 958
}

959 960
static void cgroup_get(struct cgroup *cgrp)
{
T
Tejun Heo 已提交
961 962 963
	WARN_ON_ONCE(cgroup_is_dead(cgrp));
	WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
	atomic_inc(&cgrp->refcnt);
964 965
}

966 967
static void cgroup_put(struct cgroup *cgrp)
{
T
Tejun Heo 已提交
968 969 970 971
	if (!atomic_dec_and_test(&cgrp->refcnt))
		return;
	if (WARN_ON_ONCE(!cgroup_is_dead(cgrp)))
		return;
972

T
Tejun Heo 已提交
973 974 975 976 977 978 979 980 981 982
	/*
	 * XXX: cgrp->id is only used to look up css's.  As cgroup and
	 * css's lifetimes will be decoupled, it should be made
	 * per-subsystem and moved to css->id so that lookups are
	 * successful until the target css is released.
	 */
	mutex_lock(&cgroup_mutex);
	idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
	mutex_unlock(&cgroup_mutex);
	cgrp->id = -1;
983

T
Tejun Heo 已提交
984
	call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
985 986
}

987
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
T
Tejun Heo 已提交
988
{
T
Tejun Heo 已提交
989
	char name[CGROUP_FILE_NAME_MAX];
T
Tejun Heo 已提交
990

T
Tejun Heo 已提交
991
	lockdep_assert_held(&cgroup_tree_mutex);
T
Tejun Heo 已提交
992
	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
T
Tejun Heo 已提交
993 994
}

995
/**
996
 * cgroup_clear_dir - remove subsys files in a cgroup directory
997
 * @cgrp: target cgroup
998 999
 * @subsys_mask: mask of the subsystem ids whose files should be removed
 */
1000
static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
T
Tejun Heo 已提交
1001
{
1002
	struct cgroup_subsys *ss;
1003
	int i;
T
Tejun Heo 已提交
1004

1005
	for_each_subsys(ss, i) {
T
Tejun Heo 已提交
1006
		struct cftype *cfts;
1007 1008

		if (!test_bit(i, &subsys_mask))
1009
			continue;
T
Tejun Heo 已提交
1010 1011
		list_for_each_entry(cfts, &ss->cfts, node)
			cgroup_addrm_files(cgrp, cfts, false);
1012
	}
1013 1014 1015
}

static int rebind_subsystems(struct cgroupfs_root *root,
1016
			     unsigned long added_mask, unsigned removed_mask)
1017
{
1018
	struct cgroup *cgrp = &root->top_cgroup;
1019
	struct cgroup_subsys *ss;
1020
	int i, ret;
1021

T
Tejun Heo 已提交
1022 1023
	lockdep_assert_held(&cgroup_tree_mutex);
	lockdep_assert_held(&cgroup_mutex);
B
Ben Blum 已提交
1024

1025
	/* Check that any added subsystems are currently free */
T
Tejun Heo 已提交
1026 1027 1028
	for_each_subsys(ss, i)
		if ((added_mask & (1 << i)) && ss->root != &cgroup_dummy_root)
			return -EBUSY;
1029

1030 1031
	ret = cgroup_populate_dir(cgrp, added_mask);
	if (ret)
T
Tejun Heo 已提交
1032
		return ret;
1033 1034 1035 1036 1037

	/*
	 * Nothing can fail from this point on.  Remove files for the
	 * removed subsystems and rebind each subsystem.
	 */
1038
	mutex_unlock(&cgroup_mutex);
1039
	cgroup_clear_dir(cgrp, removed_mask);
1040
	mutex_lock(&cgroup_mutex);
1041

1042
	for_each_subsys(ss, i) {
1043
		unsigned long bit = 1UL << i;
1044

1045
		if (bit & added_mask) {
1046
			/* We're binding this subsystem to this hierarchy */
1047 1048 1049
			BUG_ON(cgroup_css(cgrp, ss));
			BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
			BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1050

1051
			rcu_assign_pointer(cgrp->subsys[i],
1052 1053
					   cgroup_css(cgroup_dummy_top, ss));
			cgroup_css(cgrp, ss)->cgroup = cgrp;
1054

1055
			ss->root = root;
1056
			if (ss->bind)
1057
				ss->bind(cgroup_css(cgrp, ss));
1058

B
Ben Blum 已提交
1059
			/* refcount was already taken, and we're keeping it */
1060
			root->subsys_mask |= bit;
1061
		} else if (bit & removed_mask) {
1062
			/* We're removing this subsystem */
1063 1064
			BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
			BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1065

1066
			if (ss->bind)
1067
				ss->bind(cgroup_css(cgroup_dummy_top, ss));
1068

1069
			cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
1070 1071
			RCU_INIT_POINTER(cgrp->subsys[i], NULL);

1072
			cgroup_subsys[i]->root = &cgroup_dummy_root;
1073
			root->subsys_mask &= ~bit;
1074 1075 1076
		}
	}

1077 1078 1079 1080 1081
	/*
	 * Mark @root has finished binding subsystems.  @root->subsys_mask
	 * now matches the bound subsystems.
	 */
	root->flags |= CGRP_ROOT_SUBSYS_BOUND;
T
Tejun Heo 已提交
1082
	kernfs_activate(cgrp->kn);
1083

1084 1085 1086
	return 0;
}

T
Tejun Heo 已提交
1087 1088
static int cgroup_show_options(struct seq_file *seq,
			       struct kernfs_root *kf_root)
1089
{
T
Tejun Heo 已提交
1090
	struct cgroupfs_root *root = cgroup_root_from_kf(kf_root);
1091
	struct cgroup_subsys *ss;
T
Tejun Heo 已提交
1092
	int ssid;
1093

T
Tejun Heo 已提交
1094 1095 1096
	for_each_subsys(ss, ssid)
		if (root->subsys_mask & (1 << ssid))
			seq_printf(seq, ",%s", ss->name);
1097 1098
	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
		seq_puts(seq, ",sane_behavior");
1099
	if (root->flags & CGRP_ROOT_NOPREFIX)
1100
		seq_puts(seq, ",noprefix");
1101
	if (root->flags & CGRP_ROOT_XATTR)
A
Aristeu Rozanski 已提交
1102
		seq_puts(seq, ",xattr");
1103 1104

	spin_lock(&release_agent_path_lock);
1105 1106
	if (strlen(root->release_agent_path))
		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1107 1108
	spin_unlock(&release_agent_path_lock);

1109
	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
1110
		seq_puts(seq, ",clone_children");
1111 1112
	if (strlen(root->name))
		seq_printf(seq, ",name=%s", root->name);
1113 1114 1115 1116
	return 0;
}

struct cgroup_sb_opts {
1117
	unsigned long subsys_mask;
1118
	unsigned long flags;
1119
	char *release_agent;
1120
	bool cpuset_clone_children;
1121
	char *name;
1122 1123
	/* User explicitly requested empty subsystem */
	bool none;
1124 1125
};

B
Ben Blum 已提交
1126
/*
1127 1128 1129 1130
 * Convert a hierarchy specifier into a bitmask of subsystems and
 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
 * array. This function takes refcounts on subsystems to be used, unless it
 * returns error, in which case no refcounts are taken.
B
Ben Blum 已提交
1131
 */
B
Ben Blum 已提交
1132
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1133
{
1134 1135
	char *token, *o = data;
	bool all_ss = false, one_ss = false;
1136
	unsigned long mask = (unsigned long)-1;
1137 1138
	struct cgroup_subsys *ss;
	int i;
1139

B
Ben Blum 已提交
1140 1141
	BUG_ON(!mutex_is_locked(&cgroup_mutex));

1142
#ifdef CONFIG_CPUSETS
1143
	mask = ~(1UL << cpuset_cgrp_id);
1144
#endif
1145

1146
	memset(opts, 0, sizeof(*opts));
1147 1148 1149 1150

	while ((token = strsep(&o, ",")) != NULL) {
		if (!*token)
			return -EINVAL;
1151
		if (!strcmp(token, "none")) {
1152 1153
			/* Explicitly have no subsystems */
			opts->none = true;
1154 1155 1156 1157 1158 1159 1160 1161 1162
			continue;
		}
		if (!strcmp(token, "all")) {
			/* Mutually exclusive option 'all' + subsystem name */
			if (one_ss)
				return -EINVAL;
			all_ss = true;
			continue;
		}
1163 1164 1165 1166
		if (!strcmp(token, "__DEVEL__sane_behavior")) {
			opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
			continue;
		}
1167
		if (!strcmp(token, "noprefix")) {
1168
			opts->flags |= CGRP_ROOT_NOPREFIX;
1169 1170 1171
			continue;
		}
		if (!strcmp(token, "clone_children")) {
1172
			opts->cpuset_clone_children = true;
1173 1174
			continue;
		}
A
Aristeu Rozanski 已提交
1175
		if (!strcmp(token, "xattr")) {
1176
			opts->flags |= CGRP_ROOT_XATTR;
A
Aristeu Rozanski 已提交
1177 1178
			continue;
		}
1179
		if (!strncmp(token, "release_agent=", 14)) {
1180 1181 1182
			/* Specifying two release agents is forbidden */
			if (opts->release_agent)
				return -EINVAL;
1183
			opts->release_agent =
1184
				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1185 1186
			if (!opts->release_agent)
				return -ENOMEM;
1187 1188 1189
			continue;
		}
		if (!strncmp(token, "name=", 5)) {
1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206
			const char *name = token + 5;
			/* Can't specify an empty name */
			if (!strlen(name))
				return -EINVAL;
			/* Must match [\w.-]+ */
			for (i = 0; i < strlen(name); i++) {
				char c = name[i];
				if (isalnum(c))
					continue;
				if ((c == '.') || (c == '-') || (c == '_'))
					continue;
				return -EINVAL;
			}
			/* Specifying two names is forbidden */
			if (opts->name)
				return -EINVAL;
			opts->name = kstrndup(name,
1207
					      MAX_CGROUP_ROOT_NAMELEN - 1,
1208 1209 1210
					      GFP_KERNEL);
			if (!opts->name)
				return -ENOMEM;
1211 1212 1213 1214

			continue;
		}

1215
		for_each_subsys(ss, i) {
1216 1217 1218 1219 1220 1221 1222 1223
			if (strcmp(token, ss->name))
				continue;
			if (ss->disabled)
				continue;

			/* Mutually exclusive option 'all' + subsystem name */
			if (all_ss)
				return -EINVAL;
1224
			set_bit(i, &opts->subsys_mask);
1225 1226 1227 1228 1229 1230 1231 1232 1233 1234
			one_ss = true;

			break;
		}
		if (i == CGROUP_SUBSYS_COUNT)
			return -ENOENT;
	}

	/*
	 * If the 'all' option was specified select all the subsystems,
1235 1236
	 * otherwise if 'none', 'name=' and a subsystem name options
	 * were not specified, let's default to 'all'
1237
	 */
1238 1239 1240 1241
	if (all_ss || (!one_ss && !opts->none && !opts->name))
		for_each_subsys(ss, i)
			if (!ss->disabled)
				set_bit(i, &opts->subsys_mask);
1242

1243 1244
	/* Consistency checks */

1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256
	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
		pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");

		if (opts->flags & CGRP_ROOT_NOPREFIX) {
			pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
			return -EINVAL;
		}

		if (opts->cpuset_clone_children) {
			pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
			return -EINVAL;
		}
1257 1258 1259

		if (opts->flags & CGRP_ROOT_XATTR)
			pr_warning("cgroup: sane_behavior: xattr is always available, flag unnecessary\n");
1260 1261
	}

1262 1263 1264 1265 1266
	/*
	 * Option noprefix was introduced just for backward compatibility
	 * with the old cpuset, so we allow noprefix only if mounting just
	 * the cpuset subsystem.
	 */
1267
	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1268 1269
		return -EINVAL;

1270 1271

	/* Can't specify "none" and some subsystems */
1272
	if (opts->subsys_mask && opts->none)
1273 1274 1275 1276 1277 1278
		return -EINVAL;

	/*
	 * We either have to specify by name or by subsystems. (So all
	 * empty hierarchies must have a name).
	 */
1279
	if (!opts->subsys_mask && !opts->name)
1280 1281 1282 1283 1284
		return -EINVAL;

	return 0;
}

T
Tejun Heo 已提交
1285
static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1286 1287
{
	int ret = 0;
T
Tejun Heo 已提交
1288
	struct cgroupfs_root *root = cgroup_root_from_kf(kf_root);
1289
	struct cgroup_sb_opts opts;
1290
	unsigned long added_mask, removed_mask;
1291

1292 1293 1294 1295 1296
	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
		pr_err("cgroup: sane_behavior: remount is not allowed\n");
		return -EINVAL;
	}

T
Tejun Heo 已提交
1297
	mutex_lock(&cgroup_tree_mutex);
1298 1299 1300 1301 1302 1303 1304
	mutex_lock(&cgroup_mutex);

	/* See what subsystems are wanted */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

1305
	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1306 1307 1308
		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
			   task_tgid_nr(current), current->comm);

1309 1310
	added_mask = opts.subsys_mask & ~root->subsys_mask;
	removed_mask = root->subsys_mask & ~opts.subsys_mask;
1311

B
Ben Blum 已提交
1312
	/* Don't allow flags or name to change at remount */
1313
	if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
B
Ben Blum 已提交
1314
	    (opts.name && strcmp(opts.name, root->name))) {
1315 1316 1317
		pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
		       opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
		       root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1318 1319 1320 1321
		ret = -EINVAL;
		goto out_unlock;
	}

1322 1323 1324
	/* remounting is not allowed for populated hierarchies */
	if (root->number_of_cgroups > 1) {
		ret = -EBUSY;
1325
		goto out_unlock;
B
Ben Blum 已提交
1326
	}
1327

1328
	ret = rebind_subsystems(root, added_mask, removed_mask);
1329
	if (ret)
1330
		goto out_unlock;
1331

1332 1333
	if (opts.release_agent) {
		spin_lock(&release_agent_path_lock);
1334
		strcpy(root->release_agent_path, opts.release_agent);
1335 1336
		spin_unlock(&release_agent_path_lock);
	}
1337
 out_unlock:
1338
	kfree(opts.release_agent);
1339
	kfree(opts.name);
1340
	mutex_unlock(&cgroup_mutex);
T
Tejun Heo 已提交
1341
	mutex_unlock(&cgroup_tree_mutex);
1342 1343 1344
	return ret;
}

1345 1346
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
T
Tejun Heo 已提交
1347
	atomic_set(&cgrp->refcnt, 1);
1348 1349
	INIT_LIST_HEAD(&cgrp->sibling);
	INIT_LIST_HEAD(&cgrp->children);
1350
	INIT_LIST_HEAD(&cgrp->cset_links);
1351
	INIT_LIST_HEAD(&cgrp->release_list);
1352 1353
	INIT_LIST_HEAD(&cgrp->pidlists);
	mutex_init(&cgrp->pidlist_mutex);
T
Tejun Heo 已提交
1354
	cgrp->dummy_css.cgroup = cgrp;
1355
}
1356

1357 1358
static void init_cgroup_root(struct cgroupfs_root *root)
{
1359
	struct cgroup *cgrp = &root->top_cgroup;
1360

T
Tejun Heo 已提交
1361
	atomic_set(&root->refcnt, 1);
1362 1363
	INIT_LIST_HEAD(&root->root_list);
	root->number_of_cgroups = 1;
1364
	cgrp->root = root;
1365
	init_cgroup_housekeeping(cgrp);
1366
	idr_init(&root->cgroup_idr);
1367 1368
}

1369 1370 1371 1372
static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
{
	struct cgroupfs_root *root;

1373
	if (!opts->subsys_mask && !opts->none)
T
Tejun Heo 已提交
1374
		return ERR_PTR(-EINVAL);
1375 1376 1377 1378 1379 1380

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root)
		return ERR_PTR(-ENOMEM);

	init_cgroup_root(root);
1381

1382 1383 1384 1385 1386 1387 1388 1389
	/*
	 * We need to set @root->subsys_mask now so that @root can be
	 * matched by cgroup_test_super() before it finishes
	 * initialization; otherwise, competing mounts with the same
	 * options may try to bind the same subsystems instead of waiting
	 * for the first one leading to unexpected mount errors.
	 * SUBSYS_BOUND will be set once actual binding is complete.
	 */
1390
	root->subsys_mask = opts->subsys_mask;
1391 1392 1393 1394 1395
	root->flags = opts->flags;
	if (opts->release_agent)
		strcpy(root->release_agent_path, opts->release_agent);
	if (opts->name)
		strcpy(root->name, opts->name);
1396 1397
	if (opts->cpuset_clone_children)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
1398 1399 1400
	return root;
}

1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412
static int cgroup_setup_root(struct cgroupfs_root *root)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->top_cgroup;
	struct css_set *cset;
	int i, ret;

	lockdep_assert_held(&cgroup_tree_mutex);
	lockdep_assert_held(&cgroup_mutex);

	ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
	if (ret < 0)
T
Tejun Heo 已提交
1413
		goto out;
1414 1415 1416 1417 1418 1419 1420 1421 1422 1423
	root_cgrp->id = ret;

	/*
	 * We're accessing css_set_count without locking css_set_lock here,
	 * but that's OK - it can only be increased by someone holding
	 * cgroup_lock, and that's us. The worst that can happen is that we
	 * have some link structures left over
	 */
	ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
	if (ret)
T
Tejun Heo 已提交
1424
		goto out;
1425 1426 1427 1428

	/* ID 0 is reserved for dummy root, 1 for unified hierarchy */
	ret = cgroup_init_root_id(root, 2, 0);
	if (ret)
T
Tejun Heo 已提交
1429
		goto out;
1430

T
Tejun Heo 已提交
1431 1432 1433 1434 1435 1436 1437 1438
	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
					   KERNFS_ROOT_CREATE_DEACTIVATED,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;
1439 1440 1441

	ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
	if (ret)
T
Tejun Heo 已提交
1442
		goto destroy_root;
1443 1444 1445

	ret = rebind_subsystems(root, root->subsys_mask, 0);
	if (ret)
T
Tejun Heo 已提交
1446
		goto destroy_root;
1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467

	/*
	 * There must be no failure case after here, since rebinding takes
	 * care of subsystems' refcounts, which are explicitly dropped in
	 * the failure exit path.
	 */
	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;

	/*
	 * Link the top cgroup in this hierarchy into all the css_set
	 * objects.
	 */
	write_lock(&css_set_lock);
	hash_for_each(css_set_table, i, cset, hlist)
		link_css_set(&tmp_links, cset, root_cgrp);
	write_unlock(&css_set_lock);

	BUG_ON(!list_empty(&root_cgrp->children));
	BUG_ON(root->number_of_cgroups != 1);

T
Tejun Heo 已提交
1468
	kernfs_activate(root_cgrp->kn);
1469
	ret = 0;
T
Tejun Heo 已提交
1470
	goto out;
1471

T
Tejun Heo 已提交
1472 1473 1474 1475
destroy_root:
	kernfs_destroy_root(root->kf_root);
	root->kf_root = NULL;
exit_root_id:
1476
	cgroup_exit_root_id(root);
T
Tejun Heo 已提交
1477
out:
1478 1479 1480 1481
	free_cgrp_cset_links(&tmp_links);
	return ret;
}

A
Al Viro 已提交
1482
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1483
			 int flags, const char *unused_dev_name,
A
Al Viro 已提交
1484
			 void *data)
1485
{
T
Tejun Heo 已提交
1486
	struct cgroupfs_root *root;
1487
	struct cgroup_sb_opts opts;
T
Tejun Heo 已提交
1488
	struct dentry *dentry;
1489
	int ret;
1490

1491
	mutex_lock(&cgroup_tree_mutex);
B
Ben Blum 已提交
1492
	mutex_lock(&cgroup_mutex);
1493 1494

	/* First find the desired set of subsystems */
1495
	ret = parse_cgroupfs_options(data, &opts);
1496
	if (ret)
1497
		goto out_unlock;
1498

T
Tejun Heo 已提交
1499 1500 1501
	/* look for a matching existing root */
	for_each_active_root(root) {
		bool name_match = false;
1502

T
Tejun Heo 已提交
1503 1504 1505 1506 1507 1508 1509 1510 1511 1512
		/*
		 * If we asked for a name then it must match.  Also, if
		 * name matches but sybsys_mask doesn't, we should fail.
		 * Remember whether name matched.
		 */
		if (opts.name) {
			if (strcmp(opts.name, root->name))
				continue;
			name_match = true;
		}
1513

1514
		/*
T
Tejun Heo 已提交
1515 1516
		 * If we asked for subsystems (or explicitly for no
		 * subsystems) then they must match.
1517
		 */
T
Tejun Heo 已提交
1518 1519 1520 1521 1522 1523 1524
		if ((opts.subsys_mask || opts.none) &&
		    (opts.subsys_mask != root->subsys_mask)) {
			if (!name_match)
				continue;
			ret = -EBUSY;
			goto out_unlock;
		}
1525

1526
		if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1527 1528 1529
			if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
				pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
				ret = -EINVAL;
1530
				goto out_unlock;
1531 1532 1533
			} else {
				pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
			}
1534
		}
T
Tejun Heo 已提交
1535 1536 1537

		cgroup_get_root(root);
		goto out_unlock;
1538 1539
	}

T
Tejun Heo 已提交
1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550
	/* no such thing, create a new one */
	root = cgroup_root_from_opts(&opts);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);
		goto out_unlock;
	}

	ret = cgroup_setup_root(root);
	if (ret)
		cgroup_free_root(root);

1551
out_unlock:
T
Tejun Heo 已提交
1552
	mutex_unlock(&cgroup_mutex);
T
Tejun Heo 已提交
1553
	mutex_unlock(&cgroup_tree_mutex);
1554

1555 1556
	kfree(opts.release_agent);
	kfree(opts.name);
1557

T
Tejun Heo 已提交
1558
	if (ret)
1559
		return ERR_PTR(ret);
T
Tejun Heo 已提交
1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573

	dentry = kernfs_mount(fs_type, flags, root->kf_root);
	if (IS_ERR(dentry))
		cgroup_put_root(root);
	return dentry;
}

static void cgroup_kill_sb(struct super_block *sb)
{
	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
	struct cgroupfs_root *root = cgroup_root_from_kf(kf_root);

	cgroup_put_root(root);
	kernfs_kill_sb(sb);
1574 1575 1576 1577
}

static struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
A
Al Viro 已提交
1578
	.mount = cgroup_mount,
1579 1580 1581
	.kill_sb = cgroup_kill_sb,
};

1582 1583
static struct kobject *cgroup_kobj;

1584
/**
1585
 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1586 1587 1588 1589
 * @task: target task
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
1590 1591 1592 1593 1594
 * Determine @task's cgroup on the first (the one with the lowest non-zero
 * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
 * function grabs cgroup_mutex and shouldn't be used inside locks used by
 * cgroup controller callbacks.
 *
T
Tejun Heo 已提交
1595
 * Return value is the same as kernfs_path().
1596
 */
T
Tejun Heo 已提交
1597
char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1598 1599
{
	struct cgroupfs_root *root;
1600
	struct cgroup *cgrp;
T
Tejun Heo 已提交
1601 1602
	int hierarchy_id = 1;
	char *path = NULL;
1603 1604 1605

	mutex_lock(&cgroup_mutex);

1606 1607
	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);

1608 1609
	if (root) {
		cgrp = task_cgroup_from_root(task, root);
T
Tejun Heo 已提交
1610
		path = cgroup_path(cgrp, buf, buflen);
1611 1612
	} else {
		/* if no hierarchy exists, everyone is in "/" */
T
Tejun Heo 已提交
1613 1614
		if (strlcpy(buf, "/", buflen) < buflen)
			path = buf;
1615 1616 1617
	}

	mutex_unlock(&cgroup_mutex);
T
Tejun Heo 已提交
1618
	return path;
1619
}
1620
EXPORT_SYMBOL_GPL(task_cgroup_path);
1621

1622 1623 1624
/*
 * Control Group taskset
 */
1625 1626 1627
struct task_and_cgroup {
	struct task_struct	*task;
	struct cgroup		*cgrp;
L
Li Zefan 已提交
1628
	struct css_set		*cset;
1629 1630
};

1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677
struct cgroup_taskset {
	struct task_and_cgroup	single;
	struct flex_array	*tc_array;
	int			tc_array_len;
	int			idx;
	struct cgroup		*cur_cgrp;
};

/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 *
 * @tset iteration is initialized and the first task is returned.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
{
	if (tset->tc_array) {
		tset->idx = 0;
		return cgroup_taskset_next(tset);
	} else {
		tset->cur_cgrp = tset->single.cgrp;
		return tset->single.task;
	}
}
EXPORT_SYMBOL_GPL(cgroup_taskset_first);

/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 */
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
{
	struct task_and_cgroup *tc;

	if (!tset->tc_array || tset->idx >= tset->tc_array_len)
		return NULL;

	tc = flex_array_get(tset->tc_array, tset->idx++);
	tset->cur_cgrp = tc->cgrp;
	return tc->task;
}
EXPORT_SYMBOL_GPL(cgroup_taskset_next);

/**
 * cgroup_taskset_cur_css - return the matching css for the current task
 * @tset: taskset of interest
 * @subsys_id: the ID of the target subsystem
 *
 * Return the css for the current (last returned) task of @tset for
 * subsystem specified by @subsys_id.  This function must be preceded by
 * either cgroup_taskset_first() or cgroup_taskset_next().
 */
struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
						   int subsys_id)
{
	return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
}
EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);

/**
 * cgroup_taskset_size - return the number of tasks in taskset
 * @tset: taskset of interest
 */
int cgroup_taskset_size(struct cgroup_taskset *tset)
{
	return tset->tc_array ? tset->tc_array_len : 1;
}
EXPORT_SYMBOL_GPL(cgroup_taskset_size);
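/*
 * Example (illustrative sketch): a controller's ->can_attach() callback
 * would typically walk the taskset with the helpers above; allowed() is a
 * hypothetical per-controller policy check.
 *
 *	struct task_struct *task;
 *
 *	for (task = cgroup_taskset_first(tset); task;
 *	     task = cgroup_taskset_next(tset))
 *		if (!allowed(task))
 *			return -EPERM;
 *	return 0;
 */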


/*
 * cgroup_task_migrate - move a task from one cgroup to another.
 *
 * Must be called with cgroup_mutex and threadgroup locked.
 */
static void cgroup_task_migrate(struct cgroup *old_cgrp,
				struct task_struct *tsk,
				struct css_set *new_cset)
{
	struct css_set *old_cset;

	/*
	 * We are synchronized through threadgroup_lock() against PF_EXITING
	 * setting such that we can't race against cgroup_exit() changing the
	 * css_set to init_css_set and dropping the old one.
	 */
	WARN_ON_ONCE(tsk->flags & PF_EXITING);
	old_cset = task_css_set(tsk);

	task_lock(tsk);
	rcu_assign_pointer(tsk->cgroups, new_cset);
	task_unlock(tsk);

	/* Update the css_set linked lists if we're using them */
	write_lock(&css_set_lock);
	if (!list_empty(&tsk->cg_list))
		list_move(&tsk->cg_list, &new_cset->tasks);
	write_unlock(&css_set_lock);

	/*
	 * We just gained a reference on old_cset by taking it from the
	 * task. As trading it for new_cset is protected by cgroup_mutex,
	 * we're safe to drop it here; it will be freed under RCU.
	 */
	set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
	put_css_set(old_cset);
}

/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @cgrp: the cgroup to attach to
 * @tsk: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
 * task_lock of @tsk or each thread in the threadgroup individually in turn.
 */
static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
			      bool threadgroup)
{
	int retval, i, group_size;
	struct cgroupfs_root *root = cgrp->root;
	struct cgroup_subsys_state *css, *failed_css = NULL;
	/* threadgroup list cursor and array */
	struct task_struct *leader = tsk;
	struct task_and_cgroup *tc;
	struct flex_array *group;
	struct cgroup_taskset tset = { };

	/*
	 * step 0: in order to do expensive, possibly blocking operations for
	 * every thread, we cannot iterate the thread group list, since it needs
	 * rcu or tasklist locked. instead, build an array of all threads in the
	 * group - group_rwsem prevents new threads from appearing, and if
	 * threads exit, this will just be an over-estimate.
	 */
	if (threadgroup)
		group_size = get_nr_threads(tsk);
	else
		group_size = 1;
	/* flex_array supports very large thread-groups better than kmalloc. */
	group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
	if (!group)
		return -ENOMEM;
	/* pre-allocate to guarantee space while iterating in rcu read-side. */
	retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
	if (retval)
		goto out_free_group_list;

	i = 0;
	/*
	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
	 * already PF_EXITING could be freed from underneath us unless we
	 * take an rcu_read_lock.
	 */
	rcu_read_lock();
	do {
		struct task_and_cgroup ent;

		/* @tsk either already exited or can't exit until the end */
		if (tsk->flags & PF_EXITING)
			goto next;

		/* as per above, nr_threads may decrease, but not increase. */
		BUG_ON(i >= group_size);
		ent.task = tsk;
		ent.cgrp = task_cgroup_from_root(tsk, root);
		/* nothing to do if this task is already in the cgroup */
		if (ent.cgrp == cgrp)
			goto next;
		/*
		 * saying GFP_ATOMIC has no effect here because we did prealloc
		 * earlier, but it's good form to communicate our expectations.
		 */
		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
		BUG_ON(retval != 0);
		i++;
	next:
		if (!threadgroup)
			break;
	} while_each_thread(leader, tsk);
	rcu_read_unlock();
	/* remember the number of threads in the array for later. */
	group_size = i;
	tset.tc_array = group;
	tset.tc_array_len = group_size;

	/* methods shouldn't be called if no task is actually migrating */
	retval = 0;
	if (!group_size)
		goto out_free_group_list;

	/*
	 * step 1: check that we can legitimately attach to the cgroup.
	 */
	for_each_css(css, i, cgrp) {
		if (css->ss->can_attach) {
			retval = css->ss->can_attach(css, &tset);
			if (retval) {
				failed_css = css;
				goto out_cancel_attach;
			}
		}
	}

	/*
	 * step 2: make sure css_sets exist for all threads to be migrated.
	 * we use find_css_set, which allocates a new one if necessary.
	 */
	for (i = 0; i < group_size; i++) {
		struct css_set *old_cset;

		tc = flex_array_get(group, i);
		old_cset = task_css_set(tc->task);
		tc->cset = find_css_set(old_cset, cgrp);
		if (!tc->cset) {
			retval = -ENOMEM;
			goto out_put_css_set_refs;
		}
	}

	/*
	 * step 3: now that we're guaranteed success wrt the css_sets,
	 * proceed to move all tasks to the new cgroup.  There are no
	 * failure cases after here, so this is the commit point.
	 */
	for (i = 0; i < group_size; i++) {
		tc = flex_array_get(group, i);
		cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
	}
	/* nothing is sensitive to fork() after this point. */

	/*
	 * step 4: do subsystem attach callbacks.
	 */
	for_each_css(css, i, cgrp)
		if (css->ss->attach)
			css->ss->attach(css, &tset);

	/*
	 * step 5: success! and cleanup
	 */
	retval = 0;
out_put_css_set_refs:
	if (retval) {
		for (i = 0; i < group_size; i++) {
			tc = flex_array_get(group, i);
			if (!tc->cset)
				break;
			put_css_set(tc->cset);
		}
	}
out_cancel_attach:
	if (retval) {
		for_each_css(css, i, cgrp) {
			if (css == failed_css)
				break;
			if (css->ss->cancel_attach)
				css->ss->cancel_attach(css, &tset);
		}
	}
out_free_group_list:
	flex_array_free(group);
	return retval;
}

/*
 * Find the task_struct of the task to attach by vpid and pass it along to the
 * function to attach either it or all tasks in its threadgroup. Will lock
 * cgroup_mutex and threadgroup; may take task_lock of task.
 */
static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
	struct task_struct *tsk;
	const struct cred *cred = current_cred(), *tcred;
	int ret;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

retry_find_task:
	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			rcu_read_unlock();
			ret = -ESRCH;
			goto out_unlock_cgroup;
		}
		/*
		 * even if we're attaching all tasks in the thread group, we
		 * only need to check permissions on one of them.
		 */
		tcred = __task_cred(tsk);
		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
		    !uid_eq(cred->euid, tcred->uid) &&
		    !uid_eq(cred->euid, tcred->suid)) {
			rcu_read_unlock();
			ret = -EACCES;
			goto out_unlock_cgroup;
		}
	} else
		tsk = current;

	if (threadgroup)
		tsk = tsk->group_leader;

	/*
	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
	 * trapped in a cpuset, or RT worker may be born in a cgroup
	 * with no rt_runtime allocated.  Just say no.
	 */
	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		rcu_read_unlock();
		goto out_unlock_cgroup;
	}

	get_task_struct(tsk);
	rcu_read_unlock();

	threadgroup_lock(tsk);
	if (threadgroup) {
		if (!thread_group_leader(tsk)) {
			/*
			 * a race with de_thread from another thread's exec()
			 * may strip us of our leadership, if this happens,
			 * there is no choice but to throw this task away and
			 * try again; this is
			 * "double-double-toil-and-trouble-check locking".
			 */
			threadgroup_unlock(tsk);
			put_task_struct(tsk);
			goto retry_find_task;
		}
	}

	ret = cgroup_attach_task(cgrp, tsk, threadgroup);

	threadgroup_unlock(tsk);

	put_task_struct(tsk);
out_unlock_cgroup:
	mutex_unlock(&cgroup_mutex);
	return ret;
}

/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
	struct cgroupfs_root *root;
	int retval = 0;

	mutex_lock(&cgroup_mutex);
	for_each_active_root(root) {
		struct cgroup *from_cgrp = task_cgroup_from_root(from, root);

		retval = cgroup_attach_task(from_cgrp, tsk, false);
		if (retval)
			break;
	}
	mutex_unlock(&cgroup_mutex);

	return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
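/*
 * Example (illustrative sketch): a driver that spawns a helper task on
 * behalf of another task could mirror the owner's cgroup membership with
 *
 *	ret = cgroup_attach_task_all(owner, helper);
 *
 * where "owner" and "helper" are hypothetical task_struct pointers held
 * by the caller.
 */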

static int cgroup_tasks_write(struct cgroup_subsys_state *css,
			      struct cftype *cft, u64 pid)
{
	return attach_task_by_pid(css->cgroup, pid, false);
}

static int cgroup_procs_write(struct cgroup_subsys_state *css,
			      struct cftype *cft, u64 tgid)
{
	return attach_task_by_pid(css->cgroup, tgid, true);
}

static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
				      struct cftype *cft, const char *buffer)
{
	struct cgroupfs_root *root = css->cgroup->root;

	BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
	if (!cgroup_lock_live_group(css->cgroup))
		return -ENODEV;
	spin_lock(&release_agent_path_lock);
	strlcpy(root->release_agent_path, buffer,
		sizeof(root->release_agent_path));
	spin_unlock(&release_agent_path_lock);
	mutex_unlock(&cgroup_mutex);
	return 0;
}

static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	seq_puts(seq, cgrp->root->release_agent_path);
	seq_putc(seq, '\n');
	mutex_unlock(&cgroup_mutex);
	return 0;
}

static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
	return 0;
}

static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
{
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of->kn->priv;
	struct cgroup_subsys_state *css;
	int ret;

	/*
	 * kernfs guarantees that a file isn't deleted with operations in
	 * flight, which means that the matching css is and stays alive and
	 * doesn't need to be pinned.  The RCU locking is not necessary
	 * either.  It's just for the convenience of using cgroup_css().
	 */
	rcu_read_lock();
	css = cgroup_css(cgrp, cft->ss);
	rcu_read_unlock();

	if (cft->write_string) {
		ret = cft->write_string(css, cft, strstrip(buf));
	} else if (cft->write_u64) {
		unsigned long long v;
		ret = kstrtoull(buf, 0, &v);
		if (!ret)
			ret = cft->write_u64(css, cft, v);
	} else if (cft->write_s64) {
		long long v;
		ret = kstrtoll(buf, 0, &v);
		if (!ret)
			ret = cft->write_s64(css, cft, v);
	} else if (cft->trigger) {
		ret = cft->trigger(css, (unsigned int)cft->private);
	} else {
		ret = -EINVAL;
	}

	return ret ?: nbytes;
}

static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
	return seq_cft(seq)->seq_start(seq, ppos);
}

static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
	return seq_cft(seq)->seq_next(seq, v, ppos);
}

static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
	seq_cft(seq)->seq_stop(seq, v);
}

static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
	struct cftype *cft = seq_cft(m);
	struct cgroup_subsys_state *css = seq_css(m);

	if (cft->seq_show)
		return cft->seq_show(m, arg);

	if (cft->read_u64)
		seq_printf(m, "%llu\n", cft->read_u64(css, cft));
	else if (cft->read_s64)
		seq_printf(m, "%lld\n", cft->read_s64(css, cft));
	else
		return -EINVAL;
	return 0;
}

static struct kernfs_ops cgroup_kf_single_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.write			= cgroup_file_write,
	.seq_show		= cgroup_seqfile_show,
};

static struct kernfs_ops cgroup_kf_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.write			= cgroup_file_write,
	.seq_start		= cgroup_seqfile_start,
	.seq_next		= cgroup_seqfile_next,
	.seq_stop		= cgroup_seqfile_stop,
	.seq_show		= cgroup_seqfile_show,
};

/*
 * cgroup_rename - Only allow simple rename of directories in place.
 */
T
Tejun Heo 已提交
2142 2143
static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
			 const char *new_name_str)
2144
{
T
Tejun Heo 已提交
2145 2146
	struct cgroup *cgrp = kn->priv;
	int ret;
2147

T
Tejun Heo 已提交
2148
	if (kernfs_type(kn) != KERNFS_DIR)
2149
		return -ENOTDIR;
T
Tejun Heo 已提交
2150
	if (kn->parent != new_parent)
2151
		return -EIO;
2152

2153 2154 2155 2156 2157 2158 2159
	/*
	 * This isn't a proper migration and its usefulness is very
	 * limited.  Disallow if sane_behavior.
	 */
	if (cgroup_sane_behavior(cgrp))
		return -EPERM;

T
Tejun Heo 已提交
2160 2161 2162 2163
	mutex_lock(&cgroup_tree_mutex);
	mutex_lock(&cgroup_mutex);

	ret = kernfs_rename(kn, new_parent, new_name_str);
2164

T
Tejun Heo 已提交
2165 2166 2167
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgroup_tree_mutex);
	return ret;
2168 2169
}

2170
static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2171
{
T
Tejun Heo 已提交
2172
	char name[CGROUP_FILE_NAME_MAX];
T
Tejun Heo 已提交
2173 2174
	struct kernfs_node *kn;
	struct lock_class_key *key = NULL;
T
Tejun Heo 已提交
2175

T
Tejun Heo 已提交
2176 2177 2178 2179 2180 2181 2182 2183 2184
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	key = &cft->lockdep_key;
#endif
	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
				  NULL, false, key);
	if (IS_ERR(kn))
		return PTR_ERR(kn);
	return 0;
2185 2186
}

2187 2188 2189 2190 2191 2192 2193
/**
 * cgroup_addrm_files - add or remove files to a cgroup directory
 * @cgrp: the target cgroup
 * @cfts: array of cftypes to be added
 * @is_add: whether to add or remove
 *
 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2194 2195 2196
 * For removals, this function never fails.  If addition fails, this
 * function doesn't remove files already added.  The caller is responsible
 * for cleaning up.
2197
 */
2198 2199
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add)
2200
{
A
Aristeu Rozanski 已提交
2201
	struct cftype *cft;
2202 2203
	int ret;

T
Tejun Heo 已提交
2204
	lockdep_assert_held(&cgroup_tree_mutex);
T
Tejun Heo 已提交
2205 2206

	for (cft = cfts; cft->name[0] != '\0'; cft++) {
2207
		/* does cft->flags tell us to skip this file on @cgrp? */
2208 2209
		if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
			continue;
2210 2211 2212 2213 2214
		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
			continue;
		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
			continue;

2215
		if (is_add) {
2216
			ret = cgroup_add_file(cgrp, cft);
2217
			if (ret) {
2218
				pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2219 2220 2221
					cft->name, ret);
				return ret;
			}
2222 2223
		} else {
			cgroup_rm_file(cgrp, cft);
T
Tejun Heo 已提交
2224
		}
2225
	}
2226
	return 0;
2227 2228
}

2229
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2230 2231
{
	LIST_HEAD(pending);
2232
	struct cgroup_subsys *ss = cfts[0].ss;
2233 2234
	struct cgroup *root = &ss->root->top_cgroup;
	struct cgroup_subsys_state *css;
2235
	int ret = 0;
2236

2237
	lockdep_assert_held(&cgroup_tree_mutex);
2238

2239 2240
	/* don't bother if @ss isn't attached */
	if (ss->root == &cgroup_dummy_root)
2241
		return 0;
2242 2243

	/* add/rm files for all cgroups created before */
2244
	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2245 2246
		struct cgroup *cgrp = css->cgroup;

2247 2248 2249
		if (cgroup_is_dead(cgrp))
			continue;

2250
		ret = cgroup_addrm_files(cgrp, cfts, is_add);
2251 2252
		if (ret)
			break;
2253
	}
2254 2255 2256

	if (is_add && !ret)
		kernfs_activate(root->kn);
2257
	return ret;
2258 2259
}

2260 2261 2262 2263
static void cgroup_exit_cftypes(struct cftype *cfts)
{
	struct cftype *cft;

T
Tejun Heo 已提交
2264 2265 2266 2267 2268
	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		/* free copy for custom atomic_write_len, see init_cftypes() */
		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
			kfree(cft->kf_ops);
		cft->kf_ops = NULL;
2269
		cft->ss = NULL;
T
Tejun Heo 已提交
2270
	}
2271 2272
}

T
Tejun Heo 已提交
2273
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2274 2275 2276
{
	struct cftype *cft;

T
Tejun Heo 已提交
2277 2278 2279
	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		struct kernfs_ops *kf_ops;

T
Tejun Heo 已提交
2280 2281
		WARN_ON(cft->ss || cft->kf_ops);

T
Tejun Heo 已提交
2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300
		if (cft->seq_start)
			kf_ops = &cgroup_kf_ops;
		else
			kf_ops = &cgroup_kf_single_ops;

		/*
		 * Ugh... if @cft wants a custom max_write_len, we need to
		 * make a copy of kf_ops to set its atomic_write_len.
		 */
		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
			kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
			if (!kf_ops) {
				cgroup_exit_cftypes(cfts);
				return -ENOMEM;
			}
			kf_ops->atomic_write_len = cft->max_write_len;
		}

		cft->kf_ops = kf_ops;
2301
		cft->ss = ss;
T
Tejun Heo 已提交
2302 2303 2304
	}

	return 0;
2305 2306
}

2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
	lockdep_assert_held(&cgroup_tree_mutex);

	if (!cfts || !cfts[0].ss)
		return -ENOENT;

	list_del(&cfts->node);
	cgroup_apply_cftypes(cfts, false);
	cgroup_exit_cftypes(cfts);
	return 0;
}

T
Tejun Heo 已提交
2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332
/**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Unregister @cfts.  Files described by @cfts are removed from all
 * existing cgroups and all future cgroups won't have them either.  This
 * function can be called anytime whether @cfts' subsys is attached or not.
 *
 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
 * registered.
 */
int cgroup_rm_cftypes(struct cftype *cfts)
{
2333
	int ret;
T
Tejun Heo 已提交
2334

2335 2336 2337 2338
	mutex_lock(&cgroup_tree_mutex);
	ret = cgroup_rm_cftypes_locked(cfts);
	mutex_unlock(&cgroup_tree_mutex);
	return ret;
T
Tejun Heo 已提交
2339 2340
}

2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354
/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts to @ss.  Files described by @cfts are created for all
 * existing cgroups to which @ss is attached and all future cgroups will
 * have them too.  This function can be called anytime whether @ss is
 * attached or not.
 *
 * Returns 0 on successful registration, -errno on failure.  Note that this
 * function currently returns 0 as long as @cfts registration is successful
 * even if some file creation attempts on existing cgroups fail.
 */
A
Aristeu Rozanski 已提交
2355
int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2356
{
2357
	int ret;
2358

T
Tejun Heo 已提交
2359 2360 2361
	ret = cgroup_init_cftypes(ss, cfts);
	if (ret)
		return ret;
2362

2363 2364
	mutex_lock(&cgroup_tree_mutex);

T
Tejun Heo 已提交
2365
	list_add_tail(&cfts->node, &ss->cfts);
2366
	ret = cgroup_apply_cftypes(cfts, true);
2367
	if (ret)
		cgroup_rm_cftypes_locked(cfts);

	mutex_unlock(&cgroup_tree_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
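/*
 * Example (illustrative sketch): a controller registers a zero-length
 * name terminated cftype array once during init.  The "demo" names,
 * demo_weight_read() and demo_subsys are hypothetical.
 *
 *	static struct cftype demo_files[] = {
 *		{
 *			.name = "demo.weight",
 *			.read_u64 = demo_weight_read,
 *		},
 *		{ }	// terminating entry
 *	};
 *
 *	ret = cgroup_add_cftypes(&demo_subsys, demo_files);
 */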

/**
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 *
 * Return the number of tasks in the cgroup.
 */
2381
int cgroup_task_count(const struct cgroup *cgrp)
2382 2383
{
	int count = 0;
2384
	struct cgrp_cset_link *link;
2385 2386

	read_lock(&css_set_lock);
2387 2388
	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		count += atomic_read(&link->cset->refcount);
2389
	read_unlock(&css_set_lock);
2390 2391 2392
	return count;
}

2393
/*
2394 2395 2396
 * To reduce the fork() overhead for systems that are not actually using
 * their cgroups capability, we don't maintain the lists running through
 * each css_set to its tasks until we see the list actually used - in other
2397
 * words after the first call to css_task_iter_start().
2398
 */
2399
static void cgroup_enable_task_cg_lists(void)
2400 2401 2402 2403
{
	struct task_struct *p, *g;
	write_lock(&css_set_lock);
	use_task_css_set_links = 1;
2404 2405 2406 2407 2408 2409 2410 2411
	/*
	 * We need tasklist_lock because RCU is not safe against
	 * while_each_thread(). Besides, a forking task that has passed
	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
	 * is not guaranteed to have its child immediately visible in the
	 * tasklist if we walk through it with RCU.
	 */
	read_lock(&tasklist_lock);
2412 2413
	do_each_thread(g, p) {
		task_lock(p);
2414 2415 2416 2417 2418 2419
		/*
		 * We should check if the process is exiting, otherwise
		 * it will race with cgroup_exit() in that the list
		 * entry won't be deleted though the process has exited.
		 */
		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2420
			list_add(&p->cg_list, &task_css_set(p)->tasks);
2421 2422
		task_unlock(p);
	} while_each_thread(g, p);
2423
	read_unlock(&tasklist_lock);
2424 2425 2426
	write_unlock(&css_set_lock);
}

2427
/**
2428 2429 2430
 * css_next_child - find the next child of a given css
 * @pos_css: the current position (%NULL to initiate traversal)
 * @parent_css: css whose children to walk
2431
 *
2432
 * This function returns the next child of @parent_css and should be called
2433 2434 2435
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent_css and @pos_css are accessible.  The next sibling is
 * guaranteed to be returned regardless of their states.
2436
 */
2437 2438 2439
struct cgroup_subsys_state *
css_next_child(struct cgroup_subsys_state *pos_css,
	       struct cgroup_subsys_state *parent_css)
2440
{
2441 2442
	struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
	struct cgroup *cgrp = parent_css->cgroup;
2443 2444
	struct cgroup *next;

T
Tejun Heo 已提交
2445
	cgroup_assert_mutexes_or_rcu_locked();
2446 2447 2448 2449

	/*
	 * @pos could already have been removed.  Once a cgroup is removed,
	 * its ->sibling.next is no longer updated when its next sibling
2450 2451 2452 2453 2454 2455 2456
	 * changes.  As CGRP_DEAD assertion is serialized and happens
	 * before the cgroup is taken off the ->sibling list, if we see it
	 * unasserted, it's guaranteed that the next sibling hasn't
	 * finished its grace period even if it's already removed, and thus
	 * safe to dereference from this RCU critical section.  If
	 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
	 * to be visible as %true here.
2457 2458 2459 2460 2461 2462 2463 2464
	 *
	 * If @pos is dead, its next pointer can't be dereferenced;
	 * however, as each cgroup is given a monotonically increasing
	 * unique serial number and always appended to the sibling list,
	 * the next one can be found by walking the parent's children until
	 * we see a cgroup with higher serial number than @pos's.  While
	 * this path can be slower, it's taken only when either the current
	 * cgroup is removed or iteration and removal race.
2465
	 */
2466 2467 2468
	if (!pos) {
		next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
	} else if (likely(!cgroup_is_dead(pos))) {
2469
		next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
2470 2471 2472 2473
	} else {
		list_for_each_entry_rcu(next, &cgrp->children, sibling)
			if (next->serial_nr > pos->serial_nr)
				break;
2474 2475
	}

2476 2477 2478
	if (&next->sibling == &cgrp->children)
		return NULL;

2479
	return cgroup_css(next, parent_css->ss);
2480
}
2481
EXPORT_SYMBOL_GPL(css_next_child);
2482

2483
/**
2484
 * css_next_descendant_pre - find the next descendant for pre-order walk
2485
 * @pos: the current position (%NULL to initiate traversal)
2486
 * @root: css whose descendants to walk
2487
 *
2488
 * To be used by css_for_each_descendant_pre().  Find the next descendant
2489 2490
 * to visit for pre-order traversal of @root's descendants.  @root is
 * included in the iteration and the first node to be visited.
2491
 *
2492 2493 2494 2495
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of @root.
2496
 */
2497 2498 2499
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
			struct cgroup_subsys_state *root)
2500
{
2501
	struct cgroup_subsys_state *next;
2502

T
Tejun Heo 已提交
2503
	cgroup_assert_mutexes_or_rcu_locked();
2504

2505
	/* if first iteration, visit @root */
2506
	if (!pos)
2507
		return root;
2508 2509

	/* visit the first child if exists */
2510
	next = css_next_child(NULL, pos);
2511 2512 2513 2514
	if (next)
		return next;

	/* no child, visit my or the closest ancestor's next sibling */
2515 2516
	while (pos != root) {
		next = css_next_child(pos, css_parent(pos));
2517
		if (next)
2518
			return next;
2519
		pos = css_parent(pos);
2520
	}
2521 2522 2523

	return NULL;
}
EXPORT_SYMBOL_GPL(css_next_descendant_pre);
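/*
 * Example (illustrative sketch): the usual consumer is the
 * css_for_each_descendant_pre() wrapper, e.g.
 *
 *	struct cgroup_subsys_state *pos;
 *
 *	rcu_read_lock();
 *	css_for_each_descendant_pre(pos, root_css)
 *		count++;
 *	rcu_read_unlock();
 *
 * where root_css and count are hypothetical locals of the caller.
 */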

2526
/**
2527 2528
 * css_rightmost_descendant - return the rightmost descendant of a css
 * @pos: css of interest
2529
 *
2530 2531
 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
 * is returned.  This can be used during pre-order traversal to skip
2532
 * subtree of @pos.
2533
 *
2534 2535 2536 2537
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct rightmost descendant as
 * long as @pos is accessible.
2538
 */
2539 2540
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
2541
{
2542
	struct cgroup_subsys_state *last, *tmp;
2543

T
Tejun Heo 已提交
2544
	cgroup_assert_mutexes_or_rcu_locked();
2545 2546 2547 2548 2549

	do {
		last = pos;
		/* ->prev isn't RCU safe, walk ->next till the end */
		pos = NULL;
2550
		css_for_each_child(tmp, last)
2551 2552 2553 2554 2555
			pos = tmp;
	} while (pos);

	return last;
}
2556
EXPORT_SYMBOL_GPL(css_rightmost_descendant);
2557

2558 2559
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
2560
{
2561
	struct cgroup_subsys_state *last;
2562 2563 2564

	do {
		last = pos;
2565
		pos = css_next_child(NULL, pos);
2566 2567 2568 2569 2570 2571
	} while (pos);

	return last;
}

/**
2572
 * css_next_descendant_post - find the next descendant for post-order walk
2573
 * @pos: the current position (%NULL to initiate traversal)
2574
 * @root: css whose descendants to walk
2575
 *
2576
 * To be used by css_for_each_descendant_post().  Find the next descendant
2577 2578
 * to visit for post-order traversal of @root's descendants.  @root is
 * included in the iteration and the last node to be visited.
2579
 *
2580 2581 2582 2583 2584
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @cgroup are accessible and @pos is a descendant of
 * @cgroup.
2585
 */
2586 2587 2588
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
			 struct cgroup_subsys_state *root)
2589
{
2590
	struct cgroup_subsys_state *next;
2591

T
Tejun Heo 已提交
2592
	cgroup_assert_mutexes_or_rcu_locked();
2593

2594 2595 2596
	/* if first iteration, visit leftmost descendant which may be @root */
	if (!pos)
		return css_leftmost_descendant(root);
2597

2598 2599 2600 2601
	/* if we visited @root, we're done */
	if (pos == root)
		return NULL;

2602
	/* if there's an unvisited sibling, visit its leftmost descendant */
2603
	next = css_next_child(pos, css_parent(pos));
2604
	if (next)
2605
		return css_leftmost_descendant(next);
2606 2607

	/* no sibling left, visit parent */
2608
	return css_parent(pos);
2609
}
2610
EXPORT_SYMBOL_GPL(css_next_descendant_post);
2611

2612
/**
2613
 * css_advance_task_iter - advance a task iterator to the next css_set
2614 2615 2616
 * @it: the iterator to advance
 *
 * Advance @it to the next css_set to walk.
2617
 */
2618
static void css_advance_task_iter(struct css_task_iter *it)
2619 2620 2621 2622 2623 2624 2625 2626
{
	struct list_head *l = it->cset_link;
	struct cgrp_cset_link *link;
	struct css_set *cset;

	/* Advance to the next non-empty css_set */
	do {
		l = l->next;
2627
		if (l == &it->origin_css->cgroup->cset_links) {
2628 2629 2630 2631 2632 2633 2634 2635 2636 2637
			it->cset_link = NULL;
			return;
		}
		link = list_entry(l, struct cgrp_cset_link, cset_link);
		cset = link->cset;
	} while (list_empty(&cset->tasks));
	it->cset_link = l;
	it->task = cset->tasks.next;
}

2638
/**
2639 2640
 * css_task_iter_start - initiate task iteration
 * @css: the css to walk tasks of
2641 2642
 * @it: the task iterator to use
 *
2643 2644 2645 2646
 * Initiate iteration through the tasks of @css.  The caller can call
 * css_task_iter_next() to walk through the tasks until the function
 * returns NULL.  On completion of iteration, css_task_iter_end() must be
 * called.
2647 2648 2649 2650 2651
 *
 * Note that this function acquires a lock which is released when the
 * iteration finishes.  The caller can't sleep while iteration is in
 * progress.
 */
2652 2653
void css_task_iter_start(struct cgroup_subsys_state *css,
			 struct css_task_iter *it)
2654
	__acquires(css_set_lock)
2655 2656
{
	/*
2657 2658 2659
	 * The first time anyone tries to iterate across a css, we need to
	 * enable the list linking each css_set to its tasks, and fix up
	 * all existing tasks.
2660
	 */
2661 2662 2663
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();

2664
	read_lock(&css_set_lock);
2665

2666 2667
	it->origin_css = css;
	it->cset_link = &css->cgroup->cset_links;
2668

2669
	css_advance_task_iter(it);
2670 2671
}

2672
/**
2673
 * css_task_iter_next - return the next task for the iterator
2674 2675 2676
 * @it: the task iterator being iterated
 *
 * The "next" function for task iteration.  @it should have been
2677 2678
 * initialized via css_task_iter_start().  Returns NULL when the iteration
 * reaches the end.
2679
 */
2680
struct task_struct *css_task_iter_next(struct css_task_iter *it)
2681 2682 2683
{
	struct task_struct *res;
	struct list_head *l = it->task;
2684
	struct cgrp_cset_link *link;
2685 2686

	/* If the iterator cg is NULL, we have no tasks */
2687
	if (!it->cset_link)
2688 2689 2690 2691
		return NULL;
	res = list_entry(l, struct task_struct, cg_list);
	/* Advance iterator to find next entry */
	l = l->next;
2692 2693
	link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
	if (l == &link->cset->tasks) {
2694 2695 2696 2697
		/*
		 * We reached the end of this task list - move on to the
		 * next cgrp_cset_link.
		 */
2698
		css_advance_task_iter(it);
2699 2700 2701 2702 2703 2704
	} else {
		it->task = l;
	}
	return res;
}

2705
/**
2706
 * css_task_iter_end - finish task iteration
2707 2708
 * @it: the task iterator to finish
 *
2709
 * Finish task iteration started by css_task_iter_start().
2710
 */
2711
void css_task_iter_end(struct css_task_iter *it)
2712
	__releases(css_set_lock)
{
	read_unlock(&css_set_lock);
}
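/*
 * Example (illustrative sketch): the start/next/end triplet is used
 * elsewhere in this file (e.g. pidlist_array_load()) roughly as
 *
 *	struct css_task_iter it;
 *	struct task_struct *task;
 *
 *	css_task_iter_start(css, &it);
 *	while ((task = css_task_iter_next(&it)))
 *		inspect(task);	// must not sleep, css_set_lock is held
 *	css_task_iter_end(&it);
 *
 * where inspect() stands in for whatever per-task work the caller does.
 */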

static inline int started_after_time(struct task_struct *t1,
				     struct timespec *time,
				     struct task_struct *t2)
{
	int start_diff = timespec_compare(&t1->start_time, time);
	if (start_diff > 0) {
		return 1;
	} else if (start_diff < 0) {
		return 0;
	} else {
		/*
		 * Arbitrarily, if two processes started at the same
		 * time, we'll say that the lower pointer value
		 * started first. Note that t2 may have exited by now
		 * so this may not be a valid pointer any longer, but
		 * that's fine - it still serves to distinguish
		 * between two tasks started (effectively) simultaneously.
		 */
		return t1 > t2;
	}
}

/*
 * This function is a callback from heap_insert() and is used to order
 * the heap.
 * In this case we order the heap in descending task start time.
 */
static inline int started_after(void *p1, void *p2)
{
	struct task_struct *t1 = p1;
	struct task_struct *t2 = p2;
	return started_after_time(t1, &t2->start_time, t2);
}

/**
2752 2753
 * css_scan_tasks - iterate through all the tasks in a css
 * @css: the css to iterate tasks of
T
Tejun Heo 已提交
2754 2755 2756 2757
 * @test: optional test callback
 * @process: process callback
 * @data: data passed to @test and @process
 * @heap: optional pre-allocated heap used for task iteration
2758
 *
2759 2760
 * Iterate through all the tasks in @css, calling @test for each, and if it
 * returns %true, call @process for it also.
2761
 *
T
Tejun Heo 已提交
2762
 * @test may be NULL, meaning always true (select all tasks), which
2763
 * effectively duplicates css_task_iter_{start,next,end}() but does not
T
Tejun Heo 已提交
2764 2765 2766
 * lock css_set_lock for the call to @process.
 *
 * It is guaranteed that @process will act on every task that is a member
2767 2768 2769
 * of @css for the duration of this call.  This function may or may not
 * call @process for tasks that exit or move to a different css during the
 * call, or are forked or move into the css during the call.
2770
 *
T
Tejun Heo 已提交
2771 2772 2773
 * Note that @test may be called with locks held, and may in some
 * situations be called multiple times for the same task, so it should be
 * cheap.
2774
 *
T
Tejun Heo 已提交
2775 2776 2777 2778
 * If @heap is non-NULL, a heap has been pre-allocated and will be used for
 * heap operations (and its "gt" member will be overwritten), else a
 * temporary heap will be used (allocation of which may cause this function
 * to fail).
2779
 */
2780 2781 2782 2783
int css_scan_tasks(struct cgroup_subsys_state *css,
		   bool (*test)(struct task_struct *, void *),
		   void (*process)(struct task_struct *, void *),
		   void *data, struct ptr_heap *heap)
2784 2785
{
	int retval, i;
2786
	struct css_task_iter it;
2787 2788 2789 2790 2791 2792
	struct task_struct *p, *dropped;
	/* Never dereference latest_task, since it's not refcounted */
	struct task_struct *latest_task = NULL;
	struct ptr_heap tmp_heap;
	struct timespec latest_time = { 0, 0 };

T
Tejun Heo 已提交
2793
	if (heap) {
2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806
		/* The caller supplied our heap and pre-allocated its memory */
		heap->gt = &started_after;
	} else {
		/* We need to allocate our own heap memory */
		heap = &tmp_heap;
		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
		if (retval)
			/* cannot allocate the heap */
			return retval;
	}

 again:
	/*
2807
	 * Scan tasks in the css, using the @test callback to determine
T
Tejun Heo 已提交
2808 2809 2810 2811 2812 2813 2814
	 * which are of interest, and invoking @process callback on the
	 * ones which need an update.  Since we don't want to hold any
	 * locks during the task updates, gather tasks to be processed in a
	 * heap structure.  The heap is sorted by descending task start
	 * time.  If the statically-sized heap fills up, we overflow tasks
	 * that started later, and in future iterations only consider tasks
	 * that started after the latest task in the previous pass. This
2815 2816 2817
	 * guarantees forward progress and that we don't miss any tasks.
	 */
	heap->size = 0;
2818 2819
	css_task_iter_start(css, &it);
	while ((p = css_task_iter_next(&it))) {
2820 2821 2822 2823
		/*
		 * Only affect tasks that qualify per the caller's callback,
		 * if he provided one
		 */
T
Tejun Heo 已提交
2824
		if (test && !test(p, data))
2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851
			continue;
		/*
		 * Only process tasks that started after the last task
		 * we processed
		 */
		if (!started_after_time(p, &latest_time, latest_task))
			continue;
		dropped = heap_insert(heap, p);
		if (dropped == NULL) {
			/*
			 * The new task was inserted; the heap wasn't
			 * previously full
			 */
			get_task_struct(p);
		} else if (dropped != p) {
			/*
			 * The new task was inserted, and pushed out a
			 * different task
			 */
			get_task_struct(p);
			put_task_struct(dropped);
		}
		/*
		 * Else the new task was newer than anything already in
		 * the heap and wasn't inserted
		 */
	}
2852
	css_task_iter_end(&it);
2853 2854 2855

	if (heap->size) {
		for (i = 0; i < heap->size; i++) {
2856
			struct task_struct *q = heap->ptrs[i];
2857
			if (i == 0) {
2858 2859
				latest_time = q->start_time;
				latest_task = q;
2860 2861
			}
			/* Process the task per the caller's callback */
T
Tejun Heo 已提交
2862
			process(q, data);
2863
			put_task_struct(q);
2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878
		}
		/*
		 * If we had to process any tasks at all, scan again
		 * in case some of them were in the middle of forking
		 * children that didn't get processed.
		 * Not the most efficient way to do it, but it avoids
		 * having to take callback_mutex in the fork path
		 */
		goto again;
	}
	if (heap == &tmp_heap)
		heap_free(&tmp_heap);
	return 0;
}
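/*
 * Example (illustrative sketch): with a NULL @test every task is
 * selected, so cgroup_transfer_tasks() below boils down to
 *
 *	css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
 *		       to, NULL);
 *
 * A caller with its own filter would pass a cheap @test callback and,
 * optionally, a pre-allocated heap.
 */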

static void cgroup_transfer_one_task(struct task_struct *task, void *data)
2880
{
T
Tejun Heo 已提交
2881
	struct cgroup *new_cgroup = data;
2882

T
Tejun Heo 已提交
2883
	mutex_lock(&cgroup_mutex);
2884
	cgroup_attach_task(new_cgroup, task, false);
T
Tejun Heo 已提交
2885
	mutex_unlock(&cgroup_mutex);
2886 2887 2888 2889 2890 2891 2892 2893 2894
}

/**
 * cgroup_transfer_tasks - move tasks from one cgroup to another
 * @to: cgroup to which the tasks will be moved
 * @from: cgroup in which the tasks currently reside
 */
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
2895 2896
	return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
			      to, NULL);
2897 2898
}

2899
/*
2900
 * Stuff for reading the 'tasks'/'procs' files.
2901 2902 2903 2904 2905 2906 2907 2908
 *
 * Reading this file can return large amounts of data if a cgroup has
 * *lots* of attached tasks. So it may need several calls to read(),
 * but we cannot guarantee that the information we produce is correct
 * unless we produce it entirely atomically.
 *
 */

2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934
/* which pidlist file are we talking about? */
enum cgroup_filetype {
	CGROUP_FILE_PROCS,
	CGROUP_FILE_TASKS,
};

/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
	/*
	 * used to find which pidlist is wanted. doesn't change as long as
	 * this particular list stays in the list.
	*/
	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
	/* array of xids */
	pid_t *list;
	/* how many elements the above list has */
	int length;
	/* each of these stored in a list by its cgroup */
	struct list_head links;
	/* pointer to the cgroup we belong to, for list removal purposes */
	struct cgroup *owner;
2935 2936
	/* for delayed destruction */
	struct delayed_work destroy_dwork;
2937 2938
};

2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951
/*
 * The following two functions "fix" the issue where there are more pids
 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
 * TODO: replace with a kernel-wide solution to this problem
 */
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void *pidlist_allocate(int count)
{
	if (PIDLIST_TOO_LARGE(count))
		return vmalloc(count * sizeof(pid_t));
	else
		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
2952

2953 2954 2955 2956 2957 2958 2959 2960
static void pidlist_free(void *p)
{
	if (is_vmalloc_addr(p))
		vfree(p);
	else
		kfree(p);
}

2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987
/*
 * Used to destroy all pidlists lingering waiting for destroy timer.  None
 * should be left afterwards.
 */
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
{
	struct cgroup_pidlist *l, *tmp_l;

	mutex_lock(&cgrp->pidlist_mutex);
	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
	mutex_unlock(&cgrp->pidlist_mutex);

	flush_workqueue(cgroup_pidlist_destroy_wq);
	BUG_ON(!list_empty(&cgrp->pidlists));
}

static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
						destroy_dwork);
	struct cgroup_pidlist *tofree = NULL;

	mutex_lock(&l->owner->pidlist_mutex);

	/*
2988 2989
	 * Destroy iff we didn't get queued again.  The state won't change
	 * as destroy_dwork can only be queued while locked.
2990
	 */
2991
	if (!delayed_work_pending(dwork)) {
2992 2993 2994 2995 2996 2997 2998 2999 3000 3001
		list_del(&l->links);
		pidlist_free(l->list);
		put_pid_ns(l->key.ns);
		tofree = l;
	}

	mutex_unlock(&l->owner->pidlist_mutex);
	kfree(tofree);
}

3002
/*
3003
 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3004
 * Returns the number of unique elements.
3005
 */
3006
static int pidlist_uniq(pid_t *list, int length)
3007
{
3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031
	int src, dest = 1;

	/*
	 * we presume the 0th element is unique, so src starts at 1. trivial
	 * edge cases first; no work needs to be done for either
	 */
	if (length == 0 || length == 1)
		return length;
	/* src and dest walk down the list; dest counts unique elements */
	for (src = 1; src < length; src++) {
		/* find next unique element */
		while (list[src] == list[src-1]) {
			src++;
			if (src == length)
				goto after;
		}
		/* dest always points to where the next unique element goes */
		list[dest] = list[src];
		dest++;
	}
after:
	return dest;
}

3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064
/*
 * The two pid files - task and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  As pid order is
 * different per namespace, each namespace needs differently sorted list,
 * making it impossible to use, for example, single rbtree of member tasks
 * sorted by task pointer.  As pidlists can be fairly large, allocating one
 * per open file is dangerous, so cgroup had to implement shared pool of
 * pidlists keyed by cgroup and namespace.
 *
 * All this extra complexity was caused by the original implementation
 * committing to an entirely unnecessary property.  In the long term, we
 * want to do away with it.  Explicitly scramble sort order if
 * sane_behavior so that no such expectation exists in the new interface.
 *
 * Scrambling is done by swapping every two consecutive bits, which is
 * non-identity one-to-one mapping which disturbs sort order sufficiently.
 */
static pid_t pid_fry(pid_t pid)
{
	unsigned a = pid & 0x55555555;
	unsigned b = pid & 0xAAAAAAAA;

	return (a << 1) | (b >> 1);
}
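/*
 * Worked example (illustrative): pid 1 (binary 01) maps to 2 (binary 10),
 * pid 2 maps to 1, pid 3 maps to 3 and pid 4 (binary 100) maps to 8
 * (binary 1000).  Applying pid_fry() twice restores the original pid, so
 * the mapping is its own inverse.
 */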

static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
	if (cgroup_sane_behavior(cgrp))
		return pid_fry(pid);
	else
		return pid;
}

3065 3066 3067 3068 3069
static int cmppid(const void *a, const void *b)
{
	return *(pid_t *)a - *(pid_t *)b;
}

3070 3071 3072 3073 3074
static int fried_cmppid(const void *a, const void *b)
{
	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
}

T
Tejun Heo 已提交
3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
						  enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;
	/* don't need task_nsproxy() if we're looking at ourself */
	struct pid_namespace *ns = task_active_pid_ns(current);

	lockdep_assert_held(&cgrp->pidlist_mutex);

	list_for_each_entry(l, &cgrp->pidlists, links)
		if (l->key.type == type && l->key.ns == ns)
			return l;
	return NULL;
}

3090 3091 3092 3093 3094 3095
/*
 * find the appropriate pidlist for our purpose (given procs vs tasks)
 * returns with the lock on that pidlist already held, and takes care
 * of the use count, or returns NULL with no locks held if we're out of
 * memory.
 */
T
Tejun Heo 已提交
3096 3097
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
						enum cgroup_filetype type)
3098 3099
{
	struct cgroup_pidlist *l;
3100

T
Tejun Heo 已提交
3101 3102 3103 3104 3105 3106
	lockdep_assert_held(&cgrp->pidlist_mutex);

	l = cgroup_pidlist_find(cgrp, type);
	if (l)
		return l;

3107
	/* entry not found; create a new one */
3108
	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
T
Tejun Heo 已提交
3109
	if (!l)
3110
		return l;
T
Tejun Heo 已提交
3111

3112
	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3113
	l->key.type = type;
T
Tejun Heo 已提交
3114 3115
	/* don't need task_nsproxy() if we're looking at ourself */
	l->key.ns = get_pid_ns(task_active_pid_ns(current));
3116 3117 3118 3119 3120
	l->owner = cgrp;
	list_add(&l->links, &cgrp->pidlists);
	return l;
}

3121 3122 3123
/*
 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
 */
3124 3125
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
			      struct cgroup_pidlist **lp)
3126 3127 3128 3129
{
	pid_t *array;
	int length;
	int pid, n = 0; /* used for populating the array */
3130
	struct css_task_iter it;
3131
	struct task_struct *tsk;
3132 3133
	struct cgroup_pidlist *l;

3134 3135
	lockdep_assert_held(&cgrp->pidlist_mutex);

3136 3137 3138 3139 3140 3141 3142
	/*
	 * If cgroup gets more users after we read count, we won't have
	 * enough space - tough.  This race is indistinguishable to the
	 * caller from the case that the additional cgroup users didn't
	 * show up until sometime later on.
	 */
	length = cgroup_task_count(cgrp);
3143
	array = pidlist_allocate(length);
3144 3145 3146
	if (!array)
		return -ENOMEM;
	/* now, populate the array */
3147 3148
	css_task_iter_start(&cgrp->dummy_css, &it);
	while ((tsk = css_task_iter_next(&it))) {
3149
		if (unlikely(n == length))
3150
			break;
3151
		/* get tgid or pid for procs or tasks file respectively */
3152 3153 3154 3155
		if (type == CGROUP_FILE_PROCS)
			pid = task_tgid_vnr(tsk);
		else
			pid = task_pid_vnr(tsk);
3156 3157
		if (pid > 0) /* make sure to only use valid results */
			array[n++] = pid;
3158
	}
3159
	css_task_iter_end(&it);
3160 3161
	length = n;
	/* now sort & (if procs) strip out duplicates */
3162 3163 3164 3165
	if (cgroup_sane_behavior(cgrp))
		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
	else
		sort(array, length, sizeof(pid_t), cmppid, NULL);
3166
	if (type == CGROUP_FILE_PROCS)
3167
		length = pidlist_uniq(array, length);
T
Tejun Heo 已提交
3168 3169

	l = cgroup_pidlist_find_create(cgrp, type);
3170
	if (!l) {
T
Tejun Heo 已提交
3171
		mutex_unlock(&cgrp->pidlist_mutex);
3172
		pidlist_free(array);
3173
		return -ENOMEM;
3174
	}
T
Tejun Heo 已提交
3175 3176

	/* store array, freeing old if necessary */
3177
	pidlist_free(l->list);
3178 3179
	l->list = array;
	l->length = length;
3180
	*lp = l;
3181
	return 0;
3182 3183
}

B
Balbir Singh 已提交
3184
/**
L
Li Zefan 已提交
3185
 * cgroupstats_build - build and fill cgroupstats
B
Balbir Singh 已提交
3186 3187 3188
 * @stats: cgroupstats to fill information into
 * @dentry: A dentry entry belonging to the cgroup for which stats have
 * been requested.
L
Li Zefan 已提交
3189 3190 3191
 *
 * Build and fill cgroupstats so that taskstats can export it to user
 * space.
B
Balbir Singh 已提交
3192 3193 3194
 */
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
T
Tejun Heo 已提交
3195
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3196
	struct cgroup *cgrp;
3197
	struct css_task_iter it;
B
Balbir Singh 已提交
3198
	struct task_struct *tsk;
3199

T
Tejun Heo 已提交
3200 3201 3202 3203 3204
	/* it should be kernfs_node belonging to cgroupfs and is a directory */
	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
	    kernfs_type(kn) != KERNFS_DIR)
		return -EINVAL;

B
Balbir Singh 已提交
3205
	/*
T
Tejun Heo 已提交
3206 3207 3208
	 * We aren't being called from kernfs and there's no guarantee on
	 * @kn->priv's validity.  For this and css_tryget_from_dir(),
	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
B
Balbir Singh 已提交
3209
	 */
T
Tejun Heo 已提交
3210 3211 3212 3213 3214 3215
	rcu_read_lock();
	cgrp = rcu_dereference(kn->priv);
	if (!cgrp) {
		rcu_read_unlock();
		return -ENOENT;
	}
B
Balbir Singh 已提交
3216

3217 3218
	css_task_iter_start(&cgrp->dummy_css, &it);
	while ((tsk = css_task_iter_next(&it))) {
B
Balbir Singh 已提交
3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237
		switch (tsk->state) {
		case TASK_RUNNING:
			stats->nr_running++;
			break;
		case TASK_INTERRUPTIBLE:
			stats->nr_sleeping++;
			break;
		case TASK_UNINTERRUPTIBLE:
			stats->nr_uninterruptible++;
			break;
		case TASK_STOPPED:
			stats->nr_stopped++;
			break;
		default:
			if (delayacct_is_task_waiting_on_io(tsk))
				stats->nr_io_wait++;
			break;
		}
	}
3238
	css_task_iter_end(&it);
B
Balbir Singh 已提交
3239

T
Tejun Heo 已提交
3240 3241
	rcu_read_unlock();
	return 0;
B
Balbir Singh 已提交
3242 3243
}

3244

3245
/*
3246
 * seq_file methods for the tasks/procs files. The seq_file position is the
3247
 * next pid to display; the seq_file iterator is a pointer to the pid
3248
 * in the cgroup->l->list array.
3249
 */
3250

3251
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3252
{
3253 3254 3255 3256 3257 3258
	/*
	 * Initially we receive a position value that corresponds to
	 * one more than the last pid shown (or 0 on the first call or
	 * after a seek to the start). Use a binary-search to find the
	 * next pid to display, if any
	 */
T
Tejun Heo 已提交
3259
	struct kernfs_open_file *of = s->private;
3260
	struct cgroup *cgrp = seq_css(s)->cgroup;
3261
	struct cgroup_pidlist *l;
3262
	enum cgroup_filetype type = seq_cft(s)->private;
3263
	int index = 0, pid = *pos;
3264 3265 3266 3267 3268
	int *iter, ret;

	mutex_lock(&cgrp->pidlist_mutex);

	/*
3269
	 * !NULL @of->priv indicates that this isn't the first start()
3270
	 * after open.  If the matching pidlist is around, we can use that.
3271
	 * Look for it.  Note that @of->priv can't be used directly.  It
3272 3273
	 * could already have been destroyed.
	 */
3274 3275
	if (of->priv)
		of->priv = cgroup_pidlist_find(cgrp, type);
3276 3277 3278 3279 3280

	/*
	 * Either this is the first start() after open or the matching
	 * pidlist has been destroyed inbetween.  Create a new one.
	 */
3281 3282 3283
	if (!of->priv) {
		ret = pidlist_array_load(cgrp, type,
					 (struct cgroup_pidlist **)&of->priv);
3284 3285 3286
		if (ret)
			return ERR_PTR(ret);
	}
3287
	l = of->priv;
3288 3289

	if (pid) {
3290
		int end = l->length;
S
Stephen Rothwell 已提交
3291

3292 3293
		while (index < end) {
			int mid = (index + end) / 2;
3294
			if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3295 3296
				index = mid;
				break;
3297
			} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3298 3299 3300 3301 3302 3303
				index = mid + 1;
			else
				end = mid;
		}
	}
	/* If we're off the end of the array, we're done */
3304
	if (index >= l->length)
3305 3306
		return NULL;
	/* Update the abstract position to be the actual pid that we found */
3307
	iter = l->list + index;
3308
	*pos = cgroup_pid_fry(cgrp, *iter);
3309 3310 3311
	return iter;
}

3312
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3313
{
T
Tejun Heo 已提交
3314
	struct kernfs_open_file *of = s->private;
3315
	struct cgroup_pidlist *l = of->priv;
3316

3317 3318
	if (l)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3319
				 CGROUP_PIDLIST_DESTROY_DELAY);
3320
	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3321 3322
}

3323
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3324
{
T
Tejun Heo 已提交
3325
	struct kernfs_open_file *of = s->private;
3326
	struct cgroup_pidlist *l = of->priv;
3327 3328
	pid_t *p = v;
	pid_t *end = l->list + l->length;
3329 3330 3331 3332 3333 3334 3335 3336
	/*
	 * Advance to the next pid in the array. If this goes off the
	 * end, we're done
	 */
	p++;
	if (p >= end) {
		return NULL;
	} else {
3337
		*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3338 3339 3340 3341
		return p;
	}
}

3342
static int cgroup_pidlist_show(struct seq_file *s, void *v)
3343 3344 3345
{
	return seq_printf(s, "%d\n", *(int *)v);
}
3346

3347 3348 3349 3350 3351 3352 3353 3354 3355
/*
 * seq_operations functions for iterating on pidlists through seq_file -
 * independent of whether it's tasks or procs
 */
static const struct seq_operations cgroup_pidlist_seq_operations = {
	.start = cgroup_pidlist_start,
	.stop = cgroup_pidlist_stop,
	.next = cgroup_pidlist_next,
	.show = cgroup_pidlist_show,
};
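
/*
 * Usage sketch (illustrative): a read(2) of "tasks" or "cgroup.procs"
 * drives these ops through the seq_file core roughly as
 *
 *	cgroup_pidlist_start() -> cgroup_pidlist_show() ->
 *	cgroup_pidlist_next() -> ... -> cgroup_pidlist_stop()
 *
 * with ->start() loading or reusing the pidlist and ->stop() scheduling
 * its delayed destruction.
 */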

static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
					 struct cftype *cft)
{
	return notify_on_release(css->cgroup);
}

static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
					  struct cftype *cft, u64 val)
{
	clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
	if (val)
		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
	else
		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
	return 0;
}

static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}

static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	if (val)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	else
		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	return 0;
}

static struct cftype cgroup_base_files[] = {
	{
		.name = "cgroup.procs",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_PROCS,
		.write_u64 = cgroup_procs_write,
		.mode = S_IRUGO | S_IWUSR,
	},
	{
		.name = "cgroup.clone_children",
		.flags = CFTYPE_INSANE,
		.read_u64 = cgroup_clone_children_read,
		.write_u64 = cgroup_clone_children_write,
	},
	{
		.name = "cgroup.sane_behavior",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_sane_behavior_show,
	},

	/*
	 * Historical crazy stuff.  These don't have "cgroup."  prefix and
	 * don't exist if sane_behavior.  If you're depending on these, be
	 * prepared to be burned.
	 */
	{
		.name = "tasks",
		.flags = CFTYPE_INSANE,		/* use "procs" instead */
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_TASKS,
		.write_u64 = cgroup_tasks_write,
		.mode = S_IRUGO | S_IWUSR,
	},
	{
		.name = "notify_on_release",
		.flags = CFTYPE_INSANE,
		.read_u64 = cgroup_read_notify_on_release,
		.write_u64 = cgroup_write_notify_on_release,
	},
	{
		.name = "release_agent",
		.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_release_agent_show,
		.write_string = cgroup_release_agent_write,
		.max_write_len = PATH_MAX - 1,
	},
	{ }	/* terminate */
};
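
/*
 * Illustrative view from userspace (mount point and group name are
 * examples only):
 *
 *	echo $$ > /sys/fs/cgroup/<hier>/<grp>/cgroup.procs   # move a process
 *	cat /sys/fs/cgroup/<hier>/<grp>/tasks                # list member tids
 *
 * The legacy files marked CFTYPE_INSANE above don't exist when the
 * hierarchy uses sane_behavior.
 */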

/**
 * cgroup_populate_dir - create subsys files in a cgroup directory
 * @cgrp: target cgroup
 * @subsys_mask: mask of the subsystem ids whose files should be added
 *
 * On failure, no file is added.
 */
static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
{
	struct cgroup_subsys *ss;
	int i, ret = 0;

	/* process cftsets of each subsystem */
	for_each_subsys(ss, i) {
		struct cftype *cfts;

		if (!test_bit(i, &subsys_mask))
			continue;

		list_for_each_entry(cfts, &ss->cfts, node) {
			ret = cgroup_addrm_files(cgrp, cfts, true);
			if (ret < 0)
				goto err;
		}
	}
	return 0;
err:
	cgroup_clear_dir(cgrp, subsys_mask);
	return ret;
}

/*
 * css destruction is four-stage process.
 *
 * 1. Destruction starts.  Killing of the percpu_ref is initiated.
 *    Implemented in kill_css().
 *
 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
 *    and thus css_tryget() is guaranteed to fail, the css can be offlined
 *    by invoking offline_css().  After offlining, the base ref is put.
 *    Implemented in css_killed_work_fn().
 *
 * 3. When the percpu_ref reaches zero, the only possible remaining
 *    accessors are inside RCU read sections.  css_release() schedules the
 *    RCU callback.
 *
 * 4. After the grace period, the css can be freed.  Implemented in
 *    css_free_work_fn().
 *
 * It is actually hairier because both step 2 and 4 require process context
 * and thus involve punting to css->destroy_work adding two additional
 * steps to the already complex sequence.
 */
static void css_free_work_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, destroy_work);
	struct cgroup *cgrp = css->cgroup;

	if (css->parent)
		css_put(css->parent);

	css->ss->css_free(css);
	cgroup_put(cgrp);
}

static void css_free_rcu_fn(struct rcu_head *rcu_head)
{
	struct cgroup_subsys_state *css =
		container_of(rcu_head, struct cgroup_subsys_state, rcu_head);

	INIT_WORK(&css->destroy_work, css_free_work_fn);
	queue_work(cgroup_destroy_wq, &css->destroy_work);
}

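/* percpu_ref release callback: unlink the css and punt freeing to RCU */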
static void css_release(struct percpu_ref *ref)
{
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

	rcu_assign_pointer(css->cgroup->subsys[css->ss->id], NULL);
	call_rcu(&css->rcu_head, css_free_rcu_fn);
}

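/* initialize a css which has just been allocated by ->css_alloc() */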
static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
		     struct cgroup *cgrp)
{
	css->cgroup = cgrp;
	css->ss = ss;
	css->flags = 0;

	if (cgrp->parent)
		css->parent = cgroup_css(cgrp->parent, ss);
	else
		css->flags |= CSS_ROOT;

	BUG_ON(cgroup_css(cgrp, ss));
}

/* invoke ->css_online() on a new CSS and mark it online if successful */
static int online_css(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;
	int ret = 0;

	lockdep_assert_held(&cgroup_tree_mutex);
	lockdep_assert_held(&cgroup_mutex);

	if (ss->css_online)
		ret = ss->css_online(css);
	if (!ret) {
		css->flags |= CSS_ONLINE;
		css->cgroup->nr_css++;
		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
	}
	return ret;
}

/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
static void offline_css(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;

	lockdep_assert_held(&cgroup_tree_mutex);
	lockdep_assert_held(&cgroup_mutex);

	if (!(css->flags & CSS_ONLINE))
		return;

	if (ss->css_offline)
		ss->css_offline(css);

	css->flags &= ~CSS_ONLINE;
	css->cgroup->nr_css--;
	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
}

/**
 * create_css - create a cgroup_subsys_state
 * @cgrp: the cgroup new css will be associated with
 * @ss: the subsys of new css
 *
 * Create a new css associated with @cgrp - @ss pair.  On success, the new
 * css is online and installed in @cgrp with all interface files created.
 * Returns 0 on success, -errno on failure.
 */
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
	struct cgroup *parent = cgrp->parent;
	struct cgroup_subsys_state *css;
	int err;

	lockdep_assert_held(&cgroup_mutex);

	css = ss->css_alloc(cgroup_css(parent, ss));
	if (IS_ERR(css))
		return PTR_ERR(css);

	err = percpu_ref_init(&css->refcnt, css_release);
	if (err)
		goto err_free;

	init_css(css, ss, cgrp);

	err = cgroup_populate_dir(cgrp, 1 << ss->id);
	if (err)
		goto err_free;

	err = online_css(css);
	if (err)
		goto err_free;

	cgroup_get(cgrp);
	css_get(css->parent);

	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
	    parent->parent) {
		pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
			   current->comm, current->pid, ss->name);
		if (!strcmp(ss->name, "memory"))
			pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
		ss->warned_broken_hierarchy = true;
	}

	return 0;

err_free:
	percpu_ref_cancel_init(&css->refcnt);
	ss->css_free(css);
	return err;
}

/**
 * cgroup_create - create a cgroup
 * @parent: cgroup that will be parent of the new cgroup
 * @name: name of the new cgroup
 * @mode: mode to set on new cgroup
 */
static long cgroup_create(struct cgroup *parent, const char *name,
			  umode_t mode)
{
	struct cgroup *cgrp;
	struct cgroupfs_root *root = parent->root;
	int ssid, err;
	struct cgroup_subsys *ss;
	struct kernfs_node *kn;

	/* allocate the cgroup and its ID, 0 is reserved for the root */
	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
	if (!cgrp)
		return -ENOMEM;

	mutex_lock(&cgroup_tree_mutex);

	/*
	 * Only live parents can have children.  Note that the liveliness
	 * check isn't strictly necessary because cgroup_mkdir() and
	 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
	 * anyway so that locking is contained inside cgroup proper and we
	 * don't get nasty surprises if we ever grow another caller.
	 */
	if (!cgroup_lock_live_group(parent)) {
		err = -ENODEV;
		goto err_unlock_tree;
	}

	/*
	 * Temporarily set the pointer to NULL, so idr_find() won't return
	 * a half-baked cgroup.
	 */
	cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
	if (cgrp->id < 0) {
		err = -ENOMEM;
		goto err_unlock;
	}

	init_cgroup_housekeeping(cgrp);

	cgrp->parent = parent;
	cgrp->dummy_css.parent = &parent->dummy_css;
	cgrp->root = parent->root;

	if (notify_on_release(parent))
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);

	/* create the directory */
	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
	if (IS_ERR(kn)) {
		err = PTR_ERR(kn);
		goto err_free_id;
	}
	cgrp->kn = kn;

	/*
	 * This extra ref will be put in cgroup_free_fn() and guarantees
	 * that @cgrp->kn is always accessible.
	 */
	kernfs_get(kn);

	cgrp->serial_nr = cgroup_serial_nr_next++;

	/* allocation complete, commit to creation */
	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
	root->number_of_cgroups++;

	/*
	 * Grab a reference on the root and parent so that they don't get
	 * deleted while there are child cgroups.
	 */
	cgroup_get_root(root);
	cgroup_get(parent);

	/*
	 * @cgrp is now fully operational.  If something fails after this
	 * point, it'll be released via the normal destruction path.
	 */
	idr_replace(&root->cgroup_idr, cgrp, cgrp->id);

	err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
	if (err)
		goto err_destroy;

	/* let's create and online css's */
	for_each_subsys(ss, ssid) {
		if (root->subsys_mask & (1 << ssid)) {
			err = create_css(cgrp, ss);
			if (err)
				goto err_destroy;
		}
	}

	kernfs_activate(kn);

	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgroup_tree_mutex);

	return 0;

err_free_id:
	idr_remove(&root->cgroup_idr, cgrp->id);
err_unlock:
	mutex_unlock(&cgroup_mutex);
err_unlock_tree:
	mutex_unlock(&cgroup_tree_mutex);
	kfree(cgrp);
	return err;

err_destroy:
	cgroup_destroy_locked(cgrp);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgroup_tree_mutex);
	return err;
}

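/* kernfs ->mkdir() callback: @parent_kn is the directory of the parent cgroup */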
static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
			umode_t mode)
{
	struct cgroup *parent = parent_kn->priv;

	return cgroup_create(parent, name, mode);
}

/*
 * This is called when the refcnt of a css is confirmed to be killed.
 * css_tryget() is now guaranteed to fail.
 */
static void css_killed_work_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, destroy_work);
	struct cgroup *cgrp = css->cgroup;

	mutex_lock(&cgroup_tree_mutex);
	mutex_lock(&cgroup_mutex);

	/*
	 * css_tryget() is guaranteed to fail now.  Tell subsystems to
	 * initiate destruction.
	 */
	offline_css(css);

	/*
	 * If @cgrp is marked dead, it's waiting for refs of all css's to
	 * be disabled before proceeding to the second phase of cgroup
	 * destruction.  If we are the last one, kick it off.
	 */
	if (!cgrp->nr_css && cgroup_is_dead(cgrp))
		cgroup_destroy_css_killed(cgrp);

	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgroup_tree_mutex);

	/*
	 * Put the css refs from kill_css().  Each css holds an extra
	 * reference to the cgroup's dentry and cgroup removal proceeds
	 * regardless of css refs.  On the last put of each css, whenever
	 * that may be, the extra dentry ref is put so that dentry
	 * destruction happens only after all css's are released.
	 */
	css_put(css);
}

/* css kill confirmation processing requires process context, bounce */
static void css_killed_ref_fn(struct percpu_ref *ref)
{
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

	INIT_WORK(&css->destroy_work, css_killed_work_fn);
	queue_work(cgroup_destroy_wq, &css->destroy_work);
}

/**
 * kill_css - destroy a css
 * @css: css to destroy
 *
 * This function initiates destruction of @css by removing cgroup interface
 * files and putting its base reference.  ->css_offline() will be invoked
 * asynchronously once css_tryget() is guaranteed to fail and when the
 * reference count reaches zero, @css will be released.
 */
static void kill_css(struct cgroup_subsys_state *css)
{
	/*
	 * This must happen before css is disassociated with its cgroup.
	 * See seq_css() for details.
	 */
	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);

	/*
	 * Killing would put the base ref, but we need to keep it alive
	 * until after ->css_offline().
	 */
	css_get(css);

	/*
	 * cgroup core guarantees that, by the time ->css_offline() is
	 * invoked, no new css reference will be given out via
	 * css_tryget().  We can't simply call percpu_ref_kill() and
	 * proceed to offlining css's because percpu_ref_kill() doesn't
	 * guarantee that the ref is seen as killed on all CPUs on return.
	 *
	 * Use percpu_ref_kill_and_confirm() to get notifications as each
	 * css is confirmed to be seen as killed on all CPUs.
	 */
	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}

/**
 * cgroup_destroy_locked - the first stage of cgroup destruction
 * @cgrp: cgroup to be destroyed
 *
 * css's make use of percpu refcnts whose killing latency shouldn't be
 * exposed to userland and are RCU protected.  Also, cgroup core needs to
 * guarantee that css_tryget() won't succeed by the time ->css_offline() is
 * invoked.  To satisfy all the requirements, destruction is implemented in
 * the following two steps.
 *
 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
 *     userland visible parts and start killing the percpu refcnts of
 *     css's.  Set up so that the next stage will be kicked off once all
 *     the percpu refcnts are confirmed to be killed.
 *
 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
 *     rest of destruction.  Once all cgroup references are gone, the
 *     cgroup is RCU-freed.
 *
 * This function implements s1.  After this step, @cgrp is gone as far as
 * the userland is concerned and a new cgroup with the same name may be
 * created.  As cgroup doesn't care about the names internally, this
 * doesn't cause any problem.
 */
static int cgroup_destroy_locked(struct cgroup *cgrp)
	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
	struct cgroup *child;
	struct cgroup_subsys_state *css;
	bool empty;
	int ssid;

	lockdep_assert_held(&cgroup_tree_mutex);
	lockdep_assert_held(&cgroup_mutex);

	/*
	 * css_set_lock synchronizes access to ->cset_links and prevents
	 * @cgrp from being removed while __put_css_set() is in progress.
	 */
	read_lock(&css_set_lock);
	empty = list_empty(&cgrp->cset_links);
	read_unlock(&css_set_lock);
	if (!empty)
		return -EBUSY;

	/*
	 * Make sure there are no live children.  We can't test ->children
	 * emptiness as dead children linger on it while being destroyed;
	 * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
	 */
	empty = true;
	rcu_read_lock();
	list_for_each_entry_rcu(child, &cgrp->children, sibling) {
		empty = cgroup_is_dead(child);
		if (!empty)
			break;
	}
	rcu_read_unlock();
	if (!empty)
		return -EBUSY;

	/*
	 * Initiate massacre of all css's.  cgroup_destroy_css_killed()
	 * will be invoked to perform the rest of destruction once the
	 * percpu refs of all css's are confirmed to be killed.  Killing
	 * the css's involves removing the subsystem's files, so drop
	 * cgroup_mutex around the loop.
	 */
	mutex_unlock(&cgroup_mutex);
	for_each_css(css, ssid, cgrp)
		kill_css(css);
	mutex_lock(&cgroup_mutex);

	/*
	 * Mark @cgrp dead.  This prevents further task migration and child
	 * creation by disabling cgroup_lock_live_group().  Note that
	 * CGRP_DEAD assertion is depended upon by css_next_child() to
	 * resume iteration after dropping RCU read lock.  See
	 * css_next_child() for details.
	 */
	set_bit(CGRP_DEAD, &cgrp->flags);

	/* CGRP_DEAD is set, remove from ->release_list for the last time */
	raw_spin_lock(&release_list_lock);
	if (!list_empty(&cgrp->release_list))
		list_del_init(&cgrp->release_list);
	raw_spin_unlock(&release_list_lock);

	/*
	 * If @cgrp has css's attached, the second stage of cgroup
	 * destruction is kicked off from css_killed_work_fn() after the
	 * refs of all attached css's are killed.  If @cgrp doesn't have
	 * any css, we kick it off here.
	 */
	if (!cgrp->nr_css)
		cgroup_destroy_css_killed(cgrp);

	/* remove @cgrp directory along with the base files */
	mutex_unlock(&cgroup_mutex);

	/*
	 * There are two control paths which try to determine cgroup from
	 * dentry without going through kernfs - cgroupstats_build() and
	 * css_tryget_from_dir().  Those are supported by RCU protecting
	 * clearing of cgrp->kn->priv backpointer, which should happen
	 * after all files under it have been removed.
	 */
	kernfs_remove(cgrp->kn);	/* @cgrp has an extra ref on its kn */
	RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);

	mutex_lock(&cgroup_mutex);

	return 0;
}

/**
 * cgroup_destroy_css_killed - the second step of cgroup destruction
 * @cgrp: the cgroup being destroyed
 *
 * This function is invoked from a work item for a cgroup which is being
 * destroyed after all css's are offlined and performs the rest of
 * destruction.  This is the second step of destruction described in the
 * comment above cgroup_destroy_locked().
 */
static void cgroup_destroy_css_killed(struct cgroup *cgrp)
{
	struct cgroup *parent = cgrp->parent;

	lockdep_assert_held(&cgroup_tree_mutex);
	lockdep_assert_held(&cgroup_mutex);

	/* delete this cgroup from parent->children */
	list_del_rcu(&cgrp->sibling);

	cgroup_put(cgrp);

	set_bit(CGRP_RELEASABLE, &parent->flags);
	check_for_release(parent);
}

static int cgroup_rmdir(struct kernfs_node *kn)
{
	struct cgroup *cgrp = kn->priv;
	int ret = 0;

	/*
	 * This is self-destruction but @kn can't be removed while this
	 * callback is in progress.  Let's break active protection.  Once
	 * the protection is broken, @cgrp can be destroyed at any point.
	 * Pin it so that it stays accessible.
	 */
	cgroup_get(cgrp);
	kernfs_break_active_protection(kn);

	mutex_lock(&cgroup_tree_mutex);
	mutex_lock(&cgroup_mutex);

	/*
	 * @cgrp might already have been destroyed while we're trying to
	 * grab the mutexes.
	 */
	if (!cgroup_is_dead(cgrp))
		ret = cgroup_destroy_locked(cgrp);

	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgroup_tree_mutex);

	kernfs_unbreak_active_protection(kn);
	cgroup_put(cgrp);
	return ret;
}

static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
	.remount_fs		= cgroup_remount,
	.show_options		= cgroup_show_options,
	.mkdir			= cgroup_mkdir,
	.rmdir			= cgroup_rmdir,
	.rename			= cgroup_rename,
};

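/* boot-time registration of a controller: set up and online its root css */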
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

	mutex_lock(&cgroup_tree_mutex);
	mutex_lock(&cgroup_mutex);

	INIT_LIST_HEAD(&ss->cfts);

	/* Create the top cgroup state for this subsystem */
	ss->root = &cgroup_dummy_root;
	css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_css(css, ss, cgroup_dummy_top);

	/* Update the init_css_set to contain a subsys
	 * pointer to this state - since the subsystem is
	 * newly registered, all tasks and hence the
	 * init_css_set is in the subsystem's top cgroup. */
	init_css_set.subsys[ss->id] = css;

	need_forkexit_callback |= ss->fork || ss->exit;

	/* At system boot, before all subsystems have been
	 * registered, no tasks have been forked, so we don't
	 * need to invoke fork callbacks here. */
	BUG_ON(!list_empty(&init_task.tasks));

	BUG_ON(online_css(css));

	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgroup_tree_mutex);
}

/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
 */
int __init cgroup_init_early(void)
{
	struct cgroup_subsys *ss;
	int i;

	atomic_set(&init_css_set.refcount, 1);
	INIT_LIST_HEAD(&init_css_set.cgrp_links);
	INIT_LIST_HEAD(&init_css_set.tasks);
	INIT_HLIST_NODE(&init_css_set.hlist);
	css_set_count = 1;
	init_cgroup_root(&cgroup_dummy_root);
	cgroup_root_count = 1;
	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

	init_cgrp_cset_link.cset = &init_css_set;
	init_cgrp_cset_link.cgrp = cgroup_dummy_top;
	list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
	list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);

	for_each_subsys(ss, i) {
		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
		     ss->id, ss->name);
		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);

		ss->id = i;
		ss->name = cgroup_subsys_name[i];

		if (ss->early_init)
			cgroup_init_subsys(ss);
	}
	return 0;
}

/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 */
int __init cgroup_init(void)
{
	struct cgroup_subsys *ss;
	unsigned long key;
	int i, err;

	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));

	for_each_subsys(ss, i) {
		if (!ss->early_init)
			cgroup_init_subsys(ss);

		/*
		 * cftype registration needs kmalloc and can't be done
		 * during early_init.  Register base cftypes separately.
		 */
		if (ss->base_cftypes)
			WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
	}

	/* allocate id for the dummy hierarchy */
	mutex_lock(&cgroup_mutex);

	/* Add init_css_set to the hash table */
	key = css_set_hash(init_css_set.subsys);
	hash_add(css_set_table, &init_css_set.hlist, key);

	BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));

	err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
			0, 1, GFP_KERNEL);
	BUG_ON(err < 0);

	mutex_unlock(&cgroup_mutex);

	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
	if (!cgroup_kobj)
		return -ENOMEM;

	err = register_filesystem(&cgroup_fs_type);
	if (err < 0) {
		kobject_put(cgroup_kobj);
		return err;
	}

	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
	return 0;
}

static int __init cgroup_wq_init(void)
{
	/*
	 * There isn't much point in executing destruction path in
	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
	 *
	 * XXX: Must be ordered to make sure parent is offlined after
	 * children.  The ordering requirement is for memcg where a
	 * parent's offline may wait for a child's leading to deadlock.  In
	 * the long term, this should be fixed from memcg side.
	 *
	 * We would prefer to do this in cgroup_init() above, but that
	 * is called before init_workqueues(): so leave this until after.
	 */
	cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0);
	BUG_ON(!cgroup_destroy_wq);

	/*
	 * Used to destroy pidlists and separate to serve as flush domain.
	 * Cap @max_active to 1 too.
	 */
	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
						    0, 1);
	BUG_ON(!cgroup_pidlist_destroy_wq);

	return 0;
}
core_initcall(cgroup_wq_init);

/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
 *    doesn't really matter if tsk->cgroup changes after we read it,
 *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
 *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
 *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
 *    cgroup to top_cgroup.
 */
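
/*
 * Example output (hierarchy ids, controllers and paths are illustrative):
 *
 *	3:cpuset:/session1
 *	2:cpu,cpuacct:/session1/group0
 *	1:name=hier1:/
 *
 * i.e. "hierarchy-id:comma-separated-subsystems[,name=NAME]:cgroup-path".
 */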

/* TODO: Use a proper seq_file iterator */
int proc_cgroup_show(struct seq_file *m, void *v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf, *path;
	int retval;
	struct cgroupfs_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	retval = 0;

	mutex_lock(&cgroup_mutex);

	for_each_active_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int ssid, count = 0;

		seq_printf(m, "%d:", root->hierarchy_id);
		for_each_subsys(ss, ssid)
			if (root->subsys_mask & (1 << ssid))
				seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');
		cgrp = task_cgroup_from_root(tsk, root);
		path = cgroup_path(cgrp, buf, PATH_MAX);
		if (!path) {
			retval = -ENAMETOOLONG;
			goto out_unlock;
		}
		seq_puts(m, path);
		seq_putc(m, '\n');
	}

out_unlock:
	mutex_unlock(&cgroup_mutex);
	put_task_struct(tsk);
out_free:
	kfree(buf);
out:
	return retval;
}

/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
	struct cgroup_subsys *ss;
	int i;

	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
	/*
	 * ideally we don't want subsystems moving around while we do this.
	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
	 * subsys/hierarchy state.
	 */
	mutex_lock(&cgroup_mutex);

	for_each_subsys(ss, i)
		seq_printf(m, "%s\t%d\t%d\t%d\n",
			   ss->name, ss->root->hierarchy_id,
			   ss->root->number_of_cgroups, !ss->disabled);

	mutex_unlock(&cgroup_mutex);
	return 0;
}

static int cgroupstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, proc_cgroupstats_show, NULL);
}

static const struct file_operations proc_cgroupstats_operations = {
	.open = cgroupstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/**
 * cgroup_fork - attach newly forked task to its parent's cgroup.
 * @child: pointer to task_struct of the newly forked child process.
 *
 * Description: A task inherits its parent's cgroup at fork().
 *
 * A pointer to the shared css_set was automatically copied in
 * fork.c by dup_task_struct().  However, we ignore that copy, since
 * it was not made under the protection of RCU or cgroup_mutex, so
 * might no longer be a valid cgroup pointer.  cgroup_attach_task() might
 * have already changed current->cgroups, allowing the previously
 * referenced cgroup group to be removed and freed.
 *
 * At the point that cgroup_fork() is called, 'current' is the parent
 * task, and the passed argument 'child' points to the child task.
 */
void cgroup_fork(struct task_struct *child)
{
	task_lock(current);
	get_css_set(task_css_set(current));
	child->cgroups = current->cgroups;
	task_unlock(current);
	INIT_LIST_HEAD(&child->cg_list);
}

/**
 * cgroup_post_fork - called on a new task after adding it to the task list
 * @child: the task in question
 *
 * Adds the task to the list running through its css_set if necessary and
 * call the subsystem fork() callbacks.  Has to be after the task is
 * visible on the task list in case we race with the first call to
 * css_task_iter_start() - to guarantee that the new task ends up on its
 * list.
 */
void cgroup_post_fork(struct task_struct *child)
{
	struct cgroup_subsys *ss;
	int i;

	/*
	 * use_task_css_set_links is set to 1 before we walk the tasklist
	 * under the tasklist_lock and we read it here after we added the child
	 * to the tasklist under the tasklist_lock as well. If the child wasn't
	 * yet in the tasklist when we walked through it from
	 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
	 * should be visible now due to the paired locking and barriers implied
	 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
	 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
	 * lock on fork.
	 */
	if (use_task_css_set_links) {
		write_lock(&css_set_lock);
		task_lock(child);
		if (list_empty(&child->cg_list))
			list_add(&child->cg_list, &task_css_set(child)->tasks);
		task_unlock(child);
		write_unlock(&css_set_lock);
	}

	/*
	 * Call ss->fork().  This must happen after @child is linked on
	 * css_set; otherwise, @child might change state between ->fork()
	 * and addition to css_set.
	 */
	if (need_forkexit_callback) {
		for_each_subsys(ss, i)
			if (ss->fork)
				ss->fork(child);
	}
}

/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 * @run_callbacks: run exit callbacks?
 *
 * Description: Detach cgroup from @tsk and release it.
 *
 * Note that cgroups marked notify_on_release force every task in
 * them to take the global cgroup_mutex mutex when exiting.
 * This could impact scaling on very large systems.  Be reluctant to
 * use notify_on_release cgroups where very high task exit scaling
 * is required on large systems.
 *
 * the_top_cgroup_hack:
 *
 *    Set the exiting task's cgroup to the root cgroup (top_cgroup).
 *
 *    We call cgroup_exit() while the task is still competent to
 *    handle notify_on_release(), then leave the task attached to the
 *    root cgroup in each hierarchy for the remainder of its exit.
 *
 *    To do this properly, we would increment the reference count on
 *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
 *    code we would add a second cgroup function call, to drop that
 *    reference.  This would just create an unnecessary hot spot on
 *    the top_cgroup reference count, to no avail.
 *
 *    Normally, holding a reference to a cgroup without bumping its
 *    count is unsafe.   The cgroup could go away, or someone could
 *    attach us to a different cgroup, decrementing the count on
 *    the first cgroup that we never incremented.  But in this case,
 *    top_cgroup isn't going away, and either task has PF_EXITING set,
 *    which wards off any cgroup_attach_task() attempts, or task is a failed
 *    fork, never visible to cgroup_attach_task.
 */
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	/*
	 * Unlink from the css_set task list if necessary.
	 * Optimistically check cg_list before taking
	 * css_set_lock
	 */
	if (!list_empty(&tsk->cg_list)) {
		write_lock(&css_set_lock);
		if (!list_empty(&tsk->cg_list))
			list_del_init(&tsk->cg_list);
		write_unlock(&css_set_lock);
	}

	/* Reassign the task to the init_css_set. */
	task_lock(tsk);
	cset = task_css_set(tsk);
	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);

	if (run_callbacks && need_forkexit_callback) {
		/* see cgroup_post_fork() for details */
		for_each_subsys(ss, i) {
			if (ss->exit) {
				struct cgroup_subsys_state *old_css = cset->subsys[i];
				struct cgroup_subsys_state *css = task_css(tsk, i);

				ss->exit(css, old_css, tsk);
			}
		}
	}
	task_unlock(tsk);

	put_css_set_taskexit(cset);
}

static void check_for_release(struct cgroup *cgrp)
{
	if (cgroup_is_releasable(cgrp) &&
	    list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
		/*
		 * Control Group is currently removable. If it's not
		 * already queued for a userspace notification, queue
		 * it now
		 */
		int need_schedule_work = 0;

		raw_spin_lock(&release_list_lock);
		if (!cgroup_is_dead(cgrp) &&
		    list_empty(&cgrp->release_list)) {
			list_add(&cgrp->release_list, &release_list);
			need_schedule_work = 1;
		}
		raw_spin_unlock(&release_list_lock);
		if (need_schedule_work)
			schedule_work(&release_agent_work);
	}
}

/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 */
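/*
 * For example (agent path is illustrative), releasing the cgroup at
 * "/foo/bar" on a hierarchy whose release_agent is "/sbin/cgroup-release"
 * ends up invoking the usermode helper below roughly as:
 *
 *	argv = { "/sbin/cgroup-release", "/foo/bar", NULL }
 *	envp = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL }
 */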
static void cgroup_release_agent(struct work_struct *work)
{
	BUG_ON(work != &release_agent_work);
	mutex_lock(&cgroup_mutex);
	raw_spin_lock(&release_list_lock);
	while (!list_empty(&release_list)) {
		char *argv[3], *envp[3];
		int i;
		char *pathbuf = NULL, *agentbuf = NULL, *path;
		struct cgroup *cgrp = list_entry(release_list.next,
						    struct cgroup,
						    release_list);
		list_del_init(&cgrp->release_list);
		raw_spin_unlock(&release_list_lock);
		pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
		if (!pathbuf)
			goto continue_free;
		path = cgroup_path(cgrp, pathbuf, PATH_MAX);
		if (!path)
			goto continue_free;
		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
		if (!agentbuf)
			goto continue_free;

		i = 0;
		argv[i++] = agentbuf;
		argv[i++] = path;
		argv[i] = NULL;

		i = 0;
		/* minimal command environment */
		envp[i++] = "HOME=/";
		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
		envp[i] = NULL;

		/* Drop the lock while we invoke the usermode helper,
		 * since the exec could involve hitting disk and hence
		 * be a slow process */
		mutex_unlock(&cgroup_mutex);
		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
		mutex_lock(&cgroup_mutex);
 continue_free:
		kfree(pathbuf);
		kfree(agentbuf);
		raw_spin_lock(&release_list_lock);
	}
	raw_spin_unlock(&release_list_lock);
	mutex_unlock(&cgroup_mutex);
}
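
/*
 * Example: booting with "cgroup_disable=memory" (any comma-separated list
 * of subsystem names is accepted) marks the named controllers disabled.
 */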

static int __init cgroup_disable(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		for_each_subsys(ss, i) {
			if (!strcmp(token, ss->name)) {
				ss->disabled = 1;
				printk(KERN_INFO "Disabling %s control group"
					" subsystem\n", ss->name);
				break;
			}
		}
	}
	return 1;
}
__setup("cgroup_disable=", cgroup_disable);

/**
 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.  If such css doesn't exist
 * or can't be pinned, an ERR_PTR value is returned.
 */
struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
						struct cgroup_subsys *ss)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct cgroup_subsys_state *css = NULL;
	struct cgroup *cgrp;

	/* is @dentry a cgroup dir? */
	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
	    kernfs_type(kn) != KERNFS_DIR)
		return ERR_PTR(-EBADF);

	rcu_read_lock();

	/*
	 * This path doesn't originate from kernfs and @kn could already
	 * have been or be removed at any point.  @kn->priv is RCU
	 * protected for this access.  See destroy_locked() for details.
	 */
	cgrp = rcu_dereference(kn->priv);
	if (cgrp)
		css = cgroup_css(cgrp, ss);

	if (!css || !css_tryget(css))
		css = ERR_PTR(-ENOENT);

	rcu_read_unlock();
	return css;
}

/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Returns the css if there's valid one with @id, otherwise returns NULL.
 * Should be called under rcu_read_lock().
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
	struct cgroup *cgrp;

	cgroup_assert_mutexes_or_rcu_locked();

	cgrp = idr_find(&ss->root->cgroup_idr, id);
	if (cgrp)
		return cgroup_css(cgrp, ss);
	return NULL;
}

#ifdef CONFIG_CGROUP_DEBUG
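/*
 * "debug" controller: exposes cgroup internals (css_set refcounts and
 * links, releasable state) as read-only files for debugging.
 */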
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	if (!css)
		return ERR_PTR(-ENOMEM);

	return css;
}

static void debug_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}

static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
{
	return cgroup_task_count(css->cgroup);
}

static u64 current_css_set_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
{
	return (u64)(unsigned long)current->cgroups;
}

static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
					 struct cftype *cft)
{
	u64 count;

	rcu_read_lock();
	count = atomic_read(&task_css_set(current)->refcount);
	rcu_read_unlock();
	return count;
}

static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
	struct cgrp_cset_link *link;
	struct css_set *cset;
	char *name_buf;

	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
	if (!name_buf)
		return -ENOMEM;

	read_lock(&css_set_lock);
	rcu_read_lock();
	cset = rcu_dereference(current->cgroups);
	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;
		const char *name = "?";

		if (c != cgroup_dummy_top) {
			cgroup_name(c, name_buf, NAME_MAX + 1);
			name = name_buf;
		}

		seq_printf(seq, "Root %d group %s\n",
			   c->root->hierarchy_id, name);
	}
	rcu_read_unlock();
	read_unlock(&css_set_lock);
	kfree(name_buf);
	return 0;
}

#define MAX_TASKS_SHOWN_PER_CSS 25
static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
	struct cgroup_subsys_state *css = seq_css(seq);
	struct cgrp_cset_link *link;

	read_lock(&css_set_lock);
	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
		struct css_set *cset = link->cset;
		struct task_struct *task;
		int count = 0;
		seq_printf(seq, "css_set %p\n", cset);
		list_for_each_entry(task, &cset->tasks, cg_list) {
			if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
				seq_puts(seq, "  ...\n");
				break;
			} else {
				seq_printf(seq, "  task %d\n",
					   task_pid_vnr(task));
			}
		}
	}
	read_unlock(&css_set_lock);
	return 0;
}

static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
}

static struct cftype debug_files[] =  {
	{
		.name = "taskcount",
		.read_u64 = debug_taskcount_read,
	},

	{
		.name = "current_css_set",
		.read_u64 = current_css_set_read,
	},

	{
		.name = "current_css_set_refcount",
		.read_u64 = current_css_set_refcount_read,
	},

	{
		.name = "current_css_set_cg_links",
		.seq_show = current_css_set_cg_links_read,
	},

	{
		.name = "cgroup_css_links",
		.seq_show = cgroup_css_links_read,
	},

	{
		.name = "releasable",
		.read_u64 = releasable_read,
	},

	{ }	/* terminate */
};

struct cgroup_subsys debug_cgrp_subsys = {
	.css_alloc = debug_css_alloc,
	.css_free = debug_css_free,
	.base_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */