cgroup.c 166.0 KB
Newer Older
1 2 3 4 5 6
/*
 *  Generic process-grouping system.
 *
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
7 8 9 10
 *  Notifications support
 *  Copyright (C) 2009 Nokia Corporation
 *  Author: Kirill A. Shutemov
 *
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  ---------------------------------------------------
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

29 30
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

31
#include <linux/cgroup.h>
32
#include <linux/cred.h>
33
#include <linux/ctype.h>
34
#include <linux/errno.h>
35
#include <linux/init_task.h>
36 37
#include <linux/kernel.h>
#include <linux/list.h>
38
#include <linux/magic.h>
39 40 41 42
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
43
#include <linux/proc_fs.h>
44 45 46 47
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
48
#include <linux/percpu-rwsem.h>
49
#include <linux/string.h>
50
#include <linux/sort.h>
51
#include <linux/kmod.h>
B
Balbir Singh 已提交
52 53
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
54
#include <linux/hashtable.h>
L
Li Zefan 已提交
55
#include <linux/pid_namespace.h>
56
#include <linux/idr.h>
57
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58
#include <linux/kthread.h>
T
Tejun Heo 已提交
59
#include <linux/delay.h>
A
Arun Sharma 已提交
60
#include <linux/atomic.h>
61
#include <linux/cpuset.h>
T
Tejun Heo 已提交
62
#include <net/sock.h>
63

64 65 66 67 68 69 70 71
/*
 * pidlists linger the following amount before being destroyed.  The goal
 * is avoiding frequent destruction in the middle of consecutive read calls
 * Expiring in the middle is a performance problem not a correctness one.
 * 1 sec should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY	HZ

T
Tejun Heo 已提交
72 73 74
#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)

T
Tejun Heo 已提交
75 76 77 78
/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
79
 * css_set_lock protects task->cgroups pointer, the list of css_set
80
 * objects, and the chain of tasks off each css_set.
T
Tejun Heo 已提交
81
 *
82 83
 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
 * cgroup.h can use them for lockdep annotations.
T
Tejun Heo 已提交
84
 */
T
Tejun Heo 已提交
85 86
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
87
DEFINE_SPINLOCK(css_set_lock);
88
EXPORT_SYMBOL_GPL(cgroup_mutex);
89
EXPORT_SYMBOL_GPL(css_set_lock);
T
Tejun Heo 已提交
90
#else
91
static DEFINE_MUTEX(cgroup_mutex);
92
static DEFINE_SPINLOCK(css_set_lock);
T
Tejun Heo 已提交
93 94
#endif

95
/*
96 97
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.
98 99 100
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

101 102 103 104 105 106
/*
 * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

107 108 109 110 111
/*
 * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
 * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
 */
static DEFINE_SPINLOCK(release_agent_path_lock);
112

113 114
struct percpu_rw_semaphore cgroup_threadgroup_rwsem;

T
Tejun Heo 已提交
115
#define cgroup_assert_mutex_or_rcu_locked()				\
116 117
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			   !lockdep_is_held(&cgroup_mutex),		\
T
Tejun Heo 已提交
118
			   "cgroup_mutex or RCU read lock required");
119

120 121 122 123 124 125 126 127
/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 */
static struct workqueue_struct *cgroup_destroy_wq;

128 129 130 131 132 133
/*
 * pidlist destructions need to be flushed on cgroup destruction.  Use a
 * separate workqueue as flush domain.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;

T
Tejun Heo 已提交
134
/* generate an array of cgroup subsystem pointers */
135
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
T
Tejun Heo 已提交
136
static struct cgroup_subsys *cgroup_subsys[] = {
137 138
#include <linux/cgroup_subsys.h>
};
139 140 141 142 143
#undef SUBSYS

/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
144 145
#include <linux/cgroup_subsys.h>
};
146
#undef SUBSYS
147

148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x)								\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);			\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

169
/*
170
 * The default hierarchy, reserved for the subsystems that are otherwise
171 172
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
173
 */
T
Tejun Heo 已提交
174
struct cgroup_root cgrp_dfl_root;
T
Tejun Heo 已提交
175
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
176

T
Tejun Heo 已提交
177 178 179 180
/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
T
Tejun Heo 已提交
181
static bool cgrp_dfl_visible;
182

183
/* Controllers blocked by the commandline in v1 */
184
static u16 cgroup_no_v1_mask;
185

186
/* some controllers are not supported in the default hierarchy */
T
Tejun Heo 已提交
187
static u16 cgrp_dfl_inhibit_ss_mask;
188

189 190
/* The list of hierarchy roots */

191 192
static LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
193

T
Tejun Heo 已提交
194
/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
195
static DEFINE_IDR(cgroup_hierarchy_idr);
196

197
/*
198 199 200 201 202
 * Assign a monotonically increasing serial number to csses.  It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are always sorted in the ascending serial
 * number order on the list.  Protected by cgroup_mutex.
203
 */
204
static u64 css_serial_nr_next = 1;
205

206 207 208 209
/*
 * These bitmask flags indicate whether tasks in the fork and exit paths have
 * fork/exit handlers to call. This avoids us having to do extra work in the
 * fork/exit path to check which subsystems have fork/exit callbacks.
210
 */
211 212 213
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_free_callback __read_mostly;
214

215
/* Ditto for the can_fork callback. */
216
static u16 have_canfork_callback __read_mostly;
217

218
static struct file_system_type cgroup2_fs_type;
219 220
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
221

222
static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
223
static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
224 225
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
226
static void css_task_iter_advance(struct css_task_iter *it);
227
static int cgroup_destroy_locked(struct cgroup *cgrp);
228 229
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
230
static void css_release(struct percpu_ref *ref);
231
static void kill_css(struct cgroup_subsys_state *css);
232 233
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
234
			      bool is_add);
235

236 237 238 239 240 241 242 243 244 245 246 247 248
/**
 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 * @ssid: subsys ID of interest
 *
 * cgroup_subsys_enabled() can only be used with literal subsys names which
 * is fine for individual subsystems but unsuitable for cgroup core.  This
 * is slower static_key_enabled() based test indexed by @ssid.
 */
static bool cgroup_ssid_enabled(int ssid)
{
	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}

249 250 251 252 253
static bool cgroup_ssid_no_v1(int ssid)
{
	return cgroup_no_v1_mask & (1 << ssid);
}

254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311
/**
 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 * @cgrp: the cgroup of interest
 *
 * The default hierarchy is the v2 interface of cgroup and this function
 * can be used to test whether a cgroup is on the default hierarchy for
 * cases where a subsystem should behave differnetly depending on the
 * interface version.
 *
 * The set of behaviors which change on the default hierarchy are still
 * being determined and the mount option is prefixed with __DEVEL__.
 *
 * List of changed behaviors:
 *
 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 *   and "name" are disallowed.
 *
 * - When mounting an existing superblock, mount options should match.
 *
 * - Remount is disallowed.
 *
 * - rename(2) is disallowed.
 *
 * - "tasks" is removed.  Everything should be at process granularity.  Use
 *   "cgroup.procs" instead.
 *
 * - "cgroup.procs" is not sorted.  pids will be unique unless they got
 *   recycled inbetween reads.
 *
 * - "release_agent" and "notify_on_release" are removed.  Replacement
 *   notification mechanism will be implemented.
 *
 * - "cgroup.clone_children" is removed.
 *
 * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
 *   and its descendants contain no task; otherwise, 1.  The file also
 *   generates kernfs notification which can be monitored through poll and
 *   [di]notify when the value of the file changes.
 *
 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 *   take masks of ancestors with non-empty cpus/mems, instead of being
 *   moved to an ancestor.
 *
 * - cpuset: a task can be moved into an empty cpuset, and again it takes
 *   masks of ancestors.
 *
 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
 *   is not created.
 *
 * - blkcg: blk-throttle becomes properly hierarchical.
 *
 * - debug: disallowed on the default hierarchy.
 */
static bool cgroup_on_dfl(const struct cgroup *cgrp)
{
	return cgrp->root == &cgrp_dfl_root;
}

312 313 314 315 316 317 318
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
			    gfp_t gfp_mask)
{
	int ret;

	idr_preload(gfp_mask);
T
Tejun Heo 已提交
319
	spin_lock_bh(&cgroup_idr_lock);
320
	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
T
Tejun Heo 已提交
321
	spin_unlock_bh(&cgroup_idr_lock);
322 323 324 325 326 327 328 329
	idr_preload_end();
	return ret;
}

static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
	void *ret;

T
Tejun Heo 已提交
330
	spin_lock_bh(&cgroup_idr_lock);
331
	ret = idr_replace(idr, ptr, id);
T
Tejun Heo 已提交
332
	spin_unlock_bh(&cgroup_idr_lock);
333 334 335 336 337
	return ret;
}

static void cgroup_idr_remove(struct idr *idr, int id)
{
T
Tejun Heo 已提交
338
	spin_lock_bh(&cgroup_idr_lock);
339
	idr_remove(idr, id);
T
Tejun Heo 已提交
340
	spin_unlock_bh(&cgroup_idr_lock);
341 342
}

T
Tejun Heo 已提交
343 344 345 346 347 348 349 350 351
static struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *parent_css = cgrp->self.parent;

	if (parent_css)
		return container_of(parent_css, struct cgroup, self);
	return NULL;
}

352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
/* subsystems visibly enabled on a cgroup */
static u16 cgroup_control(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	u16 root_ss_mask = cgrp->root->subsys_mask;

	if (parent)
		return parent->subtree_control;

	if (cgroup_on_dfl(cgrp))
		root_ss_mask &= ~cgrp_dfl_inhibit_ss_mask;

	return root_ss_mask;
}

/* subsystems enabled on a cgroup */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);

	if (parent)
		return parent->subtree_ss_mask;

	return cgrp->root->subsys_mask;
}

T
Tejun Heo 已提交
378 379 380
/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
381
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
T
Tejun Heo 已提交
382
 *
383 384 385 386 387
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @subsys_id enabled.
T
Tejun Heo 已提交
388 389
 */
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
390
					      struct cgroup_subsys *ss)
T
Tejun Heo 已提交
391
{
392
	if (ss)
393
		return rcu_dereference_check(cgrp->subsys[ss->id],
T
Tejun Heo 已提交
394
					lockdep_is_held(&cgroup_mutex));
395
	else
396
		return &cgrp->self;
T
Tejun Heo 已提交
397
}
398

399 400 401
/**
 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
402
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
403
 *
C
Chen Hanxiao 已提交
404
 * Similar to cgroup_css() but returns the effective css, which is defined
405 406 407 408 409 410 411 412 413 414
 * as the matching css of the nearest ancestor including self which has @ss
 * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 * function is guaranteed to return non-NULL css.
 */
static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
						struct cgroup_subsys *ss)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!ss)
415
		return &cgrp->self;
416

T
Tejun Heo 已提交
417 418
	/*
	 * This function is used while updating css associations and thus
419
	 * can't test the csses directly.  Test ss_mask.
T
Tejun Heo 已提交
420
	 */
421
	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
T
Tejun Heo 已提交
422
		cgrp = cgroup_parent(cgrp);
423 424 425
		if (!cgrp)
			return NULL;
	}
426 427

	return cgroup_css(cgrp, ss);
T
Tejun Heo 已提交
428
}
429

T
Tejun Heo 已提交
430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462
/**
 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss.  The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 * the root css is returned, so this function always returns a valid css.
 * The returned css must be put using css_put().
 */
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
					     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	do {
		css = cgroup_css(cgrp, ss);

		if (css && css_tryget_online(css))
			goto out_unlock;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	css = init_css_set.subsys[ss->id];
	css_get(css);
out_unlock:
	rcu_read_unlock();
	return css;
}

463
/* convenient tests for these bits */
464
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
465
{
466
	return !(cgrp->self.flags & CSS_ONLINE);
467 468
}

T
Tejun Heo 已提交
469 470 471 472 473 474 475 476 477 478 479
static void cgroup_get(struct cgroup *cgrp)
{
	WARN_ON_ONCE(cgroup_is_dead(cgrp));
	css_get(&cgrp->self);
}

static bool cgroup_tryget(struct cgroup *cgrp)
{
	return css_tryget(&cgrp->self);
}

T
Tejun Heo 已提交
480
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
481
{
T
Tejun Heo 已提交
482
	struct cgroup *cgrp = of->kn->parent->priv;
T
Tejun Heo 已提交
483
	struct cftype *cft = of_cft(of);
T
Tejun Heo 已提交
484 485 486 487 488 489 490 491 492 493 494 495

	/*
	 * This is open and unprotected implementation of cgroup_css().
	 * seq_css() is only called from a kernfs file operation which has
	 * an active reference on the file.  Because all the subsystem
	 * files are drained before a css is disassociated with a cgroup,
	 * the matching css from the cgroup's subsys table is guaranteed to
	 * be and stay valid until the enclosing operation is complete.
	 */
	if (cft->ss)
		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
	else
496
		return &cgrp->self;
497
}
T
Tejun Heo 已提交
498
EXPORT_SYMBOL_GPL(of_css);
499

500
static int notify_on_release(const struct cgroup *cgrp)
501
{
502
	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
503 504
}

T
Tejun Heo 已提交
505 506 507 508 509 510
/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
511
 * Should be called under cgroup_[tree_]mutex.
T
Tejun Heo 已提交
512 513 514 515 516 517 518 519
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else

520 521 522 523 524 525 526 527 528 529 530 531 532 533
/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_[tree_]mutex.
 */
#define for_each_e_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
			;						\
		else

534
/**
T
Tejun Heo 已提交
535
 * for_each_subsys - iterate all enabled cgroup subsystems
536
 * @ss: the iteration cursor
537
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
538
 */
539
#define for_each_subsys(ss, ssid)					\
T
Tejun Heo 已提交
540 541
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
542

543
/**
544
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
545 546
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
547
 * @ss_mask: the bitmask
548 549
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
550
 * @ss_mask is set.
551
 */
552 553 554
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */	\
555
		(ssid) = 0;						\
556 557 558 559 560 561 562 563 564 565
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)
566

567 568
/* iterate across the hierarchies */
#define for_each_root(root)						\
569
	list_for_each_entry((root), &cgroup_roots, root_list)
570

571 572
/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp)				\
573
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
T
Tejun Heo 已提交
574
		if (({ lockdep_assert_held(&cgroup_mutex);		\
575 576 577
		       cgroup_is_dead(child); }))			\
			;						\
		else
578

579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596
/* walk live descendants in preorder */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/* walk live descendants in postorder */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)		\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

597
static void cgroup_release_agent(struct work_struct *work);
598
static void check_for_release(struct cgroup *cgrp);
599

600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617
/*
 * A cgroup can be associated with multiple css_sets as different tasks may
 * belong to different cgroups on different hierarchies.  In the other
 * direction, a css_set is naturally associated with multiple cgroups.
 * This M:N relationship is represented by the following link structure
 * which exists for each association and allows traversing the associations
 * from both sides.
 */
struct cgrp_cset_link {
	/* the cgroup and css_set this link associates */
	struct cgroup		*cgrp;
	struct css_set		*cset;

	/* list of cgrp_cset_links anchored at cgrp->cset_links */
	struct list_head	cset_link;

	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
	struct list_head	cgrp_link;
618 619
};

620 621
/*
 * The default css_set - used by init and its children prior to any
622 623 624 625 626
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
T
Tejun Heo 已提交
627
struct css_set init_css_set = {
628 629 630 631 632 633
	.refcount		= ATOMIC_INIT(1),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
634
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
635
};
636

637
static int css_set_count	= 1;	/* 1 for init_css_set */
638

639 640 641 642 643 644
/**
 * css_set_populated - does a css_set contain any tasks?
 * @cset: target css_set
 */
static bool css_set_populated(struct css_set *cset)
{
645
	lockdep_assert_held(&css_set_lock);
646 647 648 649

	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}

650 651 652 653 654
/**
 * cgroup_update_populated - updated populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
655 656 657 658
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
 * count is propagated towards root so that a given cgroup's populated_cnt
 * is zero iff the cgroup and all its descendants don't contain any tasks.
659 660 661 662 663 664 665 666 667
 *
 * @cgrp's interface file "cgroup.populated" is zero if
 * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
 * changes from or to zero, userland is notified that the content of the
 * interface file has changed.  This can be used to detect when @cgrp and
 * its descendants become populated or empty.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
668
	lockdep_assert_held(&css_set_lock);
669 670 671 672 673 674 675 676 677 678 679 680

	do {
		bool trigger;

		if (populated)
			trigger = !cgrp->populated_cnt++;
		else
			trigger = !--cgrp->populated_cnt;

		if (!trigger)
			break;

681
		check_for_release(cgrp);
682 683
		cgroup_file_notify(&cgrp->events_file);

T
Tejun Heo 已提交
684
		cgrp = cgroup_parent(cgrp);
685 686 687
	} while (cgrp);
}

688 689 690 691 692 693 694 695 696 697 698 699
/**
 * css_set_update_populated - update populated state of a css_set
 * @cset: target css_set
 * @populated: whether @cset is populated or depopulated
 *
 * @cset is either getting the first task or losing the last.  Update the
 * ->populated_cnt of all associated cgroups accordingly.
 */
static void css_set_update_populated(struct css_set *cset, bool populated)
{
	struct cgrp_cset_link *link;

700
	lockdep_assert_held(&css_set_lock);
701 702 703 704 705

	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
		cgroup_update_populated(link->cgrp, populated);
}

T
Tejun Heo 已提交
706 707 708 709 710 711 712 713 714 715 716
/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be NULL)
 * @to_cset: new css_set @task is being moved to (may be NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 * css_set, @from_cset can be NULL.  If @task is being disassociated
 * instead of moved, @to_cset can be NULL.
 *
717 718 719
 * This function automatically handles populated_cnt updates and
 * css_task_iter adjustments but the caller is responsible for managing
 * @from_cset and @to_cset's reference counts.
T
Tejun Heo 已提交
720 721 722 723 724
 */
static void css_set_move_task(struct task_struct *task,
			      struct css_set *from_cset, struct css_set *to_cset,
			      bool use_mg_tasks)
{
725
	lockdep_assert_held(&css_set_lock);
T
Tejun Heo 已提交
726

727 728 729
	if (to_cset && !css_set_populated(to_cset))
		css_set_update_populated(to_cset, true);

T
Tejun Heo 已提交
730
	if (from_cset) {
731 732
		struct css_task_iter *it, *pos;

T
Tejun Heo 已提交
733
		WARN_ON_ONCE(list_empty(&task->cg_list));
734 735 736 737 738 739 740 741 742 743 744 745 746

		/*
		 * @task is leaving, advance task iterators which are
		 * pointing to it so that they can resume at the next
		 * position.  Advancing an iterator might remove it from
		 * the list, use safe walk.  See css_task_iter_advance*()
		 * for details.
		 */
		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
					 iters_node)
			if (it->task_pos == &task->cg_list)
				css_task_iter_advance(it);

T
Tejun Heo 已提交
747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768
		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
	} else {
		WARN_ON_ONCE(!list_empty(&task->cg_list));
	}

	if (to_cset) {
		/*
		 * We are synchronized through cgroup_threadgroup_rwsem
		 * against PF_EXITING setting such that we can't race
		 * against cgroup_exit() changing the css_set to
		 * init_css_set and dropping the old one.
		 */
		WARN_ON_ONCE(task->flags & PF_EXITING);

		rcu_assign_pointer(task->cgroups, to_cset);
		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
							     &to_cset->tasks);
	}
}

769 770 771 772 773
/*
 * hash table for cgroup groups. This improves the performance to find
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 */
774
#define CSS_SET_HASH_BITS	7
775
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
776

777
static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
778
{
779
	unsigned long key = 0UL;
780 781
	struct cgroup_subsys *ss;
	int i;
782

783
	for_each_subsys(ss, i)
784 785
		key += (unsigned long)css[i];
	key = (key >> 16) ^ key;
786

787
	return key;
788 789
}

Z
Zefan Li 已提交
790
static void put_css_set_locked(struct css_set *cset)
791
{
792
	struct cgrp_cset_link *link, *tmp_link;
T
Tejun Heo 已提交
793 794
	struct cgroup_subsys *ss;
	int ssid;
795

796
	lockdep_assert_held(&css_set_lock);
797 798

	if (!atomic_dec_and_test(&cset->refcount))
799
		return;
800

801 802
	/* This css_set is dead. unlink it and release cgroup and css refs */
	for_each_subsys(ss, ssid) {
T
Tejun Heo 已提交
803
		list_del(&cset->e_cset_node[ssid]);
804 805
		css_put(cset->subsys[ssid]);
	}
806
	hash_del(&cset->hlist);
807 808
	css_set_count--;

809 810 811
	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
812 813
		if (cgroup_parent(link->cgrp))
			cgroup_put(link->cgrp);
814
		kfree(link);
815
	}
816

817
	kfree_rcu(cset, rcu_head);
818 819
}

Z
Zefan Li 已提交
820
static void put_css_set(struct css_set *cset)
821 822 823 824 825 826 827 828 829
{
	/*
	 * Ensure that the refcount doesn't hit zero while any readers
	 * can see it. Similar to atomic_dec_and_lock(), but for an
	 * rwlock
	 */
	if (atomic_add_unless(&cset->refcount, -1, 1))
		return;

830
	spin_lock_bh(&css_set_lock);
Z
Zefan Li 已提交
831
	put_css_set_locked(cset);
832
	spin_unlock_bh(&css_set_lock);
833 834
}

835 836 837
/*
 * refcounted get/put for css_set objects
 */
838
static inline void get_css_set(struct css_set *cset)
839
{
840
	atomic_inc(&cset->refcount);
841 842
}

843
/**
844
 * compare_css_sets - helper function for find_existing_css_set().
845 846
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
847 848 849
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
L
Li Zefan 已提交
850
 * Returns true if "cset" matches "old_cset" except for the hierarchy
851 852
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
853 854
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
855 856 857 858 859
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct list_head *l1, *l2;

860 861 862 863 864 865
	/*
	 * On the default hierarchy, there can be csets which are
	 * associated with the same set of cgroups but different csses.
	 * Let's first ensure that csses match.
	 */
	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
866 867 868 869
		return false;

	/*
	 * Compare cgroup pointers in order to distinguish between
870 871 872
	 * different cgroups in hierarchies.  As different cgroups may
	 * share the same effective css, this comparison is always
	 * necessary.
873
	 */
874 875
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
876
	while (1) {
877
		struct cgrp_cset_link *link1, *link2;
878
		struct cgroup *cgrp1, *cgrp2;
879 880 881 882

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length. */
883 884
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
885 886
			break;
		} else {
887
			BUG_ON(l2 == &old_cset->cgrp_links);
888 889
		}
		/* Locate the cgroups associated with these links. */
890 891 892 893
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
894
		/* Hierarchies should be linked in the same order. */
895
		BUG_ON(cgrp1->root != cgrp2->root);
896 897 898 899 900 901 902 903

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
904 905
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
906 907
				return false;
		} else {
908
			if (cgrp1 != cgrp2)
909 910 911 912 913 914
				return false;
		}
	}
	return true;
}

915 916 917 918 919
/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
920
 */
921 922 923
static struct css_set *find_existing_css_set(struct css_set *old_cset,
					struct cgroup *cgrp,
					struct cgroup_subsys_state *template[])
924
{
925
	struct cgroup_root *root = cgrp->root;
926
	struct cgroup_subsys *ss;
927
	struct css_set *cset;
928
	unsigned long key;
929
	int i;
930

B
Ben Blum 已提交
931 932 933 934 935
	/*
	 * Build the set of subsystem state objects that we want to see in the
	 * new css_set. while subsystems can change globally, the entries here
	 * won't change, so no need for locking.
	 */
936
	for_each_subsys(ss, i) {
937
		if (root->subsys_mask & (1UL << i)) {
938 939 940 941 942
			/*
			 * @ss is in this hierarchy, so we want the
			 * effective css from @cgrp.
			 */
			template[i] = cgroup_e_css(cgrp, ss);
943
		} else {
944 945 946 947
			/*
			 * @ss is not in this hierarchy, so we don't want
			 * to change the css.
			 */
948
			template[i] = old_cset->subsys[i];
949 950 951
		}
	}

952
	key = css_set_hash(template);
953 954
	hash_for_each_possible(css_set_table, cset, hlist, key) {
		if (!compare_css_sets(cset, old_cset, cgrp, template))
955 956 957
			continue;

		/* This css_set matches what we need */
958
		return cset;
959
	}
960 961 962 963 964

	/* No existing cgroup group matched */
	return NULL;
}

965
static void free_cgrp_cset_links(struct list_head *links_to_free)
966
{
967
	struct cgrp_cset_link *link, *tmp_link;
968

969 970
	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
		list_del(&link->cset_link);
971 972 973 974
		kfree(link);
	}
}

975 976 977 978 979 980 981
/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link.  Returns 0 on success or -errno.
982
 */
983
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
984
{
985
	struct cgrp_cset_link *link;
986
	int i;
987 988 989

	INIT_LIST_HEAD(tmp_links);

990
	for (i = 0; i < count; i++) {
991
		link = kzalloc(sizeof(*link), GFP_KERNEL);
992
		if (!link) {
993
			free_cgrp_cset_links(tmp_links);
994 995
			return -ENOMEM;
		}
996
		list_add(&link->cset_link, tmp_links);
997 998 999 1000
	}
	return 0;
}

1001 1002
/**
 * link_css_set - a helper function to link a css_set to a cgroup
1003
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
1004
 * @cset: the css_set to be linked
1005 1006
 * @cgrp: the destination cgroup
 */
1007 1008
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
1009
{
1010
	struct cgrp_cset_link *link;
1011

1012
	BUG_ON(list_empty(tmp_links));
T
Tejun Heo 已提交
1013 1014 1015 1016

	if (cgroup_on_dfl(cgrp))
		cset->dfl_cgrp = cgrp;

1017 1018
	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
1019
	link->cgrp = cgrp;
1020

1021
	/*
1022 1023
	 * Always add links to the tail of the lists so that the lists are
	 * in choronological order.
1024
	 */
1025
	list_move_tail(&link->cset_link, &cgrp->cset_links);
1026
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
1027 1028 1029

	if (cgroup_parent(cgrp))
		cgroup_get(cgrp);
1030 1031
}

1032 1033 1034 1035 1036 1037 1038
/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
1039
 */
1040 1041
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
1042
{
1043
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
1044
	struct css_set *cset;
1045 1046
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
T
Tejun Heo 已提交
1047
	struct cgroup_subsys *ss;
1048
	unsigned long key;
T
Tejun Heo 已提交
1049
	int ssid;
1050

1051 1052
	lockdep_assert_held(&cgroup_mutex);

1053 1054
	/* First see if we already have a cgroup group that matches
	 * the desired set */
1055
	spin_lock_bh(&css_set_lock);
1056 1057 1058
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
1059
	spin_unlock_bh(&css_set_lock);
1060

1061 1062
	if (cset)
		return cset;
1063

1064
	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
1065
	if (!cset)
1066 1067
		return NULL;

1068
	/* Allocate all the cgrp_cset_link objects that we'll need */
1069
	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
1070
		kfree(cset);
1071 1072 1073
		return NULL;
	}

1074
	atomic_set(&cset->refcount, 1);
1075
	INIT_LIST_HEAD(&cset->cgrp_links);
1076
	INIT_LIST_HEAD(&cset->tasks);
T
Tejun Heo 已提交
1077
	INIT_LIST_HEAD(&cset->mg_tasks);
1078
	INIT_LIST_HEAD(&cset->mg_preload_node);
1079
	INIT_LIST_HEAD(&cset->mg_node);
1080
	INIT_LIST_HEAD(&cset->task_iters);
1081
	INIT_HLIST_NODE(&cset->hlist);
1082 1083 1084

	/* Copy the set of subsystem state objects generated in
	 * find_existing_css_set() */
1085
	memcpy(cset->subsys, template, sizeof(cset->subsys));
1086

1087
	spin_lock_bh(&css_set_lock);
1088
	/* Add reference counts and links from the new css_set. */
1089
	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
1090
		struct cgroup *c = link->cgrp;
1091

1092 1093
		if (c->root == cgrp->root)
			c = cgrp;
1094
		link_css_set(&tmp_links, cset, c);
1095
	}
1096

1097
	BUG_ON(!list_empty(&tmp_links));
1098 1099

	css_set_count++;
1100

T
Tejun Heo 已提交
1101
	/* Add @cset to the hash table */
1102 1103
	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);
1104

1105 1106 1107
	for_each_subsys(ss, ssid) {
		struct cgroup_subsys_state *css = cset->subsys[ssid];

T
Tejun Heo 已提交
1108
		list_add_tail(&cset->e_cset_node[ssid],
1109 1110 1111
			      &css->cgroup->e_csets[ssid]);
		css_get(css);
	}
T
Tejun Heo 已提交
1112

1113
	spin_unlock_bh(&css_set_lock);
1114

1115
	return cset;
1116 1117
}

1118
static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
1119
{
1120
	struct cgroup *root_cgrp = kf_root->kn->priv;
T
Tejun Heo 已提交
1121

1122
	return root_cgrp->root;
T
Tejun Heo 已提交
1123 1124
}

1125
static int cgroup_init_root_id(struct cgroup_root *root)
1126 1127 1128 1129 1130
{
	int id;

	lockdep_assert_held(&cgroup_mutex);

1131
	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
1132 1133 1134 1135 1136 1137 1138
	if (id < 0)
		return id;

	root->hierarchy_id = id;
	return 0;
}

1139
static void cgroup_exit_root_id(struct cgroup_root *root)
1140 1141 1142 1143 1144 1145 1146 1147 1148
{
	lockdep_assert_held(&cgroup_mutex);

	if (root->hierarchy_id) {
		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
		root->hierarchy_id = 0;
	}
}

1149
static void cgroup_free_root(struct cgroup_root *root)
1150 1151
{
	if (root) {
C
Chen Hanxiao 已提交
1152
		/* hierarchy ID should already have been released */
1153 1154 1155 1156 1157 1158 1159
		WARN_ON_ONCE(root->hierarchy_id);

		idr_destroy(&root->cgroup_idr);
		kfree(root);
	}
}

1160
static void cgroup_destroy_root(struct cgroup_root *root)
1161
{
1162
	struct cgroup *cgrp = &root->cgrp;
1163 1164
	struct cgrp_cset_link *link, *tmp_link;

1165
	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1166

T
Tejun Heo 已提交
1167
	BUG_ON(atomic_read(&root->nr_cgrps));
1168
	BUG_ON(!list_empty(&cgrp->self.children));
1169 1170

	/* Rebind all subsystems back to the default hierarchy */
1171
	WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
1172 1173

	/*
1174 1175
	 * Release all the links from cset_links to this hierarchy's
	 * root cgroup
1176
	 */
1177
	spin_lock_bh(&css_set_lock);
1178 1179 1180 1181 1182 1183

	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		kfree(link);
	}
1184 1185

	spin_unlock_bh(&css_set_lock);
1186 1187 1188 1189 1190 1191 1192 1193 1194 1195

	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		cgroup_root_count--;
	}

	cgroup_exit_root_id(root);

	mutex_unlock(&cgroup_mutex);

T
Tejun Heo 已提交
1196
	kernfs_destroy_root(root->kf_root);
1197 1198 1199
	cgroup_free_root(root);
}

1200 1201
/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1202
					    struct cgroup_root *root)
1203 1204 1205
{
	struct cgroup *res = NULL;

1206
	lockdep_assert_held(&cgroup_mutex);
1207
	lockdep_assert_held(&css_set_lock);
1208

1209
	if (cset == &init_css_set) {
1210
		res = &root->cgrp;
1211
	} else {
1212 1213 1214
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1215
			struct cgroup *c = link->cgrp;
1216

1217 1218 1219 1220 1221 1222
			if (c->root == root) {
				res = c;
				break;
			}
		}
	}
1223

1224 1225 1226 1227
	BUG_ON(!res);
	return res;
}

1228
/*
1229
 * Return the cgroup for "task" from the given hierarchy. Must be
1230
 * called with cgroup_mutex and css_set_lock held.
1231 1232
 */
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
1233
					    struct cgroup_root *root)
1234 1235 1236 1237 1238 1239 1240 1241 1242
{
	/*
	 * No need to lock the task - since we hold cgroup_mutex the
	 * task can't change groups, so the only thing that can happen
	 * is that it exits and its css is set back to init_css_set.
	 */
	return cset_cgroup_from_root(task_css_set(task), root);
}

1243 1244 1245 1246 1247 1248
/*
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing.  However, if the count goes to zero, then only
1249
 * cgroup_attach_task() can increment it again.  Because a count of zero
1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count).  So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero. Similarly, if
 * a task holds cgroup_mutex on a cgroup with zero count, it
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty.  Since all
 * tasks in the system use _some_ cgroup, and since there is always at
1261
 * least one task in the system (init, pid == 1), therefore, root cgroup
1262
 * always has either children cgroups and/or using tasks.  So we don't
1263
 * need a special hack to ensure that root cgroup cannot be deleted.
1264 1265
 *
 * P.S.  One more locking exception.  RCU is used to guard the
1266
 * update of a tasks cgroup pointer by cgroup_attach_task()
1267 1268
 */

T
Tejun Heo 已提交
1269
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1270
static const struct file_operations proc_cgroupstats_operations;
1271

T
Tejun Heo 已提交
1272 1273
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
			      char *buf)
1274
{
1275 1276
	struct cgroup_subsys *ss = cft->ss;

T
Tejun Heo 已提交
1277 1278 1279
	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1280 1281
			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
			 cft->name);
T
Tejun Heo 已提交
1282 1283 1284
	else
		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
	return buf;
1285 1286
}

1287 1288 1289 1290
/**
 * cgroup_file_mode - deduce file mode of a control file
 * @cft: the control file in question
 *
1291
 * S_IRUGO for read, S_IWUSR for write.
1292 1293
 */
static umode_t cgroup_file_mode(const struct cftype *cft)
1294
{
1295
	umode_t mode = 0;
1296

1297 1298 1299
	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
		mode |= S_IRUGO;

1300 1301 1302 1303 1304 1305
	if (cft->write_u64 || cft->write_s64 || cft->write) {
		if (cft->flags & CFTYPE_WORLD_WRITABLE)
			mode |= S_IWUGO;
		else
			mode |= S_IWUSR;
	}
1306 1307

	return mode;
1308 1309
}

1310
/**
1311
 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
1312
 * @subtree_control: the new subtree_control mask to consider
1313
 * @this_ss_mask: available subsystems
1314 1315 1316 1317 1318
 *
 * On the default hierarchy, a subsystem may request other subsystems to be
 * enabled together through its ->depends_on mask.  In such cases, more
 * subsystems than specified in "cgroup.subtree_control" may be enabled.
 *
1319
 * This function calculates which subsystems need to be enabled if
1320
 * @subtree_control is to be applied while restricted to @this_ss_mask.
1321
 */
1322
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
1323
{
1324
	u16 cur_ss_mask = subtree_control;
1325 1326 1327 1328 1329 1330
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	while (true) {
1331
		u16 new_ss_mask = cur_ss_mask;
1332

1333
		do_each_subsys_mask(ss, ssid, cur_ss_mask) {
1334
			new_ss_mask |= ss->depends_on;
1335
		} while_each_subsys_mask();
1336 1337 1338 1339 1340 1341

		/*
		 * Mask out subsystems which aren't available.  This can
		 * happen only if some depended-upon subsystems were bound
		 * to non-default hierarchies.
		 */
1342
		new_ss_mask &= this_ss_mask;
1343 1344 1345 1346 1347 1348

		if (new_ss_mask == cur_ss_mask)
			break;
		cur_ss_mask = new_ss_mask;
	}

1349 1350 1351
	return cur_ss_mask;
}

1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362
/**
 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 *
 * This helper undoes cgroup_kn_lock_live() and should be invoked before
 * the method finishes if locking succeeded.  Note that once this function
 * returns the cgroup returned by cgroup_kn_lock_live() may become
 * inaccessible any time.  If the caller intends to continue to access the
 * cgroup, it should pin it before invoking this function.
 */
static void cgroup_kn_unlock(struct kernfs_node *kn)
1363
{
1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	cgroup_put(cgrp);
1375 1376
}

1377 1378 1379
/**
 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
1380
 * @drain_offline: perform offline draining on the cgroup
1381 1382 1383 1384 1385
 *
 * This helper is to be used by a cgroup kernfs method currently servicing
 * @kn.  It breaks the active protection, performs cgroup locking and
 * verifies that the associated cgroup is alive.  Returns the cgroup if
 * alive; otherwise, %NULL.  A successful return should be undone by a
1386 1387
 * matching cgroup_kn_unlock() invocation.  If @drain_offline is %true, the
 * cgroup is drained of offlining csses before return.
1388 1389 1390 1391 1392 1393
 *
 * Any cgroup kernfs method implementation which requires locking the
 * associated cgroup should use this helper.  It avoids nesting cgroup
 * locking under kernfs active protection and allows all kernfs operations
 * including self-removal.
 */
1394 1395
static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
					  bool drain_offline)
T
Tejun Heo 已提交
1396
{
1397 1398 1399 1400 1401 1402
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;
T
Tejun Heo 已提交
1403

1404
	/*
1405
	 * We're gonna grab cgroup_mutex which nests outside kernfs
1406 1407 1408
	 * active_ref.  cgroup liveliness check alone provides enough
	 * protection against removal.  Ensure @cgrp stays accessible and
	 * break the active_ref protection.
1409
	 */
1410 1411
	if (!cgroup_tryget(cgrp))
		return NULL;
1412 1413
	kernfs_break_active_protection(kn);

1414 1415 1416 1417
	if (drain_offline)
		cgroup_lock_and_drain_offline(cgrp);
	else
		mutex_lock(&cgroup_mutex);
T
Tejun Heo 已提交
1418

1419 1420 1421 1422 1423
	if (!cgroup_is_dead(cgrp))
		return cgrp;

	cgroup_kn_unlock(kn);
	return NULL;
1424
}
T
Tejun Heo 已提交
1425

1426
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
T
Tejun Heo 已提交
1427
{
T
Tejun Heo 已提交
1428
	char name[CGROUP_FILE_NAME_MAX];
T
Tejun Heo 已提交
1429

1430
	lockdep_assert_held(&cgroup_mutex);
1431 1432 1433 1434 1435 1436 1437 1438 1439 1440

	if (cft->file_offset) {
		struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = NULL;
		spin_unlock_irq(&cgroup_file_kn_lock);
	}

T
Tejun Heo 已提交
1441
	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
T
Tejun Heo 已提交
1442 1443
}

1444
/**
1445 1446
 * css_clear_dir - remove subsys files in a cgroup directory
 * @css: taget css
1447
 */
1448
static void css_clear_dir(struct cgroup_subsys_state *css)
T
Tejun Heo 已提交
1449
{
1450
	struct cgroup *cgrp = css->cgroup;
1451
	struct cftype *cfts;
T
Tejun Heo 已提交
1452

1453 1454 1455 1456 1457
	if (!(css->flags & CSS_VISIBLE))
		return;

	css->flags &= ~CSS_VISIBLE;

1458 1459
	list_for_each_entry(cfts, &css->ss->cfts, node)
		cgroup_addrm_files(css, cgrp, cfts, false);
1460 1461
}

1462
/**
1463 1464
 * css_populate_dir - create subsys files in a cgroup directory
 * @css: target css
1465 1466 1467
 *
 * On failure, no file is added.
 */
1468
static int css_populate_dir(struct cgroup_subsys_state *css)
1469
{
1470
	struct cgroup *cgrp = css->cgroup;
1471 1472
	struct cftype *cfts, *failed_cfts;
	int ret;
1473

1474
	if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
1475 1476
		return 0;

1477 1478 1479 1480 1481
	if (!css->ss) {
		if (cgroup_on_dfl(cgrp))
			cfts = cgroup_dfl_base_files;
		else
			cfts = cgroup_legacy_base_files;
1482

1483 1484
		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
	}
1485

1486 1487 1488 1489 1490
	list_for_each_entry(cfts, &css->ss->cfts, node) {
		ret = cgroup_addrm_files(css, cgrp, cfts, true);
		if (ret < 0) {
			failed_cfts = cfts;
			goto err;
1491 1492
		}
	}
1493 1494 1495

	css->flags |= CSS_VISIBLE;

1496 1497
	return 0;
err:
1498 1499 1500 1501 1502
	list_for_each_entry(cfts, &css->ss->cfts, node) {
		if (cfts == failed_cfts)
			break;
		cgroup_addrm_files(css, cgrp, cfts, false);
	}
1503 1504 1505
	return ret;
}

1506
static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
1507
{
1508
	struct cgroup *dcgrp = &dst_root->cgrp;
1509
	struct cgroup_subsys *ss;
T
Tejun Heo 已提交
1510
	int ssid, i, ret;
1511

T
Tejun Heo 已提交
1512
	lockdep_assert_held(&cgroup_mutex);
1513

1514
	do_each_subsys_mask(ss, ssid, ss_mask) {
1515 1516
		/* if @ss has non-root csses attached to it, can't move */
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
T
Tejun Heo 已提交
1517
			return -EBUSY;
1518

1519
		/* can't move between two non-dummy roots either */
1520
		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1521
			return -EBUSY;
1522
	} while_each_subsys_mask();
1523

1524
	do_each_subsys_mask(ss, ssid, ss_mask) {
1525 1526 1527
		struct cgroup_root *src_root = ss->root;
		struct cgroup *scgrp = &src_root->cgrp;
		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
T
Tejun Heo 已提交
1528
		struct css_set *cset;
1529

1530
		WARN_ON(!css || cgroup_css(dcgrp, ss));
1531

1532 1533 1534 1535
		/* disable from the source */
		src_root->subsys_mask &= ~(1 << ssid);
		WARN_ON(cgroup_apply_control(scgrp));
		cgroup_finalize_control(scgrp, 0);
1536

1537
		/* rebind */
1538 1539
		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
		rcu_assign_pointer(dcgrp->subsys[ssid], css);
1540
		ss->root = dst_root;
1541
		css->cgroup = dcgrp;
1542

1543
		spin_lock_bh(&css_set_lock);
T
Tejun Heo 已提交
1544 1545
		hash_for_each(css_set_table, i, cset, hlist)
			list_move_tail(&cset->e_cset_node[ss->id],
1546
				       &dcgrp->e_csets[ss->id]);
1547
		spin_unlock_bh(&css_set_lock);
T
Tejun Heo 已提交
1548

1549
		/* default hierarchy doesn't enable controllers by default */
1550
		dst_root->subsys_mask |= 1 << ssid;
1551 1552 1553
		if (dst_root == &cgrp_dfl_root) {
			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
		} else {
1554
			dcgrp->subtree_control |= 1 << ssid;
1555
			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
1556
		}
1557

1558 1559 1560 1561 1562
		ret = cgroup_apply_control(dcgrp);
		if (ret)
			pr_warn("partial failure to rebind %s controller (err=%d)\n",
				ss->name, ret);

1563 1564
		if (ss->bind)
			ss->bind(css);
1565
	} while_each_subsys_mask();
1566

1567
	kernfs_activate(dcgrp->kn);
1568 1569 1570
	return 0;
}

T
Tejun Heo 已提交
1571 1572
static int cgroup_show_options(struct seq_file *seq,
			       struct kernfs_root *kf_root)
1573
{
1574
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1575
	struct cgroup_subsys *ss;
T
Tejun Heo 已提交
1576
	int ssid;
1577

1578 1579 1580
	if (root != &cgrp_dfl_root)
		for_each_subsys(ss, ssid)
			if (root->subsys_mask & (1 << ssid))
1581
				seq_show_option(seq, ss->legacy_name, NULL);
1582
	if (root->flags & CGRP_ROOT_NOPREFIX)
1583
		seq_puts(seq, ",noprefix");
1584
	if (root->flags & CGRP_ROOT_XATTR)
A
Aristeu Rozanski 已提交
1585
		seq_puts(seq, ",xattr");
1586 1587

	spin_lock(&release_agent_path_lock);
1588
	if (strlen(root->release_agent_path))
1589 1590
		seq_show_option(seq, "release_agent",
				root->release_agent_path);
1591 1592
	spin_unlock(&release_agent_path_lock);

1593
	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1594
		seq_puts(seq, ",clone_children");
1595
	if (strlen(root->name))
1596
		seq_show_option(seq, "name", root->name);
1597 1598 1599 1600
	return 0;
}

struct cgroup_sb_opts {
1601
	u16 subsys_mask;
1602
	unsigned int flags;
1603
	char *release_agent;
1604
	bool cpuset_clone_children;
1605
	char *name;
1606 1607
	/* User explicitly requested empty subsystem */
	bool none;
1608 1609
};

B
Ben Blum 已提交
1610
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1611
{
1612 1613
	char *token, *o = data;
	bool all_ss = false, one_ss = false;
1614
	u16 mask = U16_MAX;
1615
	struct cgroup_subsys *ss;
1616
	int nr_opts = 0;
1617
	int i;
1618 1619

#ifdef CONFIG_CPUSETS
1620
	mask = ~((u16)1 << cpuset_cgrp_id);
1621
#endif
1622

1623
	memset(opts, 0, sizeof(*opts));
1624 1625

	while ((token = strsep(&o, ",")) != NULL) {
1626 1627
		nr_opts++;

1628 1629
		if (!*token)
			return -EINVAL;
1630
		if (!strcmp(token, "none")) {
1631 1632
			/* Explicitly have no subsystems */
			opts->none = true;
1633 1634 1635 1636 1637 1638 1639 1640 1641 1642
			continue;
		}
		if (!strcmp(token, "all")) {
			/* Mutually exclusive option 'all' + subsystem name */
			if (one_ss)
				return -EINVAL;
			all_ss = true;
			continue;
		}
		if (!strcmp(token, "noprefix")) {
1643
			opts->flags |= CGRP_ROOT_NOPREFIX;
1644 1645 1646
			continue;
		}
		if (!strcmp(token, "clone_children")) {
1647
			opts->cpuset_clone_children = true;
1648 1649
			continue;
		}
A
Aristeu Rozanski 已提交
1650
		if (!strcmp(token, "xattr")) {
1651
			opts->flags |= CGRP_ROOT_XATTR;
A
Aristeu Rozanski 已提交
1652 1653
			continue;
		}
1654
		if (!strncmp(token, "release_agent=", 14)) {
1655 1656 1657
			/* Specifying two release agents is forbidden */
			if (opts->release_agent)
				return -EINVAL;
1658
			opts->release_agent =
1659
				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1660 1661
			if (!opts->release_agent)
				return -ENOMEM;
1662 1663 1664
			continue;
		}
		if (!strncmp(token, "name=", 5)) {
1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681
			const char *name = token + 5;
			/* Can't specify an empty name */
			if (!strlen(name))
				return -EINVAL;
			/* Must match [\w.-]+ */
			for (i = 0; i < strlen(name); i++) {
				char c = name[i];
				if (isalnum(c))
					continue;
				if ((c == '.') || (c == '-') || (c == '_'))
					continue;
				return -EINVAL;
			}
			/* Specifying two names is forbidden */
			if (opts->name)
				return -EINVAL;
			opts->name = kstrndup(name,
1682
					      MAX_CGROUP_ROOT_NAMELEN - 1,
1683 1684 1685
					      GFP_KERNEL);
			if (!opts->name)
				return -ENOMEM;
1686 1687 1688 1689

			continue;
		}

1690
		for_each_subsys(ss, i) {
1691
			if (strcmp(token, ss->legacy_name))
1692
				continue;
1693
			if (!cgroup_ssid_enabled(i))
1694
				continue;
1695 1696
			if (cgroup_ssid_no_v1(i))
				continue;
1697 1698 1699 1700

			/* Mutually exclusive option 'all' + subsystem name */
			if (all_ss)
				return -EINVAL;
1701
			opts->subsys_mask |= (1 << i);
1702 1703 1704 1705 1706 1707 1708 1709
			one_ss = true;

			break;
		}
		if (i == CGROUP_SUBSYS_COUNT)
			return -ENOENT;
	}

1710 1711 1712 1713 1714 1715 1716
	/*
	 * If the 'all' option was specified select all the subsystems,
	 * otherwise if 'none', 'name=' and a subsystem name options were
	 * not specified, let's default to 'all'
	 */
	if (all_ss || (!one_ss && !opts->none && !opts->name))
		for_each_subsys(ss, i)
1717
			if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
1718 1719 1720 1721 1722 1723 1724 1725 1726
				opts->subsys_mask |= (1 << i);

	/*
	 * We either have to specify by name or by subsystems. (So all
	 * empty hierarchies must have a name).
	 */
	if (!opts->subsys_mask && !opts->name)
		return -EINVAL;

1727 1728 1729 1730 1731
	/*
	 * Option noprefix was introduced just for backward compatibility
	 * with the old cpuset, so we allow noprefix only if mounting just
	 * the cpuset subsystem.
	 */
1732
	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1733 1734
		return -EINVAL;

1735
	/* Can't specify "none" and some subsystems */
1736
	if (opts->subsys_mask && opts->none)
1737 1738
		return -EINVAL;

1739 1740 1741
	return 0;
}

T
Tejun Heo 已提交
1742
static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1743 1744
{
	int ret = 0;
1745
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1746
	struct cgroup_sb_opts opts;
1747
	u16 added_mask, removed_mask;
1748

1749 1750
	if (root == &cgrp_dfl_root) {
		pr_err("remount is not allowed\n");
1751 1752 1753
		return -EINVAL;
	}

1754
	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1755 1756 1757 1758 1759 1760

	/* See what subsystems are wanted */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

1761
	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1762
		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1763
			task_tgid_nr(current), current->comm);
1764

1765 1766
	added_mask = opts.subsys_mask & ~root->subsys_mask;
	removed_mask = root->subsys_mask & ~opts.subsys_mask;
1767

B
Ben Blum 已提交
1768
	/* Don't allow flags or name to change at remount */
T
Tejun Heo 已提交
1769
	if ((opts.flags ^ root->flags) ||
B
Ben Blum 已提交
1770
	    (opts.name && strcmp(opts.name, root->name))) {
1771
		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
T
Tejun Heo 已提交
1772
		       opts.flags, opts.name ?: "", root->flags, root->name);
1773 1774 1775 1776
		ret = -EINVAL;
		goto out_unlock;
	}

1777
	/* remounting is not allowed for populated hierarchies */
1778
	if (!list_empty(&root->cgrp.self.children)) {
1779
		ret = -EBUSY;
1780
		goto out_unlock;
B
Ben Blum 已提交
1781
	}
1782

1783
	ret = rebind_subsystems(root, added_mask);
1784
	if (ret)
1785
		goto out_unlock;
1786

1787
	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
1788

1789 1790
	if (opts.release_agent) {
		spin_lock(&release_agent_path_lock);
1791
		strcpy(root->release_agent_path, opts.release_agent);
1792 1793
		spin_unlock(&release_agent_path_lock);
	}
1794
 out_unlock:
1795
	kfree(opts.release_agent);
1796
	kfree(opts.name);
1797 1798 1799 1800
	mutex_unlock(&cgroup_mutex);
	return ret;
}

1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812
/*
 * To reduce the fork() overhead for systems that are not actually using
 * their cgroups capability, we don't maintain the lists running through
 * each css_set to its tasks until we see the list actually used - in other
 * words after the first mount.
 */
static bool use_task_css_set_links __read_mostly;

static void cgroup_enable_task_cg_lists(void)
{
	struct task_struct *p, *g;

1813
	spin_lock_bh(&css_set_lock);
1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835

	if (use_task_css_set_links)
		goto out_unlock;

	use_task_css_set_links = true;

	/*
	 * We need tasklist_lock because RCU is not safe against
	 * while_each_thread(). Besides, a forking task that has passed
	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
	 * is not guaranteed to have its child immediately visible in the
	 * tasklist if we walk through it with RCU.
	 */
	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		WARN_ON_ONCE(!list_empty(&p->cg_list) ||
			     task_css_set(p) != &init_css_set);

		/*
		 * We should check if the process is exiting, otherwise
		 * it will race with cgroup_exit() in that the list
		 * entry won't be deleted though the process has exited.
1836 1837
		 * Do it while holding siglock so that we don't end up
		 * racing against cgroup_exit().
1838
		 */
1839
		spin_lock_irq(&p->sighand->siglock);
1840 1841 1842
		if (!(p->flags & PF_EXITING)) {
			struct css_set *cset = task_css_set(p);

1843 1844
			if (!css_set_populated(cset))
				css_set_update_populated(cset, true);
1845
			list_add_tail(&p->cg_list, &cset->tasks);
1846 1847
			get_css_set(cset);
		}
1848
		spin_unlock_irq(&p->sighand->siglock);
1849 1850 1851
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);
out_unlock:
1852
	spin_unlock_bh(&css_set_lock);
1853
}
1854

1855 1856
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
T
Tejun Heo 已提交
1857 1858 1859
	struct cgroup_subsys *ss;
	int ssid;

1860 1861
	INIT_LIST_HEAD(&cgrp->self.sibling);
	INIT_LIST_HEAD(&cgrp->self.children);
1862
	INIT_LIST_HEAD(&cgrp->cset_links);
1863 1864
	INIT_LIST_HEAD(&cgrp->pidlists);
	mutex_init(&cgrp->pidlist_mutex);
1865
	cgrp->self.cgroup = cgrp;
1866
	cgrp->self.flags |= CSS_ONLINE;
T
Tejun Heo 已提交
1867 1868 1869

	for_each_subsys(ss, ssid)
		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1870 1871

	init_waitqueue_head(&cgrp->offline_waitq);
1872
	INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
1873
}
1874

1875
static void init_cgroup_root(struct cgroup_root *root,
1876
			     struct cgroup_sb_opts *opts)
1877
{
1878
	struct cgroup *cgrp = &root->cgrp;
1879

1880
	INIT_LIST_HEAD(&root->root_list);
1881
	atomic_set(&root->nr_cgrps, 1);
1882
	cgrp->root = root;
1883
	init_cgroup_housekeeping(cgrp);
1884
	idr_init(&root->cgroup_idr);
1885 1886 1887 1888 1889 1890

	root->flags = opts->flags;
	if (opts->release_agent)
		strcpy(root->release_agent_path, opts->release_agent);
	if (opts->name)
		strcpy(root->name, opts->name);
1891
	if (opts->cpuset_clone_children)
1892
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1893 1894
}

1895
static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
1896
{
1897
	LIST_HEAD(tmp_links);
1898
	struct cgroup *root_cgrp = &root->cgrp;
1899 1900
	struct css_set *cset;
	int i, ret;
1901

1902
	lockdep_assert_held(&cgroup_mutex);
1903

V
Vladimir Davydov 已提交
1904
	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
1905
	if (ret < 0)
T
Tejun Heo 已提交
1906
		goto out;
1907
	root_cgrp->id = ret;
1908
	root_cgrp->ancestor_ids[0] = ret;
1909

1910 1911
	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
			      GFP_KERNEL);
1912 1913 1914
	if (ret)
		goto out;

1915
	/*
1916
	 * We're accessing css_set_count without locking css_set_lock here,
1917
	 * but that's OK - it can only be increased by someone holding
1918 1919 1920
	 * cgroup_lock, and that's us.  Later rebinding may disable
	 * controllers on the default hierarchy and thus create new csets,
	 * which can't be more than the existing ones.  Allocate 2x.
1921
	 */
1922
	ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
1923
	if (ret)
1924
		goto cancel_ref;
1925

1926
	ret = cgroup_init_root_id(root);
1927
	if (ret)
1928
		goto cancel_ref;
1929

T
Tejun Heo 已提交
1930 1931 1932 1933 1934 1935 1936 1937
	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
					   KERNFS_ROOT_CREATE_DEACTIVATED,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;
1938

1939
	ret = css_populate_dir(&root_cgrp->self);
1940
	if (ret)
T
Tejun Heo 已提交
1941
		goto destroy_root;
1942

1943
	ret = rebind_subsystems(root, ss_mask);
1944
	if (ret)
T
Tejun Heo 已提交
1945
		goto destroy_root;
1946

1947 1948 1949 1950 1951 1952 1953
	/*
	 * There must be no failure case after here, since rebinding takes
	 * care of subsystems' refcounts, which are explicitly dropped in
	 * the failure exit path.
	 */
	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;
A
Al Viro 已提交
1954

1955
	/*
1956
	 * Link the root cgroup in this hierarchy into all the css_set
1957 1958
	 * objects.
	 */
1959
	spin_lock_bh(&css_set_lock);
1960
	hash_for_each(css_set_table, i, cset, hlist) {
1961
		link_css_set(&tmp_links, cset, root_cgrp);
1962 1963 1964
		if (css_set_populated(cset))
			cgroup_update_populated(root_cgrp, true);
	}
1965
	spin_unlock_bh(&css_set_lock);
1966

1967
	BUG_ON(!list_empty(&root_cgrp->self.children));
1968
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1969

T
Tejun Heo 已提交
1970
	kernfs_activate(root_cgrp->kn);
1971
	ret = 0;
T
Tejun Heo 已提交
1972
	goto out;
1973

T
Tejun Heo 已提交
1974 1975 1976 1977
destroy_root:
	kernfs_destroy_root(root->kf_root);
	root->kf_root = NULL;
exit_root_id:
1978
	cgroup_exit_root_id(root);
1979
cancel_ref:
1980
	percpu_ref_exit(&root_cgrp->self.refcnt);
T
Tejun Heo 已提交
1981
out:
1982 1983
	free_cgrp_cset_links(&tmp_links);
	return ret;
1984 1985
}

A
Al Viro 已提交
1986
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1987
			 int flags, const char *unused_dev_name,
A
Al Viro 已提交
1988
			 void *data)
1989
{
1990
	bool is_v2 = fs_type == &cgroup2_fs_type;
1991
	struct super_block *pinned_sb = NULL;
1992
	struct cgroup_subsys *ss;
1993
	struct cgroup_root *root;
1994
	struct cgroup_sb_opts opts;
T
Tejun Heo 已提交
1995
	struct dentry *dentry;
1996
	int ret;
1997
	int i;
L
Li Zefan 已提交
1998
	bool new_sb;
1999

2000 2001 2002 2003 2004 2005
	/*
	 * The first time anyone tries to mount a cgroup, enable the list
	 * linking each css_set to its tasks and fix up all existing tasks.
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();
2006

2007 2008 2009 2010 2011
	if (is_v2) {
		if (data) {
			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
			return ERR_PTR(-EINVAL);
		}
T
Tejun Heo 已提交
2012
		cgrp_dfl_visible = true;
2013 2014 2015 2016 2017
		root = &cgrp_dfl_root;
		cgroup_get(&root->cgrp);
		goto out_mount;
	}

2018
	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
2019 2020

	/* First find the desired set of subsystems */
2021
	ret = parse_cgroupfs_options(data, &opts);
2022
	if (ret)
2023
		goto out_unlock;
2024

2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045
	/*
	 * Destruction of cgroup root is asynchronous, so subsystems may
	 * still be dying after the previous unmount.  Let's drain the
	 * dying subsystems.  We just need to ensure that the ones
	 * unmounted previously finish dying and don't care about new ones
	 * starting.  Testing ref liveliness is good enough.
	 */
	for_each_subsys(ss, i) {
		if (!(opts.subsys_mask & (1 << i)) ||
		    ss->root == &cgrp_dfl_root)
			continue;

		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
			mutex_unlock(&cgroup_mutex);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}
		cgroup_put(&ss->root->cgrp);
	}

2046
	for_each_root(root) {
T
Tejun Heo 已提交
2047
		bool name_match = false;
2048

2049
		if (root == &cgrp_dfl_root)
2050
			continue;
2051

B
Ben Blum 已提交
2052
		/*
T
Tejun Heo 已提交
2053 2054 2055
		 * If we asked for a name then it must match.  Also, if
		 * name matches but sybsys_mask doesn't, we should fail.
		 * Remember whether name matched.
B
Ben Blum 已提交
2056
		 */
T
Tejun Heo 已提交
2057 2058 2059 2060 2061
		if (opts.name) {
			if (strcmp(opts.name, root->name))
				continue;
			name_match = true;
		}
2062

2063
		/*
T
Tejun Heo 已提交
2064 2065
		 * If we asked for subsystems (or explicitly for no
		 * subsystems) then they must match.
2066
		 */
T
Tejun Heo 已提交
2067
		if ((opts.subsys_mask || opts.none) &&
2068
		    (opts.subsys_mask != root->subsys_mask)) {
T
Tejun Heo 已提交
2069 2070 2071 2072 2073
			if (!name_match)
				continue;
			ret = -EBUSY;
			goto out_unlock;
		}
2074

2075 2076
		if (root->flags ^ opts.flags)
			pr_warn("new mount options do not match the existing superblock, will be ignored\n");
2077

T
Tejun Heo 已提交
2078
		/*
2079 2080 2081 2082 2083 2084 2085 2086 2087 2088
		 * We want to reuse @root whose lifetime is governed by its
		 * ->cgrp.  Let's check whether @root is alive and keep it
		 * that way.  As cgroup_kill_sb() can happen anytime, we
		 * want to block it by pinning the sb so that @root doesn't
		 * get killed before mount is complete.
		 *
		 * With the sb pinned, tryget_live can reliably indicate
		 * whether @root can be reused.  If it's being killed,
		 * drain it.  We can use wait_queue for the wait but this
		 * path is super cold.  Let's just sleep a bit and retry.
T
Tejun Heo 已提交
2089
		 */
2090 2091 2092
		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
		if (IS_ERR(pinned_sb) ||
		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
T
Tejun Heo 已提交
2093
			mutex_unlock(&cgroup_mutex);
2094 2095
			if (!IS_ERR_OR_NULL(pinned_sb))
				deactivate_super(pinned_sb);
T
Tejun Heo 已提交
2096
			msleep(10);
2097 2098
			ret = restart_syscall();
			goto out_free;
T
Tejun Heo 已提交
2099
		}
2100

T
Tejun Heo 已提交
2101
		ret = 0;
T
Tejun Heo 已提交
2102
		goto out_unlock;
2103 2104
	}

2105
	/*
2106 2107 2108
	 * No such thing, create a new one.  name= matching without subsys
	 * specification is allowed for already existing hierarchies but we
	 * can't create new one without subsys specification.
2109
	 */
2110 2111 2112
	if (!opts.subsys_mask && !opts.none) {
		ret = -EINVAL;
		goto out_unlock;
2113 2114
	}

2115 2116 2117
	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root) {
		ret = -ENOMEM;
T
Tejun Heo 已提交
2118
		goto out_unlock;
2119
	}
2120

2121 2122
	init_cgroup_root(root, &opts);

T
Tejun Heo 已提交
2123
	ret = cgroup_setup_root(root, opts.subsys_mask);
T
Tejun Heo 已提交
2124 2125
	if (ret)
		cgroup_free_root(root);
2126

2127
out_unlock:
2128
	mutex_unlock(&cgroup_mutex);
2129
out_free:
2130 2131
	kfree(opts.release_agent);
	kfree(opts.name);
A
Aristeu Rozanski 已提交
2132

T
Tejun Heo 已提交
2133
	if (ret)
2134
		return ERR_PTR(ret);
2135
out_mount:
2136
	dentry = kernfs_mount(fs_type, flags, root->kf_root,
2137 2138
			      is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
			      &new_sb);
L
Li Zefan 已提交
2139
	if (IS_ERR(dentry) || !new_sb)
2140
		cgroup_put(&root->cgrp);
2141 2142 2143 2144 2145 2146 2147 2148 2149 2150

	/*
	 * If @pinned_sb, we're reusing an existing root and holding an
	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
	 */
	if (pinned_sb) {
		WARN_ON(new_sb);
		deactivate_super(pinned_sb);
	}

T
Tejun Heo 已提交
2151 2152 2153 2154 2155 2156
	return dentry;
}

static void cgroup_kill_sb(struct super_block *sb)
{
	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
2157
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
T
Tejun Heo 已提交
2158

2159 2160 2161 2162
	/*
	 * If @root doesn't have any mounts or children, start killing it.
	 * This prevents new mounts by disabling percpu_ref_tryget_live().
	 * cgroup_mount() may wait for @root's release.
2163 2164
	 *
	 * And don't kill the default root.
2165
	 */
2166
	if (!list_empty(&root->cgrp.self.children) ||
2167
	    root == &cgrp_dfl_root)
2168 2169 2170 2171
		cgroup_put(&root->cgrp);
	else
		percpu_ref_kill(&root->cgrp.self.refcnt);

T
Tejun Heo 已提交
2172
	kernfs_kill_sb(sb);
2173 2174 2175 2176
}

static struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
A
Al Viro 已提交
2177
	.mount = cgroup_mount,
2178 2179 2180
	.kill_sb = cgroup_kill_sb,
};

2181 2182 2183 2184 2185 2186
static struct file_system_type cgroup2_fs_type = {
	.name = "cgroup2",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
};

2187
/**
2188
 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
2189 2190 2191 2192
 * @task: target task
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
2193 2194 2195 2196 2197
 * Determine @task's cgroup on the first (the one with the lowest non-zero
 * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
 * function grabs cgroup_mutex and shouldn't be used inside locks used by
 * cgroup controller callbacks.
 *
T
Tejun Heo 已提交
2198
 * Return value is the same as kernfs_path().
2199
 */
T
Tejun Heo 已提交
2200
char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2201
{
2202
	struct cgroup_root *root;
2203
	struct cgroup *cgrp;
T
Tejun Heo 已提交
2204 2205
	int hierarchy_id = 1;
	char *path = NULL;
2206 2207

	mutex_lock(&cgroup_mutex);
2208
	spin_lock_bh(&css_set_lock);
2209

2210 2211
	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);

2212 2213
	if (root) {
		cgrp = task_cgroup_from_root(task, root);
T
Tejun Heo 已提交
2214
		path = cgroup_path(cgrp, buf, buflen);
2215 2216
	} else {
		/* if no hierarchy exists, everyone is in "/" */
T
Tejun Heo 已提交
2217 2218
		if (strlcpy(buf, "/", buflen) < buflen)
			path = buf;
2219 2220
	}

2221
	spin_unlock_bh(&css_set_lock);
2222
	mutex_unlock(&cgroup_mutex);
T
Tejun Heo 已提交
2223
	return path;
2224
}
2225
EXPORT_SYMBOL_GPL(task_cgroup_path);
2226

2227
/* used to track tasks and other necessary states during migration */
2228
struct cgroup_taskset {
2229 2230 2231 2232
	/* the src and dst cset list running through cset->mg_node */
	struct list_head	src_csets;
	struct list_head	dst_csets;

2233 2234 2235
	/* the subsys currently being processed */
	int			ssid;

2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249
	/*
	 * Fields for cgroup_taskset_*() iteration.
	 *
	 * Before migration is committed, the target migration tasks are on
	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
	 * or ->dst_csets depending on whether migration is committed.
	 *
	 * ->cur_csets and ->cur_task point to the current task position
	 * during iteration.
	 */
	struct list_head	*csets;
	struct css_set		*cur_cset;
	struct task_struct	*cur_task;
2250 2251
};

2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272
#define CGROUP_TASKSET_INIT(tset)	(struct cgroup_taskset){	\
	.src_csets		= LIST_HEAD_INIT(tset.src_csets),	\
	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),	\
	.csets			= &tset.src_csets,			\
}

/**
 * cgroup_taskset_add - try to add a migration target task to a taskset
 * @task: target task
 * @tset: target taskset
 *
 * Add @task, which is a migration target, to @tset.  This function becomes
 * noop if @task doesn't need to be migrated.  @task's css_set should have
 * been added as a migration source and @task->cg_list will be moved from
 * the css_set's tasks list to mg_tasks one.
 */
static void cgroup_taskset_add(struct task_struct *task,
			       struct cgroup_taskset *tset)
{
	struct css_set *cset;

2273
	lockdep_assert_held(&css_set_lock);
2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294

	/* @task either already exited or can't exit until the end */
	if (task->flags & PF_EXITING)
		return;

	/* leave @task alone if post_fork() hasn't linked it yet */
	if (list_empty(&task->cg_list))
		return;

	cset = task_css_set(task);
	if (!cset->mg_src_cgrp)
		return;

	list_move_tail(&task->cg_list, &cset->mg_tasks);
	if (list_empty(&cset->mg_node))
		list_add_tail(&cset->mg_node, &tset->src_csets);
	if (list_empty(&cset->mg_dst_cset->mg_node))
		list_move_tail(&cset->mg_dst_cset->mg_node,
			       &tset->dst_csets);
}

2295 2296 2297
/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
2298
 * @dst_cssp: output variable for the destination css
2299 2300 2301
 *
 * @tset iteration is initialized and the first task is returned.
 */
2302 2303
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
					 struct cgroup_subsys_state **dst_cssp)
2304
{
2305 2306 2307
	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
	tset->cur_task = NULL;

2308
	return cgroup_taskset_next(tset, dst_cssp);
2309 2310 2311 2312 2313
}

/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
2314
 * @dst_cssp: output variable for the destination css
2315 2316 2317 2318
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 */
2319 2320
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
					struct cgroup_subsys_state **dst_cssp)
2321
{
2322 2323
	struct css_set *cset = tset->cur_cset;
	struct task_struct *task = tset->cur_task;
2324

2325 2326 2327 2328 2329 2330
	while (&cset->mg_node != tset->csets) {
		if (!task)
			task = list_first_entry(&cset->mg_tasks,
						struct task_struct, cg_list);
		else
			task = list_next_entry(task, cg_list);
2331

2332 2333 2334
		if (&task->cg_list != &cset->mg_tasks) {
			tset->cur_cset = cset;
			tset->cur_task = task;
2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346

			/*
			 * This function may be called both before and
			 * after cgroup_taskset_migrate().  The two cases
			 * can be distinguished by looking at whether @cset
			 * has its ->mg_dst_cset set.
			 */
			if (cset->mg_dst_cset)
				*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
			else
				*dst_cssp = cset->subsys[tset->ssid];

2347 2348
			return task;
		}
2349

2350 2351 2352
		cset = list_next_entry(cset, mg_node);
		task = NULL;
	}
2353

2354
	return NULL;
2355 2356
}

2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381
/**
 * cgroup_taskset_migrate - migrate a taskset to a cgroup
 * @tset: taget taskset
 * @dst_cgrp: destination cgroup
 *
 * Migrate tasks in @tset to @dst_cgrp.  This function fails iff one of the
 * ->can_attach callbacks fails and guarantees that either all or none of
 * the tasks in @tset are migrated.  @tset is consumed regardless of
 * success.
 */
static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
				  struct cgroup *dst_cgrp)
{
	struct cgroup_subsys_state *css, *failed_css = NULL;
	struct task_struct *task, *tmp_task;
	struct css_set *cset, *tmp_cset;
	int i, ret;

	/* methods shouldn't be called if no task is actually migrating */
	if (list_empty(&tset->src_csets))
		return 0;

	/* check that we can legitimately attach to the cgroup */
	for_each_e_css(css, i, dst_cgrp) {
		if (css->ss->can_attach) {
2382 2383
			tset->ssid = i;
			ret = css->ss->can_attach(tset);
2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395
			if (ret) {
				failed_css = css;
				goto out_cancel_attach;
			}
		}
	}

	/*
	 * Now that we're guaranteed success, proceed to move all tasks to
	 * the new cgroup.  There are no failure cases after here, so this
	 * is the commit point.
	 */
2396
	spin_lock_bh(&css_set_lock);
2397
	list_for_each_entry(cset, &tset->src_csets, mg_node) {
T
Tejun Heo 已提交
2398 2399 2400 2401 2402 2403 2404 2405
		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
			struct css_set *from_cset = task_css_set(task);
			struct css_set *to_cset = cset->mg_dst_cset;

			get_css_set(to_cset);
			css_set_move_task(task, from_cset, to_cset, true);
			put_css_set_locked(from_cset);
		}
2406
	}
2407
	spin_unlock_bh(&css_set_lock);
2408 2409 2410 2411 2412 2413 2414 2415

	/*
	 * Migration is committed, all target tasks are now on dst_csets.
	 * Nothing is sensitive to fork() after this point.  Notify
	 * controllers that migration is complete.
	 */
	tset->csets = &tset->dst_csets;

2416 2417 2418 2419 2420 2421
	for_each_e_css(css, i, dst_cgrp) {
		if (css->ss->attach) {
			tset->ssid = i;
			css->ss->attach(tset);
		}
	}
2422 2423 2424 2425 2426 2427 2428 2429

	ret = 0;
	goto out_release_tset;

out_cancel_attach:
	for_each_e_css(css, i, dst_cgrp) {
		if (css == failed_css)
			break;
2430 2431 2432 2433
		if (css->ss->cancel_attach) {
			tset->ssid = i;
			css->ss->cancel_attach(tset);
		}
2434 2435
	}
out_release_tset:
2436
	spin_lock_bh(&css_set_lock);
2437 2438 2439 2440 2441
	list_splice_init(&tset->dst_csets, &tset->src_csets);
	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
		list_del_init(&cset->mg_node);
	}
2442
	spin_unlock_bh(&css_set_lock);
2443 2444 2445
	return ret;
}

L
Li Zefan 已提交
2446
/**
2447 2448
 * cgroup_migrate_finish - cleanup after attach
 * @preloaded_csets: list of preloaded css_sets
B
Ben Blum 已提交
2449
 *
2450 2451
 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
 * those functions for details.
B
Ben Blum 已提交
2452
 */
2453
static void cgroup_migrate_finish(struct list_head *preloaded_csets)
B
Ben Blum 已提交
2454
{
2455
	struct css_set *cset, *tmp_cset;
B
Ben Blum 已提交
2456

2457 2458
	lockdep_assert_held(&cgroup_mutex);

2459
	spin_lock_bh(&css_set_lock);
2460 2461 2462 2463
	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
		cset->mg_src_cgrp = NULL;
		cset->mg_dst_cset = NULL;
		list_del_init(&cset->mg_preload_node);
Z
Zefan Li 已提交
2464
		put_css_set_locked(cset);
2465
	}
2466
	spin_unlock_bh(&css_set_lock);
2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478
}

/**
 * cgroup_migrate_add_src - add a migration source css_set
 * @src_cset: the source css_set to add
 * @dst_cgrp: the destination cgroup
 * @preloaded_csets: list of preloaded css_sets
 *
 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
 * @src_cset and add it to @preloaded_csets, which should later be cleaned
 * up by cgroup_migrate_finish().
 *
2479 2480 2481 2482 2483
 * This function may be called without holding cgroup_threadgroup_rwsem
 * even if the target is a process.  Threads may be created and destroyed
 * but as long as cgroup_mutex is not dropped, no new css_set can be put
 * into play and the preloaded css_sets are guaranteed to cover all
 * migrations.
2484 2485 2486 2487 2488 2489 2490 2491
 */
static void cgroup_migrate_add_src(struct css_set *src_cset,
				   struct cgroup *dst_cgrp,
				   struct list_head *preloaded_csets)
{
	struct cgroup *src_cgrp;

	lockdep_assert_held(&cgroup_mutex);
2492
	lockdep_assert_held(&css_set_lock);
2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509

	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

	if (!list_empty(&src_cset->mg_preload_node))
		return;

	WARN_ON(src_cset->mg_src_cgrp);
	WARN_ON(!list_empty(&src_cset->mg_tasks));
	WARN_ON(!list_empty(&src_cset->mg_node));

	src_cset->mg_src_cgrp = src_cgrp;
	get_css_set(src_cset);
	list_add(&src_cset->mg_preload_node, preloaded_csets);
}

/**
 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
2510
 * @dst_cgrp: the destination cgroup (may be %NULL)
2511 2512 2513 2514
 * @preloaded_csets: list of preloaded source css_sets
 *
 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
 * have been preloaded to @preloaded_csets.  This function looks up and
2515 2516 2517
 * pins all destination css_sets, links each to its source, and append them
 * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
 * source css_set is assumed to be its cgroup on the default hierarchy.
2518 2519 2520 2521 2522 2523 2524 2525 2526 2527
 *
 * This function must be called after cgroup_migrate_add_src() has been
 * called on each migration source css_set.  After migration is performed
 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
 * @preloaded_csets.
 */
static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
				      struct list_head *preloaded_csets)
{
	LIST_HEAD(csets);
2528
	struct css_set *src_cset, *tmp_cset;
2529 2530 2531

	lockdep_assert_held(&cgroup_mutex);

2532
	/*
2533
	 * Except for the root, subtree_control must be zero for a cgroup
2534 2535
	 * with tasks so that child cgroups don't compete against tasks.
	 */
T
Tejun Heo 已提交
2536
	if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
2537
	    dst_cgrp->subtree_control)
2538 2539
		return -EBUSY;

2540
	/* look up the dst cset for each src cset and link it to src */
2541
	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
2542 2543
		struct css_set *dst_cset;

2544 2545
		dst_cset = find_css_set(src_cset,
					dst_cgrp ?: src_cset->dfl_cgrp);
2546 2547 2548 2549
		if (!dst_cset)
			goto err;

		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2550 2551 2552 2553 2554 2555 2556 2557 2558

		/*
		 * If src cset equals dst, it's noop.  Drop the src.
		 * cgroup_migrate() will skip the cset too.  Note that we
		 * can't handle src == dst as some nodes are used by both.
		 */
		if (src_cset == dst_cset) {
			src_cset->mg_src_cgrp = NULL;
			list_del_init(&src_cset->mg_preload_node);
Z
Zefan Li 已提交
2559 2560
			put_css_set(src_cset);
			put_css_set(dst_cset);
2561 2562 2563
			continue;
		}

2564 2565 2566 2567 2568
		src_cset->mg_dst_cset = dst_cset;

		if (list_empty(&dst_cset->mg_preload_node))
			list_add(&dst_cset->mg_preload_node, &csets);
		else
Z
Zefan Li 已提交
2569
			put_css_set(dst_cset);
2570 2571
	}

2572
	list_splice_tail(&csets, preloaded_csets);
2573 2574 2575 2576 2577 2578 2579 2580 2581 2582
	return 0;
err:
	cgroup_migrate_finish(&csets);
	return -ENOMEM;
}

/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
2583
 * @cgrp: the destination cgroup
2584 2585
 *
 * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
2586
 * process, the caller must be holding cgroup_threadgroup_rwsem.  The
2587 2588 2589 2590 2591 2592 2593 2594 2595 2596
 * caller is also responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking group_migrate_prepare_dst() before
 * actually starting migrating.
 */
2597 2598
static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
			  struct cgroup *cgrp)
B
Ben Blum 已提交
2599
{
2600 2601
	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
	struct task_struct *task;
B
Ben Blum 已提交
2602

2603 2604 2605 2606 2607
	/*
	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
	 * already PF_EXITING could be freed from underneath us unless we
	 * take an rcu_read_lock.
	 */
2608
	spin_lock_bh(&css_set_lock);
2609
	rcu_read_lock();
2610
	task = leader;
B
Ben Blum 已提交
2611
	do {
2612
		cgroup_taskset_add(task, &tset);
2613 2614
		if (!threadgroup)
			break;
2615
	} while_each_thread(leader, task);
2616
	rcu_read_unlock();
2617
	spin_unlock_bh(&css_set_lock);
B
Ben Blum 已提交
2618

2619
	return cgroup_taskset_migrate(&tset, cgrp);
B
Ben Blum 已提交
2620 2621
}

2622 2623 2624 2625 2626 2627
/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @dst_cgrp: the cgroup to attach to
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
2628
 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
2629 2630 2631 2632 2633 2634 2635 2636 2637
 */
static int cgroup_attach_task(struct cgroup *dst_cgrp,
			      struct task_struct *leader, bool threadgroup)
{
	LIST_HEAD(preloaded_csets);
	struct task_struct *task;
	int ret;

	/* look up all src csets */
2638
	spin_lock_bh(&css_set_lock);
2639 2640 2641 2642 2643 2644 2645 2646 2647
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
				       &preloaded_csets);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
2648
	spin_unlock_bh(&css_set_lock);
2649 2650 2651 2652

	/* prepare dst csets and commit */
	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
	if (!ret)
2653
		ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
2654 2655 2656

	cgroup_migrate_finish(&preloaded_csets);
	return ret;
B
Ben Blum 已提交
2657 2658
}

2659 2660 2661
static int cgroup_procs_write_permission(struct task_struct *task,
					 struct cgroup *dst_cgrp,
					 struct kernfs_open_file *of)
2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675
{
	const struct cred *cred = current_cred();
	const struct cred *tcred = get_task_cred(task);
	int ret = 0;

	/*
	 * even if we're attaching all tasks in the thread group, we only
	 * need to check permissions on one of them.
	 */
	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
	    !uid_eq(cred->euid, tcred->uid) &&
	    !uid_eq(cred->euid, tcred->suid))
		ret = -EACCES;

2676 2677 2678 2679 2680
	if (!ret && cgroup_on_dfl(dst_cgrp)) {
		struct super_block *sb = of->file->f_path.dentry->d_sb;
		struct cgroup *cgrp;
		struct inode *inode;

2681
		spin_lock_bh(&css_set_lock);
2682
		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2683
		spin_unlock_bh(&css_set_lock);
2684 2685 2686 2687 2688

		while (!cgroup_is_descendant(dst_cgrp, cgrp))
			cgrp = cgroup_parent(cgrp);

		ret = -ENOMEM;
2689
		inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
2690 2691 2692 2693 2694 2695
		if (inode) {
			ret = inode_permission(inode, MAY_WRITE);
			iput(inode);
		}
	}

2696 2697 2698 2699
	put_cred(tcred);
	return ret;
}

B
Ben Blum 已提交
2700 2701
/*
 * Find the task_struct of the task to attach by vpid and pass it along to the
2702
 * function to attach either it or all tasks in its threadgroup. Will lock
2703
 * cgroup_mutex and threadgroup.
2704
 */
2705 2706
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
				    size_t nbytes, loff_t off, bool threadgroup)
2707 2708
{
	struct task_struct *tsk;
2709
	struct cgroup *cgrp;
2710
	pid_t pid;
2711 2712
	int ret;

2713 2714 2715
	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
		return -EINVAL;

2716
	cgrp = cgroup_kn_lock_live(of->kn, false);
2717
	if (!cgrp)
B
Ben Blum 已提交
2718 2719
		return -ENODEV;

T
Tejun Heo 已提交
2720
	percpu_down_write(&cgroup_threadgroup_rwsem);
2721
	rcu_read_lock();
2722
	if (pid) {
2723
		tsk = find_task_by_vpid(pid);
B
Ben Blum 已提交
2724
		if (!tsk) {
S
SeongJae Park 已提交
2725
			ret = -ESRCH;
T
Tejun Heo 已提交
2726
			goto out_unlock_rcu;
2727
		}
2728
	} else {
2729
		tsk = current;
2730
	}
2731 2732

	if (threadgroup)
2733
		tsk = tsk->group_leader;
2734 2735

	/*
2736
	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2737 2738 2739
	 * trapped in a cpuset, or RT worker may be born in a cgroup
	 * with no rt_runtime allocated.  Just say no.
	 */
2740
	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2741
		ret = -EINVAL;
T
Tejun Heo 已提交
2742
		goto out_unlock_rcu;
2743 2744
	}

2745 2746 2747
	get_task_struct(tsk);
	rcu_read_unlock();

2748
	ret = cgroup_procs_write_permission(tsk, cgrp, of);
2749 2750
	if (!ret)
		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2751

2752
	put_task_struct(tsk);
T
Tejun Heo 已提交
2753 2754 2755 2756 2757 2758
	goto out_unlock_threadgroup;

out_unlock_rcu:
	rcu_read_unlock();
out_unlock_threadgroup:
	percpu_up_write(&cgroup_threadgroup_rwsem);
2759
	cgroup_kn_unlock(of->kn);
2760
	cpuset_post_attach_flush();
2761
	return ret ?: nbytes;
2762 2763
}

2764 2765 2766 2767 2768 2769 2770
/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
2771
	struct cgroup_root *root;
2772 2773
	int retval = 0;

T
Tejun Heo 已提交
2774
	mutex_lock(&cgroup_mutex);
2775
	for_each_root(root) {
2776 2777
		struct cgroup *from_cgrp;

2778
		if (root == &cgrp_dfl_root)
2779 2780
			continue;

2781
		spin_lock_bh(&css_set_lock);
2782
		from_cgrp = task_cgroup_from_root(from, root);
2783
		spin_unlock_bh(&css_set_lock);
2784

L
Li Zefan 已提交
2785
		retval = cgroup_attach_task(from_cgrp, tsk, false);
2786 2787 2788
		if (retval)
			break;
	}
T
Tejun Heo 已提交
2789
	mutex_unlock(&cgroup_mutex);
2790 2791 2792 2793 2794

	return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);

2795 2796
static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
B
Ben Blum 已提交
2797
{
2798
	return __cgroup_procs_write(of, buf, nbytes, off, false);
B
Ben Blum 已提交
2799 2800
}

2801 2802
static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
2803
{
2804
	return __cgroup_procs_write(of, buf, nbytes, off, true);
2805 2806
}

2807 2808
static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes, loff_t off)
2809
{
2810
	struct cgroup *cgrp;
2811

2812
	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2813

2814
	cgrp = cgroup_kn_lock_live(of->kn, false);
2815
	if (!cgrp)
2816
		return -ENODEV;
2817
	spin_lock(&release_agent_path_lock);
2818 2819
	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
		sizeof(cgrp->root->release_agent_path));
2820
	spin_unlock(&release_agent_path_lock);
2821
	cgroup_kn_unlock(of->kn);
2822
	return nbytes;
2823 2824
}

2825
static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2826
{
2827
	struct cgroup *cgrp = seq_css(seq)->cgroup;
2828

2829
	spin_lock(&release_agent_path_lock);
2830
	seq_puts(seq, cgrp->root->release_agent_path);
2831
	spin_unlock(&release_agent_path_lock);
2832 2833 2834 2835
	seq_putc(seq, '\n');
	return 0;
}

2836
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2837
{
2838
	seq_puts(seq, "0\n");
2839 2840 2841
	return 0;
}

2842
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
2843
{
2844 2845 2846
	struct cgroup_subsys *ss;
	bool printed = false;
	int ssid;
2847

2848
	do_each_subsys_mask(ss, ssid, ss_mask) {
2849 2850 2851 2852
		if (printed)
			seq_putc(seq, ' ');
		seq_printf(seq, "%s", ss->name);
		printed = true;
2853
	} while_each_subsys_mask();
2854 2855
	if (printed)
		seq_putc(seq, '\n');
2856 2857
}

2858 2859
/* show controllers which are enabled from the parent */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
2860
{
2861 2862
	struct cgroup *cgrp = seq_css(seq)->cgroup;

2863
	cgroup_print_ss_mask(seq, cgroup_control(cgrp));
2864
	return 0;
2865 2866
}

2867 2868
/* show controllers which are enabled for a given cgroup's children */
static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2869
{
2870 2871
	struct cgroup *cgrp = seq_css(seq)->cgroup;

2872
	cgroup_print_ss_mask(seq, cgrp->subtree_control);
2873 2874 2875 2876 2877 2878 2879
	return 0;
}

/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
2880
 * @cgrp's subtree_ss_mask has changed and its subtree's (self excluded)
2881 2882 2883 2884 2885 2886 2887
 * css associations need to be updated accordingly.  This function looks up
 * all css_sets which are attached to the subtree, creates the matching
 * updated css_sets and migrates the tasks to the new ones.
 */
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
	LIST_HEAD(preloaded_csets);
2888
	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
2889 2890 2891 2892 2893 2894
	struct cgroup_subsys_state *css;
	struct css_set *src_cset;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

T
Tejun Heo 已提交
2895 2896
	percpu_down_write(&cgroup_threadgroup_rwsem);

2897
	/* look up all csses currently attached to @cgrp's subtree */
2898
	spin_lock_bh(&css_set_lock);
2899 2900 2901
	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
		struct cgrp_cset_link *link;

2902
		/* self is not affected by subtree_ss_mask change */
2903 2904 2905 2906 2907 2908 2909
		if (css->cgroup == cgrp)
			continue;

		list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
			cgroup_migrate_add_src(link->cset, cgrp,
					       &preloaded_csets);
	}
2910
	spin_unlock_bh(&css_set_lock);
2911 2912 2913 2914 2915 2916

	/* NULL dst indicates self on default hierarchy */
	ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
	if (ret)
		goto out_finish;

2917
	spin_lock_bh(&css_set_lock);
2918
	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2919
		struct task_struct *task, *ntask;
2920 2921 2922 2923 2924

		/* src_csets precede dst_csets, break on the first dst_cset */
		if (!src_cset->mg_src_cgrp)
			break;

2925 2926 2927
		/* all tasks in src_csets need to be migrated */
		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
			cgroup_taskset_add(task, &tset);
2928
	}
2929
	spin_unlock_bh(&css_set_lock);
2930

2931
	ret = cgroup_taskset_migrate(&tset, cgrp);
2932 2933
out_finish:
	cgroup_migrate_finish(&preloaded_csets);
T
Tejun Heo 已提交
2934
	percpu_up_write(&cgroup_threadgroup_rwsem);
2935 2936 2937
	return ret;
}

2938
/**
2939
 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
2940
 * @cgrp: root of the target subtree
2941 2942
 *
 * Because css offlining is asynchronous, userland may try to re-enable a
2943 2944
 * controller while the previous css is still around.  This function grabs
 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
2945
 */
2946 2947
static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
	__acquires(&cgroup_mutex)
2948 2949
{
	struct cgroup *dsct;
2950
	struct cgroup_subsys_state *d_css;
2951 2952 2953
	struct cgroup_subsys *ss;
	int ssid;

2954 2955
restart:
	mutex_lock(&cgroup_mutex);
2956

2957
	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
2958 2959 2960 2961
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
			DEFINE_WAIT(wait);

2962
			if (!css || !percpu_ref_is_dying(&css->refcnt))
2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973
				continue;

			cgroup_get(dsct);
			prepare_to_wait(&dsct->offline_waitq, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&cgroup_mutex);
			schedule();
			finish_wait(&dsct->offline_waitq, &wait);

			cgroup_put(dsct);
2974
			goto restart;
2975 2976 2977 2978
		}
	}
}

2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011
/**
 * cgroup_save_control - save control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * Save ->subtree_control and ->subtree_ss_mask to the respective old_
 * prefixed fields for @cgrp's subtree including @cgrp itself.
 */
static void cgroup_save_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		dsct->old_subtree_control = dsct->subtree_control;
		dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
	}
}

/**
 * cgroup_propagate_control - refresh control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
 * ->subtree_control and propagate controller availability through the
 * subtree so that descendants don't have unavailable controllers enabled.
 */
static void cgroup_propagate_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		dsct->subtree_control &= cgroup_control(dsct);
3012 3013 3014
		dsct->subtree_ss_mask =
			cgroup_calc_subtree_ss_mask(dsct->subtree_control,
						    cgroup_ss_mask(dsct));
3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035
	}
}

/**
 * cgroup_restore_control - restore control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
 * prefixed fields for @cgrp's subtree including @cgrp itself.
 */
static void cgroup_restore_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		dsct->subtree_control = dsct->old_subtree_control;
		dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
	}
}

3036 3037
/**
 * cgroup_apply_control_enable - enable or show csses according to control
3038
 * @cgrp: root of the target subtree
3039
 *
3040
 * Walk @cgrp's subtree and create new csses or make the existing ones
3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051
 * visible.  A css is created invisible if it's being implicitly enabled
 * through dependency.  An invisible css is made visible when the userland
 * explicitly enables it.
 *
 * Returns 0 on success, -errno on failure.  On failure, csses which have
 * been processed already aren't cleaned up.  The caller is responsible for
 * cleaning up with cgroup_apply_control_disble().
 */
static int cgroup_apply_control_enable(struct cgroup *cgrp)
{
	struct cgroup *dsct;
3052
	struct cgroup_subsys_state *d_css;
3053 3054 3055
	struct cgroup_subsys *ss;
	int ssid, ret;

3056
	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3057 3058 3059
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

3060 3061
			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));

3062 3063 3064 3065 3066 3067 3068 3069 3070 3071
			if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
				continue;

			if (!css) {
				css = css_create(dsct, ss);
				if (IS_ERR(css))
					return PTR_ERR(css);
			}

			if (cgroup_control(dsct) & (1 << ss->id)) {
3072
				ret = css_populate_dir(css);
3073 3074 3075 3076 3077 3078 3079 3080 3081
				if (ret)
					return ret;
			}
		}
	}

	return 0;
}

3082 3083
/**
 * cgroup_apply_control_disable - kill or hide csses according to control
3084
 * @cgrp: root of the target subtree
3085
 *
3086
 * Walk @cgrp's subtree and kill and hide csses so that they match
3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097
 * cgroup_ss_mask() and cgroup_visible_mask().
 *
 * A css is hidden when the userland requests it to be disabled while other
 * subsystems are still depending on it.  The css must not actively control
 * resources and be in the vanilla state if it's made visible again later.
 * Controllers which may be depended upon should provide ->css_reset() for
 * this purpose.
 */
static void cgroup_apply_control_disable(struct cgroup *cgrp)
{
	struct cgroup *dsct;
3098
	struct cgroup_subsys_state *d_css;
3099 3100 3101
	struct cgroup_subsys *ss;
	int ssid;

3102
	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3103 3104 3105
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

3106 3107
			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));

3108 3109 3110
			if (!css)
				continue;

3111 3112
			if (css->parent &&
			    !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
3113 3114
				kill_css(css);
			} else if (!(cgroup_control(dsct) & (1 << ss->id))) {
3115
				css_clear_dir(css);
3116 3117 3118 3119 3120 3121 3122
				if (ss->css_reset)
					ss->css_reset(css);
			}
		}
	}
}

3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178
/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * subsystems can be enabled and disabled in a subtree using the following
 * steps.
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This function implements step 3 and propagates the mask changes
 * throughout @cgrp's subtree, updates csses accordingly and perform
 * process migrations.
 */
static int cgroup_apply_control(struct cgroup *cgrp)
{
	int ret;

	cgroup_propagate_control(cgrp);

	ret = cgroup_apply_control_enable(cgrp);
	if (ret)
		return ret;

	/*
	 * At this point, cgroup_e_css() results reflect the new csses
	 * making the following cgroup_update_dfl_csses() properly update
	 * css associations of all tasks in the subtree.
	 */
	ret = cgroup_update_dfl_csses(cgrp);
	if (ret)
		return ret;

	return 0;
}

/**
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * Finalize control mask update.  See cgroup_apply_control() for more info.
 */
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
	if (ret) {
		cgroup_restore_control(cgrp);
		cgroup_propagate_control(cgrp);
	}

	cgroup_apply_control_disable(cgrp);
}

3179
/* change the enabled child controllers for a cgroup in the default hierarchy */
3180 3181 3182
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
3183
{
3184
	u16 enable = 0, disable = 0;
3185
	struct cgroup *cgrp, *child;
3186
	struct cgroup_subsys *ss;
3187
	char *tok;
3188 3189 3190
	int ssid, ret;

	/*
3191 3192
	 * Parse input - space separated list of subsystem names prefixed
	 * with either + or -.
3193
	 */
3194 3195
	buf = strstrip(buf);
	while ((tok = strsep(&buf, " "))) {
3196 3197
		if (tok[0] == '\0')
			continue;
T
Tejun Heo 已提交
3198
		do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
3199 3200
			if (!cgroup_ssid_enabled(ssid) ||
			    strcmp(tok + 1, ss->name))
3201 3202 3203
				continue;

			if (*tok == '+') {
3204 3205
				enable |= 1 << ssid;
				disable &= ~(1 << ssid);
3206
			} else if (*tok == '-') {
3207 3208
				disable |= 1 << ssid;
				enable &= ~(1 << ssid);
3209 3210 3211 3212
			} else {
				return -EINVAL;
			}
			break;
3213
		} while_each_subsys_mask();
3214 3215 3216 3217
		if (ssid == CGROUP_SUBSYS_COUNT)
			return -EINVAL;
	}

3218
	cgrp = cgroup_kn_lock_live(of->kn, true);
3219 3220
	if (!cgrp)
		return -ENODEV;
3221 3222 3223

	for_each_subsys(ss, ssid) {
		if (enable & (1 << ssid)) {
3224
			if (cgrp->subtree_control & (1 << ssid)) {
3225 3226 3227 3228
				enable &= ~(1 << ssid);
				continue;
			}

3229
			if (!(cgroup_control(cgrp) & (1 << ssid))) {
3230 3231 3232
				ret = -ENOENT;
				goto out_unlock;
			}
3233
		} else if (disable & (1 << ssid)) {
3234
			if (!(cgrp->subtree_control & (1 << ssid))) {
3235 3236 3237 3238 3239 3240
				disable &= ~(1 << ssid);
				continue;
			}

			/* a child has it enabled? */
			cgroup_for_each_live_child(child, cgrp) {
3241
				if (child->subtree_control & (1 << ssid)) {
3242
					ret = -EBUSY;
3243
					goto out_unlock;
3244 3245 3246 3247 3248 3249 3250
				}
			}
		}
	}

	if (!enable && !disable) {
		ret = 0;
3251
		goto out_unlock;
3252 3253 3254
	}

	/*
3255
	 * Except for the root, subtree_control must be zero for a cgroup
3256 3257
	 * with tasks so that child cgroups don't compete against tasks.
	 */
T
Tejun Heo 已提交
3258
	if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
3259 3260 3261 3262
		ret = -EBUSY;
		goto out_unlock;
	}

3263 3264
	/* save and update control masks and prepare csses */
	cgroup_save_control(cgrp);
3265

3266 3267
	cgrp->subtree_control |= enable;
	cgrp->subtree_control &= ~disable;
3268

3269
	ret = cgroup_apply_control(cgrp);
3270

3271
	cgroup_finalize_control(cgrp, ret);
3272 3273 3274 3275

	kernfs_activate(cgrp->kn);
	ret = 0;
out_unlock:
3276
	cgroup_kn_unlock(of->kn);
3277
	return ret ?: nbytes;
3278 3279
}

3280
static int cgroup_events_show(struct seq_file *seq, void *v)
3281
{
3282
	seq_printf(seq, "populated %d\n",
3283
		   cgroup_is_populated(seq_css(seq)->cgroup));
3284 3285 3286
	return 0;
}

T
Tejun Heo 已提交
3287 3288
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
3289
{
T
Tejun Heo 已提交
3290 3291 3292
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of->kn->priv;
	struct cgroup_subsys_state *css;
3293
	int ret;
3294

T
Tejun Heo 已提交
3295 3296 3297
	if (cft->write)
		return cft->write(of, buf, nbytes, off);

T
Tejun Heo 已提交
3298 3299 3300 3301 3302 3303 3304 3305 3306
	/*
	 * kernfs guarantees that a file isn't deleted with operations in
	 * flight, which means that the matching css is and stays alive and
	 * doesn't need to be pinned.  The RCU locking is not necessary
	 * either.  It's just for the convenience of using cgroup_css().
	 */
	rcu_read_lock();
	css = cgroup_css(cgrp, cft->ss);
	rcu_read_unlock();
3307

3308
	if (cft->write_u64) {
3309 3310 3311 3312 3313 3314 3315 3316 3317
		unsigned long long v;
		ret = kstrtoull(buf, 0, &v);
		if (!ret)
			ret = cft->write_u64(css, cft, v);
	} else if (cft->write_s64) {
		long long v;
		ret = kstrtoll(buf, 0, &v);
		if (!ret)
			ret = cft->write_s64(css, cft, v);
3318
	} else {
3319
		ret = -EINVAL;
3320
	}
T
Tejun Heo 已提交
3321

3322
	return ret ?: nbytes;
3323 3324
}

3325
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
3326
{
T
Tejun Heo 已提交
3327
	return seq_cft(seq)->seq_start(seq, ppos);
3328 3329
}

3330
static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
3331
{
T
Tejun Heo 已提交
3332
	return seq_cft(seq)->seq_next(seq, v, ppos);
3333 3334
}

3335
static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
3336
{
T
Tejun Heo 已提交
3337
	seq_cft(seq)->seq_stop(seq, v);
3338 3339
}

3340
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
3341
{
3342 3343
	struct cftype *cft = seq_cft(m);
	struct cgroup_subsys_state *css = seq_css(m);
3344

3345 3346
	if (cft->seq_show)
		return cft->seq_show(m, arg);
3347

3348
	if (cft->read_u64)
3349 3350 3351 3352 3353 3354
		seq_printf(m, "%llu\n", cft->read_u64(css, cft));
	else if (cft->read_s64)
		seq_printf(m, "%lld\n", cft->read_s64(css, cft));
	else
		return -EINVAL;
	return 0;
3355 3356
}

T
Tejun Heo 已提交
3357 3358 3359 3360
static struct kernfs_ops cgroup_kf_single_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.write			= cgroup_file_write,
	.seq_show		= cgroup_seqfile_show,
3361 3362
};

T
Tejun Heo 已提交
3363 3364 3365 3366 3367 3368 3369 3370
static struct kernfs_ops cgroup_kf_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.write			= cgroup_file_write,
	.seq_start		= cgroup_seqfile_start,
	.seq_next		= cgroup_seqfile_next,
	.seq_stop		= cgroup_seqfile_stop,
	.seq_show		= cgroup_seqfile_show,
};
3371 3372 3373 3374

/*
 * cgroup_rename - Only allow simple rename of directories in place.
 */
T
Tejun Heo 已提交
3375 3376
static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
			 const char *new_name_str)
3377
{
T
Tejun Heo 已提交
3378
	struct cgroup *cgrp = kn->priv;
3379 3380
	int ret;

T
Tejun Heo 已提交
3381
	if (kernfs_type(kn) != KERNFS_DIR)
3382
		return -ENOTDIR;
T
Tejun Heo 已提交
3383
	if (kn->parent != new_parent)
3384
		return -EIO;
3385

3386 3387
	/*
	 * This isn't a proper migration and its usefulness is very
3388
	 * limited.  Disallow on the default hierarchy.
3389
	 */
3390
	if (cgroup_on_dfl(cgrp))
3391
		return -EPERM;
L
Li Zefan 已提交
3392

3393
	/*
T
Tejun Heo 已提交
3394
	 * We're gonna grab cgroup_mutex which nests outside kernfs
3395
	 * active_ref.  kernfs_rename() doesn't require active_ref
T
Tejun Heo 已提交
3396
	 * protection.  Break them before grabbing cgroup_mutex.
3397 3398 3399
	 */
	kernfs_break_active_protection(new_parent);
	kernfs_break_active_protection(kn);
L
Li Zefan 已提交
3400

T
Tejun Heo 已提交
3401
	mutex_lock(&cgroup_mutex);
L
Li Zefan 已提交
3402

T
Tejun Heo 已提交
3403
	ret = kernfs_rename(kn, new_parent, new_name_str);
L
Li Zefan 已提交
3404

T
Tejun Heo 已提交
3405
	mutex_unlock(&cgroup_mutex);
3406 3407 3408

	kernfs_unbreak_active_protection(kn);
	kernfs_unbreak_active_protection(new_parent);
T
Tejun Heo 已提交
3409
	return ret;
L
Li Zefan 已提交
3410 3411
}

3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425
/* set uid and gid of cgroup dirs and files to that of the creator */
static int cgroup_kn_set_ugid(struct kernfs_node *kn)
{
	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
			       .ia_uid = current_fsuid(),
			       .ia_gid = current_fsgid(), };

	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
		return 0;

	return kernfs_setattr(kn, &iattr);
}

3426 3427
static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
			   struct cftype *cft)
3428
{
T
Tejun Heo 已提交
3429
	char name[CGROUP_FILE_NAME_MAX];
T
Tejun Heo 已提交
3430 3431
	struct kernfs_node *kn;
	struct lock_class_key *key = NULL;
3432
	int ret;
T
Tejun Heo 已提交
3433

T
Tejun Heo 已提交
3434 3435 3436 3437 3438
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	key = &cft->lockdep_key;
#endif
	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
T
Tejun Heo 已提交
3439
				  NULL, key);
3440 3441 3442 3443
	if (IS_ERR(kn))
		return PTR_ERR(kn);

	ret = cgroup_kn_set_ugid(kn);
3444
	if (ret) {
3445
		kernfs_remove(kn);
3446 3447 3448
		return ret;
	}

3449 3450 3451
	if (cft->file_offset) {
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

3452
		spin_lock_irq(&cgroup_file_kn_lock);
3453
		cfile->kn = kn;
3454
		spin_unlock_irq(&cgroup_file_kn_lock);
3455 3456
	}

3457
	return 0;
3458 3459
}

3460 3461
/**
 * cgroup_addrm_files - add or remove files to a cgroup directory
3462 3463
 * @css: the target css
 * @cgrp: the target cgroup (usually css->cgroup)
3464 3465 3466 3467
 * @cfts: array of cftypes to be added
 * @is_add: whether to add or remove
 *
 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
3468
 * For removals, this function never fails.
3469
 */
3470 3471
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
3472
			      bool is_add)
3473
{
3474
	struct cftype *cft, *cft_end = NULL;
3475
	int ret = 0;
3476

3477
	lockdep_assert_held(&cgroup_mutex);
T
Tejun Heo 已提交
3478

3479 3480
restart:
	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
3481
		/* does cft->flags tell us to skip this file on @cgrp? */
3482
		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
T
Tejun Heo 已提交
3483
			continue;
3484
		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
3485
			continue;
T
Tejun Heo 已提交
3486
		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
3487
			continue;
T
Tejun Heo 已提交
3488
		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
3489 3490
			continue;

3491
		if (is_add) {
3492
			ret = cgroup_add_file(css, cgrp, cft);
3493
			if (ret) {
3494 3495
				pr_warn("%s: failed to add %s, err=%d\n",
					__func__, cft->name, ret);
3496 3497 3498
				cft_end = cft;
				is_add = false;
				goto restart;
3499
			}
3500 3501
		} else {
			cgroup_rm_file(cgrp, cft);
T
Tejun Heo 已提交
3502
		}
3503
	}
3504
	return ret;
3505 3506
}

3507
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3508 3509
{
	LIST_HEAD(pending);
3510
	struct cgroup_subsys *ss = cfts[0].ss;
3511
	struct cgroup *root = &ss->root->cgrp;
3512
	struct cgroup_subsys_state *css;
3513
	int ret = 0;
3514

3515
	lockdep_assert_held(&cgroup_mutex);
3516 3517

	/* add/rm files for all cgroups created before */
3518
	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
3519 3520
		struct cgroup *cgrp = css->cgroup;

3521
		if (!(css->flags & CSS_VISIBLE))
3522 3523
			continue;

3524
		ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
3525 3526
		if (ret)
			break;
3527
	}
3528 3529 3530

	if (is_add && !ret)
		kernfs_activate(root->kn);
3531
	return ret;
3532 3533
}

3534
static void cgroup_exit_cftypes(struct cftype *cfts)
3535
{
3536
	struct cftype *cft;
3537

T
Tejun Heo 已提交
3538 3539 3540 3541 3542
	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		/* free copy for custom atomic_write_len, see init_cftypes() */
		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
			kfree(cft->kf_ops);
		cft->kf_ops = NULL;
3543
		cft->ss = NULL;
3544 3545

		/* revert flags set by cgroup core while adding @cfts */
3546
		cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
T
Tejun Heo 已提交
3547
	}
3548 3549
}

T
Tejun Heo 已提交
3550
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3551 3552 3553
{
	struct cftype *cft;

T
Tejun Heo 已提交
3554 3555 3556
	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		struct kernfs_ops *kf_ops;

T
Tejun Heo 已提交
3557 3558
		WARN_ON(cft->ss || cft->kf_ops);

T
Tejun Heo 已提交
3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575
		if (cft->seq_start)
			kf_ops = &cgroup_kf_ops;
		else
			kf_ops = &cgroup_kf_single_ops;

		/*
		 * Ugh... if @cft wants a custom max_write_len, we need to
		 * make a copy of kf_ops to set its atomic_write_len.
		 */
		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
			kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
			if (!kf_ops) {
				cgroup_exit_cftypes(cfts);
				return -ENOMEM;
			}
			kf_ops->atomic_write_len = cft->max_write_len;
		}
3576

T
Tejun Heo 已提交
3577
		cft->kf_ops = kf_ops;
3578
		cft->ss = ss;
T
Tejun Heo 已提交
3579
	}
3580

T
Tejun Heo 已提交
3581
	return 0;
3582 3583
}

3584 3585
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
3586
	lockdep_assert_held(&cgroup_mutex);
3587 3588 3589 3590 3591 3592 3593 3594

	if (!cfts || !cfts[0].ss)
		return -ENOENT;

	list_del(&cfts->node);
	cgroup_apply_cftypes(cfts, false);
	cgroup_exit_cftypes(cfts);
	return 0;
3595 3596
}

3597 3598 3599 3600
/**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
3601 3602 3603
 * Unregister @cfts.  Files described by @cfts are removed from all
 * existing cgroups and all future cgroups won't have them either.  This
 * function can be called anytime whether @cfts' subsys is attached or not.
3604 3605
 *
 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
3606
 * registered.
3607
 */
3608
int cgroup_rm_cftypes(struct cftype *cfts)
3609
{
3610
	int ret;
3611

3612
	mutex_lock(&cgroup_mutex);
3613
	ret = cgroup_rm_cftypes_locked(cfts);
3614
	mutex_unlock(&cgroup_mutex);
3615
	return ret;
T
Tejun Heo 已提交
3616 3617
}

3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631
/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts to @ss.  Files described by @cfts are created for all
 * existing cgroups to which @ss is attached and all future cgroups will
 * have them too.  This function can be called anytime whether @ss is
 * attached or not.
 *
 * Returns 0 on successful registration, -errno on failure.  Note that this
 * function currently returns 0 as long as @cfts registration is successful
 * even if some file creation attempts on existing cgroups fail.
 */
3632
static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3633
{
3634
	int ret;
3635

3636
	if (!cgroup_ssid_enabled(ss->id))
3637 3638
		return 0;

3639 3640
	if (!cfts || cfts[0].name[0] == '\0')
		return 0;
3641

T
Tejun Heo 已提交
3642 3643 3644
	ret = cgroup_init_cftypes(ss, cfts);
	if (ret)
		return ret;
3645

3646
	mutex_lock(&cgroup_mutex);
3647

T
Tejun Heo 已提交
3648
	list_add_tail(&cfts->node, &ss->cfts);
3649
	ret = cgroup_apply_cftypes(cfts, true);
3650
	if (ret)
3651
		cgroup_rm_cftypes_locked(cfts);
3652

3653
	mutex_unlock(&cgroup_mutex);
3654
	return ret;
3655 3656
}

3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669
/**
 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Similar to cgroup_add_cftypes() but the added files are only used for
 * the default hierarchy.
 */
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3670
		cft->flags |= __CFTYPE_ONLY_ON_DFL;
3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681
	return cgroup_add_cftypes(ss, cfts);
}

/**
 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Similar to cgroup_add_cftypes() but the added files are only used for
 * the legacy hierarchies.
 */
3682 3683
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
3684 3685
	struct cftype *cft;

3686 3687
	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
		cft->flags |= __CFTYPE_NOT_ON_DFL;
3688 3689 3690
	return cgroup_add_cftypes(ss, cfts);
}

3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706
/**
 * cgroup_file_notify - generate a file modified event for a cgroup_file
 * @cfile: target cgroup_file
 *
 * @cfile must have been obtained by setting cftype->file_offset.
 */
void cgroup_file_notify(struct cgroup_file *cfile)
{
	unsigned long flags;

	spin_lock_irqsave(&cgroup_file_kn_lock, flags);
	if (cfile->kn)
		kernfs_notify(cfile->kn);
	spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}

L
Li Zefan 已提交
3707 3708 3709 3710 3711 3712
/**
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 *
 * Return the number of tasks in the cgroup.
 */
3713
static int cgroup_task_count(const struct cgroup *cgrp)
3714 3715
{
	int count = 0;
3716
	struct cgrp_cset_link *link;
3717

3718
	spin_lock_bh(&css_set_lock);
3719 3720
	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		count += atomic_read(&link->cset->refcount);
3721
	spin_unlock_bh(&css_set_lock);
3722 3723 3724
	return count;
}

3725
/**
3726
 * css_next_child - find the next child of a given css
3727 3728
 * @pos: the current position (%NULL to initiate traversal)
 * @parent: css whose children to walk
3729
 *
3730
 * This function returns the next child of @parent and should be called
3731
 * under either cgroup_mutex or RCU read lock.  The only requirement is
3732 3733 3734 3735 3736 3737 3738 3739 3740
 * that @parent and @pos are accessible.  The next sibling is guaranteed to
 * be returned regardless of their states.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
3741
 */
3742 3743
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
					   struct cgroup_subsys_state *parent)
3744
{
3745
	struct cgroup_subsys_state *next;
3746

T
Tejun Heo 已提交
3747
	cgroup_assert_mutex_or_rcu_locked();
3748 3749

	/*
3750 3751 3752 3753 3754 3755 3756 3757 3758 3759
	 * @pos could already have been unlinked from the sibling list.
	 * Once a cgroup is removed, its ->sibling.next is no longer
	 * updated when its next sibling changes.  CSS_RELEASED is set when
	 * @pos is taken off list, at which time its next pointer is valid,
	 * and, as releases are serialized, the one pointed to by the next
	 * pointer is guaranteed to not have started release yet.  This
	 * implies that if we observe !CSS_RELEASED on @pos in this RCU
	 * critical section, the one pointed to by its next pointer is
	 * guaranteed to not have finished its RCU grace period even if we
	 * have dropped rcu_read_lock() inbetween iterations.
3760
	 *
3761 3762 3763 3764 3765 3766 3767
	 * If @pos has CSS_RELEASED set, its next pointer can't be
	 * dereferenced; however, as each css is given a monotonically
	 * increasing unique serial number and always appended to the
	 * sibling list, the next one can be found by walking the parent's
	 * children until the first css with higher serial number than
	 * @pos's.  While this path can be slower, it happens iff iteration
	 * races against release and the race window is very small.
3768
	 */
3769
	if (!pos) {
3770 3771 3772
		next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
	} else if (likely(!(pos->flags & CSS_RELEASED))) {
		next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
3773
	} else {
3774
		list_for_each_entry_rcu(next, &parent->children, sibling)
3775 3776
			if (next->serial_nr > pos->serial_nr)
				break;
3777 3778
	}

3779 3780
	/*
	 * @next, if not pointing to the head, can be dereferenced and is
3781
	 * the next sibling.
3782
	 */
3783 3784
	if (&next->sibling != &parent->children)
		return next;
3785
	return NULL;
3786 3787
}

3788
/**
3789
 * css_next_descendant_pre - find the next descendant for pre-order walk
3790
 * @pos: the current position (%NULL to initiate traversal)
3791
 * @root: css whose descendants to walk
3792
 *
3793
 * To be used by css_for_each_descendant_pre().  Find the next descendant
3794 3795
 * to visit for pre-order traversal of @root's descendants.  @root is
 * included in the iteration and the first node to be visited.
3796
 *
3797 3798 3799 3800
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of @root.
3801 3802 3803 3804 3805 3806 3807
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
3808
 */
3809 3810 3811
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
			struct cgroup_subsys_state *root)
3812
{
3813
	struct cgroup_subsys_state *next;
3814

T
Tejun Heo 已提交
3815
	cgroup_assert_mutex_or_rcu_locked();
3816

3817
	/* if first iteration, visit @root */
3818
	if (!pos)
3819
		return root;
3820 3821

	/* visit the first child if exists */
3822
	next = css_next_child(NULL, pos);
3823 3824 3825 3826
	if (next)
		return next;

	/* no child, visit my or the closest ancestor's next sibling */
3827
	while (pos != root) {
T
Tejun Heo 已提交
3828
		next = css_next_child(pos, pos->parent);
3829
		if (next)
3830
			return next;
T
Tejun Heo 已提交
3831
		pos = pos->parent;
3832
	}
3833 3834 3835 3836

	return NULL;
}

3837
/**
3838 3839
 * css_rightmost_descendant - return the rightmost descendant of a css
 * @pos: css of interest
3840
 *
3841 3842
 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
 * is returned.  This can be used during pre-order traversal to skip
3843
 * subtree of @pos.
3844
 *
3845 3846 3847 3848
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct rightmost descendant as
 * long as @pos is accessible.
3849
 */
3850 3851
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
3852
{
3853
	struct cgroup_subsys_state *last, *tmp;
3854

T
Tejun Heo 已提交
3855
	cgroup_assert_mutex_or_rcu_locked();
3856 3857 3858 3859 3860

	do {
		last = pos;
		/* ->prev isn't RCU safe, walk ->next till the end */
		pos = NULL;
3861
		css_for_each_child(tmp, last)
3862 3863 3864 3865 3866 3867
			pos = tmp;
	} while (pos);

	return last;
}

3868 3869
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
3870
{
3871
	struct cgroup_subsys_state *last;
3872 3873 3874

	do {
		last = pos;
3875
		pos = css_next_child(NULL, pos);
3876 3877 3878 3879 3880 3881
	} while (pos);

	return last;
}

/**
3882
 * css_next_descendant_post - find the next descendant for post-order walk
3883
 * @pos: the current position (%NULL to initiate traversal)
3884
 * @root: css whose descendants to walk
3885
 *
3886
 * To be used by css_for_each_descendant_post().  Find the next descendant
3887 3888
 * to visit for post-order traversal of @root's descendants.  @root is
 * included in the iteration and the last node to be visited.
3889
 *
3890 3891 3892 3893 3894
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @cgroup are accessible and @pos is a descendant of
 * @cgroup.
3895 3896 3897 3898 3899 3900 3901
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
3902
 */
3903 3904 3905
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
			 struct cgroup_subsys_state *root)
3906
{
3907
	struct cgroup_subsys_state *next;
3908

T
Tejun Heo 已提交
3909
	cgroup_assert_mutex_or_rcu_locked();
3910

3911 3912 3913
	/* if first iteration, visit leftmost descendant which may be @root */
	if (!pos)
		return css_leftmost_descendant(root);
3914

3915 3916 3917 3918
	/* if we visited @root, we're done */
	if (pos == root)
		return NULL;

3919
	/* if there's an unvisited sibling, visit its leftmost descendant */
T
Tejun Heo 已提交
3920
	next = css_next_child(pos, pos->parent);
3921
	if (next)
3922
		return css_leftmost_descendant(next);
3923 3924

	/* no sibling left, visit parent */
T
Tejun Heo 已提交
3925
	return pos->parent;
3926 3927
}

3928 3929 3930 3931 3932 3933 3934 3935 3936
/**
 * css_has_online_children - does a css have online children
 * @css: the target css
 *
 * Returns %true if @css has any online children; otherwise, %false.  This
 * function can be called from any context but the caller is responsible
 * for synchronizing against on/offlining as necessary.
 */
bool css_has_online_children(struct cgroup_subsys_state *css)
3937
{
3938 3939
	struct cgroup_subsys_state *child;
	bool ret = false;
3940 3941

	rcu_read_lock();
3942
	css_for_each_child(child, css) {
3943
		if (child->flags & CSS_ONLINE) {
3944 3945
			ret = true;
			break;
3946 3947 3948
		}
	}
	rcu_read_unlock();
3949
	return ret;
3950 3951
}

3952
/**
3953
 * css_task_iter_advance_css_set - advance a task itererator to the next css_set
3954 3955 3956
 * @it: the iterator to advance
 *
 * Advance @it to the next css_set to walk.
3957
 */
3958
static void css_task_iter_advance_css_set(struct css_task_iter *it)
3959
{
T
Tejun Heo 已提交
3960
	struct list_head *l = it->cset_pos;
3961 3962 3963
	struct cgrp_cset_link *link;
	struct css_set *cset;

3964
	lockdep_assert_held(&css_set_lock);
3965

3966 3967 3968
	/* Advance to the next non-empty css_set */
	do {
		l = l->next;
T
Tejun Heo 已提交
3969 3970
		if (l == it->cset_head) {
			it->cset_pos = NULL;
3971
			it->task_pos = NULL;
3972 3973
			return;
		}
3974 3975 3976 3977 3978 3979 3980 3981

		if (it->ss) {
			cset = container_of(l, struct css_set,
					    e_cset_node[it->ss->id]);
		} else {
			link = list_entry(l, struct cgrp_cset_link, cset_link);
			cset = link->cset;
		}
3982
	} while (!css_set_populated(cset));
T
Tejun Heo 已提交
3983

T
Tejun Heo 已提交
3984
	it->cset_pos = l;
T
Tejun Heo 已提交
3985 3986

	if (!list_empty(&cset->tasks))
T
Tejun Heo 已提交
3987
		it->task_pos = cset->tasks.next;
T
Tejun Heo 已提交
3988
	else
T
Tejun Heo 已提交
3989 3990 3991 3992
		it->task_pos = cset->mg_tasks.next;

	it->tasks_head = &cset->tasks;
	it->mg_tasks_head = &cset->mg_tasks;
3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015

	/*
	 * We don't keep css_sets locked across iteration steps and thus
	 * need to take steps to ensure that iteration can be resumed after
	 * the lock is re-acquired.  Iteration is performed at two levels -
	 * css_sets and tasks in them.
	 *
	 * Once created, a css_set never leaves its cgroup lists, so a
	 * pinned css_set is guaranteed to stay put and we can resume
	 * iteration afterwards.
	 *
	 * Tasks may leave @cset across iteration steps.  This is resolved
	 * by registering each iterator with the css_set currently being
	 * walked and making css_set_move_task() advance iterators whose
	 * next task is leaving.
	 */
	if (it->cur_cset) {
		list_del(&it->iters_node);
		put_css_set_locked(it->cur_cset);
	}
	get_css_set(cset);
	it->cur_cset = cset;
	list_add(&it->iters_node, &cset->task_iters);
4016 4017
}

4018 4019 4020 4021
static void css_task_iter_advance(struct css_task_iter *it)
{
	struct list_head *l = it->task_pos;

4022
	lockdep_assert_held(&css_set_lock);
4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040
	WARN_ON_ONCE(!l);

	/*
	 * Advance iterator to find next entry.  cset->tasks is consumed
	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
	 * next cset.
	 */
	l = l->next;

	if (l == it->tasks_head)
		l = it->mg_tasks_head->next;

	if (l == it->mg_tasks_head)
		css_task_iter_advance_css_set(it);
	else
		it->task_pos = l;
}

4041
/**
4042 4043
 * css_task_iter_start - initiate task iteration
 * @css: the css to walk tasks of
4044 4045
 * @it: the task iterator to use
 *
4046 4047 4048 4049
 * Initiate iteration through the tasks of @css.  The caller can call
 * css_task_iter_next() to walk through the tasks until the function
 * returns NULL.  On completion of iteration, css_task_iter_end() must be
 * called.
4050
 */
4051 4052
void css_task_iter_start(struct cgroup_subsys_state *css,
			 struct css_task_iter *it)
4053
{
4054 4055
	/* no one should try to iterate before mounting cgroups */
	WARN_ON_ONCE(!use_task_css_set_links);
4056

4057 4058
	memset(it, 0, sizeof(*it));

4059
	spin_lock_bh(&css_set_lock);
4060

4061 4062 4063 4064 4065 4066 4067
	it->ss = css->ss;

	if (it->ss)
		it->cset_pos = &css->cgroup->e_csets[css->ss->id];
	else
		it->cset_pos = &css->cgroup->cset_links;

T
Tejun Heo 已提交
4068
	it->cset_head = it->cset_pos;
4069

4070
	css_task_iter_advance_css_set(it);
4071

4072
	spin_unlock_bh(&css_set_lock);
4073 4074
}

4075
/**
4076
 * css_task_iter_next - return the next task for the iterator
4077 4078 4079
 * @it: the task iterator being iterated
 *
 * The "next" function for task iteration.  @it should have been
4080 4081
 * initialized via css_task_iter_start().  Returns NULL when the iteration
 * reaches the end.
4082
 */
4083
struct task_struct *css_task_iter_next(struct css_task_iter *it)
4084
{
4085
	if (it->cur_task) {
4086
		put_task_struct(it->cur_task);
4087 4088
		it->cur_task = NULL;
	}
4089

4090
	spin_lock_bh(&css_set_lock);
4091

4092 4093 4094 4095 4096 4097
	if (it->task_pos) {
		it->cur_task = list_entry(it->task_pos, struct task_struct,
					  cg_list);
		get_task_struct(it->cur_task);
		css_task_iter_advance(it);
	}
4098

4099
	spin_unlock_bh(&css_set_lock);
4100 4101

	return it->cur_task;
4102 4103
}

4104
/**
4105
 * css_task_iter_end - finish task iteration
4106 4107
 * @it: the task iterator to finish
 *
4108
 * Finish task iteration started by css_task_iter_start().
4109
 */
4110
void css_task_iter_end(struct css_task_iter *it)
4111
{
4112
	if (it->cur_cset) {
4113
		spin_lock_bh(&css_set_lock);
4114 4115
		list_del(&it->iters_node);
		put_css_set_locked(it->cur_cset);
4116
		spin_unlock_bh(&css_set_lock);
4117 4118 4119 4120
	}

	if (it->cur_task)
		put_task_struct(it->cur_task);
4121 4122 4123
}

/**
4124 4125 4126
 * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
 * @to: cgroup to which the tasks will be moved
 * @from: cgroup in which the tasks currently reside
4127
 *
4128 4129 4130 4131 4132
 * Locking rules between cgroup_post_fork() and the migration path
 * guarantee that, if a task is forking while being migrated, the new child
 * is guaranteed to be either visible in the source cgroup after the
 * parent's migration is complete or put into the target cgroup.  No task
 * can slip out of migration through forking.
4133
 */
4134
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
4135
{
4136 4137
	LIST_HEAD(preloaded_csets);
	struct cgrp_cset_link *link;
4138
	struct css_task_iter it;
4139
	struct task_struct *task;
4140
	int ret;
4141

4142
	mutex_lock(&cgroup_mutex);
4143

4144
	/* all tasks in @from are being moved, all csets are source */
4145
	spin_lock_bh(&css_set_lock);
4146 4147
	list_for_each_entry(link, &from->cset_links, cset_link)
		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
4148
	spin_unlock_bh(&css_set_lock);
4149

4150 4151 4152
	ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
	if (ret)
		goto out_err;
4153

4154
	/*
R
Rami Rosen 已提交
4155
	 * Migrate tasks one-by-one until @from is empty.  This fails iff
4156 4157
	 * ->can_attach() fails.
	 */
4158
	do {
4159
		css_task_iter_start(&from->self, &it);
4160 4161 4162 4163 4164 4165
		task = css_task_iter_next(&it);
		if (task)
			get_task_struct(task);
		css_task_iter_end(&it);

		if (task) {
4166
			ret = cgroup_migrate(task, false, to);
4167 4168 4169
			put_task_struct(task);
		}
	} while (task && !ret);
4170 4171
out_err:
	cgroup_migrate_finish(&preloaded_csets);
T
Tejun Heo 已提交
4172
	mutex_unlock(&cgroup_mutex);
4173
	return ret;
4174 4175
}

4176
/*
4177
 * Stuff for reading the 'tasks'/'procs' files.
4178 4179 4180 4181 4182 4183 4184 4185
 *
 * Reading this file can return large amounts of data if a cgroup has
 * *lots* of attached tasks. So it may need several calls to read(),
 * but we cannot guarantee that the information we produce is correct
 * unless we produce it entirely atomically.
 *
 */

4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211
/* which pidlist file are we talking about? */
enum cgroup_filetype {
	CGROUP_FILE_PROCS,
	CGROUP_FILE_TASKS,
};

/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
	/*
	 * used to find which pidlist is wanted. doesn't change as long as
	 * this particular list stays in the list.
	*/
	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
	/* array of xids */
	pid_t *list;
	/* how many elements the above list has */
	int length;
	/* each of these stored in a list by its cgroup */
	struct list_head links;
	/* pointer to the cgroup we belong to, for list removal purposes */
	struct cgroup *owner;
4212 4213
	/* for delayed destruction */
	struct delayed_work destroy_dwork;
4214 4215
};

4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228
/*
 * The following two functions "fix" the issue where there are more pids
 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
 * TODO: replace with a kernel-wide solution to this problem
 */
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void *pidlist_allocate(int count)
{
	if (PIDLIST_TOO_LARGE(count))
		return vmalloc(count * sizeof(pid_t));
	else
		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
4229

4230 4231
static void pidlist_free(void *p)
{
4232
	kvfree(p);
4233 4234
}

4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261
/*
 * Used to destroy all pidlists lingering waiting for destroy timer.  None
 * should be left afterwards.
 */
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
{
	struct cgroup_pidlist *l, *tmp_l;

	mutex_lock(&cgrp->pidlist_mutex);
	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
	mutex_unlock(&cgrp->pidlist_mutex);

	flush_workqueue(cgroup_pidlist_destroy_wq);
	BUG_ON(!list_empty(&cgrp->pidlists));
}

static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
						destroy_dwork);
	struct cgroup_pidlist *tofree = NULL;

	mutex_lock(&l->owner->pidlist_mutex);

	/*
4262 4263
	 * Destroy iff we didn't get queued again.  The state won't change
	 * as destroy_dwork can only be queued while locked.
4264
	 */
4265
	if (!delayed_work_pending(dwork)) {
4266 4267 4268 4269 4270 4271 4272 4273 4274 4275
		list_del(&l->links);
		pidlist_free(l->list);
		put_pid_ns(l->key.ns);
		tofree = l;
	}

	mutex_unlock(&l->owner->pidlist_mutex);
	kfree(tofree);
}

4276
/*
4277
 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
4278
 * Returns the number of unique elements.
4279
 */
4280
static int pidlist_uniq(pid_t *list, int length)
4281
{
4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305
	int src, dest = 1;

	/*
	 * we presume the 0th element is unique, so i starts at 1. trivial
	 * edge cases first; no work needs to be done for either
	 */
	if (length == 0 || length == 1)
		return length;
	/* src and dest walk down the list; dest counts unique elements */
	for (src = 1; src < length; src++) {
		/* find next unique element */
		while (list[src] == list[src-1]) {
			src++;
			if (src == length)
				goto after;
		}
		/* dest always points to where the next unique element goes */
		list[dest] = list[src];
		dest++;
	}
after:
	return dest;
}

4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316
/*
 * The two pid files - task and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  As pid order is
 * different per namespace, each namespace needs differently sorted list,
 * making it impossible to use, for example, single rbtree of member tasks
 * sorted by task pointer.  As pidlists can be fairly large, allocating one
 * per open file is dangerous, so cgroup had to implement shared pool of
 * pidlists keyed by cgroup and namespace.
 *
 * All this extra complexity was caused by the original implementation
 * committing to an entirely unnecessary property.  In the long term, we
4317 4318 4319
 * want to do away with it.  Explicitly scramble sort order if on the
 * default hierarchy so that no such expectation exists in the new
 * interface.
4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333
 *
 * Scrambling is done by swapping every two consecutive bits, which is
 * non-identity one-to-one mapping which disturbs sort order sufficiently.
 */
static pid_t pid_fry(pid_t pid)
{
	unsigned a = pid & 0x55555555;
	unsigned b = pid & 0xAAAAAAAA;

	return (a << 1) | (b >> 1);
}

static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
4334
	if (cgroup_on_dfl(cgrp))
4335 4336 4337 4338 4339
		return pid_fry(pid);
	else
		return pid;
}

4340 4341 4342 4343 4344
static int cmppid(const void *a, const void *b)
{
	return *(pid_t *)a - *(pid_t *)b;
}

4345 4346 4347 4348 4349
static int fried_cmppid(const void *a, const void *b)
{
	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
}

T
Tejun Heo 已提交
4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
						  enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;
	/* don't need task_nsproxy() if we're looking at ourself */
	struct pid_namespace *ns = task_active_pid_ns(current);

	lockdep_assert_held(&cgrp->pidlist_mutex);

	list_for_each_entry(l, &cgrp->pidlists, links)
		if (l->key.type == type && l->key.ns == ns)
			return l;
	return NULL;
}

4365 4366 4367 4368 4369 4370
/*
 * find the appropriate pidlist for our purpose (given procs vs tasks)
 * returns with the lock on that pidlist already held, and takes care
 * of the use count, or returns NULL with no locks held if we're out of
 * memory.
 */
T
Tejun Heo 已提交
4371 4372
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
						enum cgroup_filetype type)
4373 4374
{
	struct cgroup_pidlist *l;
4375

T
Tejun Heo 已提交
4376 4377 4378 4379 4380 4381
	lockdep_assert_held(&cgrp->pidlist_mutex);

	l = cgroup_pidlist_find(cgrp, type);
	if (l)
		return l;

4382
	/* entry not found; create a new one */
4383
	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
T
Tejun Heo 已提交
4384
	if (!l)
4385
		return l;
T
Tejun Heo 已提交
4386

4387
	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
4388
	l->key.type = type;
T
Tejun Heo 已提交
4389 4390
	/* don't need task_nsproxy() if we're looking at ourself */
	l->key.ns = get_pid_ns(task_active_pid_ns(current));
4391 4392 4393 4394 4395
	l->owner = cgrp;
	list_add(&l->links, &cgrp->pidlists);
	return l;
}

4396 4397 4398
/*
 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
 */
4399 4400
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
			      struct cgroup_pidlist **lp)
4401 4402 4403 4404
{
	pid_t *array;
	int length;
	int pid, n = 0; /* used for populating the array */
4405
	struct css_task_iter it;
4406
	struct task_struct *tsk;
4407 4408
	struct cgroup_pidlist *l;

4409 4410
	lockdep_assert_held(&cgrp->pidlist_mutex);

4411 4412 4413 4414 4415 4416 4417
	/*
	 * If cgroup gets more users after we read count, we won't have
	 * enough space - tough.  This race is indistinguishable to the
	 * caller from the case that the additional cgroup users didn't
	 * show up until sometime later on.
	 */
	length = cgroup_task_count(cgrp);
4418
	array = pidlist_allocate(length);
4419 4420 4421
	if (!array)
		return -ENOMEM;
	/* now, populate the array */
4422
	css_task_iter_start(&cgrp->self, &it);
4423
	while ((tsk = css_task_iter_next(&it))) {
4424
		if (unlikely(n == length))
4425
			break;
4426
		/* get tgid or pid for procs or tasks file respectively */
4427 4428 4429 4430
		if (type == CGROUP_FILE_PROCS)
			pid = task_tgid_vnr(tsk);
		else
			pid = task_pid_vnr(tsk);
4431 4432
		if (pid > 0) /* make sure to only use valid results */
			array[n++] = pid;
4433
	}
4434
	css_task_iter_end(&it);
4435 4436
	length = n;
	/* now sort & (if procs) strip out duplicates */
4437
	if (cgroup_on_dfl(cgrp))
4438 4439 4440
		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
	else
		sort(array, length, sizeof(pid_t), cmppid, NULL);
4441
	if (type == CGROUP_FILE_PROCS)
4442
		length = pidlist_uniq(array, length);
T
Tejun Heo 已提交
4443 4444

	l = cgroup_pidlist_find_create(cgrp, type);
4445
	if (!l) {
4446
		pidlist_free(array);
4447
		return -ENOMEM;
4448
	}
T
Tejun Heo 已提交
4449 4450

	/* store array, freeing old if necessary */
4451
	pidlist_free(l->list);
4452 4453
	l->list = array;
	l->length = length;
4454
	*lp = l;
4455
	return 0;
4456 4457
}

B
Balbir Singh 已提交
4458
/**
L
Li Zefan 已提交
4459
 * cgroupstats_build - build and fill cgroupstats
B
Balbir Singh 已提交
4460 4461 4462
 * @stats: cgroupstats to fill information into
 * @dentry: A dentry entry belonging to the cgroup for which stats have
 * been requested.
L
Li Zefan 已提交
4463 4464 4465
 *
 * Build and fill cgroupstats so that taskstats can export it to user
 * space.
B
Balbir Singh 已提交
4466 4467 4468
 */
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
T
Tejun Heo 已提交
4469
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4470
	struct cgroup *cgrp;
4471
	struct css_task_iter it;
B
Balbir Singh 已提交
4472
	struct task_struct *tsk;
4473

T
Tejun Heo 已提交
4474 4475 4476 4477 4478
	/* it should be kernfs_node belonging to cgroupfs and is a directory */
	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
	    kernfs_type(kn) != KERNFS_DIR)
		return -EINVAL;

4479 4480
	mutex_lock(&cgroup_mutex);

B
Balbir Singh 已提交
4481
	/*
T
Tejun Heo 已提交
4482
	 * We aren't being called from kernfs and there's no guarantee on
4483
	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
T
Tejun Heo 已提交
4484
	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
B
Balbir Singh 已提交
4485
	 */
T
Tejun Heo 已提交
4486 4487
	rcu_read_lock();
	cgrp = rcu_dereference(kn->priv);
4488
	if (!cgrp || cgroup_is_dead(cgrp)) {
T
Tejun Heo 已提交
4489
		rcu_read_unlock();
4490
		mutex_unlock(&cgroup_mutex);
T
Tejun Heo 已提交
4491 4492
		return -ENOENT;
	}
4493
	rcu_read_unlock();
B
Balbir Singh 已提交
4494

4495
	css_task_iter_start(&cgrp->self, &it);
4496
	while ((tsk = css_task_iter_next(&it))) {
B
Balbir Singh 已提交
4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515
		switch (tsk->state) {
		case TASK_RUNNING:
			stats->nr_running++;
			break;
		case TASK_INTERRUPTIBLE:
			stats->nr_sleeping++;
			break;
		case TASK_UNINTERRUPTIBLE:
			stats->nr_uninterruptible++;
			break;
		case TASK_STOPPED:
			stats->nr_stopped++;
			break;
		default:
			if (delayacct_is_task_waiting_on_io(tsk))
				stats->nr_io_wait++;
			break;
		}
	}
4516
	css_task_iter_end(&it);
B
Balbir Singh 已提交
4517

4518
	mutex_unlock(&cgroup_mutex);
T
Tejun Heo 已提交
4519
	return 0;
B
Balbir Singh 已提交
4520 4521
}

4522

4523
/*
4524
 * seq_file methods for the tasks/procs files. The seq_file position is the
4525
 * next pid to display; the seq_file iterator is a pointer to the pid
4526
 * in the cgroup->l->list array.
4527
 */
4528

4529
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
4530
{
4531 4532 4533 4534 4535 4536
	/*
	 * Initially we receive a position value that corresponds to
	 * one more than the last pid shown (or 0 on the first call or
	 * after a seek to the start). Use a binary-search to find the
	 * next pid to display, if any
	 */
T
Tejun Heo 已提交
4537
	struct kernfs_open_file *of = s->private;
4538
	struct cgroup *cgrp = seq_css(s)->cgroup;
4539
	struct cgroup_pidlist *l;
4540
	enum cgroup_filetype type = seq_cft(s)->private;
4541
	int index = 0, pid = *pos;
4542 4543 4544 4545 4546
	int *iter, ret;

	mutex_lock(&cgrp->pidlist_mutex);

	/*
4547
	 * !NULL @of->priv indicates that this isn't the first start()
4548
	 * after open.  If the matching pidlist is around, we can use that.
4549
	 * Look for it.  Note that @of->priv can't be used directly.  It
4550 4551
	 * could already have been destroyed.
	 */
4552 4553
	if (of->priv)
		of->priv = cgroup_pidlist_find(cgrp, type);
4554 4555 4556 4557 4558

	/*
	 * Either this is the first start() after open or the matching
	 * pidlist has been destroyed inbetween.  Create a new one.
	 */
4559 4560 4561
	if (!of->priv) {
		ret = pidlist_array_load(cgrp, type,
					 (struct cgroup_pidlist **)&of->priv);
4562 4563 4564
		if (ret)
			return ERR_PTR(ret);
	}
4565
	l = of->priv;
4566 4567

	if (pid) {
4568
		int end = l->length;
S
Stephen Rothwell 已提交
4569

4570 4571
		while (index < end) {
			int mid = (index + end) / 2;
4572
			if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
4573 4574
				index = mid;
				break;
4575
			} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
4576 4577 4578 4579 4580 4581
				index = mid + 1;
			else
				end = mid;
		}
	}
	/* If we're off the end of the array, we're done */
4582
	if (index >= l->length)
4583 4584
		return NULL;
	/* Update the abstract position to be the actual pid that we found */
4585
	iter = l->list + index;
4586
	*pos = cgroup_pid_fry(cgrp, *iter);
4587 4588 4589
	return iter;
}

4590
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
4591
{
T
Tejun Heo 已提交
4592
	struct kernfs_open_file *of = s->private;
4593
	struct cgroup_pidlist *l = of->priv;
4594

4595 4596
	if (l)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
4597
				 CGROUP_PIDLIST_DESTROY_DELAY);
4598
	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
4599 4600
}

4601
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
4602
{
T
Tejun Heo 已提交
4603
	struct kernfs_open_file *of = s->private;
4604
	struct cgroup_pidlist *l = of->priv;
4605 4606
	pid_t *p = v;
	pid_t *end = l->list + l->length;
4607 4608 4609 4610 4611 4612 4613 4614
	/*
	 * Advance to the next pid in the array. If this goes off the
	 * end, we're done
	 */
	p++;
	if (p >= end) {
		return NULL;
	} else {
4615
		*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
4616 4617 4618 4619
		return p;
	}
}

4620
static int cgroup_pidlist_show(struct seq_file *s, void *v)
4621
{
4622 4623 4624
	seq_printf(s, "%d\n", *(int *)v);

	return 0;
4625
}
4626

4627 4628
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
					 struct cftype *cft)
4629
{
4630
	return notify_on_release(css->cgroup);
4631 4632
}

4633 4634
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
					  struct cftype *cft, u64 val)
4635 4636
{
	if (val)
4637
		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4638
	else
4639
		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4640 4641 4642
	return 0;
}

4643 4644
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
4645
{
4646
	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4647 4648
}

4649 4650
static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
4651 4652
{
	if (val)
4653
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4654
	else
4655
		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4656 4657 4658
	return 0;
}

4659 4660
/* cgroup core interface files for the default hierarchy */
static struct cftype cgroup_dfl_base_files[] = {
4661
	{
4662
		.name = "cgroup.procs",
4663
		.file_offset = offsetof(struct cgroup, procs_file),
4664 4665 4666 4667
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
4668
		.private = CGROUP_FILE_PROCS,
4669
		.write = cgroup_procs_write,
4670
	},
4671 4672 4673 4674 4675 4676 4677
	{
		.name = "cgroup.controllers",
		.seq_show = cgroup_controllers_show,
	},
	{
		.name = "cgroup.subtree_control",
		.seq_show = cgroup_subtree_control_show,
4678
		.write = cgroup_subtree_control_write,
4679
	},
4680
	{
4681
		.name = "cgroup.events",
4682
		.flags = CFTYPE_NOT_ON_ROOT,
4683
		.file_offset = offsetof(struct cgroup, events_file),
4684
		.seq_show = cgroup_events_show,
4685
	},
4686 4687
	{ }	/* terminate */
};
4688

4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709
/* cgroup core interface files for the legacy hierarchies */
static struct cftype cgroup_legacy_base_files[] = {
	{
		.name = "cgroup.procs",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_PROCS,
		.write = cgroup_procs_write,
	},
	{
		.name = "cgroup.clone_children",
		.read_u64 = cgroup_clone_children_read,
		.write_u64 = cgroup_clone_children_write,
	},
	{
		.name = "cgroup.sane_behavior",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_sane_behavior_show,
	},
4710 4711
	{
		.name = "tasks",
4712 4713 4714 4715
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
4716
		.private = CGROUP_FILE_TASKS,
4717
		.write = cgroup_tasks_write,
4718 4719 4720 4721 4722 4723
	},
	{
		.name = "notify_on_release",
		.read_u64 = cgroup_read_notify_on_release,
		.write_u64 = cgroup_write_notify_on_release,
	},
4724 4725
	{
		.name = "release_agent",
4726
		.flags = CFTYPE_ONLY_ON_ROOT,
4727
		.seq_show = cgroup_release_agent_show,
4728
		.write = cgroup_release_agent_write,
4729
		.max_write_len = PATH_MAX - 1,
4730
	},
T
Tejun Heo 已提交
4731
	{ }	/* terminate */
4732 4733
};

4734 4735 4736 4737 4738 4739 4740
/*
 * css destruction is four-stage process.
 *
 * 1. Destruction starts.  Killing of the percpu_ref is initiated.
 *    Implemented in kill_css().
 *
 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4741 4742 4743
 *    and thus css_tryget_online() is guaranteed to fail, the css can be
 *    offlined by invoking offline_css().  After offlining, the base ref is
 *    put.  Implemented in css_killed_work_fn().
4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755
 *
 * 3. When the percpu_ref reaches zero, the only possible remaining
 *    accessors are inside RCU read sections.  css_release() schedules the
 *    RCU callback.
 *
 * 4. After the grace period, the css can be freed.  Implemented in
 *    css_free_work_fn().
 *
 * It is actually hairier because both step 2 and 4 require process context
 * and thus involve punting to css->destroy_work adding two additional
 * steps to the already complex sequence.
 */
4756
static void css_free_work_fn(struct work_struct *work)
4757 4758
{
	struct cgroup_subsys_state *css =
4759
		container_of(work, struct cgroup_subsys_state, destroy_work);
4760
	struct cgroup_subsys *ss = css->ss;
4761
	struct cgroup *cgrp = css->cgroup;
4762

4763 4764
	percpu_ref_exit(&css->refcnt);

4765
	if (ss) {
4766
		/* css free path */
4767
		struct cgroup_subsys_state *parent = css->parent;
4768 4769 4770 4771
		int id = css->id;

		ss->css_free(css);
		cgroup_idr_remove(&ss->css_idr, id);
4772
		cgroup_put(cgrp);
4773 4774 4775

		if (parent)
			css_put(parent);
4776 4777 4778 4779
	} else {
		/* cgroup free path */
		atomic_dec(&cgrp->root->nr_cgrps);
		cgroup_pidlist_destroy_all(cgrp);
4780
		cancel_work_sync(&cgrp->release_agent_work);
4781

T
Tejun Heo 已提交
4782
		if (cgroup_parent(cgrp)) {
4783 4784 4785 4786 4787 4788
			/*
			 * We get a ref to the parent, and put the ref when
			 * this cgroup is being freed, so it's guaranteed
			 * that the parent won't be destroyed before its
			 * children.
			 */
T
Tejun Heo 已提交
4789
			cgroup_put(cgroup_parent(cgrp));
4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800
			kernfs_put(cgrp->kn);
			kfree(cgrp);
		} else {
			/*
			 * This is root cgroup's refcnt reaching zero,
			 * which indicates that the root should be
			 * released.
			 */
			cgroup_destroy_root(cgrp->root);
		}
	}
4801 4802
}

4803
static void css_free_rcu_fn(struct rcu_head *rcu_head)
4804 4805
{
	struct cgroup_subsys_state *css =
4806
		container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4807

4808
	INIT_WORK(&css->destroy_work, css_free_work_fn);
4809
	queue_work(cgroup_destroy_wq, &css->destroy_work);
4810 4811
}

4812
static void css_release_work_fn(struct work_struct *work)
4813 4814
{
	struct cgroup_subsys_state *css =
4815
		container_of(work, struct cgroup_subsys_state, destroy_work);
4816
	struct cgroup_subsys *ss = css->ss;
4817
	struct cgroup *cgrp = css->cgroup;
4818

4819 4820
	mutex_lock(&cgroup_mutex);

4821
	css->flags |= CSS_RELEASED;
4822 4823
	list_del_rcu(&css->sibling);

4824 4825
	if (ss) {
		/* css release path */
4826
		cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4827 4828
		if (ss->css_released)
			ss->css_released(css);
4829 4830 4831 4832
	} else {
		/* cgroup release path */
		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
		cgrp->id = -1;
4833 4834 4835 4836 4837 4838 4839 4840

		/*
		 * There are two control paths which try to determine
		 * cgroup from dentry without going through kernfs -
		 * cgroupstats_build() and css_tryget_online_from_dir().
		 * Those are supported by RCU protecting clearing of
		 * cgrp->kn->priv backpointer.
		 */
4841 4842 4843
		if (cgrp->kn)
			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
					 NULL);
4844
	}
4845

4846 4847
	mutex_unlock(&cgroup_mutex);

4848
	call_rcu(&css->rcu_head, css_free_rcu_fn);
4849 4850 4851 4852 4853 4854 4855
}

static void css_release(struct percpu_ref *ref)
{
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

4856 4857
	INIT_WORK(&css->destroy_work, css_release_work_fn);
	queue_work(cgroup_destroy_wq, &css->destroy_work);
4858 4859
}

4860 4861
static void init_and_link_css(struct cgroup_subsys_state *css,
			      struct cgroup_subsys *ss, struct cgroup *cgrp)
4862
{
4863 4864
	lockdep_assert_held(&cgroup_mutex);

4865 4866
	cgroup_get(cgrp);

4867
	memset(css, 0, sizeof(*css));
4868
	css->cgroup = cgrp;
4869
	css->ss = ss;
4870 4871
	INIT_LIST_HEAD(&css->sibling);
	INIT_LIST_HEAD(&css->children);
4872
	css->serial_nr = css_serial_nr_next++;
4873
	atomic_set(&css->online_cnt, 0);
4874

T
Tejun Heo 已提交
4875 4876
	if (cgroup_parent(cgrp)) {
		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
4877 4878
		css_get(css->parent);
	}
4879

4880
	BUG_ON(cgroup_css(cgrp, ss));
4881 4882
}

4883
/* invoke ->css_online() on a new CSS and mark it online if successful */
4884
static int online_css(struct cgroup_subsys_state *css)
4885
{
4886
	struct cgroup_subsys *ss = css->ss;
T
Tejun Heo 已提交
4887 4888
	int ret = 0;

4889 4890
	lockdep_assert_held(&cgroup_mutex);

4891
	if (ss->css_online)
4892
		ret = ss->css_online(css);
4893
	if (!ret) {
4894
		css->flags |= CSS_ONLINE;
4895
		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4896 4897 4898 4899

		atomic_inc(&css->online_cnt);
		if (css->parent)
			atomic_inc(&css->parent->online_cnt);
4900
	}
T
Tejun Heo 已提交
4901
	return ret;
4902 4903
}

4904
/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4905
static void offline_css(struct cgroup_subsys_state *css)
4906
{
4907
	struct cgroup_subsys *ss = css->ss;
4908 4909 4910 4911 4912 4913

	lockdep_assert_held(&cgroup_mutex);

	if (!(css->flags & CSS_ONLINE))
		return;

4914 4915 4916
	if (ss->css_reset)
		ss->css_reset(css);

4917
	if (ss->css_offline)
4918
		ss->css_offline(css);
4919

4920
	css->flags &= ~CSS_ONLINE;
4921
	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
4922 4923

	wake_up_all(&css->cgroup->offline_waitq);
4924 4925
}

4926
/**
4927
 * css_create - create a cgroup_subsys_state
4928 4929 4930 4931
 * @cgrp: the cgroup new css will be associated with
 * @ss: the subsys of new css
 *
 * Create a new css associated with @cgrp - @ss pair.  On success, the new
4932 4933
 * css is online and installed in @cgrp.  This function doesn't create the
 * interface files.  Returns 0 on success, -errno on failure.
4934
 */
4935 4936
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
4937
{
T
Tejun Heo 已提交
4938
	struct cgroup *parent = cgroup_parent(cgrp);
4939
	struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
4940 4941 4942 4943 4944
	struct cgroup_subsys_state *css;
	int err;

	lockdep_assert_held(&cgroup_mutex);

4945
	css = ss->css_alloc(parent_css);
4946
	if (IS_ERR(css))
4947
		return css;
4948

4949
	init_and_link_css(css, ss, cgrp);
4950

4951
	err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
4952
	if (err)
4953
		goto err_free_css;
4954

V
Vladimir Davydov 已提交
4955
	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
4956 4957 4958
	if (err < 0)
		goto err_free_percpu_ref;
	css->id = err;
4959

4960
	/* @css is ready to be brought online now, make it visible */
4961
	list_add_tail_rcu(&css->sibling, &parent_css->children);
4962
	cgroup_idr_replace(&ss->css_idr, css, css->id);
4963 4964 4965

	err = online_css(css);
	if (err)
4966
		goto err_list_del;
4967

4968
	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
T
Tejun Heo 已提交
4969
	    cgroup_parent(parent)) {
4970
		pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4971
			current->comm, current->pid, ss->name);
4972
		if (!strcmp(ss->name, "memory"))
4973
			pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
4974 4975 4976
		ss->warned_broken_hierarchy = true;
	}

4977
	return css;
4978

4979 4980
err_list_del:
	list_del_rcu(&css->sibling);
4981
	cgroup_idr_remove(&ss->css_idr, css->id);
4982
err_free_percpu_ref:
4983
	percpu_ref_exit(&css->refcnt);
4984
err_free_css:
4985
	call_rcu(&css->rcu_head, css_free_rcu_fn);
4986
	return ERR_PTR(err);
4987 4988
}

4989
static struct cgroup *cgroup_create(struct cgroup *parent)
4990
{
4991 4992 4993
	struct cgroup_root *root = parent->root;
	struct cgroup *cgrp, *tcgrp;
	int level = parent->level + 1;
4994
	int ret;
4995

T
Tejun Heo 已提交
4996
	/* allocate the cgroup and its ID, 0 is reserved for the root */
4997 4998
	cgrp = kzalloc(sizeof(*cgrp) +
		       sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
4999 5000
	if (!cgrp)
		return ERR_PTR(-ENOMEM);
5001

5002
	ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
5003 5004 5005
	if (ret)
		goto out_free_cgrp;

5006 5007 5008 5009
	/*
	 * Temporarily set the pointer to NULL, so idr_find() won't return
	 * a half-baked cgroup.
	 */
V
Vladimir Davydov 已提交
5010
	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
5011
	if (cgrp->id < 0) {
T
Tejun Heo 已提交
5012
		ret = -ENOMEM;
5013
		goto out_cancel_ref;
5014 5015
	}

5016
	init_cgroup_housekeeping(cgrp);
5017

5018
	cgrp->self.parent = &parent->self;
T
Tejun Heo 已提交
5019
	cgrp->root = root;
5020 5021 5022 5023
	cgrp->level = level;

	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
5024

5025 5026 5027
	if (notify_on_release(parent))
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

5028 5029
	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
5030

5031
	cgrp->self.serial_nr = css_serial_nr_next++;
5032

5033
	/* allocation complete, commit to creation */
5034
	list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
5035
	atomic_inc(&root->nr_cgrps);
5036
	cgroup_get(parent);
5037

5038 5039 5040 5041
	/*
	 * @cgrp is now fully operational.  If something fails after this
	 * point, it'll be released via the normal destruction path.
	 */
5042
	cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
5043

5044 5045
	/*
	 * On the default hierarchy, a child doesn't automatically inherit
5046
	 * subtree_control from the parent.  Each is configured manually.
5047
	 */
5048
	if (!cgroup_on_dfl(cgrp))
5049
		cgrp->subtree_control = cgroup_control(cgrp);
5050 5051 5052 5053 5054 5055 5056

	cgroup_propagate_control(cgrp);

	/* @cgrp doesn't have dir yet so the following will only create csses */
	ret = cgroup_apply_control_enable(cgrp);
	if (ret)
		goto out_destroy;
T
Tejun Heo 已提交
5057

5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074
	return cgrp;

out_cancel_ref:
	percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
	kfree(cgrp);
	return ERR_PTR(ret);
out_destroy:
	cgroup_destroy_locked(cgrp);
	return ERR_PTR(ret);
}

static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
			umode_t mode)
{
	struct cgroup *parent, *cgrp;
	struct kernfs_node *kn;
5075
	int ret;
5076 5077 5078 5079 5080

	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
	if (strchr(name, '\n'))
		return -EINVAL;

5081
	parent = cgroup_kn_lock_live(parent_kn, false);
5082 5083 5084 5085 5086 5087 5088 5089 5090
	if (!parent)
		return -ENODEV;

	cgrp = cgroup_create(parent);
	if (IS_ERR(cgrp)) {
		ret = PTR_ERR(cgrp);
		goto out_unlock;
	}

5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108
	/* create the directory */
	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
	if (IS_ERR(kn)) {
		ret = PTR_ERR(kn);
		goto out_destroy;
	}
	cgrp->kn = kn;

	/*
	 * This extra ref will be put in cgroup_free_fn() and guarantees
	 * that @cgrp->kn is always accessible.
	 */
	kernfs_get(kn);

	ret = cgroup_kn_set_ugid(kn);
	if (ret)
		goto out_destroy;

5109
	ret = css_populate_dir(&cgrp->self);
5110 5111 5112
	if (ret)
		goto out_destroy;

5113 5114 5115
	ret = cgroup_apply_control_enable(cgrp);
	if (ret)
		goto out_destroy;
5116 5117

	/* let's create and online css's */
T
Tejun Heo 已提交
5118
	kernfs_activate(kn);
5119

T
Tejun Heo 已提交
5120 5121
	ret = 0;
	goto out_unlock;
5122

5123 5124
out_destroy:
	cgroup_destroy_locked(cgrp);
T
Tejun Heo 已提交
5125
out_unlock:
5126
	cgroup_kn_unlock(parent_kn);
T
Tejun Heo 已提交
5127
	return ret;
5128 5129
}

5130 5131
/*
 * This is called when the refcnt of a css is confirmed to be killed.
5132 5133
 * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
 * initate destruction and put the css ref from kill_css().
5134 5135
 */
static void css_killed_work_fn(struct work_struct *work)
5136
{
5137 5138
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, destroy_work);
5139

5140
	mutex_lock(&cgroup_mutex);
5141

5142 5143 5144 5145 5146 5147 5148 5149
	do {
		offline_css(css);
		css_put(css);
		/* @css can't go away while we're holding cgroup_mutex */
		css = css->parent;
	} while (css && atomic_dec_and_test(&css->online_cnt));

	mutex_unlock(&cgroup_mutex);
5150 5151
}

5152 5153
/* css kill confirmation processing requires process context, bounce */
static void css_killed_ref_fn(struct percpu_ref *ref)
5154 5155 5156 5157
{
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

5158 5159 5160 5161
	if (atomic_dec_and_test(&css->online_cnt)) {
		INIT_WORK(&css->destroy_work, css_killed_work_fn);
		queue_work(cgroup_destroy_wq, &css->destroy_work);
	}
5162 5163
}

5164 5165 5166 5167 5168 5169
/**
 * kill_css - destroy a css
 * @css: css to destroy
 *
 * This function initiates destruction of @css by removing cgroup interface
 * files and putting its base reference.  ->css_offline() will be invoked
5170 5171
 * asynchronously once css_tryget_online() is guaranteed to fail and when
 * the reference count reaches zero, @css will be released.
5172 5173
 */
static void kill_css(struct cgroup_subsys_state *css)
T
Tejun Heo 已提交
5174
{
5175
	lockdep_assert_held(&cgroup_mutex);
5176

T
Tejun Heo 已提交
5177 5178 5179 5180
	/*
	 * This must happen before css is disassociated with its cgroup.
	 * See seq_css() for details.
	 */
5181
	css_clear_dir(css);
5182

T
Tejun Heo 已提交
5183 5184 5185 5186 5187 5188 5189 5190 5191
	/*
	 * Killing would put the base ref, but we need to keep it alive
	 * until after ->css_offline().
	 */
	css_get(css);

	/*
	 * cgroup core guarantees that, by the time ->css_offline() is
	 * invoked, no new css reference will be given out via
5192
	 * css_tryget_online().  We can't simply call percpu_ref_kill() and
T
Tejun Heo 已提交
5193 5194 5195 5196 5197 5198 5199
	 * proceed to offlining css's because percpu_ref_kill() doesn't
	 * guarantee that the ref is seen as killed on all CPUs on return.
	 *
	 * Use percpu_ref_kill_and_confirm() to get notifications as each
	 * css is confirmed to be seen as killed on all CPUs.
	 */
	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
5200 5201 5202 5203 5204 5205 5206 5207
}

/**
 * cgroup_destroy_locked - the first stage of cgroup destruction
 * @cgrp: cgroup to be destroyed
 *
 * css's make use of percpu refcnts whose killing latency shouldn't be
 * exposed to userland and are RCU protected.  Also, cgroup core needs to
5208 5209 5210
 * guarantee that css_tryget_online() won't succeed by the time
 * ->css_offline() is invoked.  To satisfy all the requirements,
 * destruction is implemented in the following two steps.
5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225
 *
 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
 *     userland visible parts and start killing the percpu refcnts of
 *     css's.  Set up so that the next stage will be kicked off once all
 *     the percpu refcnts are confirmed to be killed.
 *
 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
 *     rest of destruction.  Once all cgroup references are gone, the
 *     cgroup is RCU-freed.
 *
 * This function implements s1.  After this step, @cgrp is gone as far as
 * the userland is concerned and a new cgroup with the same name may be
 * created.  As cgroup doesn't care about the names internally, this
 * doesn't cause any problem.
 */
5226 5227
static int cgroup_destroy_locked(struct cgroup *cgrp)
	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
5228
{
T
Tejun Heo 已提交
5229
	struct cgroup_subsys_state *css;
T
Tejun Heo 已提交
5230
	int ssid;
5231

5232 5233
	lockdep_assert_held(&cgroup_mutex);

5234 5235 5236 5237 5238
	/*
	 * Only migration can raise populated from zero and we're already
	 * holding cgroup_mutex.
	 */
	if (cgroup_is_populated(cgrp))
5239
		return -EBUSY;
L
Li Zefan 已提交
5240

5241
	/*
5242 5243 5244
	 * Make sure there's no live children.  We can't test emptiness of
	 * ->self.children as dead children linger on it while being
	 * drained; otherwise, "rmdir parent/child parent" may fail.
5245
	 */
5246
	if (css_has_online_children(&cgrp->self))
5247 5248
		return -EBUSY;

5249 5250
	/*
	 * Mark @cgrp dead.  This prevents further task migration and child
5251
	 * creation by disabling cgroup_lock_live_group().
5252
	 */
5253
	cgrp->self.flags &= ~CSS_ONLINE;
5254

5255
	/* initiate massacre of all css's */
T
Tejun Heo 已提交
5256 5257
	for_each_css(css, ssid, cgrp)
		kill_css(css);
5258 5259

	/*
5260 5261
	 * Remove @cgrp directory along with the base files.  @cgrp has an
	 * extra ref on its kn.
5262
	 */
5263
	kernfs_remove(cgrp->kn);
5264

T
Tejun Heo 已提交
5265
	check_for_release(cgroup_parent(cgrp));
T
Tejun Heo 已提交
5266

5267
	/* put the base reference */
5268
	percpu_ref_kill(&cgrp->self.refcnt);
5269

5270 5271 5272
	return 0;
};

T
Tejun Heo 已提交
5273
static int cgroup_rmdir(struct kernfs_node *kn)
5274
{
5275
	struct cgroup *cgrp;
T
Tejun Heo 已提交
5276
	int ret = 0;
5277

5278
	cgrp = cgroup_kn_lock_live(kn, false);
5279 5280
	if (!cgrp)
		return 0;
5281

5282
	ret = cgroup_destroy_locked(cgrp);
5283

5284
	cgroup_kn_unlock(kn);
5285
	return ret;
5286 5287
}

T
Tejun Heo 已提交
5288 5289 5290 5291 5292 5293 5294 5295
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
	.remount_fs		= cgroup_remount,
	.show_options		= cgroup_show_options,
	.mkdir			= cgroup_mkdir,
	.rmdir			= cgroup_rmdir,
	.rename			= cgroup_rename,
};

5296
static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
5297 5298
{
	struct cgroup_subsys_state *css;
D
Diego Calleja 已提交
5299

5300
	pr_debug("Initializing cgroup subsys %s\n", ss->name);
5301

5302 5303
	mutex_lock(&cgroup_mutex);

5304
	idr_init(&ss->css_idr);
T
Tejun Heo 已提交
5305
	INIT_LIST_HEAD(&ss->cfts);
5306

5307 5308 5309
	/* Create the root cgroup state for this subsystem */
	ss->root = &cgrp_dfl_root;
	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
5310 5311
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
5312
	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
5313 5314 5315 5316 5317 5318 5319

	/*
	 * Root csses are never destroyed and we can't initialize
	 * percpu_ref during early init.  Disable refcnting.
	 */
	css->flags |= CSS_NO_REF;

5320
	if (early) {
5321
		/* allocation can't be done safely during early init */
5322 5323 5324 5325 5326
		css->id = 1;
	} else {
		css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
		BUG_ON(css->id < 0);
	}
5327

L
Li Zefan 已提交
5328
	/* Update the init_css_set to contain a subsys
5329
	 * pointer to this state - since the subsystem is
L
Li Zefan 已提交
5330
	 * newly registered, all tasks and hence the
5331
	 * init_css_set is in the subsystem's root cgroup. */
5332
	init_css_set.subsys[ss->id] = css;
5333

5334 5335
	have_fork_callback |= (bool)ss->fork << ss->id;
	have_exit_callback |= (bool)ss->exit << ss->id;
5336
	have_free_callback |= (bool)ss->free << ss->id;
5337
	have_canfork_callback |= (bool)ss->can_fork << ss->id;
5338

L
Li Zefan 已提交
5339 5340 5341 5342 5343
	/* At system boot, before all subsystems have been
	 * registered, no tasks have been forked, so we don't
	 * need to invoke fork callbacks here. */
	BUG_ON(!list_empty(&init_task.tasks));

5344
	BUG_ON(online_css(css));
5345

B
Ben Blum 已提交
5346 5347 5348
	mutex_unlock(&cgroup_mutex);
}

5349
/**
L
Li Zefan 已提交
5350 5351 5352 5353
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
5354 5355 5356
 */
int __init cgroup_init_early(void)
{
5357
	static struct cgroup_sb_opts __initdata opts;
5358
	struct cgroup_subsys *ss;
5359
	int i;
5360

5361
	init_cgroup_root(&cgrp_dfl_root, &opts);
5362 5363
	cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;

5364
	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
5365

T
Tejun Heo 已提交
5366
	for_each_subsys(ss, i) {
5367
		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
5368
		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
5369
		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
5370
		     ss->id, ss->name);
5371 5372 5373
		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);

5374
		ss->id = i;
5375
		ss->name = cgroup_subsys_name[i];
5376 5377
		if (!ss->legacy_name)
			ss->legacy_name = cgroup_subsys_name[i];
5378 5379

		if (ss->early_init)
5380
			cgroup_init_subsys(ss, true);
5381 5382 5383 5384
	}
	return 0;
}

5385
static u16 cgroup_disable_mask __initdata;
5386

5387
/**
L
Li Zefan 已提交
5388 5389 5390 5391
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
5392 5393 5394
 */
int __init cgroup_init(void)
{
5395
	struct cgroup_subsys *ss;
5396
	int ssid;
5397

5398
	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
5399
	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
5400 5401
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
5402

T
Tejun Heo 已提交
5403 5404
	mutex_lock(&cgroup_mutex);

5405 5406 5407 5408 5409 5410
	/*
	 * Add init_css_set to the hash table so that dfl_root can link to
	 * it during init.
	 */
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));
5411

5412
	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
5413

T
Tejun Heo 已提交
5414 5415
	mutex_unlock(&cgroup_mutex);

5416
	for_each_subsys(ss, ssid) {
5417 5418 5419 5420 5421 5422 5423 5424 5425 5426
		if (ss->early_init) {
			struct cgroup_subsys_state *css =
				init_css_set.subsys[ss->id];

			css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
						   GFP_KERNEL);
			BUG_ON(css->id < 0);
		} else {
			cgroup_init_subsys(ss, false);
		}
5427

T
Tejun Heo 已提交
5428 5429
		list_add_tail(&init_css_set.e_cset_node[ssid],
			      &cgrp_dfl_root.cgrp.e_csets[ssid]);
5430 5431

		/*
5432 5433 5434
		 * Setting dfl_root subsys_mask needs to consider the
		 * disabled flag and cftype registration needs kmalloc,
		 * both of which aren't available during early_init.
5435
		 */
5436 5437 5438 5439
		if (cgroup_disable_mask & (1 << ssid)) {
			static_branch_disable(cgroup_subsys_enabled_key[ssid]);
			printk(KERN_INFO "Disabling %s control group subsystem\n",
			       ss->name);
5440
			continue;
5441
		}
5442

5443 5444 5445 5446
		if (cgroup_ssid_no_v1(ssid))
			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
			       ss->name);

5447 5448
		cgrp_dfl_root.subsys_mask |= 1 << ss->id;

5449
		if (!ss->dfl_cftypes)
T
Tejun Heo 已提交
5450
			cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
5451

5452 5453 5454 5455 5456
		if (ss->dfl_cftypes == ss->legacy_cftypes) {
			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
		} else {
			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
5457
		}
5458 5459 5460

		if (ss->bind)
			ss->bind(init_css_set.subsys[ssid]);
5461 5462
	}

5463 5464 5465 5466 5467
	/* init_css_set.subsys[] has been updated, re-hash */
	hash_del(&init_css_set.hlist);
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

5468 5469
	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
	WARN_ON(register_filesystem(&cgroup_fs_type));
5470
	WARN_ON(register_filesystem(&cgroup2_fs_type));
5471
	WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
5472

T
Tejun Heo 已提交
5473
	return 0;
5474
}
5475

5476 5477 5478 5479 5480
static int __init cgroup_wq_init(void)
{
	/*
	 * There isn't much point in executing destruction path in
	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
5481
	 * Use 1 for @max_active.
5482 5483 5484 5485
	 *
	 * We would prefer to do this in cgroup_init() above, but that
	 * is called before init_workqueues(): so leave this until after.
	 */
5486
	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5487
	BUG_ON(!cgroup_destroy_wq);
5488 5489 5490 5491 5492 5493 5494 5495 5496

	/*
	 * Used to destroy pidlists and separate to serve as flush domain.
	 * Cap @max_active to 1 too.
	 */
	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
						    0, 1);
	BUG_ON(!cgroup_pidlist_destroy_wq);

5497 5498 5499 5500
	return 0;
}
core_initcall(cgroup_wq_init);

5501 5502 5503 5504 5505
/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 */
Z
Zefan Li 已提交
5506 5507
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
5508
{
T
Tejun Heo 已提交
5509
	char *buf, *path;
5510
	int retval;
5511
	struct cgroup_root *root;
5512 5513

	retval = -ENOMEM;
T
Tejun Heo 已提交
5514
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
5515 5516 5517 5518
	if (!buf)
		goto out;

	mutex_lock(&cgroup_mutex);
5519
	spin_lock_bh(&css_set_lock);
5520

5521
	for_each_root(root) {
5522
		struct cgroup_subsys *ss;
5523
		struct cgroup *cgrp;
T
Tejun Heo 已提交
5524
		int ssid, count = 0;
5525

T
Tejun Heo 已提交
5526
		if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
5527 5528
			continue;

5529
		seq_printf(m, "%d:", root->hierarchy_id);
5530 5531 5532 5533
		if (root != &cgrp_dfl_root)
			for_each_subsys(ss, ssid)
				if (root->subsys_mask & (1 << ssid))
					seq_printf(m, "%s%s", count++ ? "," : "",
5534
						   ss->legacy_name);
5535 5536 5537
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
5538
		seq_putc(m, ':');
5539

5540
		cgrp = task_cgroup_from_root(tsk, root);
5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558

		/*
		 * On traditional hierarchies, all zombie tasks show up as
		 * belonging to the root cgroup.  On the default hierarchy,
		 * while a zombie doesn't show up in "cgroup.procs" and
		 * thus can't be migrated, its /proc/PID/cgroup keeps
		 * reporting the cgroup it belonged to before exiting.  If
		 * the cgroup is removed before the zombie is reaped,
		 * " (deleted)" is appended to the cgroup path.
		 */
		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
			path = cgroup_path(cgrp, buf, PATH_MAX);
			if (!path) {
				retval = -ENAMETOOLONG;
				goto out_unlock;
			}
		} else {
			path = "/";
T
Tejun Heo 已提交
5559
		}
5560

T
Tejun Heo 已提交
5561
		seq_puts(m, path);
5562 5563 5564 5565 5566

		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
			seq_puts(m, " (deleted)\n");
		else
			seq_putc(m, '\n');
5567 5568
	}

Z
Zefan Li 已提交
5569
	retval = 0;
5570
out_unlock:
5571
	spin_unlock_bh(&css_set_lock);
5572 5573 5574 5575 5576 5577 5578 5579 5580
	mutex_unlock(&cgroup_mutex);
	kfree(buf);
out:
	return retval;
}

/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
5581
	struct cgroup_subsys *ss;
5582 5583
	int i;

5584
	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
B
Ben Blum 已提交
5585 5586 5587 5588 5589
	/*
	 * ideally we don't want subsystems moving around while we do this.
	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
	 * subsys/hierarchy state.
	 */
5590
	mutex_lock(&cgroup_mutex);
5591 5592

	for_each_subsys(ss, i)
5593
		seq_printf(m, "%s\t%d\t%d\t%d\n",
5594
			   ss->legacy_name, ss->root->hierarchy_id,
5595 5596
			   atomic_read(&ss->root->nr_cgrps),
			   cgroup_ssid_enabled(i));
5597

5598 5599 5600 5601 5602 5603
	mutex_unlock(&cgroup_mutex);
	return 0;
}

static int cgroupstats_open(struct inode *inode, struct file *file)
{
A
Al Viro 已提交
5604
	return single_open(file, proc_cgroupstats_show, NULL);
5605 5606
}

5607
static const struct file_operations proc_cgroupstats_operations = {
5608 5609 5610 5611 5612 5613
	.open = cgroupstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

5614
/**
5615
 * cgroup_fork - initialize cgroup related fields during copy_process()
L
Li Zefan 已提交
5616
 * @child: pointer to task_struct of forking parent process.
5617
 *
5618 5619 5620
 * A task is associated with the init_css_set until cgroup_post_fork()
 * attaches it to the parent's css_set.  Empty cg_list indicates that
 * @child isn't holding reference to its css_set.
5621 5622 5623
 */
void cgroup_fork(struct task_struct *child)
{
5624
	RCU_INIT_POINTER(child->cgroups, &init_css_set);
5625
	INIT_LIST_HEAD(&child->cg_list);
5626 5627
}

5628 5629 5630 5631 5632 5633 5634 5635
/**
 * cgroup_can_fork - called on a new task before the process is exposed
 * @child: the task in question.
 *
 * This calls the subsystem can_fork() callbacks. If the can_fork() callback
 * returns an error, the fork aborts with that error code. This allows for
 * a cgroup subsystem to conditionally allow or deny new forks.
 */
5636
int cgroup_can_fork(struct task_struct *child)
5637 5638 5639 5640
{
	struct cgroup_subsys *ss;
	int i, j, ret;

5641
	do_each_subsys_mask(ss, i, have_canfork_callback) {
5642
		ret = ss->can_fork(child);
5643 5644
		if (ret)
			goto out_revert;
5645
	} while_each_subsys_mask();
5646 5647 5648 5649 5650 5651 5652 5653

	return 0;

out_revert:
	for_each_subsys(ss, j) {
		if (j >= i)
			break;
		if (ss->cancel_fork)
5654
			ss->cancel_fork(child);
5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666
	}

	return ret;
}

/**
 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
 * @child: the task in question
 *
 * This calls the cancel_fork() callbacks if a fork failed *after*
 * cgroup_can_fork() succeded.
 */
5667
void cgroup_cancel_fork(struct task_struct *child)
5668 5669 5670 5671 5672 5673
{
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		if (ss->cancel_fork)
5674
			ss->cancel_fork(child);
5675 5676
}

5677
/**
L
Li Zefan 已提交
5678 5679 5680
 * cgroup_post_fork - called on a new task after adding it to the task list
 * @child: the task in question
 *
5681 5682 5683
 * Adds the task to the list running through its css_set if necessary and
 * call the subsystem fork() callbacks.  Has to be after the task is
 * visible on the task list in case we race with the first call to
5684
 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5685
 * list.
L
Li Zefan 已提交
5686
 */
5687
void cgroup_post_fork(struct task_struct *child)
5688
{
5689
	struct cgroup_subsys *ss;
5690 5691
	int i;

5692
	/*
D
Dongsheng Yang 已提交
5693
	 * This may race against cgroup_enable_task_cg_lists().  As that
5694 5695 5696 5697 5698 5699 5700
	 * function sets use_task_css_set_links before grabbing
	 * tasklist_lock and we just went through tasklist_lock to add
	 * @child, it's guaranteed that either we see the set
	 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
	 * @child during its iteration.
	 *
	 * If we won the race, @child is associated with %current's
5701
	 * css_set.  Grabbing css_set_lock guarantees both that the
5702 5703 5704 5705 5706 5707
	 * association is stable, and, on completion of the parent's
	 * migration, @child is visible in the source of migration or
	 * already in the destination cgroup.  This guarantee is necessary
	 * when implementing operations which need to migrate all tasks of
	 * a cgroup to another.
	 *
D
Dongsheng Yang 已提交
5708
	 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
5709 5710 5711
	 * will remain in init_css_set.  This is safe because all tasks are
	 * in the init_css_set before cg_links is enabled and there's no
	 * operation which transfers all tasks out of init_css_set.
5712
	 */
5713
	if (use_task_css_set_links) {
5714 5715
		struct css_set *cset;

5716
		spin_lock_bh(&css_set_lock);
5717
		cset = task_css_set(current);
5718 5719
		if (list_empty(&child->cg_list)) {
			get_css_set(cset);
T
Tejun Heo 已提交
5720
			css_set_move_task(child, NULL, cset, false);
5721
		}
5722
		spin_unlock_bh(&css_set_lock);
5723
	}
5724 5725 5726 5727 5728 5729

	/*
	 * Call ss->fork().  This must happen after @child is linked on
	 * css_set; otherwise, @child might change state between ->fork()
	 * and addition to css_set.
	 */
5730
	do_each_subsys_mask(ss, i, have_fork_callback) {
5731
		ss->fork(child);
5732
	} while_each_subsys_mask();
5733
}
5734

5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746
/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Description: Detach cgroup from @tsk and release it.
 *
 * Note that cgroups marked notify_on_release force every task in
 * them to take the global cgroup_mutex mutex when exiting.
 * This could impact scaling on very large systems.  Be reluctant to
 * use notify_on_release cgroups where very high task exit scaling
 * is required on large systems.
 *
5747 5748 5749 5750 5751
 * We set the exiting tasks cgroup to the root cgroup (top_cgroup).  We
 * call cgroup_exit() while the task is still competent to handle
 * notify_on_release(), then leave the task attached to the root cgroup in
 * each hierarchy for the remainder of its exit.  No need to bother with
 * init_css_set refcnting.  init_css_set never goes away and we can't race
5752
 * with migration path - PF_EXITING is visible to migration path.
5753
 */
5754
void cgroup_exit(struct task_struct *tsk)
5755
{
5756
	struct cgroup_subsys *ss;
5757
	struct css_set *cset;
5758
	int i;
5759 5760

	/*
5761
	 * Unlink from @tsk from its css_set.  As migration path can't race
5762
	 * with us, we can check css_set and cg_list without synchronization.
5763
	 */
5764 5765
	cset = task_css_set(tsk);

5766
	if (!list_empty(&tsk->cg_list)) {
5767
		spin_lock_bh(&css_set_lock);
T
Tejun Heo 已提交
5768
		css_set_move_task(tsk, cset, NULL, false);
5769
		spin_unlock_bh(&css_set_lock);
5770 5771
	} else {
		get_css_set(cset);
5772 5773
	}

5774
	/* see cgroup_post_fork() for details */
5775
	do_each_subsys_mask(ss, i, have_exit_callback) {
5776
		ss->exit(tsk);
5777
	} while_each_subsys_mask();
5778
}
5779

5780 5781 5782
void cgroup_free(struct task_struct *task)
{
	struct css_set *cset = task_css_set(task);
5783 5784 5785
	struct cgroup_subsys *ss;
	int ssid;

5786
	do_each_subsys_mask(ss, ssid, have_free_callback) {
5787
		ss->free(task);
5788
	} while_each_subsys_mask();
5789

5790
	put_css_set(cset);
5791
}
5792

5793
static void check_for_release(struct cgroup *cgrp)
5794
{
5795
	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
5796 5797
	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
		schedule_work(&cgrp->release_agent_work);
5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824
}

/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 */
static void cgroup_release_agent(struct work_struct *work)
{
5825 5826 5827 5828 5829
	struct cgroup *cgrp =
		container_of(work, struct cgroup, release_agent_work);
	char *pathbuf = NULL, *agentbuf = NULL, *path;
	char *argv[3], *envp[3];

5830
	mutex_lock(&cgroup_mutex);
5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849

	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
	if (!pathbuf || !agentbuf)
		goto out;

	path = cgroup_path(cgrp, pathbuf, PATH_MAX);
	if (!path)
		goto out;

	argv[0] = agentbuf;
	argv[1] = path;
	argv[2] = NULL;

	/* minimal command environment */
	envp[0] = "HOME=/";
	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
	envp[2] = NULL;

5850
	mutex_unlock(&cgroup_mutex);
5851
	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5852
	goto out_free;
5853
out:
5854
	mutex_unlock(&cgroup_mutex);
5855
out_free:
5856 5857
	kfree(agentbuf);
	kfree(pathbuf);
5858
}
5859 5860 5861

static int __init cgroup_disable(char *str)
{
5862
	struct cgroup_subsys *ss;
5863
	char *token;
5864
	int i;
5865 5866 5867 5868

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;
5869

T
Tejun Heo 已提交
5870
		for_each_subsys(ss, i) {
5871 5872 5873
			if (strcmp(token, ss->name) &&
			    strcmp(token, ss->legacy_name))
				continue;
5874
			cgroup_disable_mask |= 1 << i;
5875 5876 5877 5878 5879
		}
	}
	return 1;
}
__setup("cgroup_disable=", cgroup_disable);
K
KAMEZAWA Hiroyuki 已提交
5880

5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891
static int __init cgroup_no_v1(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		if (!strcmp(token, "all")) {
5892
			cgroup_no_v1_mask = U16_MAX;
5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907
			break;
		}

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name) &&
			    strcmp(token, ss->legacy_name))
				continue;

			cgroup_no_v1_mask |= 1 << i;
		}
	}
	return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);

5908
/**
5909
 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
5910 5911
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
5912
 *
5913 5914 5915
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.  If such css doesn't exist
 * or can't be pinned, an ERR_PTR value is returned.
S
Stephane Eranian 已提交
5916
 */
5917 5918
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
						       struct cgroup_subsys *ss)
S
Stephane Eranian 已提交
5919
{
T
Tejun Heo 已提交
5920
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
5921
	struct file_system_type *s_type = dentry->d_sb->s_type;
T
Tejun Heo 已提交
5922
	struct cgroup_subsys_state *css = NULL;
S
Stephane Eranian 已提交
5923 5924
	struct cgroup *cgrp;

5925
	/* is @dentry a cgroup dir? */
5926 5927
	if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
	    !kn || kernfs_type(kn) != KERNFS_DIR)
S
Stephane Eranian 已提交
5928 5929
		return ERR_PTR(-EBADF);

5930 5931
	rcu_read_lock();

T
Tejun Heo 已提交
5932 5933 5934
	/*
	 * This path doesn't originate from kernfs and @kn could already
	 * have been or be removed at any point.  @kn->priv is RCU
5935
	 * protected for this access.  See css_release_work_fn() for details.
T
Tejun Heo 已提交
5936 5937 5938 5939
	 */
	cgrp = rcu_dereference(kn->priv);
	if (cgrp)
		css = cgroup_css(cgrp, ss);
5940

5941
	if (!css || !css_tryget_online(css))
5942 5943 5944 5945
		css = ERR_PTR(-ENOENT);

	rcu_read_unlock();
	return css;
S
Stephane Eranian 已提交
5946 5947
}

5948 5949 5950 5951 5952 5953 5954 5955 5956 5957
/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Returns the css if there's valid one with @id, otherwise returns NULL.
 * Should be called under rcu_read_lock().
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
5958
	WARN_ON_ONCE(!rcu_read_lock_held());
5959
	return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
S
Stephane Eranian 已提交
5960 5961
}

5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995
/**
 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
 * @path: path on the default hierarchy
 *
 * Find the cgroup at @path on the default hierarchy, increment its
 * reference count and return it.  Returns pointer to the found cgroup on
 * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR)
 * if @path points to a non-directory.
 */
struct cgroup *cgroup_get_from_path(const char *path)
{
	struct kernfs_node *kn;
	struct cgroup *cgrp;

	mutex_lock(&cgroup_mutex);

	kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
	if (kn) {
		if (kernfs_type(kn) == KERNFS_DIR) {
			cgrp = kn->priv;
			cgroup_get(cgrp);
		} else {
			cgrp = ERR_PTR(-ENOTDIR);
		}
		kernfs_put(kn);
	} else {
		cgrp = ERR_PTR(-ENOENT);
	}

	mutex_unlock(&cgroup_mutex);
	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);

T
Tejun Heo 已提交
5996 5997 5998 5999 6000 6001 6002 6003
/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)

6004
DEFINE_SPINLOCK(cgroup_sk_update_lock);
T
Tejun Heo 已提交
6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048
static bool cgroup_sk_alloc_disabled __read_mostly;

void cgroup_sk_alloc_disable(void)
{
	if (cgroup_sk_alloc_disabled)
		return;
	pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
	cgroup_sk_alloc_disabled = true;
}

#else

#define cgroup_sk_alloc_disabled	false

#endif

void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
	if (cgroup_sk_alloc_disabled)
		return;

	rcu_read_lock();

	while (true) {
		struct css_set *cset;

		cset = task_css_set(current);
		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
			skcd->val = (unsigned long)cset->dfl_cgrp;
			break;
		}
		cpu_relax();
	}

	rcu_read_unlock();
}

void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
	cgroup_put(sock_cgroup_ptr(skcd));
}

#endif	/* CONFIG_SOCK_CGROUP_DATA */

6049
#ifdef CONFIG_CGROUP_DEBUG
6050 6051
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
6052 6053 6054 6055 6056 6057 6058 6059 6060
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	if (!css)
		return ERR_PTR(-ENOMEM);

	return css;
}

6061
static void debug_css_free(struct cgroup_subsys_state *css)
6062
{
6063
	kfree(css);
6064 6065
}

6066 6067
static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
6068
{
6069
	return cgroup_task_count(css->cgroup);
6070 6071
}

6072 6073
static u64 current_css_set_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
6074 6075 6076 6077
{
	return (u64)(unsigned long)current->cgroups;
}

6078
static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
L
Li Zefan 已提交
6079
					 struct cftype *cft)
6080 6081 6082 6083
{
	u64 count;

	rcu_read_lock();
6084
	count = atomic_read(&task_css_set(current)->refcount);
6085 6086 6087 6088
	rcu_read_unlock();
	return count;
}

6089
static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
6090
{
6091
	struct cgrp_cset_link *link;
6092
	struct css_set *cset;
T
Tejun Heo 已提交
6093 6094 6095 6096 6097
	char *name_buf;

	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
	if (!name_buf)
		return -ENOMEM;
6098

6099
	spin_lock_bh(&css_set_lock);
6100
	rcu_read_lock();
6101
	cset = rcu_dereference(current->cgroups);
6102
	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
6103 6104
		struct cgroup *c = link->cgrp;

T
Tejun Heo 已提交
6105
		cgroup_name(c, name_buf, NAME_MAX + 1);
6106
		seq_printf(seq, "Root %d group %s\n",
T
Tejun Heo 已提交
6107
			   c->root->hierarchy_id, name_buf);
6108 6109
	}
	rcu_read_unlock();
6110
	spin_unlock_bh(&css_set_lock);
T
Tejun Heo 已提交
6111
	kfree(name_buf);
6112 6113 6114 6115
	return 0;
}

#define MAX_TASKS_SHOWN_PER_CSS 25
6116
static int cgroup_css_links_read(struct seq_file *seq, void *v)
6117
{
6118
	struct cgroup_subsys_state *css = seq_css(seq);
6119
	struct cgrp_cset_link *link;
6120

6121
	spin_lock_bh(&css_set_lock);
6122
	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
6123
		struct css_set *cset = link->cset;
6124 6125
		struct task_struct *task;
		int count = 0;
T
Tejun Heo 已提交
6126

6127
		seq_printf(seq, "css_set %p\n", cset);
T
Tejun Heo 已提交
6128

6129
		list_for_each_entry(task, &cset->tasks, cg_list) {
T
Tejun Heo 已提交
6130 6131 6132 6133 6134 6135 6136 6137 6138
			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
				goto overflow;
			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
		}

		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
				goto overflow;
			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
6139
		}
T
Tejun Heo 已提交
6140 6141 6142
		continue;
	overflow:
		seq_puts(seq, "  ...\n");
6143
	}
6144
	spin_unlock_bh(&css_set_lock);
6145 6146 6147
	return 0;
}

6148
static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
6149
{
6150
	return (!cgroup_is_populated(css->cgroup) &&
Z
Zefan Li 已提交
6151
		!css_has_online_children(&css->cgroup->self));
6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169
}

static struct cftype debug_files[] =  {
	{
		.name = "taskcount",
		.read_u64 = debug_taskcount_read,
	},

	{
		.name = "current_css_set",
		.read_u64 = current_css_set_read,
	},

	{
		.name = "current_css_set_refcount",
		.read_u64 = current_css_set_refcount_read,
	},

6170 6171
	{
		.name = "current_css_set_cg_links",
6172
		.seq_show = current_css_set_cg_links_read,
6173 6174 6175 6176
	},

	{
		.name = "cgroup_css_links",
6177
		.seq_show = cgroup_css_links_read,
6178 6179
	},

6180 6181 6182 6183 6184
	{
		.name = "releasable",
		.read_u64 = releasable_read,
	},

6185 6186
	{ }	/* terminate */
};
6187

6188
struct cgroup_subsys debug_cgrp_subsys = {
6189 6190
	.css_alloc = debug_css_alloc,
	.css_free = debug_css_free,
6191
	.legacy_cftypes = debug_files,
6192 6193
};
#endif /* CONFIG_CGROUP_DEBUG */