/*
 *  Generic process-grouping system.
 *
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
 *  Notifications support
 *  Copyright (C) 2009 Nokia Corporation
 *  Author: Kirill A. Shutemov
 *
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  ---------------------------------------------------
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "cgroup-internal.h"

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>

#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)

/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_lock protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
 * cgroup.h can use them for lockdep annotations.
 */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

struct percpu_rw_semaphore cgroup_threadgroup_rwsem;

#define cgroup_assert_mutex_or_rcu_locked()				\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			   !lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required");

/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 */
static struct workqueue_struct *cgroup_destroy_wq;

/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x)								\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);			\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * The default hierarchy, reserved for the subsystems that are otherwise
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
 */
struct cgroup_root cgrp_dfl_root;
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
static bool cgrp_dfl_visible;

/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;

/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;

/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Assign a monotonically increasing serial number to csses.  It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are always sorted in the ascending serial
 * number order on the list.  Protected by cgroup_mutex.
 */
static u64 css_serial_nr_next = 1;

/*
 * These bitmasks identify subsystems with specific features to avoid
 * having to do iterative checks repeatedly.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_free_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;

/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
	.count		= REFCOUNT_INIT(2),
	.user_ns	= &init_user_ns,
	.ns.ops		= &cgroupns_operations,
	.ns.inum	= PROC_CGROUP_INIT_INO,
	.root_cset	= &init_css_set,
};

static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);

/**
 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 * @ssid: subsys ID of interest
 *
 * cgroup_subsys_enabled() can only be used with literal subsys names which
 * is fine for individual subsystems but unsuitable for cgroup core.  This
 * is slower static_key_enabled() based test indexed by @ssid.
 */
bool cgroup_ssid_enabled(int ssid)
{
	if (CGROUP_SUBSYS_COUNT == 0)
		return false;

	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}

/**
 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 * @cgrp: the cgroup of interest
 *
 * The default hierarchy is the v2 interface of cgroup and this function
 * can be used to test whether a cgroup is on the default hierarchy for
 * cases where a subsystem should behave differently depending on the
 * interface version.
 *
 * The set of behaviors which change on the default hierarchy are still
 * being determined and the mount option is prefixed with __DEVEL__.
 *
 * List of changed behaviors:
 *
 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 *   and "name" are disallowed.
 *
 * - When mounting an existing superblock, mount options should match.
 *
 * - Remount is disallowed.
 *
 * - rename(2) is disallowed.
 *
 * - "tasks" is removed.  Everything should be at process granularity.  Use
 *   "cgroup.procs" instead.
 *
 * - "cgroup.procs" is not sorted.  pids will be unique unless they got
 *   recycled in between reads.
 *
 * - "release_agent" and "notify_on_release" are removed.  Replacement
 *   notification mechanism will be implemented.
 *
 * - "cgroup.clone_children" is removed.
 *
 * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
 *   and its descendants contain no task; otherwise, 1.  The file also
 *   generates kernfs notification which can be monitored through poll and
 *   [di]notify when the value of the file changes.
 *
 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 *   take masks of ancestors with non-empty cpus/mems, instead of being
 *   moved to an ancestor.
 *
 * - cpuset: a task can be moved into an empty cpuset, and again it takes
 *   masks of ancestors.
 *
 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
 *   is not created.
 *
 * - blkcg: blk-throttle becomes properly hierarchical.
 *
 * - debug: disallowed on the default hierarchy.
 */
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
	return cgrp->root == &cgrp_dfl_root;
}

/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
			    gfp_t gfp_mask)
{
	int ret;

	idr_preload(gfp_mask);
	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
	spin_unlock_bh(&cgroup_idr_lock);
	idr_preload_end();
	return ret;
}

static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
	void *ret;

	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_replace(idr, ptr, id);
	spin_unlock_bh(&cgroup_idr_lock);
	return ret;
}

static void cgroup_idr_remove(struct idr *idr, int id)
{
	spin_lock_bh(&cgroup_idr_lock);
	idr_remove(idr, id);
	spin_unlock_bh(&cgroup_idr_lock);
}

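/* return the parent cgroup of @cgrp, or NULL if @cgrp is a root cgroup */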
static struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *parent_css = cgrp->self.parent;

	if (parent_css)
		return container_of(parent_css, struct cgroup, self);
	return NULL;
}

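/*
 * Does @cgrp itself contain any tasks?  A css_set linked to @cgrp is
 * accounted in ->nr_populated_csets once it holds at least one task;
 * descendant cgroups are not considered here.
 */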
static bool cgroup_has_tasks(struct cgroup *cgrp)
{
	return cgrp->nr_populated_csets;
}

/* subsystems visibly enabled on a cgroup */
static u16 cgroup_control(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	u16 root_ss_mask = cgrp->root->subsys_mask;

	if (parent)
		return parent->subtree_control;

	if (cgroup_on_dfl(cgrp))
		root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
				  cgrp_dfl_implicit_ss_mask);
	return root_ss_mask;
}

/* subsystems enabled on a cgroup */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);

	if (parent)
		return parent->subtree_ss_mask;

	return cgrp->root->subsys_mask;
}

/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @subsys_id enabled.
 */
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	if (ss)
		return rcu_dereference_check(cgrp->subsys[ss->id],
					lockdep_is_held(&cgroup_mutex));
	else
		return &cgrp->self;
}

/**
 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Similar to cgroup_css() but returns the effective css, which is defined
 * as the matching css of the nearest ancestor including self which has @ss
 * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 * function is guaranteed to return non-NULL css.
 */
static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
						struct cgroup_subsys *ss)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!ss)
		return &cgrp->self;

	/*
	 * This function is used while updating css associations and thus
	 * can't test the csses directly.  Test ss_mask.
	 */
	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
		cgrp = cgroup_parent(cgrp);
		if (!cgrp)
			return NULL;
	}

	return cgroup_css(cgrp, ss);
}

/**
 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss.  The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 * the root css is returned, so this function always returns a valid css.
 * The returned css must be put using css_put().
 */
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
					     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	do {
		css = cgroup_css(cgrp, ss);

		if (css && css_tryget_online(css))
			goto out_unlock;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	css = init_css_set.subsys[ss->id];
	css_get(css);
out_unlock:
	rcu_read_unlock();
	return css;
}

static void __maybe_unused cgroup_get(struct cgroup *cgrp)
{
	css_get(&cgrp->self);
}

static void cgroup_get_live(struct cgroup *cgrp)
{
	WARN_ON_ONCE(cgroup_is_dead(cgrp));
	css_get(&cgrp->self);
}

static bool cgroup_tryget(struct cgroup *cgrp)
{
	return css_tryget(&cgrp->self);
}

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of_cft(of);

	/*
	 * This is open and unprotected implementation of cgroup_css().
	 * seq_css() is only called from a kernfs file operation which has
	 * an active reference on the file.  Because all the subsystem
	 * files are drained before a css is disassociated with a cgroup,
	 * the matching css from the cgroup's subsys table is guaranteed to
	 * be and stay valid until the enclosing operation is complete.
	 */
	if (cft->ss)
		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
	else
		return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);

/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_[tree_]mutex.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else

/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_[tree_]mutex.
 */
#define for_each_e_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
			;						\
		else

/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * @ss_mask is set.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */	\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)
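
/*
 * Illustrative (not compiled) use of the pair above - run a block only for
 * the subsystems whose bit is set in a mask:
 *
 *	struct cgroup_subsys *ss;
 *	int ssid;
 *
 *	do_each_subsys_mask(ss, ssid, cgrp_dfl_root.subsys_mask) {
 *		pr_debug("controller %s is bound to the default hierarchy\n",
 *			 ss->name);
 *	} while_each_subsys_mask();
 */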

/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else

/* walk live descendants in preorder */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/* walk live descendants in postorder */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)		\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
struct css_set init_css_set = {
	.refcount		= REFCOUNT_INIT(1),
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
};

static int css_set_count	= 1;	/* 1 for init_css_set */

/**
 * css_set_populated - does a css_set contain any tasks?
 * @cset: target css_set
 *
 * css_set_populated() should be the same as !!cset->nr_tasks at steady
 * state. However, css_set_populated() can be called while a task is being
 * added to or removed from the linked list before the nr_tasks is
 * properly updated. Hence, we can't just look at ->nr_tasks here.
 */
static bool css_set_populated(struct css_set *cset)
{
	lockdep_assert_held(&css_set_lock);

	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}

/**
 * cgroup_update_populated - update the populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last.  Update @cgrp->nr_populated_* accordingly.  The
 * count is propagated towards root so that a given cgroup's
 * nr_populated_children is zero iff none of its descendants contain any
 * tasks.
 *
 * @cgrp's interface file "cgroup.populated" is zero if both
 * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
 * 1 otherwise.  When the sum changes from or to zero, userland is notified
 * that the content of the interface file has changed.  This can be used to
 * detect when @cgrp and its descendants become populated or empty.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
	struct cgroup *child = NULL;
	int adj = populated ? 1 : -1;

	lockdep_assert_held(&css_set_lock);

	do {
		bool was_populated = cgroup_is_populated(cgrp);

		if (!child)
			cgrp->nr_populated_csets += adj;
		else
			cgrp->nr_populated_children += adj;

		if (was_populated == cgroup_is_populated(cgrp))
			break;

		cgroup1_check_for_release(cgrp);
		cgroup_file_notify(&cgrp->events_file);

		child = cgrp;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);
}

/**
 * css_set_update_populated - update populated state of a css_set
 * @cset: target css_set
 * @populated: whether @cset is populated or depopulated
 *
 * @cset is either getting the first task or losing the last.  Update the
 * populated counters of all associated cgroups accordingly.
 */
static void css_set_update_populated(struct css_set *cset, bool populated)
{
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
		cgroup_update_populated(link->cgrp, populated);
}

/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be NULL)
 * @to_cset: new css_set @task is being moved to (may be NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 * css_set, @from_cset can be NULL.  If @task is being disassociated
 * instead of moved, @to_cset can be NULL.
 *
 * This function automatically handles populated counter updates and
 * css_task_iter adjustments but the caller is responsible for managing
 * @from_cset and @to_cset's reference counts.
 */
static void css_set_move_task(struct task_struct *task,
			      struct css_set *from_cset, struct css_set *to_cset,
			      bool use_mg_tasks)
{
	lockdep_assert_held(&css_set_lock);

	if (to_cset && !css_set_populated(to_cset))
		css_set_update_populated(to_cset, true);

	if (from_cset) {
		struct css_task_iter *it, *pos;

		WARN_ON_ONCE(list_empty(&task->cg_list));

		/*
		 * @task is leaving, advance task iterators which are
		 * pointing to it so that they can resume at the next
		 * position.  Advancing an iterator might remove it from
		 * the list, use safe walk.  See css_task_iter_advance*()
		 * for details.
		 */
		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
					 iters_node)
			if (it->task_pos == &task->cg_list)
				css_task_iter_advance(it);

		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
	} else {
		WARN_ON_ONCE(!list_empty(&task->cg_list));
	}

	if (to_cset) {
		/*
		 * We are synchronized through cgroup_threadgroup_rwsem
		 * against PF_EXITING setting such that we can't race
		 * against cgroup_exit() changing the css_set to
		 * init_css_set and dropping the old one.
		 */
		WARN_ON_ONCE(task->flags & PF_EXITING);

		rcu_assign_pointer(task->cgroups, to_cset);
		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
							     &to_cset->tasks);
	}
}

/*
 * hash table for cgroup groups. This improves the performance of finding
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

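/* fold the css pointers of a candidate css_set into a hash key */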
static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
	unsigned long key = 0UL;
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		key += (unsigned long)css[i];
	key = (key >> 16) ^ key;

	return key;
}

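/*
 * Drop a reference on @cset with css_set_lock held.  When the last
 * reference goes away, the css_set is unhashed, unlinked from its cgroups
 * and freed.
 */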
void put_css_set_locked(struct css_set *cset)
{
	struct cgrp_cset_link *link, *tmp_link;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&css_set_lock);

	if (!refcount_dec_and_test(&cset->refcount))
		return;

	/* This css_set is dead. unlink it and release cgroup and css refs */
	for_each_subsys(ss, ssid) {
		list_del(&cset->e_cset_node[ssid]);
		css_put(cset->subsys[ssid]);
	}
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		if (cgroup_parent(link->cgrp))
			cgroup_put(link->cgrp);
		kfree(link);
	}

	kfree_rcu(cset, rcu_head);
}

/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct list_head *l1, *l2;

	/*
	 * On the default hierarchy, there can be csets which are
	 * associated with the same set of cgroups but different csses.
	 * Let's first ensure that csses match.
	 */
	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies.  As different cgroups may
	 * share the same effective css, this comparison is always
	 * necessary.
	 */
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length. */
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}
		/* Locate the cgroups associated with these links. */
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
		/* Hierarchies should be linked in the same order. */
		BUG_ON(cgrp1->root != cgrp2->root);

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}

/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
 */
static struct css_set *find_existing_css_set(struct css_set *old_cset,
					struct cgroup *cgrp,
					struct cgroup_subsys_state *template[])
{
	struct cgroup_root *root = cgrp->root;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	unsigned long key;
	int i;

	/*
	 * Build the set of subsystem state objects that we want to see in the
	 * new css_set. while subsystems can change globally, the entries here
	 * won't change, so no need for locking.
	 */
	for_each_subsys(ss, i) {
		if (root->subsys_mask & (1UL << i)) {
			/*
			 * @ss is in this hierarchy, so we want the
			 * effective css from @cgrp.
			 */
			template[i] = cgroup_e_css(cgrp, ss);
		} else {
			/*
			 * @ss is not in this hierarchy, so we don't want
			 * to change the css.
			 */
			template[i] = old_cset->subsys[i];
		}
	}

	key = css_set_hash(template);
	hash_for_each_possible(css_set_table, cset, hlist, key) {
		if (!compare_css_sets(cset, old_cset, cgrp, template))
			continue;

		/* This css_set matches what we need */
		return cset;
	}

	/* No existing cgroup group matched */
	return NULL;
}

static void free_cgrp_cset_links(struct list_head *links_to_free)
{
	struct cgrp_cset_link *link, *tmp_link;

	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
		list_del(&link->cset_link);
		kfree(link);
	}
}

/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link.  Returns 0 on success or -errno.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	BUG_ON(list_empty(tmp_links));

	if (cgroup_on_dfl(cgrp))
		cset->dfl_cgrp = cgrp;

	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;

	/*
	 * Always add links to the tail of the lists so that the lists are
	 * in chronological order.
	 */
	list_move_tail(&link->cset_link, &cgrp->cset_links);
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);

	if (cgroup_parent(cgrp))
		cgroup_get_live(cgrp);
}
958 959 960 961 962 963 964
/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
965
 */
966 967
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
968
{
969
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
970
	struct css_set *cset;
971 972
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
T
Tejun Heo 已提交
973
	struct cgroup_subsys *ss;
974
	unsigned long key;
T
Tejun Heo 已提交
975
	int ssid;
976

977 978
	lockdep_assert_held(&cgroup_mutex);

979 980
	/* First see if we already have a cgroup group that matches
	 * the desired set */
981
	spin_lock_irq(&css_set_lock);
982 983 984
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
985
	spin_unlock_irq(&css_set_lock);
986

987 988
	if (cset)
		return cset;
989

990
	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
991
	if (!cset)
992 993
		return NULL;

994
	/* Allocate all the cgrp_cset_link objects that we'll need */
995
	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
996
		kfree(cset);
997 998 999
		return NULL;
	}

1000
	refcount_set(&cset->refcount, 1);
1001
	INIT_LIST_HEAD(&cset->tasks);
T
Tejun Heo 已提交
1002
	INIT_LIST_HEAD(&cset->mg_tasks);
1003
	INIT_LIST_HEAD(&cset->task_iters);
1004
	INIT_HLIST_NODE(&cset->hlist);
T
Tejun Heo 已提交
1005 1006 1007
	INIT_LIST_HEAD(&cset->cgrp_links);
	INIT_LIST_HEAD(&cset->mg_preload_node);
	INIT_LIST_HEAD(&cset->mg_node);
1008 1009 1010

	/* Copy the set of subsystem state objects generated in
	 * find_existing_css_set() */
1011
	memcpy(cset->subsys, template, sizeof(cset->subsys));
1012

1013
	spin_lock_irq(&css_set_lock);
1014
	/* Add reference counts and links from the new css_set. */
1015
	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
1016
		struct cgroup *c = link->cgrp;
1017

1018 1019
		if (c->root == cgrp->root)
			c = cgrp;
1020
		link_css_set(&tmp_links, cset, c);
1021
	}
1022

1023
	BUG_ON(!list_empty(&tmp_links));
1024 1025

	css_set_count++;
1026

T
Tejun Heo 已提交
1027
	/* Add @cset to the hash table */
1028 1029
	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);
1030

1031 1032 1033
	for_each_subsys(ss, ssid) {
		struct cgroup_subsys_state *css = cset->subsys[ssid];

T
Tejun Heo 已提交
1034
		list_add_tail(&cset->e_cset_node[ssid],
1035 1036 1037
			      &css->cgroup->e_csets[ssid]);
		css_get(css);
	}
T
Tejun Heo 已提交
1038

1039
	spin_unlock_irq(&css_set_lock);
1040

1041
	return cset;
1042 1043
}

1044
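/* map a kernfs_root back to its cgroup_root via the root cgroup in kn->priv */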
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
	struct cgroup *root_cgrp = kf_root->kn->priv;

	return root_cgrp->root;
}

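/* allocate a hierarchy ID for @root; the caller must hold cgroup_mutex */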
static int cgroup_init_root_id(struct cgroup_root *root)
{
	int id;

	lockdep_assert_held(&cgroup_mutex);

	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
	if (id < 0)
		return id;

	root->hierarchy_id = id;
	return 0;
}

static void cgroup_exit_root_id(struct cgroup_root *root)
{
	lockdep_assert_held(&cgroup_mutex);

	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}

void cgroup_free_root(struct cgroup_root *root)
{
	if (root) {
		idr_destroy(&root->cgroup_idr);
		kfree(root);
	}
}

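/*
 * Tear down @root: rebind its controllers back to the default hierarchy,
 * drop the links from all css_sets to its root cgroup and release the
 * hierarchy ID.
 */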
static void cgroup_destroy_root(struct cgroup_root *root)
{
	struct cgroup *cgrp = &root->cgrp;
	struct cgrp_cset_link *link, *tmp_link;

	trace_cgroup_destroy_root(root);

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	BUG_ON(atomic_read(&root->nr_cgrps));
	BUG_ON(!list_empty(&cgrp->self.children));

	/* Rebind all subsystems back to the default hierarchy */
	WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));

	/*
	 * Release all the links from cset_links to this hierarchy's
	 * root cgroup
	 */
	spin_lock_irq(&css_set_lock);

	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		kfree(link);
	}

	spin_unlock_irq(&css_set_lock);

	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		cgroup_root_count--;
	}

	cgroup_exit_root_id(root);

	mutex_unlock(&cgroup_mutex);

	kernfs_destroy_root(root->kf_root);
	cgroup_free_root(root);
}

/*
 * look up cgroup associated with current task's cgroup namespace on the
 * specified hierarchy
 */
static struct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{
	struct cgroup *res = NULL;
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	rcu_read_lock();

	cset = current->nsproxy->cgroup_ns->root_cset;
	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}
	rcu_read_unlock();

	BUG_ON(!res);
	return res;
}

/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
					    struct cgroup_root *root)
{
	struct cgroup *res = NULL;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}

	BUG_ON(!res);
	return res;
}

/*
 * Return the cgroup for "task" from the given hierarchy. Must be
 * called with cgroup_mutex and css_set_lock held.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
{
	/*
	 * No need to lock the task - since we hold cgroup_mutex the
	 * task can't change groups, so the only thing that can happen
	 * is that it exits and its css is set back to init_css_set.
	 */
	return cset_cgroup_from_root(task_css_set(task), root);
}

/*
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing.  However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again.  Because a count of zero
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count).  So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero. Similarly, if
 * a task holds cgroup_mutex on a cgroup with zero count, it
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty.  Since all
 * tasks in the system use _some_ cgroup, and since there is always at
 * least one task in the system (init, pid == 1), therefore, the root
 * cgroup always has either children cgroups and/or using tasks.  So we
 * don't need a special hack to ensure that the root cgroup cannot be
 * deleted.
 *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task()
 */

static struct kernfs_syscall_ops cgroup_kf_syscall_ops;

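/*
 * Build the file name for @cft in @cgrp's directory: "<subsys>.<name>"
 * unless the cftype or the hierarchy requests no prefix.
 */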
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
			      char *buf)
{
	struct cgroup_subsys *ss = cft->ss;

	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
			 cft->name);
	else
		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
	return buf;
}

/**
 * cgroup_file_mode - deduce file mode of a control file
 * @cft: the control file in question
 *
 * S_IRUGO for read, S_IWUSR for write.
 */
static umode_t cgroup_file_mode(const struct cftype *cft)
{
	umode_t mode = 0;

	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
		mode |= S_IRUGO;

	if (cft->write_u64 || cft->write_s64 || cft->write) {
		if (cft->flags & CFTYPE_WORLD_WRITABLE)
			mode |= S_IWUGO;
		else
			mode |= S_IWUSR;
	}

	return mode;
}

/**
 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
 * @subtree_control: the new subtree_control mask to consider
 * @this_ss_mask: available subsystems
 *
 * On the default hierarchy, a subsystem may request other subsystems to be
 * enabled together through its ->depends_on mask.  In such cases, more
 * subsystems than specified in "cgroup.subtree_control" may be enabled.
 *
 * This function calculates which subsystems need to be enabled if
 * @subtree_control is to be applied while restricted to @this_ss_mask.
 */
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
	u16 cur_ss_mask = subtree_control;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	cur_ss_mask |= cgrp_dfl_implicit_ss_mask;

	while (true) {
		u16 new_ss_mask = cur_ss_mask;

		do_each_subsys_mask(ss, ssid, cur_ss_mask) {
			new_ss_mask |= ss->depends_on;
		} while_each_subsys_mask();

		/*
		 * Mask out subsystems which aren't available.  This can
		 * happen only if some depended-upon subsystems were bound
		 * to non-default hierarchies.
		 */
		new_ss_mask &= this_ss_mask;

		if (new_ss_mask == cur_ss_mask)
			break;
		cur_ss_mask = new_ss_mask;
	}

	return cur_ss_mask;
}

/**
 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 *
 * This helper undoes cgroup_kn_lock_live() and should be invoked before
 * the method finishes if locking succeeded.  Note that once this function
 * returns the cgroup returned by cgroup_kn_lock_live() may become
 * inaccessible any time.  If the caller intends to continue to access the
 * cgroup, it should pin it before invoking this function.
 */
void cgroup_kn_unlock(struct kernfs_node *kn)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	cgroup_put(cgrp);
}

/**
 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 * @drain_offline: perform offline draining on the cgroup
 *
 * This helper is to be used by a cgroup kernfs method currently servicing
 * @kn.  It breaks the active protection, performs cgroup locking and
 * verifies that the associated cgroup is alive.  Returns the cgroup if
 * alive; otherwise, %NULL.  A successful return should be undone by a
 * matching cgroup_kn_unlock() invocation.  If @drain_offline is %true, the
 * cgroup is drained of offlining csses before return.
 *
 * Any cgroup kernfs method implementation which requires locking the
 * associated cgroup should use this helper.  It avoids nesting cgroup
 * locking under kernfs active protection and allows all kernfs operations
 * including self-removal.
 */
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	/*
	 * We're gonna grab cgroup_mutex which nests outside kernfs
	 * active_ref.  cgroup liveliness check alone provides enough
	 * protection against removal.  Ensure @cgrp stays accessible and
	 * break the active_ref protection.
	 */
	if (!cgroup_tryget(cgrp))
		return NULL;
	kernfs_break_active_protection(kn);

	if (drain_offline)
		cgroup_lock_and_drain_offline(cgrp);
	else
		mutex_lock(&cgroup_mutex);

	if (!cgroup_is_dead(cgrp))
		return cgrp;

	cgroup_kn_unlock(kn);
	return NULL;
}

static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
	char name[CGROUP_FILE_NAME_MAX];

	lockdep_assert_held(&cgroup_mutex);

	if (cft->file_offset) {
		struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = NULL;
		spin_unlock_irq(&cgroup_file_kn_lock);
	}

	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}

/**
 * css_clear_dir - remove subsys files in a cgroup directory
 * @css: target css
 */
static void css_clear_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts;

	if (!(css->flags & CSS_VISIBLE))
		return;

	css->flags &= ~CSS_VISIBLE;

	list_for_each_entry(cfts, &css->ss->cfts, node)
		cgroup_addrm_files(css, cgrp, cfts, false);
}

/**
 * css_populate_dir - create subsys files in a cgroup directory
 * @css: target css
 *
 * On failure, no file is added.
 */
static int css_populate_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts, *failed_cfts;
	int ret;

	if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
		return 0;

	if (!css->ss) {
		if (cgroup_on_dfl(cgrp))
			cfts = cgroup_base_files;
		else
			cfts = cgroup1_base_files;

		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
	}

	list_for_each_entry(cfts, &css->ss->cfts, node) {
		ret = cgroup_addrm_files(css, cgrp, cfts, true);
		if (ret < 0) {
			failed_cfts = cfts;
			goto err;
		}
	}

	css->flags |= CSS_VISIBLE;

	return 0;
err:
	list_for_each_entry(cfts, &css->ss->cfts, node) {
		if (cfts == failed_cfts)
			break;
		cgroup_addrm_files(css, cgrp, cfts, false);
	}
	return ret;
}

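/*
 * Move the controllers in @ss_mask to @dst_root.  Fails with -EBUSY if a
 * controller still has non-root csses attached (implicit controllers are
 * exempt) or if the move would be between two non-default hierarchies.
 */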
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
	struct cgroup *dcgrp = &dst_root->cgrp;
	struct cgroup_subsys *ss;
	int ssid, i, ret;

	lockdep_assert_held(&cgroup_mutex);

	do_each_subsys_mask(ss, ssid, ss_mask) {
		/*
		 * If @ss has non-root csses attached to it, can't move.
		 * If @ss is an implicit controller, it is exempt from this
		 * rule and can be stolen.
		 */
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
		    !ss->implicit_on_dfl)
			return -EBUSY;

		/* can't move between two non-dummy roots either */
		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
			return -EBUSY;
	} while_each_subsys_mask();

	do_each_subsys_mask(ss, ssid, ss_mask) {
		struct cgroup_root *src_root = ss->root;
		struct cgroup *scgrp = &src_root->cgrp;
		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
		struct css_set *cset;

		WARN_ON(!css || cgroup_css(dcgrp, ss));

		/* disable from the source */
		src_root->subsys_mask &= ~(1 << ssid);
		WARN_ON(cgroup_apply_control(scgrp));
		cgroup_finalize_control(scgrp, 0);

		/* rebind */
		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
		rcu_assign_pointer(dcgrp->subsys[ssid], css);
		ss->root = dst_root;
		css->cgroup = dcgrp;

		spin_lock_irq(&css_set_lock);
		hash_for_each(css_set_table, i, cset, hlist)
			list_move_tail(&cset->e_cset_node[ss->id],
				       &dcgrp->e_csets[ss->id]);
		spin_unlock_irq(&css_set_lock);

		/* default hierarchy doesn't enable controllers by default */
		dst_root->subsys_mask |= 1 << ssid;
		if (dst_root == &cgrp_dfl_root) {
			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
		} else {
			dcgrp->subtree_control |= 1 << ssid;
			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
		}

		ret = cgroup_apply_control(dcgrp);
		if (ret)
			pr_warn("partial failure to rebind %s controller (err=%d)\n",
				ss->name, ret);

		if (ss->bind)
			ss->bind(css);
	} while_each_subsys_mask();

	kernfs_activate(dcgrp->kn);
	return 0;
}

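/*
 * ->show_path() callback: print the path of @kf_node relative to the root
 * cgroup of the current task's cgroup namespace.
 */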
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
		     struct kernfs_root *kf_root)
{
	int len = 0;
	char *buf = NULL;
	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
	struct cgroup *ns_cgroup;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	spin_lock_irq(&css_set_lock);
	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
	spin_unlock_irq(&css_set_lock);

	if (len >= PATH_MAX)
		len = -ERANGE;
	else if (len > 0) {
		seq_escape(sf, buf, " \t\n\\");
		len = 0;
	}
	kfree(buf);
	return len;
}

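/* parse cgroup2 mount options; "nsdelegate" is the only recognized option */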
static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
{
	char *token;

	*root_flags = 0;

	if (!data)
		return 0;

	while ((token = strsep(&data, ",")) != NULL) {
		if (!strcmp(token, "nsdelegate")) {
			*root_flags |= CGRP_ROOT_NS_DELEGATE;
			continue;
		}

		pr_err("cgroup2: unknown option \"%s\"\n", token);
		return -EINVAL;
	}

	return 0;
}

static void apply_cgroup_root_flags(unsigned int root_flags)
{
	if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
		if (root_flags & CGRP_ROOT_NS_DELEGATE)
			cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
		else
			cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
	}
}

static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
	if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
		seq_puts(seq, ",nsdelegate");
	return 0;
}

static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
	unsigned int root_flags;
	int ret;

	ret = parse_cgroup_root_flags(data, &root_flags);
	if (ret)
		return ret;

	apply_cgroup_root_flags(root_flags);
	return 0;
}

/*
 * To reduce the fork() overhead for systems that are not actually using
 * their cgroups capability, we don't maintain the lists running through
 * each css_set to its tasks until we see the list actually used - in other
 * words after the first mount.
 */
static bool use_task_css_set_links __read_mostly;

static void cgroup_enable_task_cg_lists(void)
{
	struct task_struct *p, *g;

	spin_lock_irq(&css_set_lock);

	if (use_task_css_set_links)
		goto out_unlock;

	use_task_css_set_links = true;

	/*
	 * We need tasklist_lock because RCU is not safe against
	 * while_each_thread(). Besides, a forking task that has passed
	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
	 * is not guaranteed to have its child immediately visible in the
	 * tasklist if we walk through it with RCU.
	 */
	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		WARN_ON_ONCE(!list_empty(&p->cg_list) ||
			     task_css_set(p) != &init_css_set);

		/*
		 * We should check if the process is exiting, otherwise
		 * it will race with cgroup_exit() in that the list
		 * entry won't be deleted though the process has exited.
		 * Do it while holding siglock so that we don't end up
		 * racing against cgroup_exit().
		 *
		 * Interrupts were already disabled while acquiring
		 * the css_set_lock, so we do not need to disable it
		 * again when acquiring the sighand->siglock here.
		 */
		spin_lock(&p->sighand->siglock);
		if (!(p->flags & PF_EXITING)) {
			struct css_set *cset = task_css_set(p);

			if (!css_set_populated(cset))
				css_set_update_populated(cset, true);
			list_add_tail(&p->cg_list, &cset->tasks);
			get_css_set(cset);
			cset->nr_tasks++;
		}
		spin_unlock(&p->sighand->siglock);
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);
out_unlock:
	spin_unlock_irq(&css_set_lock);
}

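/* initialize the lists, locks and self css of a newly allocated cgroup */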
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
T
Tejun Heo 已提交
1673 1674 1675
	struct cgroup_subsys *ss;
	int ssid;

1676 1677
	INIT_LIST_HEAD(&cgrp->self.sibling);
	INIT_LIST_HEAD(&cgrp->self.children);
1678
	INIT_LIST_HEAD(&cgrp->cset_links);
1679 1680
	INIT_LIST_HEAD(&cgrp->pidlists);
	mutex_init(&cgrp->pidlist_mutex);
1681
	cgrp->self.cgroup = cgrp;
1682
	cgrp->self.flags |= CSS_ONLINE;
T
Tejun Heo 已提交
1683 1684 1685

	for_each_subsys(ss, ssid)
		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1686 1687

	init_waitqueue_head(&cgrp->offline_waitq);
	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}

void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
{
	struct cgroup *cgrp = &root->cgrp;

	INIT_LIST_HEAD(&root->root_list);
	atomic_set(&root->nr_cgrps, 1);
	cgrp->root = root;
	init_cgroup_housekeeping(cgrp);
	idr_init(&root->cgroup_idr);

	root->flags = opts->flags;
	if (opts->release_agent)
		strcpy(root->release_agent_path, opts->release_agent);
	if (opts->name)
		strcpy(root->name, opts->name);
	if (opts->cpuset_clone_children)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}

int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct kernfs_syscall_ops *kf_sops;
	struct css_set *cset;
	int i, ret;

	lockdep_assert_held(&cgroup_mutex);

	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
	if (ret < 0)
		goto out;
	root_cgrp->id = ret;
	root_cgrp->ancestor_ids[0] = ret;

	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
			      ref_flags, GFP_KERNEL);
	if (ret)
		goto out;

	/*
	 * We're accessing css_set_count without locking css_set_lock here,
	 * but that's OK - it can only be increased by someone holding
	 * cgroup_lock, and that's us.  Later rebinding may disable
	 * controllers on the default hierarchy and thus create new csets,
	 * which can't be more than the existing ones.  Allocate 2x.
	 */
	ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
	if (ret)
		goto cancel_ref;

	ret = cgroup_init_root_id(root);
	if (ret)
		goto cancel_ref;

	kf_sops = root == &cgrp_dfl_root ?
		&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

	root->kf_root = kernfs_create_root(kf_sops,
					   KERNFS_ROOT_CREATE_DEACTIVATED,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;

	ret = css_populate_dir(&root_cgrp->self);
	if (ret)
		goto destroy_root;

	ret = rebind_subsystems(root, ss_mask);
	if (ret)
		goto destroy_root;

	trace_cgroup_setup_root(root);

	/*
	 * There must be no failure case after here, since rebinding takes
	 * care of subsystems' refcounts, which are explicitly dropped in
	 * the failure exit path.
	 */
	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;

	/*
	 * Link the root cgroup in this hierarchy into all the css_set
	 * objects.
	 */
	spin_lock_irq(&css_set_lock);
	hash_for_each(css_set_table, i, cset, hlist) {
		link_css_set(&tmp_links, cset, root_cgrp);
		if (css_set_populated(cset))
			cgroup_update_populated(root_cgrp, true);
	}
	spin_unlock_irq(&css_set_lock);

	BUG_ON(!list_empty(&root_cgrp->self.children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);

	kernfs_activate(root_cgrp->kn);
	ret = 0;
	goto out;

destroy_root:
	kernfs_destroy_root(root->kf_root);
	root->kf_root = NULL;
exit_root_id:
	cgroup_exit_root_id(root);
cancel_ref:
	percpu_ref_exit(&root_cgrp->self.refcnt);
out:
	free_cgrp_cset_links(&tmp_links);
	return ret;
}

struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
			       struct cgroup_root *root, unsigned long magic,
			       struct cgroup_namespace *ns)
{
	struct dentry *dentry;
	bool new_sb;

	dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);

	/*
	 * In non-init cgroup namespace, instead of root cgroup's dentry,
	 * we return the dentry corresponding to the cgroupns->root_cgrp.
	 */
	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
		struct dentry *nsdentry;
		struct cgroup *cgrp;

		mutex_lock(&cgroup_mutex);
		spin_lock_irq(&css_set_lock);

		cgrp = cset_cgroup_from_root(ns->root_cset, root);

		spin_unlock_irq(&css_set_lock);
		mutex_unlock(&cgroup_mutex);

		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
		dput(dentry);
		dentry = nsdentry;
	}

	if (IS_ERR(dentry) || !new_sb)
		cgroup_put(&root->cgrp);

	return dentry;
}

static struct dentry *cgroup_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data)
{
	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
	struct dentry *dentry;
	int ret;

	get_cgroup_ns(ns);

	/* Check if the caller has permission to mount. */
	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
		put_cgroup_ns(ns);
		return ERR_PTR(-EPERM);
	}

	/*
	 * The first time anyone tries to mount a cgroup, enable the list
	 * linking each css_set to its tasks and fix up all existing tasks.
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();

	if (fs_type == &cgroup2_fs_type) {
		unsigned int root_flags;

		ret = parse_cgroup_root_flags(data, &root_flags);
		if (ret) {
			put_cgroup_ns(ns);
			return ERR_PTR(ret);
		}

		cgrp_dfl_visible = true;
		cgroup_get_live(&cgrp_dfl_root.cgrp);

		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
					 CGROUP2_SUPER_MAGIC, ns);
		if (!IS_ERR(dentry))
			apply_cgroup_root_flags(root_flags);
	} else {
		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
				       CGROUP_SUPER_MAGIC, ns);
	}

	put_cgroup_ns(ns);
	return dentry;
}

static void cgroup_kill_sb(struct super_block *sb)
{
	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);

	/*
	 * If @root doesn't have any mounts or children, start killing it.
	 * This prevents new mounts by disabling percpu_ref_tryget_live().
	 * cgroup_mount() may wait for @root's release.
	 *
	 * And don't kill the default root.
	 */
	if (!list_empty(&root->cgrp.self.children) ||
	    root == &cgrp_dfl_root)
		cgroup_put(&root->cgrp);
	else
		percpu_ref_kill(&root->cgrp.self.refcnt);

	kernfs_kill_sb(sb);
}

struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
	.fs_flags = FS_USERNS_MOUNT,
};

static struct file_system_type cgroup2_fs_type = {
	.name = "cgroup2",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
	.fs_flags = FS_USERNS_MOUNT,
};

int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
			  struct cgroup_namespace *ns)
{
	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);

	return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
}

int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
		   struct cgroup_namespace *ns)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);

	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);

/**
 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
 * @task: target task
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Determine @task's cgroup on the first (the one with the lowest non-zero
 * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
 * function grabs cgroup_mutex and shouldn't be used inside locks used by
 * cgroup controller callbacks.
 *
 * Return value is the same as kernfs_path().
 */
int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
	struct cgroup_root *root;
	struct cgroup *cgrp;
	int hierarchy_id = 1;
	int ret;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);

	if (root) {
		cgrp = task_cgroup_from_root(task, root);
		ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
	} else {
		/* if no hierarchy exists, everyone is in "/" */
		ret = strlcpy(buf, "/", buflen);
	}

	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);

/**
 * cgroup_migrate_add_task - add a migration target task to a migration context
 * @task: target task
 * @mgctx: target migration context
 *
 * Add @task, which is a migration target, to @mgctx->tset.  This function
 * becomes a noop if @task doesn't need to be migrated.  @task's css_set
 * should have been added as a migration source and @task->cg_list will be
 * moved from the css_set's tasks list to the mg_tasks one.
 */
static void cgroup_migrate_add_task(struct task_struct *task,
				    struct cgroup_mgctx *mgctx)
{
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	/* @task either already exited or can't exit until the end */
	if (task->flags & PF_EXITING)
		return;

	/* leave @task alone if post_fork() hasn't linked it yet */
	if (list_empty(&task->cg_list))
		return;

	cset = task_css_set(task);
	if (!cset->mg_src_cgrp)
		return;

	list_move_tail(&task->cg_list, &cset->mg_tasks);
	if (list_empty(&cset->mg_node))
		list_add_tail(&cset->mg_node,
			      &mgctx->tset.src_csets);
	if (list_empty(&cset->mg_dst_cset->mg_node))
		list_add_tail(&cset->mg_dst_cset->mg_node,
			      &mgctx->tset.dst_csets);
}

/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * @tset iteration is initialized and the first task is returned.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
					 struct cgroup_subsys_state **dst_cssp)
{
	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
	tset->cur_task = NULL;

	return cgroup_taskset_next(tset, dst_cssp);
}

/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 */
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
					struct cgroup_subsys_state **dst_cssp)
{
	struct css_set *cset = tset->cur_cset;
	struct task_struct *task = tset->cur_task;

	while (&cset->mg_node != tset->csets) {
		if (!task)
			task = list_first_entry(&cset->mg_tasks,
						struct task_struct, cg_list);
		else
			task = list_next_entry(task, cg_list);

		if (&task->cg_list != &cset->mg_tasks) {
			tset->cur_cset = cset;
			tset->cur_task = task;

			/*
			 * This function may be called both before and
			 * after cgroup_taskset_migrate().  The two cases
			 * can be distinguished by looking at whether @cset
			 * has its ->mg_dst_cset set.
			 */
			if (cset->mg_dst_cset)
				*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
			else
				*dst_cssp = cset->subsys[tset->ssid];

			return task;
		}

		cset = list_next_entry(cset, mg_node);
		task = NULL;
	}

	return NULL;
}
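
/*
 * Illustrative sketch (not part of this file): a controller's ->can_attach()
 * or ->attach() callback usually walks the taskset with the
 * cgroup_taskset_for_each() helper, which is built on top of
 * cgroup_taskset_first()/cgroup_taskset_next() above.  The callback and the
 * example_task_is_allowed() check below are hypothetical names.
 *
 *	static int example_can_attach(struct cgroup_taskset *tset)
 *	{
 *		struct task_struct *task;
 *		struct cgroup_subsys_state *dst_css;
 *
 *		cgroup_taskset_for_each(task, dst_css, tset) {
 *			if (!example_task_is_allowed(task, dst_css))
 *				return -EINVAL;
 *		}
 *		return 0;
 *	}
 */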

/**
 * cgroup_migrate_execute - migrate a taskset
 * @mgctx: migration context
 *
 * Migrate tasks in @mgctx as set up by the migration preparation functions.
 * This function fails iff one of the ->can_attach callbacks fails and
 * guarantees that either all or none of the tasks in @mgctx are migrated.
 * @mgctx is consumed regardless of success.
 */
static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
	struct cgroup_taskset *tset = &mgctx->tset;
	struct cgroup_subsys *ss;
	struct task_struct *task, *tmp_task;
	struct css_set *cset, *tmp_cset;
	int ssid, failed_ssid, ret;

	/* methods shouldn't be called if no task is actually migrating */
	if (list_empty(&tset->src_csets))
		return 0;

	/* check that we can legitimately attach to the cgroup */
	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
		if (ss->can_attach) {
			tset->ssid = ssid;
			ret = ss->can_attach(tset);
			if (ret) {
				failed_ssid = ssid;
				goto out_cancel_attach;
			}
		}
	} while_each_subsys_mask();

	/*
	 * Now that we're guaranteed success, proceed to move all tasks to
	 * the new cgroup.  There are no failure cases after here, so this
	 * is the commit point.
	 */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(cset, &tset->src_csets, mg_node) {
		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
			struct css_set *from_cset = task_css_set(task);
			struct css_set *to_cset = cset->mg_dst_cset;

			get_css_set(to_cset);
			to_cset->nr_tasks++;
			css_set_move_task(task, from_cset, to_cset, true);
			put_css_set_locked(from_cset);
			from_cset->nr_tasks--;
		}
	}
	spin_unlock_irq(&css_set_lock);

	/*
	 * Migration is committed, all target tasks are now on dst_csets.
	 * Nothing is sensitive to fork() after this point.  Notify
	 * controllers that migration is complete.
	 */
	tset->csets = &tset->dst_csets;

	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
		if (ss->attach) {
			tset->ssid = ssid;
			ss->attach(tset);
		}
	} while_each_subsys_mask();

	ret = 0;
	goto out_release_tset;

out_cancel_attach:
	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
		if (ssid == failed_ssid)
			break;
		if (ss->cancel_attach) {
			tset->ssid = ssid;
			ss->cancel_attach(tset);
		}
	} while_each_subsys_mask();
out_release_tset:
	spin_lock_irq(&css_set_lock);
	list_splice_init(&tset->dst_csets, &tset->src_csets);
	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
		list_del_init(&cset->mg_node);
	}
	spin_unlock_irq(&css_set_lock);
	return ret;
}

/**
 * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
 * @dst_cgrp: destination cgroup to test
 *
 * On the default hierarchy, except for the root, subtree_control must be
 * zero for migration destination cgroups with tasks so that child cgroups
 * don't compete against tasks.
 */
bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
{
	return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
		!dst_cgrp->subtree_control;
}

/**
 * cgroup_migrate_finish - cleanup after attach
 * @mgctx: migration context
 *
 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
 * those functions for details.
 */
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
	LIST_HEAD(preloaded);
	struct css_set *cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	spin_lock_irq(&css_set_lock);

	list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
	list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);

	list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
		cset->mg_src_cgrp = NULL;
		cset->mg_dst_cgrp = NULL;
		cset->mg_dst_cset = NULL;
		list_del_init(&cset->mg_preload_node);
		put_css_set_locked(cset);
	}

	spin_unlock_irq(&css_set_lock);
}

/**
 * cgroup_migrate_add_src - add a migration source css_set
 * @src_cset: the source css_set to add
 * @dst_cgrp: the destination cgroup
 * @mgctx: migration context
 *
 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
 * @src_cset and add it to @mgctx->src_csets, which should later be cleaned
 * up by cgroup_migrate_finish().
 *
 * This function may be called without holding cgroup_threadgroup_rwsem
 * even if the target is a process.  Threads may be created and destroyed
 * but as long as cgroup_mutex is not dropped, no new css_set can be put
 * into play and the preloaded css_sets are guaranteed to cover all
 * migrations.
 */
void cgroup_migrate_add_src(struct css_set *src_cset,
			    struct cgroup *dst_cgrp,
			    struct cgroup_mgctx *mgctx)
{
	struct cgroup *src_cgrp;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	/*
	 * If ->dead, @src_cset is associated with one or more dead cgroups
	 * and doesn't contain any migratable tasks.  Ignore it early so
	 * that the rest of migration path doesn't get confused by it.
	 */
	if (src_cset->dead)
		return;

	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

	if (!list_empty(&src_cset->mg_preload_node))
		return;

	WARN_ON(src_cset->mg_src_cgrp);
	WARN_ON(src_cset->mg_dst_cgrp);
	WARN_ON(!list_empty(&src_cset->mg_tasks));
	WARN_ON(!list_empty(&src_cset->mg_node));

	src_cset->mg_src_cgrp = src_cgrp;
	src_cset->mg_dst_cgrp = dst_cgrp;
	get_css_set(src_cset);
	list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
}

/**
 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
 * @mgctx: migration context
 *
 * Tasks are about to be moved and all the source css_sets have been
 * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
 * pins all destination css_sets, links each to its source, and appends them
 * to @mgctx->preloaded_dst_csets.
 *
 * This function must be called after cgroup_migrate_add_src() has been
 * called on each migration source css_set.  After migration is performed
 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
 * @mgctx.
 */
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
	struct css_set *src_cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	/* look up the dst cset for each src cset and link it to src */
	list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
				 mg_preload_node) {
		struct css_set *dst_cset;
		struct cgroup_subsys *ss;
		int ssid;

		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
		if (!dst_cset)
			goto err;

		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);

		/*
		 * If src cset equals dst, it's noop.  Drop the src.
		 * cgroup_migrate() will skip the cset too.  Note that we
		 * can't handle src == dst as some nodes are used by both.
		 */
		if (src_cset == dst_cset) {
			src_cset->mg_src_cgrp = NULL;
			src_cset->mg_dst_cgrp = NULL;
			list_del_init(&src_cset->mg_preload_node);
			put_css_set(src_cset);
			put_css_set(dst_cset);
			continue;
		}

		src_cset->mg_dst_cset = dst_cset;

		if (list_empty(&dst_cset->mg_preload_node))
			list_add_tail(&dst_cset->mg_preload_node,
				      &mgctx->preloaded_dst_csets);
		else
			put_css_set(dst_cset);

		for_each_subsys(ss, ssid)
			if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
				mgctx->ss_mask |= 1 << ssid;
	}

	return 0;
err:
	cgroup_migrate_finish(mgctx);
	return -ENOMEM;
}

/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @mgctx: migration context
 *
 * Migrate a process or task denoted by @leader.  If migrating a process,
 * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
 * responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking group_migrate_prepare_dst() before
 * actually starting migrating.
 */
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
		   struct cgroup_mgctx *mgctx)
{
	struct task_struct *task;

	/*
	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
	 * already PF_EXITING could be freed from underneath us unless we
	 * take an rcu_read_lock.
	 */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_task(task, mgctx);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	return cgroup_migrate_execute(mgctx);
}

/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @dst_cgrp: the cgroup to attach to
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
 */
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
		       bool threadgroup)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct task_struct *task;
	int ret;

	if (!cgroup_may_migrate_to(dst_cgrp))
		return -EBUSY;

	/* look up all src csets */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	/* prepare dst csets and commit */
	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (!ret)
		ret = cgroup_migrate(leader, threadgroup, &mgctx);

	cgroup_migrate_finish(&mgctx);

	if (!ret)
		trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);

	return ret;
}

struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
	__acquires(&cgroup_threadgroup_rwsem)
{
	struct task_struct *tsk;
	pid_t pid;

	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
		return ERR_PTR(-EINVAL);

	percpu_down_write(&cgroup_threadgroup_rwsem);

	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			tsk = ERR_PTR(-ESRCH);
			goto out_unlock_threadgroup;
		}
	} else {
		tsk = current;
	}

	if (threadgroup)
		tsk = tsk->group_leader;

	/*
	 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
	 * If userland migrates such a kthread to a non-root cgroup, it can
	 * become trapped in a cpuset, or RT kthread may be born in a
	 * cgroup with no rt_runtime allocated.  Just say no.
	 */
	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
		tsk = ERR_PTR(-EINVAL);
		goto out_unlock_threadgroup;
	}

	get_task_struct(tsk);
	goto out_unlock_rcu;

out_unlock_threadgroup:
	percpu_up_write(&cgroup_threadgroup_rwsem);
out_unlock_rcu:
	rcu_read_unlock();
	return tsk;
}

void cgroup_procs_write_finish(struct task_struct *task)
	__releases(&cgroup_threadgroup_rwsem)
{
	struct cgroup_subsys *ss;
	int ssid;

	/* release reference from cgroup_procs_write_start() */
	put_task_struct(task);

	percpu_up_write(&cgroup_threadgroup_rwsem);
	for_each_subsys(ss, ssid)
		if (ss->post_attach)
			ss->post_attach();
}
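
/*
 * Illustrative sketch (not kernel code): a "cgroup.procs"-style write handler
 * is expected to pair the two helpers above around the actual migration,
 * roughly as follows.  "dst_cgrp", "buf" and "ret" are placeholders, and the
 * surrounding cgroup_kn_lock_live()/cgroup_kn_unlock() and error handling are
 * omitted.
 *
 *	task = cgroup_procs_write_start(buf, true);
 *	if (!IS_ERR(task)) {
 *		ret = cgroup_attach_task(dst_cgrp, task, true);
 *		cgroup_procs_write_finish(task);
 *	}
 */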

static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
	struct cgroup_subsys *ss;
	bool printed = false;
	int ssid;

	do_each_subsys_mask(ss, ssid, ss_mask) {
		if (printed)
			seq_putc(seq, ' ');
		seq_printf(seq, "%s", ss->name);
		printed = true;
	} while_each_subsys_mask();
	if (printed)
		seq_putc(seq, '\n');
}

/* show controllers which are enabled from the parent */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	cgroup_print_ss_mask(seq, cgroup_control(cgrp));
	return 0;
}

/* show controllers which are enabled for a given cgroup's children */
static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	cgroup_print_ss_mask(seq, cgrp->subtree_control);
	return 0;
}

/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
 * @cgrp's control masks have changed and its subtree's css associations
 * need to be updated accordingly.  This function looks up all css_sets
 * which are attached to the subtree, creates the matching updated css_sets
 * and migrates the tasks to the new ones.
 */
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct cgroup_subsys_state *d_css;
	struct cgroup *dsct;
	struct css_set *src_cset;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	percpu_down_write(&cgroup_threadgroup_rwsem);

	/* look up all csses currently attached to @cgrp's subtree */
	spin_lock_irq(&css_set_lock);
	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &dsct->cset_links, cset_link)
			cgroup_migrate_add_src(link->cset, dsct, &mgctx);
	}
	spin_unlock_irq(&css_set_lock);

	/* NULL dst indicates self on default hierarchy */
	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (ret)
		goto out_finish;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
		struct task_struct *task, *ntask;

		/* all tasks in src_csets need to be migrated */
		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
			cgroup_migrate_add_task(task, &mgctx);
	}
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_execute(&mgctx);
out_finish:
	cgroup_migrate_finish(&mgctx);
	percpu_up_write(&cgroup_threadgroup_rwsem);
	return ret;
}

/**
 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
 * @cgrp: root of the target subtree
 *
 * Because css offlining is asynchronous, userland may try to re-enable a
 * controller while the previous css is still around.  This function grabs
 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
 */
void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
	__acquires(&cgroup_mutex)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid;

restart:
	mutex_lock(&cgroup_mutex);

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
			DEFINE_WAIT(wait);

			if (!css || !percpu_ref_is_dying(&css->refcnt))
				continue;

			cgroup_get_live(dsct);
			prepare_to_wait(&dsct->offline_waitq, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&cgroup_mutex);
			schedule();
			finish_wait(&dsct->offline_waitq, &wait);

			cgroup_put(dsct);
			goto restart;
		}
	}
}

/**
 * cgroup_save_control - save control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * Save ->subtree_control and ->subtree_ss_mask to the respective old_
 * prefixed fields for @cgrp's subtree including @cgrp itself.
 */
static void cgroup_save_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		dsct->old_subtree_control = dsct->subtree_control;
		dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
	}
}

/**
 * cgroup_propagate_control - refresh control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
 * ->subtree_control and propagate controller availability through the
 * subtree so that descendants don't have unavailable controllers enabled.
 */
static void cgroup_propagate_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		dsct->subtree_control &= cgroup_control(dsct);
		dsct->subtree_ss_mask =
			cgroup_calc_subtree_ss_mask(dsct->subtree_control,
						    cgroup_ss_mask(dsct));
	}
}

/**
 * cgroup_restore_control - restore control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
 * prefixed fields for @cgrp's subtree including @cgrp itself.
 */
static void cgroup_restore_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		dsct->subtree_control = dsct->old_subtree_control;
		dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
	}
}

static bool css_visible(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;
	struct cgroup *cgrp = css->cgroup;

	if (cgroup_control(cgrp) & (1 << ss->id))
		return true;
	if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
		return false;
	return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
}

/**
 * cgroup_apply_control_enable - enable or show csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and create new csses or make the existing ones
 * visible.  A css is created invisible if it's being implicitly enabled
 * through dependency.  An invisible css is made visible when the userland
 * explicitly enables it.
 *
 * Returns 0 on success, -errno on failure.  On failure, csses which have
 * been processed already aren't cleaned up.  The caller is responsible for
 * cleaning up with cgroup_apply_control_disable().
 */
static int cgroup_apply_control_enable(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid, ret;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));

			if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
				continue;

			if (!css) {
				css = css_create(dsct, ss);
				if (IS_ERR(css))
					return PTR_ERR(css);
			}

			if (css_visible(css)) {
				ret = css_populate_dir(css);
				if (ret)
					return ret;
			}
		}
	}

	return 0;
}

/**
 * cgroup_apply_control_disable - kill or hide csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and kill and hide csses so that they match
 * cgroup_ss_mask() and cgroup_visible_mask().
 *
 * A css is hidden when the userland requests it to be disabled while other
 * subsystems are still depending on it.  The css must not actively control
 * resources and be in the vanilla state if it's made visible again later.
 * Controllers which may be depended upon should provide ->css_reset() for
 * this purpose.
 */
static void cgroup_apply_control_disable(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid;

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));

			if (!css)
				continue;

			if (css->parent &&
			    !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
				kill_css(css);
			} else if (!css_visible(css)) {
				css_clear_dir(css);
				if (ss->css_reset)
					ss->css_reset(css);
			}
		}
	}
}

/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * subsystems can be enabled and disabled in a subtree using the following
 * steps.
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This function implements step 3 and propagates the mask changes
 * throughout @cgrp's subtree, updates csses accordingly and performs
 * process migrations.
 */
static int cgroup_apply_control(struct cgroup *cgrp)
{
	int ret;

	cgroup_propagate_control(cgrp);

	ret = cgroup_apply_control_enable(cgrp);
	if (ret)
		return ret;

	/*
	 * At this point, cgroup_e_css() results reflect the new csses
	 * making the following cgroup_update_dfl_csses() properly update
	 * css associations of all tasks in the subtree.
	 */
	ret = cgroup_update_dfl_csses(cgrp);
	if (ret)
		return ret;

	return 0;
}

/**
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * Finalize control mask update.  See cgroup_apply_control() for more info.
 */
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
	if (ret) {
		cgroup_restore_control(cgrp);
		cgroup_propagate_control(cgrp);
	}

	cgroup_apply_control_disable(cgrp);
}
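
/*
 * Illustrative sketch: the save/apply/finalize helpers above are meant to
 * bracket a subtree_control update, following steps 1-5 listed in the
 * cgroup_apply_control() comment; cgroup_subtree_control_write() below is
 * the real user.  Assumes the caller already holds cgroup_mutex (e.g. via
 * cgroup_kn_lock_live() or cgroup_lock_and_drain_offline()); "enable" and
 * "disable" are placeholder masks.
 *
 *	cgroup_save_control(cgrp);
 *	cgrp->subtree_control |= enable;
 *	cgrp->subtree_control &= ~disable;
 *	ret = cgroup_apply_control(cgrp);
 *	cgroup_finalize_control(cgrp, ret);
 */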

/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	u16 enable = 0, disable = 0;
	struct cgroup *cgrp, *child;
	struct cgroup_subsys *ss;
	char *tok;
	int ssid, ret;

	/*
	 * Parse input - space separated list of subsystem names prefixed
	 * with either + or -.
	 */
	buf = strstrip(buf);
	while ((tok = strsep(&buf, " "))) {
		if (tok[0] == '\0')
			continue;
		do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
			if (!cgroup_ssid_enabled(ssid) ||
			    strcmp(tok + 1, ss->name))
				continue;

			if (*tok == '+') {
				enable |= 1 << ssid;
				disable &= ~(1 << ssid);
			} else if (*tok == '-') {
				disable |= 1 << ssid;
				enable &= ~(1 << ssid);
			} else {
				return -EINVAL;
			}
			break;
		} while_each_subsys_mask();
		if (ssid == CGROUP_SUBSYS_COUNT)
			return -EINVAL;
	}

	cgrp = cgroup_kn_lock_live(of->kn, true);
	if (!cgrp)
		return -ENODEV;

	for_each_subsys(ss, ssid) {
		if (enable & (1 << ssid)) {
			if (cgrp->subtree_control & (1 << ssid)) {
				enable &= ~(1 << ssid);
				continue;
			}

			if (!(cgroup_control(cgrp) & (1 << ssid))) {
				ret = -ENOENT;
				goto out_unlock;
			}
		} else if (disable & (1 << ssid)) {
			if (!(cgrp->subtree_control & (1 << ssid))) {
				disable &= ~(1 << ssid);
				continue;
			}

			/* a child has it enabled? */
			cgroup_for_each_live_child(child, cgrp) {
				if (child->subtree_control & (1 << ssid)) {
					ret = -EBUSY;
					goto out_unlock;
				}
			}
		}
	}

	if (!enable && !disable) {
		ret = 0;
		goto out_unlock;
	}

	/*
	 * Except for the root, subtree_control must be zero for a cgroup
	 * with tasks so that child cgroups don't compete against tasks.
	 */
	if (enable && cgroup_parent(cgrp) && cgroup_has_tasks(cgrp)) {
		ret = -EBUSY;
		goto out_unlock;
	}

	/* save and update control masks and prepare csses */
	cgroup_save_control(cgrp);

	cgrp->subtree_control |= enable;
	cgrp->subtree_control &= ~disable;

	ret = cgroup_apply_control(cgrp);

	cgroup_finalize_control(cgrp, ret);

	kernfs_activate(cgrp->kn);
	ret = 0;
out_unlock:
	cgroup_kn_unlock(of->kn);
	return ret ?: nbytes;
}

static int cgroup_events_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "populated %d\n",
		   cgroup_is_populated(seq_css(seq)->cgroup));
	return 0;
}

static int cgroup_file_open(struct kernfs_open_file *of)
{
	struct cftype *cft = of->kn->priv;

	if (cft->open)
		return cft->open(of);
	return 0;
}

static void cgroup_file_release(struct kernfs_open_file *of)
{
	struct cftype *cft = of->kn->priv;

	if (cft->release)
		cft->release(of);
}

static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
{
	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of->kn->priv;
	struct cgroup_subsys_state *css;
	int ret;

	/*
	 * If namespaces are delegation boundaries, disallow writes to
	 * files in a non-init namespace root from inside the namespace
	 * except for the files explicitly marked delegatable -
	 * cgroup.procs and cgroup.subtree_control.
	 */
	if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
	    !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
	    ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
		return -EPERM;

	if (cft->write)
		return cft->write(of, buf, nbytes, off);

	/*
	 * kernfs guarantees that a file isn't deleted with operations in
	 * flight, which means that the matching css is and stays alive and
	 * doesn't need to be pinned.  The RCU locking is not necessary
	 * either.  It's just for the convenience of using cgroup_css().
	 */
	rcu_read_lock();
	css = cgroup_css(cgrp, cft->ss);
	rcu_read_unlock();

	if (cft->write_u64) {
		unsigned long long v;
		ret = kstrtoull(buf, 0, &v);
		if (!ret)
			ret = cft->write_u64(css, cft, v);
	} else if (cft->write_s64) {
		long long v;
		ret = kstrtoll(buf, 0, &v);
		if (!ret)
			ret = cft->write_s64(css, cft, v);
	} else {
		ret = -EINVAL;
	}

	return ret ?: nbytes;
}

static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
	return seq_cft(seq)->seq_start(seq, ppos);
}

static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
	return seq_cft(seq)->seq_next(seq, v, ppos);
}

static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
	if (seq_cft(seq)->seq_stop)
		seq_cft(seq)->seq_stop(seq, v);
}

static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
	struct cftype *cft = seq_cft(m);
	struct cgroup_subsys_state *css = seq_css(m);

	if (cft->seq_show)
		return cft->seq_show(m, arg);

	if (cft->read_u64)
		seq_printf(m, "%llu\n", cft->read_u64(css, cft));
	else if (cft->read_s64)
		seq_printf(m, "%lld\n", cft->read_s64(css, cft));
	else
		return -EINVAL;
	return 0;
}

static struct kernfs_ops cgroup_kf_single_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.open			= cgroup_file_open,
	.release		= cgroup_file_release,
	.write			= cgroup_file_write,
	.seq_show		= cgroup_seqfile_show,
};

static struct kernfs_ops cgroup_kf_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.open			= cgroup_file_open,
	.release		= cgroup_file_release,
	.write			= cgroup_file_write,
	.seq_start		= cgroup_seqfile_start,
	.seq_next		= cgroup_seqfile_next,
	.seq_stop		= cgroup_seqfile_stop,
	.seq_show		= cgroup_seqfile_show,
};

/* set uid and gid of cgroup dirs and files to that of the creator */
static int cgroup_kn_set_ugid(struct kernfs_node *kn)
{
	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
			       .ia_uid = current_fsuid(),
			       .ia_gid = current_fsgid(), };

	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
		return 0;

	return kernfs_setattr(kn, &iattr);
}

static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
			   struct cftype *cft)
{
	char name[CGROUP_FILE_NAME_MAX];
	struct kernfs_node *kn;
	struct lock_class_key *key = NULL;
	int ret;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	key = &cft->lockdep_key;
#endif
	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
				  NULL, key);
	if (IS_ERR(kn))
		return PTR_ERR(kn);

	ret = cgroup_kn_set_ugid(kn);
	if (ret) {
		kernfs_remove(kn);
		return ret;
	}

	if (cft->file_offset) {
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = kn;
		spin_unlock_irq(&cgroup_file_kn_lock);
	}

	return 0;
}

/**
 * cgroup_addrm_files - add or remove files to a cgroup directory
 * @css: the target css
 * @cgrp: the target cgroup (usually css->cgroup)
 * @cfts: array of cftypes to be added
 * @is_add: whether to add or remove
 *
 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
 * For removals, this function never fails.
 */
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add)
{
	struct cftype *cft, *cft_end = NULL;
	int ret = 0;

	lockdep_assert_held(&cgroup_mutex);

restart:
	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
		/* does cft->flags tell us to skip this file on @cgrp? */
		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
			continue;
		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
			continue;
		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
			continue;
		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
			continue;

		if (is_add) {
			ret = cgroup_add_file(css, cgrp, cft);
			if (ret) {
				pr_warn("%s: failed to add %s, err=%d\n",
					__func__, cft->name, ret);
				cft_end = cft;
				is_add = false;
				goto restart;
			}
		} else {
			cgroup_rm_file(cgrp, cft);
		}
	}
	return ret;
}

static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
	struct cgroup_subsys *ss = cfts[0].ss;
	struct cgroup *root = &ss->root->cgrp;
	struct cgroup_subsys_state *css;
	int ret = 0;

	lockdep_assert_held(&cgroup_mutex);

	/* add/rm files for all cgroups created before */
	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
		struct cgroup *cgrp = css->cgroup;

		if (!(css->flags & CSS_VISIBLE))
			continue;

		ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
		if (ret)
			break;
	}

	if (is_add && !ret)
		kernfs_activate(root->kn);
	return ret;
}

static void cgroup_exit_cftypes(struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		/* free copy for custom atomic_write_len, see init_cftypes() */
		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
			kfree(cft->kf_ops);
		cft->kf_ops = NULL;
		cft->ss = NULL;

		/* revert flags set by cgroup core while adding @cfts */
		cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
	}
}

static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		struct kernfs_ops *kf_ops;

		WARN_ON(cft->ss || cft->kf_ops);

		if (cft->seq_start)
			kf_ops = &cgroup_kf_ops;
		else
			kf_ops = &cgroup_kf_single_ops;

		/*
		 * Ugh... if @cft wants a custom max_write_len, we need to
		 * make a copy of kf_ops to set its atomic_write_len.
		 */
		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
			kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
			if (!kf_ops) {
				cgroup_exit_cftypes(cfts);
				return -ENOMEM;
			}
			kf_ops->atomic_write_len = cft->max_write_len;
		}

		cft->kf_ops = kf_ops;
		cft->ss = ss;
	}

	return 0;
}

static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!cfts || !cfts[0].ss)
		return -ENOENT;

	list_del(&cfts->node);
	cgroup_apply_cftypes(cfts, false);
	cgroup_exit_cftypes(cfts);
	return 0;
}

/**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Unregister @cfts.  Files described by @cfts are removed from all
 * existing cgroups and all future cgroups won't have them either.  This
 * function can be called anytime whether @cfts' subsys is attached or not.
 *
 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
 * registered.
 */
int cgroup_rm_cftypes(struct cftype *cfts)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = cgroup_rm_cftypes_locked(cfts);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts to @ss.  Files described by @cfts are created for all
 * existing cgroups to which @ss is attached and all future cgroups will
 * have them too.  This function can be called anytime whether @ss is
 * attached or not.
 *
 * Returns 0 on successful registration, -errno on failure.  Note that this
 * function currently returns 0 as long as @cfts registration is successful
 * even if some file creation attempts on existing cgroups fail.
 */
static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	int ret;

	if (!cgroup_ssid_enabled(ss->id))
		return 0;

	if (!cfts || cfts[0].name[0] == '\0')
		return 0;

	ret = cgroup_init_cftypes(ss, cfts);
	if (ret)
		return ret;

	mutex_lock(&cgroup_mutex);

	list_add_tail(&cfts->node, &ss->cfts);
	ret = cgroup_apply_cftypes(cfts, true);
	if (ret)
		cgroup_rm_cftypes_locked(cfts);

	mutex_unlock(&cgroup_mutex);
	return ret;
}

/**
 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Similar to cgroup_add_cftypes() but the added files are only used for
 * the default hierarchy.
 */
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
		cft->flags |= __CFTYPE_ONLY_ON_DFL;
	return cgroup_add_cftypes(ss, cfts);
}

/**
 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Similar to cgroup_add_cftypes() but the added files are only used for
 * the legacy hierarchies.
 */
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
		cft->flags |= __CFTYPE_NOT_ON_DFL;
	return cgroup_add_cftypes(ss, cfts);
}
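
/*
 * Illustrative sketch (controller and file names are made up): a controller
 * registers its interface files by handing a cftype array, terminated by an
 * entry with a zero-length name, to one of the helpers above.
 *
 *	static struct cftype example_legacy_files[] = {
 *		{
 *			.name = "example.value",
 *			.read_u64 = example_value_read,
 *			.write_u64 = example_value_write,
 *		},
 *		{ }	(an empty name terminates the array)
 *	};
 *
 *	cgroup_add_legacy_cftypes(&example_cgrp_subsys, example_legacy_files);
 */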

/**
 * cgroup_file_notify - generate a file modified event for a cgroup_file
 * @cfile: target cgroup_file
 *
 * @cfile must have been obtained by setting cftype->file_offset.
 */
void cgroup_file_notify(struct cgroup_file *cfile)
{
	unsigned long flags;

	spin_lock_irqsave(&cgroup_file_kn_lock, flags);
	if (cfile->kn)
		kernfs_notify(cfile->kn);
	spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}
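
/*
 * Illustrative sketch (structure and file names are made up): a controller
 * that wants to kick poll/inotify waiters on one of its files embeds a
 * struct cgroup_file next to its css, points cftype->file_offset at it and
 * later calls cgroup_file_notify().  As cgroup_add_file() above applies the
 * offset to the css pointer, this assumes the css is the first member.
 *
 *	struct example_css {
 *		struct cgroup_subsys_state css;
 *		struct cgroup_file events_file;
 *	};
 *
 *	static struct cftype example_files[] = {
 *		{
 *			.name = "example.events",
 *			.file_offset = offsetof(struct example_css, events_file),
 *			.seq_show = example_events_show,
 *		},
 *		{ }
 *	};
 *
 *	cgroup_file_notify(&ex->events_file);	(whenever an event fires)
 */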

/**
 * css_next_child - find the next child of a given css
 * @pos: the current position (%NULL to initiate traversal)
 * @parent: css whose children to walk
 *
 * This function returns the next child of @parent and should be called
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent and @pos are accessible.  The next sibling is guaranteed to
 * be returned regardless of their states.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 */
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
					   struct cgroup_subsys_state *parent)
{
	struct cgroup_subsys_state *next;

	cgroup_assert_mutex_or_rcu_locked();

	/*
	 * @pos could already have been unlinked from the sibling list.
	 * Once a cgroup is removed, its ->sibling.next is no longer
	 * updated when its next sibling changes.  CSS_RELEASED is set when
	 * @pos is taken off list, at which time its next pointer is valid,
	 * and, as releases are serialized, the one pointed to by the next
	 * pointer is guaranteed to not have started release yet.  This
	 * implies that if we observe !CSS_RELEASED on @pos in this RCU
	 * critical section, the one pointed to by its next pointer is
	 * guaranteed to not have finished its RCU grace period even if we
	 * have dropped rcu_read_lock() inbetween iterations.
	 *
	 * If @pos has CSS_RELEASED set, its next pointer can't be
	 * dereferenced; however, as each css is given a monotonically
	 * increasing unique serial number and always appended to the
	 * sibling list, the next one can be found by walking the parent's
	 * children until the first css with higher serial number than
	 * @pos's.  While this path can be slower, it happens iff iteration
	 * races against release and the race window is very small.
	 */
	if (!pos) {
		next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
	} else if (likely(!(pos->flags & CSS_RELEASED))) {
		next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
	} else {
		list_for_each_entry_rcu(next, &parent->children, sibling)
			if (next->serial_nr > pos->serial_nr)
				break;
	}

	/*
	 * @next, if not pointing to the head, can be dereferenced and is
	 * the next sibling.
	 */
	if (&next->sibling != &parent->children)
		return next;
	return NULL;
}

/**
 * css_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_pre().  Find the next descendant
 * to visit for pre-order traversal of @root's descendants.  @root is
 * included in the iteration and the first node to be visited.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of @root.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
3429
 */
3430 3431 3432
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
			struct cgroup_subsys_state *root)
3433
{
3434
	struct cgroup_subsys_state *next;
3435

T
Tejun Heo 已提交
3436
	cgroup_assert_mutex_or_rcu_locked();
3437

3438
	/* if first iteration, visit @root */
3439
	if (!pos)
3440
		return root;
3441 3442

	/* visit the first child if exists */
3443
	next = css_next_child(NULL, pos);
3444 3445 3446 3447
	if (next)
		return next;

	/* no child, visit my or the closest ancestor's next sibling */
3448
	while (pos != root) {
T
Tejun Heo 已提交
3449
		next = css_next_child(pos, pos->parent);
3450
		if (next)
3451
			return next;
T
Tejun Heo 已提交
3452
		pos = pos->parent;
3453
	}
3454 3455 3456 3457

	return NULL;
}
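
/*
 * Usage sketch: a pre-order walk with css_for_each_descendant_pre().  The
 * helper name and the CSS_ONLINE check are illustrative; a real subsystem
 * would do its own on/offline synchronization as described above.
 */
static inline int css_count_online_descendants_sketch(struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *pos;
	int nr = 0;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root)
		if (pos->flags & CSS_ONLINE)
			nr++;
	rcu_read_unlock();

	return nr;
}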

/**
 * css_rightmost_descendant - return the rightmost descendant of a css
 * @pos: css of interest
 *
 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
 * is returned.  This can be used during pre-order traversal to skip
 * subtree of @pos.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct rightmost descendant as
 * long as @pos is accessible.
 */
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
	struct cgroup_subsys_state *last, *tmp;

	cgroup_assert_mutex_or_rcu_locked();

	do {
		last = pos;
		/* ->prev isn't RCU safe, walk ->next till the end */
		pos = NULL;
		css_for_each_child(tmp, last)
			pos = tmp;
	} while (pos);

	return last;
}
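
/*
 * Usage sketch: skipping an entire subtree during a pre-order walk by
 * jumping @pos to its rightmost descendant, as the comment above suggests.
 * The should_skip() predicate and the helper name are hypothetical.
 */
static inline void css_walk_skipping_sketch(struct cgroup_subsys_state *root,
					    bool (*should_skip)(struct cgroup_subsys_state *))
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root)
		if (should_skip(pos))
			pos = css_rightmost_descendant(pos);
	rcu_read_unlock();
}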

static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
{
	struct cgroup_subsys_state *last;

	do {
		last = pos;
		pos = css_next_child(NULL, pos);
	} while (pos);

	return last;
}

/**
 * css_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_post().  Find the next descendant
 * to visit for post-order traversal of @root's descendants.  @root is
 * included in the iteration and the last node to be visited.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of @root.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 */
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
			 struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *next;

	cgroup_assert_mutex_or_rcu_locked();

	/* if first iteration, visit leftmost descendant which may be @root */
	if (!pos)
		return css_leftmost_descendant(root);

	/* if we visited @root, we're done */
	if (pos == root)
		return NULL;

	/* if there's an unvisited sibling, visit its leftmost descendant */
	next = css_next_child(pos, pos->parent);
	if (next)
		return css_leftmost_descendant(next);

	/* no sibling left, visit parent */
	return pos->parent;
}
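
/*
 * Usage sketch: a post-order walk visits every child before its parent and
 * finishes with @root itself, which makes it suitable for bottom-up
 * aggregation.  The visit() callback and the helper name are hypothetical.
 */
static inline void css_walk_post_order_sketch(struct cgroup_subsys_state *root,
					      void (*visit)(struct cgroup_subsys_state *))
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_post(pos, root)
		visit(pos);
	rcu_read_unlock();
}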

/**
 * css_has_online_children - does a css have online children
 * @css: the target css
 *
 * Returns %true if @css has any online children; otherwise, %false.  This
 * function can be called from any context but the caller is responsible
 * for synchronizing against on/offlining as necessary.
 */
bool css_has_online_children(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys_state *child;
	bool ret = false;

	rcu_read_lock();
	css_for_each_child(child, css) {
		if (child->flags & CSS_ONLINE) {
			ret = true;
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}

/**
 * css_task_iter_advance_css_set - advance a task iterator to the next css_set
 * @it: the iterator to advance
 *
 * Advance @it to the next css_set to walk.
 */
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
	struct list_head *l = it->cset_pos;
	struct cgrp_cset_link *link;
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	/* Advance to the next non-empty css_set */
	do {
		l = l->next;
		if (l == it->cset_head) {
			it->cset_pos = NULL;
			it->task_pos = NULL;
			return;
		}

		if (it->ss) {
			cset = container_of(l, struct css_set,
					    e_cset_node[it->ss->id]);
		} else {
			link = list_entry(l, struct cgrp_cset_link, cset_link);
			cset = link->cset;
		}
	} while (!css_set_populated(cset));

	it->cset_pos = l;

	if (!list_empty(&cset->tasks))
		it->task_pos = cset->tasks.next;
	else
		it->task_pos = cset->mg_tasks.next;

	it->tasks_head = &cset->tasks;
	it->mg_tasks_head = &cset->mg_tasks;

	/*
	 * We don't keep css_sets locked across iteration steps and thus
	 * need to take steps to ensure that iteration can be resumed after
	 * the lock is re-acquired.  Iteration is performed at two levels -
	 * css_sets and tasks in them.
	 *
	 * Once created, a css_set never leaves its cgroup lists, so a
	 * pinned css_set is guaranteed to stay put and we can resume
	 * iteration afterwards.
	 *
	 * Tasks may leave @cset across iteration steps.  This is resolved
	 * by registering each iterator with the css_set currently being
	 * walked and making css_set_move_task() advance iterators whose
	 * next task is leaving.
	 */
	if (it->cur_cset) {
		list_del(&it->iters_node);
		put_css_set_locked(it->cur_cset);
	}
	get_css_set(cset);
	it->cur_cset = cset;
	list_add(&it->iters_node, &cset->task_iters);
}

static void css_task_iter_advance(struct css_task_iter *it)
{
	struct list_head *l = it->task_pos;

	lockdep_assert_held(&css_set_lock);
	WARN_ON_ONCE(!l);

repeat:
	/*
	 * Advance iterator to find next entry.  cset->tasks is consumed
	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
	 * next cset.
	 */
	l = l->next;

	if (l == it->tasks_head)
		l = it->mg_tasks_head->next;

	if (l == it->mg_tasks_head)
		css_task_iter_advance_css_set(it);
	else
		it->task_pos = l;

	/* if PROCS, skip over tasks which aren't group leaders */
	if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
	    !thread_group_leader(list_entry(it->task_pos, struct task_struct,
					    cg_list)))
		goto repeat;
}

/**
 * css_task_iter_start - initiate task iteration
 * @css: the css to walk tasks of
 * @flags: CSS_TASK_ITER_* flags
 * @it: the task iterator to use
 *
 * Initiate iteration through the tasks of @css.  The caller can call
 * css_task_iter_next() to walk through the tasks until the function
 * returns NULL.  On completion of iteration, css_task_iter_end() must be
 * called.
 */
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
			 struct css_task_iter *it)
{
	/* no one should try to iterate before mounting cgroups */
	WARN_ON_ONCE(!use_task_css_set_links);

	memset(it, 0, sizeof(*it));

	spin_lock_irq(&css_set_lock);

	it->ss = css->ss;
	it->flags = flags;

	if (it->ss)
		it->cset_pos = &css->cgroup->e_csets[css->ss->id];
	else
		it->cset_pos = &css->cgroup->cset_links;

	it->cset_head = it->cset_pos;

	css_task_iter_advance_css_set(it);

	spin_unlock_irq(&css_set_lock);
}

/**
 * css_task_iter_next - return the next task for the iterator
 * @it: the task iterator being iterated
 *
 * The "next" function for task iteration.  @it should have been
 * initialized via css_task_iter_start().  Returns NULL when the iteration
 * reaches the end.
 */
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
	if (it->cur_task) {
		put_task_struct(it->cur_task);
		it->cur_task = NULL;
	}

	spin_lock_irq(&css_set_lock);

	if (it->task_pos) {
		it->cur_task = list_entry(it->task_pos, struct task_struct,
					  cg_list);
		get_task_struct(it->cur_task);
		css_task_iter_advance(it);
	}

	spin_unlock_irq(&css_set_lock);

	return it->cur_task;
}

/**
 * css_task_iter_end - finish task iteration
 * @it: the task iterator to finish
 *
 * Finish task iteration started by css_task_iter_start().
 */
void css_task_iter_end(struct css_task_iter *it)
{
	if (it->cur_cset) {
		spin_lock_irq(&css_set_lock);
		list_del(&it->iters_node);
		put_css_set_locked(it->cur_cset);
		spin_unlock_irq(&css_set_lock);
	}

	if (it->cur_task)
		put_task_struct(it->cur_task);
}
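
/*
 * Usage sketch for the iterator above: walking every task attached to a
 * css.  Each css_task_iter_next() returns a task pinned by the iterator,
 * so no extra reference is needed inside the loop.  The helper name is
 * hypothetical; passing 0 as @flags iterates all tasks including threads.
 */
static inline int css_count_tasks_sketch(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int nr = 0;

	css_task_iter_start(css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		nr++;
	css_task_iter_end(&it);

	return nr;
}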

static void cgroup_procs_release(struct kernfs_open_file *of)
{
	if (of->priv) {
		css_task_iter_end(of->priv);
		kfree(of->priv);
	}
}

static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct kernfs_open_file *of = s->private;
	struct css_task_iter *it = of->priv;

	return css_task_iter_next(it);
}

static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup *cgrp = seq_css(s)->cgroup;
	struct css_task_iter *it = of->priv;

	/*
	 * When a seq_file is seeked, it's always traversed sequentially
	 * from position 0, so we can simply keep iterating on !0 *pos.
	 */
	if (!it) {
		if (WARN_ON_ONCE((*pos)++))
			return ERR_PTR(-EINVAL);

		it = kzalloc(sizeof(*it), GFP_KERNEL);
		if (!it)
			return ERR_PTR(-ENOMEM);
		of->priv = it;
		css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it);
	} else if (!(*pos)++) {
		css_task_iter_end(it);
		css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it);
	}

	return cgroup_procs_next(s, NULL, NULL);
}

static int cgroup_procs_show(struct seq_file *s, void *v)
{
	seq_printf(s, "%d\n", task_pid_vnr(v));
	return 0;
}

static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
					 struct cgroup *dst_cgrp,
					 struct super_block *sb)
{
	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
	struct cgroup *com_cgrp = src_cgrp;
	struct inode *inode;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	/* find the common ancestor */
	while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
		com_cgrp = cgroup_parent(com_cgrp);

	/* %current should be authorized to migrate to the common ancestor */
	inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
	if (!inode)
		return -ENOMEM;

	ret = inode_permission(inode, MAY_WRITE);
	iput(inode);
	if (ret)
		return ret;

	/*
	 * If namespaces are delegation boundaries, %current must be able
	 * to see both source and destination cgroups from its namespace.
	 */
	if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
	    (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
	     !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
		return -ENOENT;

	return 0;
}

static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *src_cgrp, *dst_cgrp;
	struct task_struct *task;
	ssize_t ret;

	dst_cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!dst_cgrp)
		return -ENODEV;

	task = cgroup_procs_write_start(buf, true);
	ret = PTR_ERR_OR_ZERO(task);
	if (ret)
		goto out_unlock;

	/* find the source cgroup */
	spin_lock_irq(&css_set_lock);
	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
					    of->file->f_path.dentry->d_sb);
	if (ret)
		goto out_finish;

	ret = cgroup_attach_task(dst_cgrp, task, true);

out_finish:
	cgroup_procs_write_finish(task);
out_unlock:
	cgroup_kn_unlock(of->kn);

	return ret ?: nbytes;
}

/* cgroup core interface files for the default hierarchy */
static struct cftype cgroup_base_files[] = {
	{
		.name = "cgroup.procs",
		.flags = CFTYPE_NS_DELEGATABLE,
		.file_offset = offsetof(struct cgroup, procs_file),
		.release = cgroup_procs_release,
		.seq_start = cgroup_procs_start,
		.seq_next = cgroup_procs_next,
		.seq_show = cgroup_procs_show,
		.write = cgroup_procs_write,
	},
	{
		.name = "cgroup.controllers",
		.seq_show = cgroup_controllers_show,
	},
	{
		.name = "cgroup.subtree_control",
		.flags = CFTYPE_NS_DELEGATABLE,
		.seq_show = cgroup_subtree_control_show,
		.write = cgroup_subtree_control_write,
	},
	{
		.name = "cgroup.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct cgroup, events_file),
		.seq_show = cgroup_events_show,
	},
	{ }	/* terminate */
};
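
/*
 * Sketch of how a controller exposes its own interface files with the same
 * machinery: a cftype array terminated by an empty entry, usually hooked up
 * by pointing cgroup_subsys->dfl_cftypes at it so cgroup_init() below
 * registers it.  Every name and the show function here are hypothetical.
 */
static int __maybe_unused example_current_show(struct seq_file *sf, void *v)
{
	seq_printf(sf, "%d\n", 0);	/* a real controller prints real state */
	return 0;
}

static struct cftype example_files[] __maybe_unused = {
	{
		.name = "example.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = example_current_show,
	},
	{ }	/* terminate */
};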

/*
 * css destruction is a four-stage process.
 *
 * 1. Destruction starts.  Killing of the percpu_ref is initiated.
 *    Implemented in kill_css().
 *
 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
 *    and thus css_tryget_online() is guaranteed to fail, the css can be
 *    offlined by invoking offline_css().  After offlining, the base ref is
 *    put.  Implemented in css_killed_work_fn().
 *
 * 3. When the percpu_ref reaches zero, the only possible remaining
 *    accessors are inside RCU read sections.  css_release() schedules the
 *    RCU callback.
 *
 * 4. After the grace period, the css can be freed.  Implemented in
 *    css_free_work_fn().
 *
 * It is actually hairier because both steps 2 and 4 require process
 * context and thus involve punting to css->destroy_work, adding two
 * additional steps to the already complex sequence.
 */
static void css_free_work_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, destroy_work);
	struct cgroup_subsys *ss = css->ss;
	struct cgroup *cgrp = css->cgroup;

	percpu_ref_exit(&css->refcnt);

	if (ss) {
		/* css free path */
		struct cgroup_subsys_state *parent = css->parent;
		int id = css->id;

		ss->css_free(css);
		cgroup_idr_remove(&ss->css_idr, id);
		cgroup_put(cgrp);

		if (parent)
			css_put(parent);
	} else {
		/* cgroup free path */
		atomic_dec(&cgrp->root->nr_cgrps);
		cgroup1_pidlist_destroy_all(cgrp);
		cancel_work_sync(&cgrp->release_agent_work);

		if (cgroup_parent(cgrp)) {
			/*
			 * We get a ref to the parent, and put the ref when
			 * this cgroup is being freed, so it's guaranteed
			 * that the parent won't be destroyed before its
			 * children.
			 */
			cgroup_put(cgroup_parent(cgrp));
			kernfs_put(cgrp->kn);
			kfree(cgrp);
		} else {
			/*
			 * This is root cgroup's refcnt reaching zero,
			 * which indicates that the root should be
			 * released.
			 */
			cgroup_destroy_root(cgrp->root);
		}
	}
}

static void css_free_rcu_fn(struct rcu_head *rcu_head)
{
	struct cgroup_subsys_state *css =
		container_of(rcu_head, struct cgroup_subsys_state, rcu_head);

	INIT_WORK(&css->destroy_work, css_free_work_fn);
	queue_work(cgroup_destroy_wq, &css->destroy_work);
}

static void css_release_work_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, destroy_work);
	struct cgroup_subsys *ss = css->ss;
	struct cgroup *cgrp = css->cgroup;

	mutex_lock(&cgroup_mutex);

	css->flags |= CSS_RELEASED;
	list_del_rcu(&css->sibling);

	if (ss) {
		/* css release path */
		cgroup_idr_replace(&ss->css_idr, NULL, css->id);
		if (ss->css_released)
			ss->css_released(css);
	} else {
		/* cgroup release path */
		trace_cgroup_release(cgrp);

		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
		cgrp->id = -1;

		/*
		 * There are two control paths which try to determine
		 * cgroup from dentry without going through kernfs -
		 * cgroupstats_build() and css_tryget_online_from_dir().
		 * Those are supported by RCU protecting clearing of
		 * cgrp->kn->priv backpointer.
		 */
		if (cgrp->kn)
			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
					 NULL);

		cgroup_bpf_put(cgrp);
	}

	mutex_unlock(&cgroup_mutex);

	call_rcu(&css->rcu_head, css_free_rcu_fn);
}

static void css_release(struct percpu_ref *ref)
{
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

	INIT_WORK(&css->destroy_work, css_release_work_fn);
	queue_work(cgroup_destroy_wq, &css->destroy_work);
}

static void init_and_link_css(struct cgroup_subsys_state *css,
			      struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	lockdep_assert_held(&cgroup_mutex);

	cgroup_get_live(cgrp);

	memset(css, 0, sizeof(*css));
	css->cgroup = cgrp;
	css->ss = ss;
	css->id = -1;
	INIT_LIST_HEAD(&css->sibling);
	INIT_LIST_HEAD(&css->children);
	css->serial_nr = css_serial_nr_next++;
	atomic_set(&css->online_cnt, 0);

	if (cgroup_parent(cgrp)) {
		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
		css_get(css->parent);
	}

	BUG_ON(cgroup_css(cgrp, ss));
}

/* invoke ->css_online() on a new CSS and mark it online if successful */
static int online_css(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;
	int ret = 0;

	lockdep_assert_held(&cgroup_mutex);

	if (ss->css_online)
		ret = ss->css_online(css);
	if (!ret) {
		css->flags |= CSS_ONLINE;
		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);

		atomic_inc(&css->online_cnt);
		if (css->parent)
			atomic_inc(&css->parent->online_cnt);
	}
	return ret;
}

/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
static void offline_css(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;

	lockdep_assert_held(&cgroup_mutex);

	if (!(css->flags & CSS_ONLINE))
		return;

	if (ss->css_reset)
		ss->css_reset(css);

	if (ss->css_offline)
		ss->css_offline(css);

	css->flags &= ~CSS_ONLINE;
	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);

	wake_up_all(&css->cgroup->offline_waitq);
}

/**
 * css_create - create a cgroup_subsys_state
 * @cgrp: the cgroup new css will be associated with
 * @ss: the subsys of new css
 *
 * Create a new css associated with @cgrp - @ss pair.  On success, the new
 * css is online and installed in @cgrp.  This function doesn't create the
 * interface files.  Returns the new css on success, ERR_PTR(-errno) on
 * failure.
 */
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
	struct cgroup_subsys_state *css;
	int err;

	lockdep_assert_held(&cgroup_mutex);

	css = ss->css_alloc(parent_css);
	if (!css)
		css = ERR_PTR(-ENOMEM);
	if (IS_ERR(css))
		return css;

	init_and_link_css(css, ss, cgrp);

	err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
	if (err)
		goto err_free_css;

	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
	if (err < 0)
		goto err_free_css;
	css->id = err;

	/* @css is ready to be brought online now, make it visible */
	list_add_tail_rcu(&css->sibling, &parent_css->children);
	cgroup_idr_replace(&ss->css_idr, css, css->id);

	err = online_css(css);
	if (err)
		goto err_list_del;

	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
	    cgroup_parent(parent)) {
		pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
			current->comm, current->pid, ss->name);
		if (!strcmp(ss->name, "memory"))
			pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
		ss->warned_broken_hierarchy = true;
	}

	return css;

err_list_del:
	list_del_rcu(&css->sibling);
err_free_css:
	call_rcu(&css->rcu_head, css_free_rcu_fn);
	return ERR_PTR(err);
}

/*
 * The returned cgroup is fully initialized including its control mask, but
 * it isn't associated with its kernfs_node and doesn't have the control
 * mask applied.
 */
static struct cgroup *cgroup_create(struct cgroup *parent)
{
	struct cgroup_root *root = parent->root;
	struct cgroup *cgrp, *tcgrp;
	int level = parent->level + 1;
	int ret;

	/* allocate the cgroup and its ID, 0 is reserved for the root */
	cgrp = kzalloc(sizeof(*cgrp) +
		       sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
	if (!cgrp)
		return ERR_PTR(-ENOMEM);

	ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
	if (ret)
		goto out_free_cgrp;

	/*
	 * Temporarily set the pointer to NULL, so idr_find() won't return
	 * a half-baked cgroup.
	 */
	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
	if (cgrp->id < 0) {
		ret = -ENOMEM;
		goto out_cancel_ref;
	}

	init_cgroup_housekeeping(cgrp);

	cgrp->self.parent = &parent->self;
	cgrp->root = root;
	cgrp->level = level;

	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;

	if (notify_on_release(parent))
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);

	cgrp->self.serial_nr = css_serial_nr_next++;

	/* allocation complete, commit to creation */
	list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
	atomic_inc(&root->nr_cgrps);
	cgroup_get_live(parent);

	/*
	 * @cgrp is now fully operational.  If something fails after this
	 * point, it'll be released via the normal destruction path.
	 */
	cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);

	/*
	 * On the default hierarchy, a child doesn't automatically inherit
	 * subtree_control from the parent.  Each is configured manually.
	 */
	if (!cgroup_on_dfl(cgrp))
		cgrp->subtree_control = cgroup_control(cgrp);

	if (parent)
		cgroup_bpf_inherit(cgrp, parent);

	cgroup_propagate_control(cgrp);

	return cgrp;

out_cancel_ref:
	percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
	kfree(cgrp);
	return ERR_PTR(ret);
}

int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
	struct cgroup *parent, *cgrp;
	struct kernfs_node *kn;
	int ret;

	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
	if (strchr(name, '\n'))
		return -EINVAL;

	parent = cgroup_kn_lock_live(parent_kn, false);
	if (!parent)
		return -ENODEV;

	cgrp = cgroup_create(parent);
	if (IS_ERR(cgrp)) {
		ret = PTR_ERR(cgrp);
		goto out_unlock;
	}

	/* create the directory */
	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
	if (IS_ERR(kn)) {
		ret = PTR_ERR(kn);
		goto out_destroy;
	}
	cgrp->kn = kn;

	/*
	 * This extra ref will be put in cgroup_free_fn() and guarantees
	 * that @cgrp->kn is always accessible.
	 */
	kernfs_get(kn);

	ret = cgroup_kn_set_ugid(kn);
	if (ret)
		goto out_destroy;

	ret = css_populate_dir(&cgrp->self);
	if (ret)
		goto out_destroy;

	ret = cgroup_apply_control_enable(cgrp);
	if (ret)
		goto out_destroy;

	trace_cgroup_mkdir(cgrp);

	/* let's create and online css's */
	kernfs_activate(kn);

	ret = 0;
	goto out_unlock;

out_destroy:
	cgroup_destroy_locked(cgrp);
out_unlock:
	cgroup_kn_unlock(parent_kn);
	return ret;
}

/*
 * This is called when the refcnt of a css is confirmed to be killed.
 * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
 * initiate destruction and put the css ref from kill_css().
 */
static void css_killed_work_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, destroy_work);

	mutex_lock(&cgroup_mutex);

	do {
		offline_css(css);
		css_put(css);
		/* @css can't go away while we're holding cgroup_mutex */
		css = css->parent;
	} while (css && atomic_dec_and_test(&css->online_cnt));

	mutex_unlock(&cgroup_mutex);
}

/* css kill confirmation processing requires process context, bounce */
static void css_killed_ref_fn(struct percpu_ref *ref)
{
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

	if (atomic_dec_and_test(&css->online_cnt)) {
		INIT_WORK(&css->destroy_work, css_killed_work_fn);
		queue_work(cgroup_destroy_wq, &css->destroy_work);
	}
}

/**
 * kill_css - destroy a css
 * @css: css to destroy
 *
 * This function initiates destruction of @css by removing cgroup interface
 * files and putting its base reference.  ->css_offline() will be invoked
 * asynchronously once css_tryget_online() is guaranteed to fail and when
 * the reference count reaches zero, @css will be released.
 */
static void kill_css(struct cgroup_subsys_state *css)
{
	lockdep_assert_held(&cgroup_mutex);

	if (css->flags & CSS_DYING)
		return;

	css->flags |= CSS_DYING;

	/*
	 * This must happen before css is disassociated with its cgroup.
	 * See seq_css() for details.
	 */
	css_clear_dir(css);

	/*
	 * Killing would put the base ref, but we need to keep it alive
	 * until after ->css_offline().
	 */
	css_get(css);

	/*
	 * cgroup core guarantees that, by the time ->css_offline() is
	 * invoked, no new css reference will be given out via
	 * css_tryget_online().  We can't simply call percpu_ref_kill() and
	 * proceed to offlining css's because percpu_ref_kill() doesn't
	 * guarantee that the ref is seen as killed on all CPUs on return.
	 *
	 * Use percpu_ref_kill_and_confirm() to get notifications as each
	 * css is confirmed to be seen as killed on all CPUs.
	 */
	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}

/**
 * cgroup_destroy_locked - the first stage of cgroup destruction
 * @cgrp: cgroup to be destroyed
 *
 * css's make use of percpu refcnts whose killing latency shouldn't be
 * exposed to userland and are RCU protected.  Also, cgroup core needs to
 * guarantee that css_tryget_online() won't succeed by the time
 * ->css_offline() is invoked.  To satisfy all the requirements,
 * destruction is implemented in the following two steps.
 *
 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
 *     userland visible parts and start killing the percpu refcnts of
 *     css's.  Set up so that the next stage will be kicked off once all
 *     the percpu refcnts are confirmed to be killed.
 *
 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
 *     rest of destruction.  Once all cgroup references are gone, the
 *     cgroup is RCU-freed.
 *
 * This function implements s1.  After this step, @cgrp is gone as far as
 * the userland is concerned and a new cgroup with the same name may be
 * created.  As cgroup doesn't care about the names internally, this
 * doesn't cause any problem.
 */
static int cgroup_destroy_locked(struct cgroup *cgrp)
	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
	struct cgroup_subsys_state *css;
	struct cgrp_cset_link *link;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/*
	 * Only migration can raise populated from zero and we're already
	 * holding cgroup_mutex.
	 */
	if (cgroup_is_populated(cgrp))
		return -EBUSY;

	/*
	 * Make sure there are no live children.  We can't test emptiness
	 * of ->self.children as dead children linger on it while being
	 * drained; otherwise, "rmdir parent/child parent" may fail.
	 */
	if (css_has_online_children(&cgrp->self))
		return -EBUSY;

	/*
	 * Mark @cgrp and the associated csets dead.  The former prevents
	 * further task migration and child creation by disabling
	 * cgroup_lock_live_group().  The latter makes the csets ignored by
	 * the migration path.
	 */
	cgrp->self.flags &= ~CSS_ONLINE;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		link->cset->dead = true;
	spin_unlock_irq(&css_set_lock);

	/* initiate massacre of all css's */
	for_each_css(css, ssid, cgrp)
		kill_css(css);

	/*
	 * Remove @cgrp directory along with the base files.  @cgrp has an
	 * extra ref on its kn.
	 */
	kernfs_remove(cgrp->kn);

	cgroup1_check_for_release(cgroup_parent(cgrp));

	/* put the base reference */
	percpu_ref_kill(&cgrp->self.refcnt);

	return 0;
};

int cgroup_rmdir(struct kernfs_node *kn)
{
	struct cgroup *cgrp;
	int ret = 0;

	cgrp = cgroup_kn_lock_live(kn, false);
	if (!cgrp)
		return 0;

	ret = cgroup_destroy_locked(cgrp);

	if (!ret)
		trace_cgroup_rmdir(cgrp);

	cgroup_kn_unlock(kn);
	return ret;
}

static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
	.show_options		= cgroup_show_options,
	.remount_fs		= cgroup_remount,
	.mkdir			= cgroup_mkdir,
	.rmdir			= cgroup_rmdir,
	.show_path		= cgroup_show_path,
};

static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
	struct cgroup_subsys_state *css;

	pr_debug("Initializing cgroup subsys %s\n", ss->name);

	mutex_lock(&cgroup_mutex);

	idr_init(&ss->css_idr);
	INIT_LIST_HEAD(&ss->cfts);

	/* Create the root cgroup state for this subsystem */
	ss->root = &cgrp_dfl_root;
	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);

	/*
	 * Root csses are never destroyed and we can't initialize
	 * percpu_ref during early init.  Disable refcnting.
	 */
	css->flags |= CSS_NO_REF;

	if (early) {
		/* allocation can't be done safely during early init */
		css->id = 1;
	} else {
		css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
		BUG_ON(css->id < 0);
	}

	/* Update the init_css_set to contain a subsys
	 * pointer to this state - since the subsystem is
	 * newly registered, all tasks and hence the
	 * init_css_set is in the subsystem's root cgroup. */
	init_css_set.subsys[ss->id] = css;

	have_fork_callback |= (bool)ss->fork << ss->id;
	have_exit_callback |= (bool)ss->exit << ss->id;
	have_free_callback |= (bool)ss->free << ss->id;
	have_canfork_callback |= (bool)ss->can_fork << ss->id;

	/* At system boot, before all subsystems have been
	 * registered, no tasks have been forked, so we don't
	 * need to invoke fork callbacks here. */
	BUG_ON(!list_empty(&init_task.tasks));

	BUG_ON(online_css(css));

	mutex_unlock(&cgroup_mutex);
}

/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
 */
int __init cgroup_init_early(void)
{
	static struct cgroup_sb_opts __initdata opts;
	struct cgroup_subsys *ss;
	int i;

	init_cgroup_root(&cgrp_dfl_root, &opts);
	cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;

	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

	for_each_subsys(ss, i) {
		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
		     ss->id, ss->name);
		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);

		ss->id = i;
		ss->name = cgroup_subsys_name[i];
		if (!ss->legacy_name)
			ss->legacy_name = cgroup_subsys_name[i];

		if (ss->early_init)
			cgroup_init_subsys(ss, true);
	}
	return 0;
}

static u16 cgroup_disable_mask __initdata;

/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 */
int __init cgroup_init(void)
{
	struct cgroup_subsys *ss;
	int ssid;

	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));

	/*
	 * The latency of the synchronize_sched() is too high for cgroups,
	 * avoid it at the cost of forcing all readers into the slow path.
	 */
	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);

	get_user_ns(init_cgroup_ns.user_ns);

	mutex_lock(&cgroup_mutex);

	/*
	 * Add init_css_set to the hash table so that dfl_root can link to
	 * it during init.
	 */
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));

	mutex_unlock(&cgroup_mutex);

	for_each_subsys(ss, ssid) {
		if (ss->early_init) {
			struct cgroup_subsys_state *css =
				init_css_set.subsys[ss->id];

			css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
						   GFP_KERNEL);
			BUG_ON(css->id < 0);
		} else {
			cgroup_init_subsys(ss, false);
		}

		list_add_tail(&init_css_set.e_cset_node[ssid],
			      &cgrp_dfl_root.cgrp.e_csets[ssid]);

		/*
		 * Setting dfl_root subsys_mask needs to consider the
		 * disabled flag and cftype registration needs kmalloc,
		 * both of which aren't available during early_init.
		 */
		if (cgroup_disable_mask & (1 << ssid)) {
			static_branch_disable(cgroup_subsys_enabled_key[ssid]);
			printk(KERN_INFO "Disabling %s control group subsystem\n",
			       ss->name);
			continue;
		}

		if (cgroup1_ssid_disabled(ssid))
			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
			       ss->name);

		cgrp_dfl_root.subsys_mask |= 1 << ss->id;

		if (ss->implicit_on_dfl)
			cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
		else if (!ss->dfl_cftypes)
			cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

		if (ss->dfl_cftypes == ss->legacy_cftypes) {
			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
		} else {
			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
		}

		if (ss->bind)
			ss->bind(init_css_set.subsys[ssid]);
	}

	/* init_css_set.subsys[] has been updated, re-hash */
	hash_del(&init_css_set.hlist);
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
	WARN_ON(register_filesystem(&cgroup_fs_type));
	WARN_ON(register_filesystem(&cgroup2_fs_type));
	WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));

	return 0;
}

static int __init cgroup_wq_init(void)
{
	/*
	 * There isn't much point in executing destruction path in
	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
	 * Use 1 for @max_active.
	 *
	 * We would prefer to do this in cgroup_init() above, but that
	 * is called before init_workqueues(): so leave this until after.
	 */
	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
	BUG_ON(!cgroup_destroy_wq);
	return 0;
}
core_initcall(cgroup_wq_init);

/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 */
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	int retval;
	struct cgroup_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	for_each_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int ssid, count = 0;

		if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
			continue;

		seq_printf(m, "%d:", root->hierarchy_id);
		if (root != &cgrp_dfl_root)
			for_each_subsys(ss, ssid)
				if (root->subsys_mask & (1 << ssid))
					seq_printf(m, "%s%s", count++ ? "," : "",
						   ss->legacy_name);
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');

		cgrp = task_cgroup_from_root(tsk, root);

		/*
		 * On traditional hierarchies, all zombie tasks show up as
		 * belonging to the root cgroup.  On the default hierarchy,
		 * while a zombie doesn't show up in "cgroup.procs" and
		 * thus can't be migrated, its /proc/PID/cgroup keeps
		 * reporting the cgroup it belonged to before exiting.  If
		 * the cgroup is removed before the zombie is reaped,
		 * " (deleted)" is appended to the cgroup path.
		 */
		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
			retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
						current->nsproxy->cgroup_ns);
			if (retval >= PATH_MAX)
				retval = -ENAMETOOLONG;
			if (retval < 0)
				goto out_unlock;

			seq_puts(m, buf);
		} else {
			seq_puts(m, "/");
		}

		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
			seq_puts(m, " (deleted)\n");
		else
			seq_putc(m, '\n');
	}

	retval = 0;
out_unlock:
	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	kfree(buf);
out:
	return retval;
}
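
/*
 * For reference, the lines produced above look roughly like the following
 * (paths and controller sets are illustrative).  v1 hierarchies print
 * their controller or name= list; the v2 hierarchy prints an empty one:
 *
 *	5:cpu,cpuacct:/user.slice
 *	1:name=systemd:/user.slice/user-1000.slice/session-2.scope
 *	0::/user.slice
 */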

/**
 * cgroup_fork - initialize cgroup related fields during copy_process()
 * @child: pointer to task_struct of the child process
 *
 * A task is associated with the init_css_set until cgroup_post_fork()
 * attaches it to the parent's css_set.  Empty cg_list indicates that
 * @child isn't holding a reference to its css_set.
 */
void cgroup_fork(struct task_struct *child)
{
	RCU_INIT_POINTER(child->cgroups, &init_css_set);
	INIT_LIST_HEAD(&child->cg_list);
}

/**
 * cgroup_can_fork - called on a new task before the process is exposed
 * @child: the task in question.
 *
 * This calls the subsystem can_fork() callbacks. If the can_fork() callback
 * returns an error, the fork aborts with that error code. This allows for
 * a cgroup subsystem to conditionally allow or deny new forks.
 */
int cgroup_can_fork(struct task_struct *child)
{
	struct cgroup_subsys *ss;
	int i, j, ret;

	do_each_subsys_mask(ss, i, have_canfork_callback) {
		ret = ss->can_fork(child);
		if (ret)
			goto out_revert;
	} while_each_subsys_mask();

	return 0;

out_revert:
	for_each_subsys(ss, j) {
		if (j >= i)
			break;
		if (ss->cancel_fork)
			ss->cancel_fork(child);
	}

	return ret;
}

/**
 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
 * @child: the task in question
 *
 * This calls the cancel_fork() callbacks if a fork failed *after*
 * cgroup_can_fork() succeeded.
 */
void cgroup_cancel_fork(struct task_struct *child)
{
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		if (ss->cancel_fork)
			ss->cancel_fork(child);
}

/**
 * cgroup_post_fork - called on a new task after adding it to the task list
 * @child: the task in question
 *
 * Adds the task to the list running through its css_set if necessary and
 * call the subsystem fork() callbacks.  Has to be after the task is
 * visible on the task list in case we race with the first call to
 * css_task_iter_start() - to guarantee that the new task ends up on its
 * list.
 */
void cgroup_post_fork(struct task_struct *child)
{
	struct cgroup_subsys *ss;
	int i;

	/*
	 * This may race against cgroup_enable_task_cg_lists().  As that
	 * function sets use_task_css_set_links before grabbing
	 * tasklist_lock and we just went through tasklist_lock to add
	 * @child, it's guaranteed that either we see the set
	 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
	 * @child during its iteration.
	 *
	 * If we won the race, @child is associated with %current's
	 * css_set.  Grabbing css_set_lock guarantees both that the
	 * association is stable, and, on completion of the parent's
	 * migration, @child is visible in the source of migration or
	 * already in the destination cgroup.  This guarantee is necessary
	 * when implementing operations which need to migrate all tasks of
	 * a cgroup to another.
	 *
	 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
	 * will remain in init_css_set.  This is safe because all tasks are
	 * in the init_css_set before cg_links is enabled and there's no
	 * operation which transfers all tasks out of init_css_set.
	 */
	if (use_task_css_set_links) {
		struct css_set *cset;

		spin_lock_irq(&css_set_lock);
		cset = task_css_set(current);
		if (list_empty(&child->cg_list)) {
			get_css_set(cset);
			cset->nr_tasks++;
			css_set_move_task(child, NULL, cset, false);
		}
		spin_unlock_irq(&css_set_lock);
	}

	/*
	 * Call ss->fork().  This must happen after @child is linked on
	 * css_set; otherwise, @child might change state between ->fork()
	 * and addition to css_set.
	 */
	do_each_subsys_mask(ss, i, have_fork_callback) {
		ss->fork(child);
	} while_each_subsys_mask();
}

/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Description: Detach cgroup from @tsk and release it.
 *
 * Note that cgroups marked notify_on_release force every task in
 * them to take the global cgroup_mutex mutex when exiting.
 * This could impact scaling on very large systems.  Be reluctant to
 * use notify_on_release cgroups where very high task exit scaling
 * is required on large systems.
 *
 * We set the exiting task's cgroup to the root cgroup (top_cgroup).  We
 * call cgroup_exit() while the task is still competent to handle
 * notify_on_release(), then leave the task attached to the root cgroup in
 * each hierarchy for the remainder of its exit.  No need to bother with
 * init_css_set refcnting.  init_css_set never goes away and we can't race
 * with migration path - PF_EXITING is visible to migration path.
 */
void cgroup_exit(struct task_struct *tsk)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	/*
	 * Unlink @tsk from its css_set.  As migration path can't race
	 * with us, we can check css_set and cg_list without synchronization.
	 */
	cset = task_css_set(tsk);

	if (!list_empty(&tsk->cg_list)) {
		spin_lock_irq(&css_set_lock);
		css_set_move_task(tsk, cset, NULL, false);
		cset->nr_tasks--;
		spin_unlock_irq(&css_set_lock);
	} else {
		get_css_set(cset);
	}

	/* see cgroup_post_fork() for details */
	do_each_subsys_mask(ss, i, have_exit_callback) {
		ss->exit(tsk);
	} while_each_subsys_mask();
}

void cgroup_free(struct task_struct *task)
{
	struct css_set *cset = task_css_set(task);
	struct cgroup_subsys *ss;
	int ssid;

	do_each_subsys_mask(ss, ssid, have_free_callback) {
		ss->free(task);
	} while_each_subsys_mask();

	put_css_set(cset);
}

static int __init cgroup_disable(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name) &&
			    strcmp(token, ss->legacy_name))
				continue;
			cgroup_disable_mask |= 1 << i;
		}
	}
	return 1;
}
__setup("cgroup_disable=", cgroup_disable);
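
/*
 * Usage sketch: booting with, for example, "cgroup_disable=memory,io" sets
 * the matching bits in cgroup_disable_mask, and cgroup_init() above then
 * turns those controllers off via their static branch keys.  The names in
 * the example are illustrative; any ss->name or ss->legacy_name is
 * accepted.
 */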

/**
 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.  If such css doesn't exist
 * or can't be pinned, an ERR_PTR value is returned.
 */
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
						       struct cgroup_subsys *ss)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct file_system_type *s_type = dentry->d_sb->s_type;
	struct cgroup_subsys_state *css = NULL;
	struct cgroup *cgrp;

	/* is @dentry a cgroup dir? */
	if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
	    !kn || kernfs_type(kn) != KERNFS_DIR)
		return ERR_PTR(-EBADF);

	rcu_read_lock();

	/*
	 * This path doesn't originate from kernfs and @kn could already
	 * have been or be removed at any point.  @kn->priv is RCU
	 * protected for this access.  See css_release_work_fn() for details.
	 */
	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
	if (cgrp)
		css = cgroup_css(cgrp, ss);

	if (!css || !css_tryget_online(css))
		css = ERR_PTR(-ENOENT);

	rcu_read_unlock();
	return css;
}

/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Returns the css if there's valid one with @id, otherwise returns NULL.
 * Should be called under rcu_read_lock().
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return idr_find(&ss->css_idr, id);
}
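
/*
 * Usage sketch: resolving an id recorded earlier (for example from
 * css->id) back to a pinned css.  The lookup must be bracketed by
 * rcu_read_lock() and the caller takes its own reference before using the
 * css outside the RCU section.  The helper name is hypothetical.
 */
static inline struct cgroup_subsys_state *
css_get_from_id_sketch(int id, struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, ss);
	if (css && !css_tryget_online(css))
		css = NULL;
	rcu_read_unlock();

	return css;
}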

/**
 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
 * @path: path on the default hierarchy
 *
 * Find the cgroup at @path on the default hierarchy, increment its
 * reference count and return it.  Returns pointer to the found cgroup on
 * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
 * if @path points to a non-directory.
 */
struct cgroup *cgroup_get_from_path(const char *path)
{
	struct kernfs_node *kn;
	struct cgroup *cgrp;

	mutex_lock(&cgroup_mutex);

	kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
	if (kn) {
		if (kernfs_type(kn) == KERNFS_DIR) {
			cgrp = kn->priv;
			cgroup_get_live(cgrp);
		} else {
			cgrp = ERR_PTR(-ENOTDIR);
		}
		kernfs_put(kn);
	} else {
		cgrp = ERR_PTR(-ENOENT);
	}

	mutex_unlock(&cgroup_mutex);
	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
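
/*
 * Usage sketch: looking up a v2 cgroup by path and dropping the reference
 * when done.  The path and the helper name are illustrative.
 */
static inline bool cgroup_path_is_populated_sketch(void)
{
	struct cgroup *cgrp;
	bool populated;

	cgrp = cgroup_get_from_path("/user.slice");
	if (IS_ERR(cgrp))
		return false;

	populated = cgroup_is_populated(cgrp);
	cgroup_put(cgrp);
	return populated;
}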

/**
 * cgroup_get_from_fd - get a cgroup pointer from a fd
 * @fd: fd obtained by open(cgroup2_dir)
 *
 * Find the cgroup from a fd which should be obtained
 * by opening a cgroup directory.  Returns a pointer to the
 * cgroup on success. ERR_PTR is returned if the cgroup
 * cannot be found.
 */
struct cgroup *cgroup_get_from_fd(int fd)
{
	struct cgroup_subsys_state *css;
	struct cgroup *cgrp;
	struct file *f;

	f = fget_raw(fd);
	if (!f)
		return ERR_PTR(-EBADF);

	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
	fput(f);
	if (IS_ERR(css))
		return ERR_CAST(css);

	cgrp = css->cgroup;
	if (!cgroup_on_dfl(cgrp)) {
		cgroup_put(cgrp);
		return ERR_PTR(-EBADF);
	}

	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
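
/*
 * Usage sketch: a typical caller resolves a userspace-supplied fd to a v2
 * cgroup and releases it with cgroup_put() when finished.  The helper name
 * and the use of cgrp->id as the extracted value are illustrative.
 */
static inline int cgroup_id_from_fd_sketch(int fd, int *id)
{
	struct cgroup *cgrp;

	cgrp = cgroup_get_from_fd(fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	*id = cgrp->id;
	cgroup_put(cgrp);
	return 0;
}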

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)

DEFINE_SPINLOCK(cgroup_sk_update_lock);
static bool cgroup_sk_alloc_disabled __read_mostly;

void cgroup_sk_alloc_disable(void)
{
	if (cgroup_sk_alloc_disabled)
		return;
	pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
	cgroup_sk_alloc_disabled = true;
}

#else

#define cgroup_sk_alloc_disabled	false

#endif

void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
	if (cgroup_sk_alloc_disabled)
		return;

	/* Socket clone path */
	if (skcd->val) {
		/*
		 * We might be cloning a socket which is left in an empty
		 * cgroup and the cgroup might have already been rmdir'd.
		 * Don't use cgroup_get_live().
		 */
		cgroup_get(sock_cgroup_ptr(skcd));
		return;
	}

	rcu_read_lock();

	while (true) {
		struct css_set *cset;

		cset = task_css_set(current);
		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
			skcd->val = (unsigned long)cset->dfl_cgrp;
			break;
		}
		cpu_relax();
	}

	rcu_read_unlock();
}

void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
	cgroup_put(sock_cgroup_ptr(skcd));
}

#endif	/* CONFIG_SOCK_CGROUP_DATA */

#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
		      enum bpf_attach_type type, bool overridable)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
#endif /* CONFIG_CGROUP_BPF */