// SPDX-License-Identifier: GPL-2.0
/*
 * CPU accounting code for task groups.
 *
 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
 * (balbir@in.ibm.com).
 */
#include "sched.h"

/* Time spent by the tasks of the CPU accounting group executing in ... */
enum cpuacct_stat_index {
	CPUACCT_STAT_USER,	/* ... user mode */
	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */

	CPUACCT_STAT_NSTATS,
};

static const char * const cpuacct_stat_desc[] = {
	[CPUACCT_STAT_USER] = "user",
	[CPUACCT_STAT_SYSTEM] = "system",
};

struct cpuacct_usage {
	u64	usages[CPUACCT_STAT_NSTATS];
	struct prev_cputime prev_cputime1; /* utime and stime */
	struct prev_cputime prev_cputime2; /* user and nice */
} ____cacheline_aligned;

/* track CPU usage of a group of tasks and its child groups */
struct cpuacct {
	struct cgroup_subsys_state	css;
	/* cpuusage holds pointer to a u64-type object on every CPU */
	struct cpuacct_usage __percpu	*cpuusage;
	struct kernel_cpustat __percpu	*cpustat;

	ALI_HOTFIX_RESERVE(1)
	ALI_HOTFIX_RESERVE(2)
	ALI_HOTFIX_RESERVE(3)
	ALI_HOTFIX_RESERVE(4)
};

static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct cpuacct, css) : NULL;
}

/* Return CPU accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
	return css_ca(task_css(tsk, cpuacct_cgrp_id));
}

static inline struct cpuacct *parent_ca(struct cpuacct *ca)
{
	return css_ca(ca->css.parent);
}

static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
static struct cpuacct root_cpuacct = {
	.cpustat	= &kernel_cpustat,
	.cpuusage	= &root_cpuacct_cpuusage,
};

/* Create a new CPU accounting group */
static struct cgroup_subsys_state *
cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cpuacct *ca;
	int i;

	if (!parent_css)
		return &root_cpuacct.css;

	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca)
		goto out;

	ca->cpuusage = alloc_percpu(struct cpuacct_usage);
	if (!ca->cpuusage)
		goto out_free_ca;

	ca->cpustat = alloc_percpu(struct kernel_cpustat);
	if (!ca->cpustat)
		goto out_free_cpuusage;

	for_each_possible_cpu(i) {
		prev_cputime_init(&per_cpu_ptr(ca->cpuusage, i)->prev_cputime1);
		prev_cputime_init(&per_cpu_ptr(ca->cpuusage, i)->prev_cputime2);
	}

	return &ca->css;

out_free_cpuusage:
	free_percpu(ca->cpuusage);
out_free_ca:
	kfree(ca);
out:
	return ERR_PTR(-ENOMEM);
}

/* Destroy an existing CPU accounting group */
static void cpuacct_css_free(struct cgroup_subsys_state *css)
{
	struct cpuacct *ca = css_ca(css);

	free_percpu(ca->cpustat);
	free_percpu(ca->cpuusage);
	kfree(ca);
}

static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
				 enum cpuacct_stat_index index)
{
	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
	u64 data;

	/*
	 * We allow index == CPUACCT_STAT_NSTATS here to read
	 * the sum of usages.
	 */
	BUG_ON(index > CPUACCT_STAT_NSTATS);

#ifndef CONFIG_64BIT
	/*
	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
	 */
	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif

	if (index == CPUACCT_STAT_NSTATS) {
		int i = 0;

		data = 0;
		for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
			data += cpuusage->usages[i];
	} else {
		data = cpuusage->usages[index];
	}

#ifndef CONFIG_64BIT
	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
#endif

	return data;
}

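/* Overwrite every usage counter of @cpu with @val (callers only pass 0, to reset) */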
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{
	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
	int i;

#ifndef CONFIG_64BIT
	/*
	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
	 */
	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif

	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
		cpuusage->usages[i] = val;

#ifndef CONFIG_64BIT
	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
#endif
}

/* Return total CPU usage (in nanoseconds) of a group */
static u64 __cpuusage_read(struct cgroup_subsys_state *css,
			   enum cpuacct_stat_index index)
{
	struct cpuacct *ca = css_ca(css);
	u64 totalcpuusage = 0;
	int i;

	for_each_possible_cpu(i)
		totalcpuusage += cpuacct_cpuusage_read(ca, i, index);

	return totalcpuusage;
}

static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
			      struct cftype *cft)
{
	return __cpuusage_read(css, CPUACCT_STAT_USER);
}

static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);
}

static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return __cpuusage_read(css, CPUACCT_STAT_NSTATS);
}

static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
			  u64 val)
{
	struct cpuacct *ca = css_ca(css);
	int cpu;

	/*
	 * Only allow '0' here to do a reset.
	 */
	if (val)
		return -EINVAL;

	for_each_possible_cpu(cpu)
		cpuacct_cpuusage_write(ca, cpu, 0);

	return 0;
}

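/* Print the selected usage counter (user, system, or their sum) for each possible CPU */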
static int __cpuacct_percpu_seq_show(struct seq_file *m,
				     enum cpuacct_stat_index index)
{
	struct cpuacct *ca = css_ca(seq_css(m));
	u64 percpu;
	int i;

	for_each_possible_cpu(i) {
		percpu = cpuacct_cpuusage_read(ca, i, index);
		seq_printf(m, "%llu ", (unsigned long long) percpu);
	}
	seq_printf(m, "\n");
	return 0;
}

static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
{
	return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);
}

static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
{
	return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);
}

static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
{
	return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
}

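/* cpuacct.usage_all: one row per possible CPU with the raw user and system usage in nanoseconds */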
static int cpuacct_all_seq_show(struct seq_file *m, void *V)
{
	struct cpuacct *ca = css_ca(seq_css(m));
	int index;
	int cpu;

	seq_puts(m, "cpu");
	for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
		seq_printf(m, " %s", cpuacct_stat_desc[index]);
	seq_puts(m, "\n");

	for_each_possible_cpu(cpu) {
		struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);

		seq_printf(m, "%d", cpu);

		for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
#ifndef CONFIG_64BIT
			/*
			 * Take rq->lock to make 64-bit read safe on 32-bit
			 * platforms.
			 */
			raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif

			seq_printf(m, " %llu", cpuusage->usages[index]);

#ifndef CONFIG_64BIT
			raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
#endif
		}
		seq_puts(m, "\n");
	}
	return 0;
}

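/*
 * cpuacct.stat: user (including nice) and system (including irq/softirq)
 * time of the whole group, reported in clock ticks.
 */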
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
	struct cpuacct *ca = css_ca(seq_css(sf));
	s64 val[CPUACCT_STAT_NSTATS];
	int cpu;
	int stat;

	memset(val, 0, sizeof(val));
	for_each_possible_cpu(cpu) {
		u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;

		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_USER];
		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_NICE];
		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
	}

	for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
		seq_printf(sf, "%s %lld\n",
			   cpuacct_stat_desc[stat],
			   (long long)nsec_to_clock_t(val[stat]));
	}

	return 0;
}

#ifdef CONFIG_SCHED_SLI
#ifndef arch_idle_time
#define arch_idle_time(cpu) 0
#endif

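/* task_group owned by the cpu controller for the same cgroup as this cpuacct */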
static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
{
	return container_of(global_cgroup_css(cgrp, cpu_cgrp_id),
				struct task_group, css);
}

static inline unsigned long nr_uninterruptible(void)
{
	unsigned long i, sum = 0;

	for_each_possible_cpu(i)
		sum += cpu_rq(i)->nr_uninterruptible;

	/*
	 * Since we read the counters lockless, it might be slightly
	 * inaccurate. Do not allow it to go below zero though:
	 */
	if (unlikely((long)sum < 0))
		sum = 0;

	return sum;
}

#ifdef CONFIG_CFS_BANDWIDTH
static inline bool tg_cfs_throttled(struct task_group *tg, int cpu)
{
	return tg->cfs_rq[cpu]->throttle_count;
}
#else
static inline bool tg_cfs_throttled(struct task_group *tg, int cpu)
{
	return false;
}
#endif

#ifdef CONFIG_RT_GROUP_SCHED
static inline bool tg_rt_throttled(struct task_group *tg, int cpu)
{
	return tg->rt_rq[cpu]->rt_throttled && !tg->rt_rq[cpu]->rt_nr_boosted;
}
#endif

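/* Runnable tasks of the group on @cpu; throttled cfs/rt runqueues contribute nothing */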
static unsigned long ca_running(struct cpuacct *ca, int cpu)
{
	unsigned long nr_running = 0;
	struct cgroup *cgrp = ca->css.cgroup;
	struct task_group *tg;

	/* Make sure it is only called for non-root cpuacct */
	if (ca == &root_cpuacct)
		return 0;

	rcu_read_lock();
	tg = cgroup_tg(cgrp);
	if (unlikely(!tg))
		goto out;

	if (!tg_cfs_throttled(tg, cpu))
		nr_running += tg->cfs_rq[cpu]->h_nr_running;
#ifdef CONFIG_RT_GROUP_SCHED
	if (!tg_rt_throttled(tg, cpu))
		nr_running += tg->rt_rq[cpu]->rt_nr_running;
#endif
	/* SCHED_DEADLINE doesn't support cgroup yet */

out:
	rcu_read_unlock();
	return nr_running;
}

static unsigned long ca_uninterruptible(struct cpuacct *ca, int cpu)
{
	unsigned long nr = 0;
	struct cgroup *cgrp = ca->css.cgroup;
	struct task_group *tg;

	/* Make sure it is only called for non-root cpuacct */
	if (ca == &root_cpuacct)
		return nr;

	rcu_read_lock();
	tg = cgroup_tg(cgrp);
	if (unlikely(!tg))
		goto out_rcu_unlock;

	nr = tg->cfs_rq[cpu]->nr_uninterruptible;
#ifdef CONFIG_RT_GROUP_SCHED
	nr += tg->rt_rq[cpu]->nr_uninterruptible;
#endif

out_rcu_unlock:
	rcu_read_unlock();
	return nr;
}

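/*
 * Fill in a /proc/stat style breakdown for one CPU of the group.
 *
 * cputime_adjust() is applied twice: first to split the group's total
 * runtime into (user + nice) vs. system time using the tick counters as
 * the ratio, then to split the (user + nice) share into user vs. nice.
 * Steal time is approximated by the group entity's runqueue wait time.
 */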
static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
		struct task_group *tg, struct cpuacct_usage_result *res)
{
	struct kernel_cpustat *kcpustat;
	struct cpuacct_usage *cpuusage;
	struct task_cputime cputime;
	u64 tick_user, tick_nice, tick_sys, left, right;
	struct sched_entity *se;

	kcpustat = per_cpu_ptr(ca->cpustat, cpu);
	if (unlikely(!tg)) {
		memset(res, 0, sizeof(*res));
		return;
	}

	se = tg->se[cpu];
	cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
	tick_user = kcpustat->cpustat[CPUTIME_USER];
	tick_nice = kcpustat->cpustat[CPUTIME_NICE];
	tick_sys = kcpustat->cpustat[CPUTIME_SYSTEM];

	/* Calculate system run time */
	cputime.sum_exec_runtime = cpuusage->usages[CPUACCT_STAT_USER] +
			cpuusage->usages[CPUACCT_STAT_SYSTEM];
	cputime.utime = tick_user + tick_nice;
	cputime.stime = tick_sys;
	cputime_adjust(&cputime, &cpuusage->prev_cputime1, &left, &right);
	res->system = right;

	/* Calculate user and nice run time */
	cputime.sum_exec_runtime = left; /* user + nice */
	cputime.utime = tick_user;
	cputime.stime = tick_nice;
	cputime_adjust(&cputime, &cpuusage->prev_cputime2, &left, &right);
	res->user = left;
	res->nice = right;

	res->irq = kcpustat->cpustat[CPUTIME_IRQ];
	res->softirq = kcpustat->cpustat[CPUTIME_SOFTIRQ];
	if (se)
		res->steal = se->statistics.wait_sum;
	else
		res->steal = 0;
	res->guest = kcpustat->cpustat[CPUTIME_GUEST];
	res->guest_nice = kcpustat->cpustat[CPUTIME_GUEST_NICE];
}

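/*
 * cpuacct.proc_stat: /proc/stat style statistics scoped to this cgroup.
 * The root group reports the global kernel_cpustat counters instead.
 */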
static int cpuacct_proc_stats_show(struct seq_file *sf, void *v)
{
	struct cpuacct *ca = css_ca(seq_css(sf));
	struct cgroup *cgrp = seq_css(sf)->cgroup;
	u64 user, nice, system, idle, iowait, irq, softirq, steal, guest;
	unsigned long nr_run = 0, nr_uninter = 0;
	int cpu;

	user = nice = system = idle = iowait =
		irq = softirq = steal = guest = 0;

	if (ca != &root_cpuacct) {
		struct cpuacct_usage_result res;

		for_each_possible_cpu(cpu) {
			if (!housekeeping_cpu(cpu, HK_FLAG_DOMAIN))
				continue;

			rcu_read_lock();
			__cpuacct_get_usage_result(ca, cpu,
					cgroup_tg(cgrp), &res);
			rcu_read_unlock();

			user += res.user;
			nice += res.nice;
			system += res.system;
			irq += res.irq;
			softirq += res.softirq;
			steal += res.steal;
			guest += res.guest;
			guest += res.guest_nice;
			iowait += res.iowait;
			idle += res.idle;

			nr_run += ca_running(ca, cpu);
			nr_uninter += ca_uninterruptible(ca, cpu);
		}
	} else {
		struct kernel_cpustat *kcpustat;

		for_each_possible_cpu(cpu) {
			kcpustat = per_cpu_ptr(ca->cpustat, cpu);
			user += kcpustat->cpustat[CPUTIME_USER];
			nice += kcpustat->cpustat[CPUTIME_NICE];
			system += kcpustat->cpustat[CPUTIME_SYSTEM];
			irq += kcpustat->cpustat[CPUTIME_IRQ];
			softirq += kcpustat->cpustat[CPUTIME_SOFTIRQ];
			guest += kcpustat->cpustat[CPUTIME_GUEST];
			guest += kcpustat->cpustat[CPUTIME_GUEST_NICE];
			idle += get_idle_time(cpu);
			iowait += get_iowait_time(cpu);
			steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
		}

		nr_run = nr_running();
		nr_uninter = nr_uninterruptible();
	}

	seq_printf(sf, "user %lld\n", nsec_to_clock_t(user));
	seq_printf(sf, "nice %lld\n", nsec_to_clock_t(nice));
	seq_printf(sf, "system %lld\n", nsec_to_clock_t(system));
	seq_printf(sf, "idle %lld\n", nsec_to_clock_t(idle));
	seq_printf(sf, "iowait %lld\n", nsec_to_clock_t(iowait));
	seq_printf(sf, "irq %lld\n", nsec_to_clock_t(irq));
	seq_printf(sf, "softirq %lld\n", nsec_to_clock_t(softirq));
	seq_printf(sf, "steal %lld\n", nsec_to_clock_t(steal));
	seq_printf(sf, "guest %lld\n", nsec_to_clock_t(guest));

	seq_printf(sf, "nr_running %lld\n", (u64)nr_run);
	if ((long) nr_uninter < 0)
		nr_uninter = 0;
	seq_printf(sf, "nr_uninterruptible %lld\n", (u64)nr_uninter);

	return 0;
}
#endif

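/* Control files exported by the cpuacct controller on the legacy (v1) hierarchy */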
static struct cftype files[] = {
	{
		.name = "usage",
		.read_u64 = cpuusage_read,
		.write_u64 = cpuusage_write,
	},
	{
		.name = "usage_user",
		.read_u64 = cpuusage_user_read,
	},
	{
		.name = "usage_sys",
		.read_u64 = cpuusage_sys_read,
	},
	{
		.name = "usage_percpu",
		.seq_show = cpuacct_percpu_seq_show,
	},
	{
		.name = "usage_percpu_user",
		.seq_show = cpuacct_percpu_user_seq_show,
	},
	{
		.name = "usage_percpu_sys",
		.seq_show = cpuacct_percpu_sys_seq_show,
	},
	{
		.name = "usage_all",
		.seq_show = cpuacct_all_seq_show,
	},
	{
		.name = "stat",
		.seq_show = cpuacct_stats_show,
	},
#ifdef CONFIG_SCHED_SLI
	{
		.name = "proc_stat",
		.seq_show = cpuacct_proc_stats_show,
	},
#endif
	{ }	/* terminate */
};

/*
 * charge this task's execution time to its accounting group.
 *
 * called with rq->lock held.
 */
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
	struct cpuacct *ca;
	int index = CPUACCT_STAT_SYSTEM;
	struct pt_regs *regs = task_pt_regs(tsk);

	if (regs && user_mode(regs))
		index = CPUACCT_STAT_USER;

	rcu_read_lock();

	for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
		this_cpu_ptr(ca->cpuusage)->usages[index] += cputime;

	rcu_read_unlock();
}

/*
 * Add user/system time to cpuacct.
 *
 * Note: it's the caller that updates the account of the root cgroup.
 */
void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
	struct cpuacct *ca;

	rcu_read_lock();
	for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
		this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
	rcu_read_unlock();
}

struct cgroup_subsys cpuacct_cgrp_subsys = {
	.css_alloc	= cpuacct_css_alloc,
	.css_free	= cpuacct_css_free,
	.legacy_cftypes	= files,
	.early_init	= true,
};

#ifdef CONFIG_PSI

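/* Booting with "psi_v1=1" exposes the PSI files on this legacy hierarchy; default is off */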
static bool psi_v1_enable;
static int __init setup_psi_v1(char *str)
{
	return kstrtobool(str, &psi_v1_enable) == 0;
}
__setup("psi_v1=", setup_psi_v1);

static int __init cgroup_v1_psi_init(void)
{
	if (!psi_v1_enable) {
		static_branch_enable(&psi_v1_disabled);
		return 0;
	}

	cgroup_add_legacy_cftypes(&cpuacct_cgrp_subsys, cgroup_v1_psi_files);
	return 0;
}

late_initcall_sync(cgroup_v1_psi_init);
#endif