cputime.c 16.7 KB
Newer Older
1 2 3 4 5
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/tsacct_kern.h>
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
6
#include <linux/context_tracking.h>
7 8 9 10 11 12 13
#include "sched.h"


#ifdef CONFIG_IRQ_TIME_ACCOUNTING

/*
 * There are no locks covering percpu hardirq/softirq time.
14
 * They are only modified in vtime_account, on corresponding CPU
15 16 17
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in other CPU reading this CPU's irq time and can
18
 * race with irq/vtime_account on this CPU. We would either get old
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 * or new value with a side effect of accounting a slice of irq time to wrong
 * task when irq is in progress while we read rq->clock. That is a worthy
 * compromise in place of having locks on each irq in account_system_time.
 */
DEFINE_PER_CPU(u64, cpu_hardirq_time);
DEFINE_PER_CPU(u64, cpu_softirq_time);

static DEFINE_PER_CPU(u64, irq_start_time);
static int sched_clock_irqtime;

void enable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 1;
}

void disable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 0;
}

#ifndef CONFIG_64BIT
DEFINE_PER_CPU(seqcount_t, irq_time_seq);
#endif /* CONFIG_64BIT */

/*
 * Called before incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
47
void irqtime_account_irq(struct task_struct *curr)
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
{
	unsigned long flags;
	s64 delta;
	int cpu;

	if (!sched_clock_irqtime)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
	__this_cpu_add(irq_start_time, delta);

	irq_time_write_begin();
	/*
	 * We do not account for softirq time from ksoftirqd here.
	 * We want to continue accounting softirq time to ksoftirqd thread
	 * in that case, so as not to confuse scheduler with a special task
	 * that do not consume any time, but still wants to run.
	 */
	if (hardirq_count())
		__this_cpu_add(cpu_hardirq_time, delta);
	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
		__this_cpu_add(cpu_softirq_time, delta);

	irq_time_write_end();
	local_irq_restore(flags);
}
77
EXPORT_SYMBOL_GPL(irqtime_account_irq);
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291

static int irqtime_account_hi_update(void)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	unsigned long flags;
	u64 latest_ns;
	int ret = 0;

	local_irq_save(flags);
	latest_ns = this_cpu_read(cpu_hardirq_time);
	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
		ret = 1;
	local_irq_restore(flags);
	return ret;
}

static int irqtime_account_si_update(void)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	unsigned long flags;
	u64 latest_ns;
	int ret = 0;

	local_irq_save(flags);
	latest_ns = this_cpu_read(cpu_softirq_time);
	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
		ret = 1;
	local_irq_restore(flags);
	return ret;
}

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

#define sched_clock_irqtime	(0)

#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

static inline void task_group_account_field(struct task_struct *p, int index,
					    u64 tmp)
{
#ifdef CONFIG_CGROUP_CPUACCT
	struct kernel_cpustat *kcpustat;
	struct cpuacct *ca;
#endif
	/*
	 * Since all updates are sure to touch the root cgroup, we
	 * get ourselves ahead and touch it first. If the root cgroup
	 * is the only cgroup, then nothing else should be necessary.
	 *
	 */
	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;

#ifdef CONFIG_CGROUP_CPUACCT
	if (unlikely(!cpuacct_subsys.active))
		return;

	rcu_read_lock();
	ca = task_ca(p);
	while (ca && (ca != &root_cpuacct)) {
		kcpustat = this_cpu_ptr(ca->cpustat);
		kcpustat->cpustat[index] += tmp;
		ca = parent_ca(ca);
	}
	rcu_read_unlock();
#endif
}

/*
 * Account user cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in user space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
void account_user_time(struct task_struct *p, cputime_t cputime,
		       cputime_t cputime_scaled)
{
	int index;

	/* Add user time to process. */
	p->utime += cputime;
	p->utimescaled += cputime_scaled;
	account_group_user_time(p, cputime);

	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

	/* Add user time to cpustat. */
	task_group_account_field(p, index, (__force u64) cputime);

	/* Account for user time used */
	acct_update_integrals(p);
}

/*
 * Account guest cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in virtual machine since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
static void account_guest_time(struct task_struct *p, cputime_t cputime,
			       cputime_t cputime_scaled)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process. */
	p->utime += cputime;
	p->utimescaled += cputime_scaled;
	account_group_user_time(p, cputime);
	p->gtime += cputime;

	/* Add guest time to cpustat. */
	if (TASK_NICE(p) > 0) {
		cpustat[CPUTIME_NICE] += (__force u64) cputime;
		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
	} else {
		cpustat[CPUTIME_USER] += (__force u64) cputime;
		cpustat[CPUTIME_GUEST] += (__force u64) cputime;
	}
}

/*
 * Account system cpu time to a process and desired cpustat field
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 * @target_cputime64: pointer to cpustat field that has to be updated
 */
static inline
void __account_system_time(struct task_struct *p, cputime_t cputime,
			cputime_t cputime_scaled, int index)
{
	/* Add system time to process. */
	p->stime += cputime;
	p->stimescaled += cputime_scaled;
	account_group_system_time(p, cputime);

	/* Add system time to cpustat. */
	task_group_account_field(p, index, (__force u64) cputime);

	/* Account for system time used */
	acct_update_integrals(p);
}

/*
 * Account system cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
void account_system_time(struct task_struct *p, int hardirq_offset,
			 cputime_t cputime, cputime_t cputime_scaled)
{
	int index;

	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
		account_guest_time(p, cputime, cputime_scaled);
		return;
	}

	if (hardirq_count() - hardirq_offset)
		index = CPUTIME_IRQ;
	else if (in_serving_softirq())
		index = CPUTIME_SOFTIRQ;
	else
		index = CPUTIME_SYSTEM;

	__account_system_time(p, cputime, cputime_scaled, index);
}

/*
 * Account for involuntary wait time.
 * @cputime: the cpu time spent in involuntary wait
 */
void account_steal_time(cputime_t cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	cpustat[CPUTIME_STEAL] += (__force u64) cputime;
}

/*
 * Account for idle time.
 * @cputime: the cpu time spent in idle wait
 */
void account_idle_time(cputime_t cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	struct rq *rq = this_rq();

	if (atomic_read(&rq->nr_iowait) > 0)
		cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
	else
		cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}

static __always_inline bool steal_account_process_tick(void)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		u64 steal, st = 0;

		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;

		st = steal_ticks(steal);
		this_rq()->prev_steal_time += st * TICK_NSEC;

		account_steal_time(st);
		return st;
	}
#endif
	return false;
}

292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319
/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	struct task_struct *t;

	times->utime = sig->utime;
	times->stime = sig->stime;
	times->sum_exec_runtime = sig->sum_sched_runtime;

	rcu_read_lock();
	/* make sure we can trust tsk->thread_group list */
	if (!likely(pid_alive(tsk)))
		goto out;

	t = tsk;
	do {
		times->utime += t->utime;
		times->stime += t->stime;
		times->sum_exec_runtime += task_sched_runtime(t);
	} while_each_thread(tsk, t);
out:
	rcu_read_unlock();
}

320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
 * @p: the process that the cpu time gets accounted to
 * @user_tick: is the tick from userspace
 * @rq: the pointer to rq
 *
 * Tick demultiplexing follows the order
 * - pending hardirq update
 * - pending softirq update
 * - user_time
 * - idle_time
 * - system time
 *   - check for guest_time
 *   - else account as system_time
 *
 * Check for hardirq is done both for system and user time as there is
 * no timer going off while we are on hardirq and hence we may never get an
 * opportunity to update it solely in system time.
 * p->stime and friends are only updated on system time and not on irq
 * softirq as those do not count in task exec_runtime any more.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
						struct rq *rq)
{
	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	if (steal_account_process_tick())
		return;

	if (irqtime_account_hi_update()) {
		cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
	} else if (irqtime_account_si_update()) {
		cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
	} else if (this_cpu_ksoftirqd() == p) {
		/*
		 * ksoftirqd time do not get accounted in cpu_softirq_time.
		 * So, we have to handle it separately here.
		 * Also, p->stime needs to be updated for ksoftirqd.
		 */
		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
					CPUTIME_SOFTIRQ);
	} else if (user_tick) {
		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
	} else if (p == rq->idle) {
		account_idle_time(cputime_one_jiffy);
	} else if (p->flags & PF_VCPU) { /* System time or guest time */
		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
	} else {
		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
					CPUTIME_SYSTEM);
	}
}

static void irqtime_account_idle_ticks(int ticks)
{
	int i;
	struct rq *rq = this_rq();

	for (i = 0; i < ticks; i++)
		irqtime_account_process_tick(current, 0, rq);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
384 385
static inline void irqtime_account_idle_ticks(int ticks) {}
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
386 387 388
						struct rq *rq) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

389
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
390 391 392 393 394 395 396 397 398 399
/*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
	struct rq *rq = this_rq();

400 401 402
	if (vtime_accounting_enabled())
		return;

403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443
	if (sched_clock_irqtime) {
		irqtime_account_process_tick(p, user_tick, rq);
		return;
	}

	if (steal_account_process_tick())
		return;

	if (user_tick)
		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
				    one_jiffy_scaled);
	else
		account_idle_time(cputime_one_jiffy);
}

/*
 * Account multiple ticks of steal time.
 * @p: the process from which the cpu time has been stolen
 * @ticks: number of stolen ticks
 */
void account_steal_ticks(unsigned long ticks)
{
	account_steal_time(jiffies_to_cputime(ticks));
}

/*
 * Account multiple ticks of idle time.
 * @ticks: number of stolen ticks
 */
void account_idle_ticks(unsigned long ticks)
{

	if (sched_clock_irqtime) {
		irqtime_account_idle_ticks(ticks);
		return;
	}

	account_idle_time(jiffies_to_cputime(ticks));
}
444
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
445 446 447 448 449

/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
450
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
451 452 453 454 455
{
	*ut = p->utime;
	*st = p->stime;
}

456
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
457 458 459 460 461 462 463 464
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);

	*ut = cputime.utime;
	*st = cputime.stime;
}
465

466
void vtime_account_system_irqsafe(struct task_struct *tsk)
467 468 469 470
{
	unsigned long flags;

	local_irq_save(flags);
471
	vtime_account_system(tsk);
472 473
	local_irq_restore(flags);
}
474
EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
475

476 477 478
#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_task_switch(struct task_struct *prev)
{
479 480 481
	if (!vtime_accounting_enabled())
		return;

482 483 484 485 486
	if (is_idle_task(prev))
		vtime_account_idle(prev);
	else
		vtime_account_system(prev);

487
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
488
	vtime_account_user(prev);
489
#endif
490 491 492
	arch_vtime_task_switch(prev);
}
#endif
493

494 495 496
/*
 * Archs that account the whole time spent in the idle task
 * (outside irq) as idle time can rely on this and just implement
497
 * vtime_account_system() and vtime_account_idle(). Archs that
498 499 500 501 502 503 504
 * have other meaning of the idle time (s390 only includes the
 * time spent by the CPU when it's in low power mode) must override
 * vtime_account().
 */
#ifndef __ARCH_HAS_VTIME_ACCOUNT
void vtime_account(struct task_struct *tsk)
{
505 506 507
	if (!vtime_accounting_enabled())
		return;

508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525
	if (!in_interrupt()) {
		/*
		 * If we interrupted user, context_tracking_in_user()
		 * is 1 because the context tracking don't hook
		 * on irq entry/exit. This way we know if
		 * we need to flush user time on kernel entry.
		 */
		if (context_tracking_in_user()) {
			vtime_account_user(tsk);
			return;
		}

		if (is_idle_task(tsk)) {
			vtime_account_idle(tsk);
			return;
		}
	}
	vtime_account_system(tsk);
526 527 528 529
}
EXPORT_SYMBOL_GPL(vtime_account);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */

530
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
531 532 533 534 535 536 537 538 539 540 541 542 543 544 545

static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
{
	u64 temp = (__force u64) rtime;

	temp *= (__force u64) utime;

	if (sizeof(cputime_t) == 4)
		temp = div_u64(temp, (__force u32) total);
	else
		temp = div64_u64(temp, (__force u64) total);

	return (__force cputime_t) temp;
}

546 547 548 549
/*
 * Adjust tick based cputime random precision against scheduler
 * runtime accounting.
 */
550 551 552
static void cputime_adjust(struct task_cputime *curr,
			   struct cputime *prev,
			   cputime_t *ut, cputime_t *st)
553
{
554
	cputime_t rtime, utime, total;
555

556 557
	utime = curr->utime;
	total = utime + curr->stime;
558

559
	/*
560 561 562 563 564 565 566 567
	 * Tick based cputime accounting depend on random scheduling
	 * timeslices of a task to be interrupted or not by the timer.
	 * Depending on these circumstances, the number of these interrupts
	 * may be over or under-optimistic, matching the real user and system
	 * cputime with a variable precision.
	 *
	 * Fix this by scaling these tick based values against the total
	 * runtime accounted by the CFS scheduler.
568
	 */
569
	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
570 571 572 573 574 575 576

	if (total)
		utime = scale_utime(utime, rtime, total);
	else
		utime = rtime;

	/*
577 578 579
	 * If the tick based count grows faster than the scheduler one,
	 * the result of the scaling may go backward.
	 * Let's enforce monotonicity.
580
	 */
581 582 583 584 585 586
	prev->utime = max(prev->utime, utime);
	prev->stime = max(prev->stime, rtime - prev->utime);

	*ut = prev->utime;
	*st = prev->stime;
}
587

588 589 590 591 592 593 594 595 596
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime cputime = {
		.utime = p->utime,
		.stime = p->stime,
		.sum_exec_runtime = p->se.sum_exec_runtime,
	};

	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
597 598 599 600 601
}

/*
 * Must be called with siglock held.
 */
602
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
603 604 605 606
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);
607
	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
608
}
609
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static DEFINE_PER_CPU(unsigned long long, cputime_snap);

static cputime_t get_vtime_delta(void)
{
	unsigned long long delta;

	delta = sched_clock() - __this_cpu_read(cputime_snap);
	__this_cpu_add(cputime_snap, delta);

	/* CHECKME: always safe to convert nsecs to cputime? */
	return nsecs_to_cputime(delta);
}

void vtime_account_system(struct task_struct *tsk)
{
627 628 629 630
	cputime_t delta_cpu;

	if (!vtime_accounting_enabled())
		return;
631

632
	delta_cpu = get_vtime_delta();
633 634 635 636 637
	account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
}

void vtime_account_user(struct task_struct *tsk)
{
638 639 640 641 642 643
	cputime_t delta_cpu;

	if (!vtime_accounting_enabled())
		return;

	delta_cpu = get_vtime_delta();
644 645 646 647 648 649 650 651 652 653

	account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
}

void vtime_account_idle(struct task_struct *tsk)
{
	cputime_t delta_cpu = get_vtime_delta();

	account_idle_time(delta_cpu);
}
654 655 656 657 658

bool vtime_accounting_enabled(void)
{
	return context_tracking_active();
}
659
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */