diff --git a/kernel/sched.c b/kernel/sched.c index f96be9370b750d7b13cd848f67f13f3a65c77d89..bae6fcfe6d758539fa5d53b924607308b9f2b638 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8997,6 +8997,23 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_unlock(); } +/* + * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large + * in cputime_t units. As a result, cpuacct_update_stats calls + * percpu_counter_add with values large enough to always overflow the + * per cpu batch limit causing bad SMP scalability. + * + * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we + * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled + * and enabled. We cap it at INT_MAX which is the largest allowed batch value. + */ +#ifdef CONFIG_SMP +#define CPUACCT_BATCH \ + min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) +#else +#define CPUACCT_BATCH 0 +#endif + /* * Charge the system/user time to the task's accounting group. */ @@ -9004,6 +9021,7 @@ static void cpuacct_update_stats(struct task_struct *tsk, enum cpuacct_stat_index idx, cputime_t val) { struct cpuacct *ca; + int batch = CPUACCT_BATCH; if (unlikely(!cpuacct_subsys.active)) return; @@ -9012,7 +9030,7 @@ static void cpuacct_update_stats(struct task_struct *tsk, ca = task_ca(tsk); do { - percpu_counter_add(&ca->cpustat[idx], val); + __percpu_counter_add(&ca->cpustat[idx], val, batch); ca = ca->parent; } while (ca); rcu_read_unlock();