diff --git a/include/linux/sched.h b/include/linux/sched.h
index 07f374d18577034fd2b1156d3d96d90736e275d9..e7d7b9952f023dba0f02c314f8bf9095700f0995 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -468,9 +468,13 @@ struct sched_entity {
 	u64				cg_idle_start;
 	u64				cg_idle_sum;
 	u64				cg_init_time;
+	u64				cg_nr_iowait;
+	u64				cg_iowait_sum;
+	u64				cg_iowait_start;
 	u64				cg_ineffective_sum;
 	u64				cg_ineffective_start;
 	seqcount_t			idle_seqcount;
+	spinlock_t			iowait_lock;
 
 	u64				nr_migrations;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7510ec0678a10e7600af8a7411fa9db5cd521184..6b3e01949debea96d8d76a62a766a3dfe43425d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2046,6 +2046,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	if (p->in_iowait) {
 		delayacct_blkio_end(p);
 		atomic_dec(&task_rq(p)->nr_iowait);
+		update_nr_iowait(p, -1);
 	}
 
 	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
@@ -2060,6 +2061,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	if (p->in_iowait) {
 		delayacct_blkio_end(p);
 		atomic_dec(&task_rq(p)->nr_iowait);
+		update_nr_iowait(p, -1);
 	}
 
 #endif /* CONFIG_SMP */
@@ -3436,6 +3438,7 @@ static void __sched notrace __schedule(bool preempt)
 
 		if (prev->in_iowait) {
 			atomic_inc(&rq->nr_iowait);
+			update_nr_iowait(prev, 1);
 			delayacct_blkio_start();
 		}
 	}
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 52f0307348bf4f62ca25f9875e9320f92317e317..eaacf6edf69b0d40e3f44e00daf16d2675aa2aa7 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -447,6 +447,7 @@ static unsigned long ca_uninterruptible(struct cpuacct *ca, int cpu)
 
 void cgroup_idle_start(struct sched_entity *se)
 {
+	unsigned long flags;
 	u64 clock;
 
 	if (!schedstat_enabled())
@@ -457,12 +458,18 @@ void cgroup_idle_start(struct sched_entity *se)
 	write_seqcount_begin(&se->idle_seqcount);
 	__schedstat_set(se->cg_idle_start, clock);
 	write_seqcount_end(&se->idle_seqcount);
+
+	spin_lock_irqsave(&se->iowait_lock, flags);
+	if (schedstat_val(se->cg_nr_iowait))
+		__schedstat_set(se->cg_iowait_start, clock);
+	spin_unlock_irqrestore(&se->iowait_lock, flags);
 }
 
 void cgroup_idle_end(struct sched_entity *se)
 {
+	unsigned long flags;
 	u64 clock;
-	u64 idle_start;
+	u64 idle_start, iowait_start;
 
 	if (!schedstat_enabled())
 		return;
@@ -474,6 +481,14 @@ void cgroup_idle_end(struct sched_entity *se)
 	__schedstat_add(se->cg_idle_sum, clock - idle_start);
 	__schedstat_set(se->cg_idle_start, 0);
 	write_seqcount_end(&se->idle_seqcount);
+
+	spin_lock_irqsave(&se->iowait_lock, flags);
+	if (schedstat_val(se->cg_nr_iowait)) {
+		iowait_start = schedstat_val(se->cg_iowait_start);
+		__schedstat_add(se->cg_iowait_sum, clock - iowait_start);
+		__schedstat_set(se->cg_iowait_start, 0);
+	}
+	spin_unlock_irqrestore(&se->iowait_lock, flags);
 }
 
 void cpuacct_cpuset_changed(struct cgroup *cgrp, struct cpumask *deleted,
@@ -561,8 +576,9 @@ static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
 		res->softirq = kcpustat->cpustat[CPUTIME_SOFTIRQ];
 	if (se && schedstat_enabled()) {
 		unsigned int seq;
+		unsigned long flags;
 		u64 idle_start, ineff, ineff_start, elapse, complement;
-		u64 clock;
+		u64 clock, iowait_start;
 
 		do {
 			seq = read_seqcount_begin(&se->idle_seqcount);
@@ -578,6 +594,13 @@ static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
 		if (ineff_start)
 			__schedstat_add(ineff, clock - ineff_start);
 
+		spin_lock_irqsave(&se->iowait_lock, flags);
+		res->iowait = schedstat_val(se->cg_iowait_sum);
+		iowait_start = schedstat_val(se->cg_iowait_start);
+		if (iowait_start)
+			__schedstat_add(res->iowait, clock - iowait_start);
+		spin_unlock_irqrestore(&se->iowait_lock, flags);
+
 		res->steal = 0;
 
 		elapse = clock - schedstat_val(se->cg_init_time);
@@ -585,6 +608,7 @@ static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
 		if (elapse > complement)
 			res->steal = elapse - complement;
 
+		res->idle -= res->iowait;
 	} else {
 		res->idle = res->iowait = res->steal = 0;
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c5cc33b2882327f870918d353b7e54bd4d4f453f..08671ef70091f8ca0b9e04e047e102ccf4e64939 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10032,6 +10032,44 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_SCHED_SLI
+static void update_nr_iowait_fair(struct task_struct *p, long inc)
+{
+	unsigned long flags;
+	struct sched_entity *se = p->se.parent;
+	u64 clock;
+
+	if (!schedstat_enabled())
+		return;
+
+	clock = __rq_clock_broken(cpu_rq(p->cpu));
+
+	for_each_sched_entity(se) {
+		/*
+		 * Avoid taking rq->lock from the try_to_wake_up hot path,
+		 * at the price of poor consistency across the cgroup
+		 * hierarchy, which we can tolerate.
+		 * Accessing se->on_rq does require rq->lock; we already
+		 * hold it here, because when inc == 1 the callers are
+		 * __schedule and task_move_group_fair.
+		 */
+		spin_lock_irqsave(&se->iowait_lock, flags);
+		if (!se->on_rq && !schedstat_val(se->cg_nr_iowait) && inc > 0)
+			__schedstat_set(se->cg_iowait_start, clock);
+		if (schedstat_val(se->cg_iowait_start) > 0 &&
+		    schedstat_val(se->cg_nr_iowait) + inc == 0) {
+			__schedstat_add(se->cg_iowait_sum, clock -
+					schedstat_val(se->cg_iowait_start));
+			__schedstat_set(se->cg_iowait_start, 0);
+		}
+		__schedstat_add(se->cg_nr_iowait, inc);
+		spin_unlock_irqrestore(&se->iowait_lock, flags);
+	}
+}
+#else
+static void update_nr_iowait_fair(struct task_struct *p, long inc) {}
+#endif
+
 static void task_set_group_fair(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
@@ -10042,6 +10080,8 @@ static void task_set_group_fair(struct task_struct *p)
 
 static void task_move_group_fair(struct task_struct *p)
 {
+	if (p->in_iowait)
+		update_nr_iowait_fair(p, -1);
 	detach_task_cfs_rq(p);
 	set_task_rq(p, task_cpu(p));
 
@@ -10050,6 +10090,8 @@ static void task_move_group_fair(struct task_struct *p)
 	p->se.avg.last_update_time = 0;
 #endif
 	attach_task_cfs_rq(p);
+	if (p->in_iowait)
+		update_nr_iowait_fair(p, 1);
 }
 
 static void task_change_group_fair(struct task_struct *p, int type)
@@ -10196,6 +10238,7 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	update_load_set(&se->load, NICE_0_LOAD);
 	se->parent = parent;
 	seqcount_init(&se->idle_seqcount);
+	spin_lock_init(&se->iowait_lock);
 	se->cg_idle_start = se->cg_init_time = cpu_clock(cpu);
 }
 
@@ -10322,6 +10365,7 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SCHED_SLI
 	.update_nr_uninterruptible = update_nr_uninterruptible_fair,
+	.update_nr_iowait = update_nr_iowait_fair,
 #endif
 };
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 06a75e694ef597d0219f2444458b4c2807a5a7e5..f1d1f314b1262363c088dddcdd899719b8929cba 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1677,6 +1677,7 @@ struct sched_class {
 #endif
 
 	void (*update_nr_uninterruptible)(struct task_struct *p, long inc);
+	void (*update_nr_iowait)(struct task_struct *p, long inc);
 };
 
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
@@ -1689,6 +1690,12 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
 	curr->sched_class->set_curr_task(rq);
 }
 
+static inline void update_nr_iowait(struct task_struct *p, long inc)
+{
+	if (p->sched_class->update_nr_iowait)
+		p->sched_class->update_nr_iowait(p, inc);
+}
+
 #ifdef CONFIG_SMP
 #define sched_class_highest (&stop_sched_class)
 #else
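
For reference, the bookkeeping the patch attaches to each cgroup sched_entity can be sketched in isolation. The user-space C below is illustrative only and is not part of the patch: the struct, now_ns() and cg_iowait_update() are invented names, a pthread mutex stands in for se->iowait_lock, and the kernel's additional !se->on_rq condition and schedstat wrappers are omitted. Only the start/stop logic mirrors update_nr_iowait_fair(): stamp cg_iowait_start when the group's iowait count rises from zero, and fold the elapsed time into cg_iowait_sum when it drops back to zero.

/*
 * Minimal user-space sketch (not kernel code) of the per-group iowait
 * accounting introduced above.  The kernel version additionally requires
 * !se->on_rq before stamping the start time and uses the schedstat
 * wrappers; those details are left out here.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct cg_iowait_acct {
	pthread_mutex_t lock;	/* stands in for se->iowait_lock */
	long nr_iowait;		/* stands in for se->cg_nr_iowait */
	uint64_t iowait_start;	/* stands in for se->cg_iowait_start */
	uint64_t iowait_sum;	/* stands in for se->cg_iowait_sum */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Mirrors the inc/dec logic of update_nr_iowait_fair(). */
static void cg_iowait_update(struct cg_iowait_acct *a, long inc)
{
	uint64_t clock = now_ns();

	pthread_mutex_lock(&a->lock);
	/* First task entering iowait: start the clock. */
	if (a->nr_iowait == 0 && inc > 0)
		a->iowait_start = clock;
	/* Last task leaving iowait: fold the interval into the sum. */
	if (a->iowait_start > 0 && a->nr_iowait + inc == 0) {
		a->iowait_sum += clock - a->iowait_start;
		a->iowait_start = 0;
	}
	a->nr_iowait += inc;
	pthread_mutex_unlock(&a->lock);
}

int main(void)
{
	struct cg_iowait_acct a = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct timespec d = { .tv_nsec = 50 * 1000 * 1000 };

	cg_iowait_update(&a, 1);	/* a task blocks on I/O */
	nanosleep(&d, NULL);		/* ~50 ms spent "in iowait" */
	cg_iowait_update(&a, -1);	/* the task is woken up */

	printf("iowait_sum = %llu ns\n", (unsigned long long)a.iowait_sum);
	return 0;
}

Built with cc -pthread, this prints roughly the 50 ms spent between the increment and the decrement, which is what cg_iowait_sum accumulates for a cgroup while at least one of its tasks sleeps in iowait (and, in the kernel version, while the group itself is otherwise idle).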