Unverified commit df9cfeee, authored by openeuler-ci-bot, committed by Gitee

!844 A patchset of sched to improve benchmark performance

Merge Pull Request from: @NNNNicole 
 
1. sched/pelt: Relax the sync of *_sum with *_avg (patches 1-3)
2. Adjust NUMA imbalance for multiple LLCs (patches 4-6)
3. sched: Queue task on wakelist in the same llc if the wakee cpu is idle (patch 7)
4. Clear ttwu_pending after enqueue_task (patch 8)
 
 
Link: https://gitee.com/openeuler/kernel/pulls/844

Reviewed-by: Zucheng Zheng <zhengzucheng@huawei.com> 
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com> 
@@ -153,8 +153,12 @@ struct sched_domain {
 		struct rcu_head rcu;	/* used during destruction */
 	};
 	struct sched_domain_shared *shared;
+#ifndef __GENKSYMS__
+	unsigned int imb_numa_nr;	/* Nr running tasks that allows a NUMA imbalance */
+	KABI_FILL_HOLE(unsigned int kabi_hole)
+#else
 	KABI_RESERVE(1)
+#endif
 	KABI_RESERVE(2)
 	unsigned int span_weight;
......
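The hunk above adds the per-domain imb_numa_nr threshold while keeping the kernel ABI stable: the new field is carved out of an existing KABI reserve slot, and the #ifndef __GENKSYMS__ guard hides the change from symbol-CRC generation. A minimal, self-contained sketch of that layout-preservation idea (the struct and field names below are hypothetical stand-ins, not the actual openEuler KABI macros):

```c
#include <stddef.h>

/* Hypothetical "old" layout: a couple of 64-bit slots reserved for later use. */
struct old_layout {
	void *shared;
	unsigned long long kabi_reserved1;
	unsigned long long kabi_reserved2;
	unsigned int span_weight;
};

/* Hypothetical "new" layout: the first reserved slot is split into the new
 * 32-bit member plus an explicit 32-bit hole, so every later offset and the
 * total size stay exactly the same. */
struct new_layout {
	void *shared;
	unsigned int imb_numa_nr;	/* new member */
	unsigned int kabi_hole;		/* pads out the rest of the old slot */
	unsigned long long kabi_reserved2;
	unsigned int span_weight;
};

_Static_assert(sizeof(struct old_layout) == sizeof(struct new_layout),
	       "KABI: struct size must not change");
_Static_assert(offsetof(struct old_layout, span_weight) ==
	       offsetof(struct new_layout, span_weight),
	       "KABI: offsets of existing members must not change");

int main(void) { return 0; }
```

The same reasoning is what the real KABI_FILL_HOLE/KABI_RESERVE pair enforces in the patch: out-of-tree modules built against the old struct keep working because nothing they can see has moved.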
@@ -2933,13 +2933,6 @@ void sched_ttwu_pending(void *arg)
 	if (!llist)
 		return;
-	/*
-	 * rq::ttwu_pending racy indication of out-standing wakeups.
-	 * Races such that false-negatives are possible, since they
-	 * are shorter lived that false-positives would be.
-	 */
-	WRITE_ONCE(rq->ttwu_pending, 0);
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
@@ -2953,6 +2946,17 @@ void sched_ttwu_pending(void *arg)
 		ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
 	}
+	/*
+	 * Must be after enqueueing at least one task such that
+	 * idle_cpu() does not observe a false-negative -- if it does,
+	 * it is possible for select_idle_siblings() to stack a number
+	 * of tasks on this CPU during that window.
+	 *
+	 * It is ok to clear ttwu_pending when another task pending.
+	 * We will receive IPI after local irq enabled and then enqueue it.
+	 * Since now nr_running > 0, idle_cpu() will always get correct result.
+	 */
+	WRITE_ONCE(rq->ttwu_pending, 0);
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -3026,7 +3030,7 @@ bool cpus_share_lowest_cache(int this_cpu, int that_cpu)
 	return per_cpu(sd_lowest_cache_id, this_cpu) == per_cpu(sd_lowest_cache_id, that_cpu);
 }
-static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+static inline bool ttwu_queue_cond(int cpu)
 {
 	/*
 	 * If the CPU does not share cache, then queue the task on the
@@ -3035,17 +3039,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
 	if (!cpus_share_cache(smp_processor_id(), cpu))
 		return true;
+	if (cpu == smp_processor_id())
+		return false;
 	/*
-	 * If the task is descheduling and the only running task on the
-	 * CPU then use the wakelist to offload the task activation to
-	 * the soon-to-be-idle CPU as the current CPU is likely busy.
-	 * nr_running is checked to avoid unnecessary task stacking.
+	 * If the wakee cpu is idle, or the task is descheduling and the
+	 * only running task on the CPU, then use the wakelist to offload
+	 * the task activation to the idle (or soon-to-be-idle) CPU as
+	 * the current CPU is likely busy. nr_running is checked to
+	 * avoid unnecessary task stacking.
 	 *
 	 * Note that we can only get here with (wakee) p->on_rq=0,
 	 * p->on_cpu can be whatever, we've done the dequeue, so
 	 * the wakee has been accounted out of ->nr_running.
 	 */
-	if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running)
+	if (!cpu_rq(cpu)->nr_running)
 		return true;
 	return false;
@@ -3053,10 +3061,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
 static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 {
-	if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
-		if (WARN_ON_ONCE(cpu == smp_processor_id()))
-			return false;
+	if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu)) {
 		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
 		__ttwu_queue_wakelist(p, cpu, wake_flags);
 		return true;
@@ -3333,7 +3338,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * scheduling.
 	 */
 	if (smp_load_acquire(&p->on_cpu) &&
-	    ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
+	    ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
 		goto unlock;
 	/*
@@ -3895,7 +3900,8 @@ static inline void prepare_task(struct task_struct *next)
 	 * Claim the task as running, we do this before switching to it
 	 * such that any running task will have this set.
 	 *
-	 * See the ttwu() WF_ON_CPU case and its ordering comment.
+	 * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
+	 * its ordering comment.
 	 */
 	WRITE_ONCE(next->on_cpu, 1);
 #endif
......
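Taken together, the core.c hunks above (patches 7 and 8 of the set) make try_to_wake_up() push a wakeup through the remote wakelist whenever the wakee CPU sits in another LLC or is currently idle, never for the local CPU, and they clear rq->ttwu_pending only after the queued tasks have actually been enqueued, so idle_cpu() cannot observe a stale "idle" indication. A rough userspace model of the new ttwu_queue_cond() decision, with made-up stand-ins for cpus_share_cache() and cpu_rq() (the 4-CPUs-per-LLC topology is an arbitrary choice for the example, not kernel API):

```c
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for kernel state; in the kernel these come from cpu_rq(cpu)
 * and the sched_domain topology. */
struct fake_rq { int nr_running; };
static struct fake_rq fake_rqs[8];

/* Pretend every block of 4 CPUs shares one LLC. */
static bool share_cache(int a, int b) { return (a / 4) == (b / 4); }

/* Mirrors the post-patch ttwu_queue_cond(): queue remotely if the wakee
 * CPU is in another LLC, or if it is not the local CPU and currently has
 * nothing running (idle or about to be idle). */
static bool queue_on_wakelist(int this_cpu, int wakee_cpu)
{
	if (!share_cache(this_cpu, wakee_cpu))
		return true;
	if (wakee_cpu == this_cpu)
		return false;
	return fake_rqs[wakee_cpu].nr_running == 0;
}

int main(void)
{
	fake_rqs[1].nr_running = 2;
	printf("same LLC, busy : %d\n", queue_on_wakelist(0, 1)); /* 0 */
	printf("same LLC, idle : %d\n", queue_on_wakelist(0, 2)); /* 1 */
	printf("other LLC      : %d\n", queue_on_wakelist(0, 5)); /* 1 */
	printf("local CPU      : %d\n", queue_on_wakelist(0, 0)); /* 0 */
	return 0;
}
```

With the explicit "is it the local CPU" check in place, the WF_ON_CPU flag becomes redundant, which is why the sched.h hunk further down simply deletes it.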
@@ -1524,6 +1524,7 @@ struct task_numa_env {
 	int src_cpu, src_nid;
 	int dst_cpu, dst_nid;
+	int imb_numa_nr;
 	struct numa_stats src_stats, dst_stats;
@@ -1539,7 +1540,7 @@ static unsigned long cpu_load(struct rq *rq);
 static unsigned long cpu_runnable(struct rq *rq);
 static unsigned long cpu_util(int cpu);
 static inline long adjust_numa_imbalance(int imbalance,
-					int dst_running, int dst_weight);
+					int dst_running, int imb_numa_nr);
 static inline enum
 numa_type numa_classify(unsigned int imbalance_pct,
@@ -1920,7 +1921,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
 		dst_running = env->dst_stats.nr_running + 1;
 		imbalance = max(0, dst_running - src_running);
 		imbalance = adjust_numa_imbalance(imbalance, dst_running,
-						  env->dst_stats.weight);
+						  env->imb_numa_nr);
 		/* Use idle CPU if there is no imbalance */
 		if (!imbalance) {
@@ -1985,8 +1986,10 @@ static int task_numa_migrate(struct task_struct *p)
 	 */
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-	if (sd)
+	if (sd) {
 		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+		env.imb_numa_nr = sd->imb_numa_nr;
+	}
 	rcu_read_unlock();
 	/*
@@ -3084,6 +3087,9 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
 	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+	/* See update_cfs_rq_load_avg() */
+	cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+					  cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
 }
 #else
 static inline void
@@ -3449,11 +3455,11 @@ void set_task_rq_fair(struct sched_entity *se,
 static inline void
 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
-	u32 divider;
+	long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
+	u32 new_sum, divider;
 	/* Nothing to update */
-	if (!delta)
+	if (!delta_avg)
 		return;
 	/*
@@ -3462,23 +3468,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
 	 */
 	divider = get_pelt_divider(&cfs_rq->avg);
 	/* Set new sched_entity's utilization */
 	se->avg.util_avg = gcfs_rq->avg.util_avg;
-	se->avg.util_sum = se->avg.util_avg * divider;
+	new_sum = se->avg.util_avg * divider;
+	delta_sum = (long)new_sum - (long)se->avg.util_sum;
+	se->avg.util_sum = new_sum;
 	/* Update parent cfs_rq utilization */
-	add_positive(&cfs_rq->avg.util_avg, delta);
-	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+	add_positive(&cfs_rq->avg.util_avg, delta_avg);
+	add_positive(&cfs_rq->avg.util_sum, delta_sum);
+	/* See update_cfs_rq_load_avg() */
+	cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+					  cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
 }
 static inline void
 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-	long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
-	u32 divider;
+	long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+	u32 new_sum, divider;
 	/* Nothing to update */
-	if (!delta)
+	if (!delta_avg)
 		return;
 	/*
@@ -3489,19 +3502,25 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
 	/* Set new sched_entity's runnable */
 	se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
-	se->avg.runnable_sum = se->avg.runnable_avg * divider;
+	new_sum = se->avg.runnable_avg * divider;
+	delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
+	se->avg.runnable_sum = new_sum;
 	/* Update parent cfs_rq runnable */
-	add_positive(&cfs_rq->avg.runnable_avg, delta);
-	cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+	add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
+	add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
+	/* See update_cfs_rq_load_avg() */
+	cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+					      cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
 }
 static inline void
 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-	long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+	long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
 	unsigned long load_avg;
 	u64 load_sum = 0;
+	s64 delta_sum;
 	u32 divider;
 	if (!runnable_sum)
@@ -3528,7 +3547,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
 	 * assuming all tasks are equally runnable.
 	 */
 	if (scale_load_down(gcfs_rq->load.weight)) {
-		load_sum = div_s64(gcfs_rq->avg.load_sum,
+		load_sum = div_u64(gcfs_rq->avg.load_sum,
 			scale_load_down(gcfs_rq->load.weight));
 	}
@@ -3545,16 +3564,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
 	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
 	runnable_sum = max(runnable_sum, running_sum);
-	load_sum = (s64)se_weight(se) * runnable_sum;
-	load_avg = div_s64(load_sum, divider);
-	delta = load_avg - se->avg.load_avg;
+	load_sum = se_weight(se) * runnable_sum;
+	load_avg = div_u64(load_sum, divider);
+	delta_avg = load_avg - se->avg.load_avg;
+	if (!delta_avg)
+		return;
+	delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
 	se->avg.load_sum = runnable_sum;
 	se->avg.load_avg = load_avg;
-	add_positive(&cfs_rq->avg.load_avg, delta);
-	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
+	add_positive(&cfs_rq->avg.load_avg, delta_avg);
+	add_positive(&cfs_rq->avg.load_sum, delta_sum);
+	/* See update_cfs_rq_load_avg() */
+	cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+					  cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
 }
 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3670,7 +3695,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		r = removed_load;
 		sub_positive(&sa->load_avg, r);
-		sa->load_sum = sa->load_avg * divider;
+		sub_positive(&sa->load_sum, r * divider);
+		/* See sa->util_sum below */
+		sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
 		r = removed_util;
 		sub_positive(&sa->util_avg, r);
@@ -3690,7 +3717,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		r = removed_runnable;
 		sub_positive(&sa->runnable_avg, r);
-		sa->runnable_sum = sa->runnable_avg * divider;
+		sub_positive(&sa->runnable_sum, r * divider);
+		/* See sa->util_sum above */
+		sa->runnable_sum = max_t(u32, sa->runnable_sum,
+					      sa->runnable_avg * PELT_MIN_DIVIDER);
 		/*
 		 * removed_runnable is the unweighted version of removed_load so we
@@ -3777,17 +3807,18 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
  */
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	/*
-	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
-	 * See ___update_load_avg() for details.
-	 */
-	u32 divider = get_pelt_divider(&cfs_rq->avg);
 	dequeue_load_avg(cfs_rq, se);
 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
-	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+	/* See update_cfs_rq_load_avg() */
+	cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+					  cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
 	sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
-	cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+	sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+	/* See update_cfs_rq_load_avg() */
+	cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+					      cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
 	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -9965,9 +9996,9 @@ static bool update_pick_idlest(struct sched_group *idlest,
  * This is an approximation as the number of running tasks may not be
  * related to the number of busy CPUs due to sched_setaffinity.
  */
-static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
+static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
 {
-	return (dst_running < (dst_weight >> 2));
+	return running <= imb_numa_nr;
 }
 /*
@@ -10106,12 +10137,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 			return idlest;
 #endif
 		/*
-		 * Otherwise, keep the task on this node to stay close
-		 * its wakeup source and improve locality. If there is
-		 * a real need of migration, periodic load balance will
-		 * take care of it.
+		 * Otherwise, keep the task close to the wakeup source
+		 * and improve locality if the number of running tasks
+		 * would remain below threshold where an imbalance is
+		 * allowed. If there is a real need of migration,
+		 * periodic load balance will take care of it.
 		 */
-		if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
+		if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
 			return NULL;
 	}
@@ -10291,9 +10323,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 #define NUMA_IMBALANCE_MIN 2
 static inline long adjust_numa_imbalance(int imbalance,
-				int dst_running, int dst_weight)
+				int dst_running, int imb_numa_nr)
 {
-	if (!allow_numa_imbalance(dst_running, dst_weight))
+	if (!allow_numa_imbalance(dst_running, imb_numa_nr))
 		return imbalance;
 	/*
@@ -10405,7 +10437,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	/* Consider allowing a small imbalance between NUMA groups */
 	if (env->sd->flags & SD_NUMA) {
 		env->imbalance = adjust_numa_imbalance(env->imbalance,
-			busiest->sum_nr_running, busiest->group_weight);
+			local->sum_nr_running + 1, env->sd->imb_numa_nr);
 	}
 	return;
......
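The PELT hunks above all follow one pattern: instead of hard-resyncing a *_sum from its *_avg (which can inflate the sum and make the signal decay too slowly), the removed or propagated contribution is subtracted from the sum directly, and the sum is then clamped so it never drops below *_avg * PELT_MIN_DIVIDER, keeping the two halves of the signal coherent. A small standalone sketch of that subtract-then-clamp step; the constants mirror the usual PELT values but are hardcoded here purely for illustration:

```c
#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative constants: in the kernel these live in kernel/sched/pelt.h,
 * with PELT_MIN_DIVIDER defined as LOAD_AVG_MAX - 1024.
 */
#define LOAD_AVG_MAX     47742
#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)

/* Same idea as the kernel's sub_positive(): subtract without underflowing. */
static void sub_positive(uint64_t *val, uint64_t sub)
{
	*val = (*val > sub) ? *val - sub : 0;
}

int main(void)
{
	/* A cfs_rq signal and one removed entity's contribution. */
	uint64_t load_avg = 900, load_sum = 900ULL * PELT_MIN_DIVIDER;
	uint64_t removed_avg = 500;

	sub_positive(&load_avg, removed_avg);
	/* Old code resynced the sum from the avg: load_sum = load_avg * divider.
	 * New code subtracts the removed contribution from the sum as well... */
	sub_positive(&load_sum, removed_avg * LOAD_AVG_MAX);
	/* ...and then clamps it so the sum never falls below what the avg
	 * implies for the smallest possible divider, keeping the pair coherent. */
	if (load_sum < load_avg * PELT_MIN_DIVIDER)
		load_sum = load_avg * PELT_MIN_DIVIDER;

	printf("load_avg=%llu load_sum=%llu (floor=%llu)\n",
	       (unsigned long long)load_avg,
	       (unsigned long long)load_sum,
	       (unsigned long long)(load_avg * PELT_MIN_DIVIDER));
	return 0;
}
```

The clamp is only a safety net against rounding drift between the two accumulators; in the common case the subtraction alone already leaves the sum above the floor.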
@@ -2085,7 +2085,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
 #define WF_SYNC			0x01 /* Waker goes to sleep after wakeup */
 #define WF_FORK			0x02 /* Child wakeup after fork */
 #define WF_MIGRATED		0x04 /* Internal use, task got migrated */
-#define WF_ON_CPU		0x08 /* Wakee is on_cpu */
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
......
@@ -2343,6 +2343,59 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		}
 	}
+	/*
+	 * Calculate an allowed NUMA imbalance such that LLCs do not get
+	 * imbalanced.
+	 */
+	for_each_cpu(i, cpu_map) {
+		unsigned int imb = 0;
+		unsigned int imb_span = 1;
+		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+			struct sched_domain *child = sd->child;
+			if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
+			    (child->flags & SD_SHARE_PKG_RESOURCES)) {
+				struct sched_domain *top, *top_p;
+				unsigned int nr_llcs;
+				/*
+				 * For a single LLC per node, allow an
+				 * imbalance up to 25% of the node. This is an
+				 * arbitrary cutoff based on SMT-2 to balance
+				 * between memory bandwidth and avoiding
+				 * premature sharing of HT resources and SMT-4
+				 * or SMT-8 *may* benefit from a different
+				 * cutoff.
+				 *
+				 * For multiple LLCs, allow an imbalance
+				 * until multiple tasks would share an LLC
+				 * on one node while LLCs on another node
+				 * remain idle.
+				 */
+				nr_llcs = sd->span_weight / child->span_weight;
+				if (nr_llcs == 1)
+					imb = sd->span_weight >> 2;
+				else
+					imb = nr_llcs;
+				sd->imb_numa_nr = imb;
+				/* Set span based on the first NUMA domain. */
+				top = sd;
+				top_p = top->parent;
+				while (top_p && !(top_p->flags & SD_NUMA)) {
+					top = top->parent;
+					top_p = top->parent;
+				}
+				imb_span = top_p ? top_p->span_weight : sd->span_weight;
+			} else {
+				int factor = max(1U, (sd->span_weight / imb_span));
+				sd->imb_numa_nr = imb * factor;
+			}
+		}
+	}
 	/* Calculate CPU capacity for physical packages and nodes */
 	for (i = nr_cpumask_bits-1; i >= 0; i--) {
 		if (!cpumask_test_cpu(i, cpu_map))
......
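The topology hunk above computes the per-domain threshold that allow_numa_imbalance() now compares against (running <= imb_numa_nr): roughly 25% of the node's CPUs when a node has a single LLC, or one task per LLC when a node contains several LLCs, scaled up by the span ratio at higher NUMA levels. A tiny worked example of that arithmetic for one NUMA level (the helper name and the topologies below are made up for illustration, not kernel code):

```c
#include <stdio.h>

/* Mirrors the arithmetic of the new build_sched_domains() loop for one
 * NUMA level; the topology numbers passed in main() are invented. */
static unsigned int numa_imbalance_threshold(unsigned int node_cpus,
					     unsigned int llc_cpus)
{
	unsigned int nr_llcs = node_cpus / llc_cpus;

	/* Single LLC per node: allow up to 25% of the node's CPUs.
	 * Multiple LLCs: allow one task per LLC before balancing. */
	return (nr_llcs == 1) ? node_cpus >> 2 : nr_llcs;
}

int main(void)
{
	/* e.g. a 2-socket machine, 32 CPUs per node, one 32-CPU LLC */
	printf("single LLC/node : imb_numa_nr = %u\n",
	       numa_imbalance_threshold(32, 32));	/* 8 */
	/* e.g. a chiplet CPU, 64 CPUs per node, 8 CPUs per LLC */
	printf("8 LLCs per node : imb_numa_nr = %u\n",
	       numa_imbalance_threshold(64, 8));	/* 8 */
	return 0;
}
```

With these thresholds in place, find_idlest_group() keeps a newly woken task on the local node until that many tasks are running there, after which the normal load-balancing rules take over.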