diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d98175d5c2c6ecd9dc4451f598b5615ddab35ebc..51a7600811931ad0a0d9cb58fabf80b41b8a5271 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -901,28 +901,114 @@ static inline unsigned long task_faults(struct task_struct *p, int nid) } static unsigned long weighted_cpuload(const int cpu); +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static unsigned long power_of(int cpu); +static long effective_load(struct task_group *tg, int cpu, long wl, long wg); +struct numa_stats { + unsigned long load; + s64 eff_load; + unsigned long faults; +}; -static int -find_idlest_cpu_node(int this_cpu, int nid) -{ - unsigned long load, min_load = ULONG_MAX; - int i, idlest_cpu = this_cpu; +struct task_numa_env { + struct task_struct *p; - BUG_ON(cpu_to_node(this_cpu) == nid); + int src_cpu, src_nid; + int dst_cpu, dst_nid; - rcu_read_lock(); - for_each_cpu(i, cpumask_of_node(nid)) { - load = weighted_cpuload(i); + struct numa_stats src_stats, dst_stats; - if (load < min_load) { - min_load = load; - idlest_cpu = i; + unsigned long best_load; + int best_cpu; +}; + +static int task_numa_migrate(struct task_struct *p) +{ + int node_cpu = cpumask_first(cpumask_of_node(p->numa_preferred_nid)); + struct task_numa_env env = { + .p = p, + .src_cpu = task_cpu(p), + .src_nid = cpu_to_node(task_cpu(p)), + .dst_cpu = node_cpu, + .dst_nid = p->numa_preferred_nid, + .best_load = ULONG_MAX, + .best_cpu = task_cpu(p), + }; + struct sched_domain *sd; + int cpu; + struct task_group *tg = task_group(p); + unsigned long weight; + bool balanced; + int imbalance_pct, idx = -1; + + /* + * Find the lowest common scheduling domain covering the nodes of both + * the CPU the task is currently running on and the target NUMA node. + */ + rcu_read_lock(); + for_each_domain(env.src_cpu, sd) { + if (cpumask_test_cpu(node_cpu, sched_domain_span(sd))) { + /* + * busy_idx is used for the load decision as it is the + * same index used by the regular load balancer for an + * active cpu. + */ + idx = sd->busy_idx; + imbalance_pct = sd->imbalance_pct; + break; } } rcu_read_unlock(); - return idlest_cpu; + if (WARN_ON_ONCE(idx == -1)) + return 0; + + /* + * XXX the below is mostly nicked from wake_affine(); we should + * see about sharing a bit if at all possible; also it might want + * some per entity weight love. + */ + weight = p->se.load.weight; + env.src_stats.load = source_load(env.src_cpu, idx); + env.src_stats.eff_load = 100 + (imbalance_pct - 100) / 2; + env.src_stats.eff_load *= power_of(env.src_cpu); + env.src_stats.eff_load *= env.src_stats.load + effective_load(tg, env.src_cpu, -weight, -weight); + + for_each_cpu(cpu, cpumask_of_node(env.dst_nid)) { + env.dst_cpu = cpu; + env.dst_stats.load = target_load(cpu, idx); + + /* If the CPU is idle, use it */ + if (!env.dst_stats.load) { + env.best_cpu = cpu; + goto migrate; + } + + /* Otherwise check the target CPU load */ + env.dst_stats.eff_load = 100; + env.dst_stats.eff_load *= power_of(cpu); + env.dst_stats.eff_load *= env.dst_stats.load + effective_load(tg, cpu, weight, weight); + + /* + * Destination is considered balanced if the destination CPU is + * less loaded than the source CPU. Unfortunately there is a + * risk that a task running on a lightly loaded CPU will not + * migrate to its preferred node due to load imbalances. + */ + balanced = (env.dst_stats.eff_load <= env.src_stats.eff_load); + if (!balanced) + continue; + + if (env.dst_stats.eff_load < env.best_load) { + env.best_load = env.dst_stats.eff_load; + env.best_cpu = cpu; + } + } + +migrate: + return migrate_task_to(p, env.best_cpu); } static void task_numa_placement(struct task_struct *p) @@ -966,22 +1052,10 @@ static void task_numa_placement(struct task_struct *p) * the working set placement. */ if (max_faults && max_nid != p->numa_preferred_nid) { - int preferred_cpu; - - /* - * If the task is not on the preferred node then find the most - * idle CPU to migrate to. - */ - preferred_cpu = task_cpu(p); - if (cpu_to_node(preferred_cpu) != max_nid) { - preferred_cpu = find_idlest_cpu_node(preferred_cpu, - max_nid); - } - /* Update the preferred nid and migrate task if possible */ p->numa_preferred_nid = max_nid; p->numa_migrate_seq = 1; - migrate_task_to(p, preferred_cpu); + task_numa_migrate(p); } } @@ -3292,7 +3366,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - if (!tg->parent) /* the trivial, non-cgroup case */ + if (!tg->parent || !wl) /* the trivial, non-cgroup case */ return wl; for_each_sched_entity(se) { @@ -3345,8 +3419,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) } #else -static inline unsigned long effective_load(struct task_group *tg, int cpu, - unsigned long wl, unsigned long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { return wl; }