提交 7125face 编写于 作者: L Linus Torvalds

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched, x86: Avoid unnecessary overflow in sched_clock
  sched: Fix buglet in return_cfs_rq_runtime()
  sched: Avoid SMT siblings in select_idle_sibling() if possible
  sched: Set the command name of the idle tasks in SMP kernels
  sched, rt: Provide means of disabling cross-cpu bandwidth sharing
  sched: Document wait_for_completion_*() return values
  sched_fair: Fix a typo in the comment describing update_sd_lb_stats
  sched: Add a comment to effective_load() since it's a pain
...@@ -32,6 +32,22 @@ extern int no_timer_check; ...@@ -32,6 +32,22 @@ extern int no_timer_check;
* (mathieu.desnoyers@polymtl.ca) * (mathieu.desnoyers@polymtl.ca)
* *
* -johnstul@us.ibm.com "math is hard, lets go shopping!" * -johnstul@us.ibm.com "math is hard, lets go shopping!"
*
* In:
*
* ns = cycles * cyc2ns_scale / SC
*
* Although we may still have enough bits to store the value of ns,
* in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
* leading to an incorrect result.
*
* To avoid this, we can decompose 'cycles' into quotient and remainder
* of division by SC. Then,
*
* ns = (quot * SC + rem) * cyc2ns_scale / SC
* = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
*
* - sqazi@google.com
*/ */
DECLARE_PER_CPU(unsigned long, cyc2ns); DECLARE_PER_CPU(unsigned long, cyc2ns);
...@@ -41,9 +57,14 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); ...@@ -41,9 +57,14 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
static inline unsigned long long __cycles_2_ns(unsigned long long cyc) static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{ {
unsigned long long quot;
unsigned long long rem;
int cpu = smp_processor_id(); int cpu = smp_processor_id();
unsigned long long ns = per_cpu(cyc2ns_offset, cpu); unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR; quot = (cyc >> CYC2NS_SCALE_FACTOR);
rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
ns += quot * per_cpu(cyc2ns, cpu) +
((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR);
return ns; return ns;
} }
......
...@@ -126,6 +126,8 @@ extern struct cred init_cred; ...@@ -126,6 +126,8 @@ extern struct cred init_cred;
# define INIT_PERF_EVENTS(tsk) # define INIT_PERF_EVENTS(tsk)
#endif #endif
#define INIT_TASK_COMM "swapper"
/* /*
* INIT_TASK is used to set up the first task table, touch at * INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB) * your own risk!. Base=0, limit=0x1fffff (=2MB)
...@@ -162,7 +164,7 @@ extern struct cred init_cred; ...@@ -162,7 +164,7 @@ extern struct cred init_cred;
.group_leader = &tsk, \ .group_leader = &tsk, \
RCU_INIT_POINTER(.real_cred, &init_cred), \ RCU_INIT_POINTER(.real_cred, &init_cred), \
RCU_INIT_POINTER(.cred, &init_cred), \ RCU_INIT_POINTER(.cred, &init_cred), \
.comm = "swapper", \ .comm = INIT_TASK_COMM, \
.thread = INIT_THREAD, \ .thread = INIT_THREAD, \
.fs = &init_fs, \ .fs = &init_fs, \
.files = &init_files, \ .files = &init_files, \
......
...@@ -71,6 +71,7 @@ ...@@ -71,6 +71,7 @@
#include <linux/ctype.h> #include <linux/ctype.h>
#include <linux/ftrace.h> #include <linux/ftrace.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/init_task.h>
#include <asm/tlb.h> #include <asm/tlb.h>
#include <asm/irq_regs.h> #include <asm/irq_regs.h>
...@@ -4810,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion); ...@@ -4810,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion);
* This waits for either a completion of a specific task to be signaled or for a * This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not * specified timeout to expire. The timeout is in jiffies. It is not
* interruptible. * interruptible.
*
* The return value is 0 if timed out, and positive (at least 1, or number of
* jiffies left till timeout) if completed.
*/ */
unsigned long __sched unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout) wait_for_completion_timeout(struct completion *x, unsigned long timeout)
...@@ -4824,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout); ...@@ -4824,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
* *
* This waits for completion of a specific task to be signaled. It is * This waits for completion of a specific task to be signaled. It is
* interruptible. * interruptible.
*
* The return value is -ERESTARTSYS if interrupted, 0 if completed.
*/ */
int __sched wait_for_completion_interruptible(struct completion *x) int __sched wait_for_completion_interruptible(struct completion *x)
{ {
...@@ -4841,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); ...@@ -4841,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
* *
* This waits for either a completion of a specific task to be signaled or for a * This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies. * specified timeout to expire. It is interruptible. The timeout is in jiffies.
*
* The return value is -ERESTARTSYS if interrupted, 0 if timed out,
* positive (at least 1, or number of jiffies left till timeout) if completed.
*/ */
long __sched long __sched
wait_for_completion_interruptible_timeout(struct completion *x, wait_for_completion_interruptible_timeout(struct completion *x,
...@@ -4856,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); ...@@ -4856,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
* *
* This waits to be signaled for completion of a specific task. It can be * This waits to be signaled for completion of a specific task. It can be
* interrupted by a kill signal. * interrupted by a kill signal.
*
* The return value is -ERESTARTSYS if interrupted, 0 if completed.
*/ */
int __sched wait_for_completion_killable(struct completion *x) int __sched wait_for_completion_killable(struct completion *x)
{ {
...@@ -4874,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable); ...@@ -4874,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
* This waits for either a completion of a specific task to be * This waits for either a completion of a specific task to be
* signaled or for a specified timeout to expire. It can be * signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies. * interrupted by a kill signal. The timeout is in jiffies.
*
* The return value is -ERESTARTSYS if interrupted, 0 if timed out,
* positive (at least 1, or number of jiffies left till timeout) if completed.
*/ */
long __sched long __sched
wait_for_completion_killable_timeout(struct completion *x, wait_for_completion_killable_timeout(struct completion *x,
...@@ -6099,6 +6113,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) ...@@ -6099,6 +6113,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
*/ */
idle->sched_class = &idle_sched_class; idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu); ftrace_graph_init_idle_task(idle, cpu);
#if defined(CONFIG_SMP)
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
} }
/* /*
......
...@@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) ...@@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
list_del_leaf_cfs_rq(cfs_rq); list_del_leaf_cfs_rq(cfs_rq);
} }
/*
* calc_tg_weight - current total weight of a task group as seen from one
* per-cpu runqueue.
* @tg: task group whose load_weight is read (atomically).
* @cfs_rq: the runqueue whose recorded load_contribution is swapped out
* for its present load.weight.
*
* Returns tg->load_weight - cfs_rq->load_contribution + cfs_rq->load.weight.
* No clamping is applied here.
*/
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
long tg_weight;
/*
* Use this CPU's actual weight instead of the last load_contribution
* to gain a more accurate current total weight. See
* update_cfs_rq_load_contribution().
*/
tg_weight = atomic_read(&tg->load_weight);
tg_weight -= cfs_rq->load_contribution;
tg_weight += cfs_rq->load.weight;
return tg_weight;
}
static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{ {
long load_weight, load, shares; long tg_weight, load, shares;
tg_weight = calc_tg_weight(tg, cfs_rq);
load = cfs_rq->load.weight; load = cfs_rq->load.weight;
load_weight = atomic_read(&tg->load_weight);
load_weight += load;
load_weight -= cfs_rq->load_contribution;
shares = (tg->shares * load); shares = (tg->shares * load);
if (load_weight) if (tg_weight)
shares /= load_weight; shares /= tg_weight;
if (shares < MIN_SHARES) if (shares < MIN_SHARES)
shares = MIN_SHARES; shares = MIN_SHARES;
...@@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) ...@@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{ {
if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
return; return;
__return_cfs_rq_runtime(cfs_rq); __return_cfs_rq_runtime(cfs_rq);
...@@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p) ...@@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p)
* Adding load to a group doesn't make a group heavier, but can cause movement * Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one * of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares. * can calculate the shift in shares.
*
* Calculate the effective load difference if @wl is added (subtracted) to @tg
* on this @cpu and results in a total addition (subtraction) of @wg to the
* total group weight.
*
* Given a runqueue weight distribution (rw_i) we can compute a shares
* distribution (s_i) using:
*
* s_i = rw_i / \Sum rw_j (1)
*
* Suppose we have 4 CPUs and our @tg is a direct child of the root group and
* has 7 equal weight tasks, distributed as below (rw_i), with the resulting
* shares distribution (s_i):
*
* rw_i = { 2, 4, 1, 0 }
* s_i = { 2/7, 4/7, 1/7, 0 }
*
* As per wake_affine() we're interested in the load of two CPUs (the CPU the
* task used to run on and the CPU the waker is running on), we need to
* compute the effect of waking a task on either CPU and, in case of a sync
* wakeup, compute the effect of the current task going to sleep.
*
* So for a change of @wl to the local @cpu with an overall group weight change
* of @wg we can compute the new shares distribution (s'_i) using:
*
* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
*
* Suppose we're interested in CPUs 0 and 1, and want to compute the load
* differences in waking a task to CPU 0. The additional task changes the
* weight and shares distributions like:
*
* rw'_i = { 3, 4, 1, 0 }
* s'_i = { 3/8, 4/8, 1/8, 0 }
*
* We can then compute the difference in effective weight by using:
*
* dw_i = S * (s'_i - s_i) (3)
*
* Where 'S' is the group weight as seen by its parent.
*
* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
* 4/7) times the weight of the group.
*/ */
static long effective_load(struct task_group *tg, int cpu, long wl, long wg) static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{ {
struct sched_entity *se = tg->se[cpu]; struct sched_entity *se = tg->se[cpu];
if (!tg->parent) if (!tg->parent) /* the trivial, non-cgroup case */
return wl; return wl;
for_each_sched_entity(se) { for_each_sched_entity(se) {
long lw, w; long w, W;
tg = se->my_q->tg; tg = se->my_q->tg;
w = se->my_q->load.weight;
/* use this cpu's instantaneous contribution */ /*
lw = atomic_read(&tg->load_weight); * W = @wg + \Sum rw_j
lw -= se->my_q->load_contribution; */
lw += w + wg; W = wg + calc_tg_weight(tg, se->my_q);
wl += w; /*
* w = rw_i + @wl
*/
w = se->my_q->load.weight + wl;
if (lw > 0 && wl < lw) /*
wl = (wl * tg->shares) / lw; * wl = S * s'_i; see (2)
*/
if (W > 0 && w < W)
wl = (w * tg->shares) / W;
else else
wl = tg->shares; wl = tg->shares;
/* zero point is MIN_SHARES */ /*
* Per the above, wl is the new se->load.weight value; since
* those are clipped to [MIN_SHARES, ...) do so now. See
* calc_cfs_shares().
*/
if (wl < MIN_SHARES) if (wl < MIN_SHARES)
wl = MIN_SHARES; wl = MIN_SHARES;
/*
* wl = dw_i = S * (s'_i - s_i); see (3)
*/
wl -= se->load.weight; wl -= se->load.weight;
/*
* Recursively apply this logic to all parent groups to compute
* the final effective load change on the root group. Since
* only the @tg group gets extra weight, all parent groups can
* only redistribute existing shares. @wl is the shift in shares
* resulting from this level per the above.
*/
wg = 0; wg = 0;
} }
...@@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target) ...@@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
int cpu = smp_processor_id(); int cpu = smp_processor_id();
int prev_cpu = task_cpu(p); int prev_cpu = task_cpu(p);
struct sched_domain *sd; struct sched_domain *sd;
int i; struct sched_group *sg;
int i, smt = 0;
/* /*
* If the task is going to be woken-up on this cpu and if it is * If the task is going to be woken-up on this cpu and if it is
...@@ -2269,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target) ...@@ -2269,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target)
* Otherwise, iterate the domains and find an eligible idle cpu. * Otherwise, iterate the domains and find an eligible idle cpu.
*/ */
rcu_read_lock(); rcu_read_lock();
again:
for_each_domain(target, sd) { for_each_domain(target, sd) {
if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
break; continue;
for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) { if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) {
if (idle_cpu(i)) { if (!smt) {
target = i; smt = 1;
break; goto again;
} }
break;
} }
/* sg = sd->groups;
* Lets stop looking for an idle sibling when we reached do {
* the domain that spans the current cpu and prev_cpu. if (!cpumask_intersects(sched_group_cpus(sg),
*/ tsk_cpus_allowed(p)))
if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && goto next;
cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
break; for_each_cpu(i, sched_group_cpus(sg)) {
if (!idle_cpu(i))
goto next;
}
target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
goto done;
next:
sg = sg->next;
} while (sg != sd->groups);
} }
done:
rcu_read_unlock(); rcu_read_unlock();
return target; return target;
...@@ -3511,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, ...@@ -3511,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
} }
/** /**
* update_sd_lb_stats - Update sched_group's statistics for load balancing. * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @sd: sched_domain whose statistics are to be updated. * @sd: sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed. * @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu * @idle: Idle status of this_cpu
......
...@@ -67,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1) ...@@ -67,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1)
SCHED_FEAT(TTWU_QUEUE, 1) SCHED_FEAT(TTWU_QUEUE, 1)
SCHED_FEAT(FORCE_SD_OVERLAP, 0) SCHED_FEAT(FORCE_SD_OVERLAP, 0)
SCHED_FEAT(RT_RUNTIME_SHARE, 1)
...@@ -560,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq) ...@@ -560,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
{ {
int more = 0; int more = 0;
if (!sched_feat(RT_RUNTIME_SHARE))
return more;
if (rt_rq->rt_time > rt_rq->rt_runtime) { if (rt_rq->rt_time > rt_rq->rt_runtime) {
raw_spin_unlock(&rt_rq->rt_runtime_lock); raw_spin_unlock(&rt_rq->rt_runtime_lock);
more = do_balance_runtime(rt_rq); more = do_balance_runtime(rt_rq);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册