Commit b2e09f63 authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull more scheduler updates from Ingo Molnar:
 "Second round of scheduler changes:
   - try-to-wakeup and IPI reduction speedups, from Andy Lutomirski
   - continued power scheduling cleanups and refactorings, from Nicolas
     Pitre
   - misc fixes and enhancements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/deadline: Delete extraneous extern for to_ratio()
  sched/idle: Optimize try-to-wake-up IPI
  sched/idle: Simplify wake_up_idle_cpu()
  sched/idle: Clear polling before descheduling the idle thread
  sched, trace: Add a tracepoint for IPI-less remote wakeups
  cpuidle: Set polling in poll_idle
  sched: Remove redundant assignment to "rt_rq" in update_curr_rt(...)
  sched: Rename capacity related flags
  sched: Final power vs. capacity cleanups
  sched: Remove remaining dubious usage of "power"
  sched: Let 'struct sched_group_power' care about CPU capacity
  sched/fair: Disambiguate existing/remaining "capacity" usage
  sched/fair: Change "has_capacity" to "has_free_capacity"
  sched/fair: Remove "power" from 'struct numa_stats'
  sched: Fix signedness bug in yield_to()
  sched/fair: Use time_after() in record_wakee()
  sched/balancing: Reduce the rate of needless idle load balancing
  sched/fair: Fix unlocked reads of some cfs_b->quota/period
......@@ -26,30 +26,30 @@
#include <asm/topology.h>
/*
* cpu power scale management
* cpu capacity scale management
*/
/*
* cpu power table
* cpu capacity table
* This per cpu data structure describes the relative capacity of each core.
* On a heteregenous system, cores don't have the same computation capacity
* and we reflect that difference in the cpu_power field so the scheduler can
* take this difference into account during load balance. A per cpu structure
* is preferred because each CPU updates its own cpu_power field during the
* load balance except for idle cores. One idle core is selected to run the
* rebalance_domains for all idle cores and the cpu_power can be updated
* during this sequence.
* and we reflect that difference in the cpu_capacity field so the scheduler
* can take this difference into account during load balance. A per cpu
* structure is preferred because each CPU updates its own cpu_capacity field
* during the load balance except for idle cores. One idle core is selected
* to run the rebalance_domains for all idle cores and the cpu_capacity can be
* updated during this sequence.
*/
static DEFINE_PER_CPU(unsigned long, cpu_scale);
unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
{
return per_cpu(cpu_scale, cpu);
}
static void set_power_scale(unsigned int cpu, unsigned long power)
static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
{
per_cpu(cpu_scale, cpu) = power;
per_cpu(cpu_scale, cpu) = capacity;
}
#ifdef CONFIG_OF
......@@ -62,11 +62,11 @@ struct cpu_efficiency {
* Table of relative efficiency of each processors
* The efficiency value must fit in 20bit and the final
* cpu_scale value must be in the range
* 0 < cpu_scale < 3*SCHED_POWER_SCALE/2
* 0 < cpu_scale < 3*SCHED_CAPACITY_SCALE/2
* in order to return at most 1 when DIV_ROUND_CLOSEST
* is used to compute the capacity of a CPU.
* Processors that are not defined in the table,
* use the default SCHED_POWER_SCALE value for cpu_scale.
* use the default SCHED_CAPACITY_SCALE value for cpu_scale.
*/
static const struct cpu_efficiency table_efficiency[] = {
{"arm,cortex-a15", 3891},
......@@ -83,9 +83,9 @@ static unsigned long middle_capacity = 1;
* Iterate all CPUs' descriptor in DT and compute the efficiency
* (as per table_efficiency). Also calculate a middle efficiency
* as close as possible to (max{eff_i} - min{eff_i}) / 2
* This is later used to scale the cpu_power field such that an
* 'average' CPU is of middle power. Also see the comments near
* table_efficiency[] and update_cpu_power().
* This is later used to scale the cpu_capacity field such that an
* 'average' CPU is of middle capacity. Also see the comments near
* table_efficiency[] and update_cpu_capacity().
*/
static void __init parse_dt_topology(void)
{
......@@ -141,15 +141,15 @@ static void __init parse_dt_topology(void)
* cpu_scale because all CPUs have the same capacity. Otherwise, we
* compute a middle_capacity factor that will ensure that the capacity
* of an 'average' CPU of the system will be as close as possible to
* SCHED_POWER_SCALE, which is the default value, but with the
* SCHED_CAPACITY_SCALE, which is the default value, but with the
* constraint explained near table_efficiency[].
*/
if (4*max_capacity < (3*(max_capacity + min_capacity)))
middle_capacity = (min_capacity + max_capacity)
>> (SCHED_POWER_SHIFT+1);
>> (SCHED_CAPACITY_SHIFT+1);
else
middle_capacity = ((max_capacity / 3)
>> (SCHED_POWER_SHIFT-1)) + 1;
>> (SCHED_CAPACITY_SHIFT-1)) + 1;
}
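To make the middle_capacity arithmetic above concrete, here is a minimal standalone sketch (not part of this diff) using hypothetical raw capacity values for a two-cluster system; the numbers are invented purely for illustration:

    #include <stdio.h>

    #define SCHED_CAPACITY_SHIFT	10
    #define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)

    int main(void)
    {
    	/* Hypothetical raw capacities (efficiency * clock term) for a
    	 * big.LITTLE pair; the absolute values are made up. */
    	unsigned long min_capacity = 1200UL << 10;	/* "little" core */
    	unsigned long max_capacity = 3072UL << 10;	/* "big" core    */
    	unsigned long middle_capacity;

    	/* Same branch structure as parse_dt_topology() above. */
    	if (4 * max_capacity < 3 * (max_capacity + min_capacity))
    		middle_capacity = (min_capacity + max_capacity)
    					>> (SCHED_CAPACITY_SHIFT + 1);
    	else
    		middle_capacity = ((max_capacity / 3)
    					>> (SCHED_CAPACITY_SHIFT - 1)) + 1;

    	printf("middle_capacity = %lu\n", middle_capacity);

    	/* update_cpu_capacity() then assigns each CPU
    	 * cpu_capacity / middle_capacity, so an 'average' core lands
    	 * near SCHED_CAPACITY_SCALE (1024). */
    	printf("little -> %lu, big -> %lu, midpoint -> %lu\n",
    	       min_capacity / middle_capacity,
    	       max_capacity / middle_capacity,
    	       ((min_capacity + max_capacity) / 2) / middle_capacity);
    	return 0;
    }

With these made-up inputs the first branch is taken, middle_capacity comes out to 2136, and the midpoint of the two raw capacities scales to exactly 1024, matching the intent described in the comment near table_efficiency[].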
......@@ -158,20 +158,20 @@ static void __init parse_dt_topology(void)
* boot. The update of all CPUs is in O(n^2) for heteregeneous system but the
* function returns directly for SMP system.
*/
static void update_cpu_power(unsigned int cpu)
static void update_cpu_capacity(unsigned int cpu)
{
if (!cpu_capacity(cpu))
return;
set_power_scale(cpu, cpu_capacity(cpu) / middle_capacity);
set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
printk(KERN_INFO "CPU%u: update cpu_power %lu\n",
cpu, arch_scale_freq_power(NULL, cpu));
printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n",
cpu, arch_scale_freq_capacity(NULL, cpu));
}
#else
static inline void parse_dt_topology(void) {}
static inline void update_cpu_power(unsigned int cpuid) {}
static inline void update_cpu_capacity(unsigned int cpuid) {}
#endif
/*
......@@ -267,7 +267,7 @@ void store_cpu_topology(unsigned int cpuid)
update_siblings_masks(cpuid);
update_cpu_power(cpuid);
update_cpu_capacity(cpuid);
printk(KERN_INFO "CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n",
cpuid, cpu_topology[cpuid].thread_id,
......@@ -297,7 +297,7 @@ void __init init_cpu_topology(void)
{
unsigned int cpu;
/* init core mask and power*/
/* init core mask and capacity */
for_each_possible_cpu(cpu) {
struct cputopo_arm *cpu_topo = &(cpu_topology[cpu]);
......@@ -307,7 +307,7 @@ void __init init_cpu_topology(void)
cpumask_clear(&cpu_topo->core_sibling);
cpumask_clear(&cpu_topo->thread_sibling);
set_power_scale(cpu, SCHED_POWER_SCALE);
set_capacity_scale(cpu, SCHED_CAPACITY_SCALE);
}
smp_wmb();
......
......@@ -749,7 +749,7 @@ int setup_profiling_timer(unsigned int multiplier)
/* cpumask of CPUs with asymetric SMT dependancy */
static const int powerpc_smt_flags(void)
{
int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
......
......@@ -187,8 +187,11 @@ static int poll_idle(struct cpuidle_device *dev,
t1 = ktime_get();
local_irq_enable();
while (!need_resched())
cpu_relax();
if (!current_set_polling_and_test()) {
while (!need_resched())
cpu_relax();
}
current_clr_polling();
t2 = ktime_get();
diff = ktime_to_us(ktime_sub(t2, t1));
......
......@@ -586,7 +586,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
void kvm_vcpu_block(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
bool kvm_vcpu_yield_to(struct kvm_vcpu *target);
int kvm_vcpu_yield_to(struct kvm_vcpu *target);
void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
......
......@@ -847,10 +847,10 @@ enum cpu_idle_type {
};
/*
* Increase resolution of cpu_power calculations
* Increase resolution of cpu_capacity calculations
*/
#define SCHED_POWER_SHIFT 10
#define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT)
#define SCHED_CAPACITY_SHIFT 10
#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
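As a side note on the SCHED_CAPACITY_SHIFT/SCHED_CAPACITY_SCALE definitions above: capacities are 1024-based fixed-point numbers, where 1024 means "one full CPU". The short sketch below shows the kind of fraction-of-a-CPU arithmetic this enables; the frequency figures are hypothetical and the calculation is only an illustration of the convention, not code from this commit:

    #include <stdio.h>

    #define SCHED_CAPACITY_SHIFT	10
    #define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)

    int main(void)
    {
    	unsigned long cur_freq = 1600;	/* MHz, hypothetical */
    	unsigned long max_freq = 2000;	/* MHz, hypothetical */

    	/* 1600/2000 of a CPU expressed on the 1024 scale -> 819. */
    	unsigned long cap = (cur_freq * SCHED_CAPACITY_SCALE) / max_freq;

    	printf("capacity = %lu of %ld\n", cap, SCHED_CAPACITY_SCALE);
    	return 0;
    }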
/*
* sched-domains (multiprocessor balancing) declarations:
......@@ -862,7 +862,7 @@ enum cpu_idle_type {
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */
#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
......@@ -874,7 +874,7 @@ enum cpu_idle_type {
#ifdef CONFIG_SCHED_SMT
static inline const int cpu_smt_flags(void)
{
return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
}
#endif
......@@ -1006,7 +1006,7 @@ typedef const int (*sched_domain_flags_f)(void);
struct sd_data {
struct sched_domain **__percpu sd;
struct sched_group **__percpu sg;
struct sched_group_power **__percpu sgp;
struct sched_group_capacity **__percpu sgc;
};
struct sched_domain_topology_level {
......@@ -2173,7 +2173,7 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
static inline void sched_autogroup_exit(struct signal_struct *sig) { }
#endif
extern bool yield_to(struct task_struct *p, bool preempt);
extern int yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);
/**
......
......@@ -530,6 +530,26 @@ TRACE_EVENT(sched_swap_numa,
__entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid,
__entry->dst_cpu, __entry->dst_nid)
);
/*
* Tracepoint for waking a polling cpu without an IPI.
*/
TRACE_EVENT(sched_wake_idle_without_ipi,
TP_PROTO(int cpu),
TP_ARGS(cpu),
TP_STRUCT__entry(
__field( int, cpu )
),
TP_fast_assign(
__entry->cpu = cpu;
),
TP_printk("cpu=%d", __entry->cpu)
);
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
......
......@@ -535,7 +535,7 @@ static inline void init_hrtick(void)
__old; \
})
#ifdef TIF_POLLING_NRFLAG
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
* this avoids any races wrt polling state changes and thereby avoids
......@@ -546,12 +546,44 @@ static bool set_nr_and_not_polling(struct task_struct *p)
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}
/*
* Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
*
* If this returns true, then the idle task promises to call
* sched_ttwu_pending() and reschedule soon.
*/
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
if (old == val)
break;
val = old;
}
return true;
}
#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
#ifdef CONFIG_SMP
static bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
#endif
#endif
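For readers unfamiliar with the fetch/compare-exchange retry idiom used by set_nr_if_polling() above, here is a minimal userspace analogue built on C11 atomics. The flag names and values are invented for illustration; this is a sketch of the pattern, not kernel code:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define FLAG_NEED_RESCHED	0x1
    #define FLAG_POLLING		0x2

    static _Atomic unsigned int ti_flags = FLAG_POLLING;

    /*
     * Set FLAG_NEED_RESCHED only if FLAG_POLLING is currently set,
     * mirroring the structure of set_nr_if_polling(): re-read the flag
     * word and retry whenever the compare-exchange loses a race.
     */
    static bool set_nr_if_polling_sketch(void)
    {
    	unsigned int val = atomic_load(&ti_flags);

    	for (;;) {
    		if (!(val & FLAG_POLLING))
    			return false;	/* target is not polling: caller must IPI */
    		if (val & FLAG_NEED_RESCHED)
    			return true;	/* someone else already set it */
    		/* On failure, val is reloaded with the current flag word. */
    		if (atomic_compare_exchange_weak(&ti_flags, &val,
    						 val | FLAG_NEED_RESCHED))
    			return true;
    	}
    }

    int main(void)
    {
    	printf("set_nr_if_polling_sketch() -> %d, flags now 0x%x\n",
    	       set_nr_if_polling_sketch(), atomic_load(&ti_flags));
    	return 0;
    }

The return value carries the same contract as the kernel helper: true means the polling idle task has (or already had) need_resched set and will notice the wakeup on its own, so no IPI is required.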
/*
......@@ -580,6 +612,8 @@ void resched_task(struct task_struct *p)
if (set_nr_and_not_polling(p))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
void resched_cpu(int cpu)
......@@ -642,27 +676,10 @@ static void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;
/*
* This is safe, as this function is called with the timer
* wheel base lock of (cpu) held. When the CPU is on the way
* to idle and has not yet set rq->curr to idle then it will
* be serialized on the timer wheel base lock and take the new
* timer into account automatically.
*/
if (rq->curr != rq->idle)
return;
/*
* We can set TIF_RESCHED on the idle task of the other CPU
* lockless. The worst case is that the other CPU runs the
* idle task through an additional NOOP schedule()
*/
set_tsk_need_resched(rq->idle);
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
if (!tsk_is_polling(rq->idle))
if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
static bool wake_up_full_nohz_cpu(int cpu)
......@@ -888,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
sched_rt_avg_update(rq, irq_delta + steal);
#endif
}
......@@ -1521,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
static void sched_ttwu_pending(void)
void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p;
unsigned long flags;
raw_spin_lock(&rq->lock);
if (!llist)
return;
raw_spin_lock_irqsave(&rq->lock, flags);
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
......@@ -1535,7 +1556,7 @@ static void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, 0);
}
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
void scheduler_ipi(void)
......@@ -1581,8 +1602,14 @@ void scheduler_ipi(void)
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
smp_send_reschedule(cpu);
struct rq *rq = cpu_rq(cpu);
if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
if (!set_nr_if_polling(rq->idle))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
}
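A hedged sketch of the remote-wakeup fast path that ttwu_queue_remote() now implements: push the woken task onto a lock-free per-CPU list and only consider an IPI when the list was previously empty; if the destination's idle task is polling, setting need_resched (as in the earlier sketch) suffices and the IPI is skipped. Everything below, including the list and the fake send_ipi(), is a userspace stand-in rather than kernel code:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct wake_entry {
    	struct wake_entry *next;
    	int pid;			/* stand-in for the task being woken */
    };

    static _Atomic(struct wake_entry *) wake_list;	/* per-rq in the kernel   */
    static atomic_bool idle_is_polling = true;	/* stand-in for TIF_POLLING */

    /* Lock-free push; returns true if the list was empty, like llist_add(). */
    static bool wake_list_add(struct wake_entry *e)
    {
    	struct wake_entry *head = atomic_load(&wake_list);

    	do {
    		e->next = head;
    	} while (!atomic_compare_exchange_weak(&wake_list, &head, e));
    	return head == NULL;
    }

    static void send_ipi(void)	/* stand-in for smp_send_reschedule() */
    {
    	puts("IPI sent");
    }

    static void queue_remote_wakeup(struct wake_entry *e)
    {
    	if (wake_list_add(e)) {
    		/* First entry: the remote CPU must be told. If its idle
    		 * task is polling, the set_nr_if_polling() step sketched
    		 * earlier sets need_resched and the IPI can be skipped,
    		 * which is what sched_wake_idle_without_ipi traces. */
    		if (!atomic_load(&idle_is_polling))
    			send_ipi();
    		else
    			puts("wakeup queued without IPI");
    	}
    }

    int main(void)
    {
    	struct wake_entry a = { .pid = 1 }, b = { .pid = 2 };

    	queue_remote_wakeup(&a);	/* list was empty, idle is polling */
    	queue_remote_wakeup(&b);	/* list non-empty: nothing to send */
    	return 0;
    }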
bool cpus_share_cache(int this_cpu, int that_cpu)
......@@ -4219,7 +4246,7 @@ EXPORT_SYMBOL(yield);
* false (0) if we failed to boost the target.
* -ESRCH if there's no task to yield to.
*/
bool __sched yield_to(struct task_struct *p, bool preempt)
int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
......@@ -5245,14 +5272,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
}
/*
* Even though we initialize ->power to something semi-sane,
* we leave power_orig unset. This allows us to detect if
* Even though we initialize ->capacity to something semi-sane,
* we leave capacity_orig unset. This allows us to detect if
* domain iteration is still funny without causing /0 traps.
*/
if (!group->sgp->power_orig) {
if (!group->sgc->capacity_orig) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
break;
}
......@@ -5274,9 +5300,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
if (group->sgp->power != SCHED_POWER_SCALE) {
printk(KERN_CONT " (cpu_power = %d)",
group->sgp->power);
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
printk(KERN_CONT " (cpu_capacity = %d)",
group->sgc->capacity);
}
group = group->next;
......@@ -5334,7 +5360,7 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
......@@ -5365,7 +5391,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
SD_SHARE_POWERDOMAIN);
......@@ -5490,7 +5516,7 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
static void free_sched_groups(struct sched_group *sg, int free_sgp)
static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
struct sched_group *tmp, *first;
......@@ -5501,8 +5527,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
do {
tmp = sg->next;
if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
kfree(sg->sgp);
if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
kfree(sg->sgc);
kfree(sg);
sg = tmp;
......@@ -5520,7 +5546,7 @@ static void free_sched_domain(struct rcu_head *rcu)
if (sd->flags & SD_OVERLAP) {
free_sched_groups(sd->groups, 1);
} else if (atomic_dec_and_test(&sd->groups->ref)) {
kfree(sd->groups->sgp);
kfree(sd->groups->sgc);
kfree(sd->groups);
}
kfree(sd);
......@@ -5731,17 +5757,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
sg->sgp = *per_cpu_ptr(sdd->sgp, i);
if (atomic_inc_return(&sg->sgp->ref) == 1)
sg->sgc = *per_cpu_ptr(sdd->sgc, i);
if (atomic_inc_return(&sg->sgc->ref) == 1)
build_group_mask(sd, sg);
/*
* Initialize sgp->power such that even if we mess up the
* Initialize sgc->capacity such that even if we mess up the
* domains and no possible iteration will get us here, we won't
* die on a /0 trap.
*/
sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
sg->sgp->power_orig = sg->sgp->power;
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
sg->sgc->capacity_orig = sg->sgc->capacity;
/*
* Make sure the first group of this domain contains the
......@@ -5779,8 +5805,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
if (sg) {
*sg = *per_cpu_ptr(sdd->sg, cpu);
(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
(*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
}
return cpu;
......@@ -5789,7 +5815,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
/*
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
* and ->cpu_capacity to 0.
*
* Assumes the sched_domain tree is fully constructed
*/
......@@ -5843,16 +5869,16 @@ build_sched_groups(struct sched_domain *sd, int cpu)
}
/*
* Initialize sched groups cpu_power.
* Initialize sched groups cpu_capacity.
*
* cpu_power indicates the capacity of sched group, which is used while
* cpu_capacity indicates the capacity of sched group, which is used while
* distributing the load between different sched groups in a sched domain.
* Typically cpu_power for all the groups in a sched domain will be same unless
* there are asymmetries in the topology. If there are asymmetries, group
* having more cpu_power will pickup more load compared to the group having
* less cpu_power.
* Typically cpu_capacity for all the groups in a sched domain will be same
* unless there are asymmetries in the topology. If there are asymmetries,
* group having more cpu_capacity will pickup more load compared to the
* group having less cpu_capacity.
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
......@@ -5866,8 +5892,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
if (cpu != group_balance_cpu(sg))
return;
update_group_power(sd, cpu);
atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
update_group_capacity(sd, cpu);
atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}
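To make the cpu_capacity comment above concrete, a tiny worked example of capacity-proportional load distribution between two sched groups; the capacities and load units are invented:

    #include <stdio.h>

    int main(void)
    {
    	/* Two groups in one domain: a dual-core full-capacity group and
    	 * a single half-capacity core, on the 1024-per-CPU scale. */
    	unsigned long cap_big = 2 * 1024;
    	unsigned long cap_little = 1024 / 2;
    	unsigned long total_load = 1000;	/* arbitrary units */
    	unsigned long total_cap = cap_big + cap_little;

    	/* The balancer aims for load roughly proportional to capacity. */
    	printf("big group    ~ %lu\n", total_load * cap_big / total_cap);
    	printf("little group ~ %lu\n", total_load * cap_little / total_cap);
    	return 0;
    }

With these numbers the 2048-capacity group ends up targeted with about 800 load units and the 512-capacity group with about 200, which is the asymmetry the comment describes.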
/*
......@@ -5958,8 +5984,8 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
*per_cpu_ptr(sdd->sgp, cpu) = NULL;
if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
*per_cpu_ptr(sdd->sgc, cpu) = NULL;
}
#ifdef CONFIG_NUMA
......@@ -5972,7 +5998,7 @@ static int sched_domains_curr_level;
/*
* SD_flags allowed in topology descriptions.
*
* SD_SHARE_CPUPOWER - describes SMT topologies
* SD_SHARE_CPUCAPACITY - describes SMT topologies
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
......@@ -5981,7 +6007,7 @@ static int sched_domains_curr_level;
* SD_ASYM_PACKING - describes SMT quirks
*/
#define TOPOLOGY_SD_FLAGS \
(SD_SHARE_CPUPOWER | \
(SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
......@@ -6027,7 +6053,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
| 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
| 1*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUPOWER
| 0*SD_SHARE_CPUCAPACITY
| 0*SD_SHARE_PKG_RESOURCES
| 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
......@@ -6049,7 +6075,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
* Convert topological properties into behaviour.
*/
if (sd->flags & SD_SHARE_CPUPOWER) {
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
......@@ -6361,14 +6387,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sg)
return -ENOMEM;
sdd->sgp = alloc_percpu(struct sched_group_power *);
if (!sdd->sgp)
sdd->sgc = alloc_percpu(struct sched_group_capacity *);
if (!sdd->sgc)
return -ENOMEM;
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_group *sg;
struct sched_group_power *sgp;
struct sched_group_capacity *sgc;
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
......@@ -6386,12 +6412,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
*per_cpu_ptr(sdd->sg, j) = sg;
sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sgp)
if (!sgc)
return -ENOMEM;
*per_cpu_ptr(sdd->sgp, j) = sgp;
*per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
......@@ -6418,15 +6444,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
if (sdd->sg)
kfree(*per_cpu_ptr(sdd->sg, j));
if (sdd->sgp)
kfree(*per_cpu_ptr(sdd->sgp, j));
if (sdd->sgc)
kfree(*per_cpu_ptr(sdd->sgc, j));
}
free_percpu(sdd->sd);
sdd->sd = NULL;
free_percpu(sdd->sg);
sdd->sg = NULL;
free_percpu(sdd->sgp);
sdd->sgp = NULL;
free_percpu(sdd->sgc);
sdd->sgc = NULL;
}
}
......@@ -6496,14 +6522,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
}
}
/* Calculate CPU power for physical packages and nodes */
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
claim_allocations(i, sd);
init_sched_groups_power(i, sd);
init_sched_groups_capacity(i, sd);
}
}
......@@ -6946,7 +6972,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_power = SCHED_POWER_SCALE;
rq->cpu_capacity = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
......
......@@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
dl_b->dl_runtime = runtime;
}
extern unsigned long to_ratio(u64 period, u64 runtime);
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
......
This diff has been collapsed.
......@@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
SCHED_FEAT(WAKEUP_PREEMPTION, true)
/*
* Use arch dependent cpu power functions
* Use arch dependent cpu capacity functions
*/
SCHED_FEAT(ARCH_POWER, true)
SCHED_FEAT(ARCH_CAPACITY, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
/*
* Decrement CPU power based on time not spent running tasks
* Decrement CPU capacity based on time not spent running tasks
*/
SCHED_FEAT(NONTASK_POWER, true)
SCHED_FEAT(NONTASK_CAPACITY, true)
/*
* Queue remote wakeups on the target CPU and process them
......
......@@ -12,6 +12,8 @@
#include <trace/events/power.h>
#include "sched.h"
static int __read_mostly cpu_idle_force_poll;
void cpu_idle_poll_ctrl(bool enable)
......@@ -67,6 +69,10 @@ void __weak arch_cpu_idle(void)
* cpuidle_idle_call - the main idle function
*
* NOTE: no locks or semaphores should be used here
*
* On archs that support TIF_POLLING_NRFLAG, is called with polling
* set, and it returns with polling set. If it ever stops polling, it
* must clear the polling bit.
*/
static void cpuidle_idle_call(void)
{
......@@ -175,10 +181,22 @@ static void cpuidle_idle_call(void)
/*
* Generic idle loop implementation
*
* Called with polling cleared.
*/
static void cpu_idle_loop(void)
{
while (1) {
/*
* If the arch has a polling bit, we maintain an invariant:
*
* Our polling bit is clear if we're not scheduled (i.e. if
* rq->curr != rq->idle). This means that, if rq->idle has
* the polling bit set, then setting need_resched is
* guaranteed to cause the cpu to reschedule.
*/
__current_set_polling();
tick_nohz_idle_enter();
while (!need_resched()) {
......@@ -218,6 +236,17 @@ static void cpu_idle_loop(void)
*/
preempt_set_need_resched();
tick_nohz_idle_exit();
__current_clr_polling();
/*
* We promise to call sched_ttwu_pending and reschedule
* if need_resched is set while polling is set. That
* means that clearing polling needs to be visible
* before doing these things.
*/
smp_mb__after_atomic();
sched_ttwu_pending();
schedule_preempt_disabled();
}
}
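A compact userspace model of the ordering requirement documented above: the idle path clears its polling flag, issues a full barrier (the analogue of smp_mb__after_atomic()), and only then drains queued wakeups, so a waker that saw polling set and skipped the IPI can never have its wakeup missed. Flag and queue names are invented; this is a sequential sketch of the protocol, not kernel code:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool polling;		/* analogue of TIF_POLLING_NRFLAG */
    static atomic_bool need_resched;	/* analogue of TIF_NEED_RESCHED   */
    static atomic_int  wake_list_len;	/* stand-in for rq->wake_list     */

    /* Waker side: if the idle CPU is polling, queue work and skip the IPI. */
    static void remote_wakeup(void)
    {
    	atomic_fetch_add(&wake_list_len, 1);
    	if (atomic_load(&polling)) {
    		atomic_store(&need_resched, true);
    		puts("waker: idle CPU is polling, no IPI needed");
    	} else {
    		puts("waker: would send IPI here");
    	}
    }

    /* Idle side: honour the promise made by set_nr_if_polling(). */
    static void idle_exit_path(void)
    {
    	atomic_store(&polling, false);
    	/* Clearing polling must be visible before we look at the wake
    	 * list, otherwise a concurrent waker could both skip the IPI
    	 * and have its entry sit unnoticed until the next wakeup. */
    	atomic_thread_fence(memory_order_seq_cst);
    	printf("idle: draining %d queued wakeup(s), then schedule()\n",
    	       atomic_exchange(&wake_list_len, 0));
    }

    int main(void)
    {
    	atomic_store(&polling, true);	/* idle task spins with polling set */
    	remote_wakeup();		/* queued without an IPI */
    	if (atomic_load(&need_resched))
    		idle_exit_path();	/* drain, then reschedule */
    	return 0;
    }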
......@@ -239,7 +268,6 @@ void cpu_startup_entry(enum cpuhp_state state)
*/
boot_init_stack_canary();
#endif
__current_set_polling();
arch_cpu_idle_prepare();
cpu_idle_loop();
}
......@@ -918,7 +918,6 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
u64 delta_exec;
if (curr->sched_class != &rt_sched_class)
......@@ -943,7 +942,7 @@ static void update_curr_rt(struct rq *rq)
return;
for_each_sched_rt_entity(rt_se) {
rt_rq = rt_rq_of_se(rt_se);
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
......
......@@ -567,7 +567,7 @@ struct rq {
struct root_domain *rd;
struct sched_domain *sd;
unsigned long cpu_power;
unsigned long cpu_capacity;
unsigned char idle_balance;
/* For active balancing */
......@@ -670,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
#ifdef CONFIG_SMP
extern void sched_ttwu_pending(void);
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
......@@ -728,15 +730,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_power {
struct sched_group_capacity {
atomic_t ref;
/*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
* single CPU.
* CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
* for a single CPU.
*/
unsigned int power, power_orig;
unsigned int capacity, capacity_orig;
unsigned long next_update;
int imbalance; /* XXX unrelated to power but shared group state */
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
* Number of busy cpus in this group.
*/
......@@ -750,7 +752,7 @@ struct sched_group {
atomic_t ref;
unsigned int group_weight;
struct sched_group_power *sgp;
struct sched_group_capacity *sgc;
/*
* The CPUs this group covers.
......@@ -773,7 +775,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
*/
static inline struct cpumask *sched_group_mask(struct sched_group *sg)
{
return to_cpumask(sg->sgp->cpumask);
return to_cpumask(sg->sgc->cpumask);
}
/**
......@@ -787,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
#else
static inline void sched_ttwu_pending(void) { }
#endif /* CONFIG_SMP */
#include "stats.h"
......@@ -1167,7 +1173,7 @@ extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SMP
extern void update_group_power(struct sched_domain *sd, int cpu);
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
......
......@@ -1714,11 +1714,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
#endif /* !CONFIG_S390 */
bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
int kvm_vcpu_yield_to(struct kvm_vcpu *target)
{
struct pid *pid;
struct task_struct *task = NULL;
bool ret = false;
int ret = 0;
rcu_read_lock();
pid = rcu_dereference(target->pid);
......
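The yield_to()/kvm_vcpu_yield_to() return-type changes above exist because storing a negative errno in a bool collapses it to true, so callers could not tell "boosted the target" apart from "no task to yield to" (-ESRCH). A tiny userspace demonstration, with a local ESRCH stand-in rather than the kernel header:

    #include <stdbool.h>
    #include <stdio.h>

    #define ESRCH 3		/* same numeric value as the Linux errno */

    int main(void)
    {
    	bool as_bool = -ESRCH;	/* any non-zero value becomes true (1) */
    	int  as_int  = -ESRCH;	/* keeps the -ESRCH error code         */

    	printf("bool: %d  int: %d\n", as_bool, as_int);
    	return 0;
    }

Widening the return type to int preserves the error code all the way back to the caller, which is what the "Fix signedness bug in yield_to()" patch in the shortlog is about.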