Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
openeuler
Kernel
提交
d14c8a68
K
Kernel
项目概览
openeuler
/
Kernel
大约 1 年 前同步成功
通知
5
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
K
Kernel
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
d14c8a68
编写于
7月 14, 2008
作者:
I
Ingo Molnar
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'sched/for-linus' into tracing/for-linus
上级
d59fdcf2
873a6ed6
变更
18
展开全部
隐藏空白更改
内联
并排
Showing
18 changed file
with
1555 addition
and
570 deletion
+1555
-570
Documentation/scheduler/sched-domains.txt
Documentation/scheduler/sched-domains.txt
+2
-5
Documentation/scheduler/sched-rt-group.txt
Documentation/scheduler/sched-rt-group.txt
+2
-2
include/linux/sched.h
include/linux/sched.h
+30
-29
kernel/Makefile
kernel/Makefile
+3
-2
kernel/cpu.c
kernel/cpu.c
+24
-0
kernel/cpuset.c
kernel/cpuset.c
+13
-1
kernel/kthread.c
kernel/kthread.c
+1
-0
kernel/sched.c
kernel/sched.c
+491
-232
kernel/sched_clock.c
kernel/sched_clock.c
+118
-19
kernel/sched_cpupri.c
kernel/sched_cpupri.c
+174
-0
kernel/sched_cpupri.h
kernel/sched_cpupri.h
+36
-0
kernel/sched_debug.c
kernel/sched_debug.c
+59
-5
kernel/sched_fair.c
kernel/sched_fair.c
+290
-123
kernel/sched_features.h
kernel/sched_features.h
+5
-2
kernel/sched_rt.c
kernel/sched_rt.c
+264
-141
kernel/sched_stats.h
kernel/sched_stats.h
+33
-9
kernel/sysctl.c
kernel/sysctl.c
+8
-0
kernel/time/tick-sched.c
kernel/time/tick-sched.c
+2
-0
未找到文件。
Documentation/scheduler/sched-domains.txt
浏览文件 @
d14c8a68
...
...
@@ -61,10 +61,7 @@ builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
arch_init_sched_domains function. This function will attach domains to all
CPUs using cpu_attach_domain.
Implementors should change the line
#undef SCHED_DOMAIN_DEBUG
to
#define SCHED_DOMAIN_DEBUG
in kernel/sched.c as this enables an error checking parse of the sched domains
The sched-domains debugging infrastructure can be enabled by enabling
CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
which should catch most possible errors (described above). It also prints out
the domain structure in a visual format.
Documentation/scheduler/sched-rt-group.txt
浏览文件 @
d14c8a68
...
...
@@ -51,9 +51,9 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
0.00015s. So this group can be scheduled with a period of 0.005s and a run time
of 0.00015s.
The remaining CPU time will be used for user input and other tass. Because
The remaining CPU time will be used for user input and other tas
k
s. Because
realtime tasks have explicitly allocated the CPU time they need to perform
their tasks, buffer underruns in the graph
o
cs or audio can be eliminated.
their tasks, buffer underruns in the graph
i
cs or audio can be eliminated.
NOTE: the above example is not fully implemented as of yet (2.6.25). We still
lack an EDF scheduler to make non-uniform periods usable.
...
...
include/linux/sched.h
浏览文件 @
d14c8a68
...
...
@@ -134,7 +134,6 @@ extern unsigned long nr_running(void);
extern
unsigned
long
nr_uninterruptible
(
void
);
extern
unsigned
long
nr_active
(
void
);
extern
unsigned
long
nr_iowait
(
void
);
extern
unsigned
long
weighted_cpuload
(
const
int
cpu
);
struct
seq_file
;
struct
cfs_rq
;
...
...
@@ -784,6 +783,8 @@ struct sched_domain {
unsigned
int
balance_interval
;
/* initialise to 1. units in ms. */
unsigned
int
nr_balance_failed
;
/* initialise to 0 */
u64
last_update
;
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned
int
lb_count
[
CPU_MAX_IDLE_TYPES
];
...
...
@@ -823,23 +824,6 @@ extern int arch_reinit_sched_domains(void);
#endif
/* CONFIG_SMP */
/*
* A runqueue laden with a single nice 0 task scores a weighted_cpuload of
* SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
* task of nice 0 or enough lower priority tasks to bring up the
* weighted_cpuload
*/
/* Returns 1 as soon as one online CPU has weighted_cpuload() >= SCHED_LOAD_SCALE, else 0. */
static inline int above_background_load(void)
{
	unsigned long cpu;
	int laden = 0;

	for_each_online_cpu(cpu) {
		if (weighted_cpuload(cpu) < SCHED_LOAD_SCALE)
			continue;
		/* A single sufficiently loaded CPU settles the answer. */
		laden = 1;
		break;
	}

	return laden;
}
struct
io_context
;
/* See blkdev.h */
#define NGROUPS_SMALL 32
#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
...
...
@@ -921,8 +905,8 @@ struct sched_class {
void
(
*
set_cpus_allowed
)(
struct
task_struct
*
p
,
const
cpumask_t
*
newmask
);
void
(
*
join_domain
)(
struct
rq
*
rq
);
void
(
*
leave_domain
)(
struct
rq
*
rq
);
void
(
*
rq_online
)(
struct
rq
*
rq
);
void
(
*
rq_offline
)(
struct
rq
*
rq
);
void
(
*
switched_from
)
(
struct
rq
*
this_rq
,
struct
task_struct
*
task
,
int
running
);
...
...
@@ -1039,6 +1023,7 @@ struct task_struct {
#endif
int
prio
,
static_prio
,
normal_prio
;
unsigned
int
rt_priority
;
const
struct
sched_class
*
sched_class
;
struct
sched_entity
se
;
struct
sched_rt_entity
rt
;
...
...
@@ -1122,7 +1107,6 @@ struct task_struct {
int
__user
*
set_child_tid
;
/* CLONE_CHILD_SETTID */
int
__user
*
clear_child_tid
;
/* CLONE_CHILD_CLEARTID */
unsigned
int
rt_priority
;
cputime_t
utime
,
stime
,
utimescaled
,
stimescaled
;
cputime_t
gtime
;
cputime_t
prev_utime
,
prev_stime
;
...
...
@@ -1141,12 +1125,12 @@ struct task_struct {
gid_t
gid
,
egid
,
sgid
,
fsgid
;
struct
group_info
*
group_info
;
kernel_cap_t
cap_effective
,
cap_inheritable
,
cap_permitted
,
cap_bset
;
unsigned
securebits
;
struct
user_struct
*
user
;
unsigned
securebits
;
#ifdef CONFIG_KEYS
unsigned
char
jit_keyring
;
/* default keyring to attach requested keys to */
struct
key
*
request_key_auth
;
/* assumed request_key authority */
struct
key
*
thread_keyring
;
/* keyring private to this thread */
unsigned
char
jit_keyring
;
/* default keyring to attach requested keys to */
#endif
char
comm
[
TASK_COMM_LEN
];
/* executable name excluding path
- access with [gs]et_task_comm (which lock
...
...
@@ -1233,8 +1217,8 @@ struct task_struct {
# define MAX_LOCK_DEPTH 48UL
u64
curr_chain_key
;
int
lockdep_depth
;
struct
held_lock
held_locks
[
MAX_LOCK_DEPTH
];
unsigned
int
lockdep_recursion
;
struct
held_lock
held_locks
[
MAX_LOCK_DEPTH
];
#endif
/* journalling filesystem info */
...
...
@@ -1262,10 +1246,6 @@ struct task_struct {
u64
acct_vm_mem1
;
/* accumulated virtual memory usage */
cputime_t
acct_stimexpd
;
/* stime since last update */
#endif
#ifdef CONFIG_NUMA
struct
mempolicy
*
mempolicy
;
short
il_next
;
#endif
#ifdef CONFIG_CPUSETS
nodemask_t
mems_allowed
;
int
cpuset_mems_generation
;
...
...
@@ -1284,6 +1264,10 @@ struct task_struct {
#endif
struct
list_head
pi_state_list
;
struct
futex_pi_state
*
pi_state_cache
;
#endif
#ifdef CONFIG_NUMA
struct
mempolicy
*
mempolicy
;
short
il_next
;
#endif
atomic_t
fs_excl
;
/* holding fs exclusive resources */
struct
rcu_head
rcu
;
...
...
@@ -1504,6 +1488,7 @@ static inline void put_task_struct(struct task_struct *t)
#define PF_SWAPWRITE 0x00800000
/* Allowed to write to swap */
#define PF_SPREAD_PAGE 0x01000000
/* Spread page cache over cpuset */
#define PF_SPREAD_SLAB 0x02000000
/* Spread some slab caches over cpuset */
#define PF_THREAD_BOUND 0x04000000
/* Thread bound to specific cpu */
#define PF_MEMPOLICY 0x10000000
/* Non-default NUMA mempolicy */
#define PF_MUTEX_TESTER 0x20000000
/* Thread belongs to the rt mutex tester */
#define PF_FREEZER_SKIP 0x40000000
/* Freezer should not count it as freezeable */
...
...
@@ -1573,13 +1558,28 @@ static inline void sched_clock_idle_sleep_event(void)
static
inline
void
sched_clock_idle_wakeup_event
(
u64
delta_ns
)
{
}
#else
#ifdef CONFIG_NO_HZ
static
inline
void
sched_clock_tick_stop
(
int
cpu
)
{
}
static
inline
void
sched_clock_tick_start
(
int
cpu
)
{
}
#endif
#else
/* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
extern
void
sched_clock_init
(
void
);
extern
u64
sched_clock_cpu
(
int
cpu
);
extern
void
sched_clock_tick
(
void
);
extern
void
sched_clock_idle_sleep_event
(
void
);
extern
void
sched_clock_idle_wakeup_event
(
u64
delta_ns
);
#ifdef CONFIG_NO_HZ
extern
void
sched_clock_tick_stop
(
int
cpu
);
extern
void
sched_clock_tick_start
(
int
cpu
);
#endif
#endif
/* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
/*
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
...
...
@@ -1622,6 +1622,7 @@ extern unsigned int sysctl_sched_child_runs_first;
extern
unsigned
int
sysctl_sched_features
;
extern
unsigned
int
sysctl_sched_migration_cost
;
extern
unsigned
int
sysctl_sched_nr_migrate
;
extern
unsigned
int
sysctl_sched_shares_ratelimit
;
int
sched_nr_latency_handler
(
struct
ctl_table
*
table
,
int
write
,
struct
file
*
file
,
void
__user
*
buffer
,
size_t
*
length
,
...
...
kernel/Makefile
浏览文件 @
d14c8a68
...
...
@@ -3,7 +3,7 @@
#
obj-y
=
sched.o fork.o exec_domain.o panic.o printk.o profile.o
\
exit.o itimer.o time.o softirq.o resource.o
\
cpu.o
exit.o itimer.o time.o softirq.o resource.o
\
sysctl.o capability.o ptrace.o timer.o user.o
\
signal.o sys.o kmod.o workqueue.o pid.o
\
rcupdate.o extable.o params.o posix-timers.o
\
...
...
@@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES)
+=
rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER)
+=
rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA)
+=
dma.o
obj-$(CONFIG_SMP)
+=
cpu.o
spinlock.o
obj-$(CONFIG_SMP)
+=
spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK)
+=
spinlock.o
obj-$(CONFIG_PROVE_LOCKING)
+=
spinlock.o
obj-$(CONFIG_UID16)
+=
uid16.o
...
...
@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS)
+=
taskstats.o tsacct.o
obj-$(CONFIG_MARKERS)
+=
marker.o
obj-$(CONFIG_LATENCYTOP)
+=
latencytop.o
obj-$(CONFIG_SMP)
+=
sched_cpupri.o
ifneq
($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
...
...
kernel/cpu.c
浏览文件 @
d14c8a68
...
...
@@ -15,6 +15,28 @@
#include <linux/stop_machine.h>
#include <linux/mutex.h>
/*
* Represents all CPUs present in the system.
* In systems capable of hotplug, this map could dynamically grow
* as new CPUs are detected in the system via any platform-specific
* method, such as ACPI.
*/
cpumask_t
cpu_present_map
__read_mostly
;
EXPORT_SYMBOL
(
cpu_present_map
);
#ifndef CONFIG_SMP
/*
* Represents all cpu's that are currently online.
*/
cpumask_t
cpu_online_map
__read_mostly
=
CPU_MASK_ALL
;
EXPORT_SYMBOL
(
cpu_online_map
);
cpumask_t
cpu_possible_map
__read_mostly
=
CPU_MASK_ALL
;
EXPORT_SYMBOL
(
cpu_possible_map
);
#else
/* CONFIG_SMP */
/* Serializes the updates to cpu_online_map, cpu_present_map */
static
DEFINE_MUTEX
(
cpu_add_remove_lock
);
...
...
@@ -403,3 +425,5 @@ void __ref enable_nonboot_cpus(void)
cpu_maps_update_done
();
}
#endif
/* CONFIG_PM_SLEEP_SMP */
#endif
/* CONFIG_SMP */
kernel/cpuset.c
浏览文件 @
d14c8a68
...
...
@@ -1194,6 +1194,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
if
(
cpus_empty
(
cs
->
cpus_allowed
)
||
nodes_empty
(
cs
->
mems_allowed
))
return
-
ENOSPC
;
if
(
tsk
->
flags
&
PF_THREAD_BOUND
)
{
cpumask_t
mask
;
mutex_lock
(
&
callback_mutex
);
mask
=
cs
->
cpus_allowed
;
mutex_unlock
(
&
callback_mutex
);
if
(
!
cpus_equal
(
tsk
->
cpus_allowed
,
mask
))
return
-
EINVAL
;
}
return
security_task_setscheduler
(
tsk
,
0
,
NULL
);
}
...
...
@@ -1207,11 +1216,14 @@ static void cpuset_attach(struct cgroup_subsys *ss,
struct
mm_struct
*
mm
;
struct
cpuset
*
cs
=
cgroup_cs
(
cont
);
struct
cpuset
*
oldcs
=
cgroup_cs
(
oldcont
);
int
err
;
mutex_lock
(
&
callback_mutex
);
guarantee_online_cpus
(
cs
,
&
cpus
);
set_cpus_allowed_ptr
(
tsk
,
&
cpus
);
err
=
set_cpus_allowed_ptr
(
tsk
,
&
cpus
);
mutex_unlock
(
&
callback_mutex
);
if
(
err
)
return
;
from
=
oldcs
->
mems_allowed
;
to
=
cs
->
mems_allowed
;
...
...
kernel/kthread.c
浏览文件 @
d14c8a68
...
...
@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
set_task_cpu
(
k
,
cpu
);
k
->
cpus_allowed
=
cpumask_of_cpu
(
cpu
);
k
->
rt
.
nr_cpus_allowed
=
1
;
k
->
flags
|=
PF_THREAD_BOUND
;
}
EXPORT_SYMBOL
(
kthread_bind
);
...
...
kernel/sched.c
浏览文件 @
d14c8a68
此差异已折叠。
点击以展开。
kernel/sched_clock.c
浏览文件 @
d14c8a68
...
...
@@ -3,6 +3,9 @@
*
* Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*
* Updates and enhancements:
* Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
*
* Based on code by:
* Ingo Molnar <mingo@redhat.com>
* Guillaume Chazarain <guichaz@gmail.com>
...
...
@@ -32,6 +35,11 @@
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
#define MULTI_SHIFT 15
/* Max is double, Min is 1/2 */
#define MAX_MULTI (2LL << MULTI_SHIFT)
#define MIN_MULTI (1LL << (MULTI_SHIFT-1))
struct
sched_clock_data
{
/*
* Raw spinlock - this is a special case: this might be called
...
...
@@ -40,11 +48,15 @@ struct sched_clock_data {
*/
raw_spinlock_t
lock
;
unsigned
long
prev
_jiffies
;
unsigned
long
tick
_jiffies
;
u64
prev_raw
;
u64
tick_raw
;
u64
tick_gtod
;
u64
clock
;
s64
multi
;
#ifdef CONFIG_NO_HZ
int
check_max
;
#endif
};
static
DEFINE_PER_CPU_SHARED_ALIGNED
(
struct
sched_clock_data
,
sched_clock_data
);
...
...
@@ -71,41 +83,91 @@ void sched_clock_init(void)
struct
sched_clock_data
*
scd
=
cpu_sdc
(
cpu
);
scd
->
lock
=
(
raw_spinlock_t
)
__RAW_SPIN_LOCK_UNLOCKED
;
scd
->
prev
_jiffies
=
now_jiffies
;
scd
->
tick
_jiffies
=
now_jiffies
;
scd
->
prev_raw
=
0
;
scd
->
tick_raw
=
0
;
scd
->
tick_gtod
=
ktime_now
;
scd
->
clock
=
ktime_now
;
scd
->
multi
=
1
<<
MULTI_SHIFT
;
#ifdef CONFIG_NO_HZ
scd
->
check_max
=
1
;
#endif
}
sched_clock_running
=
1
;
}
#ifdef CONFIG_NO_HZ
/*
* Dynamic ticks make the jiffies delta inaccurate, which
* prevents us from checking the maximum time update.
* Disable the maximum check while ticks are stopped.
*/
void
sched_clock_tick_stop
(
int
cpu
)
{
struct
sched_clock_data
*
scd
=
cpu_sdc
(
cpu
);
scd
->
check_max
=
0
;
}
void
sched_clock_tick_start
(
int
cpu
)
{
struct
sched_clock_data
*
scd
=
cpu_sdc
(
cpu
);
scd
->
check_max
=
1
;
}
/*
 * CONFIG_NO_HZ variant: report @scd's check_max flag, which
 * sched_clock_tick_stop() clears and sched_clock_tick_start() sets.
 */
static
int
check_max
(
struct
sched_clock_data
*
scd
)
{
return
scd
->
check_max
;
}
#else
/* Variant for the #else branch of CONFIG_NO_HZ: ignores @scd and always returns 1. */
static
int
check_max
(
struct
sched_clock_data
*
scd
)
{
return
1
;
}
#endif
/* CONFIG_NO_HZ */
/*
* update the percpu scd from the raw @now value
*
* - filter out backward motion
* - use jiffies to generate a min,max window to clip the raw values
*/
static
void
__update_sched_clock
(
struct
sched_clock_data
*
scd
,
u64
now
)
static
void
__update_sched_clock
(
struct
sched_clock_data
*
scd
,
u64
now
,
u64
*
time
)
{
unsigned
long
now_jiffies
=
jiffies
;
long
delta_jiffies
=
now_jiffies
-
scd
->
prev
_jiffies
;
long
delta_jiffies
=
now_jiffies
-
scd
->
tick
_jiffies
;
u64
clock
=
scd
->
clock
;
u64
min_clock
,
max_clock
;
s64
delta
=
now
-
scd
->
prev_raw
;
WARN_ON_ONCE
(
!
irqs_disabled
());
min_clock
=
scd
->
tick_gtod
+
delta_jiffies
*
TICK_NSEC
;
/*
* At schedule tick the clock can be just under the gtod. We don't
* want to push it too prematurely.
*/
min_clock
=
scd
->
tick_gtod
+
(
delta_jiffies
*
TICK_NSEC
);
if
(
min_clock
>
TICK_NSEC
)
min_clock
-=
TICK_NSEC
/
2
;
if
(
unlikely
(
delta
<
0
))
{
clock
++
;
goto
out
;
}
max_clock
=
min_clock
+
TICK_NSEC
;
/*
* The clock must stay within a jiffy of the gtod.
* But since we may be at the start of a jiffy or the end of one
* we add another jiffy buffer.
*/
max_clock
=
scd
->
tick_gtod
+
(
2
+
delta_jiffies
)
*
TICK_NSEC
;
delta
*=
scd
->
multi
;
delta
>>=
MULTI_SHIFT
;
if
(
unlikely
(
clock
+
delta
>
max_clock
))
{
if
(
unlikely
(
clock
+
delta
>
max_clock
)
&&
check_max
(
scd
)
)
{
if
(
clock
<
max_clock
)
clock
=
max_clock
;
else
...
...
@@ -118,9 +180,12 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
if
(
unlikely
(
clock
<
min_clock
))
clock
=
min_clock
;
scd
->
prev_raw
=
now
;
scd
->
prev_jiffies
=
now_jiffies
;
scd
->
clock
=
clock
;
if
(
time
)
*
time
=
clock
;
else
{
scd
->
prev_raw
=
now
;
scd
->
clock
=
clock
;
}
}
static
void
lock_double_clock
(
struct
sched_clock_data
*
data1
,
...
...
@@ -160,25 +225,30 @@ u64 sched_clock_cpu(int cpu)
now
-=
my_scd
->
tick_raw
;
now
+=
scd
->
tick_raw
;
now
-
=
my_scd
->
tick_gtod
;
now
+
=
scd
->
tick_gtod
;
now
+
=
my_scd
->
tick_gtod
;
now
-
=
scd
->
tick_gtod
;
__raw_spin_unlock
(
&
my_scd
->
lock
);
__update_sched_clock
(
scd
,
now
,
&
clock
);
__raw_spin_unlock
(
&
scd
->
lock
);
}
else
{
__raw_spin_lock
(
&
scd
->
lock
);
__update_sched_clock
(
scd
,
now
,
NULL
);
clock
=
scd
->
clock
;
__raw_spin_unlock
(
&
scd
->
lock
);
}
__update_sched_clock
(
scd
,
now
);
clock
=
scd
->
clock
;
__raw_spin_unlock
(
&
scd
->
lock
);
return
clock
;
}
void
sched_clock_tick
(
void
)
{
struct
sched_clock_data
*
scd
=
this_scd
();
unsigned
long
now_jiffies
=
jiffies
;
s64
mult
,
delta_gtod
,
delta_raw
;
u64
now
,
now_gtod
;
if
(
unlikely
(
!
sched_clock_running
))
...
...
@@ -186,18 +256,33 @@ void sched_clock_tick(void)
WARN_ON_ONCE
(
!
irqs_disabled
());
now
=
sched_clock
();
now_gtod
=
ktime_to_ns
(
ktime_get
());
now
=
sched_clock
();
__raw_spin_lock
(
&
scd
->
lock
);
__update_sched_clock
(
scd
,
now
);
__update_sched_clock
(
scd
,
now
,
NULL
);
/*
* update tick_gtod after __update_sched_clock() because that will
* already observe 1 new jiffy; adding a new tick_gtod to that would
* increase the clock 2 jiffies.
*/
delta_gtod
=
now_gtod
-
scd
->
tick_gtod
;
delta_raw
=
now
-
scd
->
tick_raw
;
if
((
long
)
delta_raw
>
0
)
{
mult
=
delta_gtod
<<
MULTI_SHIFT
;
do_div
(
mult
,
delta_raw
);
scd
->
multi
=
mult
;
if
(
scd
->
multi
>
MAX_MULTI
)
scd
->
multi
=
MAX_MULTI
;
else
if
(
scd
->
multi
<
MIN_MULTI
)
scd
->
multi
=
MIN_MULTI
;
}
else
scd
->
multi
=
1
<<
MULTI_SHIFT
;
scd
->
tick_raw
=
now
;
scd
->
tick_gtod
=
now_gtod
;
scd
->
tick_jiffies
=
now_jiffies
;
__raw_spin_unlock
(
&
scd
->
lock
);
}
...
...
@@ -227,6 +312,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
__raw_spin_lock
(
&
scd
->
lock
);
scd
->
prev_raw
=
now
;
scd
->
clock
+=
delta_ns
;
scd
->
multi
=
1
<<
MULTI_SHIFT
;
__raw_spin_unlock
(
&
scd
->
lock
);
touch_softlockup_watchdog
();
...
...
@@ -244,3 +330,16 @@ unsigned long long __attribute__((weak)) sched_clock(void)
{
return
(
unsigned
long
long
)
jiffies
*
(
NSEC_PER_SEC
/
HZ
);
}
/*
 * Return sched_clock_cpu(@cpu), sampled with local interrupts disabled;
 * the caller's interrupt state is restored before returning.
 */
unsigned long long cpu_clock(int cpu)
{
	unsigned long irq_state;
	unsigned long long now;

	local_irq_save(irq_state);
	now = sched_clock_cpu(cpu);
	local_irq_restore(irq_state);

	return now;
}
EXPORT_SYMBOL_GPL
(
cpu_clock
);
kernel/sched_cpupri.c
0 → 100644
浏览文件 @
d14c8a68
/*
* kernel/sched_cpupri.c
*
* CPU priority management
*
* Copyright (C) 2007-2008 Novell
*
* Author: Gregory Haskins <ghaskins@novell.com>
*
* This code tracks the priority of each CPU so that global migration
* decisions are easy to calculate. Each CPU can be in a state as follows:
*
* (INVALID), IDLE, NORMAL, RT1, ... RT99
*
* going from the lowest priority to the highest. CPUs in the INVALID state
* are not eligible for routing. The system maintains this state with
* a 2 dimensional bitmap (the first for priority class, the second for cpus
* in that class). Therefore a typical application without affinity
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
* searches). For tasks with affinity restrictions, the algorithm has a
* worst case complexity of O(min(102, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include "sched_cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
/*
 * Map a task->prio style value onto the cpupri scale:
 *   CPUPRI_INVALID       -> CPUPRI_INVALID
 *   MAX_PRIO             -> CPUPRI_IDLE
 *   other >= MAX_RT_PRIO -> CPUPRI_NORMAL
 *   anything else        -> MAX_RT_PRIO - prio + 1
 */
static int convert_prio(int prio)
{
	if (prio == CPUPRI_INVALID)
		return CPUPRI_INVALID;

	if (prio == MAX_PRIO)
		return CPUPRI_IDLE;

	if (prio >= MAX_RT_PRIO)
		return CPUPRI_NORMAL;

	return MAX_RT_PRIO - prio + 1;
}
/*
 * Loop @idx over the set bits of @array below CPUPRI_NR_PRIORITIES,
 * using find_first_bit() to start and find_next_bit() to advance.
 */
#define for_each_cpupri_active(array, idx) \
for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
idx < CPUPRI_NR_PRIORITIES; \
idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
/**
* cpupri_find - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
* @lowest_mask: A mask to fill in with selected CPUs
*
* Note: This function returns the recommended CPUs as calculated during the
* current invocation. By the time the call returns, the CPUs may have in
* fact changed priorities any number of times. While not ideal, it is not
* an issue of correctness since the normal rebalancer logic will correct
* any discrepancies created by racing against the uncertainty of the current
* priority configuration.
*
* Returns: (int)bool - CPUs were found
*/
int cpupri_find(struct cpupri *cp, struct task_struct *p,
		cpumask_t *lowest_mask)
{
	const int limit = convert_prio(p->prio);
	int level = 0;

	/* Walk the active priority levels that the iterator yields. */
	for_each_cpupri_active(cp->pri_active, level) {
		cpumask_t candidates;

		/* Levels at or above the task's own cpupri are never returned. */
		if (level >= limit)
			return 0;

		/* Keep only the CPUs at this level that the task may run on. */
		cpus_and(candidates, p->cpus_allowed, cp->pri_to_cpu[level].mask);
		if (!cpus_empty(candidates)) {
			*lowest_mask = candidates;
			return 1;
		}
	}

	return 0;
}
/**
* cpupri_set - update the cpu priority setting
* @cp: The cpupri context
* @cpu: The target cpu
* @newpri: The priority (INVALID-RT99) to assign to this CPU
*
* Note: Assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
void cpupri_set(struct cpupri *cp, int cpu, int newpri)
{
	int *slot = &cp->cpu_to_pri[cpu];
	const int oldpri = *slot;
	struct cpupri_vec *vec;
	unsigned long flags;

	/* The caller passes a task-style prio; work in cpupri units from here on. */
	newpri = convert_prio(newpri);

	BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);

	if (oldpri == newpri)
		return;

	/* Leave the old level first: drop its count and clear our bit in its mask. */
	if (likely(oldpri != CPUPRI_INVALID)) {
		vec = &cp->pri_to_cpu[oldpri];

		spin_lock_irqsave(&vec->lock, flags);
		/* The last CPU leaving a level also marks the level inactive. */
		if (--vec->count == 0)
			clear_bit(oldpri, cp->pri_active);
		cpu_clear(cpu, vec->mask);
		spin_unlock_irqrestore(&vec->lock, flags);
	}

	/* Then join the new level, unless the CPU is being marked invalid. */
	if (likely(newpri != CPUPRI_INVALID)) {
		vec = &cp->pri_to_cpu[newpri];

		spin_lock_irqsave(&vec->lock, flags);
		cpu_set(cpu, vec->mask);
		/* The first CPU entering a level also marks the level active. */
		if (++vec->count == 1)
			set_bit(newpri, cp->pri_active);
		spin_unlock_irqrestore(&vec->lock, flags);
	}

	*slot = newpri;
}
/**
* cpupri_init - initialize the cpupri structure
* @cp: The cpupri context
*
* Returns: (void)
*/
void
cpupri_init
(
struct
cpupri
*
cp
)
{
int
i
;
memset
(
cp
,
0
,
sizeof
(
*
cp
));
for
(
i
=
0
;
i
<
CPUPRI_NR_PRIORITIES
;
i
++
)
{
struct
cpupri_vec
*
vec
=
&
cp
->
pri_to_cpu
[
i
];
spin_lock_init
(
&
vec
->
lock
);
vec
->
count
=
0
;
cpus_clear
(
vec
->
mask
);
}
for_each_possible_cpu
(
i
)
cp
->
cpu_to_pri
[
i
]
=
CPUPRI_INVALID
;
}
kernel/sched_cpupri.h
0 → 100644
浏览文件 @
d14c8a68
#ifndef _LINUX_CPUPRI_H
#define _LINUX_CPUPRI_H
#include <linux/sched.h>
#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
#define CPUPRI_INVALID -1
#define CPUPRI_IDLE 0
#define CPUPRI_NORMAL 1
/* values 2-101 are RT priorities 0-99 */
/* Bookkeeping for one cpupri level: which CPUs are at it and how many. */
struct
cpupri_vec
{
/* held by cpupri_set() around its updates of count and mask */
spinlock_t
lock
;
/* number of CPUs recorded at this level; cpupri_set() decrements/increments it */
int
count
;
/* the CPUs recorded at this level; cpupri_find() ANDs it with the task's cpus_allowed */
cpumask_t
mask
;
};
/* A cpupri context: records the cpupri level of every CPU and the CPUs at every level. */
struct
cpupri
{
/* one vector per level, indexed by cpupri value */
struct
cpupri_vec
pri_to_cpu
[
CPUPRI_NR_PRIORITIES
];
/* one bit per level; cpupri_set() sets it when a level's count becomes 1 and clears it at 0 */
long
pri_active
[
CPUPRI_NR_PRI_WORDS
];
/* current cpupri value of each CPU; cpupri_init() sets CPUPRI_INVALID for every possible CPU */
int
cpu_to_pri
[
NR_CPUS
];
};
#ifdef CONFIG_SMP
int
cpupri_find
(
struct
cpupri
*
cp
,
struct
task_struct
*
p
,
cpumask_t
*
lowest_mask
);
void
cpupri_set
(
struct
cpupri
*
cp
,
int
cpu
,
int
pri
);
void
cpupri_init
(
struct
cpupri
*
cp
);
#else
/*
 * !CONFIG_SMP stubs: both calls expand to an empty statement.
 * Each stub takes the same arguments as the function it replaces, so a
 * caller written against the CONFIG_SMP prototypes still preprocesses.
 */
#define cpupri_set(cp, cpu, pri) do { } while (0)
/* Was defined with no parameters, which rejected cpupri_init(cp). */
#define cpupri_init(cp) do { } while (0)
#endif
#endif
/* _LINUX_CPUPRI_H */
kernel/sched_debug.c
浏览文件 @
d14c8a68
...
...
@@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
struct
sched_entity
*
last
;
unsigned
long
flags
;
#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED)
SEQ_printf
(
m
,
"
\n
cfs_rq[%d]:
\n
"
,
cpu
);
#else
#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
char
path
[
128
]
=
""
;
struct
cgroup
*
cgroup
=
NULL
;
struct
task_group
*
tg
=
cfs_rq
->
tg
;
...
...
@@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cgroup_path
(
cgroup
,
path
,
sizeof
(
path
));
SEQ_printf
(
m
,
"
\n
cfs_rq[%d]:%s
\n
"
,
cpu
,
path
);
#else
SEQ_printf
(
m
,
"
\n
cfs_rq[%d]:
\n
"
,
cpu
);
#endif
SEQ_printf
(
m
,
" .%-30s: %Ld.%06ld
\n
"
,
"exec_clock"
,
...
...
@@ -162,11 +162,64 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf
(
m
,
" .%-30s: %ld
\n
"
,
"nr_running"
,
cfs_rq
->
nr_running
);
SEQ_printf
(
m
,
" .%-30s: %ld
\n
"
,
"load"
,
cfs_rq
->
load
.
weight
);
#ifdef CONFIG_SCHEDSTATS
SEQ_printf
(
m
,
" .%-30s: %d
\n
"
,
"bkl_count"
,
rq
->
bkl_count
);
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
P
(
yld_exp_empty
);
P
(
yld_act_empty
);
P
(
yld_both_empty
);
P
(
yld_count
);
P
(
sched_switch
);
P
(
sched_count
);
P
(
sched_goidle
);
P
(
ttwu_count
);
P
(
ttwu_local
);
P
(
bkl_count
);
#undef P
#endif
SEQ_printf
(
m
,
" .%-30s: %ld
\n
"
,
"nr_spread_over"
,
cfs_rq
->
nr_spread_over
);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
SEQ_printf
(
m
,
" .%-30s: %lu
\n
"
,
"shares"
,
cfs_rq
->
shares
);
#endif
#endif
}
/*
 * Print one RT runqueue to seq_file @m: a "rt_rq[<cpu>]:" header line,
 * followed by the rt_nr_running, rt_throttled, rt_time and rt_runtime
 * fields of @rt_rq. When both CONFIG_CGROUP_SCHED and
 * CONFIG_RT_GROUP_SCHED are defined, the header also carries the path
 * buffer filled by cgroup_path().
 */
void
print_rt_rq
(
struct
seq_file
*
m
,
int
cpu
,
struct
rt_rq
*
rt_rq
)
{
#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
/* path stays "" unless the rt_rq has a task group with a cgroup */
char
path
[
128
]
=
""
;
struct
cgroup
*
cgroup
=
NULL
;
struct
task_group
*
tg
=
rt_rq
->
tg
;
/* both the task group and its cgroup may be NULL, so each is checked */
if
(
tg
)
cgroup
=
tg
->
css
.
cgroup
;
if
(
cgroup
)
cgroup_path
(
cgroup
,
path
,
sizeof
(
path
));
SEQ_printf
(
m
,
"
\n
rt_rq[%d]:%s
\n
"
,
cpu
,
path
);
#else
SEQ_printf
(
m
,
"
\n
rt_rq[%d]:
\n
"
,
cpu
);
#endif
/*
 * Local helpers, undefined again before the function ends:
 * P(x) prints rt_rq->x cast to long long; PN(x) prints it via SPLIT_NS().
 */
#define P(x) \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
#define PN(x) \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
P
(
rt_nr_running
);
P
(
rt_throttled
);
PN
(
rt_time
);
PN
(
rt_runtime
);
#undef PN
#undef P
}
static
void
print_cpu
(
struct
seq_file
*
m
,
int
cpu
)
...
...
@@ -208,6 +261,7 @@ static void print_cpu(struct seq_file *m, int cpu)
#undef PN
print_cfs_stats
(
m
,
cpu
);
print_rt_stats
(
m
,
cpu
);
print_rq
(
m
,
rq
,
cpu
);
}
...
...
kernel/sched_fair.c
浏览文件 @
d14c8a68
...
...
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
/*
* SCHED_OTHER wake-up granularity.
* (default:
10
msec * (1 + ilog(ncpus)), units: nanoseconds)
* (default:
5
msec * (1 + ilog(ncpus)), units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*/
unsigned
int
sysctl_sched_wakeup_granularity
=
10
000000UL
;
unsigned
int
sysctl_sched_wakeup_granularity
=
5
000000UL
;
const_debug
unsigned
int
sysctl_sched_migration_cost
=
500000UL
;
...
...
@@ -333,6 +333,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
}
#endif
/*
* delta *= w / rw
*/
/*
 * Scale @delta once per entity yielded by for_each_sched_entity(), each
 * time by calc_delta_mine(delta, entity weight, &its cfs_rq's load).
 */
static inline unsigned long
calc_delta_weight(unsigned long delta, struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct load_weight *queue_load = &cfs_rq_of(se)->load;

		delta = calc_delta_mine(delta, se->load.weight, queue_load);
	}

	return delta;
}
/*
* delta *= rw / w
*/
/*
 * Scale @delta once per entity yielded by for_each_sched_entity(), each
 * time by calc_delta_mine(delta, its cfs_rq's load weight, &entity load).
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	for_each_sched_entity(se) {
		unsigned long queue_weight = cfs_rq_of(se)->load.weight;

		delta = calc_delta_mine(delta, queue_weight, &se->load);
	}

	return delta;
}
/*
* The idea is to set a period in which each task runs once.
*
...
...
@@ -362,47 +390,80 @@ static u64 __sched_period(unsigned long nr_running)
*/
static
u64
sched_slice
(
struct
cfs_rq
*
cfs_rq
,
struct
sched_entity
*
se
)
{
u64
slice
=
__sched_period
(
cfs_rq
->
nr_running
);
for_each_sched_entity
(
se
)
{
cfs_rq
=
cfs_rq_of
(
se
);
slice
*=
se
->
load
.
weight
;
do_div
(
slice
,
cfs_rq
->
load
.
weight
);
}
return
slice
;
return
calc_delta_weight
(
__sched_period
(
cfs_rq
->
nr_running
),
se
);
}
/*
* We calculate the vruntime slice of a to be inserted task
*
* vs = s
/w = p/rw
* vs = s
*rw/w = p
*/
static
u64
sched_vslice_add
(
struct
cfs_rq
*
cfs_rq
,
struct
sched_entity
*
se
)
{
unsigned
long
nr_running
=
cfs_rq
->
nr_running
;
unsigned
long
weight
;
u64
vslice
;
if
(
!
se
->
on_rq
)
nr_running
++
;
vslice
=
__sched_period
(
nr_running
);
return
__sched_period
(
nr_running
);
}
/*
* The goal of calc_delta_asym() is to be asymmetric around NICE_0_LOAD, in
* that it favours >=0 over <0.
*
* -20 |
* |
* 0 --------+-------
* .'
* 19 .'
*
*/
static
unsigned
long
calc_delta_asym
(
unsigned
long
delta
,
struct
sched_entity
*
se
)
{
struct
load_weight
lw
=
{
.
weight
=
NICE_0_LOAD
,
.
inv_weight
=
1UL
<<
(
WMULT_SHIFT
-
NICE_0_SHIFT
)
};
for_each_sched_entity
(
se
)
{
cfs_rq
=
cfs_rq_of
(
se
);
struct
load_weight
*
se_lw
=
&
se
->
load
;
unsigned
long
rw
=
cfs_rq_of
(
se
)
->
load
.
weight
;
#ifdef CONFIG_FAIR_SCHED_GROUP
struct
cfs_rq
*
cfs_rq
=
se
->
my_q
;
struct
task_group
*
tg
=
NULL
if
(
cfs_rq
)
tg
=
cfs_rq
->
tg
;
if
(
tg
&&
tg
->
shares
<
NICE_0_LOAD
)
{
/*
* scale shares to what it would have been had
* tg->weight been NICE_0_LOAD:
*
* weight = 1024 * shares / tg->weight
*/
lw
.
weight
*=
se
->
load
.
weight
;
lw
.
weight
/=
tg
->
shares
;
lw
.
inv_weight
=
0
;
se_lw
=
&
lw
;
rw
+=
lw
.
weight
-
se
->
load
.
weight
;
}
else
#endif
weight
=
cfs_rq
->
load
.
weight
;
if
(
!
se
->
on_rq
)
weight
+=
se
->
load
.
weight
;
if
(
se
->
load
.
weight
<
NICE_0_LOAD
)
{
se_lw
=
&
lw
;
rw
+=
NICE_0_LOAD
-
se
->
load
.
weight
;
}
vslice
*=
NICE_0_LOAD
;
do_div
(
vslice
,
weight
);
delta
=
calc_delta_mine
(
delta
,
rw
,
se_lw
);
}
return
vslice
;
return
delta
;
}
/*
...
...
@@ -419,11 +480,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
curr
->
sum_exec_runtime
+=
delta_exec
;
schedstat_add
(
cfs_rq
,
exec_clock
,
delta_exec
);
delta_exec_weighted
=
delta_exec
;
if
(
unlikely
(
curr
->
load
.
weight
!=
NICE_0_LOAD
))
{
delta_exec_weighted
=
calc_delta_fair
(
delta_exec_weighted
,
&
curr
->
load
);
}
delta_exec_weighted
=
calc_delta_fair
(
delta_exec
,
curr
);
curr
->
vruntime
+=
delta_exec_weighted
;
}
...
...
@@ -510,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
/*
 * Add @weight to @cfs_rq's task_weight.
 * account_entity_dequeue() passes -se->load.weight here, so removal
 * relies on unsigned wrap-around of the addition.
 */
static
void
add_cfs_task_weight
(
struct
cfs_rq
*
cfs_rq
,
unsigned
long
weight
)
{
cfs_rq
->
task_weight
+=
weight
;
}
#else
/* No-op variant used unless both CONFIG_SMP and CONFIG_FAIR_GROUP_SCHED are defined. */
static
inline
void
add_cfs_task_weight
(
struct
cfs_rq
*
cfs_rq
,
unsigned
long
weight
)
{
}
#endif
static
void
account_entity_enqueue
(
struct
cfs_rq
*
cfs_rq
,
struct
sched_entity
*
se
)
{
update_load_add
(
&
cfs_rq
->
load
,
se
->
load
.
weight
);
if
(
!
parent_entity
(
se
))
inc_cpu_load
(
rq_of
(
cfs_rq
),
se
->
load
.
weight
);
if
(
entity_is_task
(
se
))
add_cfs_task_weight
(
cfs_rq
,
se
->
load
.
weight
);
cfs_rq
->
nr_running
++
;
se
->
on_rq
=
1
;
list_add
(
&
se
->
group_node
,
&
cfs_rq
->
tasks
);
...
...
@@ -523,6 +597,10 @@ static void
account_entity_dequeue
(
struct
cfs_rq
*
cfs_rq
,
struct
sched_entity
*
se
)
{
update_load_sub
(
&
cfs_rq
->
load
,
se
->
load
.
weight
);
if
(
!
parent_entity
(
se
))
dec_cpu_load
(
rq_of
(
cfs_rq
),
se
->
load
.
weight
);
if
(
entity_is_task
(
se
))
add_cfs_task_weight
(
cfs_rq
,
-
se
->
load
.
weight
);
cfs_rq
->
nr_running
--
;
se
->
on_rq
=
0
;
list_del_init
(
&
se
->
group_node
);
...
...
@@ -609,8 +687,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
if
(
!
initial
)
{
/* sleeps up to a single latency don't count. */
if
(
sched_feat
(
NEW_FAIR_SLEEPERS
))
vruntime
-=
sysctl_sched_latency
;
if
(
sched_feat
(
NEW_FAIR_SLEEPERS
))
{
unsigned
long
thresh
=
sysctl_sched_latency
;
/*
* convert the sleeper threshold into virtual time
*/
if
(
sched_feat
(
NORMALIZED_SLEEPER
))
thresh
=
calc_delta_fair
(
thresh
,
se
);
vruntime
-=
thresh
;
}
/* ensure we never gain time by being placed backwards. */
vruntime
=
max_vruntime
(
se
->
vruntime
,
vruntime
);
...
...
@@ -639,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
__enqueue_entity
(
cfs_rq
,
se
);
}
/*
 * Fold @sample into the running average stored at @avg.
 *
 * The average moves one eighth of the (signed) distance towards the
 * new sample on every call.
 */
static void update_avg(u64 *avg, u64 sample)
{
	s64 delta = sample - *avg;

	*avg = *avg + (delta >> 3);
}
/*
 * Update @se's avg_overlap with the runtime accumulated since its
 * last_wakeup stamp, then clear the stamp.
 *
 * Nothing is done when no stamp is set (last_wakeup == 0).
 * @cfs_rq is not used.
 */
static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (se->last_wakeup) {
		update_avg(&se->avg_overlap,
			   se->sum_exec_runtime - se->last_wakeup);
		se->last_wakeup = 0;
	}
}
static
void
dequeue_entity
(
struct
cfs_rq
*
cfs_rq
,
struct
sched_entity
*
se
,
int
sleep
)
{
...
...
@@ -664,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
update_stats_dequeue
(
cfs_rq
,
se
);
if
(
sleep
)
{
update_avg_stats
(
cfs_rq
,
se
);
#ifdef CONFIG_SCHEDSTATS
if
(
entity_is_task
(
se
))
{
struct
task_struct
*
tsk
=
task_of
(
se
);
...
...
@@ -726,17 +797,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se
->
prev_sum_exec_runtime
=
se
->
sum_exec_runtime
;
}
static
int
wakeup_preempt_entity
(
struct
sched_entity
*
curr
,
struct
sched_entity
*
se
);
static
struct
sched_entity
*
pick_next
(
struct
cfs_rq
*
cfs_rq
,
struct
sched_entity
*
se
)
{
if
(
!
cfs_rq
->
next
)
return
se
;
struct
rq
*
rq
=
rq_of
(
cfs_rq
);
u64
pair_slice
=
rq
->
clock
-
cfs_rq
->
pair_start
;
if
(
wakeup_preempt_entity
(
cfs_rq
->
next
,
se
)
!=
0
)
if
(
!
cfs_rq
->
next
||
pair_slice
>
sched_slice
(
cfs_rq
,
cfs_rq
->
next
))
{
cfs_rq
->
pair_start
=
rq
->
clock
;
return
se
;
}
return
cfs_rq
->
next
;
}
...
...
@@ -835,7 +905,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
hrtick_start
(
rq
,
delta
,
requeue
);
}
}
#else
#else
/* !CONFIG_SCHED_HRTICK */
static
inline
void
hrtick_start_fair
(
struct
rq
*
rq
,
struct
task_struct
*
p
)
{
...
...
@@ -976,7 +1046,7 @@ static int wake_idle(int cpu, struct task_struct *p)
}
return
cpu
;
}
#else
#else
/* !ARCH_HAS_SCHED_WAKE_IDLE*/
static
inline
int
wake_idle
(
int
cpu
,
struct
task_struct
*
p
)
{
return
cpu
;
...
...
@@ -987,6 +1057,89 @@ static inline int wake_idle(int cpu, struct task_struct *p)
static
const
struct
sched_class
fair_sched_class
;
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * effective_load() calculates the load change as seen from the
 * root_task_group.
 *
 * Adding load to a group doesn't make a group heavier, but can cause
 * movement of group shares between cpus. Assuming the shares were perfectly
 * aligned one can calculate the shift in shares.
 *
 * Perfectly aligning the shares is rather expensive, hence we try to avoid
 * doing that too often - see update_shares(), which ratelimits this change.
 *
 * We compensate for this by not only taking the current delta into account,
 * but also the delta between when the shares were last adjusted and now.
 *
 * We still saw a performance dip; tracing showed that between cgroup:/ and
 * cgroup:/foo balancing the number of affine wakeups increased
 * significantly. Therefore try to bias the error in the direction of
 * failing the affine wakeup.
 *
 * @tg:  task group the load is added to
 * @cpu: cpu whose group entity is the starting point
 * @wl:  change in load on @cpu
 * @wg:  change in the group's total weight
 *
 * Returns the resulting load change after walking up the entity hierarchy.
 */
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
	struct sched_entity *se = tg->se[cpu];
	long stale;

	/* A group without a parent sees the change unmodified. */
	if (!tg->parent)
		return wl;

	/*
	 * By not taking the decrease of shares on the other cpu into
	 * account our error leans towards reducing the affine wakeups.
	 */
	if (!wl && sched_feat(ASYM_EFF_LOAD))
		return wl;

	/*
	 * Instead of using only this increment, also add the difference
	 * between when the shares were last updated and now.
	 */
	stale = se->my_q->load.weight - se->my_q->rq_weight;
	wl += stale;
	wg += stale;

	for_each_sched_entity(se) {
		long S = se->my_q->tg->shares;
		long s = se->my_q->shares;
		long rw = se->my_q->rq_weight;
		long a = S * (rw + wl);
		long b = S * rw + s * wg;
		/* Never divide by zero: fall back to 1 when b is 0. */
		long div = likely(b) ? b : 1;

		wl = s * (a - b) / div;

		/*
		 * Assume the group is already running and will
		 * thus already be accounted for in the weight.
		 *
		 * That is, moving shares between CPUs does not
		 * alter the group weight.
		 */
		wg = 0;
	}

	return wl;
}
#else
/* Without group scheduling the load change is passed through as is. */
static inline unsigned long
effective_load(struct task_group *tg, int cpu,
	       unsigned long wl, unsigned long wg)
{
	return wl;
}
#endif
static
int
wake_affine
(
struct
rq
*
rq
,
struct
sched_domain
*
this_sd
,
struct
rq
*
this_rq
,
struct
task_struct
*
p
,
int
prev_cpu
,
int
this_cpu
,
int
sync
,
...
...
@@ -994,8 +1147,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
unsigned
int
imbalance
)
{
struct
task_struct
*
curr
=
this_rq
->
curr
;
struct
task_group
*
tg
;
unsigned
long
tl
=
this_load
;
unsigned
long
tl_per_task
;
unsigned
long
weight
;
int
balanced
;
if
(
!
(
this_sd
->
flags
&
SD_WAKE_AFFINE
)
||
!
sched_feat
(
AFFINE_WAKEUPS
))
...
...
@@ -1006,19 +1161,28 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
* effect of the currently running task from the load
* of the current CPU:
*/
if
(
sync
)
tl
-=
current
->
se
.
load
.
weight
;
if
(
sync
)
{
tg
=
task_group
(
current
);
weight
=
current
->
se
.
load
.
weight
;
tl
+=
effective_load
(
tg
,
this_cpu
,
-
weight
,
-
weight
);
load
+=
effective_load
(
tg
,
prev_cpu
,
0
,
-
weight
);
}
balanced
=
100
*
(
tl
+
p
->
se
.
load
.
weight
)
<=
imbalance
*
load
;
tg
=
task_group
(
p
);
weight
=
p
->
se
.
load
.
weight
;
balanced
=
100
*
(
tl
+
effective_load
(
tg
,
this_cpu
,
weight
,
weight
))
<=
imbalance
*
(
load
+
effective_load
(
tg
,
prev_cpu
,
0
,
weight
));
/*
* If the currently running task will sleep within
* a reasonable amount of time then attract this newly
* woken task:
*/
if
(
sync
&&
balanced
&&
curr
->
sched_class
==
&
fair_sched_class
)
{
if
(
sync
&&
balanced
)
{
if
(
curr
->
se
.
avg_overlap
<
sysctl_sched_migration_cost
&&
p
->
se
.
avg_overlap
<
sysctl_sched_migration_cost
)
p
->
se
.
avg_overlap
<
sysctl_sched_migration_cost
)
return
1
;
}
...
...
@@ -1111,11 +1275,13 @@ static unsigned long wakeup_gran(struct sched_entity *se)
unsigned
long
gran
=
sysctl_sched_wakeup_granularity
;
/*
* More easily preempt - nice tasks, while not making
*
it harder for
+ nice tasks.
* More easily preempt - nice tasks, while not making
it harder for
* + nice tasks.
*/
if
(
unlikely
(
se
->
load
.
weight
>
NICE_0_LOAD
))
gran
=
calc_delta_fair
(
gran
,
&
se
->
load
);
if
(
sched_feat
(
ASYM_GRAN
))
gran
=
calc_delta_asym
(
sysctl_sched_wakeup_granularity
,
se
);
else
gran
=
calc_delta_fair
(
sysctl_sched_wakeup_granularity
,
se
);
return
gran
;
}
...
...
@@ -1177,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
return
;
}
se
->
last_wakeup
=
se
->
sum_exec_runtime
;
if
(
unlikely
(
se
==
pse
))
return
;
...
...
@@ -1275,23 +1440,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
struct
task_struct
*
p
=
NULL
;
struct
sched_entity
*
se
;
if
(
next
==
&
cfs_rq
->
tasks
)
return
NULL
;
/* Skip over entities that are not tasks */
do
{
while
(
next
!=
&
cfs_rq
->
tasks
)
{
se
=
list_entry
(
next
,
struct
sched_entity
,
group_node
);
next
=
next
->
next
;
}
while
(
next
!=
&
cfs_rq
->
tasks
&&
!
entity_is_task
(
se
));
if
(
next
==
&
cfs_rq
->
tasks
)
return
NULL
;
/* Skip over entities that are not tasks */
if
(
entity_is_task
(
se
))
{
p
=
task_of
(
se
);
break
;
}
}
cfs_rq
->
balance_iterator
=
next
;
if
(
entity_is_task
(
se
))
p
=
task_of
(
se
);
return
p
;
}
...
...
@@ -1309,75 +1469,82 @@ static struct task_struct *load_balance_next_fair(void *arg)
return
__load_balance_iterator
(
cfs_rq
,
cfs_rq
->
balance_iterator
);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static
int
cfs_rq_best_prio
(
struct
cfs_rq
*
cfs_rq
)
static
unsigned
long
__load_balance_fair
(
struct
rq
*
this_rq
,
int
this_cpu
,
struct
rq
*
busiest
,
unsigned
long
max_load_move
,
struct
sched_domain
*
sd
,
enum
cpu_idle_type
idle
,
int
*
all_pinned
,
int
*
this_best_prio
,
struct
cfs_rq
*
cfs_rq
)
{
struct
sched_entity
*
curr
;
struct
task_struct
*
p
;
if
(
!
cfs_rq
->
nr_running
||
!
first_fair
(
cfs_rq
))
return
MAX_PRIO
;
curr
=
cfs_rq
->
curr
;
if
(
!
curr
)
curr
=
__pick_next_entity
(
cfs_rq
);
struct
rq_iterator
cfs_rq_iterator
;
p
=
task_of
(
curr
);
cfs_rq_iterator
.
start
=
load_balance_start_fair
;
cfs_rq_iterator
.
next
=
load_balance_next_fair
;
cfs_rq_iterator
.
arg
=
cfs_rq
;
return
p
->
prio
;
return
balance_tasks
(
this_rq
,
this_cpu
,
busiest
,
max_load_move
,
sd
,
idle
,
all_pinned
,
this_best_prio
,
&
cfs_rq_iterator
);
}
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
static
unsigned
long
load_balance_fair
(
struct
rq
*
this_rq
,
int
this_cpu
,
struct
rq
*
busiest
,
unsigned
long
max_load_move
,
struct
sched_domain
*
sd
,
enum
cpu_idle_type
idle
,
int
*
all_pinned
,
int
*
this_best_prio
)
{
struct
cfs_rq
*
busy_cfs_rq
;
long
rem_load_move
=
max_load_move
;
struct
rq_iterator
cfs_rq_iterator
;
cfs_rq_iterator
.
start
=
load_balance_start_fair
;
cfs_rq_iterator
.
next
=
load_balance_next_fair
;
int
busiest_cpu
=
cpu_of
(
busiest
);
struct
task_group
*
tg
;
for_each_leaf_cfs_rq
(
busiest
,
busy_cfs_rq
)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
struct
cfs_rq
*
this_cfs_rq
;
long
imbalance
;
unsigned
long
maxload
;
rcu_read_lock
();
update_h_load
(
busiest_cpu
);
this_cfs_rq
=
cpu_cfs_rq
(
busy_cfs_rq
,
this_cpu
);
list_for_each_entry
(
tg
,
&
task_groups
,
list
)
{
struct
cfs_rq
*
busiest_cfs_rq
=
tg
->
cfs_rq
[
busiest_cpu
];
unsigned
long
busiest_h_load
=
busiest_cfs_rq
->
h_load
;
unsigned
long
busiest_weight
=
busiest_cfs_rq
->
load
.
weight
;
u64
rem_load
,
moved_load
;
imbalance
=
busy_cfs_rq
->
load
.
weight
-
this_cfs_rq
->
load
.
weight
;
/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
if
(
imbalance
<=
0
)
/*
* empty group
*/
if
(
!
busiest_cfs_rq
->
task_weight
)
continue
;
/* Don't pull more than imbalance/2 */
imbalance
/=
2
;
maxload
=
min
(
rem_load_move
,
imbalance
);
rem_load
=
(
u64
)
rem_load_move
*
busiest_weight
;
rem_load
=
div_u64
(
rem_load
,
busiest_h_load
+
1
);
*
this_best_prio
=
cfs_rq_best_prio
(
this_cfs_rq
);
#else
# define maxload rem_load_move
#endif
/*
* pass busy_cfs_rq argument into
* load_balance_[start|next]_fair iterators
*/
cfs_rq_iterator
.
arg
=
busy_cfs_rq
;
rem_load_move
-=
balance_tasks
(
this_rq
,
this_cpu
,
busiest
,
maxload
,
sd
,
idle
,
all_pinned
,
this_best_prio
,
&
cfs_rq_iterator
);
moved_load
=
__load_balance_fair
(
this_rq
,
this_cpu
,
busiest
,
rem_load
,
sd
,
idle
,
all_pinned
,
this_best_prio
,
tg
->
cfs_rq
[
busiest_cpu
]);
if
(
!
moved_load
)
continue
;
moved_load
*=
busiest_h_load
;
moved_load
=
div_u64
(
moved_load
,
busiest_weight
+
1
);
if
(
rem_load_move
<=
0
)
rem_load_move
-=
moved_load
;
if
(
rem_load_move
<
0
)
break
;
}
rcu_read_unlock
();
return
max_load_move
-
rem_load_move
;
}
#else
/*
 * Non-group variant: there is only the runqueue's own cfs_rq to pull
 * from, so hand it straight to __load_balance_fair() and return its result.
 */
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
		  unsigned long max_load_move,
		  struct sched_domain *sd, enum cpu_idle_type idle,
		  int *all_pinned, int *this_best_prio)
{
	struct cfs_rq *busiest_cfs = &busiest->cfs;

	return __load_balance_fair(this_rq, this_cpu, busiest,
				   max_load_move, sd, idle, all_pinned,
				   this_best_prio, busiest_cfs);
}
#endif
static
int
move_one_task_fair
(
struct
rq
*
this_rq
,
int
this_cpu
,
struct
rq
*
busiest
,
...
...
@@ -1402,7 +1569,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
return
0
;
}
#endif
#endif
/* CONFIG_SMP */
/*
* scheduler tick hitting a task of our scheduling class:
...
...
kernel/sched_features.h
浏览文件 @
d14c8a68
SCHED_FEAT
(
NEW_FAIR_SLEEPERS
,
1
)
SCHED_FEAT
(
NORMALIZED_SLEEPER
,
1
)
SCHED_FEAT
(
WAKEUP_PREEMPT
,
1
)
SCHED_FEAT
(
START_DEBIT
,
1
)
SCHED_FEAT
(
AFFINE_WAKEUPS
,
1
)
...
...
@@ -6,5 +7,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
SCHED_FEAT
(
SYNC_WAKEUPS
,
1
)
SCHED_FEAT
(
HRTICK
,
1
)
SCHED_FEAT
(
DOUBLE_TICK
,
0
)
SCHED_FEAT
(
NORMALIZED_SLEEPER
,
1
)
SCHED_FEAT
(
DEADLINE
,
1
)
SCHED_FEAT
(
ASYM_GRAN
,
1
)
SCHED_FEAT
(
LB_BIAS
,
0
)
SCHED_FEAT
(
LB_WAKEUP_UPDATE
,
1
)
SCHED_FEAT
(
ASYM_EFF_LOAD
,
1
)
kernel/sched_rt.c
浏览文件 @
d14c8a68
...
...
@@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq)
static
inline
void
rt_set_overload
(
struct
rq
*
rq
)
{
if
(
!
rq
->
online
)
return
;
cpu_set
(
rq
->
cpu
,
rq
->
rd
->
rto_mask
);
/*
* Make sure the mask is visible before we set
...
...
@@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq)
static
inline
void
rt_clear_overload
(
struct
rq
*
rq
)
{
if
(
!
rq
->
online
)
return
;
/* the order here really doesn't matter */
atomic_dec
(
&
rq
->
rd
->
rto_count
);
cpu_clear
(
rq
->
cpu
,
rq
->
rd
->
rto_mask
);
...
...
@@ -155,7 +161,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
return
&
rt_rq
->
tg
->
rt_bandwidth
;
}
#else
#else
/* !CONFIG_RT_GROUP_SCHED */
static
inline
u64
sched_rt_runtime
(
struct
rt_rq
*
rt_rq
)
{
...
...
@@ -220,49 +226,10 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
return
&
def_rt_bandwidth
;
}
#endif
/*
 * Period timer work for @rt_b: for every cpu in the period mask, take
 * @overrun periods worth of runtime off the rt_rq's accumulated rt_time
 * and unthrottle the rt_rq once it is back under its runtime.
 *
 * Returns 1 when the bandwidth is unlimited (RUNTIME_INF) or when no
 * visited rt_rq has rt_time left or tasks running; 0 otherwise.
 */
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	cpumask_t span;
	int idle = 1;
	int i;

	if (rt_b->rt_runtime == RUNTIME_INF)
		return 1;

	span = sched_rt_period_mask();
	for_each_cpu_mask(i, span) {
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);
		int enqueue = 0;

		spin_lock(&rq->lock);
		if (!rt_rq->rt_time) {
			/* Nothing consumed; still busy if tasks are queued. */
			if (rt_rq->rt_nr_running)
				idle = 0;
		} else {
			u64 runtime;

			spin_lock(&rt_rq->rt_runtime_lock);
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			spin_unlock(&rt_rq->rt_runtime_lock);
		}

		if (enqueue)
			sched_rt_rq_enqueue(rt_rq);
		spin_unlock(&rq->lock);
	}

	return idle;
}
#endif
/* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SMP
static
int
balance_runtime
(
struct
rt_rq
*
rt_rq
)
static
int
do_
balance_runtime
(
struct
rt_rq
*
rt_rq
)
{
struct
rt_bandwidth
*
rt_b
=
sched_rt_bandwidth
(
rt_rq
);
struct
root_domain
*
rd
=
cpu_rq
(
smp_processor_id
())
->
rd
;
...
...
@@ -281,6 +248,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
continue
;
spin_lock
(
&
iter
->
rt_runtime_lock
);
if
(
iter
->
rt_runtime
==
RUNTIME_INF
)
goto
next
;
diff
=
iter
->
rt_runtime
-
iter
->
rt_time
;
if
(
diff
>
0
)
{
do_div
(
diff
,
weight
);
...
...
@@ -294,13 +264,163 @@ static int balance_runtime(struct rt_rq *rt_rq)
break
;
}
}
next:
spin_unlock
(
&
iter
->
rt_runtime_lock
);
}
spin_unlock
(
&
rt_b
->
rt_runtime_lock
);
return
more
;
}
#endif
/*
 * Take @rq out of RT runtime sharing.
 *
 * For every leaf rt_rq of @rq, reclaim the runtime it lent to (or hand back
 * the runtime it borrowed from) the other runqueues in the root domain, so
 * that the books balance, and then mark the rt_rq as disabled by setting
 * its rt_runtime to RUNTIME_INF.
 *
 * Does nothing before the scheduler is running. The visible callers invoke
 * this with rq->lock held.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_leaf_rt_rq(rt_rq, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		spin_lock(&rt_b->rt_runtime_lock);
		spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
		    rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Difference between what we started out with and what we
		 * currently have: positive means we lent runtime and must
		 * reclaim it, negative means we borrowed and must return it.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		for_each_cpu_mask(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or from disabled
			 * runqueues: a disabled one holds RUNTIME_INF, which
			 * min_t(s64, ...) below would read as -1 and thereby
			 * corrupt its rt_runtime.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				/* want <= 0: give the surplus to this rt_rq. */
				iter->rt_runtime -= want;
				want -= want;
			}
			spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		BUG_ON(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		spin_unlock(&rt_rq->rt_runtime_lock);
		spin_unlock(&rt_b->rt_runtime_lock);
	}
}
/*
 * Locked wrapper around __disable_runtime(): takes rq->lock with
 * interrupts disabled for the duration of the call.
 */
static void disable_runtime(struct rq *rq)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&rq->lock, irq_flags);
	__disable_runtime(rq);
	spin_unlock_irqrestore(&rq->lock, irq_flags);
}
/*
 * Reset the RT bandwidth state of every leaf rt_rq of @rq: the consumed
 * time is cleared and the runtime is set back to the value configured in
 * the rt_rq's rt_bandwidth.
 *
 * Does nothing before the scheduler is running.
 */
static void __enable_runtime(struct rq *rq)
{
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_leaf_rt_rq(rt_rq, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		spin_lock(&rt_b->rt_runtime_lock);
		spin_lock(&rt_rq->rt_runtime_lock);

		rt_rq->rt_time = 0;
		rt_rq->rt_runtime = rt_b->rt_runtime;

		spin_unlock(&rt_rq->rt_runtime_lock);
		spin_unlock(&rt_b->rt_runtime_lock);
	}
}
/*
 * Locked wrapper around __enable_runtime(): takes rq->lock with
 * interrupts disabled for the duration of the call.
 */
static void enable_runtime(struct rq *rq)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&rq->lock, irq_flags);
	__enable_runtime(rq);
	spin_unlock_irqrestore(&rq->lock, irq_flags);
}
/*
 * Try to obtain more runtime for @rt_rq when it has used more time than
 * it currently owns.
 *
 * Entered with rt_rq->rt_runtime_lock held; the lock is dropped around
 * do_balance_runtime() and re-taken before returning.
 *
 * Returns the result of do_balance_runtime(), or 0 when no balancing was
 * attempted.
 */
static int balance_runtime(struct rt_rq *rt_rq)
{
	int more;

	if (rt_rq->rt_time <= rt_rq->rt_runtime)
		return 0;

	spin_unlock(&rt_rq->rt_runtime_lock);
	more = do_balance_runtime(rt_rq);
	spin_lock(&rt_rq->rt_runtime_lock);

	return more;
}
#else
/* !CONFIG_SMP */
/* Uniprocessor stub: no runtime balancing is attempted, so report none. */
static inline int balance_runtime(struct rt_rq *rt_rq)
{
	return 0;
}
#endif
/* CONFIG_SMP */
/*
 * Period timer work for @rt_b: for every cpu in the period mask, take
 * @overrun periods worth of runtime off the rt_rq's accumulated rt_time
 * and unthrottle the rt_rq once it is back under its runtime. A throttled
 * rt_rq first gets a chance to pick up more runtime via balance_runtime().
 *
 * Returns 1 when the bandwidth is unlimited (RUNTIME_INF) or when no
 * visited rt_rq has rt_time left or tasks running; 0 otherwise.
 */
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	cpumask_t span;
	int idle = 1;
	int i;

	if (rt_b->rt_runtime == RUNTIME_INF)
		return 1;

	span = sched_rt_period_mask();
	for_each_cpu_mask(i, span) {
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);
		int enqueue = 0;

		spin_lock(&rq->lock);
		if (!rt_rq->rt_time) {
			/* Nothing consumed; still busy if tasks are queued. */
			if (rt_rq->rt_nr_running)
				idle = 0;
		} else {
			u64 runtime;

			spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled)
				balance_runtime(rt_rq);
			/* Read after balancing: rt_runtime may have changed. */
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			spin_unlock(&rt_rq->rt_runtime_lock);
		}

		if (enqueue)
			sched_rt_rq_enqueue(rt_rq);
		spin_unlock(&rq->lock);
	}

	return idle;
}
static
inline
int
rt_se_prio
(
struct
sched_rt_entity
*
rt_se
)
{
...
...
@@ -327,18 +447,10 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
if
(
sched_rt_runtime
(
rt_rq
)
>=
sched_rt_period
(
rt_rq
))
return
0
;
#ifdef CONFIG_SMP
if
(
rt_rq
->
rt_time
>
runtime
)
{
int
more
;
spin_unlock
(
&
rt_rq
->
rt_runtime_lock
);
more
=
balance_runtime
(
rt_rq
);
spin_lock
(
&
rt_rq
->
rt_runtime_lock
);
if
(
more
)
runtime
=
sched_rt_runtime
(
rt_rq
);
}
#endif
balance_runtime
(
rt_rq
);
runtime
=
sched_rt_runtime
(
rt_rq
);
if
(
runtime
==
RUNTIME_INF
)
return
0
;
if
(
rt_rq
->
rt_time
>
runtime
)
{
rt_rq
->
rt_throttled
=
1
;
...
...
@@ -392,12 +504,21 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON
(
!
rt_prio
(
rt_se_prio
(
rt_se
)));
rt_rq
->
rt_nr_running
++
;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
if
(
rt_se_prio
(
rt_se
)
<
rt_rq
->
highest_prio
)
if
(
rt_se_prio
(
rt_se
)
<
rt_rq
->
highest_prio
)
{
struct
rq
*
rq
=
rq_of_rt_rq
(
rt_rq
);
rt_rq
->
highest_prio
=
rt_se_prio
(
rt_se
);
#ifdef CONFIG_SMP
if
(
rq
->
online
)
cpupri_set
(
&
rq
->
rd
->
cpupri
,
rq
->
cpu
,
rt_se_prio
(
rt_se
));
#endif
}
#endif
#ifdef CONFIG_SMP
if
(
rt_se
->
nr_cpus_allowed
>
1
)
{
struct
rq
*
rq
=
rq_of_rt_rq
(
rt_rq
);
rq
->
rt
.
rt_nr_migratory
++
;
}
...
...
@@ -417,6 +538,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static
inline
void
dec_rt_tasks
(
struct
sched_rt_entity
*
rt_se
,
struct
rt_rq
*
rt_rq
)
{
#ifdef CONFIG_SMP
int
highest_prio
=
rt_rq
->
highest_prio
;
#endif
WARN_ON
(
!
rt_prio
(
rt_se_prio
(
rt_se
)));
WARN_ON
(
!
rt_rq
->
rt_nr_running
);
rt_rq
->
rt_nr_running
--
;
...
...
@@ -440,6 +565,14 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rq
->
rt
.
rt_nr_migratory
--
;
}
if
(
rt_rq
->
highest_prio
!=
highest_prio
)
{
struct
rq
*
rq
=
rq_of_rt_rq
(
rt_rq
);
if
(
rq
->
online
)
cpupri_set
(
&
rq
->
rd
->
cpupri
,
rq
->
cpu
,
rt_rq
->
highest_prio
);
}
update_rt_migration
(
rq_of_rt_rq
(
rt_rq
));
#endif
/* CONFIG_SMP */
#ifdef CONFIG_RT_GROUP_SCHED
...
...
@@ -455,6 +588,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
struct
rt_rq
*
rt_rq
=
rt_rq_of_se
(
rt_se
);
struct
rt_prio_array
*
array
=
&
rt_rq
->
active
;
struct
rt_rq
*
group_rq
=
group_rt_rq
(
rt_se
);
struct
list_head
*
queue
=
array
->
queue
+
rt_se_prio
(
rt_se
);
/*
* Don't enqueue the group if it's throttled, or when empty.
...
...
@@ -465,7 +599,11 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
if
(
group_rq
&&
(
rt_rq_throttled
(
group_rq
)
||
!
group_rq
->
rt_nr_running
))
return
;
list_add_tail
(
&
rt_se
->
run_list
,
array
->
queue
+
rt_se_prio
(
rt_se
));
if
(
rt_se
->
nr_cpus_allowed
==
1
)
list_add
(
&
rt_se
->
run_list
,
queue
);
else
list_add_tail
(
&
rt_se
->
run_list
,
queue
);
__set_bit
(
rt_se_prio
(
rt_se
),
array
->
bitmap
);
inc_rt_tasks
(
rt_se
,
rt_rq
);
...
...
@@ -532,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
rt_se
->
timeout
=
0
;
enqueue_rt_entity
(
rt_se
);
inc_cpu_load
(
rq
,
p
->
se
.
load
.
weight
);
}
static
void
dequeue_task_rt
(
struct
rq
*
rq
,
struct
task_struct
*
p
,
int
sleep
)
...
...
@@ -540,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
update_curr_rt
(
rq
);
dequeue_rt_entity
(
rt_se
);
dec_cpu_load
(
rq
,
p
->
se
.
load
.
weight
);
}
/*
...
...
@@ -550,10 +692,12 @@ static
void
requeue_rt_entity
(
struct
rt_rq
*
rt_rq
,
struct
sched_rt_entity
*
rt_se
)
{
struct
rt_prio_array
*
array
=
&
rt_rq
->
active
;
struct
list_head
*
queue
=
array
->
queue
+
rt_se_prio
(
rt_se
);
if
(
on_rt_rq
(
rt_se
))
list_move_tail
(
&
rt_se
->
run_list
,
queue
);
if
(
on_rt_rq
(
rt_se
))
{
list_del_init
(
&
rt_se
->
run_list
);
list_add_tail
(
&
rt_se
->
run_list
,
array
->
queue
+
rt_se_prio
(
rt_se
));
}
}
static
void
requeue_task_rt
(
struct
rq
*
rq
,
struct
task_struct
*
p
)
...
...
@@ -616,8 +760,37 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
*/
static
void
check_preempt_curr_rt
(
struct
rq
*
rq
,
struct
task_struct
*
p
)
{
if
(
p
->
prio
<
rq
->
curr
->
prio
)
if
(
p
->
prio
<
rq
->
curr
->
prio
)
{
resched_task
(
rq
->
curr
);
return
;
}
#ifdef CONFIG_SMP
/*
* If:
*
* - the newly woken task is of equal priority to the current task
* - the newly woken task is non-migratable while current is migratable
* - current will be preempted on the next reschedule
*
* we should check to see if current can readily move to a different
* cpu. If so, we will reschedule to allow the push logic to try
* to move current somewhere else, making room for our non-migratable
* task.
*/
if
((
p
->
prio
==
rq
->
curr
->
prio
)
&&
p
->
rt
.
nr_cpus_allowed
==
1
&&
rq
->
curr
->
rt
.
nr_cpus_allowed
!=
1
)
{
cpumask_t
mask
;
if
(
cpupri_find
(
&
rq
->
rd
->
cpupri
,
rq
->
curr
,
&
mask
))
/*
* There appear to be other cpus that can accept
* current, so let's reschedule to try and push it away
*/
resched_task
(
rq
->
curr
);
}
#endif
}
static
struct
sched_rt_entity
*
pick_next_rt_entity
(
struct
rq
*
rq
,
...
...
@@ -720,73 +893,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
static
DEFINE_PER_CPU
(
cpumask_t
,
local_cpu_mask
);
/*
 * Build in @lowest_mask the set of cpus that are the best targets for
 * @task, i.e. those whose RT runqueue has the numerically highest
 * highest_prio value that is still above task->prio.
 *
 * The search starts from the online cpus of the task's root domain that
 * the task is allowed on. As soon as a cpu with no RT load is met
 * (highest_prio >= MAX_RT_PRIO) the search stops and 1 is returned.
 *
 * Returns the number of candidate cpus found (0 when there is none).
 */
static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
{
	int best_prio = -1;
	int best_cpu = -1;
	int nr_found = 0;
	int cpu;

	cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);

	/*
	 * Scan each rq for the lowest prio.
	 */
	for_each_cpu_mask(cpu, *lowest_mask) {
		struct rq *rq = cpu_rq(cpu);

		/* We look for lowest RT prio or non-rt CPU */
		if (rq->rt.highest_prio >= MAX_RT_PRIO) {
			/*
			 * If a low RT queue was already found and now a
			 * non-rt queue shows up, clear the mask and set only
			 * this cpu's bit. Otherwise return the mask as is;
			 * the count of 1 makes the caller use the first bit
			 * found.
			 */
			if (best_cpu != -1) {
				cpus_clear(*lowest_mask);
				cpu_set(rq->cpu, *lowest_mask);
			}
			return 1;
		}

		/* no locking for now */
		if (rq->rt.highest_prio <= task->prio ||
		    rq->rt.highest_prio < best_prio) {
			/* Not a better target than what we have: drop it. */
			cpu_clear(cpu, *lowest_mask);
			continue;
		}

		if (rq->rt.highest_prio > best_prio) {
			/* new low - clear old data */
			best_prio = rq->rt.highest_prio;
			best_cpu = cpu;
			nr_found = 0;
		}
		nr_found++;
	}

	/*
	 * Clear out all the set bits that represent
	 * runqueues that were of higher prio than
	 * the lowest prio found.
	 */
	if (best_cpu > 0) {
		/*
		 * Perhaps we could add another cpumask op to
		 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
		 * Then that could be optimized to use memset and such.
		 */
		for_each_cpu_mask(cpu, *lowest_mask) {
			if (cpu >= best_cpu)
				break;
			cpu_clear(cpu, *lowest_mask);
		}
	}

	return nr_found;
}
static
inline
int
pick_optimal_cpu
(
int
this_cpu
,
cpumask_t
*
mask
)
{
int
first
;
...
...
@@ -808,17 +914,12 @@ static int find_lowest_rq(struct task_struct *task)
cpumask_t
*
lowest_mask
=
&
__get_cpu_var
(
local_cpu_mask
);
int
this_cpu
=
smp_processor_id
();
int
cpu
=
task_cpu
(
task
);
int
count
=
find_lowest_cpus
(
task
,
lowest_mask
);
if
(
!
count
)
return
-
1
;
/* No
targets found
*/
if
(
task
->
rt
.
nr_cpus_allowed
==
1
)
return
-
1
;
/* No
other targets possible
*/
/*
* There is no sense in performing an optimal search if only one
* target is found.
*/
if
(
count
==
1
)
return
first_cpu
(
*
lowest_mask
);
if
(
!
cpupri_find
(
&
task_rq
(
task
)
->
rd
->
cpupri
,
task
,
lowest_mask
))
return
-
1
;
/* No targets found */
/*
* At this point we have built a mask of cpus representing the
...
...
@@ -1163,17 +1264,25 @@ static void set_cpus_allowed_rt(struct task_struct *p,
}
/* Assumes rq->lock is held */
static
void
join_domain
_rt
(
struct
rq
*
rq
)
static
void
rq_online
_rt
(
struct
rq
*
rq
)
{
if
(
rq
->
rt
.
overloaded
)
rt_set_overload
(
rq
);
__enable_runtime
(
rq
);
cpupri_set
(
&
rq
->
rd
->
cpupri
,
rq
->
cpu
,
rq
->
rt
.
highest_prio
);
}
/* Assumes rq->lock is held */
static
void
leave_domain
_rt
(
struct
rq
*
rq
)
static
void
rq_offline
_rt
(
struct
rq
*
rq
)
{
if
(
rq
->
rt
.
overloaded
)
rt_clear_overload
(
rq
);
__disable_runtime
(
rq
);
cpupri_set
(
&
rq
->
rd
->
cpupri
,
rq
->
cpu
,
CPUPRI_INVALID
);
}
/*
...
...
@@ -1336,8 +1445,8 @@ static const struct sched_class rt_sched_class = {
.
load_balance
=
load_balance_rt
,
.
move_one_task
=
move_one_task_rt
,
.
set_cpus_allowed
=
set_cpus_allowed_rt
,
.
join_domain
=
join_domain
_rt
,
.
leave_domain
=
leave_domain
_rt
,
.
rq_online
=
rq_online
_rt
,
.
rq_offline
=
rq_offline
_rt
,
.
pre_schedule
=
pre_schedule_rt
,
.
post_schedule
=
post_schedule_rt
,
.
task_wake_up
=
task_wake_up_rt
,
...
...
@@ -1350,3 +1459,17 @@ static const struct sched_class rt_sched_class = {
.
prio_changed
=
prio_changed_rt
,
.
switched_to
=
switched_to_rt
,
};
#ifdef CONFIG_SCHED_DEBUG
extern
void
print_rt_rq
(
struct
seq_file
*
m
,
int
cpu
,
struct
rt_rq
*
rt_rq
);
/*
 * Print every leaf rt_rq of @cpu's runqueue to @m via print_rt_rq().
 * The walk is done inside an RCU read-side critical section.
 */
static void print_rt_stats(struct seq_file *m, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct rt_rq *rt_rq;

	rcu_read_lock();
	for_each_leaf_rt_rq(rt_rq, rq)
		print_rt_rq(m, cpu, rt_rq);
	rcu_read_unlock();
}
#endif
/* CONFIG_SCHED_DEBUG */
kernel/sched_stats.h
浏览文件 @
d14c8a68
...
...
@@ -118,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
if
(
rq
)
rq
->
rq_sched_info
.
cpu_time
+=
delta
;
}
/*
 * Add @delta to the run_delay accumulated in @rq's sched info.
 * A NULL @rq is tolerated and ignored.
 */
static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{
	if (!rq)
		return;

	rq->rq_sched_info.run_delay += delta;
}
# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
# define schedstat_set(var, val) do { var = (val); } while (0)
...
...
@@ -126,6 +133,9 @@ static inline void
rq_sched_info_arrive
(
struct
rq
*
rq
,
unsigned
long
long
delta
)
{}
/* Empty stub: nothing is accounted in this configuration. */
static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{
}
/* Empty stub: nothing is accounted in this configuration. */
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{
}
# define schedstat_inc(rq, field) do { } while (0)
...
...
@@ -134,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
#endif
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
/* Forget when @t was last queued by clearing its last_queued stamp. */
static inline void sched_info_reset_dequeued(struct task_struct *t)
{
	t->sched_info.last_queued = 0;
}
/*
* Called when a process is dequeued from the active array and given
* the cpu. We should note that with the exception of interactive
...
...
@@ -143,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
* active queue, thus delaying tasks in the expired queue from running;
* see scheduler_tick()).
*
* This function is only called from sched_info_arrive(), rather than
* dequeue_task(). Even though a task may be queued and dequeued multiple
* times as it is shuffled about, we're really interested in knowing how
* long it was from the *first* time it was queued to the time that it
* finally hit a cpu.
* Though we are interested in knowing how long it was from the *first* time a
* task was queued to the time that it finally hit a cpu, we call this routine
* from dequeue_task() to account for possible rq->clock skew across cpus. The
* delta taken on each cpu would annul the skew.
*/
static
inline
void
sched_info_dequeued
(
struct
task_struct
*
t
)
{
t
->
sched_info
.
last_queued
=
0
;
unsigned
long
long
now
=
task_rq
(
t
)
->
clock
,
delta
=
0
;
if
(
unlikely
(
sched_info_on
()))
if
(
t
->
sched_info
.
last_queued
)
delta
=
now
-
t
->
sched_info
.
last_queued
;
sched_info_reset_dequeued
(
t
);
t
->
sched_info
.
run_delay
+=
delta
;
rq_sched_info_dequeued
(
task_rq
(
t
),
delta
);
}
/*
...
...
@@ -165,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t)
if
(
t
->
sched_info
.
last_queued
)
delta
=
now
-
t
->
sched_info
.
last_queued
;
sched_info_dequeued
(
t
);
sched_info_
reset_
dequeued
(
t
);
t
->
sched_info
.
run_delay
+=
delta
;
t
->
sched_info
.
last_arrival
=
now
;
t
->
sched_info
.
pcount
++
;
...
...
@@ -242,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
__sched_info_switch
(
prev
,
next
);
}
#else
#define sched_info_queued(t) do { } while (0)
#define sched_info_switch(t, next) do { } while (0)
#define sched_info_queued(t) do { } while (0)
#define sched_info_reset_dequeued(t) do { } while (0)
#define sched_info_dequeued(t) do { } while (0)
#define sched_info_switch(t, next) do { } while (0)
#endif
/* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
kernel/sysctl.c
浏览文件 @
d14c8a68
...
...
@@ -264,6 +264,14 @@ static struct ctl_table kern_table[] = {
.
extra1
=
&
min_wakeup_granularity_ns
,
.
extra2
=
&
max_wakeup_granularity_ns
,
},
{
.
ctl_name
=
CTL_UNNUMBERED
,
.
procname
=
"sched_shares_ratelimit"
,
.
data
=
&
sysctl_sched_shares_ratelimit
,
.
maxlen
=
sizeof
(
unsigned
int
),
.
mode
=
0644
,
.
proc_handler
=
&
proc_dointvec
,
},
{
.
ctl_name
=
CTL_UNNUMBERED
,
.
procname
=
"sched_child_runs_first"
,
...
...
kernel/time/tick-sched.c
浏览文件 @
d14c8a68
...
...
@@ -276,6 +276,7 @@ void tick_nohz_stop_sched_tick(void)
ts
->
tick_stopped
=
1
;
ts
->
idle_jiffies
=
last_jiffies
;
rcu_enter_nohz
();
sched_clock_tick_stop
(
cpu
);
}
/*
...
...
@@ -375,6 +376,7 @@ void tick_nohz_restart_sched_tick(void)
select_nohz_load_balancer
(
0
);
now
=
ktime_get
();
tick_do_update_jiffies64
(
now
);
sched_clock_tick_start
(
cpu
);
cpu_clear
(
cpu
,
nohz_cpu_mask
);
/*
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录