提交 fbfd4454 编写于 作者: Z zhangsong

sched: Introduce priority load balance for CFS

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HF3M
CVE: NA

--------------------------------

Add new sysctl interface:
`/proc/sys/kernel/sched_prio_load_balance_enabled`

 0: default behavior
 1: enable priority load balance for qos scheduler

For task co-location with the qos scheduler, when CFS does load balancing,
it is reasonable to prefer migrating online (latency-sensitive) tasks.
So CFS load balancing can be changed as follows:

1) `cfs_tasks` list is owned by online tasks.
2) Add new `cfs_offline_tasks` list which is owned by offline tasks.
3) Prefer to migrate the online tasks of `cfs_tasks` list to dst rq.
Signed-off-by: zhangsong <zhangsong34@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
--------------------------------
V2->V3:
- remove skip_migrate_task for load balance
V1->V2:
- remove setting cpu shares for offline cgroup
上级 a0192c4a
......@@ -79,6 +79,10 @@ extern unsigned int sysctl_overload_detect_period;
extern unsigned int sysctl_offline_wait_interval;
#endif
#ifdef CONFIG_SCHED_PRIO_LB
extern unsigned int sysctl_sched_prio_load_balance_enabled;
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
extern unsigned int sysctl_sched_autogroup_enabled;
#endif
......
......@@ -975,6 +975,15 @@ config QOS_SCHED_SMT_EXPELLER
This feature enable online tasks to expel offline tasks
on the smt sibling cpus, and exclusively occupy CPU resources.
config SCHED_PRIO_LB
bool "Priority load balance for CFS"
depends on SMP
default n
help
This feature enables priority load balancing
for CFS, which prefers migrating online tasks
first and offline tasks second.
config FAIR_GROUP_SCHED
bool "Group scheduling for SCHED_OTHER"
depends on CGROUP_SCHED
......
......@@ -7432,6 +7432,9 @@ void __init sched_init(void)
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks);
#ifdef CONFIG_SCHED_PRIO_LB
INIT_LIST_HEAD(&rq->cfs_offline_tasks);
#endif
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
......
......@@ -131,6 +131,10 @@ unsigned int sysctl_offline_wait_interval = 100; /* in ms */
static int unthrottle_qos_cfs_rqs(int cpu);
#endif
#ifdef CONFIG_SCHED_PRIO_LB
unsigned int sysctl_sched_prio_load_balance_enabled;
#endif
#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
static DEFINE_PER_CPU(int, qos_smt_status);
#endif
......@@ -3018,6 +3022,20 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SCHED_PRIO_LB
/*
 * Apply @list_op to @se's group_node, selecting the destination list on @rq:
 *  - rq->cfs_offline_tasks when sysctl_sched_prio_load_balance_enabled is
 *    non-zero and the task owning @se has the idle policy;
 *  - rq->cfs_tasks otherwise.
 *
 * @list_op: list primitive taking (entry, head); the callers visible in this
 *           change pass list_add and list_move.
 * @rq:      runqueue whose task lists are updated.
 * @se:      scheduling entity of a task (task_of() is applied to it).
 */
static void
adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
		    struct rq *rq,
		    struct sched_entity *se)
{
	/* Default destination; overridden below for idle-policy tasks. */
	struct list_head *target = &rq->cfs_tasks;

	if (sysctl_sched_prio_load_balance_enabled &&
	    task_has_idle_policy(task_of(se)))
		target = &rq->cfs_offline_tasks;

	list_op(&se->group_node, target);
}
#endif
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
......@@ -3027,7 +3045,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
struct rq *rq = rq_of(cfs_rq);
account_numa_enqueue(rq, task_of(se));
#ifdef CONFIG_SCHED_PRIO_LB
adjust_rq_cfs_tasks(list_add, rq, se);
#else
list_add(&se->group_node, &rq->cfs_tasks);
#endif
}
#endif
cfs_rq->nr_running++;
......@@ -7736,7 +7758,11 @@ done: __maybe_unused;
* the list, so our cfs_tasks list becomes MRU
* one.
*/
#ifdef CONFIG_SCHED_PRIO_LB
adjust_rq_cfs_tasks(list_move, rq, &p->se);
#else
list_move(&p->se.group_node, &rq->cfs_tasks);
#endif
#endif
if (hrtick_enabled(rq))
......@@ -8106,6 +8132,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
&p->se == cfs_rq_of(&p->se)->last))
return 1;
#ifdef CONFIG_SCHED_PRIO_LB
/* Do not consider migration cost when preempting a sched idle CPU */
if (sysctl_sched_prio_load_balance_enabled &&
cpus_share_cache(env->src_cpu, env->dst_cpu) &&
sched_idle_cpu(env->dst_cpu))
return 0;
#endif
if (sysctl_sched_migration_cost == -1)
return 1;
if (sysctl_sched_migration_cost == 0)
......@@ -8311,11 +8345,18 @@ static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu)
static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p;
struct list_head *tasks = &env->src_rq->cfs_tasks;
#ifdef CONFIG_SCHED_PRIO_LB
int loop = 0;
#endif
lockdep_assert_held(&env->src_rq->lock);
#ifdef CONFIG_SCHED_PRIO_LB
again:
#endif
list_for_each_entry_reverse(p,
&env->src_rq->cfs_tasks, se.group_node) {
tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;
......@@ -8330,6 +8371,15 @@ static struct task_struct *detach_one_task(struct lb_env *env)
schedstat_inc(env->sd->lb_gained[env->idle]);
return p;
}
#ifdef CONFIG_SCHED_PRIO_LB
if (sysctl_sched_prio_load_balance_enabled) {
loop++;
if (loop == 1) {
tasks = &env->src_rq->cfs_offline_tasks;
goto again;
}
}
#endif
return NULL;
}
......@@ -8347,12 +8397,18 @@ static int detach_tasks(struct lb_env *env)
unsigned long util, load;
struct task_struct *p;
int detached = 0;
#ifdef CONFIG_SCHED_PRIO_LB
int loop = 0;
#endif
lockdep_assert_held(&env->src_rq->lock);
if (env->imbalance <= 0)
return 0;
#ifdef CONFIG_SCHED_PRIO_LB
again:
#endif
while (!list_empty(tasks)) {
/*
* We don't want to steal all, otherwise we may be treated likewise,
......@@ -8454,6 +8510,15 @@ static int detach_tasks(struct lb_env *env)
list_move(&p->se.group_node, tasks);
}
#ifdef CONFIG_SCHED_PRIO_LB
if (sysctl_sched_prio_load_balance_enabled && env->imbalance > 0) {
loop++;
if (loop == 1) {
tasks = &env->src_rq->cfs_offline_tasks;
goto again;
}
}
#endif
/*
* Right now, this is one of only two places we collect this stat
* so we can safely collect detach_one_task() stats here rather
......@@ -11780,7 +11845,11 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
* Move the next running task to the front of the list, so our
* cfs_tasks list becomes MRU one.
*/
#ifdef CONFIG_SCHED_PRIO_LB
adjust_rq_cfs_tasks(list_move, rq, se);
#else
list_move(&se->group_node, &rq->cfs_tasks);
#endif
}
#endif
......
......@@ -1107,8 +1107,12 @@ struct rq {
struct cpuidle_state *idle_state;
#endif
#if defined(CONFIG_SCHED_PRIO_LB) && !defined(__GENKSYMS__)
struct list_head cfs_offline_tasks;
#else
KABI_RESERVE(1)
KABI_RESERVE(2)
#endif
KABI_RESERVE(3)
KABI_RESERVE(4)
KABI_RESERVE(5)
......
......@@ -2718,6 +2718,17 @@ static struct ctl_table kern_table[] = {
.extra1 = &one_hundred,
.extra2 = &one_thousand,
},
#endif
#ifdef CONFIG_SCHED_PRIO_LB
{
.procname = "sched_prio_load_balance_enabled",
.data = &sysctl_sched_prio_load_balance_enabled,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
#endif
{ }
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册