Commit fbfd4454 authored by zhangsong

sched: Introduce priority load balance for CFS

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5HF3M
CVE: NA

--------------------------------

Add a new sysctl interface (a userspace toggle sketch follows the list below):
`/proc/sys/kernel/sched_prio_load_balance_enabled`

 0: default behavior
 1: enable priority load balance for the QoS scheduler
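The knob can be flipped at runtime once the kernel is built with CONFIG_SCHED_PRIO_LB. A minimal userspace sketch (illustration only, not part of this patch; equivalent to echoing 1 into the proc file):

```c
#include <stdio.h>

int main(void)
{
	/* Write "1" to the sysctl added by this patch; "0" restores the default. */
	FILE *f = fopen("/proc/sys/kernel/sched_prio_load_balance_enabled", "w");

	if (!f) {
		perror("fopen");	/* e.g. kernel built without CONFIG_SCHED_PRIO_LB */
		return 1;
	}
	fputs("1\n", f);
	fclose(f);
	return 0;
}
```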

For task co-location with the QoS scheduler, it is reasonable for CFS load
balancing to prefer migrating online (latency-sensitive) tasks. The CFS load
balance path is therefore changed as follows (a condensed sketch follows the
list):

1) The `cfs_tasks` list holds only online tasks.
2) Add a new `cfs_offline_tasks` list that holds offline tasks.
3) Prefer migrating online tasks from the `cfs_tasks` list to the dst rq.
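A self-contained userspace sketch of that selection order (illustration only, with hypothetical task names and plain arrays standing in for the rq lists; the real logic is in the detach_one_task()/detach_tasks() hunks below):

```c
#include <stdio.h>

struct task { const char *name; int can_migrate; };

/* Scan a list from the tail, mirroring the MRU order of the cfs_tasks lists. */
static const struct task *pick(const struct task *list, int n)
{
	for (int i = n - 1; i >= 0; i--)
		if (list[i].can_migrate)
			return &list[i];
	return NULL;
}

int main(void)
{
	const struct task online[]  = { { "latency-sensitive", 0 }, { "web-worker", 1 } };
	const struct task offline[] = { { "batch-job", 1 } };
	const struct task *p;

	p = pick(online, 2);		/* first pass: online tasks (cfs_tasks) */
	if (!p)
		p = pick(offline, 1);	/* fallback: offline tasks (cfs_offline_tasks) */
	printf("migrate: %s\n", p ? p->name : "none");
	return 0;
}
```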

Signed-off-by: zhangsong <zhangsong34@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
--------------------------------
V2->V3:
- remove skip_migrate_task for load balance
V1->V2:
- remove setting cpu shares for offline cgroup
Parent a0192c4a
@@ -79,6 +79,10 @@ extern unsigned int sysctl_overload_detect_period;
 extern unsigned int sysctl_offline_wait_interval;
 #endif
 
+#ifdef CONFIG_SCHED_PRIO_LB
+extern unsigned int sysctl_sched_prio_load_balance_enabled;
+#endif
+
 #ifdef CONFIG_SCHED_AUTOGROUP
 extern unsigned int sysctl_sched_autogroup_enabled;
 #endif
......
@@ -975,6 +975,15 @@ config QOS_SCHED_SMT_EXPELLER
 	  This feature enable online tasks to expel offline tasks
 	  on the smt sibling cpus, and exclusively occupy CPU resources.
 
+config SCHED_PRIO_LB
+	bool "Priority load balance for CFS"
+	depends on SMP
+	default n
+	help
+	  This feature enable priority load balance
+	  for CFS, which prefer migrating online tasks
+	  and migrating offline tasks secondly.
+
 config FAIR_GROUP_SCHED
 	bool "Group scheduling for SCHED_OTHER"
 	depends on CGROUP_SCHED
......
@@ -7432,6 +7432,9 @@ void __init sched_init(void)
 		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 
 		INIT_LIST_HEAD(&rq->cfs_tasks);
+#ifdef CONFIG_SCHED_PRIO_LB
+		INIT_LIST_HEAD(&rq->cfs_offline_tasks);
+#endif
 
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
......
@@ -131,6 +131,10 @@ unsigned int sysctl_offline_wait_interval = 100; /* in ms */
 static int unthrottle_qos_cfs_rqs(int cpu);
 #endif
 
+#ifdef CONFIG_SCHED_PRIO_LB
+unsigned int sysctl_sched_prio_load_balance_enabled;
+#endif
+
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 static DEFINE_PER_CPU(int, qos_smt_status);
 #endif
@@ -3018,6 +3022,20 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_SCHED_PRIO_LB
+static void
+adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
+		    struct rq *rq,
+		    struct sched_entity *se)
+{
+	if (sysctl_sched_prio_load_balance_enabled &&
+	    task_has_idle_policy(task_of(se)))
+		(*list_op)(&se->group_node, &rq->cfs_offline_tasks);
+	else
+		(*list_op)(&se->group_node, &rq->cfs_tasks);
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -3027,7 +3045,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		struct rq *rq = rq_of(cfs_rq);
 
 		account_numa_enqueue(rq, task_of(se));
+#ifdef CONFIG_SCHED_PRIO_LB
+		adjust_rq_cfs_tasks(list_add, rq, se);
+#else
 		list_add(&se->group_node, &rq->cfs_tasks);
+#endif
 	}
 #endif
 	cfs_rq->nr_running++;
@@ -7736,7 +7758,11 @@ done: __maybe_unused;
 	 * the list, so our cfs_tasks list becomes MRU
 	 * one.
 	 */
+#ifdef CONFIG_SCHED_PRIO_LB
+	adjust_rq_cfs_tasks(list_move, rq, &p->se);
+#else
 	list_move(&p->se.group_node, &rq->cfs_tasks);
+#endif
 #endif
 
 	if (hrtick_enabled(rq))
@@ -8106,6 +8132,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 	     &p->se == cfs_rq_of(&p->se)->last))
 		return 1;
 
+#ifdef CONFIG_SCHED_PRIO_LB
+	/* Preempt sched idle cpu do not consider migration cost */
+	if (sysctl_sched_prio_load_balance_enabled &&
+	    cpus_share_cache(env->src_cpu, env->dst_cpu) &&
+	    sched_idle_cpu(env->dst_cpu))
+		return 0;
+#endif
+
 	if (sysctl_sched_migration_cost == -1)
 		return 1;
 	if (sysctl_sched_migration_cost == 0)
@@ -8311,11 +8345,18 @@ static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu)
 static struct task_struct *detach_one_task(struct lb_env *env)
 {
 	struct task_struct *p;
+	struct list_head *tasks = &env->src_rq->cfs_tasks;
+#ifdef CONFIG_SCHED_PRIO_LB
+	int loop = 0;
+#endif
 
 	lockdep_assert_held(&env->src_rq->lock);
 
+#ifdef CONFIG_SCHED_PRIO_LB
+again:
+#endif
 	list_for_each_entry_reverse(p,
-			&env->src_rq->cfs_tasks, se.group_node) {
+			tasks, se.group_node) {
 		if (!can_migrate_task(p, env))
 			continue;
@@ -8330,6 +8371,15 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 		schedstat_inc(env->sd->lb_gained[env->idle]);
 		return p;
 	}
+#ifdef CONFIG_SCHED_PRIO_LB
+	if (sysctl_sched_prio_load_balance_enabled) {
+		loop++;
+		if (loop == 1) {
+			tasks = &env->src_rq->cfs_offline_tasks;
+			goto again;
+		}
+	}
+#endif
 	return NULL;
 }
@@ -8347,12 +8397,18 @@ static int detach_tasks(struct lb_env *env)
 	unsigned long util, load;
 	struct task_struct *p;
 	int detached = 0;
+#ifdef CONFIG_SCHED_PRIO_LB
+	int loop = 0;
+#endif
 
 	lockdep_assert_held(&env->src_rq->lock);
 
 	if (env->imbalance <= 0)
 		return 0;
 
+#ifdef CONFIG_SCHED_PRIO_LB
+again:
+#endif
 	while (!list_empty(tasks)) {
 		/*
 		 * We don't want to steal all, otherwise we may be treated likewise,
@@ -8454,6 +8510,15 @@ static int detach_tasks(struct lb_env *env)
 		list_move(&p->se.group_node, tasks);
 	}
 
+#ifdef CONFIG_SCHED_PRIO_LB
+	if (sysctl_sched_prio_load_balance_enabled && env->imbalance > 0) {
+		loop++;
+		if (loop == 1) {
+			tasks = &env->src_rq->cfs_offline_tasks;
+			goto again;
+		}
+	}
+#endif
 	/*
 	 * Right now, this is one of only two places we collect this stat
 	 * so we can safely collect detach_one_task() stats here rather
@@ -11780,7 +11845,11 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 		 * Move the next running task to the front of the list, so our
 		 * cfs_tasks list becomes MRU one.
 		 */
+#ifdef CONFIG_SCHED_PRIO_LB
+		adjust_rq_cfs_tasks(list_move, rq, se);
+#else
 		list_move(&se->group_node, &rq->cfs_tasks);
+#endif
 	}
 #endif
......
@@ -1107,8 +1107,12 @@ struct rq {
 	struct cpuidle_state	*idle_state;
 #endif
 
+#if defined(CONFIG_SCHED_PRIO_LB) && !defined(__GENKSYMS__)
+	struct list_head	cfs_offline_tasks;
+#else
 	KABI_RESERVE(1)
 	KABI_RESERVE(2)
+#endif
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
 	KABI_RESERVE(5)
......
@@ -2718,6 +2718,17 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one_hundred,
 		.extra2		= &one_thousand,
 	},
+#endif
+#ifdef CONFIG_SCHED_PRIO_LB
+	{
+		.procname	= "sched_prio_load_balance_enabled",
+		.data		= &sysctl_sched_prio_load_balance_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 #endif
 	{ }
 };
......