Unverified commit c4fb2bc6 authored by openeuler-ci-bot and committed by Gitee

!795 sched/fair: Introduce multiple qos level

Merge Pull Request from: @zhaowenhui8 
 
Expand qos_level from {-1, 0} to [-2, 2] to distinguish tasks that are expected
to run at extremely high or low priority levels. qos_level_weight is used to
reweight the shares when calculating a group's weight. Meanwhile, an offline
task's scheduling policy is set to SCHED_IDLE so that it can be preempted at
check_preempt_wakeup.

kernel option:
CONFIG_QOS_SCHED_MULTILEVEL 
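
For intuition, here is a minimal user-space sketch (illustration only, not part of this patch) mirroring the qos_reweight() helper introduced in kernel/sched/fair.c below. It applies the default sysctl_qos_level_weights table {1, 10, 100, 1000, 10000} to a group's configured shares; the kernel's scale_load()/MIN_SHARES/MAX_SHARES clamping and overflow guard are omitted, and plain longs stand in for scaled load weights.

/*
 * Illustration of the multilevel qos reweighting: a group at the ONLINE
 * level (weight 100) keeps its configured shares unchanged, while the
 * other levels are scaled down or up by factors of 10 relative to it.
 */
#include <stdio.h>

static const long default_weights[5] = { 1, 10, 100, 1000, 10000 };
static const char *level_names[5] = {
	"OFFLINE_EX (-2)", "OFFLINE (-1)", "ONLINE (0)",
	"HIGH (+1)", "HIGH_EX (+2)"
};

/* shares * qos_weight / 100, as in qos_reweight() */
static long qos_reweight_example(long shares, int level_index)
{
	return shares * default_weights[level_index] / 100;
}

int main(void)
{
	long shares = 1024;	/* a group's configured cpu.shares */
	int i;

	for (i = 0; i < 5; i++)
		printf("%-16s effective shares: %ld\n",
		       level_names[i], qos_reweight_example(shares, i));
	return 0;
}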
 
Link: https://gitee.com/openeuler/kernel/pulls/795 

Reviewed-by: Zucheng Zheng <zhengzucheng@huawei.com> 
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com> 
......@@ -140,6 +140,7 @@ CONFIG_CGROUP_WRITEBACK=y
CONFIG_CGROUP_V1_WRITEBACK=y
CONFIG_CGROUP_SCHED=y
CONFIG_QOS_SCHED=y
CONFIG_QOS_SCHED_MULTILEVEL=y
CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y
CONFIG_QOS_SCHED_SMT_EXPELLER=y
CONFIG_FAIR_GROUP_SCHED=y
......
......@@ -158,6 +158,7 @@ CONFIG_CGROUP_WRITEBACK=y
CONFIG_CGROUP_V1_WRITEBACK=y
CONFIG_CGROUP_SCHED=y
CONFIG_QOS_SCHED=y
CONFIG_QOS_SCHED_MULTILEVEL=y
CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y
CONFIG_QOS_SCHED_SMT_EXPELLER=y
CONFIG_FAIR_GROUP_SCHED=y
......
......@@ -83,6 +83,10 @@ extern unsigned int sysctl_overload_detect_period;
extern unsigned int sysctl_offline_wait_interval;
#endif
#ifdef CONFIG_QOS_SCHED_MULTILEVEL
extern unsigned int sysctl_qos_level_weights[];
#endif
#ifdef CONFIG_QOS_SCHED_PRIO_LB
extern unsigned int sysctl_sched_prio_load_balance_enabled;
#endif
......
......@@ -977,6 +977,15 @@ config QOS_SCHED
default n
config QOS_SCHED_MULTILEVEL
bool "Multiple qos level task scheduling"
depends on QOS_SCHED
default n
help
This feature enables multiple qos levels for task scheduling.
It expands qos_level to [-2, 2] to distinguish tasks expected
to run at extremely high or low priority levels.
config QOS_SCHED_SMT_EXPELLER
bool "Qos smt expeller"
depends on SCHED_SMT
......
......@@ -6437,7 +6437,7 @@ static int __sched_setscheduler(struct task_struct *p,
* other than SCHED_IDLE, the online task preemption and cpu resource
* isolation will be invalid, so return -EINVAL in this case.
*/
if (unlikely(task_group(p)->qos_level == -1 && !idle_policy(policy))) {
if (unlikely(is_offline_level(task_group(p)->qos_level) && !idle_policy(policy))) {
retval = -EINVAL;
goto unlock;
}
......@@ -8562,7 +8562,7 @@ static void sched_change_qos_group(struct task_struct *tsk, struct task_group *t
*/
if (!(tsk->flags & PF_EXITING) &&
!task_group_is_autogroup(tg) &&
(tg->qos_level == -1)) {
(is_offline_level(tg->qos_level))) {
attr.sched_priority = 0;
attr.sched_policy = SCHED_IDLE;
__setscheduler_params(tsk, &attr);
......@@ -8590,7 +8590,7 @@ void sched_move_offline_task(struct task_struct *p)
{
struct offline_args *args;
if (unlikely(task_group(p)->qos_level != -1))
if (unlikely(!is_offline_level(task_group(p)->qos_level)))
return;
args = kmalloc(sizeof(struct offline_args), GFP_ATOMIC);
......@@ -9463,7 +9463,7 @@ static int tg_change_scheduler(struct task_group *tg, void *data)
struct cgroup_subsys_state *css = &tg->css;
tg->qos_level = qos_level;
if (qos_level == -1)
if (is_offline_level(qos_level))
policy = SCHED_IDLE;
else
policy = SCHED_NORMAL;
......@@ -9485,19 +9485,27 @@ static int cpu_qos_write(struct cgroup_subsys_state *css,
if (!tg->se[0])
return -EINVAL;
#ifdef CONFIG_QOS_SCHED_MULTILEVEL
if (qos_level > QOS_LEVEL_HIGH_EX || qos_level < QOS_LEVEL_OFFLINE_EX)
#else
if (qos_level != -1 && qos_level != 0)
#endif
return -EINVAL;
if (tg->qos_level == qos_level)
goto done;
#ifdef CONFIG_QOS_SCHED_MULTILEVEL
if (!is_normal_level(tg->qos_level))
#else
if (tg->qos_level == -1 && qos_level == 0)
#endif
return -EINVAL;
cpus_read_lock();
if (qos_level == -1)
if (is_offline_level(qos_level))
cfs_bandwidth_usage_inc();
else
else if (is_offline_level(tg->qos_level) && !is_offline_level(qos_level))
cfs_bandwidth_usage_dec();
cpus_read_unlock();
......
......@@ -140,6 +140,23 @@ static int unthrottle_qos_cfs_rqs(int cpu);
static bool qos_smt_expelled(int this_cpu);
#endif
#ifdef CONFIG_QOS_SCHED_MULTILEVEL
#define QOS_LEVEL_WEIGHT_OFFLINE_EX 1
#define QOS_LEVEL_WEIGHT_OFFLINE 10
#define QOS_LEVEL_WEIGHT_ONLINE 100
#define QOS_LEVEL_WEIGHT_HIGH 1000
#define QOS_LEVEL_WEIGHT_HIGH_EX 10000
unsigned int sysctl_qos_level_weights[5] = {
QOS_LEVEL_WEIGHT_OFFLINE_EX,
QOS_LEVEL_WEIGHT_OFFLINE,
QOS_LEVEL_WEIGHT_ONLINE,
QOS_LEVEL_WEIGHT_HIGH,
QOS_LEVEL_WEIGHT_HIGH_EX,
};
static long qos_reweight(long shares, struct task_group *tg);
#endif
#ifdef CONFIG_QOS_SCHED_PRIO_LB
unsigned int sysctl_sched_prio_load_balance_enabled;
#endif
......@@ -2987,7 +3004,7 @@ adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
{
struct task_group *tg = task_group(task_of(se));
if (sysctl_sched_prio_load_balance_enabled && tg->qos_level == -1)
if (sysctl_sched_prio_load_balance_enabled && is_offline_level(tg->qos_level))
(*list_op)(&se->group_node, &rq->cfs_offline_tasks);
else
(*list_op)(&se->group_node, &rq->cfs_tasks);
......@@ -3217,6 +3234,9 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
struct task_group *tg = cfs_rq->tg;
tg_shares = READ_ONCE(tg->shares);
#ifdef CONFIG_QOS_SCHED_MULTILEVEL
tg_shares = qos_reweight(tg_shares, tg);
#endif
load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
......@@ -3265,6 +3285,9 @@ static void update_cfs_group(struct sched_entity *se)
#ifndef CONFIG_SMP
shares = READ_ONCE(gcfs_rq->tg->shares);
#ifdef CONFIG_QOS_SCHED_MULTILEVEL
shares = qos_reweight(shares, gcfs_rq->tg);
#endif
if (likely(se->load.weight == shares))
return;
......@@ -4494,6 +4517,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_min_vruntime(cfs_rq);
}
/*
* Preempt the current task with a newly woken task if needed:
*/
......@@ -7548,7 +7572,7 @@ static inline void cancel_qos_timer(int cpu)
static inline bool is_offline_task(struct task_struct *p)
{
return task_group(p)->qos_level == QOS_LEVEL_OFFLINE;
return task_group(p)->qos_level < QOS_LEVEL_ONLINE;
}
static void start_qos_hrtimer(int cpu);
......@@ -7739,7 +7763,7 @@ static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq)
return false;
}
if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 &&
if (unlikely(cfs_rq && is_offline_level(cfs_rq->tg->qos_level) &&
!sched_idle_cpu(smp_processor_id()) &&
cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
throttle_qos_cfs_rq(cfs_rq);
......@@ -7755,7 +7779,7 @@ static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq)
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
if (cfs_rq->tg->qos_level == -1 && cfs_rq_throttled(cfs_rq))
if (is_offline_level(cfs_rq->tg->qos_level) && cfs_rq_throttled(cfs_rq))
unthrottle_qos_cfs_rq(cfs_rq);
rq_unlock_irqrestore(rq, &rf);
}
......@@ -7768,7 +7792,7 @@ void sched_qos_offline_wait(void)
rcu_read_lock();
qos_level = task_group(current)->qos_level;
rcu_read_unlock();
if (qos_level != -1 || fatal_signal_pending(current))
if (!is_offline_level(qos_level) || fatal_signal_pending(current))
break;
schedule_timeout_killable(msecs_to_jiffies(sysctl_offline_wait_interval));
......@@ -7835,6 +7859,39 @@ static bool qos_smt_expelled(int this_cpu)
#endif
#endif
#ifdef CONFIG_QOS_SCHED_MULTILEVEL
static long qos_reweight(long shares, struct task_group *tg)
{
long qos_weight = 100;
long div = 100;
long scale_shares;
switch (tg->qos_level) {
case QOS_LEVEL_OFFLINE_EX:
qos_weight = sysctl_qos_level_weights[0];
break;
case QOS_LEVEL_OFFLINE:
qos_weight = sysctl_qos_level_weights[1];
break;
case QOS_LEVEL_ONLINE:
qos_weight = sysctl_qos_level_weights[2];
break;
case QOS_LEVEL_HIGH:
qos_weight = sysctl_qos_level_weights[3];
break;
case QOS_LEVEL_HIGH_EX:
qos_weight = sysctl_qos_level_weights[4];
break;
}
if (qos_weight > LONG_MAX / shares)
scale_shares = LONG_MAX / div;
else
scale_shares = shares * qos_weight / div;
scale_shares = clamp_t(long, scale_shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
return scale_shares;
}
#endif
#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
DEFINE_STATIC_KEY_TRUE(qos_smt_expell_switch);
......@@ -7891,7 +7948,7 @@ static bool qos_smt_update_status(struct task_struct *p)
{
int status = QOS_LEVEL_OFFLINE;
if (p != NULL && task_group(p)->qos_level >= QOS_LEVEL_ONLINE)
if (p != NULL && !is_offline_level(task_group(p)->qos_level))
status = QOS_LEVEL_ONLINE;
if (__this_cpu_read(qos_smt_status) == status)
......@@ -7969,7 +8026,7 @@ static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq)
* and current cpu only has SCHED_IDLE tasks enqueued.
*/
if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE &&
task_group(current)->qos_level < QOS_LEVEL_ONLINE) {
is_offline_level(task_group(current)->qos_level)) {
trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu));
return true;
}
......
......@@ -1182,11 +1182,20 @@ static inline int cpu_of(struct rq *rq)
}
#ifdef CONFIG_QOS_SCHED
#ifdef CONFIG_QOS_SCHED_MULTILEVEL
enum task_qos_level {
QOS_LEVEL_OFFLINE_EX = -2,
QOS_LEVEL_OFFLINE = -1,
QOS_LEVEL_ONLINE = 0,
QOS_LEVEL_MAX
QOS_LEVEL_HIGH = 1,
QOS_LEVEL_HIGH_EX = 2
};
#else
enum task_qos_level {
QOS_LEVEL_OFFLINE = -1,
QOS_LEVEL_ONLINE = 0,
};
#endif
void init_qos_hrtimer(int cpu);
#endif
......@@ -3036,7 +3045,22 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
#ifdef CONFIG_QOS_SCHED
static inline int qos_idle_policy(int policy)
{
return policy == QOS_LEVEL_OFFLINE;
return policy <= QOS_LEVEL_OFFLINE;
}
static inline int is_high_level(long qos_level)
{
return qos_level > QOS_LEVEL_ONLINE;
}
static inline int is_normal_level(long qos_level)
{
return qos_level == QOS_LEVEL_ONLINE;
}
static inline int is_offline_level(long qos_level)
{
return qos_level < QOS_LEVEL_ONLINE;
}
#endif
......
......@@ -2718,6 +2718,15 @@ static struct ctl_table kern_table[] = {
.extra2 = &one_thousand,
},
#endif
#ifdef CONFIG_QOS_SCHED_MULTILEVEL
{
.procname = "qos_level_weights",
.data = &sysctl_qos_level_weights,
.maxlen = 5*sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
#endif
#ifdef CONFIG_QOS_SCHED_PRIO_LB
{
.procname = "sched_prio_load_balance_enabled",
......
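
As a closing illustration (again not part of the patch), the sketch below walks the expanded range [-2, 2] through classification helpers with the same semantics as those added to kernel/sched/sched.h above, and shows the scheduling policy tg_change_scheduler() would pick for each level: both offline levels are forced to SCHED_IDLE, while the two high levels stay SCHED_NORMAL and are differentiated only through the share reweighting.

#include <stdio.h>

/* Same semantics as the helpers added to kernel/sched/sched.h (QOS_LEVEL_ONLINE == 0). */
static int is_offline_level(long qos_level) { return qos_level < 0; }
static int is_normal_level(long qos_level)  { return qos_level == 0; }
static int is_high_level(long qos_level)    { return qos_level > 0; }

int main(void)
{
	long level;

	for (level = -2; level <= 2; level++)
		printf("qos_level %+ld: %s, policy %s\n", level,
		       is_offline_level(level) ? "offline" :
		       is_normal_level(level) ? "online" : "high",
		       is_offline_level(level) ? "SCHED_IDLE" : "SCHED_NORMAL");
	return 0;
}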