Commit a5d94c89, authored by Zheng Zucheng, committed by Yang Yingliang

sched: Introduce priority inversion handling mechanism

hulk inclusion
category: feature
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA

--------------------------------

When online tasks occupy the CPU for a long time, offline tasks get no chance to
run and a priority inversion issue may be triggered. When this happens, we
unthrottle the offline tasks so they get a chance to run. Once online tasks have
occupied the CPU for more than 5s (the default value), we unthrottle the offline
tasks and enter an msleep loop before exiting to usermode, until the CPU goes idle.
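
The detection period and wait interval are tunable at runtime: the patch registers
qos_overload_detect_period_ms (1000..100000 ms) and qos_offline_wait_interval_ms
(100..1000 ms) in kern_table, so they appear under /proc/sys/kernel/. A minimal
user-space sketch (requires root) for adjusting them could look like the following;
the 8000 ms and 200 ms values are arbitrary examples chosen only to stay inside
those ranges:

/*
 * Minimal sketch of tuning the two QoS knobs from user space.
 * Paths and value ranges come from the kern_table entries added by
 * this patch; the concrete values below are examples only.
 */
#include <stdio.h>

static int write_sysctl(const char *path, unsigned int val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%u\n", val);
	fclose(f);
	return 0;
}

int main(void)
{
	/* detection period: allowed range 1000..100000 ms, default 5000 ms */
	write_sysctl("/proc/sys/kernel/qos_overload_detect_period_ms", 8000);
	/* offline wait interval: allowed range 100..1000 ms, default 100 ms */
	write_sysctl("/proc/sys/kernel/qos_offline_wait_interval_ms", 200);
	return 0;
}
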
Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Parent fca01562
@@ -162,6 +162,10 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
if (cached_flags & _TIF_SIGPENDING)
do_signal(regs);
#ifdef CONFIG_QOS_SCHED
sched_qos_offline_wait();
#endif
if (cached_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
@@ -194,7 +198,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
cached_flags = READ_ONCE(ti->flags);
if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
if (unlikely((cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS) ||
sched_qos_cpu_overload()))
exit_to_usermode_loop(regs, cached_flags);
#ifdef CONFIG_COMPAT
@@ -1953,6 +1953,13 @@ static inline void rseq_syscall(struct pt_regs *regs)
#ifdef CONFIG_QOS_SCHED
void sched_move_offline_task(struct task_struct *p);
void sched_qos_offline_wait(void);
int sched_qos_cpu_overload(void);
#else
static inline int sched_qos_cpu_overload(void)
{
return 0;
}
#endif
#endif
@@ -67,6 +67,11 @@ extern int sysctl_sched_rt_runtime;
extern unsigned int sysctl_sched_cfs_bandwidth_slice;
#endif
#ifdef CONFIG_QOS_SCHED
extern unsigned int sysctl_overload_detect_period;
extern unsigned int sysctl_offline_wait_interval;
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
extern unsigned int sysctl_sched_autogroup_enabled;
#endif
@@ -6115,6 +6115,9 @@ void __init sched_init(void)
* directly in rq->cfs (i.e root_task_group->se[] = NULL).
*/
init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
#ifdef CONFIG_QOS_SCHED
init_qos_hrtimer(i);
#endif
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -24,7 +24,9 @@
#ifdef CONFIG_SCHED_STEAL
#include "sparsemask.h"
#endif
#ifdef CONFIG_QOS_SCHED
#include <linux/delay.h>
#endif
#include <trace/events/sched.h>
/*
@@ -101,7 +103,10 @@ int __weak arch_asym_cpu_priority(int cpu)
#ifdef CONFIG_QOS_SCHED
static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer);
static DEFINE_PER_CPU(int, qos_cpu_overload);
unsigned int sysctl_overload_detect_period = 5000; /* in ms */
unsigned int sysctl_offline_wait_interval = 100; /* in ms */
static int unthrottle_qos_cfs_rqs(int cpu);
#endif
@@ -6879,6 +6884,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
}
#ifdef CONFIG_QOS_SCHED
static void start_qos_hrtimer(int cpu);
static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -6913,6 +6919,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
sub_nr_running(rq, task_delta);
}
if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq))))
start_qos_hrtimer(cpu_of(rq));
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
@@ -6969,7 +6978,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
resched_curr(rq);
}
static int unthrottle_qos_cfs_rqs(int cpu)
static int __unthrottle_qos_cfs_rqs(int cpu)
{
struct cfs_rq *cfs_rq, *tmp_rq;
int res = 0;
@@ -6984,6 +6993,83 @@ static int unthrottle_qos_cfs_rqs(int cpu)
return res;
}
static int unthrottle_qos_cfs_rqs(int cpu)
{
int res;
res = __unthrottle_qos_cfs_rqs(cpu);
if (res)
hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu)));
return res;
}
static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq)
{
if (unlikely(__this_cpu_read(qos_cpu_overload))) {
return false;
}
if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 &&
!sched_idle_cpu(smp_processor_id()) &&
cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
throttle_qos_cfs_rq(cfs_rq);
return true;
}
return false;
}
void sched_qos_offline_wait(void)
{
long qos_level;
while (unlikely(this_cpu_read(qos_cpu_overload))) {
rcu_read_lock();
qos_level = task_group(current)->qos_level;
rcu_read_unlock();
if (qos_level != -1 || signal_pending(current))
break;
msleep_interruptible(sysctl_offline_wait_interval);
}
}
int sched_qos_cpu_overload(void)
{
return __this_cpu_read(qos_cpu_overload);
}
static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer)
{
struct rq_flags rf;
struct rq *rq = this_rq();
rq_lock_irqsave(rq, &rf);
if (__unthrottle_qos_cfs_rqs(smp_processor_id()))
__this_cpu_write(qos_cpu_overload, 1);
rq_unlock_irqrestore(rq, &rf);
return HRTIMER_NORESTART;
}
static void start_qos_hrtimer(int cpu)
{
ktime_t time;
struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu));
time = ktime_add_ms(hrtimer->base->get_time(), (u64)sysctl_overload_detect_period);
hrtimer_set_expires(hrtimer, time);
hrtimer_start_expires(hrtimer, HRTIMER_MODE_ABS_PINNED);
}
void init_qos_hrtimer(int cpu)
{
struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu));
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
hrtimer->function = qos_overload_timer_handler;
}
#endif
static struct task_struct *
@@ -7045,10 +7131,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
se = pick_next_entity(cfs_rq, curr);
cfs_rq = group_cfs_rq(se);
#ifdef CONFIG_QOS_SCHED
if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 &&
!sched_idle_cpu(cpu_of(rq)) &&
cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
throttle_qos_cfs_rq(cfs_rq);
if (check_qos_cfs_rq(cfs_rq)) {
cfs_rq = &rq->cfs;
WARN(cfs_rq->nr_running == 0,
"rq->nr_running=%u, cfs_rq->idle_h_nr_running=%u\n",
@@ -7151,6 +7234,8 @@ done: __maybe_unused;
rq->idle_stamp = 0;
goto again;
}
__this_cpu_write(qos_cpu_overload, 0);
#endif
return NULL;
@@ -994,6 +994,9 @@ static inline int cpu_of(struct rq *rq)
#endif
}
#ifdef CONFIG_QOS_SCHED
void init_qos_hrtimer(int cpu);
#endif
#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
@@ -138,6 +138,9 @@ static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
#ifdef CONFIG_QOS_SCHED
static int hundred_thousand = 100000;
#endif
#ifdef CONFIG_PERF_EVENTS
static int six_hundred_forty_kb = 640 * 1024;
#endif
@@ -1280,6 +1283,26 @@ static struct ctl_table kern_table[] = {
.extra2 = &three,
},
#endif
#ifdef CONFIG_QOS_SCHED
{
.procname = "qos_overload_detect_period_ms",
.data = &sysctl_overload_detect_period,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &one_thousand,
.extra2 = &hundred_thousand,
},
{
.procname = "qos_offline_wait_interval_ms",
.data = &sysctl_offline_wait_interval,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &one_hundred,
.extra2 = &one_thousand,
},
#endif
{ }
};