diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9e9c0bd4197d4f1de1181e002f06d24cfd5898f8..b977f07ed41c4803312c543b835cf60790d78a4e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2120,6 +2120,13 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
 #ifdef CONFIG_QOS_SCHED
 void sched_move_offline_task(struct task_struct *p);
+void sched_qos_offline_wait(void);
+int sched_qos_cpu_overload(void);
+#else
+static inline int sched_qos_cpu_overload(void)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 525d73dd8ef9ea9deaeb0924e952d1f175b70bae..cd2b767bbff809f8dec8e71e136775d7e4c85cf3 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -74,6 +74,11 @@ extern unsigned int sysctl_sched_uclamp_util_min_rt_default;
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 #endif
 
+#ifdef CONFIG_QOS_SCHED
+extern unsigned int sysctl_overload_detect_period;
+extern unsigned int sysctl_offline_wait_interval;
+#endif
+
 #ifdef CONFIG_SCHED_AUTOGROUP
 extern unsigned int sysctl_sched_autogroup_enabled;
 #endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 8a4dd7027e908a442accff08d6e2496614a20761..df3c534dc138e1b6b95450c01e91436e1ae395e3 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -160,6 +160,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 		if (ti_work & _TIF_SIGPENDING)
 			arch_do_signal(regs);
 
+#ifdef CONFIG_QOS_SCHED
+		sched_qos_offline_wait();
+#endif
+
 		if (ti_work & _TIF_NOTIFY_RESUME) {
 			tracehook_notify_resume(regs);
 			rseq_handle_notify_resume(NULL, regs);
@@ -187,7 +191,8 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
 
 	lockdep_assert_irqs_disabled();
 
-	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
+	if (unlikely((ti_work & EXIT_TO_USER_MODE_WORK) ||
+		     sched_qos_cpu_overload()))
 		ti_work = exit_to_user_mode_loop(regs, ti_work);
 
 	arch_exit_to_user_mode_prepare(regs, ti_work);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 856c4123e92a44c521916a840414872e918bef9a..b46717970ab9e65a483f02fb66764961ad4ba0d0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7394,6 +7394,9 @@ void __init sched_init(void)
 		 * We achieve this by letting root_task_group's tasks sit
 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
 		 */
+#ifdef CONFIG_QOS_SCHED
+		init_qos_hrtimer(i);
+#endif
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1a0cb9a4161e56e1dc3869f7ae684ad79018cd9d..e5cf15fb9e84d552141a3f9b3e7edeb19a965f19 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -24,6 +24,9 @@
 #ifdef CONFIG_SCHED_STEAL
 #include "sparsemask.h"
 #endif
+#ifdef CONFIG_QOS_SCHED
+#include <linux/delay.h>
+#endif
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -153,6 +156,10 @@ int __weak arch_asym_cpu_priority(int cpu)
 
 #ifdef CONFIG_QOS_SCHED
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer);
+static DEFINE_PER_CPU(int, qos_cpu_overload);
+unsigned int sysctl_overload_detect_period = 5000;  /* in ms */
+unsigned int sysctl_offline_wait_interval = 100;  /* in ms */
 static int unthrottle_qos_cfs_rqs(int cpu);
 #endif
 
@@ -7245,6 +7252,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 }
 
 #ifdef CONFIG_QOS_SCHED
+static void start_qos_hrtimer(int cpu);
 static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
@@ -7283,6 +7291,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 
 	}
 
+	if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq))))
+		start_qos_hrtimer(cpu_of(rq));
+
 	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
 
@@ -7342,7 +7353,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 		resched_curr(rq);
 }
 
-static int unthrottle_qos_cfs_rqs(int cpu)
+static int __unthrottle_qos_cfs_rqs(int cpu)
 {
 	struct cfs_rq *cfs_rq, *tmp_rq;
 	int res = 0;
@@ -7358,11 +7369,26 @@ static int unthrottle_qos_cfs_rqs(int cpu)
 	return res;
 }
 
+static int unthrottle_qos_cfs_rqs(int cpu)
+{
+	int res;
+
+	res = __unthrottle_qos_cfs_rqs(cpu);
+	if (res)
+		hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu)));
+
+	return res;
+}
+
 static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
+	if (unlikely(__this_cpu_read(qos_cpu_overload))) {
+		return false;
+	}
+
 	if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 &&
-		!sched_idle_cpu(smp_processor_id()) &&
-		cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
+		     !sched_idle_cpu(smp_processor_id()) &&
+		     cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
 		throttle_qos_cfs_rq(cfs_rq);
 		return true;
 	}
@@ -7380,6 +7406,56 @@ static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq)
 	unthrottle_qos_cfs_rq(cfs_rq);
 	rq_unlock_irqrestore(rq, &rf);
 }
+
+void sched_qos_offline_wait(void)
+{
+	long qos_level;
+
+	while (unlikely(this_cpu_read(qos_cpu_overload))) {
+		rcu_read_lock();
+		qos_level = task_group(current)->qos_level;
+		rcu_read_unlock();
+		if (qos_level != -1 || signal_pending(current))
+			break;
+		msleep_interruptible(sysctl_offline_wait_interval);
+	}
+}
+
+int sched_qos_cpu_overload(void)
+{
+	return __this_cpu_read(qos_cpu_overload);
+}
+
+static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer)
+{
+	struct rq_flags rf;
+	struct rq *rq = this_rq();
+
+	rq_lock_irqsave(rq, &rf);
+	if (__unthrottle_qos_cfs_rqs(smp_processor_id()))
+		__this_cpu_write(qos_cpu_overload, 1);
+	rq_unlock_irqrestore(rq, &rf);
+
+	return HRTIMER_NORESTART;
+}
+
+static void start_qos_hrtimer(int cpu)
+{
+	ktime_t time;
+	struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu));
+
+	time = ktime_add_ms(hrtimer->base->get_time(), (u64)sysctl_overload_detect_period);
+	hrtimer_set_expires(hrtimer, time);
+	hrtimer_start_expires(hrtimer, HRTIMER_MODE_ABS_PINNED);
+}
+
+void init_qos_hrtimer(int cpu)
+{
+	struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu));
+
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+	hrtimer->function = qos_overload_timer_handler;
+}
 #endif
 
 struct task_struct *
@@ -7548,6 +7624,8 @@ done: __maybe_unused;
 		rq->idle_stamp = 0;
 		goto again;
 	}
+
+	__this_cpu_write(qos_cpu_overload, 0);
 #endif
 	/*
 	 * rq is about to be idle, check if we need to update the
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9ec230220ee3a9c29e1f9cccf978f9561e01ef5b..4c58086cf080122df979e7219ac4c127dffc01c1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1100,6 +1100,9 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
+#ifdef CONFIG_QOS_SCHED
+void init_qos_hrtimer(int cpu);
+#endif
 #ifdef CONFIG_SCHED_SMT
 extern void __update_idle_core(struct rq *rq);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 261787cebd8ebcbd75cd74ad1557759fcf01f850..749ef59224e28babb2c316e84b47a142b9fcd76c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -128,6 +128,9 @@ static int one_thousand = 1000;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
+#ifdef CONFIG_QOS_SCHED
+static int hundred_thousand = 100000;
+#endif
 #ifdef CONFIG_PERF_EVENTS
 static int six_hundred_forty_kb = 640 * 1024;
 #endif
@@ -2725,6 +2728,26 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0555,
 		.child		= ias_table,
 	},
+#ifdef CONFIG_QOS_SCHED
+	{
+		.procname	= "qos_overload_detect_period_ms",
+		.data		= &sysctl_overload_detect_period,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one_thousand,
+		.extra2		= &hundred_thousand,
+	},
+	{
+		.procname	= "qos_offline_wait_interval_ms",
+		.data		= &sysctl_offline_wait_interval,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one_hundred,
+		.extra2		= &one_thousand,
+	},
+#endif
 	{ }
 };
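
For testing, below is a minimal user-space sketch that exercises the two new tunables; it is not part of the patch. It assumes CONFIG_QOS_SCHED is enabled and that the kern_table entries surface under /proc/sys/kernel/ as usual; the helper names (read_sysctl, write_sysctl) are illustrative only. The bounds mentioned in the comments mirror the extra1/extra2 limits added to kernel/sysctl.c, and proc_dointvec_minmax rejects out-of-range writes rather than clamping them.

/*
 * test_qos_sysctls.c -- illustrative only, not part of this patch.
 *
 * Reads back and updates the two QoS tunables added above. Assumes
 * CONFIG_QOS_SCHED and the usual /proc/sys/kernel/ placement of
 * kern_table entries; run as root to allow the writes.
 */
#include <stdio.h>

static unsigned int read_sysctl(const char *path)
{
	unsigned int val = 0;
	FILE *f = fopen(path, "r");

	if (!f || fscanf(f, "%u", &val) != 1)
		perror(path);
	if (f)
		fclose(f);
	return val;
}

static int write_sysctl(const char *path, unsigned int val)
{
	FILE *f = fopen(path, "w");

	if (!f || fprintf(f, "%u\n", val) < 0) {
		perror(path);
		if (f)
			fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	const char *period = "/proc/sys/kernel/qos_overload_detect_period_ms";
	const char *wait   = "/proc/sys/kernel/qos_offline_wait_interval_ms";

	printf("detect period: %u ms (patch default 5000)\n", read_sysctl(period));
	printf("wait interval: %u ms (patch default 100)\n", read_sysctl(wait));

	/*
	 * proc_dointvec_minmax refuses values outside 1000..100000 ms
	 * (detect period) and 100..1000 ms (wait interval) with -EINVAL.
	 */
	write_sysctl(period, 10000);
	write_sysctl(wait, 200);
	return 0;
}

The defaults printed above are the initial values of sysctl_overload_detect_period and sysctl_offline_wait_interval set in kernel/sched/fair.c.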