Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar: "The main changes in this cycle were: - sched/fair load tracking fixes and cleanups (Byungchul Park) - Make load tracking frequency scale invariant (Dietmar Eggemann) - sched/deadline updates (Juri Lelli) - stop machine fixes, cleanups and enhancements for bugs triggered by CPU hotplug stress testing (Oleg Nesterov) - scheduler preemption code rework: remove PREEMPT_ACTIVE and related cleanups (Peter Zijlstra) - Rework the sched_info::run_delay code to fix races (Peter Zijlstra) - Optimize per entity utilization tracking (Peter Zijlstra) - ... misc other fixes, cleanups and smaller updates" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits) sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop() sched: Start stopper early stop_machine: Kill cpu_stop_threads->setup() and cpu_stop_unpark() stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark() stop_machine: Change cpu_stop_queue_two_works() to rely on stopper->enabled stop_machine: Introduce __cpu_stop_queue_work() and cpu_stop_queue_two_works() stop_machine: Ensure that a queued callback will be called before cpu_stop_park() sched/x86: Fix typo in __switch_to() comments sched/core: Remove a parameter in the migrate_task_rq() function sched/core: Drop unlikely behind BUG_ON() sched/core: Fix task and run queue sched_info::run_delay inconsistencies sched/numa: Fix task_tick_fair() from disabling numa_balancing sched/core: Add preempt_count invariant check sched/core: More notrace annotations sched/core: Kill PREEMPT_ACTIVE sched/core, sched/x86: Kill thread_info::saved_preempt_count sched/core: Simplify preempt_count tests sched/core: Robustify preemption leak checks sched/core: Stop setting PREEMPT_ACTIVE ...

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar: "The main changes in this cycle were: - sched/fair load tracking fixes and cleanups (Byungchul Park) - Make load tracking frequency scale invariant (Dietmar Eggemann) - sched/deadline updates (Juri Lelli) - stop machine fixes, cleanups and enhancements for bugs triggered by CPU hotplug stress testing (Oleg Nesterov) - scheduler preemption code rework: remove PREEMPT_ACTIVE and related cleanups (Peter Zijlstra) - Rework the sched_info::run_delay code to fix races (Peter Zijlstra) - Optimize per entity utilization tracking (Peter Zijlstra) - ... misc other fixes, cleanups and smaller updates" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits) sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop() sched: Start stopper early stop_machine: Kill cpu_stop_threads->setup() and cpu_stop_unpark() stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark() stop_machine: Change cpu_stop_queue_two_works() to rely on stopper->enabled stop_machine: Introduce __cpu_stop_queue_work() and cpu_stop_queue_two_works() stop_machine: Ensure that a queued callback will be called before cpu_stop_park() sched/x86: Fix typo in __switch_to() comments sched/core: Remove a parameter in the migrate_task_rq() function sched/core: Drop unlikely behind BUG_ON() sched/core: Fix task and run queue sched_info::run_delay inconsistencies sched/numa: Fix task_tick_fair() from disabling numa_balancing sched/core: Add preempt_count invariant check sched/core: More notrace annotations sched/core: Kill PREEMPT_ACTIVE sched/core, sched/x86: Kill thread_info::saved_preempt_count sched/core: Simplify preempt_count tests sched/core: Robustify preemption leak checks sched/core: Stop setting PREEMPT_ACTIVE ...
53528695 · Linus Torvalds · b831ef2c · e73e85f0 · 53528695 · 53528695
26 changed file
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc)
 /*
 * must be macros to avoid header recursion hell
 */
-#define init_task_preempt_count(p) do { \
+#define init_task_preempt_count(p) do { } while (0)
-	task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
-} while (0)
 #define init_idle_preempt_count(p, cpu) do { \
-	task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \
 	per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
 } while (0)

--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -57,7 +57,6 @@ struct thread_info {
 	__u32			flags;		/* low level flags */
 	__u32			status;		/* thread synchronous flags */
 	__u32			cpu;		/* current CPU */
-	int			saved_preempt_count;
 	mm_segment_t		addr_limit;
 	void __user		*sysenter_return;
 	unsigned int		sig_on_uaccess_error:1;
@@ -69,7 +68,6 @@ struct thread_info {
 	.task		= &tsk,			\
 	.flags		= 0,			\
 	.cpu		= 0,			\
-	.saved_preempt_count = INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
 }

--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -279,14 +279,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
 		set_iopl_mask(next->iopl);
-	/*
-	 * If it were not for PREEMPT_ACTIVE we could guarantee that the
-	 * preempt_count of all tasks was equal here and this would not be
-	 * needed.
-	 */
-	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
-	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
 	/*
 	 * Now maybe handle debug registers and/or IO bitmaps
 	 */

--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -332,7 +332,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	/*
 	 * Switch FS and GS.
 	 *
-	 * These are even more complicated than FS and GS: they have
+	 * These are even more complicated than DS and ES: they have
 	 * 64-bit bases are that controlled by arch_prctl.  Those bases
 	 * only differ from the values in the GDT or LDT if the selector
 	 * is 0.
@@ -401,14 +401,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	this_cpu_write(current_task, next_p);
-	/*
-	 * If it were not for PREEMPT_ACTIVE we could guarantee that the
-	 * preempt_count of all tasks was equal here and this would not be
-	 * needed.
-	 */
-	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
-	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
 	/* Reload esp0 and ss1.  This changes current_thread_info(). */
 	load_sp0(tss, next);

--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -24,7 +24,7 @@ static __always_inline void preempt_count_set(int pc)
 * must be macros to avoid header recursion hell
 */
 #define init_task_preempt_count(p) do { \
-	task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
+	task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \
 } while (0)
 #define init_idle_preempt_count(p, cpu) do { \

--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -26,7 +26,6 @@
 *         SOFTIRQ_MASK:	0x0000ff00
 *         HARDIRQ_MASK:	0x000f0000
 *             NMI_MASK:	0x00100000
- *       PREEMPT_ACTIVE:	0x00200000
 * PREEMPT_NEED_RESCHED:	0x80000000
 */
 #define PREEMPT_BITS	8
@@ -53,10 +52,6 @@
 #define SOFTIRQ_DISABLE_OFFSET	(2 * SOFTIRQ_OFFSET)
-#define PREEMPT_ACTIVE_BITS	1
-#define PREEMPT_ACTIVE_SHIFT	(NMI_SHIFT + NMI_BITS)
-#define PREEMPT_ACTIVE	(__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
 /* We use the MSB mostly because its available */
 #define PREEMPT_NEED_RESCHED	0x80000000
@@ -126,8 +121,7 @@
 * Check whether we were atomic before we did preempt_disable():
 * (used by the scheduler)
 */
-#define in_atomic_preempt_off() \
+#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)
-		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
 extern void preempt_count_add(int val);
@@ -146,18 +140,6 @@ extern void preempt_count_sub(int val);
 #define preempt_count_inc() preempt_count_add(1)
 #define preempt_count_dec() preempt_count_sub(1)
-#define preempt_active_enter() \
-do { \
-	preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-	barrier(); \
-} while (0)
-#define preempt_active_exit() \
-do { \
-	barrier(); \
-	preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-} while (0)
 #ifdef CONFIG_PREEMPT_COUNT
 #define preempt_disable() \

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -599,20 +599,26 @@ struct task_cputime_atomic {
 		.sum_exec_runtime = ATOMIC64_INIT(0),		\
 	}
-#ifdef CONFIG_PREEMPT_COUNT
+#define PREEMPT_DISABLED	(PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
-#define PREEMPT_DISABLED	(1 + PREEMPT_ENABLED)
-#else
+/*
-#define PREEMPT_DISABLED	PREEMPT_ENABLED
+ * Disable preemption until the scheduler is running -- use an unconditional
-#endif
+ * value so that it also works on !PREEMPT_COUNT kernels.
+ *
+ * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
+ */
+#define INIT_PREEMPT_COUNT	PREEMPT_OFFSET
 /*
- * Disable preemption until the scheduler is running.
+ * Initial preempt_count value; reflects the preempt_count schedule invariant
- * Reset by start_kernel()->sched_init()->init_idle().
+ * which states that during context switches:
 *
- * We include PREEMPT_ACTIVE to avoid cond_resched() from working
+ *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
- * before the scheduler is active -- see should_resched().
+ *
+ * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
+ * Note: See finish_task_switch().
 */
-#define INIT_PREEMPT_COUNT	(PREEMPT_DISABLED + PREEMPT_ACTIVE)
+#define FORK_PREEMPT_COUNT	(2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
 /**
 * struct thread_group_cputimer - thread group interval timer counts
@@ -1142,8 +1148,6 @@ struct sched_domain_topology_level {
 #endif
 };
-extern struct sched_domain_topology_level *sched_domain_topology;
 extern void set_sched_topology(struct sched_domain_topology_level *tl);
 extern void wake_up_if_idle(int cpu);
@@ -1192,10 +1196,10 @@ struct load_weight {
 /*
 * The load_avg/util_avg accumulates an infinite geometric series.
- * 1) load_avg factors the amount of time that a sched_entity is
+ * 1) load_avg factors frequency scaling into the amount of time that a
- * runnable on a rq into its weight. For cfs_rq, it is the aggregated
+ * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
- * such weights of all runnable and blocked sched_entities.
+ * aggregated such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency scaling into the amount of time
+ * 2) util_avg factors frequency and cpu scaling into the amount of time
 * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
 * For cfs_rq, it is the aggregated such times of all runnable and
 * blocked sched_entities.

--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -21,4 +21,9 @@ static inline int dl_task(struct task_struct *p)
 	return dl_prio(p->prio);
 }
+static inline bool dl_time_before(u64 a, u64 b)
+{
+	return (s64)(a - b) < 0;
+}
 #endif /* _SCHED_DEADLINE_H */
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -24,9 +24,6 @@ struct smpboot_thread_data;
 *			parked (cpu offline)
 * @unpark:		Optional unpark function, called when the thread is
 *			unparked (cpu online)
- * @pre_unpark:		Optional unpark function, called before the thread is
- *			unparked (cpu online). This is not guaranteed to be
- *			called on the target cpu of the thread. Careful!
 * @cpumask:		Internal state.  To update which threads are unparked,
 *			call smpboot_update_cpumask_percpu_thread().
 * @selfparking:	Thread is not parked by the park function.
@@ -42,7 +39,6 @@ struct smp_hotplug_thread {
 	void				(*cleanup)(unsigned int cpu, bool online);
 	void				(*park)(unsigned int cpu);
 	void				(*unpark)(unsigned int cpu);
-	void				(*pre_unpark)(unsigned int cpu);
 	cpumask_var_t			cpumask;
 	bool				selfparking;
 	const char			*thread_comm;

--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -33,6 +33,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 			 struct cpu_stop_work *work_buf);
 int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
 int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+void stop_machine_park(int cpu);
+void stop_machine_unpark(int cpu);
 #else	/* CONFIG_SMP */

--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -104,22 +104,17 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
 	     TP_ARGS(p));
 #ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(struct task_struct *p)
+static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
 {
-	long state = p->state;
-#ifdef CONFIG_PREEMPT
 #ifdef CONFIG_SCHED_DEBUG
 	BUG_ON(p != current);
 #endif /* CONFIG_SCHED_DEBUG */
 	/*
-	 * For all intents and purposes a preempted task is a running task.
+	 * Preemption ignores task state, therefore preempted tasks are always
+	 * RUNNING (we will not have dequeued if state != RUNNING).
 	 */
-	if (preempt_count() & PREEMPT_ACTIVE)
+	return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state;
-		state = TASK_RUNNING | TASK_STATE_MAX;
-#endif /* CONFIG_PREEMPT */
-	return state;
 }
 #endif /* CREATE_TRACE_POINTS */
@@ -128,10 +123,11 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
 */
 TRACE_EVENT(sched_switch,
-	TP_PROTO(struct task_struct *prev,
+	TP_PROTO(bool preempt,
+		 struct task_struct *prev,
 		 struct task_struct *next),
-	TP_ARGS(prev, next),
+	TP_ARGS(preempt, prev, next),
 	TP_STRUCT__entry(
 		__array(	char,	prev_comm,	TASK_COMM_LEN	)
@@ -147,7 +143,7 @@ TRACE_EVENT(sched_switch,
 		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
 		__entry->prev_pid	= prev->pid;
 		__entry->prev_prio	= prev->prio;
-		__entry->prev_state	= __trace_sched_switch_state(prev);
+		__entry->prev_state	= __trace_sched_switch_state(preempt, prev);
 		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
 		__entry->next_pid	= next->pid;
 		__entry->next_prio	= next->prio;

--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -291,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu)
 {
 	struct task_struct *g, *p;
-	read_lock_irq(&tasklist_lock);
+	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
+	for_each_process_thread(g, p) {
 		if (!p->on_rq)
 			continue;
 		/*
@@ -307,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu)
 		pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
 			p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
-	} while_each_thread(g, p);
+	}
-	read_unlock_irq(&tasklist_lock);
+	read_unlock(&tasklist_lock);
 }
 struct take_cpu_down_param {
@@ -331,7 +331,7 @@ static int take_cpu_down(void *_param)
 	/* Give up timekeeping duties */
 	tick_handover_do_timer();
 	/* Park the stopper thread */
-	kthread_park(current);
+	stop_machine_park((long)param->hcpu);
 	return 0;
 }

--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -706,10 +706,12 @@ void do_exit(long code)
 	smp_mb();
 	raw_spin_unlock_wait(&tsk->pi_lock);
-	if (unlikely(in_atomic()))
+	if (unlikely(in_atomic())) {
 		pr_info("note: %s[%d] exited with preempt_count %d\n",
 			current->comm, task_pid_nr(current),
 			preempt_count());
+		preempt_count_set(PREEMPT_ENABLED);
+	}
 	/* sync mm's RSS info before statistics gathering */
 	if (tsk->mm)

--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -170,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
 	 * then right waiter has a dl_prio() too.
 	 */
 	if (dl_prio(left->prio))
-		return (left->task->dl.deadline < right->task->dl.deadline);
+		return dl_time_before(left->task->dl.deadline,
+				      right->task->dl.deadline);
 	return 0;
 }

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p)
 	/*
 	 * SCHED_IDLE tasks get minimal weight:
 	 */
-	if (p->policy == SCHED_IDLE) {
+	if (idle_policy(p->policy)) {
 		load->weight = scale_load(WEIGHT_IDLEPRIO);
 		load->inv_weight = WMULT_IDLEPRIO;
 		return;
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
 	load->inv_weight = prio_to_wmult[prio];
 }
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_queued(rq, p);
+	if (!(flags & ENQUEUE_RESTORE))
+		sched_info_queued(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 }
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_dequeued(rq, p);
+	if (!(flags & DEQUEUE_SAVE))
+		sched_info_dequeued(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 }
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 		 * holding rq->lock.
 		 */
 		lockdep_assert_held(&rq->lock);
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	}
 	if (running)
 		put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 }
 /*
@@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	if (task_cpu(p) != new_cpu) {
 		if (p->sched_class->migrate_task_rq)
-			p->sched_class->migrate_task_rq(p, new_cpu);
+			p->sched_class->migrate_task_rq(p);
 		p->se.nr_migrations++;
 		perf_event_task_migrate(p);
 	}
@@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data)
 	struct rq *src_rq, *dst_rq;
 	int ret = -EAGAIN;
+	if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+		return -EAGAIN;
 	src_rq = cpu_rq(arg->src_cpu);
 	dst_rq = cpu_rq(arg->dst_cpu);
 	double_raw_lock(&arg->src_task->pi_lock,
 			&arg->dst_task->pi_lock);
 	double_rq_lock(src_rq, dst_rq);
 	if (task_cpu(arg->dst_task) != arg->dst_cpu)
 		goto unlock;
@@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 			goto out;
 		}
+		/* No more Mr. Nice Guy. */
 		switch (state) {
 		case cpuset:
-			/* No more Mr. Nice Guy. */
+			if (IS_ENABLED(CONFIG_CPUSETS)) {
-			cpuset_cpus_allowed_fallback(p);
+				cpuset_cpus_allowed_fallback(p);
-			state = possible;
+				state = possible;
-			break;
+				break;
+			}
+			/* fall-through */
 		case possible:
 			do_set_cpus_allowed(p, cpu_possible_mask);
 			state = fail;
@@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 #endif /* CONFIG_SCHEDSTATS */
 }
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
 	activate_task(rq, p, en_flags);
 	p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif /* CONFIG_NUMA_BALANCING */
 }
+DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
 #ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_SCHED_DEBUG
 void set_numabalancing_state(bool enabled)
 {
 	if (enabled)
-		sched_feat_set("NUMA");
+		static_branch_enable(&sched_numa_balancing);
 	else
-		sched_feat_set("NO_NUMA");
+		static_branch_disable(&sched_numa_balancing);
 }
-#else
-__read_mostly bool numabalancing_enabled;
-void set_numabalancing_state(bool enabled)
-{
-	numabalancing_enabled = enabled;
-}
-#endif /* CONFIG_SCHED_DEBUG */
 #ifdef CONFIG_PROC_SYSCTL
 int sysctl_numa_balancing(struct ctl_table *table, int write,
@@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 {
 	struct ctl_table t;
 	int err;
-	int state = numabalancing_enabled;
+	int state = static_branch_likely(&sched_numa_balancing);
 	if (write && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p)
 	struct rq *rq;
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	/* Initialize new task's runnable average */
+	init_entity_runnable_average(&p->se);
 #ifdef CONFIG_SMP
 	/*
 	 * Fork balancing, do it here and not earlier because:
@@ -2358,8 +2362,6 @@ void wake_up_new_task(struct task_struct *p)
 	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
-	/* Initialize new task's runnable average */
-	init_entity_runnable_average(&p->se);
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2483,7 +2485,6 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
 		    struct task_struct *next)
 {
-	trace_sched_switch(prev, next);
 	sched_info_switch(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
 	fire_sched_out_preempt_notifiers(prev, next);
@@ -2517,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
+	/*
+	 * The previous task will have left us with a preempt_count of 2
+	 * because it left us after:
+	 *
+	 *	schedule()
+	 *	  preempt_disable();			// 1
+	 *	  __schedule()
+	 *	    raw_spin_lock_irq(&rq->lock)	// 2
+	 *
+	 * Also, see FORK_PREEMPT_COUNT.
+	 */
+	if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+		      "corrupted preempt_count: %s/%d/0x%x\n",
+		      current->comm, current->pid, preempt_count()))
+		preempt_count_set(FORK_PREEMPT_COUNT);
 	rq->prev_mm = NULL;
 	/*
@@ -2601,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 {
 	struct rq *rq;
-	/* finish_task_switch() drops rq->lock and enables preemtion */
+	/*
-	preempt_disable();
+	 * New tasks start with FORK_PREEMPT_COUNT, see there and
+	 * finish_task_switch() for details.
+	 *
+	 * finish_task_switch() will drop rq->lock() and lower preempt_count
+	 * and the preempt_enable() will end up enabling preemption (on
+	 * PREEMPT_COUNT kernels).
+	 */
 	rq = finish_task_switch(prev);
 	balance_callback(rq);
 	preempt_enable();
@@ -2960,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
 static inline void schedule_debug(struct task_struct *prev)
 {
 #ifdef CONFIG_SCHED_STACK_END_CHECK
-	BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+	BUG_ON(task_stack_end_corrupted(prev));
 #endif
-	/*
-	 * Test if we are atomic. Since do_exit() needs to call into
+	if (unlikely(in_atomic_preempt_off())) {
-	 * schedule() atomically, we ignore that path. Otherwise whine
-	 * if we are scheduling when we should not.
-	 */
-	if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
 		__schedule_bug(prev);
+		preempt_count_set(PREEMPT_DISABLED);
+	}
 	rcu_sleep_check();
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -3054,7 +3076,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 *
 * WARNING: must be called with preemption disabled!
 */
-static void __sched __schedule(void)
+static void __sched notrace __schedule(bool preempt)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
@@ -3066,6 +3088,17 @@ static void __sched __schedule(void)
 	rcu_note_context_switch();
 	prev = rq->curr;
+	/*
+	 * do_exit() calls schedule() with preemption disabled as an exception;
+	 * however we must fix that up, otherwise the next task will see an
+	 * inconsistent (higher) preempt count.
+	 *
+	 * It also avoids the below schedule_debug() test from complaining
+	 * about this.
+	 */
+	if (unlikely(prev->state == TASK_DEAD))
+		preempt_enable_no_resched_notrace();
 	schedule_debug(prev);
 	if (sched_feat(HRTICK))
@@ -3083,7 +3116,7 @@ static void __sched __schedule(void)
 	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 	switch_count = &prev->nivcsw;
-	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+	if (!preempt && prev->state) {
 		if (unlikely(signal_pending_state(prev->state, prev))) {
 			prev->state = TASK_RUNNING;
 		} else {
@@ -3119,6 +3152,7 @@ static void __sched __schedule(void)
 		rq->curr = next;
 		++*switch_count;
+		trace_sched_switch(preempt, prev, next);
 		rq = context_switch(rq, prev, next); /* unlocks the rq */
 		cpu = cpu_of(rq);
 	} else {
@@ -3148,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void)
 	sched_submit_work(tsk);
 	do {
 		preempt_disable();
-		__schedule();
+		__schedule(false);
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
 }
@@ -3188,9 +3222,9 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
 	do {
-		preempt_active_enter();
+		preempt_disable_notrace();
-		__schedule();
+		__schedule(true);
-		preempt_active_exit();
+		preempt_enable_no_resched_notrace();
 		/*
 		 * Check again in case we missed a preemption opportunity
@@ -3241,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 		return;
 	do {
-		/*
+		preempt_disable_notrace();
-		 * Use raw __prempt_count() ops that don't call function.
-		 * We can't call functions before disabling preemption which
-		 * disarm preemption tracing recursions.
-		 */
-		__preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
-		barrier();
 		/*
 		 * Needs preempt disabled in case user_exit() is traced
 		 * and the tracer calls preempt_enable_notrace() causing
 		 * an infinite recursion.
 		 */
 		prev_ctx = exception_enter();
-		__schedule();
+		__schedule(true);
 		exception_exit(prev_ctx);
-		barrier();
+		preempt_enable_no_resched_notrace();
-		__preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
 	} while (need_resched());
 }
 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3281,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 	prev_state = exception_enter();
 	do {
-		preempt_active_enter();
+		preempt_disable();
 		local_irq_enable();
-		__schedule();
+		__schedule(true);
 		local_irq_disable();
-		preempt_active_exit();
+		sched_preempt_enable_no_resched();
 	} while (need_resched());
 	exception_exit(prev_state);
@@ -3313,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function);
 */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	int oldprio, queued, running, enqueue_flag = 0;
+	int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
 	struct rq *rq;
 	const struct sched_class *prev_class;
@@ -3345,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
@@ -3363,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
-			enqueue_flag = ENQUEUE_REPLENISH;
+			enqueue_flag |= ENQUEUE_REPLENISH;
 		} else
 			p->dl.dl_boosted = 0;
 		p->sched_class = &dl_sched_class;
@@ -3371,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
 		if (oldprio < prio)
-			enqueue_flag = ENQUEUE_HEAD;
+			enqueue_flag |= ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
 	} else {
 		if (dl_prio(oldprio))
@@ -3423,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	}
 	queued = task_on_rq_queued(p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -3432,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	delta = p->prio - old_prio;
 	if (queued) {
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3753,10 +3780,7 @@ static int __sched_setscheduler(struct task_struct *p,
 	} else {
 		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
-		if (policy != SCHED_DEADLINE &&
+		if (!valid_policy(policy))
-				policy != SCHED_FIFO && policy != SCHED_RR &&
-				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-				policy != SCHED_IDLE)
 			return -EINVAL;
 	}
@@ -3812,7 +3836,7 @@ static int __sched_setscheduler(struct task_struct *p,
 		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
-		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+		if (idle_policy(p->policy) && !idle_policy(policy)) {
 			if (!can_nice(p, task_nice(p)))
 				return -EPERM;
 		}
@@ -3937,7 +3961,7 @@ static int __sched_setscheduler(struct task_struct *p,
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
@@ -3947,11 +3971,15 @@ static int __sched_setscheduler(struct task_struct *p,
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued) {
+		int enqueue_flags = ENQUEUE_RESTORE;
 		/*
 		 * We enqueue to tail when the priority of a task is
 		 * increased (user space view).
 		 */
-		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+		if (oldprio <= p->prio)
+			enqueue_flags |= ENQUEUE_HEAD;
+		enqueue_task(rq, p, enqueue_flags);
 	}
 	check_class_changed(rq, p, prev_class, oldprio);
@@ -5101,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
@@ -5110,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 	task_rq_unlock(rq, p, &flags);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -5531,21 +5559,27 @@ static void set_cpu_rq_start_time(void)
 static int sched_cpu_active(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
+	int cpu = (long)hcpu;
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_STARTING:
 		set_cpu_rq_start_time();
 		return NOTIFY_OK;
 	case CPU_ONLINE:
 		/*
 		 * At this point a starting CPU has marked itself as online via
 		 * set_cpu_online(). But it might not yet have marked itself
 		 * as active, which is essential from here on.
-		 *
-		 * Thus, fall-through and help the starting CPU along.
 		 */
+		set_cpu_active(cpu, true);
+		stop_machine_unpark(cpu);
+		return NOTIFY_OK;
 	case CPU_DOWN_FAILED:
-		set_cpu_active((long)hcpu, true);
+		set_cpu_active(cpu, true);
 		return NOTIFY_OK;
 	default:
 		return NOTIFY_DONE;
 	}
@@ -6477,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = {
 	{ NULL, },
 };
-struct sched_domain_topology_level *sched_domain_topology = default_topology;
+static struct sched_domain_topology_level *sched_domain_topology =
+	default_topology;
 #define for_each_sd_topology(tl)			\
 	for (tl = sched_domain_topology; tl->mask; tl++)
@@ -7478,7 +7513,7 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+	int nested = preempt_count() + rcu_preempt_depth();
 	return (nested == preempt_offset);
 }
@@ -7725,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk)
 	queued = task_on_rq_queued(tsk);
 	if (queued)
-		dequeue_task(rq, tsk, 0);
+		dequeue_task(rq, tsk, DEQUEUE_SAVE);
 	if (unlikely(running))
 		put_prev_task(rq, tsk);
@@ -7741,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_move_group)
-		tsk->sched_class->task_move_group(tsk, queued);
+		tsk->sched_class->task_move_group(tsk);
 	else
 #endif
 		set_task_rq(tsk, task_cpu(tsk));
@@ -7749,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, tsk, 0);
+		enqueue_task(rq, tsk, ENQUEUE_RESTORE);
 	task_rq_unlock(rq, tsk, &flags);
 }
@@ -8213,14 +8248,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
 			    struct cgroup_subsys_state *old_css,
 			    struct task_struct *task)
 {
-	/*
-	 * cgroup_exit() is called in the copy_process() failure path.
-	 * Ignore this case since the task hasn't ran yet, this avoids
-	 * trying to poke a half freed task state from generic code.
-	 */
-	if (!(task->flags & PF_EXITING))
-		return;
 	sched_move_task(task);
 }

--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,11 +31,6 @@ static inline int right_child(int i)
 	return (i << 1) + 2;
 }
-static inline int dl_time_before(u64 a, u64 b)
-{
-	return (s64)(a - b) < 0;
-}
 static void cpudl_exchange(struct cpudl *cp, int a, int b)
 {
 	int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;

--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -2,6 +2,7 @@
 #define _LINUX_CPUDL_H
 #include <linux/sched.h>
+#include <linux/sched/deadline.h>
 #define IDX_INVALID     -1

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
 */
 SCHED_FEAT(WAKEUP_PREEMPTION, true)
-/*
- * Use arch dependent cpu capacity functions
- */
-SCHED_FEAT(ARCH_CAPACITY, true)
 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)
@@ -72,19 +67,5 @@ SCHED_FEAT(RT_PUSH_IPI, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+SCHED_FEAT(ATTACH_AGE_LOAD, true)
-/*
- * Apply the automatic NUMA scheduling policy. Enabled automatically
- * at runtime if running on a NUMA machine. Can be controlled via
- * numa_balancing=
- */
-#ifdef CONFIG_NUMA_BALANCING
-/*
- * NUMA will favor moving tasks towards nodes where a higher number of
- * hinting faults are recorded during active load balancing. It will
- * resist moving tasks towards nodes where a lower number of hinting
- * faults have been recorded.
- */
-SCHED_FEAT(NUMA,	true)
-#endif
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
 /*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
-static int do_balance_runtime(struct rt_rq *rt_rq)
+static void do_balance_runtime(struct rt_rq *rt_rq)
 {
 	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
-	int i, weight, more = 0;
+	int i, weight;
 	u64 rt_period;
 	weight = cpumask_weight(rd->span);
@@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
 				diff = rt_period - rt_rq->rt_runtime;
 			iter->rt_runtime -= diff;
 			rt_rq->rt_runtime += diff;
-			more = 1;
 			if (rt_rq->rt_runtime == rt_period) {
 				raw_spin_unlock(&iter->rt_runtime_lock);
 				break;
@@ -683,8 +682,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
 		raw_spin_unlock(&iter->rt_runtime_lock);
 	}
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
-	return more;
 }
 /*
@@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq)
 	}
 }
-static int balance_runtime(struct rt_rq *rt_rq)
+static void balance_runtime(struct rt_rq *rt_rq)
 {
-	int more = 0;
 	if (!sched_feat(RT_RUNTIME_SHARE))
-		return more;
+		return;
 	if (rt_rq->rt_time > rt_rq->rt_runtime) {
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
-		more = do_balance_runtime(rt_rq);
+		do_balance_runtime(rt_rq);
 		raw_spin_lock(&rt_rq->rt_runtime_lock);
 	}
-	return more;
 }
 #else /* !CONFIG_SMP */
-static inline int balance_runtime(struct rt_rq *rt_rq)
+static inline void balance_runtime(struct rt_rq *rt_rq) {}
-{
-	return 0;
-}
 #endif /* CONFIG_SMP */
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)

--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { }
 */
 #define RUNTIME_INF	((u64)~0ULL)
+static inline int idle_policy(int policy)
+{
+	return policy == SCHED_IDLE;
+}
 static inline int fair_policy(int policy)
 {
 	return policy == SCHED_NORMAL || policy == SCHED_BATCH;
@@ -98,6 +102,11 @@ static inline int dl_policy(int policy)
 {
 	return policy == SCHED_DEADLINE;
 }
+static inline bool valid_policy(int policy)
+{
+	return idle_policy(policy) || fair_policy(policy) ||
+		rt_policy(policy) || dl_policy(policy);
+}
 static inline int task_has_rt_policy(struct task_struct *p)
 {
@@ -109,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p)
 	return dl_policy(p->policy);
 }
-static inline bool dl_time_before(u64 a, u64 b)
-{
-	return (s64)(a - b) < 0;
-}
 /*
 * Tells if entity @a should preempt entity @b.
 */
@@ -1003,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
-#ifdef CONFIG_NUMA_BALANCING
+extern struct static_key_false sched_numa_balancing;
-#define sched_feat_numa(x) sched_feat(x)
-#ifdef CONFIG_SCHED_DEBUG
-#define numabalancing_enabled sched_feat_numa(NUMA)
-#else
-extern bool numabalancing_enabled;
-#endif /* CONFIG_SCHED_DEBUG */
-#else
-#define sched_feat_numa(x) (0)
-#define numabalancing_enabled (0)
-#endif /* CONFIG_NUMA_BALANCING */
 static inline u64 global_rt_period(void)
 {
@@ -1157,16 +1151,18 @@ static const u32 prio_to_wmult[40] = {
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
-#define ENQUEUE_WAKEUP		1
+#define ENQUEUE_WAKEUP		0x01
-#define ENQUEUE_HEAD		2
+#define ENQUEUE_HEAD		0x02
 #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING		4	/* sched_class::task_waking was called */
+#define ENQUEUE_WAKING		0x04	/* sched_class::task_waking was called */
 #else
-#define ENQUEUE_WAKING		0
+#define ENQUEUE_WAKING		0x00
 #endif
-#define ENQUEUE_REPLENISH	8
+#define ENQUEUE_REPLENISH	0x08
+#define ENQUEUE_RESTORE	0x10
-#define DEQUEUE_SLEEP		1
+#define DEQUEUE_SLEEP		0x01
+#define DEQUEUE_SAVE		0x02
 #define RETRY_TASK		((void *)-1UL)
@@ -1194,7 +1190,7 @@ struct sched_class {
 #ifdef CONFIG_SMP
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
-	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
+	void (*migrate_task_rq)(struct task_struct *p);
 	void (*task_waking) (struct task_struct *task);
 	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1227,7 +1223,7 @@ struct sched_class {
 	void (*update_curr) (struct rq *rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	void (*task_move_group) (struct task_struct *p, int on_rq);
+	void (*task_move_group) (struct task_struct *p);
 #endif
 };
@@ -1405,6 +1401,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 }
 #endif
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+	if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+		return sd->smt_gain / sd->span_weight;
+	return SCHED_CAPACITY_SCALE;
+}
+#endif
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
 	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));

--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -222,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
 {
 	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
-	if (ht->pre_unpark)
+	if (!ht->selfparking)
-		ht->pre_unpark(cpu);
+		kthread_unpark(tsk);
-	kthread_unpark(tsk);
 }
 void smpboot_unpark_threads(unsigned int cpu)

--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -73,21 +73,24 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
 	}
 }
+static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
+					struct cpu_stop_work *work)
+{
+	list_add_tail(&work->list, &stopper->works);
+	wake_up_process(stopper->thread);
+}
 /* queue @work to @stopper.  if offline, @work is completed immediately */
 static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
 {
 	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
 	unsigned long flags;
 	spin_lock_irqsave(&stopper->lock, flags);
+	if (stopper->enabled)
-	if (stopper->enabled) {
+		__cpu_stop_queue_work(stopper, work);
-		list_add_tail(&work->list, &stopper->works);
+	else
-		wake_up_process(stopper->thread);
-	} else
 		cpu_stop_signal_done(work->done, false);
 	spin_unlock_irqrestore(&stopper->lock, flags);
 }
@@ -213,6 +216,31 @@ static int multi_cpu_stop(void *data)
 	return err;
 }
+static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
+				    int cpu2, struct cpu_stop_work *work2)
+{
+	struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
+	struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
+	int err;
+	lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+	spin_lock_irq(&stopper1->lock);
+	spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
+	err = -ENOENT;
+	if (!stopper1->enabled || !stopper2->enabled)
+		goto unlock;
+	err = 0;
+	__cpu_stop_queue_work(stopper1, work1);
+	__cpu_stop_queue_work(stopper2, work2);
+unlock:
+	spin_unlock(&stopper2->lock);
+	spin_unlock_irq(&stopper1->lock);
+	lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+	return err;
+}
 /**
 * stop_two_cpus - stops two cpus
 * @cpu1: the cpu to stop
@@ -247,24 +275,13 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 	cpu_stop_init_done(&done, 2);
 	set_state(&msdata, MULTI_STOP_PREPARE);
-	/*
+	if (cpu1 > cpu2)
-	 * If we observe both CPUs active we know _cpu_down() cannot yet have
+		swap(cpu1, cpu2);
-	 * queued its stop_machine works and therefore ours will get executed
+	if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
-	 * first. Or its not either one of our CPUs that's getting unplugged,
-	 * in which case we don't care.
-	 *
-	 * This relies on the stopper workqueues to be FIFO.
-	 */
-	if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
 		preempt_enable();
 		return -ENOENT;
 	}
-	lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
-	cpu_stop_queue_work(cpu1, &work1);
-	cpu_stop_queue_work(cpu2, &work2);
-	lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
 	preempt_enable();
 	wait_for_completion(&done.completion);
@@ -452,6 +469,18 @@ static void cpu_stopper_thread(unsigned int cpu)
 	}
 }
+void stop_machine_park(int cpu)
+{
+	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+	/*
+	 * Lockless. cpu_stopper_thread() will take stopper->lock and flush
+	 * the pending works before it parks, until then it is fine to queue
+	 * the new works.
+	 */
+	stopper->enabled = false;
+	kthread_park(stopper->thread);
+}
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
 static void cpu_stop_create(unsigned int cpu)
@@ -462,26 +491,16 @@ static void cpu_stop_create(unsigned int cpu)
 static void cpu_stop_park(unsigned int cpu)
 {
 	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
-	struct cpu_stop_work *work, *tmp;
-	unsigned long flags;
-	/* drain remaining works */
+	WARN_ON(!list_empty(&stopper->works));
-	spin_lock_irqsave(&stopper->lock, flags);
-	list_for_each_entry_safe(work, tmp, &stopper->works, list) {
-		list_del_init(&work->list);
-		cpu_stop_signal_done(work->done, false);
-	}
-	stopper->enabled = false;
-	spin_unlock_irqrestore(&stopper->lock, flags);
 }
-static void cpu_stop_unpark(unsigned int cpu)
+void stop_machine_unpark(int cpu)
 {
 	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
-	spin_lock_irq(&stopper->lock);
 	stopper->enabled = true;
-	spin_unlock_irq(&stopper->lock);
+	kthread_unpark(stopper->thread);
 }
 static struct smp_hotplug_thread cpu_stop_threads = {
@@ -490,9 +509,7 @@ static struct smp_hotplug_thread cpu_stop_threads = {
 	.thread_fn		= cpu_stopper_thread,
 	.thread_comm		= "migration/%u",
 	.create			= cpu_stop_create,
-	.setup			= cpu_stop_unpark,
 	.park			= cpu_stop_park,
-	.pre_unpark		= cpu_stop_unpark,
 	.selfparking		= true,
 };
@@ -508,6 +525,7 @@ static int __init cpu_stop_init(void)
 	}
 	BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
+	stop_machine_unpark(raw_smp_processor_id());
 	stop_machine_initialized = true;
 	return 0;
 }

--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5697,7 +5697,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 }
 static void
-ftrace_graph_probe_sched_switch(void *ignore,
+ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
 			struct task_struct *prev, struct task_struct *next)
 {
 	unsigned long long timestamp;

--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@ static int			sched_ref;
 static DEFINE_MUTEX(sched_register_mutex);
 static void
-probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
+probe_sched_switch(void *ignore, bool preempt,
+		   struct task_struct *prev, struct task_struct *next)
 {
 	if (unlikely(!sched_ref))
 		return;

--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -420,7 +420,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 }
 static void notrace
-probe_wakeup_sched_switch(void *ignore,
+probe_wakeup_sched_switch(void *ignore, bool preempt,
 			  struct task_struct *prev, struct task_struct *next)
 {
 	struct trace_array_cpu *data;