Unverified commit 525e6792, authored by openeuler-ci-bot, committed by Gitee

!182 futex: introduce the direct-thread-switch mechanism

Merge Pull Request from: @hizhisong 
 
In some cases, we need to run several threads together that require very low
switching overhead and behave like logical operations, e.g. semaphore P/V
operations. Such a thread frequently goes to sleep and wakes another thread up,
and every thread switch forces the kernel to pay several scheduling-related
costs (selecting the proper core to run on, waking the task up, enqueueing the
task, marking the task's scheduling flag, picking the task at the proper time,
dequeueing the task and doing the context switch). These overheads are
unacceptable for such threads. Therefore, we need a mechanism that avoids the
unnecessary overhead and swaps threads directly without affecting the fairness
of CFS tasks.

To achieve this goal, we implemented the direct-thread-switch (DTS) mechanism
based on the futex_swap patch*, which switches to the DTS task directly via a
shared scheduling entity, while basically keeping the kernel secure and
consistent.

\* https://lore.kernel.org/lkml/20200722234538.166697-2-posk@posk.io/
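
For illustration only, a minimal userspace sketch of how a waiter could request the DTS path. It assumes the FUTEX_SWAP (13) and FUTEX_FLAGS_DTS_MODE (512) values added by this patch; the wrapper name and variables are hypothetical:

#define _GNU_SOURCE
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

#ifndef FUTEX_SWAP
#define FUTEX_SWAP 13                 /* new futex op added by this patch */
#endif
#define FUTEX_FLAGS_DTS_MODE 512      /* request the direct-thread-switch path */

/* Block on *self (expected to still hold val) and hand execution directly
 * to the single waiter queued on *other; the kernel falls back to a normal
 * wakeup if the direct switch is not possible. */
static int futex_swap_dts(uint32_t *self, uint32_t val, uint32_t *other)
{
	return syscall(SYS_futex, self,
		       FUTEX_SWAP | FUTEX_PRIVATE_FLAG | FUTEX_FLAGS_DTS_MODE,
		       val, NULL, other, 0);
}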
 
 
Link: https://gitee.com/openeuler/kernel/pulls/182 
Reviewed-by: Zheng Zengkai <zhengzengkai@huawei.com> 
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com> 
......@@ -68,6 +68,11 @@ struct task_delay_info;
struct task_group;
struct io_uring_task;
#define NONE_BY_PASS 0x0000
#define INIT_BY_PASS 0x0001
#define IN_BY_PASS 0x0002
#define END_BY_PASS 0x0004
/*
* Task state bitmask. NOTE! These bits are also
* encoded in fs/proc/array.c: get_task_state().
......@@ -500,6 +505,10 @@ struct sched_entity {
unsigned long runnable_weight;
#endif
#ifdef CONFIG_DTS
int by_pass;
#endif
#ifdef CONFIG_SMP
/*
* Per entity load average tracking.
......@@ -726,6 +735,15 @@ struct task_struct {
int normal_prio;
unsigned int rt_priority;
#ifdef CONFIG_DTS
/*
* by_pass indicates that the task was launched by direct-thread-switch.
* dts_shared_se is the scheduling entity shared with the DTS task.
*/
int by_pass;
struct sched_entity dts_shared_se;
#endif
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
......@@ -1829,6 +1847,7 @@ extern struct task_struct *find_get_task_by_vpid(pid_t nr);
extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
extern int wake_up_process_prefer_current_cpu(struct task_struct *tsk);
extern void wake_up_new_task(struct task_struct *tsk);
#ifdef CONFIG_SMP
......@@ -2193,6 +2212,10 @@ static inline int sched_qos_cpu_overload(void)
}
#endif
#ifdef CONFIG_DTS
extern int check_task_left_time(struct task_struct *task);
#endif
#ifdef CONFIG_BPF_SCHED
extern void sched_settag(struct task_struct *tsk, s64 tag);
......
......@@ -21,10 +21,18 @@
#define FUTEX_WAKE_BITSET 10
#define FUTEX_WAIT_REQUEUE_PI 11
#define FUTEX_CMP_REQUEUE_PI 12
#define FUTEX_SWAP 13
#define FUTEX_PRIVATE_FLAG 128
#define FUTEX_CLOCK_REALTIME 256
#ifdef CONFIG_DTS
#define FUTEX_FLAGS_DTS_MODE 512
#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | \
FUTEX_FLAGS_DTS_MODE)
#else
#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
#endif
#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
......@@ -40,6 +48,8 @@
FUTEX_PRIVATE_FLAG)
#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
FUTEX_PRIVATE_FLAG)
#define FUTEX_SWAP_PRIVATE (FUTEX_SWAP | FUTEX_PRIVATE_FLAG)
/*
* Support for robust futexes: the kernel cleans up held futexes at
......
......@@ -1261,6 +1261,13 @@ config SCHED_STEAL
If unsure, say N here.
config DTS
bool "Direct Thread Switch"
default y
depends on SCHED_STEAL
help
Enable the direct-thread-switch mechanism for the futex_swap operation.
config CHECKPOINT_RESTORE
bool "Checkpoint/restore support"
select PROC_CHILDREN
......
......@@ -39,11 +39,16 @@
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/time_namespace.h>
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <asm/futex.h>
#include "locking/rtmutex_common.h"
#ifdef CONFIG_DTS
#include "sched/sched.h"
#endif
/*
* READ this before attempting to hack on futexes!
*
......@@ -161,7 +166,7 @@ static int __read_mostly futex_cmpxchg_enabled;
* NOMMU does not have per process address space. Let the compiler optimize
* code away.
*/
# define FLAGS_SHARED 0x00
#define FLAGS_SHARED 0x00
#endif
#define FLAGS_CLOCKRT 0x02
#define FLAGS_HAS_TIMEOUT 0x04
......@@ -1584,16 +1589,16 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
}
/*
* Wake up waiters matching bitset queued on this futex (uaddr).
* Prepare the wake queue with waiters matching bitset that are queued on this futex (uaddr).
*/
static int
futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
prepare_wake_q(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset,
struct wake_q_head *wake_q)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
int ret;
DEFINE_WAKE_Q(wake_q);
if (!bitset)
return -EINVAL;
......@@ -1621,14 +1626,28 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;
mark_wake_futex(&wake_q, this);
mark_wake_futex(wake_q, this);
if (++ret >= nr_wake)
break;
}
}
spin_unlock(&hb->lock);
return ret;
}
/*
* Wake up waiters matching bitset queued on this futex (uaddr).
*/
static int
futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
int ret;
DEFINE_WAKE_Q(wake_q);
ret = prepare_wake_q(uaddr, flags, nr_wake, bitset, &wake_q);
wake_up_q(&wake_q);
return ret;
}
......@@ -2571,14 +2590,230 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
return 0;
}
#ifdef CONFIG_DTS
static int __direct_thread_switch(struct task_struct *next)
{
int cpu = smp_processor_id();
int success = 1;
struct rq_flags rf;
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
struct task_struct *prev = rq->curr;
struct sched_entity *prev_se, *next_se;
unsigned long *switch_count = &prev->nvcsw;
unsigned long prev_state;
int next_state;
struct rq *src_rq_next;
bool locked;
preempt_disable();
local_irq_disable();
if (!prev->by_pass) {
prev_se = &prev->se;
} else {
prev_se = &prev->dts_shared_se;
}
next_se = &next->se;
prev->by_pass = NONE_BY_PASS;
next->by_pass = INIT_BY_PASS;
next->dts_shared_se = *prev_se;
prev_se->by_pass = NONE_BY_PASS;
next->dts_shared_se.by_pass = INIT_BY_PASS;
/* task_struct::state is still volatile, so read it once here */
next_state = next->state;
src_rq_next = task_rq(next);
locked = true;
/* Deliver the execution to the callee. */
if (next_state == TASK_RUNNING) {
/* The next is running now. */
if (task_running(src_rq_next, next)) {
success = 0;
goto end;
}
/* The next task is runnable and may sit on the current core's rq or on another core's rq. */
/* Dequeue the next task's se (rather than dts_shared_se) to keep fairness and consistency.
 * Enqueue the next task's se again when the task expires.
 */
if (task_rq(next) != rq) {
#ifdef CONFIG_SCHED_STEAL
/* migrate */
if (!steal_task(rq, &rf, &locked, next)) {
success = 0;
goto end;
}
#else
success = 0;
goto end;
#endif
}
replace_shared_entity(cfs_rq, next_se, &next->dts_shared_se);
} else if (next_state == TASK_INTERRUPTIBLE) {
/*
 * For now, only a next task sleeping because of futex_swap or futex_wait
 * can be woken up here; handling signals and other interruptible cases
 * still needs to be implemented.
 * P.S. We pick the next task from the wake list of the corresponding futex_t.
 */
/* Enqueue the shared_se and change the state without entering schedule() path. */
if (!wake_up_process_prefer_current_cpu(next)) {
success = 0;
goto end;
}
/* wakeup succeeded (p->state has been set to TASK_RUNNING) */
/* dequeue the shared_se and set rq->curr = &next->dts_shared_se; */
set_next_entity(cfs_rq, &next->dts_shared_se);
} else {
success = 0;
goto end;
}
/* increase rq->cfs.nr_running */
cfs_rq->nr_running++;
sched_submit_work(prev);
rcu_note_context_switch(false);
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller (futex_wait_queue_me) to avoid the race with signal_wake_up():
*
* __set_current_state(@state) signal_wake_up()
* __direct_thread_switch() set_tsk_thread_flag(p, TIF_SIGPENDING)
* wake_up_state(p, state)
* LOCK rq->lock LOCK p->pi_state
* smp_mb__after_spinlock() smp_mb__after_spinlock()
* if (signal_pending_state()) if (p->state & @state)
*
* Also, the membarrier system call requires a full memory barrier
* after coming from user-space, before storing to rq->curr.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
/*
* We may fail to switch, so do not deactivate the current task before
* processing the next one.
*/
/*
* We must load prev->state once (task_struct::state is volatile), such
* that:
*
* - we form a control dependency vs deactivate_task() below.
* - ptrace_{,un}freeze_traced() can change ->state underneath us.
*/
prev_state = prev->state;
if (prev_state) {
if (signal_pending_state(prev_state, prev)) {
prev->state = TASK_RUNNING;
} else {
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
!(prev->flags & PF_FROZEN);
if (prev->sched_contributes_to_load)
rq->nr_uninterruptible++;
/*
* __schedule() ttwu()
* prev_state = prev->state; if (p->on_rq && ...)
* if (prev_state) goto out;
* p->on_rq = 0; smp_acquire__after_ctrl_dep();
* p->state = TASK_WAKING
*
* Where __schedule() and ttwu() have matching control dependencies.
*
* After this, schedule() must not care about p->state any more.
*/
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
if (prev->in_iowait) {
atomic_inc(&rq->nr_iowait);
delayacct_blkio_start();
}
}
}
rq->nr_switches++;
/*
* RCU users of rcu_dereference(rq->curr) may not see
* changes to task_struct made by pick_next_task().
*/
RCU_INIT_POINTER(rq->curr, next);
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
* rq->curr, before returning to user-space.
*
* Here are the schemes providing that barrier on the
* various architectures:
* - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
* switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
* - finish_lock_switch() for weakly-ordered
* architectures where spin_unlock is a full barrier,
* - switch_to() for arm64 (weakly-ordered, spin_unlock
* is a RELEASE barrier),
*/
++*switch_count;
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(false, prev, next);
/* drop the reference taken by get_task_struct() earlier in futex_wait_queue_me() */
put_task_struct(next);
rq = context_switch(rq, prev, next, &rf);
balance_callback(rq);
sched_update_worker(next);
end:
sched_preempt_enable_no_resched();
return success;
}
/*
 * Return: 0 on failure, 1 on success.
 */
static int direct_thread_switch(struct task_struct *next)
{
if (next->sched_class != &fair_sched_class ||
current == next) {
return 0;
}
if (!check_task_left_time(current)) {
return 0;
}
return __direct_thread_switch(next);
}
#endif /* CONFIG_DTS */
/**
* futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
* @hb: the futex hash bucket, must be locked by the caller
* @q: the futex_q to queue up on
* @timeout: the prepared hrtimer_sleeper, or null for no timeout
* @next: if present, wake next and hint to the scheduler that we'd
*        prefer to execute it locally.
* @flags: futex flags; FUTEX_FLAGS_DTS_MODE selects the direct-thread-switch path
*/
static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
struct hrtimer_sleeper *timeout)
struct hrtimer_sleeper *timeout,
struct task_struct *next, int flags)
{
/*
* The task state is guaranteed to be set before another task can
......@@ -2598,15 +2833,60 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* has tried to wake us, and we can skip the call to schedule().
*/
if (likely(!plist_node_empty(&q->list))) {
#ifdef CONFIG_DTS
int do_dts_switch = 0;
#endif
/*
* If the timer has already expired, current will already be
* flagged for rescheduling. Only call schedule if there
* is no timeout, or if it has yet to expire.
*/
if (!timeout || timeout->task)
if (!timeout || timeout->task) {
if (next) {
#ifdef CONFIG_DTS
/*
 * If we fail to switch to the next task directly, fall back to
 * waking it up in the traditional way.
 */
if (flags & FUTEX_FLAGS_DTS_MODE)
do_dts_switch = direct_thread_switch(next);
if (!do_dts_switch)
#endif
{
#ifdef CONFIG_SMP
wake_up_process_prefer_current_cpu(next);
#else
wake_up_process(next);
#endif
}
#ifdef CONFIG_DTS
if (!do_dts_switch)
#endif
put_task_struct(next);
next = NULL;
}
#ifdef CONFIG_DTS
if (!do_dts_switch)
#endif
freezable_schedule();
}
}
__set_current_state(TASK_RUNNING);
if (next) {
#ifdef CONFIG_DTS
direct_thread_switch(next);
#else
wake_up_process(next);
put_task_struct(next);
#endif
}
}
/**
......@@ -2682,7 +2962,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
}
static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
ktime_t *abs_time, u32 bitset)
ktime_t *abs_time, u32 bitset, struct task_struct *next)
{
struct hrtimer_sleeper timeout, *to;
struct restart_block *restart;
......@@ -2706,7 +2986,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
goto out;
/* queue_me and wait for wakeup, timeout, or a signal. */
futex_wait_queue_me(hb, &q, to);
futex_wait_queue_me(hb, &q, to, next, flags);
next = NULL;
/* If we were woken (and unqueued), we succeeded, whatever. */
ret = 0;
......@@ -2738,6 +3019,10 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
ret = set_restart_fn(restart, futex_wait_restart);
out:
if (next) {
wake_up_process(next);
put_task_struct(next);
}
if (to) {
hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer);
......@@ -2745,7 +3030,6 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
return ret;
}
static long futex_wait_restart(struct restart_block *restart)
{
u32 __user *uaddr = restart->futex.uaddr;
......@@ -2757,10 +3041,38 @@ static long futex_wait_restart(struct restart_block *restart)
}
restart->fn = do_no_restart_syscall;
return (long)futex_wait(uaddr, restart->futex.flags,
restart->futex.val, tp, restart->futex.bitset);
return (long)futex_wait(uaddr, restart->futex.flags, restart->futex.val,
tp, restart->futex.bitset, NULL);
}
static int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val,
ktime_t *abs_time, u32 __user *uaddr2)
{
u32 bitset = FUTEX_BITSET_MATCH_ANY;
struct task_struct *next = NULL;
DEFINE_WAKE_Q(wake_q);
int ret;
ret = prepare_wake_q(uaddr2, flags, 1, bitset, &wake_q);
if (ret < 0)
return ret;
if (!wake_q_empty(&wake_q)) {
/* At most one wakee can be present. Pull it out. */
next = container_of(wake_q.first, struct task_struct, wake_q);
next->wake_q.next = NULL;
}
/* Basic security check: are the two tasks in the same thread group? */
/* Does the current task still have any time slice left to hand over? */
/*
 * The current task will go to sleep on the futex while the next one
 * gets to run.
 */
return futex_wait(uaddr, flags, val, abs_time, bitset, next);
}
/*
* Userspace tried a 0 -> TID atomic transition of the futex value
......@@ -3222,7 +3534,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
}
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);
futex_wait_queue_me(hb, &q, to, NULL, flags);
spin_lock(&hb->lock);
ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
......@@ -3708,6 +4020,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
int cmd = op & FUTEX_CMD_MASK;
unsigned int flags = 0;
#ifdef CONFIG_DTS
if (op & FUTEX_FLAGS_DTS_MODE) {
flags |= FUTEX_FLAGS_DTS_MODE;
}
#endif
if (!(op & FUTEX_PRIVATE_FLAG))
flags |= FLAGS_SHARED;
......@@ -3732,7 +4050,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
val3 = FUTEX_BITSET_MATCH_ANY;
fallthrough;
case FUTEX_WAIT_BITSET:
return futex_wait(uaddr, flags, val, timeout, val3);
return futex_wait(uaddr, flags, val, timeout, val3, NULL);
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
fallthrough;
......@@ -3756,6 +4074,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
uaddr2);
case FUTEX_CMP_REQUEUE_PI:
return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
case FUTEX_SWAP:
return futex_swap(uaddr, flags, val, timeout, uaddr2);
}
return -ENOSYS;
}
......@@ -3772,7 +4092,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET ||
cmd == FUTEX_WAIT_REQUEUE_PI)) {
cmd == FUTEX_WAIT_REQUEUE_PI || cmd == FUTEX_SWAP)) {
if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
return -EFAULT;
if (get_timespec64(&ts, utime))
......@@ -3781,7 +4101,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return -EINVAL;
t = timespec64_to_ktime(ts);
if (cmd == FUTEX_WAIT)
if (cmd == FUTEX_WAIT || cmd == FUTEX_SWAP)
t = ktime_add_safe(ktime_get(), t);
else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
......@@ -3968,14 +4288,14 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET ||
cmd == FUTEX_WAIT_REQUEUE_PI)) {
cmd == FUTEX_WAIT_REQUEUE_PI || cmd == FUTEX_SWAP)) {
if (get_old_timespec32(&ts, utime))
return -EFAULT;
if (!timespec64_valid(&ts))
return -EINVAL;
t = timespec64_to_ktime(ts);
if (cmd == FUTEX_WAIT)
if (cmd == FUTEX_WAIT || cmd == FUTEX_SWAP)
t = ktime_add_safe(ktime_get(), t);
else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
......
......@@ -2469,7 +2469,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
{
#ifdef CONFIG_DTS
if (p->by_pass != INIT_BY_PASS)
#endif
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
trace_sched_wakeup(p);
......@@ -2996,6 +3000,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
out:
if (success)
ttwu_stat(p, task_cpu(p), wake_flags);
#ifdef CONFIG_DTS
if (p->by_pass == INIT_BY_PASS) {
p->by_pass = IN_BY_PASS;
p->se.by_pass = IN_BY_PASS;
p->dts_shared_se.by_pass = IN_BY_PASS;
preempt_enable_no_resched();
}
else
#endif
preempt_enable();
return success;
......@@ -3086,6 +3099,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_DTS
p->dts_shared_se.on_rq = 0;
p->dts_shared_se.exec_start = 0;
p->dts_shared_se.sum_exec_runtime = 0;
p->dts_shared_se.prev_sum_exec_runtime = 0;
p->dts_shared_se.nr_migrations = 0;
p->dts_shared_se.vruntime = 0;
INIT_LIST_HEAD(&p->dts_shared_se.group_node);
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
#endif
......@@ -3315,6 +3338,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
init_entity_runnable_average(&p->se);
#ifdef CONFIG_DTS
p->by_pass = NONE_BY_PASS;
p->se.by_pass = NONE_BY_PASS;
p->dts_shared_se.by_pass = NONE_BY_PASS;
#endif
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
......@@ -3702,6 +3730,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
membarrier_mm_sync_core_before_usermode(mm);
mmdrop(mm);
}
#ifdef CONFIG_DTS
prev->by_pass = NONE_BY_PASS;
prev->se.by_pass = NONE_BY_PASS;
prev->dts_shared_se.by_pass = NONE_BY_PASS;
#endif
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
......@@ -3744,7 +3777,7 @@ static void __balance_callback(struct rq *rq)
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
static inline void balance_callback(struct rq *rq)
inline void balance_callback(struct rq *rq)
{
if (unlikely(rq->balance_callback))
__balance_callback(rq);
......@@ -3752,7 +3785,7 @@ static inline void balance_callback(struct rq *rq)
#else
static inline void balance_callback(struct rq *rq)
inline void balance_callback(struct rq *rq)
{
}
......@@ -3789,7 +3822,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
/*
* context_switch - switch to the new MM and the new thread's register state.
*/
static __always_inline struct rq *
__always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
......@@ -3846,7 +3879,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
barrier();
return finish_task_switch(prev);
}
}
EXPORT_SYMBOL(context_switch);
/*
* nr_running and nr_context_switches:
......@@ -4615,7 +4648,7 @@ void __noreturn do_task_dead(void)
cpu_relax();
}
static inline void sched_submit_work(struct task_struct *tsk)
inline void sched_submit_work(struct task_struct *tsk)
{
unsigned int task_flags;
......@@ -4651,7 +4684,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_schedule_flush_plug(tsk);
}
static void sched_update_worker(struct task_struct *tsk)
void sched_update_worker(struct task_struct *tsk)
{
if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
if (tsk->flags & PF_WQ_WORKER)
......@@ -6931,6 +6964,11 @@ void sched_setnuma(struct task_struct *p, int nid)
}
#endif /* CONFIG_NUMA_BALANCING */
int wake_up_process_prefer_current_cpu(struct task_struct *next)
{
return try_to_wake_up(next, TASK_NORMAL, WF_CURRENT_CPU);
}
#ifdef CONFIG_HOTPLUG_CPU
/*
* Ensure that the idle task is using init_mm right before its CPU goes
......
......@@ -572,6 +572,28 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
}
static void __traverse_cfs_rq(struct cfs_rq *cfs_rq, struct rb_node **node)
{
struct sched_entity *entry;
if (!*node) {
printk("TREE END\n");
return;
}
entry = rb_entry(*node, struct sched_entity, run_node);
__traverse_cfs_rq(cfs_rq, &(*node)->rb_left);
printk("%p\n", entry);
__traverse_cfs_rq(cfs_rq, &(*node)->rb_right);
}
void traverse_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
__traverse_cfs_rq(cfs_rq, link);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
......@@ -2982,7 +3004,7 @@ adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
}
#endif
static void
void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
......@@ -4340,7 +4362,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_stats_enqueue(cfs_rq, se, flags);
check_spread(cfs_rq, se);
if (!curr)
#ifdef CONFIG_DTS
if (se->by_pass != INIT_BY_PASS)
#endif
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
/*
......@@ -4463,6 +4489,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
unsigned long ideal_runtime, delta_exec;
struct sched_entity *se;
s64 delta;
#ifdef CONFIG_DTS
struct task_struct *curr_task = NULL;
if (entity_is_task(curr) && curr->by_pass != NONE_BY_PASS)
curr_task = task_of_dts_shared_se(curr);
#endif
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
......@@ -4488,7 +4520,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
* re-elected due to buddy favours.
*/
clear_buddies(cfs_rq, curr);
return;
goto end;
}
/*
......@@ -4497,19 +4529,72 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
* This also mitigates buddy induced latencies under load.
*/
if (delta_exec < sysctl_sched_min_granularity)
return;
goto end;
se = __pick_first_entity(cfs_rq);
delta = curr->vruntime - se->vruntime;
if (delta < 0)
return;
goto end;
if (delta > ideal_runtime)
if (delta > ideal_runtime) {
resched_curr(rq_of(cfs_rq));
goto end;
} else {
return;
}
end:
#ifdef CONFIG_DTS
if (curr_task) {
curr_task->by_pass = END_BY_PASS;
curr_task->se.by_pass = END_BY_PASS;
curr_task->dts_shared_se.by_pass = END_BY_PASS;
}
#endif
}
static void
#ifdef CONFIG_DTS
/*
 * We dequeue the task's original se but do NOT change any scheduling information of that se.
 * Correspondingly, the task's original se is enqueued again, unchanged, when the shared se
 * expires. // TODO
 * Acquiring the shared se's stats, etc., still needs to be fixed for tasks executing in
 * DTS mode. // TODO
 */
void
replace_shared_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *shared_se)
{
if (shared_se->on_rq) {
/*
* Any task has to be enqueued before it get to execute on
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
// TODO
update_stats_wait_end(cfs_rq, shared_se);
__dequeue_entity(cfs_rq, se); /* the se of next task should be dequeued */
update_load_avg(cfs_rq, shared_se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, shared_se);
cfs_rq->curr = shared_se; /* subsequent update_curr() operates on cfs_rq->curr */
/*
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() &&
rq_of(cfs_rq)->cfs.load.weight >= 2*shared_se->load.weight) {
schedstat_set(shared_se->statistics.slice_max,
max((u64)schedstat_val(shared_se->statistics.slice_max),
shared_se->sum_exec_runtime - shared_se->prev_sum_exec_runtime));
}
shared_se->prev_sum_exec_runtime = shared_se->sum_exec_runtime;
}
#endif
void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/* 'current' is not kept within the tree. */
......@@ -4605,8 +4690,15 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
/* For the DTS mechanism, prev's entity may be either its own se or the shared se. */
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
#ifdef CONFIG_DTS
struct task_struct *task = NULL;
if (entity_is_task(prev))
task = task_of(prev);
#endif
/*
* If still on the runqueue then deactivate_task()
* was not called and update_curr() has to be done:
......@@ -4627,6 +4719,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
update_load_avg(cfs_rq, prev, 0);
}
cfs_rq->curr = NULL;
#ifdef CONFIG_DTS
if (task && task->by_pass == END_BY_PASS) {
task->by_pass = NONE_BY_PASS;
task->se.by_pass = NONE_BY_PASS;
task->dts_shared_se.by_pass = NONE_BY_PASS;
}
#endif
}
static void
......@@ -5630,6 +5729,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int task_new = !(flags & ENQUEUE_WAKEUP);
unsigned int prev_nr = rq->cfs.h_nr_running;
#ifdef CONFIG_DTS
if (p->by_pass != NONE_BY_PASS) {
se = &p->dts_shared_se;
}
#endif
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
......@@ -5737,11 +5842,17 @@ static void set_next_buddy(struct sched_entity *se);
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
struct sched_entity *se;
int task_sleep = flags & DEQUEUE_SLEEP;
int idle_h_nr_running = task_has_idle_policy(p);
unsigned int prev_nr = rq->cfs.h_nr_running;
bool was_sched_idle = sched_idle_rq(rq);
#ifdef CONFIG_DTS
if (p->by_pass != NONE_BY_PASS)
se = &p->dts_shared_se;
else
#endif
se = &p->se;
util_est_dequeue(&rq->cfs, p);
......@@ -6899,6 +7010,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int ret;
#endif
if ((wake_flags & WF_CURRENT_CPU) && cpumask_test_cpu(cpu, p->cpus_ptr))
return cpu;
time = schedstat_start_time();
if (sd_flag & SD_BALANCE_WAKE) {
......@@ -7155,11 +7270,28 @@ static void set_skip_buddy(struct sched_entity *se)
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct sched_entity *se, *pse;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0;
#ifdef CONFIG_DTS
int curr_by_pass = curr->by_pass;
int p_by_pass = p->by_pass;
if (curr_by_pass != NONE_BY_PASS)
se = &curr->dts_shared_se;
else
#endif
se = &curr->se;
#ifdef CONFIG_DTS
if (p_by_pass != NONE_BY_PASS)
pse = &p->dts_shared_se;
else
#endif
pse = &p->se;
if (unlikely(se == pse))
return;
......@@ -7714,13 +7846,25 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
p = task_of(se);
if (se == NULL) {
printk("CFS_RQ Nr_running: %d\n", rq->cfs.nr_running);
printk("RQ Nr_running: %d\n", rq->nr_running);
}
/*
* Since we haven't yet done put_prev_entity and if the selected task
* is a different task than we started out with, try and touch the
* least amount of cfs_rqs.
*/
if (prev != p) {
struct sched_entity *pse = &prev->se;
struct sched_entity *pse;
#ifdef CONFIG_DTS
if (prev->by_pass != NONE_BY_PASS)
pse = &prev->dts_shared_se;
else
#endif
pse = &prev->se;
while (!(cfs_rq = is_same_group(se, pse))) {
int se_depth = se->depth;
......@@ -7873,8 +8017,15 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq)
*/
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
struct sched_entity *se = &prev->se;
struct sched_entity *se;
struct cfs_rq *cfs_rq;
#ifdef CONFIG_DTS
if (prev->by_pass != NONE_BY_PASS)
se = &prev->dts_shared_se;
else
#endif
se = &prev->se;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
......@@ -7891,7 +8042,13 @@ static void yield_task_fair(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se;
struct sched_entity *se;
#ifdef CONFIG_DTS
if (curr->by_pass != NONE_BY_PASS)
se = &curr->dts_shared_se;
else
#endif
se = &curr->se;
/*
* Are we the only task in the tree?
......@@ -7922,6 +8079,13 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
#ifdef CONFIG_DTS
/* DTS tasks DO NOT support being run via the yield_to method. */
if (p->by_pass != NONE_BY_PASS) {
return false;
}
#endif
/* throttled hierarchies are not runnable */
if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
return false;
......@@ -8359,7 +8523,7 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq)
/*
* detach_task() -- detach the task for the migration from @src_rq to @dst_cpu.
*/
static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu)
void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu)
{
lockdep_assert_held(&src_rq->lock);
......@@ -8569,6 +8733,10 @@ static void attach_task(struct rq *rq, struct task_struct *p)
BUG_ON(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
#ifdef CONFIG_DTS
if (p->by_pass != INIT_BY_PASS)
#endif
check_preempt_curr(rq, p, 0);
}
......@@ -11540,6 +11708,53 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
return stolen;
}
int steal_task(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
struct task_struct *tsk)
{
struct rq_flags rf;
int stolen = 0;
int dst_cpu = dst_rq->cpu;
struct rq *src_rq = task_rq(tsk);
int src_cpu = task_cpu(tsk);
if (!steal_enabled())
return 0;
if (!cpu_active(dst_cpu))
return 0;
if (dst_cpu == src_cpu)
return 0;
if (*locked) {
rq_unpin_lock(dst_rq, dst_rf);
raw_spin_unlock(&dst_rq->lock);
*locked = false;
}
rq_lock_irqsave(src_rq, &rf);
update_rq_clock(src_rq);
if (!cpu_active(src_cpu))
tsk = NULL;
else
detach_task(tsk, src_rq, dst_cpu);
rq_unlock(src_rq, &rf);
if (tsk) {
raw_spin_lock(&dst_rq->lock);
rq_repin_lock(dst_rq, dst_rf);
*locked = true;
update_rq_clock(dst_rq);
attach_task(dst_rq, tsk);
stolen = 1;
schedstat_inc(dst_rq->steal);
}
local_irq_restore(rf.flags);
return stolen;
}
/*
* Conservative upper bound on the max cost of a steal, in nsecs (the typical
* cost is 1-2 microsec). Do not steal if average idle time is less.
......@@ -11649,6 +11864,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
#ifdef CONFIG_DTS
if (curr->by_pass != NONE_BY_PASS) {
se = &curr->dts_shared_se;
}
#endif
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
......@@ -12144,6 +12365,81 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
return rr_interval;
}
void update_before_bypass(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
struct sched_entity *curr;
struct cfs_rq *cfs_rq;
#ifdef CONFIG_DTS
if (current->by_pass != NONE_BY_PASS)
curr = &current->dts_shared_se;
else
#endif
curr = &current->se;
cfs_rq = cfs_rq_of(curr);
rq_lock(rq, &rf);
update_rq_clock(rq);
/*
* Ensure that runnable average is periodically updated.
*/
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
/*
* Ensure that runnable average is periodically updated.
*/
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
rq_unlock(rq, &rf);
}
/*
 * Return 1 if the task still has time left in its current slice, 0 otherwise.
 */
int check_task_left_time(struct task_struct *task)
{
unsigned long ideal_runtime, delta_exec;
struct sched_entity *se;
struct cfs_rq *cfs_rq;
#ifdef CONFIG_DTS
if (task->by_pass != NONE_BY_PASS)
se = &task->dts_shared_se;
else
#endif
se = &task->se;
cfs_rq = cfs_rq_of(se);
ideal_runtime = sched_slice(cfs_rq, se);
delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
if (cfs_rq->nr_running > 1) {
resched_curr(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
*/
clear_buddies(cfs_rq, se);
}
return 0;
}
return 1;
}
/*
* All the scheduling class methods:
*/
......
......@@ -534,6 +534,12 @@ extern void sched_offline_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_DTS
extern void replace_shared_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev_se, struct sched_entity *shared_se);
#endif
extern void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
......@@ -1185,9 +1191,22 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define raw_rq() raw_cpu_ptr(&runqueues)
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_DTS
static inline struct task_struct *task_of_dts_shared_se(struct sched_entity *dts_shared_se)
{
SCHED_WARN_ON(!entity_is_task(dts_shared_se));
return container_of(dts_shared_se, struct task_struct, dts_shared_se);
}
#endif
static inline struct task_struct *task_of(struct sched_entity *se)
{
SCHED_WARN_ON(!entity_is_task(se));
#ifdef CONFIG_DTS
if (se->by_pass != NONE_BY_PASS)
return task_of_dts_shared_se(se);
else
#endif
return container_of(se, struct task_struct, se);
}
......@@ -1210,8 +1229,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
#else
#ifdef CONFIG_DTS
static inline struct task_struct *task_of_dts_shared_se(struct sched_entity *dts_shared_se)
{
return container_of(dts_shared_se, struct task_struct, dts_shared_se);
}
static inline struct cfs_rq *cfs_rq_of_dts_shared_se(struct sched_entity *se)
{
struct task_struct *p = task_of_dts_shared_se(se);
struct rq *rq = task_rq(p);
return &rq->cfs;
}
#endif
static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_DTS
if (se->by_pass != NONE_BY_PASS)
return task_of_dts_shared_se(se);
else
#endif
return container_of(se, struct task_struct, se);
}
......@@ -1220,7 +1259,7 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
return &task_rq(p)->cfs;
}
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
static inline struct cfs_rq *cfs_rq_of_se(struct sched_entity *se)
{
struct task_struct *p = task_of(se);
struct rq *rq = task_rq(p);
......@@ -1228,6 +1267,17 @@ static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
return &rq->cfs;
}
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
#ifdef CONFIG_DTS
if (se->by_pass != NONE_BY_PASS)
return cfs_rq_of_dts_shared_se(se);
else
#endif
return cfs_rq_of_se(se);
}
/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
......@@ -1862,6 +1912,7 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_FORK 0x02 /* Child wakeup after fork */
#define WF_MIGRATED 0x04 /* Internal use, task got migrated */
#define WF_ON_CPU 0x08 /* Wakee is on_cpu */
#define WF_CURRENT_CPU 0x10 /* Prefer to move wakee to the current CPU */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
......@@ -2403,6 +2454,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
extern void traverse_cfs_rq(struct cfs_rq *cfs_rq);
#ifdef CONFIG_SCHED_DEBUG
extern bool sched_debug_enabled;
......@@ -2788,3 +2840,18 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
void swake_up_all_locked(struct swait_queue_head *q);
void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
#ifdef CONFIG_DTS
extern void sched_submit_work(struct task_struct *tsk);
extern void sched_update_worker(struct task_struct *tsk);
extern struct rq *context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf);
extern void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se);
#ifdef CONFIG_SCHED_STEAL
extern int steal_task(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
struct task_struct *tsk);
extern void update_before_bypass(void);
extern void balance_callback(struct rq *rq);
#endif
#endif
\ No newline at end of file
......@@ -2,6 +2,7 @@
futex_requeue_pi
futex_requeue_pi_mismatched_ops
futex_requeue_pi_signal_restart
futex_swap
futex_wait_private_mapped_file
futex_wait_timeout
futex_wait_uninitialized_heap
......
......@@ -13,6 +13,7 @@ TEST_GEN_FILES := \
futex_requeue_pi \
futex_requeue_pi_signal_restart \
futex_requeue_pi_mismatched_ops \
futex_swap \
futex_wait_uninitialized_heap \
futex_wait_private_mapped_file
......
// SPDX-License-Identifier: GPL-2.0-or-later
#include <errno.h>
#include <getopt.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "atomic.h"
#include "futextest.h"
/* The futex the main thread waits on. */
futex_t futex_main = FUTEX_INITIALIZER;
/* The futex the other thread waits on. */
futex_t futex_other = FUTEX_INITIALIZER;
/* The number of iterations to run (> 1 => run benchmarks). */
static int cfg_iterations = 5;
/* If != 0, print diagnostic messages. */
static int cfg_verbose;
/* If == 0, do not use validation_counter. Useful for benchmarking. */
static int cfg_validate = 1;
/* How to swap threads. */
#define SWAP_WAKE_WAIT 1
#define SWAP_SWAP 2
#define SWAP_SWAP_DTS 4
/* Futex values. */
#define FUTEX_WAITING 0
#define FUTEX_WAKEUP 1
#define FUTEX_FLAGS_DTS_MODE 512
/* An atomic counter used to validate proper swapping. */
static atomic_t validation_counter;
void futex_swap_op(int mode, futex_t *futex_this, futex_t *futex_that)
{
int ret;
int flags = 0;
switch (mode) {
case SWAP_WAKE_WAIT:
futex_set(futex_this, FUTEX_WAITING);
futex_set(futex_that, FUTEX_WAKEUP);
futex_wake(futex_that, 1, FUTEX_PRIVATE_FLAG);
futex_wait(futex_this, FUTEX_WAITING, NULL, FUTEX_PRIVATE_FLAG);
if (*futex_this != FUTEX_WAKEUP) {
fprintf(stderr, "unexpected futex_this value on wakeup\n");
exit(1);
}
break;
case SWAP_SWAP_DTS:
flags |= FUTEX_FLAGS_DTS_MODE;
/* fall through: DTS mode also performs the regular swap below */
case SWAP_SWAP:
flags |= FUTEX_PRIVATE_FLAG;
futex_set(futex_this, FUTEX_WAITING);
futex_set(futex_that, FUTEX_WAKEUP);
ret = futex_swap(futex_this, FUTEX_WAITING, NULL,
futex_that, flags);
if (ret < 0 && errno == ENOSYS) {
/* futex_swap not implemented */
perror("futex_swap");
exit(1);
}
if (*futex_this != FUTEX_WAKEUP) {
fprintf(stderr, "unexpected futex_this value on wakeup\n");
exit(1);
}
break;
default:
fprintf(stderr, "unknown mode in %s\n", __func__);
exit(1);
}
}
void *other_thread(void *arg)
{
int mode = *((int *)arg);
int counter;
if (cfg_verbose)
printf("%s started\n", __func__);
futex_wait(&futex_other, 0, NULL, FUTEX_PRIVATE_FLAG);
for (counter = 0; counter < cfg_iterations; ++counter) {
if (cfg_validate) {
int prev = 2 * counter + 1;
if (prev != atomic_cmpxchg(&validation_counter, prev,
prev + 1)) {
fprintf(stderr, "swap validation failed\n");
exit(1);
}
}
futex_swap_op(mode, &futex_other, &futex_main);
}
if (cfg_verbose)
printf("%s finished: %d iteration(s)\n", __func__, counter);
return NULL;
}
void run_test(int mode)
{
struct timespec start, stop;
int ret, counter;
pthread_t thread;
uint64_t duration;
futex_set(&futex_other, FUTEX_WAITING);
atomic_set(&validation_counter, 0);
ret = pthread_create(&thread, NULL, &other_thread, &mode);
if (ret) {
perror("pthread_create");
exit(1);
}
ret = clock_gettime(CLOCK_MONOTONIC, &start);
if (ret) {
perror("clock_gettime");
exit(1);
}
for (counter = 0; counter < cfg_iterations; ++counter) {
if (cfg_validate) {
int prev = 2 * counter;
if (prev != atomic_cmpxchg(&validation_counter, prev,
prev + 1)) {
fprintf(stderr, "swap validation failed\n");
exit(1);
}
}
futex_swap_op(mode, &futex_main, &futex_other);
}
if (cfg_validate && validation_counter.val != 2 * cfg_iterations) {
fprintf(stderr, "final swap validation failed\n");
exit(1);
}
ret = clock_gettime(CLOCK_MONOTONIC, &stop);
if (ret) {
perror("clock_gettime");
exit(1);
}
duration = (stop.tv_sec - start.tv_sec) * 1000000000LL +
stop.tv_nsec - start.tv_nsec;
if (cfg_verbose || cfg_iterations > 1) {
printf("completed %d swap and back iterations in %lu ns: %lu ns per swap\n",
cfg_iterations, duration,
duration / (cfg_iterations * 2));
}
/* The remote thread is blocked; send it the final wake. */
futex_set(&futex_other, FUTEX_WAKEUP);
futex_wake(&futex_other, 1, FUTEX_PRIVATE_FLAG);
if (pthread_join(thread, NULL)) {
perror("pthread_join");
exit(1);
}
}
void usage(char *prog)
{
printf("Usage: %s\n", prog);
printf(" -h Display this help message\n");
printf(" -i N Use N iterations to benchmark\n");
printf(" -n Do not validate swapping correctness\n");
printf(" -v Print diagnostic messages\n");
printf(" -d Benchmark with the direct-thread-switch(DTS) mechanism\n");
}
int main(int argc, char *argv[])
{
int c;
while ((c = getopt(argc, argv, "hi:nvd")) != -1) {
switch (c) {
case 'h':
usage(basename(argv[0]));
exit(0);
case 'i':
cfg_iterations = atoi(optarg);
break;
case 'n':
cfg_validate = 0;
break;
case 'v':
cfg_verbose = 1;
break;
case 'd':
goto dts_test;
break;
default:
usage(basename(argv[0]));
exit(1);
}
}
printf("\n\n------- running SWAP_WAKE_WAIT -----------\n\n");
run_test(SWAP_WAKE_WAIT);
printf("PASS\n");
printf("\n\n------- running SWAP_SWAP -----------\n\n");
run_test(SWAP_SWAP);
printf("PASS\n");
dts_test:
printf("\n\n---- running SWAP_SWAP with the direct-thread-switch(DTS) mechanism ----\n\n");
run_test(SWAP_SWAP_DTS);
printf("PASS\n");
return 0;
}
......@@ -38,6 +38,9 @@ typedef volatile u_int32_t futex_t;
#ifndef FUTEX_CMP_REQUEUE_PI
#define FUTEX_CMP_REQUEUE_PI 12
#endif
#ifndef FUTEX_SWAP
#define FUTEX_SWAP 13
#endif
#ifndef FUTEX_WAIT_REQUEUE_PI_PRIVATE
#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \
FUTEX_PRIVATE_FLAG)
......@@ -46,6 +49,9 @@ typedef volatile u_int32_t futex_t;
#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
FUTEX_PRIVATE_FLAG)
#endif
#ifndef FUTEX_SWAP_PRIVATE
#define FUTEX_SWAP_PRIVATE (FUTEX_SWAP | FUTEX_PRIVATE_FLAG)
#endif
/**
* futex() - SYS_futex syscall wrapper
......@@ -204,6 +210,19 @@ futex_cmp_requeue_pi(futex_t *uaddr, futex_t val, futex_t *uaddr2, int nr_wake,
val, opflags);
}
/**
* futex_swap() - block on uaddr and wake one task blocked on uaddr2.
* @uaddr: futex to block the current task on
* @val: expected value of *uaddr; block only if it still matches
* @timeout: relative timeout for the current task block
* @uaddr2: futex to wake one task at (can be the same as uaddr)
*/
static inline int
futex_swap(futex_t *uaddr, futex_t val, struct timespec *timeout,
futex_t *uaddr2, int opflags)
{
return futex(uaddr, FUTEX_SWAP, val, timeout, uaddr2, 0, opflags);
}
/**
* futex_cmpxchg() - atomic compare and exchange
* @uaddr: The address of the futex to be modified
......