Commit dad99a57 authored by briansun

futex: introduce the direct-thread-switch mechanism

openEuler inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4L9RU
CVE: NA

Reference: https://lore.kernel.org/lkml/20200722234538.166697-2-posk@posk.io/

-------------------

In some scenarios we need to run several threads together that must hand
the CPU off to each other with minimal overhead and that cooperate like
P/V (semaphore) operations: such a thread repeatedly falls asleep and
wakes another thread up. Every such switch drags the kernel through the
whole scheduling path (select a proper core, wake the task up, enqueue
it, set the rescheduling flag, pick the task at the proper time, dequeue
the previous task and do the context switch). This overhead is not
acceptable for these threads, so we need a mechanism that drops the
unnecessary work and swaps the two threads directly, without affecting
the fairness of CFS tasks.

To achieve this goal, we implement the direct-thread-switch (DTS)
mechanism on top of the futex_swap patch*: the caller switches to the
DTS task directly and lets it run on a shared schedule entity. Basic
checks keep the kernel secure and the scheduler state consistent.
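
As an illustration only (not part of this patch), a userspace caller would
request a DTS switch by OR-ing the new flag into the futex op. The sketch
below assumes FUTEX_SWAP is 13, as in the referenced futex_swap patch, and
uses the FUTEX_FLAGS_DTS_MODE value (512) introduced below; the wrapper
name futex_swap_dts is hypothetical.

#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef FUTEX_SWAP
#define FUTEX_SWAP 13			/* from the futex_swap patch (assumed) */
#endif
#define FUTEX_FLAGS_DTS_MODE 512	/* added by this series */

/*
 * Sleep on @this_futex iff *@this_futex == @expected and directly hand the
 * CPU (and the remaining CFS slice) to the waiter of @that_futex.
 */
static long futex_swap_dts(uint32_t *this_futex, uint32_t expected,
			   uint32_t *that_futex)
{
	return syscall(SYS_futex, this_futex,
		       FUTEX_SWAP | FUTEX_PRIVATE_FLAG | FUTEX_FLAGS_DTS_MODE,
		       expected, NULL, that_futex, 0);
}
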
Signed-off-by: Zhi Song <hizhisong@gmail.com>
Parent 8871eed8
...@@ -68,6 +68,11 @@ struct task_delay_info; ...@@ -68,6 +68,11 @@ struct task_delay_info;
struct task_group; struct task_group;
struct io_uring_task; struct io_uring_task;
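/*
 * DTS by_pass states: NONE_BY_PASS means the task is not involved in a
 * direct thread switch; INIT_BY_PASS means a switch to the task has been
 * initiated; IN_BY_PASS means the task is running on the donor's shared
 * sched entity; END_BY_PASS means the donated slice expired and the task
 * falls back to normal scheduling.
 */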
#define NONE_BY_PASS 0x0000
#define INIT_BY_PASS 0x0001
#define IN_BY_PASS 0x0002
#define END_BY_PASS 0x0004
/* /*
* Task state bitmask. NOTE! These bits are also * Task state bitmask. NOTE! These bits are also
* encoded in fs/proc/array.c: get_task_state(). * encoded in fs/proc/array.c: get_task_state().
...@@ -500,6 +505,10 @@ struct sched_entity { ...@@ -500,6 +505,10 @@ struct sched_entity {
unsigned long runnable_weight; unsigned long runnable_weight;
#endif #endif
#ifdef CONFIG_DTS
int by_pass;
#endif
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
/* /*
* Per entity load average tracking. * Per entity load average tracking.
...@@ -726,6 +735,15 @@ struct task_struct { ...@@ -726,6 +735,15 @@ struct task_struct {
int normal_prio; int normal_prio;
unsigned int rt_priority; unsigned int rt_priority;
#ifdef CONFIG_DTS
/*
* by_pass indicates that the task was launched via direct-thread-switch.
* dts_shared_se is the schedule entity shared with the DTS task.
*/
int by_pass;
struct sched_entity dts_shared_se;
#endif
const struct sched_class *sched_class; const struct sched_class *sched_class;
struct sched_entity se; struct sched_entity se;
struct sched_rt_entity rt; struct sched_rt_entity rt;
...@@ -2194,6 +2212,10 @@ static inline int sched_qos_cpu_overload(void) ...@@ -2194,6 +2212,10 @@ static inline int sched_qos_cpu_overload(void)
} }
#endif #endif
#ifdef CONFIG_DTS
extern int check_task_left_time(struct task_struct *task);
#endif
#ifdef CONFIG_BPF_SCHED #ifdef CONFIG_BPF_SCHED
extern void sched_settag(struct task_struct *tsk, s64 tag); extern void sched_settag(struct task_struct *tsk, s64 tag);
......
...@@ -25,7 +25,14 @@ ...@@ -25,7 +25,14 @@
#define FUTEX_PRIVATE_FLAG 128 #define FUTEX_PRIVATE_FLAG 128
#define FUTEX_CLOCK_REALTIME 256 #define FUTEX_CLOCK_REALTIME 256
#ifdef CONFIG_DTS
#define FUTEX_FLAGS_DTS_MODE 512
#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | \
FUTEX_FLAGS_DTS_MODE)
#else
#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME) #define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
#endif
#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) #define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) #define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
...@@ -43,6 +50,7 @@ ...@@ -43,6 +50,7 @@
FUTEX_PRIVATE_FLAG) FUTEX_PRIVATE_FLAG)
#define FUTEX_SWAP_PRIVATE (FUTEX_SWAP | FUTEX_PRIVATE_FLAG) #define FUTEX_SWAP_PRIVATE (FUTEX_SWAP | FUTEX_PRIVATE_FLAG)
/* /*
* Support for robust futexes: the kernel cleans up held futexes at * Support for robust futexes: the kernel cleans up held futexes at
* thread exit time. * thread exit time.
......
...@@ -1261,6 +1261,13 @@ config SCHED_STEAL ...@@ -1261,6 +1261,13 @@ config SCHED_STEAL
If unsure, say N here. If unsure, say N here.
config DTS
bool "Direct Thread Switch"
default y
depends on SCHED_STEAL
help
Enable the direct-thread-switch (DTS) mechanism for the futex_swap operation.
config CHECKPOINT_RESTORE config CHECKPOINT_RESTORE
bool "Checkpoint/restore support" bool "Checkpoint/restore support"
select PROC_CHILDREN select PROC_CHILDREN
......
...@@ -39,11 +39,16 @@ ...@@ -39,11 +39,16 @@
#include <linux/memblock.h> #include <linux/memblock.h>
#include <linux/fault-inject.h> #include <linux/fault-inject.h>
#include <linux/time_namespace.h> #include <linux/time_namespace.h>
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <asm/futex.h> #include <asm/futex.h>
#include "locking/rtmutex_common.h" #include "locking/rtmutex_common.h"
#ifdef CONFIG_DTS
#include "sched/sched.h"
#endif
/* /*
* READ this before attempting to hack on futexes! * READ this before attempting to hack on futexes!
* *
...@@ -161,7 +166,7 @@ static int __read_mostly futex_cmpxchg_enabled; ...@@ -161,7 +166,7 @@ static int __read_mostly futex_cmpxchg_enabled;
* NOMMU does not have per process address space. Let the compiler optimize * NOMMU does not have per process address space. Let the compiler optimize
* code away. * code away.
*/ */
# define FLAGS_SHARED 0x00 #define FLAGS_SHARED 0x00
#endif #endif
#define FLAGS_CLOCKRT 0x02 #define FLAGS_CLOCKRT 0x02
#define FLAGS_HAS_TIMEOUT 0x04 #define FLAGS_HAS_TIMEOUT 0x04
...@@ -2585,6 +2590,219 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) ...@@ -2585,6 +2590,219 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
return 0; return 0;
} }
#ifdef CONFIG_DTS
static int __direct_thread_switch(struct task_struct *next)
{
int cpu = smp_processor_id();
int success = 1;
struct rq_flags rf;
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
struct task_struct *prev = rq->curr;
struct sched_entity *prev_se, *next_se;
unsigned long *switch_count = &prev->nvcsw;
unsigned long prev_state;
int next_state;
struct rq *src_rq_next;
bool locked;
preempt_disable();
local_irq_disable();
if (!prev->by_pass) {
prev_se = &prev->se;
} else {
prev_se = &prev->dts_shared_se;
}
next_se = &next->se;
prev->by_pass = NONE_BY_PASS;
next->by_pass = INIT_BY_PASS;
next->dts_shared_se = *prev_se;
prev_se->by_pass = NONE_BY_PASS;
next->dts_shared_se.by_pass = INIT_BY_PASS;
/* task_struct::state is declared volatile; read it only once */
next_state = next->state;
src_rq_next = task_rq(next);
locked = true;
/* Deliver the execution to the callee. */
if (next_state == TASK_RUNNING) {
/* The next task is currently running on a CPU; we cannot switch to it. */
if (task_running(src_rq_next, next)) {
success = 0;
goto end;
}
/* The next task is runnable and may sit on this core's rq or on another core's rq. */
/*
* Dequeue the next task's se (rather than dts_shared_se) to keep fairness and
* consistency. The se is enqueued again when the donated time slice expires.
*/
if (task_rq(next) != rq) {
#ifdef CONFIG_SCHED_STEAL
/* migrate */
if (!steal_task(rq, &rf, &locked, next)) {
success = 0;
goto end;
}
#else
success = 0;
goto end;
#endif
}
replace_shared_entity(cfs_rq, next_se, &next->dts_shared_se);
} else if (next_state == TASK_INTERRUPTIBLE) {
/*
* So far only a next task that went to sleep via futex_swap or futex_wait
* can be woken up here; signals and other interruptible wakeup sources
* still need to be handled.
* Note: the next task is picked from the wait list of the corresponding futex.
*/
/* Enqueue the shared_se and change the state without entering schedule() path. */
if (!wake_up_process_prefer_current_cpu(next)) {
success = 0;
goto end;
}
/* wakeup succeeded: p->state is now TASK_RUNNING */
/* dequeue the shared_se and set cfs_rq->curr = &next->dts_shared_se */
set_next_entity(cfs_rq, &next->dts_shared_se);
} else {
success = 0;
goto end;
}
/* increase rq->cfs.nr_running */
cfs_rq->nr_running++;
sched_submit_work(prev);
rcu_note_context_switch(false);
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller(futex_wait_queue_me) to avoid the race with signal_wake_up():
*
* __set_current_state(@state) signal_wake_up()
* __direct_thread_switch() set_tsk_thread_flag(p, TIF_SIGPENDING)
* wake_up_state(p, state)
* LOCK rq->lock LOCK p->pi_lock
* smp_mb__after_spinlock() smp_mb__after_spinlock()
* if (signal_pending_state()) if (p->state & @state)
*
* Also, the membarrier system call requires a full memory barrier
* after coming from user-space, before storing to rq->curr.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
/*
* We may fail to switch, so do not deactivate the current task before
* processing the next one.
*/
/*
* We must load prev->state once (task_struct::state is volatile), such
* that:
*
* - we form a control dependency vs deactivate_task() below.
* - ptrace_{,un}freeze_traced() can change ->state underneath us.
*/
prev_state = prev->state;
if (prev_state) {
if (signal_pending_state(prev_state, prev)) {
prev->state = TASK_RUNNING;
} else {
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
!(prev->flags & PF_FROZEN);
if (prev->sched_contributes_to_load)
rq->nr_uninterruptible++;
/*
* __schedule() ttwu()
* prev_state = prev->state; if (p->on_rq && ...)
* if (prev_state) goto out;
* p->on_rq = 0; smp_acquire__after_ctrl_dep();
* p->state = TASK_WAKING
*
* Where __schedule() and ttwu() have matching control dependencies.
*
* After this, schedule() must not care about p->state any more.
*/
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
if (prev->in_iowait) {
atomic_inc(&rq->nr_iowait);
delayacct_blkio_start();
}
}
}
rq->nr_switches++;
/*
* RCU users of rcu_dereference(rq->curr) may not see
* changes to task_struct made by pick_next_task().
*/
RCU_INIT_POINTER(rq->curr, next);
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
* rq->curr, before returning to user-space.
*
* Here are the schemes providing that barrier on the
* various architectures:
* - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
* switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
* - finish_lock_switch() for weakly-ordered
* architectures where spin_unlock is a full barrier,
* - switch_to() for arm64 (weakly-ordered, spin_unlock
* is a RELEASE barrier),
*/
++*switch_count;
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(false, prev, next);
/* drop the reference taken by the earlier get_task_struct() */
put_task_struct(next);
rq = context_switch(rq, prev, next, &rf);
balance_callback(rq);
sched_update_worker(next);
end:
sched_preempt_enable_no_resched();
return success;
}
/*
* Return: 0 on failure, 1 on success.
*/
static int direct_thread_switch(struct task_struct *next)
{
if (next->sched_class != &fair_sched_class ||
current == next) {
return 0;
}
if (!check_task_left_time(current)) {
return 0;
}
return __direct_thread_switch(next);
}
#endif /* CONFIG_DTS */
/** /**
* futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
* @hb: the futex hash bucket, must be locked by the caller * @hb: the futex hash bucket, must be locked by the caller
...@@ -2595,7 +2813,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) ...@@ -2595,7 +2813,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
*/ */
static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
struct hrtimer_sleeper *timeout, struct hrtimer_sleeper *timeout,
struct task_struct *next) struct task_struct *next, int flags)
{ {
/* /*
* The task state is guaranteed to be set before another task can * The task state is guaranteed to be set before another task can
...@@ -2615,6 +2833,9 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, ...@@ -2615,6 +2833,9 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* has tried to wake us, and we can skip the call to schedule(). * has tried to wake us, and we can skip the call to schedule().
*/ */
if (likely(!plist_node_empty(&q->list))) { if (likely(!plist_node_empty(&q->list))) {
#ifdef CONFIG_DTS
int do_dts_switch = 0;
#endif
/* /*
* If the timer has already expired, current will already be * If the timer has already expired, current will already be
* flagged for rescheduling. Only call schedule if there * flagged for rescheduling. Only call schedule if there
...@@ -2622,27 +2843,49 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, ...@@ -2622,27 +2843,49 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
*/ */
if (!timeout || timeout->task) { if (!timeout || timeout->task) {
if (next) { if (next) {
#ifdef CONFIG_DTS
/* /*
* wake_up_process() below will be replaced * If we fail to switch to the next task directly, try to switch to
* in the next patch with * the next task in the traditional way.
* wake_up_process_prefer_current_cpu(). *
*/ */
if (flags & FUTEX_FLAGS_DTS_MODE)
do_dts_switch = direct_thread_switch(next);
if (!do_dts_switch)
#endif
{
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
wake_up_process_prefer_current_cpu(next); wake_up_process_prefer_current_cpu(next);
#else #else
wake_up_process(next); wake_up_process(next);
#endif
}
#ifdef CONFIG_DTS
if (!do_dts_switch)
#endif #endif
put_task_struct(next); put_task_struct(next);
next = NULL; next = NULL;
} }
freezable_schedule(); #ifdef CONFIG_DTS
if (!do_dts_switch)
#endif
freezable_schedule();
} }
} }
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
if (next) { if (next) {
#ifdef CONFIG_DTS
/* Fall back to a normal wakeup if DTS is not requested or the switch fails. */
if (!(flags & FUTEX_FLAGS_DTS_MODE) || !direct_thread_switch(next)) {
wake_up_process(next);
put_task_struct(next);
}
#else
wake_up_process(next); wake_up_process(next);
put_task_struct(next); put_task_struct(next);
#endif
} }
} }
...@@ -2743,7 +2986,7 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ...@@ -2743,7 +2986,7 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
goto out; goto out;
/* queue_me and wait for wakeup, timeout, or a signal. */ /* queue_me and wait for wakeup, timeout, or a signal. */
futex_wait_queue_me(hb, &q, to, next); futex_wait_queue_me(hb, &q, to, next, flags);
next = NULL; next = NULL;
/* If we were woken (and unqueued), we succeeded, whatever. */ /* If we were woken (and unqueued), we succeeded, whatever. */
...@@ -2819,6 +3062,15 @@ static int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val, ...@@ -2819,6 +3062,15 @@ static int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val,
next->wake_q.next = NULL; next->wake_q.next = NULL;
} }
/* Basic security check: are the two tasks in the same thread group? */
/* Does the current task still have any time slice left to use? */
/*
* The old task will go to sleep and be queued on the futex wait queue;
* meanwhile the new one gets to run.
*/
return futex_wait(uaddr, flags, val, abs_time, bitset, next); return futex_wait(uaddr, flags, val, abs_time, bitset, next);
} }
...@@ -3282,7 +3534,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ...@@ -3282,7 +3534,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
} }
/* Queue the futex_q, drop the hb lock, wait for wakeup. */ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to, NULL); futex_wait_queue_me(hb, &q, to, NULL, flags);
spin_lock(&hb->lock); spin_lock(&hb->lock);
ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
...@@ -3768,6 +4020,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ...@@ -3768,6 +4020,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
int cmd = op & FUTEX_CMD_MASK; int cmd = op & FUTEX_CMD_MASK;
unsigned int flags = 0; unsigned int flags = 0;
#ifdef CONFIG_DTS
if (op & FUTEX_FLAGS_DTS_MODE) {
flags |= FUTEX_FLAGS_DTS_MODE;
}
#endif
if (!(op & FUTEX_PRIVATE_FLAG)) if (!(op & FUTEX_PRIVATE_FLAG))
flags |= FLAGS_SHARED; flags |= FLAGS_SHARED;
......
...@@ -2469,7 +2469,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ...@@ -2469,7 +2469,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf) struct rq_flags *rf)
{ {
check_preempt_curr(rq, p, wake_flags); #ifdef CONFIG_DTS
if (p->by_pass != INIT_BY_PASS)
#endif
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING; p->state = TASK_RUNNING;
trace_sched_wakeup(p); trace_sched_wakeup(p);
...@@ -2996,7 +3000,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ...@@ -2996,7 +3000,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
out: out:
if (success) if (success)
ttwu_stat(p, task_cpu(p), wake_flags); ttwu_stat(p, task_cpu(p), wake_flags);
preempt_enable(); #ifdef CONFIG_DTS
if (p->by_pass == INIT_BY_PASS) {
p->by_pass = IN_BY_PASS;
p->se.by_pass = IN_BY_PASS;
p->dts_shared_se.by_pass = IN_BY_PASS;
preempt_enable_no_resched();
}
else
#endif
preempt_enable();
return success; return success;
} }
...@@ -3086,6 +3099,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) ...@@ -3086,6 +3099,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.vruntime = 0; p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node); INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_DTS
p->dts_shared_se.on_rq = 0;
p->dts_shared_se.exec_start = 0;
p->dts_shared_se.sum_exec_runtime = 0;
p->dts_shared_se.prev_sum_exec_runtime = 0;
p->dts_shared_se.nr_migrations = 0;
p->dts_shared_se.vruntime = 0;
INIT_LIST_HEAD(&p->dts_shared_se.group_node);
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL; p->se.cfs_rq = NULL;
#endif #endif
...@@ -3315,6 +3338,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) ...@@ -3315,6 +3338,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
init_entity_runnable_average(&p->se); init_entity_runnable_average(&p->se);
#ifdef CONFIG_DTS
p->by_pass = NONE_BY_PASS;
p->se.by_pass = NONE_BY_PASS;
p->dts_shared_se.by_pass = NONE_BY_PASS;
#endif
#ifdef CONFIG_SCHED_INFO #ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on())) if (likely(sched_info_on()))
...@@ -3702,6 +3730,11 @@ static struct rq *finish_task_switch(struct task_struct *prev) ...@@ -3702,6 +3730,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
membarrier_mm_sync_core_before_usermode(mm); membarrier_mm_sync_core_before_usermode(mm);
mmdrop(mm); mmdrop(mm);
} }
#ifdef CONFIG_DTS
prev->by_pass = NONE_BY_PASS;
prev->se.by_pass = NONE_BY_PASS;
prev->dts_shared_se.by_pass = NONE_BY_PASS;
#endif
if (unlikely(prev_state == TASK_DEAD)) { if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead) if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev); prev->sched_class->task_dead(prev);
...@@ -3744,7 +3777,7 @@ static void __balance_callback(struct rq *rq) ...@@ -3744,7 +3777,7 @@ static void __balance_callback(struct rq *rq)
raw_spin_unlock_irqrestore(&rq->lock, flags); raw_spin_unlock_irqrestore(&rq->lock, flags);
} }
static inline void balance_callback(struct rq *rq) inline void balance_callback(struct rq *rq)
{ {
if (unlikely(rq->balance_callback)) if (unlikely(rq->balance_callback))
__balance_callback(rq); __balance_callback(rq);
...@@ -3752,7 +3785,7 @@ static inline void balance_callback(struct rq *rq) ...@@ -3752,7 +3785,7 @@ static inline void balance_callback(struct rq *rq)
#else #else
static inline void balance_callback(struct rq *rq) inline void balance_callback(struct rq *rq)
{ {
} }
...@@ -3789,7 +3822,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) ...@@ -3789,7 +3822,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
/* /*
* context_switch - switch to the new MM and the new thread's register state. * context_switch - switch to the new MM and the new thread's register state.
*/ */
static __always_inline struct rq * __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev, context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf) struct task_struct *next, struct rq_flags *rf)
{ {
...@@ -3846,7 +3879,7 @@ context_switch(struct rq *rq, struct task_struct *prev, ...@@ -3846,7 +3879,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
barrier(); barrier();
return finish_task_switch(prev); return finish_task_switch(prev);
} }EXPORT_SYMBOL(context_switch);
/* /*
* nr_running and nr_context_switches: * nr_running and nr_context_switches:
...@@ -4615,7 +4648,7 @@ void __noreturn do_task_dead(void) ...@@ -4615,7 +4648,7 @@ void __noreturn do_task_dead(void)
cpu_relax(); cpu_relax();
} }
static inline void sched_submit_work(struct task_struct *tsk) inline void sched_submit_work(struct task_struct *tsk)
{ {
unsigned int task_flags; unsigned int task_flags;
...@@ -4651,7 +4684,7 @@ static inline void sched_submit_work(struct task_struct *tsk) ...@@ -4651,7 +4684,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_schedule_flush_plug(tsk); blk_schedule_flush_plug(tsk);
} }
static void sched_update_worker(struct task_struct *tsk) void sched_update_worker(struct task_struct *tsk)
{ {
if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
if (tsk->flags & PF_WQ_WORKER) if (tsk->flags & PF_WQ_WORKER)
......
...@@ -572,6 +572,28 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -572,6 +572,28 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
} }
static void __traverse_cfs_rq(struct cfs_rq *cfs_rq, struct rb_node **node)
{
struct sched_entity *entry;
if (!*node) {
printk("TREE END\n");
return;
}
entry = rb_entry(*node, struct sched_entity, run_node);
__traverse_cfs_rq(cfs_rq, &(*node)->rb_left);
printk("%p\n", entry);
__traverse_cfs_rq(cfs_rq, &(*node)->rb_right);
}
void traverse_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
__traverse_cfs_rq(cfs_rq, link);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ {
rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
...@@ -2982,7 +3004,7 @@ adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *), ...@@ -2982,7 +3004,7 @@ adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
} }
#endif #endif
static void void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ {
update_load_add(&cfs_rq->load, se->load.weight); update_load_add(&cfs_rq->load, se->load.weight);
...@@ -4340,7 +4362,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ...@@ -4340,7 +4362,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_stats_enqueue(cfs_rq, se, flags); update_stats_enqueue(cfs_rq, se, flags);
check_spread(cfs_rq, se); check_spread(cfs_rq, se);
if (!curr) if (!curr)
__enqueue_entity(cfs_rq, se); #ifdef CONFIG_DTS
if (se->by_pass != INIT_BY_PASS)
#endif
__enqueue_entity(cfs_rq, se);
se->on_rq = 1; se->on_rq = 1;
/* /*
...@@ -4463,6 +4489,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ...@@ -4463,6 +4489,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
unsigned long ideal_runtime, delta_exec; unsigned long ideal_runtime, delta_exec;
struct sched_entity *se; struct sched_entity *se;
s64 delta; s64 delta;
#ifdef CONFIG_DTS
struct task_struct *curr_task = NULL;
if (entity_is_task(curr) && curr->by_pass != NONE_BY_PASS)
curr_task = task_of_dts_shared_se(curr);
#endif
ideal_runtime = sched_slice(cfs_rq, curr); ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
...@@ -4488,7 +4520,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ...@@ -4488,7 +4520,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
* re-elected due to buddy favours. * re-elected due to buddy favours.
*/ */
clear_buddies(cfs_rq, curr); clear_buddies(cfs_rq, curr);
return; goto end;
} }
/* /*
...@@ -4497,19 +4529,72 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ...@@ -4497,19 +4529,72 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
* This also mitigates buddy induced latencies under load. * This also mitigates buddy induced latencies under load.
*/ */
if (delta_exec < sysctl_sched_min_granularity) if (delta_exec < sysctl_sched_min_granularity)
return; goto end;
se = __pick_first_entity(cfs_rq); se = __pick_first_entity(cfs_rq);
delta = curr->vruntime - se->vruntime; delta = curr->vruntime - se->vruntime;
if (delta < 0) if (delta < 0)
return; goto end;
if (delta > ideal_runtime) if (delta > ideal_runtime) {
resched_curr(rq_of(cfs_rq)); resched_curr(rq_of(cfs_rq));
goto end;
} else {
return;
}
end:
#ifdef CONFIG_DTS
if (curr_task) {
curr_task->by_pass = END_BY_PASS;
curr_task->se.by_pass = END_BY_PASS;
curr_task->dts_shared_se.by_pass = END_BY_PASS;
}
#endif
} }
static void #ifdef CONFIG_DTS
/*
* We dequeue the task's original se but do NOT change any of its scheduling
* information. Correspondingly, the original se is enqueued again, unchanged,
* when the shared se expires. // TODO
* Acquiring the shared se's stats etc. still needs to be fixed for tasks running in DTS mode. // TODO
*/
void
replace_shared_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *shared_se)
{
if (shared_se->on_rq) {
/*
* Any task has to be enqueued before it get to execute on
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
// TODO
update_stats_wait_end(cfs_rq, shared_se);
__dequeue_entity(cfs_rq, se); /* the se of next task should be dequeued */
update_load_avg(cfs_rq, shared_se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, shared_se);
cfs_rq->curr = shared_se; /* subsequent update_curr() calls operate on cfs_rq->curr */
/*
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() &&
rq_of(cfs_rq)->cfs.load.weight >= 2*shared_se->load.weight) {
schedstat_set(shared_se->statistics.slice_max,
max((u64)schedstat_val(shared_se->statistics.slice_max),
shared_se->sum_exec_runtime - shared_se->prev_sum_exec_runtime));
}
shared_se->prev_sum_exec_runtime = shared_se->sum_exec_runtime;
}
#endif
void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ {
/* 'current' is not kept within the tree. */ /* 'current' is not kept within the tree. */
...@@ -4605,8 +4690,15 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) ...@@ -4605,8 +4690,15 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
/* under the DTS mechanism, prev may be the task's own se or the shared se */
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{ {
#ifdef CONFIG_DTS
struct task_struct *task = NULL;
if (entity_is_task(prev))
task = task_of(prev);
#endif
/* /*
* If still on the runqueue then deactivate_task() * If still on the runqueue then deactivate_task()
* was not called and update_curr() has to be done: * was not called and update_curr() has to be done:
...@@ -4627,6 +4719,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) ...@@ -4627,6 +4719,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
update_load_avg(cfs_rq, prev, 0); update_load_avg(cfs_rq, prev, 0);
} }
cfs_rq->curr = NULL; cfs_rq->curr = NULL;
#ifdef CONFIG_DTS
if (task && task->by_pass == END_BY_PASS) {
task->by_pass = NONE_BY_PASS;
task->se.by_pass = NONE_BY_PASS;
task->dts_shared_se.by_pass = NONE_BY_PASS;
}
#endif
} }
static void static void
...@@ -5630,6 +5729,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) ...@@ -5630,6 +5729,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int task_new = !(flags & ENQUEUE_WAKEUP); int task_new = !(flags & ENQUEUE_WAKEUP);
unsigned int prev_nr = rq->cfs.h_nr_running; unsigned int prev_nr = rq->cfs.h_nr_running;
#ifdef CONFIG_DTS
if (p->by_pass != NONE_BY_PASS) {
se = &p->dts_shared_se;
}
#endif
/* /*
* The code below (indirectly) updates schedutil which looks at * The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency. * the cfs_rq utilization to select a frequency.
...@@ -5737,11 +5842,17 @@ static void set_next_buddy(struct sched_entity *se); ...@@ -5737,11 +5842,17 @@ static void set_next_buddy(struct sched_entity *se);
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{ {
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se; struct sched_entity *se;
int task_sleep = flags & DEQUEUE_SLEEP; int task_sleep = flags & DEQUEUE_SLEEP;
int idle_h_nr_running = task_has_idle_policy(p); int idle_h_nr_running = task_has_idle_policy(p);
unsigned int prev_nr = rq->cfs.h_nr_running; unsigned int prev_nr = rq->cfs.h_nr_running;
bool was_sched_idle = sched_idle_rq(rq); bool was_sched_idle = sched_idle_rq(rq);
#ifdef CONFIG_DTS
if (p->by_pass != NONE_BY_PASS)
se = &p->dts_shared_se;
else
#endif
se = &p->se;
util_est_dequeue(&rq->cfs, p); util_est_dequeue(&rq->cfs, p);
...@@ -7159,11 +7270,28 @@ static void set_skip_buddy(struct sched_entity *se) ...@@ -7159,11 +7270,28 @@ static void set_skip_buddy(struct sched_entity *se)
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{ {
struct task_struct *curr = rq->curr; struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se; struct sched_entity *se, *pse;
struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency; int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0; int next_buddy_marked = 0;
#ifdef CONFIG_DTS
int curr_by_pass = curr->by_pass;
int p_by_pass = p->by_pass;
if (curr_by_pass != NONE_BY_PASS)
se = &curr->dts_shared_se;
else
#endif
se = &curr->se;
#ifdef CONFIG_DTS
if (p_by_pass != NONE_BY_PASS)
pse = &p->dts_shared_se;
else
#endif
pse = &p->se;
if (unlikely(se == pse)) if (unlikely(se == pse))
return; return;
...@@ -7718,13 +7846,25 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf ...@@ -7718,13 +7846,25 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
p = task_of(se); p = task_of(se);
if (se == NULL) {
printk("CFS_RQ Nr_running: %d\n", rq->cfs.nr_running);
printk("RQ Nr_running: %d\n", rq->nr_running);
}
/* /*
* Since we haven't yet done put_prev_entity and if the selected task * Since we haven't yet done put_prev_entity and if the selected task
* is a different task than we started out with, try and touch the * is a different task than we started out with, try and touch the
* least amount of cfs_rqs. * least amount of cfs_rqs.
*/ */
if (prev != p) { if (prev != p) {
struct sched_entity *pse = &prev->se; struct sched_entity *pse;
#ifdef CONFIG_DTS
if (prev->by_pass != NONE_BY_PASS)
pse = &prev->dts_shared_se;
else
#endif
pse = &prev->se;
while (!(cfs_rq = is_same_group(se, pse))) { while (!(cfs_rq = is_same_group(se, pse))) {
int se_depth = se->depth; int se_depth = se->depth;
...@@ -7877,8 +8017,15 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq) ...@@ -7877,8 +8017,15 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq)
*/ */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{ {
struct sched_entity *se = &prev->se; struct sched_entity *se;
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq;
#ifdef CONFIG_DTS
if (prev->by_pass != NONE_BY_PASS)
se = &prev->dts_shared_se;
else
#endif
se = &prev->se;
for_each_sched_entity(se) { for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se); cfs_rq = cfs_rq_of(se);
...@@ -7895,7 +8042,13 @@ static void yield_task_fair(struct rq *rq) ...@@ -7895,7 +8042,13 @@ static void yield_task_fair(struct rq *rq)
{ {
struct task_struct *curr = rq->curr; struct task_struct *curr = rq->curr;
struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se; struct sched_entity *se;
#ifdef CONFIG_DTS
if (curr->by_pass != NONE_BY_PASS)
se = &curr->dts_shared_se;
else
#endif
se = &curr->se;
/* /*
* Are we the only task in the tree? * Are we the only task in the tree?
...@@ -7926,6 +8079,13 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) ...@@ -7926,6 +8079,13 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{ {
struct sched_entity *se = &p->se; struct sched_entity *se = &p->se;
#ifdef CONFIG_DTS
/* DTS tasks do not support being run via the yield_to path. */
if (p->by_pass != NONE_BY_PASS) {
return false;
}
#endif
/* throttled hierarchies are not runnable */ /* throttled hierarchies are not runnable */
if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
return false; return false;
...@@ -8363,7 +8523,7 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) ...@@ -8363,7 +8523,7 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq)
/* /*
* detach_task() -- detach the task for the migration from @src_rq to @dst_cpu. * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu.
*/ */
static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu)
{ {
lockdep_assert_held(&src_rq->lock); lockdep_assert_held(&src_rq->lock);
...@@ -8573,6 +8733,10 @@ static void attach_task(struct rq *rq, struct task_struct *p) ...@@ -8573,6 +8733,10 @@ static void attach_task(struct rq *rq, struct task_struct *p)
BUG_ON(task_rq(p) != rq); BUG_ON(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK); activate_task(rq, p, ENQUEUE_NOCLOCK);
#ifdef CONFIG_DTS
if (p->by_pass != INIT_BY_PASS)
#endif
check_preempt_curr(rq, p, 0); check_preempt_curr(rq, p, 0);
} }
...@@ -11544,6 +11708,53 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, ...@@ -11544,6 +11708,53 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
return stolen; return stolen;
} }
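/* Pull @tsk from its current rq onto @dst_rq; a targeted variant of steal_from(). */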
int steal_task(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
struct task_struct *tsk)
{
struct rq_flags rf;
int stolen = 0;
int dst_cpu = dst_rq->cpu;
struct rq *src_rq = task_rq(tsk);
int src_cpu = task_cpu(tsk);
if (!steal_enabled())
return 0;
if (!cpu_active(dst_cpu))
return 0;
if (dst_cpu == src_cpu)
return 0;
if (*locked) {
rq_unpin_lock(dst_rq, dst_rf);
raw_spin_unlock(&dst_rq->lock);
*locked = false;
}
rq_lock_irqsave(src_rq, &rf);
update_rq_clock(src_rq);
if (!cpu_active(src_cpu))
tsk = NULL;
else
detach_task(tsk, src_rq, dst_cpu);
rq_unlock(src_rq, &rf);
if (tsk) {
raw_spin_lock(&dst_rq->lock);
rq_repin_lock(dst_rq, dst_rf);
*locked = true;
update_rq_clock(dst_rq);
attach_task(dst_rq, tsk);
stolen = 1;
schedstat_inc(dst_rq->steal);
}
local_irq_restore(rf.flags);
return stolen;
}
/* /*
* Conservative upper bound on the max cost of a steal, in nsecs (the typical * Conservative upper bound on the max cost of a steal, in nsecs (the typical
* cost is 1-2 microsec). Do not steal if average idle time is less. * cost is 1-2 microsec). Do not steal if average idle time is less.
...@@ -11653,6 +11864,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) ...@@ -11653,6 +11864,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se; struct sched_entity *se = &curr->se;
#ifdef CONFIG_DTS
if (curr->by_pass != NONE_BY_PASS) {
se = &curr->dts_shared_se;
}
#endif
for_each_sched_entity(se) { for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se); cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued); entity_tick(cfs_rq, se, queued);
...@@ -12148,6 +12365,81 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task ...@@ -12148,6 +12365,81 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
return rr_interval; return rr_interval;
} }
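/* Update the runqueue clock, load average and runtime statistics of the current entity. */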
void update_before_bypass(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
struct sched_entity *curr;
struct cfs_rq *cfs_rq;
#ifdef CONFIG_DTS
if (current->by_pass != NONE_BY_PASS)
curr = &current->dts_shared_se;
else
#endif
curr = &current->se;
cfs_rq = cfs_rq_of(curr);
rq_lock(rq, &rf);
update_rq_clock(rq);
/*
* Ensure that runnable average is periodically updated.
*/
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
/*
* Ensure that runnable average is periodically updated.
*/
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
rq_unlock(rq, &rf);
}
/*
* Return: 1 if the task still has time slice left, 0 otherwise.
*/
int check_task_left_time(struct task_struct *task)
{
unsigned long ideal_runtime, delta_exec;
struct sched_entity *se;
struct cfs_rq *cfs_rq;
#ifdef CONFIG_DTS
if (task->by_pass != NONE_BY_PASS)
se = &task->dts_shared_se;
else
#endif
se = &task->se;
cfs_rq = cfs_rq_of(se);
ideal_runtime = sched_slice(cfs_rq, se);
delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
if (cfs_rq->nr_running > 1) {
resched_curr(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
*/
clear_buddies(cfs_rq, se);
}
return 0;
}
return 1;
}
/* /*
* All the scheduling class methods: * All the scheduling class methods:
*/ */
......
...@@ -534,6 +534,12 @@ extern void sched_offline_group(struct task_group *tg); ...@@ -534,6 +534,12 @@ extern void sched_offline_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk); extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_DTS
extern void replace_shared_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev_se, struct sched_entity *shared_se);
#endif
extern void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se);
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
...@@ -1185,9 +1191,22 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ...@@ -1185,9 +1191,22 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define raw_rq() raw_cpu_ptr(&runqueues) #define raw_rq() raw_cpu_ptr(&runqueues)
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_DTS
static inline struct task_struct *task_of_dts_shared_se(struct sched_entity *dts_shared_se)
{
SCHED_WARN_ON(!entity_is_task(dts_shared_se));
return container_of(dts_shared_se, struct task_struct, dts_shared_se);
}
#endif
static inline struct task_struct *task_of(struct sched_entity *se) static inline struct task_struct *task_of(struct sched_entity *se)
{ {
SCHED_WARN_ON(!entity_is_task(se)); SCHED_WARN_ON(!entity_is_task(se));
#ifdef CONFIG_DTS
if (se->by_pass != NONE_BY_PASS)
return task_of_dts_shared_se(se);
else
#endif
return container_of(se, struct task_struct, se); return container_of(se, struct task_struct, se);
} }
...@@ -1210,8 +1229,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) ...@@ -1210,8 +1229,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
#else #else
#ifdef CONFIG_DTS
static inline struct task_struct *task_of_dts_shared_se(struct sched_entity *dts_shared_se)
{
return container_of(dts_shared_se, struct task_struct, dts_shared_se);
}
static inline struct cfs_rq *cfs_rq_of_dts_shared_se(struct sched_entity *se)
{
struct task_struct *p = task_of_dts_shared_se(se);
struct rq *rq = task_rq(p);
return &rq->cfs;
}
#endif
static inline struct task_struct *task_of(struct sched_entity *se) static inline struct task_struct *task_of(struct sched_entity *se)
{ {
#ifdef CONFIG_DTS
if (se->by_pass != NONE_BY_PASS)
return task_of_dts_shared_se(se);
else
#endif
return container_of(se, struct task_struct, se); return container_of(se, struct task_struct, se);
} }
...@@ -1220,7 +1259,7 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) ...@@ -1220,7 +1259,7 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
return &task_rq(p)->cfs; return &task_rq(p)->cfs;
} }
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) static inline struct cfs_rq *cfs_rq_of_se(struct sched_entity *se)
{ {
struct task_struct *p = task_of(se); struct task_struct *p = task_of(se);
struct rq *rq = task_rq(p); struct rq *rq = task_rq(p);
...@@ -1228,6 +1267,17 @@ static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) ...@@ -1228,6 +1267,17 @@ static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
return &rq->cfs; return &rq->cfs;
} }
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
#ifdef CONFIG_DTS
if (se->by_pass != NONE_BY_PASS)
return cfs_rq_of_dts_shared_se(se);
else
#endif
return cfs_rq_of_se(se);
}
/* runqueue "owned" by this group */ /* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{ {
...@@ -2404,6 +2454,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) ...@@ -2404,6 +2454,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
extern void traverse_cfs_rq(struct cfs_rq *cfs_rq);
#ifdef CONFIG_SCHED_DEBUG #ifdef CONFIG_SCHED_DEBUG
extern bool sched_debug_enabled; extern bool sched_debug_enabled;
...@@ -2789,3 +2840,18 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) ...@@ -2789,3 +2840,18 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
void swake_up_all_locked(struct swait_queue_head *q); void swake_up_all_locked(struct swait_queue_head *q);
void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
#ifdef CONFIG_DTS
extern void sched_submit_work(struct task_struct *tsk);
extern void sched_update_worker(struct task_struct *tsk);
extern struct rq *context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf);
extern void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se);
#ifdef CONFIG_SCHED_STEAL
extern int steal_task(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
struct task_struct *tsk);
extern void update_before_bypass(void);
extern void balance_callback(struct rq *rq);
#endif
#endif
\ No newline at end of file
...@@ -13,11 +13,11 @@ ...@@ -13,11 +13,11 @@
/* The futex the main thread waits on. */ /* The futex the main thread waits on. */
futex_t futex_main = FUTEX_INITIALIZER; futex_t futex_main = FUTEX_INITIALIZER;
/* The futex the other thread wats on. */ /* The futex the other thread waits on. */
futex_t futex_other = FUTEX_INITIALIZER; futex_t futex_other = FUTEX_INITIALIZER;
/* The number of iterations to run (>1 => run benchmarks. */ /* The number of iterations to run (>1 => run benchmarks. */
static int cfg_iterations = 1; static int cfg_iterations = 5;
/* If != 0, print diagnostic messages. */ /* If != 0, print diagnostic messages. */
static int cfg_verbose; static int cfg_verbose;
...@@ -28,17 +28,21 @@ static int cfg_validate = 1; ...@@ -28,17 +28,21 @@ static int cfg_validate = 1;
/* How to swap threads. */ /* How to swap threads. */
#define SWAP_WAKE_WAIT 1 #define SWAP_WAKE_WAIT 1
#define SWAP_SWAP 2 #define SWAP_SWAP 2
#define SWAP_SWAP_DTS 4
/* Futex values. */ /* Futex values. */
#define FUTEX_WAITING 0 #define FUTEX_WAITING 0
#define FUTEX_WAKEUP 1 #define FUTEX_WAKEUP 1
#define FUTEX_FLAGS_DTS_MODE 512
/* An atomic counter used to validate proper swapping. */ /* An atomic counter used to validate proper swapping. */
static atomic_t validation_counter; static atomic_t validation_counter;
void futex_swap_op(int mode, futex_t *futex_this, futex_t *futex_that) void futex_swap_op(int mode, futex_t *futex_this, futex_t *futex_that)
{ {
int ret; int ret;
int flags = 0;
switch (mode) { switch (mode) {
case SWAP_WAKE_WAIT: case SWAP_WAKE_WAIT:
...@@ -52,11 +56,14 @@ void futex_swap_op(int mode, futex_t *futex_this, futex_t *futex_that) ...@@ -52,11 +56,14 @@ void futex_swap_op(int mode, futex_t *futex_this, futex_t *futex_that)
} }
break; break;
case SWAP_SWAP_DTS:
flags |= FUTEX_FLAGS_DTS_MODE;
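/* fall through: SWAP_SWAP_DTS also takes the SWAP_SWAP path below */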
case SWAP_SWAP: case SWAP_SWAP:
flags |= FUTEX_PRIVATE_FLAG;
futex_set(futex_this, FUTEX_WAITING); futex_set(futex_this, FUTEX_WAITING);
futex_set(futex_that, FUTEX_WAKEUP); futex_set(futex_that, FUTEX_WAKEUP);
ret = futex_swap(futex_this, FUTEX_WAITING, NULL, ret = futex_swap(futex_this, FUTEX_WAITING, NULL,
futex_that, FUTEX_PRIVATE_FLAG); futex_that, flags);
if (ret < 0 && errno == ENOSYS) { if (ret < 0 && errno == ENOSYS) {
/* futex_swap not implemented */ /* futex_swap not implemented */
perror("futex_swap"); perror("futex_swap");
...@@ -171,13 +178,14 @@ void usage(char *prog) ...@@ -171,13 +178,14 @@ void usage(char *prog)
printf(" -i N Use N iterations to benchmark\n"); printf(" -i N Use N iterations to benchmark\n");
printf(" -n Do not validate swapping correctness\n"); printf(" -n Do not validate swapping correctness\n");
printf(" -v Print diagnostic messages\n"); printf(" -v Print diagnostic messages\n");
printf(" -d Benchmark with the direct-thread-switch(DTS) mechanism\n");
} }
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
int c; int c;
while ((c = getopt(argc, argv, "hi:nv")) != -1) { while ((c = getopt(argc, argv, "hi:nvd")) != -1) {
switch (c) { switch (c) {
case 'h': case 'h':
usage(basename(argv[0])); usage(basename(argv[0]));
...@@ -191,6 +199,9 @@ int main(int argc, char *argv[]) ...@@ -191,6 +199,9 @@ int main(int argc, char *argv[])
case 'v': case 'v':
cfg_verbose = 1; cfg_verbose = 1;
break; break;
case 'd':
goto dts_test;
break;
default: default:
usage(basename(argv[0])); usage(basename(argv[0]));
exit(1); exit(1);
...@@ -205,5 +216,10 @@ int main(int argc, char *argv[]) ...@@ -205,5 +216,10 @@ int main(int argc, char *argv[])
run_test(SWAP_SWAP); run_test(SWAP_SWAP);
printf("PASS\n"); printf("PASS\n");
dts_test:
printf("\n\n---- running SWAP_SWAP with the direct-thread-switch(DTS) mechanism ----\n\n");
run_test(SWAP_SWAP_DTS);
printf("PASS\n");
return 0; return 0;
} }