diff --git a/include/linux/sched.h b/include/linux/sched.h
index 714386dd3e181e35e16deb0a87175fbb8e149006..9aa03c0957be5a5c3e13e0990e0375f1c63f399d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -68,6 +68,11 @@ struct task_delay_info;
 struct task_group;
 struct io_uring_task;
 
+#define NONE_BY_PASS 0x0000
+#define INIT_BY_PASS 0x0001
+#define IN_BY_PASS 0x0002
+#define END_BY_PASS 0x0004
+
 /*
  * Task state bitmask. NOTE! These bits are also
  * encoded in fs/proc/array.c: get_task_state().
@@ -500,6 +505,10 @@ struct sched_entity {
 	unsigned long			runnable_weight;
 #endif
 
+#ifdef CONFIG_DTS
+	int				by_pass;
+#endif
+
 #ifdef CONFIG_SMP
 	/*
 	 * Per entity load average tracking.
@@ -726,6 +735,15 @@ struct task_struct {
 	int				normal_prio;
 	unsigned int			rt_priority;
 
+#ifdef CONFIG_DTS
+	/*
+	 * by_pass indicates that the task was launched via a direct thread
+	 * switch. dts_shared_se is the sched_entity shared with the DTS task.
+	 */
+	int				by_pass;
+	struct sched_entity		dts_shared_se;
+#endif
+
 	const struct sched_class	*sched_class;
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;
@@ -2194,6 +2212,10 @@ static inline int sched_qos_cpu_overload(void)
 }
 #endif
 
+#ifdef CONFIG_DTS
+extern int check_task_left_time(struct task_struct *task);
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 extern void sched_settag(struct task_struct *tsk, s64 tag);
 
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index c1d151d97deaa3bfb760545b7475a455b6c6d3ec..e2ed0553046d8af5a970837440f01054a05d022c 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -25,7 +25,14 @@
 
 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CLOCK_REALTIME	256
+
+#ifdef CONFIG_DTS
+#define FUTEX_FLAGS_DTS_MODE	512
+#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | \
+				  FUTEX_FLAGS_DTS_MODE)
+#else
 #define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
+#endif
 
 #define FUTEX_WAIT_PRIVATE	(FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAKE_PRIVATE	(FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
@@ -43,6 +50,7 @@
 					 FUTEX_PRIVATE_FLAG)
 #define FUTEX_SWAP_PRIVATE	(FUTEX_SWAP | FUTEX_PRIVATE_FLAG)
 
+
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
  * thread exit time.
diff --git a/init/Kconfig b/init/Kconfig
index 1c607825c2dbd2a0ae5554ac8eef314209f80e55..e1be030628a3b40b09a92cce446476ed3e1c2a65 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1261,6 +1261,13 @@ config SCHED_STEAL
 
 	  If unsure, say N here.
 
+config DTS
+	bool "Direct Thread Switch"
+	default y
+	depends on SCHED_STEAL
+	help
+	  Enable the direct thread switch (DTS) mechanism for the futex_swap operation.
+
 config CHECKPOINT_RESTORE
 	bool "Checkpoint/restore support"
 	select PROC_CHILDREN
diff --git a/kernel/futex.c b/kernel/futex.c
index 42f55d1da678b04d4a298e0e1269d8fe0fa97a85..48da1e8b89480ae86bf19a2fc9b9272c9c7da1ca 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -39,11 +39,16 @@
 #include
 #include
 #include
+#include
+#include
 
 #include
 
 #include "locking/rtmutex_common.h"
 
+#ifdef CONFIG_DTS
+#include "sched/sched.h"
+#endif
 /*
  * READ this before attempting to hack on futexes!
  *
@@ -161,7 +166,7 @@ static int __read_mostly futex_cmpxchg_enabled;
  * NOMMU does not have per process address space. Let the compiler optimize
  * code away.
*/ -# define FLAGS_SHARED 0x00 +#define FLAGS_SHARED 0x00 #endif #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 @@ -2585,6 +2590,219 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) return 0; } +#ifdef CONFIG_DTS +static int __direct_thread_switch(struct task_struct *next) +{ + int cpu = smp_processor_id(); + int success = 1; + struct rq_flags rf; + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq = &rq->cfs; + struct task_struct *prev = rq->curr; + struct sched_entity *prev_se, *next_se; + unsigned long *switch_count = &prev->nvcsw; + unsigned long prev_state; + int next_state; + struct rq *src_rq_next; + bool locked; + + preempt_disable(); + local_irq_disable(); + + if (!prev->by_pass) { + prev_se = &prev->se; + } else { + prev_se = &prev->dts_shared_se; + } + + next_se = &next->se; + + prev->by_pass = NONE_BY_PASS; + next->by_pass = INIT_BY_PASS; + next->dts_shared_se = *prev_se; + prev_se->by_pass = NONE_BY_PASS; + next->dts_shared_se.by_pass = INIT_BY_PASS; + + /* task_struct::state is volatile so far */ + next_state = next->state; + src_rq_next = task_rq(next); + locked = true; + /* Deliver the execution to the callee. */ + if (next_state == TASK_RUNNING) { + /* The next is running now. */ + if (task_running(src_rq_next, next)) { + success = 0; + goto end; + } + /* The next task is runnable, and may stay in the current core's rq or other cores' rq. */ + /* Dequeue the next task's se (rather than dts_shared_se) to keep fairness and consistence. + * Enqueue the next task's se when the task expired. + */ + if (task_rq(next) != rq) { +#ifdef CONFIG_SCHED_STEAL + /* migrate */ + if (!steal_task(rq, &rf, &locked, next)) { + success = 0; + goto end; + } +#else + success = 0; + goto end; +#endif + } + replace_shared_entity(cfs_rq, next_se, &next->dts_shared_se); + } else if (next_state == TASK_INTERRUPTIBLE) { + /* + * + * The next task in the sleeping state caused by futex_swap, futex_wait, + * can be woken up here so far, but signals, and other interruptible situations + * need to be implemented here. + * P.S. We pick up the next task from the wake list of the corresponding futex_t. + */ + + /* Enqueue the shared_se and change the state without entering schedule() path. */ + if (!wake_up_process_prefer_current_cpu(next)) { + success = 0; + goto end; + } + + /* success to wakeup (set p->state = TASK_RUNNING) */ + /* dequeue the shared_se and set rq->curr = &next->dts_shared_se; */ + set_next_entity(cfs_rq, &next->dts_shared_se); + + } else { + success = 0; + goto end; + } + + /* increase rq->cfs.nr_running */ + cfs_rq->nr_running++; + + sched_submit_work(prev); + + rcu_note_context_switch(false); + + /* + * Make sure that signal_pending_state()->signal_pending() below + * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) + * done by the caller(futex_wait_queue_me) to avoid the race with signal_wake_up(): + * + * __set_current_state(@state) signal_wake_up() + * __direct_thread_switch() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_state + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) + * + * Also, the membarrier system call requires a full memory barrier + * after coming from user-space, before storing to rq->curr. + */ + rq_lock(rq, &rf); + smp_mb__after_spinlock(); + + /* + * We may fail to switch, so do not deactivate the current task before + * process the next. 
+ */ + + /* + * We must load prev->state once (task_struct::state is volatile), such + * that: + * + * - we form a control dependency vs deactivate_task() below. + * - ptrace_{,un}freeze_traced() can change ->state underneath us. + */ + prev_state = prev->state; + if (prev_state) { + if (signal_pending_state(prev_state, prev)) { + prev->state = TASK_RUNNING; + } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev->flags & PF_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ + deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); + + if (prev->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } + } + } + + rq->nr_switches++; + /* + * RCU users of rcu_dereference(rq->curr) may not see + * changes to task_struct made by pick_next_task(). + */ + RCU_INIT_POINTER(rq->curr, next); + /* + * The membarrier system call requires each architecture + * to have a full memory barrier after updating + * rq->curr, before returning to user-space. + * + * Here are the schemes providing that barrier on the + * various architectures: + * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. + * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. + * - finish_lock_switch() for weakly-ordered + * architectures where spin_unlock is a full barrier, + * - switch_to() for arm64 (weakly-ordered, spin_unlock + * is a RELEASE barrier), + */ + ++*switch_count; + + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + + trace_sched_switch(false, prev, next); + + /* do the get_task_struct() in the futex_wait_queue_me() before */ + put_task_struct(next); + + rq = context_switch(rq, prev, next, &rf); + + balance_callback(rq); + sched_update_worker(next); +end: + sched_preempt_enable_no_resched(); + return success; +} + +/* + * return + * 0 for fail + * 1 for succeed + */ +static int direct_thread_switch(struct task_struct *next) +{ + if (next->sched_class != &fair_sched_class || + current == next) { + return 0; + } + + if (!check_task_left_time(current)) { + return 0; + } + + return __direct_thread_switch(next); +} +#endif /* CONFIG_DTS */ + /** * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal * @hb: the futex hash bucket, must be locked by the caller @@ -2595,7 +2813,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout, - struct task_struct *next) + struct task_struct *next, int flags) { /* * The task state is guaranteed to be set before another task can @@ -2615,6 +2833,9 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { +#ifdef CONFIG_DTS + int do_dts_switch = 0; +#endif /* * If the timer has already expired, current will already be * flagged for rescheduling. 
Only call schedule if there @@ -2622,27 +2843,49 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, */ if (!timeout || timeout->task) { if (next) { +#ifdef CONFIG_DTS /* - * wake_up_process() below will be replaced - * in the next patch with - * wake_up_process_prefer_current_cpu(). + * If we fail to switch to the next task directly, try to switch to + * the next task in the traditional way. + * */ + if (flags & FUTEX_FLAGS_DTS_MODE) + do_dts_switch = direct_thread_switch(next); + + if (!do_dts_switch) +#endif + { #ifdef CONFIG_SMP - wake_up_process_prefer_current_cpu(next); + wake_up_process_prefer_current_cpu(next); #else - wake_up_process(next); + wake_up_process(next); +#endif + } + +#ifdef CONFIG_DTS + if (!do_dts_switch) #endif - put_task_struct(next); + put_task_struct(next); + next = NULL; } - freezable_schedule(); +#ifdef CONFIG_DTS + if (!do_dts_switch) +#endif + freezable_schedule(); } } __set_current_state(TASK_RUNNING); + + if (next) { +#ifdef CONFIG_DTS + direct_thread_switch(next); +#else wake_up_process(next); put_task_struct(next); +#endif } } @@ -2743,7 +2986,7 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, goto out; /* queue_me and wait for wakeup, timeout, or a signal. */ - futex_wait_queue_me(hb, &q, to, next); + futex_wait_queue_me(hb, &q, to, next, flags); next = NULL; /* If we were woken (and unqueued), we succeeded, whatever. */ @@ -2819,6 +3062,15 @@ static int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val, next->wake_q.next = NULL; } + /* Basic security test. (Are the two tasks in the same group?) */ + + /* Have any time slices to be used? */ + + /* + * The old one will go to sleep and enqueue the rq, meanwhile, get + * the new one to run. + */ + return futex_wait(uaddr, flags, val, abs_time, bitset, next); } @@ -3282,7 +3534,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, } /* Queue the futex_q, drop the hb lock, wait for wakeup. 
*/ - futex_wait_queue_me(hb, &q, to, NULL); + futex_wait_queue_me(hb, &q, to, NULL, flags); spin_lock(&hb->lock); ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); @@ -3768,6 +4020,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, int cmd = op & FUTEX_CMD_MASK; unsigned int flags = 0; +#ifdef CONFIG_DTS + if (op & FUTEX_FLAGS_DTS_MODE) { + flags |= FUTEX_FLAGS_DTS_MODE; + } +#endif + if (!(op & FUTEX_PRIVATE_FLAG)) flags |= FLAGS_SHARED; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5bf3553233819a40f40b4dc3ebec2b9910d4d8e5..a9093f6d98d9172acef1afde6ee86f51559d6eba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2469,7 +2469,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) { - check_preempt_curr(rq, p, wake_flags); +#ifdef CONFIG_DTS + if (p->by_pass != INIT_BY_PASS) +#endif + check_preempt_curr(rq, p, wake_flags); + p->state = TASK_RUNNING; trace_sched_wakeup(p); @@ -2996,7 +3000,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) out: if (success) ttwu_stat(p, task_cpu(p), wake_flags); - preempt_enable(); +#ifdef CONFIG_DTS + if (p->by_pass == INIT_BY_PASS) { + p->by_pass = IN_BY_PASS; + p->se.by_pass = IN_BY_PASS; + p->dts_shared_se.by_pass = IN_BY_PASS; + preempt_enable_no_resched(); + } + else +#endif + preempt_enable(); return success; } @@ -3086,6 +3099,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +#ifdef CONFIG_DTS + p->dts_shared_se.on_rq = 0; + p->dts_shared_se.exec_start = 0; + p->dts_shared_se.sum_exec_runtime = 0; + p->dts_shared_se.prev_sum_exec_runtime = 0; + p->dts_shared_se.nr_migrations = 0; + p->dts_shared_se.vruntime = 0; + INIT_LIST_HEAD(&p->dts_shared_se.group_node); +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; #endif @@ -3315,6 +3338,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); +#ifdef CONFIG_DTS + p->by_pass = NONE_BY_PASS; + p->se.by_pass = NONE_BY_PASS; + p->dts_shared_se.by_pass = NONE_BY_PASS; +#endif #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) @@ -3702,6 +3730,11 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } +#ifdef CONFIG_DTS + prev->by_pass = NONE_BY_PASS; + prev->se.by_pass = NONE_BY_PASS; + prev->dts_shared_se.by_pass = NONE_BY_PASS; +#endif if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -3744,7 +3777,7 @@ static void __balance_callback(struct rq *rq) raw_spin_unlock_irqrestore(&rq->lock, flags); } -static inline void balance_callback(struct rq *rq) +inline void balance_callback(struct rq *rq) { if (unlikely(rq->balance_callback)) __balance_callback(rq); @@ -3752,7 +3785,7 @@ static inline void balance_callback(struct rq *rq) #else -static inline void balance_callback(struct rq *rq) +inline void balance_callback(struct rq *rq) { } @@ -3789,7 +3822,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) /* * context_switch - switch to the new MM and the new thread's register state. 
 */
-static __always_inline struct rq *
+__always_inline struct rq *
 context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next, struct rq_flags *rf)
 {
@@ -3846,7 +3879,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	barrier();
 
 	return finish_task_switch(prev);
-}
+}
+EXPORT_SYMBOL(context_switch);
 
 /*
  * nr_running and nr_context_switches:
@@ -4615,7 +4649,7 @@ void __noreturn do_task_dead(void)
 		cpu_relax();
 }
 
-static inline void sched_submit_work(struct task_struct *tsk)
+inline void sched_submit_work(struct task_struct *tsk)
 {
 	unsigned int task_flags;
 
@@ -4651,7 +4685,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
 		blk_schedule_flush_plug(tsk);
 }
 
-static void sched_update_worker(struct task_struct *tsk)
+void sched_update_worker(struct task_struct *tsk)
 {
 	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
 		if (tsk->flags & PF_WQ_WORKER)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 30fbcd06baa90a73e2d5fd68e44fdcad325115d2..d58a949c95194f39a9da820fc7335d0bcfbc08a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -572,6 +572,28 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
 }
 
+static void __traverse_cfs_rq(struct cfs_rq *cfs_rq, struct rb_node **node)
+{
+	struct sched_entity *entry;
+
+	if (!*node) {
+		printk("TREE END\n");
+		return;
+	}
+
+	entry = rb_entry(*node, struct sched_entity, run_node);
+
+	__traverse_cfs_rq(cfs_rq, &(*node)->rb_left);
+	printk("%p\n", entry);
+	__traverse_cfs_rq(cfs_rq, &(*node)->rb_right);
+}
+
+void traverse_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
+	__traverse_cfs_rq(cfs_rq, link);
+}
+
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
@@ -2982,7 +3004,7 @@ adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
 }
 #endif
 
-static void
+void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
@@ -4340,7 +4362,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_stats_enqueue(cfs_rq, se, flags);
 	check_spread(cfs_rq, se);
 	if (!curr)
-		__enqueue_entity(cfs_rq, se);
+#ifdef CONFIG_DTS
+	if (se->by_pass != INIT_BY_PASS)
+#endif
+		__enqueue_entity(cfs_rq, se);
+
 	se->on_rq = 1;
 
 	/*
@@ -4463,6 +4489,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	unsigned long ideal_runtime, delta_exec;
 	struct sched_entity *se;
 	s64 delta;
+#ifdef CONFIG_DTS
+	struct task_struct *curr_task = NULL;
+
+	if (entity_is_task(curr) && curr->by_pass != NONE_BY_PASS)
+		curr_task = task_of_dts_shared_se(curr);
+#endif
 
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -4488,7 +4520,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		 * re-elected due to buddy favours.
 		 */
 		clear_buddies(cfs_rq, curr);
-		return;
+		goto end;
 	}
 
 	/*
@@ -4497,19 +4529,72 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	 * This also mitigates buddy induced latencies under load.
 	 */
 	if (delta_exec < sysctl_sched_min_granularity)
-		return;
+		goto end;
 
 	se = __pick_first_entity(cfs_rq);
 	delta = curr->vruntime - se->vruntime;
 
 	if (delta < 0)
-		return;
+		goto end;
 
-	if (delta > ideal_runtime)
+	if (delta > ideal_runtime) {
 		resched_curr(rq_of(cfs_rq));
+		goto end;
+	} else {
+		return;
+	}
+end:
+#ifdef CONFIG_DTS
+	if (curr_task) {
+		curr_task->by_pass = END_BY_PASS;
+		curr_task->se.by_pass = END_BY_PASS;
+		curr_task->dts_shared_se.by_pass = END_BY_PASS;
+	}
+#endif
 }
 
-static void
+#ifdef CONFIG_DTS
+/*
+ * Dequeue the task's original se, but do NOT change any of its scheduling
+ * information. Correspondingly, re-enqueue the original se, again without
+ * changing its information, once the shared se expires. // TODO
+ * Gathering stats for the shared se while the task runs in DTS mode still needs to be fixed. // TODO
+ */
+void
+replace_shared_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *shared_se)
+{
+	if (shared_se->on_rq) {
+		/*
+		 * Any task has to be enqueued before it gets to execute on
+		 * a CPU. So account for the time it spent waiting on the
+		 * runqueue.
+		 */
+		// TODO
+		update_stats_wait_end(cfs_rq, shared_se);
+		__dequeue_entity(cfs_rq, se); /* the next task's own se is what gets dequeued */
+		update_load_avg(cfs_rq, shared_se, UPDATE_TG);
+	}
+
+	update_stats_curr_start(cfs_rq, shared_se);
+	cfs_rq->curr = shared_se; /* subsequent update_curr() calls operate on cfs_rq->curr */
+
+	/*
+	 * Track our maximum slice length, if the CPU's load is at
+	 * least twice that of our own weight (i.e. don't track it
+	 * when there are only lesser-weight tasks around):
+	 */
+	if (schedstat_enabled() &&
+	    rq_of(cfs_rq)->cfs.load.weight >= 2*shared_se->load.weight) {
+		schedstat_set(shared_se->statistics.slice_max,
+			max((u64)schedstat_val(shared_se->statistics.slice_max),
+			shared_se->sum_exec_runtime - shared_se->prev_sum_exec_runtime));
+	}
+
+	shared_se->prev_sum_exec_runtime = shared_se->sum_exec_runtime;
+}
+#endif
+
+void
 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	/* 'current' is not kept within the tree. */
@@ -4605,8 +4690,15 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 
 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
+/* prev may point at either the task's own se or its shared DTS se. */
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
+#ifdef CONFIG_DTS
+	struct task_struct *task = NULL;
+
+	if (entity_is_task(prev))
+		task = task_of(prev);
+#endif
 	/*
 	 * If still on the runqueue then deactivate_task()
 	 * was not called and update_curr() has to be done:
@@ -4627,6 +4719,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		update_load_avg(cfs_rq, prev, 0);
 	}
 	cfs_rq->curr = NULL;
+#ifdef CONFIG_DTS
+	if (task && task->by_pass == END_BY_PASS) {
+		task->by_pass = NONE_BY_PASS;
+		task->se.by_pass = NONE_BY_PASS;
+		task->dts_shared_se.by_pass = NONE_BY_PASS;
+	}
+#endif
 }
 
 static void
@@ -5630,6 +5729,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int task_new = !(flags & ENQUEUE_WAKEUP);
 	unsigned int prev_nr = rq->cfs.h_nr_running;
 
+#ifdef CONFIG_DTS
+	if (p->by_pass != NONE_BY_PASS) {
+		se = &p->dts_shared_se;
+	}
+#endif
+
 	/*
 	 * The code below (indirectly) updates schedutil which looks at
 	 * the cfs_rq utilization to select a frequency.
@@ -5737,11 +5842,17 @@ static void set_next_buddy(struct sched_entity *se); static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; + struct sched_entity *se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); unsigned int prev_nr = rq->cfs.h_nr_running; bool was_sched_idle = sched_idle_rq(rq); +#ifdef CONFIG_DTS + if (p->by_pass != NONE_BY_PASS) + se = &p->dts_shared_se; + else +#endif + se = &p->se; util_est_dequeue(&rq->cfs, p); @@ -7159,11 +7270,28 @@ static void set_skip_buddy(struct sched_entity *se) static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; + struct sched_entity *se, *pse; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; +#ifdef CONFIG_DTS + int curr_by_pass = curr->by_pass; + int p_by_pass = p->by_pass; + + if (curr_by_pass != NONE_BY_PASS) + se = &curr->dts_shared_se; + else +#endif + se = &curr->se; + +#ifdef CONFIG_DTS + if (p_by_pass != NONE_BY_PASS) + pse = &p->dts_shared_se; + else +#endif + pse = &p->se; + if (unlikely(se == pse)) return; @@ -7718,13 +7846,25 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf p = task_of(se); + if (se == NULL) { + printk("CFS_RQ Nr_running: %d\n", rq->cfs.nr_running); + printk("RQ Nr_running: %d\n", rq->nr_running); + } + /* * Since we haven't yet done put_prev_entity and if the selected task * is a different task than we started out with, try and touch the * least amount of cfs_rqs. */ if (prev != p) { - struct sched_entity *pse = &prev->se; + struct sched_entity *pse; +#ifdef CONFIG_DTS + if (prev->by_pass != NONE_BY_PASS) + pse = &prev->dts_shared_se; + else +#endif + pse = &prev->se; + while (!(cfs_rq = is_same_group(se, pse))) { int se_depth = se->depth; @@ -7877,8 +8017,15 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq) */ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) { - struct sched_entity *se = &prev->se; + struct sched_entity *se; struct cfs_rq *cfs_rq; +#ifdef CONFIG_DTS + if (prev->by_pass != NONE_BY_PASS) + se = &prev->dts_shared_se; + else +#endif + se = &prev->se; + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -7895,7 +8042,13 @@ static void yield_task_fair(struct rq *rq) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *se = &curr->se; + struct sched_entity *se; +#ifdef CONFIG_DTS + if (curr->by_pass != NONE_BY_PASS) + se = &curr->dts_shared_se; + else +#endif + se = &curr->se; /* * Are we the only task in the tree? @@ -7926,6 +8079,13 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; +#ifdef CONFIG_DTS + /* DTS tasks DO NOT support being executed by yeild_to method.*/ + if (p->by_pass != NONE_BY_PASS) { + return false; + } +#endif + /* throttled hierarchies are not runnable */ if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; @@ -8363,7 +8523,7 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) /* * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu. 
*/ -static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) +void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) { lockdep_assert_held(&src_rq->lock); @@ -8573,6 +8733,10 @@ static void attach_task(struct rq *rq, struct task_struct *p) BUG_ON(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); + +#ifdef CONFIG_DTS + if (p->by_pass != INIT_BY_PASS) +#endif check_preempt_curr(rq, p, 0); } @@ -11544,6 +11708,53 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, return stolen; } +int steal_task(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, + struct task_struct *tsk) +{ + struct rq_flags rf; + int stolen = 0; + int dst_cpu = dst_rq->cpu; + struct rq *src_rq = task_rq(tsk); + int src_cpu = task_cpu(tsk); + + if (!steal_enabled()) + return 0; + + if (!cpu_active(dst_cpu)) + return 0; + + if (dst_cpu == src_cpu) + return 0; + + if (*locked) { + rq_unpin_lock(dst_rq, dst_rf); + raw_spin_unlock(&dst_rq->lock); + *locked = false; + } + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (!cpu_active(src_cpu)) + tsk = NULL; + else + detach_task(tsk, src_rq, dst_cpu); + + rq_unlock(src_rq, &rf); + + if (tsk) { + raw_spin_lock(&dst_rq->lock); + rq_repin_lock(dst_rq, dst_rf); + *locked = true; + update_rq_clock(dst_rq); + attach_task(dst_rq, tsk); + stolen = 1; + schedstat_inc(dst_rq->steal); + } + local_irq_restore(rf.flags); + + return stolen; +} + /* * Conservative upper bound on the max cost of a steal, in nsecs (the typical * cost is 1-2 microsec). Do not steal if average idle time is less. @@ -11653,6 +11864,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; +#ifdef CONFIG_DTS + if (curr->by_pass != NONE_BY_PASS) { + se = &curr->dts_shared_se; + } +#endif + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); @@ -12148,6 +12365,81 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task return rr_interval; } +void update_before_bypass(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct sched_entity *curr; + struct cfs_rq *cfs_rq; + +#ifdef CONFIG_DTS + if (current->by_pass != NONE_BY_PASS) + curr = ¤t->dts_shared_se; + else +#endif + curr = ¤t->se; + + cfs_rq = cfs_rq_of(curr); + + rq_lock(rq, &rf); + update_rq_clock(rq); + + /* + * Ensure that runnable average is periodically updated. + */ + update_load_avg(cfs_rq, curr, UPDATE_TG); + update_cfs_group(curr); + + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + + /* + * Ensure that runnable average is periodically updated. + */ + update_load_avg(cfs_rq, curr, UPDATE_TG); + update_cfs_group(curr); + + rq_unlock(rq, &rf); +} + +/* + * return 1: left time Y + * + */ +int check_task_left_time(struct task_struct *task) +{ + unsigned long ideal_runtime, delta_exec; + struct sched_entity *se; + struct cfs_rq *cfs_rq; +#ifdef CONFIG_DTS + if (task->by_pass != NONE_BY_PASS) + se = &task->dts_shared_se; + else +#endif + se = &task->se; + + cfs_rq = cfs_rq_of(se); + + ideal_runtime = sched_slice(cfs_rq, se); + delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) { + if (cfs_rq->nr_running > 1) { + resched_curr(rq_of(cfs_rq)); + /* + * The current task ran long enough, ensure it doesn't get + * re-elected due to buddy favours. 
+ */ + clear_buddies(cfs_rq, se); + } + return 0; + } + + return 1; +} + /* * All the scheduling class methods: */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 976cdb36fe082dbccdfc1c3e2d12683012552357..456dfd0abf8da354c5a31bdc3998dd38f0ccb1b6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -534,6 +534,12 @@ extern void sched_offline_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); +#ifdef CONFIG_DTS +extern void replace_shared_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev_se, struct sched_entity *shared_se); +#endif + +extern void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se); + #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); @@ -1185,9 +1191,22 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define raw_rq() raw_cpu_ptr(&runqueues) #ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_DTS +static inline struct task_struct *task_of_dts_shared_se(struct sched_entity *dts_shared_se) +{ + SCHED_WARN_ON(!entity_is_task(dts_shared_se)); + return container_of(dts_shared_se, struct task_struct, dts_shared_se); +} +#endif + static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); +#ifdef CONFIG_DTS + if (se->by_pass != NONE_BY_PASS) + return task_of_dts_shared_se(se); + else +#endif return container_of(se, struct task_struct, se); } @@ -1210,8 +1229,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) #else +#ifdef CONFIG_DTS +static inline struct task_struct *task_of_dts_shared_se(struct sched_entity *dts_shared_se) +{ + return container_of(dts_shared_se, struct task_struct, dts_shared_se); +} + +static inline struct cfs_rq *cfs_rq_of_dts_shared_se(struct sched_entity *se) +{ + struct task_struct *p = task_of_dts_shared_se(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} +#endif + static inline struct task_struct *task_of(struct sched_entity *se) { +#ifdef CONFIG_DTS + if (se->by_pass != NONE_BY_PASS) + return task_of_dts_shared_se(se); + else +#endif return container_of(se, struct task_struct, se); } @@ -1220,7 +1259,7 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) return &task_rq(p)->cfs; } -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +static inline struct cfs_rq *cfs_rq_of_se(struct sched_entity *se) { struct task_struct *p = task_of(se); struct rq *rq = task_rq(p); @@ -1228,6 +1267,17 @@ static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) return &rq->cfs; } +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ +#ifdef CONFIG_DTS + if (se->by_pass != NONE_BY_PASS) + return cfs_rq_of_dts_shared_se(se); + else +#endif + return cfs_rq_of_se(se); + +} + /* runqueue "owned" by this group */ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) { @@ -2404,6 +2454,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); +extern void traverse_cfs_rq(struct cfs_rq *cfs_rq); #ifdef CONFIG_SCHED_DEBUG extern bool sched_debug_enabled; @@ -2789,3 +2840,18 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#ifdef CONFIG_DTS +extern void sched_submit_work(struct task_struct *tsk); +extern 
void sched_update_worker(struct task_struct *tsk); +extern struct rq *context_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next, struct rq_flags *rf); +extern void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_SCHED_STEAL +extern int steal_task(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, + struct task_struct *tsk); +extern void update_before_bypass(void); +extern void balance_callback(struct rq *rq); +#endif +#endif \ No newline at end of file diff --git a/tools/testing/selftests/futex/functional/futex_swap.c b/tools/testing/selftests/futex/functional/futex_swap.c index 9034d04372d3f34b45e1f76e853bcf81990d275f..8ce0266fcbc3d49c01e994b5666ea9a009f3b784 100644 --- a/tools/testing/selftests/futex/functional/futex_swap.c +++ b/tools/testing/selftests/futex/functional/futex_swap.c @@ -13,11 +13,11 @@ /* The futex the main thread waits on. */ futex_t futex_main = FUTEX_INITIALIZER; -/* The futex the other thread wats on. */ +/* The futex the other thread waits on. */ futex_t futex_other = FUTEX_INITIALIZER; /* The number of iterations to run (>1 => run benchmarks. */ -static int cfg_iterations = 1; +static int cfg_iterations = 5; /* If != 0, print diagnostic messages. */ static int cfg_verbose; @@ -28,17 +28,21 @@ static int cfg_validate = 1; /* How to swap threads. */ #define SWAP_WAKE_WAIT 1 #define SWAP_SWAP 2 +#define SWAP_SWAP_DTS 4 /* Futex values. */ #define FUTEX_WAITING 0 #define FUTEX_WAKEUP 1 +#define FUTEX_FLAGS_DTS_MODE 512 + /* An atomic counter used to validate proper swapping. */ static atomic_t validation_counter; void futex_swap_op(int mode, futex_t *futex_this, futex_t *futex_that) { int ret; + int flags = 0; switch (mode) { case SWAP_WAKE_WAIT: @@ -52,11 +56,14 @@ void futex_swap_op(int mode, futex_t *futex_this, futex_t *futex_that) } break; + case SWAP_SWAP_DTS: + flags |= FUTEX_FLAGS_DTS_MODE; case SWAP_SWAP: + flags |= FUTEX_PRIVATE_FLAG; futex_set(futex_this, FUTEX_WAITING); futex_set(futex_that, FUTEX_WAKEUP); ret = futex_swap(futex_this, FUTEX_WAITING, NULL, - futex_that, FUTEX_PRIVATE_FLAG); + futex_that, flags); if (ret < 0 && errno == ENOSYS) { /* futex_swap not implemented */ perror("futex_swap"); @@ -171,13 +178,14 @@ void usage(char *prog) printf(" -i N Use N iterations to benchmark\n"); printf(" -n Do not validate swapping correctness\n"); printf(" -v Print diagnostic messages\n"); + printf(" -d Benchmark with the direct-thread-switch(DTS) mechanism\n"); } int main(int argc, char *argv[]) { int c; - while ((c = getopt(argc, argv, "hi:nv")) != -1) { + while ((c = getopt(argc, argv, "hi:nvd")) != -1) { switch (c) { case 'h': usage(basename(argv[0])); @@ -191,6 +199,9 @@ int main(int argc, char *argv[]) case 'v': cfg_verbose = 1; break; + case 'd': + goto dts_test; + break; default: usage(basename(argv[0])); exit(1); @@ -205,5 +216,10 @@ int main(int argc, char *argv[]) run_test(SWAP_SWAP); printf("PASS\n"); +dts_test: + printf("\n\n---- running SWAP_SWAP with the direct-thread-switch(DTS) mechanism ----\n\n"); + run_test(SWAP_SWAP_DTS); + printf("PASS\n"); + return 0; }
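The selftest above drives futex_swap through the futextest helpers, so the raw futex() invocation never appears. The sketch below (illustrative only, not part of the patch) shows what the call looks like when a thread opts in to the direct-thread-switch path by OR-ing the new FUTEX_FLAGS_DTS_MODE flag into the futex operation; if the direct switch cannot be performed, the kernel falls back to a normal wakeup. The FUTEX_SWAP opcode value (13) is an assumption carried over from the futex_swap support this patch builds on, and futex_swap_raw()/worker() are hypothetical names used only for illustration.

/*
 * Minimal userspace sketch of a DTS-assisted handoff (assumptions noted above).
 * Build with: gcc -O2 -pthread dts_handoff.c
 */
#include <errno.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/futex.h>

#ifndef FUTEX_SWAP
#define FUTEX_SWAP		13	/* assumption: value used by the base futex_swap support */
#endif
#define FUTEX_FLAGS_DTS_MODE	512	/* new flag introduced by this patch */

static uint32_t futex_main, futex_other;	/* 0 = waiting, 1 = woken */

/* Sleep on *wait_on while it still reads @val and wake one waiter of *wake. */
static long futex_swap_raw(uint32_t *wait_on, uint32_t val, uint32_t *wake, int extra_flags)
{
	return syscall(SYS_futex, wait_on,
		       FUTEX_SWAP | FUTEX_PRIVATE_FLAG | extra_flags,
		       val, NULL, wake, 0);
}

static void *worker(void *arg)
{
	/* Hand the CPU directly to the main thread (DTS), then park on futex_other. */
	__atomic_store_n(&futex_main, 1, __ATOMIC_SEQ_CST);
	futex_swap_raw(&futex_other, 0, &futex_main, FUTEX_FLAGS_DTS_MODE);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);

	/* Wait for the worker's handoff; EAGAIN means it already happened. */
	if (syscall(SYS_futex, &futex_main, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
		    0, NULL, NULL, 0) < 0 && errno != EAGAIN)
		perror("FUTEX_WAIT");

	/* Release the worker: mark futex_other woken, then wake any sleeper on it. */
	__atomic_store_n(&futex_other, 1, __ATOMIC_SEQ_CST);
	syscall(SYS_futex, &futex_other, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
		1, NULL, NULL, 0);

	pthread_join(t, NULL);
	return 0;
}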