diff --git a/include/linux/sched.h b/include/linux/sched.h
index 714386dd3e181e35e16deb0a87175fbb8e149006..9aa03c0957be5a5c3e13e0990e0375f1c63f399d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -68,6 +68,11 @@ struct task_delay_info;
 struct task_group;
 struct io_uring_task;
 
+#define NONE_BY_PASS 0x0000
+#define INIT_BY_PASS 0x0001
+#define IN_BY_PASS 0x0002
+#define END_BY_PASS 0x0004
+
 /*
  * Task state bitmask. NOTE! These bits are also
  * encoded in fs/proc/array.c: get_task_state().
@@ -500,6 +505,10 @@ struct sched_entity {
 	unsigned long			runnable_weight;
 #endif
 
+#ifdef CONFIG_DTS
+	int				by_pass;
+#endif
+
 #ifdef CONFIG_SMP
 	/*
 	 * Per entity load average tracking.
@@ -726,6 +735,15 @@ struct task_struct {
 	int				normal_prio;
 	unsigned int			rt_priority;
 
+#ifdef CONFIG_DTS
+	/*
+	 * by_pass indicates that the task was launched via a direct thread
+	 * switch. dts_shared_se is the sched_entity shared with the DTS task.
+	 */
+	int				by_pass;
+	struct sched_entity		dts_shared_se;
+#endif
+
 	const struct sched_class	*sched_class;
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;
@@ -2194,6 +2212,10 @@ static inline int sched_qos_cpu_overload(void)
 }
 #endif
 
+#ifdef CONFIG_DTS
+extern int check_task_left_time(struct task_struct *task);
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 extern void sched_settag(struct task_struct *tsk, s64 tag);
 
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index c1d151d97deaa3bfb760545b7475a455b6c6d3ec..e2ed0553046d8af5a970837440f01054a05d022c 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -25,7 +25,14 @@
 
 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CLOCK_REALTIME	256
+
+#ifdef CONFIG_DTS
+#define FUTEX_FLAGS_DTS_MODE	512
+#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | \
+				  FUTEX_FLAGS_DTS_MODE)
+#else
 #define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
+#endif
 
 #define FUTEX_WAIT_PRIVATE	(FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAKE_PRIVATE	(FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
@@ -43,6 +50,7 @@
 					 FUTEX_PRIVATE_FLAG)
 #define FUTEX_SWAP_PRIVATE	(FUTEX_SWAP | FUTEX_PRIVATE_FLAG)
 
+
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
  * thread exit time.
diff --git a/init/Kconfig b/init/Kconfig
index 1c607825c2dbd2a0ae5554ac8eef314209f80e55..e1be030628a3b40b09a92cce446476ed3e1c2a65 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1261,6 +1261,13 @@ config SCHED_STEAL
 
 	  If unsure, say N here.
 
+config DTS
+	bool "Direct Thread Switch"
+	default y
+	depends on SCHED_STEAL
+	help
+	  Enable the direct thread switch (DTS) mechanism for the futex_swap operation.
+
 config CHECKPOINT_RESTORE
 	bool "Checkpoint/restore support"
 	select PROC_CHILDREN
diff --git a/kernel/futex.c b/kernel/futex.c
index 42f55d1da678b04d4a298e0e1269d8fe0fa97a85..48da1e8b89480ae86bf19a2fc9b9272c9c7da1ca 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -39,11 +39,16 @@
 #include
 #include
 #include
+#include
+#include
 
 #include
 
 #include "locking/rtmutex_common.h"
 
+#ifdef CONFIG_DTS
+#include "sched/sched.h"
+#endif
 /*
  * READ this before attempting to hack on futexes!
  *
@@ -161,7 +166,7 @@ static int __read_mostly futex_cmpxchg_enabled;
  * NOMMU does not have per process address space. Let the compiler optimize
  * code away.
*/ -# define FLAGS_SHARED 0x00 +#define FLAGS_SHARED 0x00 #endif #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 @@ -2585,6 +2590,219 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) return 0; } +#ifdef CONFIG_DTS +static int __direct_thread_switch(struct task_struct *next) +{ + int cpu = smp_processor_id(); + int success = 1; + struct rq_flags rf; + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq = &rq->cfs; + struct task_struct *prev = rq->curr; + struct sched_entity *prev_se, *next_se; + unsigned long *switch_count = &prev->nvcsw; + unsigned long prev_state; + int next_state; + struct rq *src_rq_next; + bool locked; + + preempt_disable(); + local_irq_disable(); + + if (!prev->by_pass) { + prev_se = &prev->se; + } else { + prev_se = &prev->dts_shared_se; + } + + next_se = &next->se; + + prev->by_pass = NONE_BY_PASS; + next->by_pass = INIT_BY_PASS; + next->dts_shared_se = *prev_se; + prev_se->by_pass = NONE_BY_PASS; + next->dts_shared_se.by_pass = INIT_BY_PASS; + + /* task_struct::state is volatile so far */ + next_state = next->state; + src_rq_next = task_rq(next); + locked = true; + /* Deliver the execution to the callee. */ + if (next_state == TASK_RUNNING) { + /* The next is running now. */ + if (task_running(src_rq_next, next)) { + success = 0; + goto end; + } + /* The next task is runnable, and may stay in the current core's rq or other cores' rq. */ + /* Dequeue the next task's se (rather than dts_shared_se) to keep fairness and consistence. + * Enqueue the next task's se when the task expired. + */ + if (task_rq(next) != rq) { +#ifdef CONFIG_SCHED_STEAL + /* migrate */ + if (!steal_task(rq, &rf, &locked, next)) { + success = 0; + goto end; + } +#else + success = 0; + goto end; +#endif + } + replace_shared_entity(cfs_rq, next_se, &next->dts_shared_se); + } else if (next_state == TASK_INTERRUPTIBLE) { + /* + * + * The next task in the sleeping state caused by futex_swap, futex_wait, + * can be woken up here so far, but signals, and other interruptible situations + * need to be implemented here. + * P.S. We pick up the next task from the wake list of the corresponding futex_t. + */ + + /* Enqueue the shared_se and change the state without entering schedule() path. */ + if (!wake_up_process_prefer_current_cpu(next)) { + success = 0; + goto end; + } + + /* success to wakeup (set p->state = TASK_RUNNING) */ + /* dequeue the shared_se and set rq->curr = &next->dts_shared_se; */ + set_next_entity(cfs_rq, &next->dts_shared_se); + + } else { + success = 0; + goto end; + } + + /* increase rq->cfs.nr_running */ + cfs_rq->nr_running++; + + sched_submit_work(prev); + + rcu_note_context_switch(false); + + /* + * Make sure that signal_pending_state()->signal_pending() below + * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) + * done by the caller(futex_wait_queue_me) to avoid the race with signal_wake_up(): + * + * __set_current_state(@state) signal_wake_up() + * __direct_thread_switch() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_state + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) + * + * Also, the membarrier system call requires a full memory barrier + * after coming from user-space, before storing to rq->curr. + */ + rq_lock(rq, &rf); + smp_mb__after_spinlock(); + + /* + * We may fail to switch, so do not deactivate the current task before + * process the next. 
+ */ + + /* + * We must load prev->state once (task_struct::state is volatile), such + * that: + * + * - we form a control dependency vs deactivate_task() below. + * - ptrace_{,un}freeze_traced() can change ->state underneath us. + */ + prev_state = prev->state; + if (prev_state) { + if (signal_pending_state(prev_state, prev)) { + prev->state = TASK_RUNNING; + } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev->flags & PF_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ + deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); + + if (prev->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } + } + } + + rq->nr_switches++; + /* + * RCU users of rcu_dereference(rq->curr) may not see + * changes to task_struct made by pick_next_task(). + */ + RCU_INIT_POINTER(rq->curr, next); + /* + * The membarrier system call requires each architecture + * to have a full memory barrier after updating + * rq->curr, before returning to user-space. + * + * Here are the schemes providing that barrier on the + * various architectures: + * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. + * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. + * - finish_lock_switch() for weakly-ordered + * architectures where spin_unlock is a full barrier, + * - switch_to() for arm64 (weakly-ordered, spin_unlock + * is a RELEASE barrier), + */ + ++*switch_count; + + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + + trace_sched_switch(false, prev, next); + + /* do the get_task_struct() in the futex_wait_queue_me() before */ + put_task_struct(next); + + rq = context_switch(rq, prev, next, &rf); + + balance_callback(rq); + sched_update_worker(next); +end: + sched_preempt_enable_no_resched(); + return success; +} + +/* + * return + * 0 for fail + * 1 for succeed + */ +static int direct_thread_switch(struct task_struct *next) +{ + if (next->sched_class != &fair_sched_class || + current == next) { + return 0; + } + + if (!check_task_left_time(current)) { + return 0; + } + + return __direct_thread_switch(next); +} +#endif /* CONFIG_DTS */ + /** * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal * @hb: the futex hash bucket, must be locked by the caller @@ -2595,7 +2813,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout, - struct task_struct *next) + struct task_struct *next, int flags) { /* * The task state is guaranteed to be set before another task can @@ -2615,6 +2833,9 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { +#ifdef CONFIG_DTS + int do_dts_switch = 0; +#endif /* * If the timer has already expired, current will already be * flagged for rescheduling. 
Only call schedule if there @@ -2622,27 +2843,49 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, */ if (!timeout || timeout->task) { if (next) { +#ifdef CONFIG_DTS /* - * wake_up_process() below will be replaced - * in the next patch with - * wake_up_process_prefer_current_cpu(). + * If we fail to switch to the next task directly, try to switch to + * the next task in the traditional way. + * */ + if (flags & FUTEX_FLAGS_DTS_MODE) + do_dts_switch = direct_thread_switch(next); + + if (!do_dts_switch) +#endif + { #ifdef CONFIG_SMP - wake_up_process_prefer_current_cpu(next); + wake_up_process_prefer_current_cpu(next); #else - wake_up_process(next); + wake_up_process(next); +#endif + } + +#ifdef CONFIG_DTS + if (!do_dts_switch) #endif - put_task_struct(next); + put_task_struct(next); + next = NULL; } - freezable_schedule(); +#ifdef CONFIG_DTS + if (!do_dts_switch) +#endif + freezable_schedule(); } } __set_current_state(TASK_RUNNING); + + if (next) { +#ifdef CONFIG_DTS + direct_thread_switch(next); +#else wake_up_process(next); put_task_struct(next); +#endif } } @@ -2743,7 +2986,7 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, goto out; /* queue_me and wait for wakeup, timeout, or a signal. */ - futex_wait_queue_me(hb, &q, to, next); + futex_wait_queue_me(hb, &q, to, next, flags); next = NULL; /* If we were woken (and unqueued), we succeeded, whatever. */ @@ -2819,6 +3062,15 @@ static int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val, next->wake_q.next = NULL; } + /* Basic security test. (Are the two tasks in the same group?) */ + + /* Have any time slices to be used? */ + + /* + * The old one will go to sleep and enqueue the rq, meanwhile, get + * the new one to run. + */ + return futex_wait(uaddr, flags, val, abs_time, bitset, next); } @@ -3282,7 +3534,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, } /* Queue the futex_q, drop the hb lock, wait for wakeup. 
*/ - futex_wait_queue_me(hb, &q, to, NULL); + futex_wait_queue_me(hb, &q, to, NULL, flags); spin_lock(&hb->lock); ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); @@ -3768,6 +4020,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, int cmd = op & FUTEX_CMD_MASK; unsigned int flags = 0; +#ifdef CONFIG_DTS + if (op & FUTEX_FLAGS_DTS_MODE) { + flags |= FUTEX_FLAGS_DTS_MODE; + } +#endif + if (!(op & FUTEX_PRIVATE_FLAG)) flags |= FLAGS_SHARED; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5bf3553233819a40f40b4dc3ebec2b9910d4d8e5..a9093f6d98d9172acef1afde6ee86f51559d6eba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2469,7 +2469,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) { - check_preempt_curr(rq, p, wake_flags); +#ifdef CONFIG_DTS + if (p->by_pass != INIT_BY_PASS) +#endif + check_preempt_curr(rq, p, wake_flags); + p->state = TASK_RUNNING; trace_sched_wakeup(p); @@ -2996,7 +3000,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) out: if (success) ttwu_stat(p, task_cpu(p), wake_flags); - preempt_enable(); +#ifdef CONFIG_DTS + if (p->by_pass == INIT_BY_PASS) { + p->by_pass = IN_BY_PASS; + p->se.by_pass = IN_BY_PASS; + p->dts_shared_se.by_pass = IN_BY_PASS; + preempt_enable_no_resched(); + } + else +#endif + preempt_enable(); return success; } @@ -3086,6 +3099,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +#ifdef CONFIG_DTS + p->dts_shared_se.on_rq = 0; + p->dts_shared_se.exec_start = 0; + p->dts_shared_se.sum_exec_runtime = 0; + p->dts_shared_se.prev_sum_exec_runtime = 0; + p->dts_shared_se.nr_migrations = 0; + p->dts_shared_se.vruntime = 0; + INIT_LIST_HEAD(&p->dts_shared_se.group_node); +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; #endif @@ -3315,6 +3338,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); +#ifdef CONFIG_DTS + p->by_pass = NONE_BY_PASS; + p->se.by_pass = NONE_BY_PASS; + p->dts_shared_se.by_pass = NONE_BY_PASS; +#endif #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) @@ -3702,6 +3730,11 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } +#ifdef CONFIG_DTS + prev->by_pass = NONE_BY_PASS; + prev->se.by_pass = NONE_BY_PASS; + prev->dts_shared_se.by_pass = NONE_BY_PASS; +#endif if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -3744,7 +3777,7 @@ static void __balance_callback(struct rq *rq) raw_spin_unlock_irqrestore(&rq->lock, flags); } -static inline void balance_callback(struct rq *rq) +inline void balance_callback(struct rq *rq) { if (unlikely(rq->balance_callback)) __balance_callback(rq); @@ -3752,7 +3785,7 @@ static inline void balance_callback(struct rq *rq) #else -static inline void balance_callback(struct rq *rq) +inline void balance_callback(struct rq *rq) { } @@ -3789,7 +3822,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) /* * context_switch - switch to the new MM and the new thread's register state. 
 */
-static __always_inline struct rq *
+__always_inline struct rq *
 context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next, struct rq_flags *rf)
 {
@@ -3846,7 +3879,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	barrier();
 
 	return finish_task_switch(prev);
-}
+}
+EXPORT_SYMBOL(context_switch);
 
 /*
  * nr_running and nr_context_switches:
@@ -4615,7 +4649,7 @@ void __noreturn do_task_dead(void)
 		cpu_relax();
 }
 
-static inline void sched_submit_work(struct task_struct *tsk)
+inline void sched_submit_work(struct task_struct *tsk)
 {
 	unsigned int task_flags;
 
@@ -4651,7 +4685,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
 		blk_schedule_flush_plug(tsk);
 }
 
-static void sched_update_worker(struct task_struct *tsk)
+void sched_update_worker(struct task_struct *tsk)
 {
 	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
 		if (tsk->flags & PF_WQ_WORKER)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 30fbcd06baa90a73e2d5fd68e44fdcad325115d2..d58a949c95194f39a9da820fc7335d0bcfbc08a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -572,6 +572,28 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
 }
 
+static void __traverse_cfs_rq(struct cfs_rq *cfs_rq, struct rb_node **node)
+{
+	struct sched_entity *entry;
+
+	if (!*node) {
+		printk("TREE END\n");
+		return;
+	}
+
+	entry = rb_entry(*node, struct sched_entity, run_node);
+
+	__traverse_cfs_rq(cfs_rq, &(*node)->rb_left);
+	printk("%p\n", entry);
+	__traverse_cfs_rq(cfs_rq, &(*node)->rb_right);
+}
+
+void traverse_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
+	__traverse_cfs_rq(cfs_rq, link);
+}
+
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
@@ -2982,7 +3004,7 @@ adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
 }
 #endif
 
-static void
+void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
@@ -4340,7 +4362,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_stats_enqueue(cfs_rq, se, flags);
 	check_spread(cfs_rq, se);
 	if (!curr)
-		__enqueue_entity(cfs_rq, se);
+#ifdef CONFIG_DTS
+	if (se->by_pass != INIT_BY_PASS)
+#endif
+		__enqueue_entity(cfs_rq, se);
+
 	se->on_rq = 1;
 
 	/*
@@ -4463,6 +4489,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	unsigned long ideal_runtime, delta_exec;
 	struct sched_entity *se;
 	s64 delta;
+#ifdef CONFIG_DTS
+	struct task_struct *curr_task = NULL;
+
+	if (entity_is_task(curr) && curr->by_pass != NONE_BY_PASS)
+		curr_task = task_of_dts_shared_se(curr);
+#endif
 
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -4488,7 +4520,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		 * re-elected due to buddy favours.
 		 */
 		clear_buddies(cfs_rq, curr);
-		return;
+		goto end;
 	}
 
 	/*
@@ -4497,19 +4529,72 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	 * This also mitigates buddy induced latencies under load.
 	 */
 	if (delta_exec < sysctl_sched_min_granularity)
-		return;
+		goto end;
 
 	se = __pick_first_entity(cfs_rq);
 	delta = curr->vruntime - se->vruntime;
 
 	if (delta < 0)
-		return;
+		goto end;
 
-	if (delta > ideal_runtime)
+	if (delta > ideal_runtime) {
 		resched_curr(rq_of(cfs_rq));
+		goto end;
+	} else {
+		return;
+	}
+end:
+#ifdef CONFIG_DTS
+	if (curr_task) {
+		curr_task->by_pass = END_BY_PASS;
+		curr_task->se.by_pass = END_BY_PASS;
+		curr_task->dts_shared_se.by_pass = END_BY_PASS;
+	}
+#endif
 }
 
-static void
+#ifdef CONFIG_DTS
+/*
+ * Dequeue the task's original se, but do NOT change any of its scheduling
+ * information. Correspondingly, re-enqueue the original se, again without
+ * changing its information, once the shared se expires. // TODO
+ * Gathering stats for the shared se while the task runs in DTS mode still needs to be fixed. // TODO
+ */
+void
+replace_shared_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *shared_se)
+{
+	if (shared_se->on_rq) {
+		/*
+		 * Any task has to be enqueued before it gets to execute on
+		 * a CPU. So account for the time it spent waiting on the
+		 * runqueue.
+		 */
+		// TODO
+		update_stats_wait_end(cfs_rq, shared_se);
+		__dequeue_entity(cfs_rq, se); /* the next task's own se is what gets dequeued */
+		update_load_avg(cfs_rq, shared_se, UPDATE_TG);
+	}
+
+	update_stats_curr_start(cfs_rq, shared_se);
+	cfs_rq->curr = shared_se; /* subsequent update_curr() calls operate on cfs_rq->curr */
+
+	/*
+	 * Track our maximum slice length, if the CPU's load is at
+	 * least twice that of our own weight (i.e. don't track it
+	 * when there are only lesser-weight tasks around):
+	 */
+	if (schedstat_enabled() &&
+	    rq_of(cfs_rq)->cfs.load.weight >= 2*shared_se->load.weight) {
+		schedstat_set(shared_se->statistics.slice_max,
+			max((u64)schedstat_val(shared_se->statistics.slice_max),
+			shared_se->sum_exec_runtime - shared_se->prev_sum_exec_runtime));
+	}
+
+	shared_se->prev_sum_exec_runtime = shared_se->sum_exec_runtime;
+}
+#endif
+
+void
 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	/* 'current' is not kept within the tree. */
@@ -4605,8 +4690,15 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 
 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
+/* prev may point at either the task's own se or its shared DTS se. */
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
+#ifdef CONFIG_DTS
+	struct task_struct *task = NULL;
+
+	if (entity_is_task(prev))
+		task = task_of(prev);
+#endif
 	/*
 	 * If still on the runqueue then deactivate_task()
 	 * was not called and update_curr() has to be done:
@@ -4627,6 +4719,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		update_load_avg(cfs_rq, prev, 0);
 	}
 	cfs_rq->curr = NULL;
+#ifdef CONFIG_DTS
+	if (task && task->by_pass == END_BY_PASS) {
+		task->by_pass = NONE_BY_PASS;
+		task->se.by_pass = NONE_BY_PASS;
+		task->dts_shared_se.by_pass = NONE_BY_PASS;
+	}
+#endif
 }
 
 static void
@@ -5630,6 +5729,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int task_new = !(flags & ENQUEUE_WAKEUP);
 	unsigned int prev_nr = rq->cfs.h_nr_running;
 
+#ifdef CONFIG_DTS
+	if (p->by_pass != NONE_BY_PASS) {
+		se = &p->dts_shared_se;
+	}
+#endif
+
 	/*
 	 * The code below (indirectly) updates schedutil which looks at
 	 * the cfs_rq utilization to select a frequency.
@@ -5737,11 +5842,17 @@ static void set_next_buddy(struct sched_entity *se); static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; + struct sched_entity *se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); unsigned int prev_nr = rq->cfs.h_nr_running; bool was_sched_idle = sched_idle_rq(rq); +#ifdef CONFIG_DTS + if (p->by_pass != NONE_BY_PASS) + se = &p->dts_shared_se; + else +#endif + se = &p->se; util_est_dequeue(&rq->cfs, p); @@ -7159,11 +7270,28 @@ static void set_skip_buddy(struct sched_entity *se) static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; + struct sched_entity *se, *pse; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; +#ifdef CONFIG_DTS + int curr_by_pass = curr->by_pass; + int p_by_pass = p->by_pass; + + if (curr_by_pass != NONE_BY_PASS) + se = &curr->dts_shared_se; + else +#endif + se = &curr->se; + +#ifdef CONFIG_DTS + if (p_by_pass != NONE_BY_PASS) + pse = &p->dts_shared_se; + else +#endif + pse = &p->se; + if (unlikely(se == pse)) return; @@ -7718,13 +7846,25 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf p = task_of(se); + if (se == NULL) { + printk("CFS_RQ Nr_running: %d\n", rq->cfs.nr_running); + printk("RQ Nr_running: %d\n", rq->nr_running); + } + /* * Since we haven't yet done put_prev_entity and if the selected task * is a different task than we started out with, try and touch the * least amount of cfs_rqs. */ if (prev != p) { - struct sched_entity *pse = &prev->se; + struct sched_entity *pse; +#ifdef CONFIG_DTS + if (prev->by_pass != NONE_BY_PASS) + pse = &prev->dts_shared_se; + else +#endif + pse = &prev->se; + while (!(cfs_rq = is_same_group(se, pse))) { int se_depth = se->depth; @@ -7877,8 +8017,15 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq) */ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) { - struct sched_entity *se = &prev->se; + struct sched_entity *se; struct cfs_rq *cfs_rq; +#ifdef CONFIG_DTS + if (prev->by_pass != NONE_BY_PASS) + se = &prev->dts_shared_se; + else +#endif + se = &prev->se; + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -7895,7 +8042,13 @@ static void yield_task_fair(struct rq *rq) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *se = &curr->se; + struct sched_entity *se; +#ifdef CONFIG_DTS + if (curr->by_pass != NONE_BY_PASS) + se = &curr->dts_shared_se; + else +#endif + se = &curr->se; /* * Are we the only task in the tree? @@ -7926,6 +8079,13 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; +#ifdef CONFIG_DTS + /* DTS tasks DO NOT support being executed by yeild_to method.*/ + if (p->by_pass != NONE_BY_PASS) { + return false; + } +#endif + /* throttled hierarchies are not runnable */ if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; @@ -8363,7 +8523,7 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) /* * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu. 
*/ -static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) +void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) { lockdep_assert_held(&src_rq->lock); @@ -8573,6 +8733,10 @@ static void attach_task(struct rq *rq, struct task_struct *p) BUG_ON(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); + +#ifdef CONFIG_DTS + if (p->by_pass != INIT_BY_PASS) +#endif check_preempt_curr(rq, p, 0); } @@ -11544,6 +11708,53 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, return stolen; } +int steal_task(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, + struct task_struct *tsk) +{ + struct rq_flags rf; + int stolen = 0; + int dst_cpu = dst_rq->cpu; + struct rq *src_rq = task_rq(tsk); + int src_cpu = task_cpu(tsk); + + if (!steal_enabled()) + return 0; + + if (!cpu_active(dst_cpu)) + return 0; + + if (dst_cpu == src_cpu) + return 0; + + if (*locked) { + rq_unpin_lock(dst_rq, dst_rf); + raw_spin_unlock(&dst_rq->lock); + *locked = false; + } + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (!cpu_active(src_cpu)) + tsk = NULL; + else + detach_task(tsk, src_rq, dst_cpu); + + rq_unlock(src_rq, &rf); + + if (tsk) { + raw_spin_lock(&dst_rq->lock); + rq_repin_lock(dst_rq, dst_rf); + *locked = true; + update_rq_clock(dst_rq); + attach_task(dst_rq, tsk); + stolen = 1; + schedstat_inc(dst_rq->steal); + } + local_irq_restore(rf.flags); + + return stolen; +} + /* * Conservative upper bound on the max cost of a steal, in nsecs (the typical * cost is 1-2 microsec). Do not steal if average idle time is less. @@ -11653,6 +11864,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; +#ifdef CONFIG_DTS + if (curr->by_pass != NONE_BY_PASS) { + se = &curr->dts_shared_se; + } +#endif + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); @@ -12148,6 +12365,81 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task return rr_interval; } +void update_before_bypass(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct sched_entity *curr; + struct cfs_rq *cfs_rq; + +#ifdef CONFIG_DTS + if (current->by_pass != NONE_BY_PASS) + curr = ¤t->dts_shared_se; + else +#endif + curr = ¤t->se; + + cfs_rq = cfs_rq_of(curr); + + rq_lock(rq, &rf); + update_rq_clock(rq); + + /* + * Ensure that runnable average is periodically updated. + */ + update_load_avg(cfs_rq, curr, UPDATE_TG); + update_cfs_group(curr); + + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + + /* + * Ensure that runnable average is periodically updated. + */ + update_load_avg(cfs_rq, curr, UPDATE_TG); + update_cfs_group(curr); + + rq_unlock(rq, &rf); +} + +/* + * return 1: left time Y + * + */ +int check_task_left_time(struct task_struct *task) +{ + unsigned long ideal_runtime, delta_exec; + struct sched_entity *se; + struct cfs_rq *cfs_rq; +#ifdef CONFIG_DTS + if (task->by_pass != NONE_BY_PASS) + se = &task->dts_shared_se; + else +#endif + se = &task->se; + + cfs_rq = cfs_rq_of(se); + + ideal_runtime = sched_slice(cfs_rq, se); + delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) { + if (cfs_rq->nr_running > 1) { + resched_curr(rq_of(cfs_rq)); + /* + * The current task ran long enough, ensure it doesn't get + * re-elected due to buddy favours. 
+ */ + clear_buddies(cfs_rq, se); + } + return 0; + } + + return 1; +} + /* * All the scheduling class methods: */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 976cdb36fe082dbccdfc1c3e2d12683012552357..456dfd0abf8da354c5a31bdc3998dd38f0ccb1b6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -534,6 +534,12 @@ extern void sched_offline_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); +#ifdef CONFIG_DTS +extern void replace_shared_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev_se, struct sched_entity *shared_se); +#endif + +extern void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se); + #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); @@ -1185,9 +1191,22 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define raw_rq() raw_cpu_ptr(&runqueues) #ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_DTS +static inline struct task_struct *task_of_dts_shared_se(struct sched_entity *dts_shared_se) +{ + SCHED_WARN_ON(!entity_is_task(dts_shared_se)); + return container_of(dts_shared_se, struct task_struct, dts_shared_se); +} +#endif + static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); +#ifdef CONFIG_DTS + if (se->by_pass != NONE_BY_PASS) + return task_of_dts_shared_se(se); + else +#endif return container_of(se, struct task_struct, se); } @@ -1210,8 +1229,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) #else +#ifdef CONFIG_DTS +static inline struct task_struct *task_of_dts_shared_se(struct sched_entity *dts_shared_se) +{ + return container_of(dts_shared_se, struct task_struct, dts_shared_se); +} + +static inline struct cfs_rq *cfs_rq_of_dts_shared_se(struct sched_entity *se) +{ + struct task_struct *p = task_of_dts_shared_se(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} +#endif + static inline struct task_struct *task_of(struct sched_entity *se) { +#ifdef CONFIG_DTS + if (se->by_pass != NONE_BY_PASS) + return task_of_dts_shared_se(se); + else +#endif return container_of(se, struct task_struct, se); } @@ -1220,7 +1259,7 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) return &task_rq(p)->cfs; } -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +static inline struct cfs_rq *cfs_rq_of_se(struct sched_entity *se) { struct task_struct *p = task_of(se); struct rq *rq = task_rq(p); @@ -1228,6 +1267,17 @@ static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) return &rq->cfs; } +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ +#ifdef CONFIG_DTS + if (se->by_pass != NONE_BY_PASS) + return cfs_rq_of_dts_shared_se(se); + else +#endif + return cfs_rq_of_se(se); + +} + /* runqueue "owned" by this group */ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) { @@ -2404,6 +2454,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); +extern void traverse_cfs_rq(struct cfs_rq *cfs_rq); #ifdef CONFIG_SCHED_DEBUG extern bool sched_debug_enabled; @@ -2789,3 +2840,18 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#ifdef CONFIG_DTS +extern void sched_submit_work(struct task_struct *tsk); +extern 
void sched_update_worker(struct task_struct *tsk); +extern struct rq *context_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next, struct rq_flags *rf); +extern void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_SCHED_STEAL +extern int steal_task(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, + struct task_struct *tsk); +extern void update_before_bypass(void); +extern void balance_callback(struct rq *rq); +#endif +#endif \ No newline at end of file diff --git a/tools/testing/selftests/futex/functional/futex_swap.c b/tools/testing/selftests/futex/functional/futex_swap.c index 9034d04372d3f34b45e1f76e853bcf81990d275f..8ce0266fcbc3d49c01e994b5666ea9a009f3b784 100644 --- a/tools/testing/selftests/futex/functional/futex_swap.c +++ b/tools/testing/selftests/futex/functional/futex_swap.c @@ -13,11 +13,11 @@ /* The futex the main thread waits on. */ futex_t futex_main = FUTEX_INITIALIZER; -/* The futex the other thread wats on. */ +/* The futex the other thread waits on. */ futex_t futex_other = FUTEX_INITIALIZER; /* The number of iterations to run (>1 => run benchmarks. */ -static int cfg_iterations = 1; +static int cfg_iterations = 5; /* If != 0, print diagnostic messages. */ static int cfg_verbose; @@ -28,17 +28,21 @@ static int cfg_validate = 1; /* How to swap threads. */ #define SWAP_WAKE_WAIT 1 #define SWAP_SWAP 2 +#define SWAP_SWAP_DTS 4 /* Futex values. */ #define FUTEX_WAITING 0 #define FUTEX_WAKEUP 1 +#define FUTEX_FLAGS_DTS_MODE 512 + /* An atomic counter used to validate proper swapping. */ static atomic_t validation_counter; void futex_swap_op(int mode, futex_t *futex_this, futex_t *futex_that) { int ret; + int flags = 0; switch (mode) { case SWAP_WAKE_WAIT: @@ -52,11 +56,14 @@ void futex_swap_op(int mode, futex_t *futex_this, futex_t *futex_that) } break; + case SWAP_SWAP_DTS: + flags |= FUTEX_FLAGS_DTS_MODE; case SWAP_SWAP: + flags |= FUTEX_PRIVATE_FLAG; futex_set(futex_this, FUTEX_WAITING); futex_set(futex_that, FUTEX_WAKEUP); ret = futex_swap(futex_this, FUTEX_WAITING, NULL, - futex_that, FUTEX_PRIVATE_FLAG); + futex_that, flags); if (ret < 0 && errno == ENOSYS) { /* futex_swap not implemented */ perror("futex_swap"); @@ -171,13 +178,14 @@ void usage(char *prog) printf(" -i N Use N iterations to benchmark\n"); printf(" -n Do not validate swapping correctness\n"); printf(" -v Print diagnostic messages\n"); + printf(" -d Benchmark with the direct-thread-switch(DTS) mechanism\n"); } int main(int argc, char *argv[]) { int c; - while ((c = getopt(argc, argv, "hi:nv")) != -1) { + while ((c = getopt(argc, argv, "hi:nvd")) != -1) { switch (c) { case 'h': usage(basename(argv[0])); @@ -191,6 +199,9 @@ int main(int argc, char *argv[]) case 'v': cfg_verbose = 1; break; + case 'd': + goto dts_test; + break; default: usage(basename(argv[0])); exit(1); @@ -205,5 +216,10 @@ int main(int argc, char *argv[]) run_test(SWAP_SWAP); printf("PASS\n"); +dts_test: + printf("\n\n---- running SWAP_SWAP with the direct-thread-switch(DTS) mechanism ----\n\n"); + run_test(SWAP_SWAP_DTS); + printf("PASS\n"); + return 0; }
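The selftest above drives futex_swap through the futextest helpers, so the raw futex() invocation never appears. The sketch below (illustrative only, not part of the patch) shows what the call looks like when a thread opts in to the direct-thread-switch path by OR-ing the new FUTEX_FLAGS_DTS_MODE flag into the futex operation; if the direct switch cannot be performed, the kernel falls back to a normal wakeup. The FUTEX_SWAP opcode value (13) is an assumption carried over from the futex_swap support this patch builds on, and futex_swap_raw()/worker() are hypothetical names used only for illustration.

/*
 * Minimal userspace sketch of a DTS-assisted handoff (assumptions noted above).
 * Build with: gcc -O2 -pthread dts_handoff.c
 */
#include <errno.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/futex.h>

#ifndef FUTEX_SWAP
#define FUTEX_SWAP		13	/* assumption: value used by the base futex_swap support */
#endif
#define FUTEX_FLAGS_DTS_MODE	512	/* new flag introduced by this patch */

static uint32_t futex_main, futex_other;	/* 0 = waiting, 1 = woken */

/* Sleep on *wait_on while it still reads @val and wake one waiter of *wake. */
static long futex_swap_raw(uint32_t *wait_on, uint32_t val, uint32_t *wake, int extra_flags)
{
	return syscall(SYS_futex, wait_on,
		       FUTEX_SWAP | FUTEX_PRIVATE_FLAG | extra_flags,
		       val, NULL, wake, 0);
}

static void *worker(void *arg)
{
	/* Hand the CPU directly to the main thread (DTS), then park on futex_other. */
	__atomic_store_n(&futex_main, 1, __ATOMIC_SEQ_CST);
	futex_swap_raw(&futex_other, 0, &futex_main, FUTEX_FLAGS_DTS_MODE);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);

	/* Wait for the worker's handoff; EAGAIN means it already happened. */
	if (syscall(SYS_futex, &futex_main, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
		    0, NULL, NULL, 0) < 0 && errno != EAGAIN)
		perror("FUTEX_WAIT");

	/* Release the worker: mark futex_other woken, then wake any sleeper on it. */
	__atomic_store_n(&futex_other, 1, __ATOMIC_SEQ_CST);
	syscall(SYS_futex, &futex_other, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
		1, NULL, NULL, 0);

	pthread_join(t, NULL);
	return 0;
}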