提交 b87ae678 编写于 作者: P Peter Oskolkov 提交者: briansun

futex: introduce FUTEX_SWAP operation

openeuler inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4L9RU
CVE: NA

-------------------

As Paul Turner presented at LPC in 2013 ...
- pdf: http://pdxplumbers.osuosl.org/2013/ocw//system/presentations/1653/original/LPC%20-%20User%20Threading.pdf
- video: https://www.youtube.com/watch?v=KXuZi9aeGTw

... Google has developed an M:N userspace threading subsystem backed
by Google-private SwitchTo Linux Kernel API (page 17 in the pdf referenced
above). This subsystem provides latency-sensitive services at Google with
fine-grained user-space control/scheduling over what is running when,
and this subsystem is used widely internally (called schedulers or fibers).

This patchset is the first step to open-source this work. As explained
in the linked pdf and video, SwitchTo API has three core operations: wait,
resume, and swap (=switch). So this patchset adds a FUTEX_SWAP operation
that, in addition to FUTEX_WAIT and FUTEX_WAKE, will provide a foundation
on top of which user-space threading libraries can be built.

Another common use case for FUTEX_SWAP is message passing a-la RPC
between tasks: task/thread T1 prepares a message,
wakes T2 to work on it, and waits for the results; when T2 is done, it
wakes T1 and waits for more work to arrive. Currently the simplest
way to implement this is

a. T1: futex-wake T2, futex-wait
b. T2: wakes, does what it has been woken to do
c. T2: futex-wake T1, futex-wait

With FUTEX_SWAP, steps a and c above can be reduced to one futex operation
that runs 5-10 times faster.

Patches in this patchset:

Patch 1: (this patch) introduce FUTEX_SWAP futex operation that,
         internally, does wake + wait. The purpose of this patch is
         to work out the API.
Patch 2: a first rough attempt to make FUTEX_SWAP faster than
         what wake + wait can do.
Patch 3: a selftest that can also be used to benchmark FUTEX_SWAP vs
         FUTEX_WAKE + FUTEX_WAIT.

Tested: see patch 3 in this patchset.
Signed-off-by: Peter Oskolkov <posk@google.com>
上级 79d1782a
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#define FUTEX_WAKE_BITSET 10 #define FUTEX_WAKE_BITSET 10
#define FUTEX_WAIT_REQUEUE_PI 11 #define FUTEX_WAIT_REQUEUE_PI 11
#define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_CMP_REQUEUE_PI 12
#define FUTEX_SWAP 13
#define FUTEX_PRIVATE_FLAG 128 #define FUTEX_PRIVATE_FLAG 128
#define FUTEX_CLOCK_REALTIME 256 #define FUTEX_CLOCK_REALTIME 256
...@@ -40,6 +41,7 @@ ...@@ -40,6 +41,7 @@
FUTEX_PRIVATE_FLAG) FUTEX_PRIVATE_FLAG)
#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
FUTEX_PRIVATE_FLAG) FUTEX_PRIVATE_FLAG)
#define FUTEX_SWAP_PRIVATE (FUTEX_SWAP | FUTEX_PRIVATE_FLAG)
/* /*
* Support for robust futexes: the kernel cleans up held futexes at * Support for robust futexes: the kernel cleans up held futexes at
......
...@@ -1181,7 +1181,7 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, ...@@ -1181,7 +1181,7 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
* tsk->futex_state = } else { * tsk->futex_state = } else {
* FUTEX_STATE_DEAD; if (tsk->futex_state != * FUTEX_STATE_DEAD; if (tsk->futex_state !=
* FUTEX_STATE_DEAD) * FUTEX_STATE_DEAD)
* return -EAGAIN; * return -EAGAIN;
* return -ESRCH; <--- FAIL * return -ESRCH; <--- FAIL
* } * }
* *
...@@ -1584,16 +1584,16 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) ...@@ -1584,16 +1584,16 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
} }
/* /*
* Wake up waiters matching bitset queued on this futex (uaddr). * Prepare wake queue matching bitset queued on this futex (uaddr).
*/ */
static int static int
futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) prepare_wake_q(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset,
struct wake_q_head *wake_q)
{ {
struct futex_hash_bucket *hb; struct futex_hash_bucket *hb;
struct futex_q *this, *next; struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT; union futex_key key = FUTEX_KEY_INIT;
int ret; int ret;
DEFINE_WAKE_Q(wake_q);
if (!bitset) if (!bitset)
return -EINVAL; return -EINVAL;
...@@ -1621,14 +1621,28 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) ...@@ -1621,14 +1621,28 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset)) if (!(this->bitset & bitset))
continue; continue;
mark_wake_futex(&wake_q, this); mark_wake_futex(wake_q, this);
if (++ret >= nr_wake) if (++ret >= nr_wake)
break; break;
} }
} }
spin_unlock(&hb->lock); spin_unlock(&hb->lock);
return ret;
}
/*
* Wake up waiters matching bitset queued on this futex (uaddr).
*/
static int
futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
int ret;
DEFINE_WAKE_Q(wake_q);
ret = prepare_wake_q(uaddr, flags, nr_wake, bitset, &wake_q);
wake_up_q(&wake_q); wake_up_q(&wake_q);
return ret; return ret;
} }
...@@ -2576,9 +2590,12 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) ...@@ -2576,9 +2590,12 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* @hb: the futex hash bucket, must be locked by the caller * @hb: the futex hash bucket, must be locked by the caller
* @q: the futex_q to queue up on * @q: the futex_q to queue up on
* @timeout: the prepared hrtimer_sleeper, or null for no timeout * @timeout: the prepared hrtimer_sleeper, or null for no timeout
* @next: if present, wake next and hint to the scheduler that we'd
* prefer to execute it locally.
*/ */
static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
struct hrtimer_sleeper *timeout) struct hrtimer_sleeper *timeout,
struct task_struct *next)
{ {
/* /*
* The task state is guaranteed to be set before another task can * The task state is guaranteed to be set before another task can
...@@ -2603,10 +2620,26 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, ...@@ -2603,10 +2620,26 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* flagged for rescheduling. Only call schedule if there * flagged for rescheduling. Only call schedule if there
* is no timeout, or if it has yet to expire. * is no timeout, or if it has yet to expire.
*/ */
if (!timeout || timeout->task) if (!timeout || timeout->task) {
if (next) {
/*
* wake_up_process() below will be replaced
* in the next patch with
* wake_up_process_prefer_current_cpu().
*/
wake_up_process(next);
put_task_struct(next);
next = NULL;
}
freezable_schedule(); freezable_schedule();
}
} }
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
if (next) {
wake_up_process(next);
put_task_struct(next);
}
} }
/** /**
...@@ -2682,7 +2715,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ...@@ -2682,7 +2715,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
} }
static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
ktime_t *abs_time, u32 bitset) ktime_t *abs_time, u32 bitset, struct task_struct *next)
{ {
struct hrtimer_sleeper timeout, *to; struct hrtimer_sleeper timeout, *to;
struct restart_block *restart; struct restart_block *restart;
...@@ -2706,7 +2739,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ...@@ -2706,7 +2739,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
goto out; goto out;
/* queue_me and wait for wakeup, timeout, or a signal. */ /* queue_me and wait for wakeup, timeout, or a signal. */
futex_wait_queue_me(hb, &q, to); futex_wait_queue_me(hb, &q, to, next);
next = NULL;
/* If we were woken (and unqueued), we succeeded, whatever. */ /* If we were woken (and unqueued), we succeeded, whatever. */
ret = 0; ret = 0;
...@@ -2738,6 +2772,10 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ...@@ -2738,6 +2772,10 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
ret = set_restart_fn(restart, futex_wait_restart); ret = set_restart_fn(restart, futex_wait_restart);
out: out:
if (next) {
wake_up_process(next);
put_task_struct(next);
}
if (to) { if (to) {
hrtimer_cancel(&to->timer); hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer); destroy_hrtimer_on_stack(&to->timer);
...@@ -2745,7 +2783,6 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ...@@ -2745,7 +2783,6 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
return ret; return ret;
} }
static long futex_wait_restart(struct restart_block *restart) static long futex_wait_restart(struct restart_block *restart)
{ {
u32 __user *uaddr = restart->futex.uaddr; u32 __user *uaddr = restart->futex.uaddr;
...@@ -2757,10 +2794,29 @@ static long futex_wait_restart(struct restart_block *restart) ...@@ -2757,10 +2794,29 @@ static long futex_wait_restart(struct restart_block *restart)
} }
restart->fn = do_no_restart_syscall; restart->fn = do_no_restart_syscall;
return (long)futex_wait(uaddr, restart->futex.flags, return (long)futex_wait(uaddr, restart->futex.flags, restart->futex.val,
restart->futex.val, tp, restart->futex.bitset); tp, restart->futex.bitset, NULL);
} }
/*
 * futex_swap() - wake one waiter on @uaddr2, then wait on @uaddr.
 *
 * Implements the FUTEX_SWAP operation: a combined FUTEX_WAKE(uaddr2, 1)
 * + FUTEX_WAIT(uaddr, val) intended for fast task-to-task switching
 * (see the commit message: a-la RPC ping-pong between two tasks).
 *
 * @uaddr:    futex the caller will wait on (current futex word must
 *            still equal @val, checked inside futex_wait()).
 * @flags:    futex flags (shared/private etc.), passed through to both
 *            the wake and the wait halves.
 * @val:      expected value of *@uaddr for the wait to proceed.
 * @abs_time: optional absolute timeout for the wait, or NULL.
 * @uaddr2:   futex whose (single) waiter gets woken.
 *
 * Returns 0 on a normal wakeup, or a negative error code from either
 * prepare_wake_q() or futex_wait().
 */
static int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val,
		      ktime_t *abs_time, u32 __user *uaddr2)
{
	u32 bitset = FUTEX_BITSET_MATCH_ANY;
	struct task_struct *next = NULL;
	DEFINE_WAKE_Q(wake_q);
	int ret;

	/*
	 * Collect at most one waiter from uaddr2 onto wake_q without
	 * actually waking it yet; the wakeup is deferred so it can be
	 * issued from inside futex_wait() right before we sleep
	 * (and, in the follow-up patch, preferably on the current CPU).
	 */
	ret = prepare_wake_q(uaddr2, flags, 1, bitset, &wake_q);
	if (ret < 0)
		return ret;

	if (!wake_q_empty(&wake_q)) {
		/* At most one wakee can be present. Pull it out. */
		next = container_of(wake_q.first, struct task_struct, wake_q);
		/*
		 * Detach the task from the wake_q list; mark_wake_futex()
		 * took a task reference, which futex_wait() drops via
		 * put_task_struct() after waking @next.
		 */
		next->wake_q.next = NULL;
	}

	/* Hand @next to the wait path: it wakes @next, then sleeps. */
	return futex_wait(uaddr, flags, val, abs_time, bitset, next);
}
/* /*
* Userspace tried a 0 -> TID atomic transition of the futex value * Userspace tried a 0 -> TID atomic transition of the futex value
...@@ -3222,7 +3278,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ...@@ -3222,7 +3278,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
} }
/* Queue the futex_q, drop the hb lock, wait for wakeup. */ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to); futex_wait_queue_me(hb, &q, to, NULL);
spin_lock(&hb->lock); spin_lock(&hb->lock);
ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
...@@ -3732,7 +3788,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ...@@ -3732,7 +3788,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
val3 = FUTEX_BITSET_MATCH_ANY; val3 = FUTEX_BITSET_MATCH_ANY;
fallthrough; fallthrough;
case FUTEX_WAIT_BITSET: case FUTEX_WAIT_BITSET:
return futex_wait(uaddr, flags, val, timeout, val3); return futex_wait(uaddr, flags, val, timeout, val3, NULL);
case FUTEX_WAKE: case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY; val3 = FUTEX_BITSET_MATCH_ANY;
fallthrough; fallthrough;
...@@ -3756,6 +3812,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ...@@ -3756,6 +3812,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
uaddr2); uaddr2);
case FUTEX_CMP_REQUEUE_PI: case FUTEX_CMP_REQUEUE_PI:
return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
case FUTEX_SWAP:
return futex_swap(uaddr, flags, val, timeout, uaddr2);
} }
return -ENOSYS; return -ENOSYS;
} }
...@@ -3772,7 +3830,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, ...@@ -3772,7 +3830,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_BITSET ||
cmd == FUTEX_WAIT_REQUEUE_PI)) { cmd == FUTEX_WAIT_REQUEUE_PI || cmd == FUTEX_SWAP)) {
if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
return -EFAULT; return -EFAULT;
if (get_timespec64(&ts, utime)) if (get_timespec64(&ts, utime))
...@@ -3781,7 +3839,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, ...@@ -3781,7 +3839,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return -EINVAL; return -EINVAL;
t = timespec64_to_ktime(ts); t = timespec64_to_ktime(ts);
if (cmd == FUTEX_WAIT) if (cmd == FUTEX_WAIT || cmd == FUTEX_SWAP)
t = ktime_add_safe(ktime_get(), t); t = ktime_add_safe(ktime_get(), t);
else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
t = timens_ktime_to_host(CLOCK_MONOTONIC, t); t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
...@@ -3968,14 +4026,14 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, ...@@ -3968,14 +4026,14 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_BITSET ||
cmd == FUTEX_WAIT_REQUEUE_PI)) { cmd == FUTEX_WAIT_REQUEUE_PI || cmd == FUTEX_SWAP)) {
if (get_old_timespec32(&ts, utime)) if (get_old_timespec32(&ts, utime))
return -EFAULT; return -EFAULT;
if (!timespec64_valid(&ts)) if (!timespec64_valid(&ts))
return -EINVAL; return -EINVAL;
t = timespec64_to_ktime(ts); t = timespec64_to_ktime(ts);
if (cmd == FUTEX_WAIT) if (cmd == FUTEX_WAIT || cmd == FUTEX_SWAP)
t = ktime_add_safe(ktime_get(), t); t = ktime_add_safe(ktime_get(), t);
else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
t = timens_ktime_to_host(CLOCK_MONOTONIC, t); t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册