diff --git a/include/net/sock.h b/include/net/sock.h index 352f06bbd7a9b14b0257b8014d4ecf3027aad44e..4eb8409249f6dfbdec81708921fe48d24cea639d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -54,6 +54,7 @@ #include #include +#include #include #include @@ -1241,6 +1242,71 @@ static inline int sk_has_allocations(const struct sock *sk) return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); } +/** + * sk_has_sleeper - check if there are any waiting processes + * @sk: socket + * + * Returns true if socket has waiting processes + * + * The purpose of the sk_has_sleeper and sock_poll_wait is to wrap the memory + * barrier call. They were added due to the race found within the tcp code. + * + * Consider following tcp code paths: + * + * CPU1 CPU2 + * + * sys_select receive packet + * ... ... + * __add_wait_queue update tp->rcv_nxt + * ... ... + * tp->rcv_nxt check sock_def_readable + * ... { + * schedule ... + * if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + * wake_up_interruptible(sk->sk_sleep) + * ... + * } + * + * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay + * in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 + * could then endup calling schedule and sleep forever if there are no more + * data on the socket. + */ +static inline int sk_has_sleeper(struct sock *sk) +{ + /* + * We need to be sure we are in sync with the + * add_wait_queue modifications to the wait queue. + * + * This memory barrier is paired in the sock_poll_wait. + */ + smp_mb(); + return sk->sk_sleep && waitqueue_active(sk->sk_sleep); +} + +/** + * sock_poll_wait - place memory barrier behind the poll_wait call. + * @filp: file + * @wait_address: socket wait queue + * @p: poll_table + * + * See the comments in the sk_has_sleeper function. + */ +static inline void sock_poll_wait(struct file *filp, + wait_queue_head_t *wait_address, poll_table *p) +{ + if (p && wait_address) { + poll_wait(filp, wait_address, p); + /* + * We need to be sure we are in sync with the + * socket flags modification. + * + * This memory barrier is paired in the sk_has_sleeper. + */ + smp_mb(); + } +} + /* * Queue a received datagram if it will fit. Stream and sequenced * protocols can't normally use this as they need to fit buffers in diff --git a/net/atm/common.c b/net/atm/common.c index c1c97936192c730d8f68431a782f8695c43504a3..8c4d843eb17f46bbec50847aa1352580adcd1c26 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -92,7 +92,7 @@ static void vcc_sock_destruct(struct sock *sk) static void vcc_def_wakeup(struct sock *sk) { read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up(sk->sk_sleep); read_unlock(&sk->sk_callback_lock); } @@ -110,7 +110,7 @@ static void vcc_write_space(struct sock *sk) read_lock(&sk->sk_callback_lock); if (vcc_writable(sk)) { - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible(sk->sk_sleep); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); @@ -594,7 +594,7 @@ unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait) struct atm_vcc *vcc; unsigned int mask; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; vcc = ATM_SD(sock); diff --git a/net/core/datagram.c b/net/core/datagram.c index 58abee1f1df1b044514079f9245dc7c1b96978a3..b0fe69211eefef1e18224fa123d2b13498bffefa 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -712,7 +712,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; unsigned int mask; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; /* exceptional events? */ diff --git a/net/core/sock.c b/net/core/sock.c index b0ba569bc97361dec5b2619a46ffbeab50331dc2..6354863b1c687c1917388d26c439fc8eab737bf4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1715,7 +1715,7 @@ EXPORT_SYMBOL(sock_no_sendpage); static void sock_def_wakeup(struct sock *sk) { read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible_all(sk->sk_sleep); read_unlock(&sk->sk_callback_lock); } @@ -1723,7 +1723,7 @@ static void sock_def_wakeup(struct sock *sk) static void sock_def_error_report(struct sock *sk) { read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible_poll(sk->sk_sleep, POLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); read_unlock(&sk->sk_callback_lock); @@ -1732,7 +1732,7 @@ static void sock_def_error_report(struct sock *sk) static void sock_def_readable(struct sock *sk, int len) { read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); @@ -1747,7 +1747,7 @@ static void sock_def_write_space(struct sock *sk) * progress. --DaveM */ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT | POLLWRNORM | POLLWRBAND); diff --git a/net/dccp/output.c b/net/dccp/output.c index c0e88c16d0888b5222ca250c01a27bf3078a7e2a..c96119fda6885fd6b6bdc5487ff5cca7d2dd1574 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -196,7 +196,7 @@ void dccp_write_space(struct sock *sk) { read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible(sk->sk_sleep); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 314a1b5c033c88950ddf73eccf30a34bcaf60191..94ca8eaace7d05988059838bd5477831d643b65f 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -311,7 +311,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock, unsigned int mask; struct sock *sk = sock->sk; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); if (sk->sk_state == DCCP_LISTEN) return inet_csk_listen_poll(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7870a535dac6484ec7886dcf8e8270b729d3eaaa..91145244ea630777e7e4a2e7f112f3c31bc7f90e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -339,7 +339,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) struct sock *sk = sock->sk; struct tcp_sock *tp = tcp_sk(sk); - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); if (sk->sk_state == TCP_LISTEN) return inet_csk_listen_poll(sk); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 6be5f92d10943d383ce252242233e61770b2e528..49c15b48408e5617a5405ba396f1f9b9e427dd5d 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -306,7 +306,7 @@ static inline int iucv_below_msglim(struct sock *sk) static void iucv_sock_wake_msglim(struct sock *sk) { read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible_all(sk->sk_sleep); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); read_unlock(&sk->sk_callback_lock); @@ -1256,7 +1256,7 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; unsigned int mask = 0; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); if (sk->sk_state == IUCV_LISTEN) return iucv_accept_poll(sk); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index eac5e7bb73659c3b951bd7fc9f564e7d380e4d9b..bfe493ebf27c36a7a82974af5a596ee8f75528c9 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -63,7 +63,7 @@ static void rxrpc_write_space(struct sock *sk) _enter("%p", sk); read_lock(&sk->sk_callback_lock); if (rxrpc_writable(sk)) { - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible(sk->sk_sleep); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } @@ -588,7 +588,7 @@ static unsigned int rxrpc_poll(struct file *file, struct socket *sock, unsigned int mask; struct sock *sk = sock->sk; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; /* the socket is readable if there are any messages waiting on the Rx diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 36d4e44d62334275ffbfaa8e3b837e95bcdace5f..fc3ebb906911b955b33b39af1a6a2e729d4695c9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -315,7 +315,7 @@ static void unix_write_space(struct sock *sk) { read_lock(&sk->sk_callback_lock); if (unix_writable(sk)) { - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible_sync(sk->sk_sleep); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } @@ -1985,7 +1985,7 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table struct sock *sk = sock->sk; unsigned int mask; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; /* exceptional events? */ @@ -2022,7 +2022,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk, *other; unsigned int mask, writable; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; /* exceptional events? */ @@ -2053,7 +2053,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, other = unix_peer_get(sk); if (other) { if (unix_peer(other) != sk) { - poll_wait(file, &unix_sk(other)->peer_wait, + sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); if (unix_recvq_full(other)) writable = 0;