提交 6771bfd9 编写于 作者: F Florian Westphal 提交者: David S. Miller

mptcp: update mptcp ack sequence from work queue

If userspace is not reading data, all the mptcp-level acks contain the
ack_seq from the last time userspace read data rather than the most
recent in-sequence value.

This causes pointless retransmissions for data that is already queued.

The reason for this is that all the mptcp protocol level processing
happens at mptcp_recv time.

This adds work queue to move skbs from the subflow sockets receive
queue on the mptcp socket receive queue (which was not used so far).

This allows us to announce the correct mptcp ack sequence in a timely
fashion, even when the application does not call recv() on the mptcp socket
for some time.

We still wake userspace tasks waiting for POLLIN immediately:
If the mptcp level receive queue is empty (because the work queue is
still pending) it can be filled from in-sequence subflow sockets at
recv time without a need to wait for the worker.

The skb_orphan when moving skbs from subflow to mptcp level is needed,
because the destructor (sock_rfree) relies on skb->sk (ssk!) lock
being taken.

A followup patch will add needed rmem accouting for the moved skbs.

Other problem: In case application behaves as expected, and calls
recv() as soon as mptcp socket becomes readable, the work queue will
only waste cpu cycles.  This will also be addressed in followup patches.
Signed-off-by: NFlorian Westphal <fw@strlen.de>
Reviewed-by: NMat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
上级 80992017
...@@ -31,6 +31,12 @@ struct mptcp6_sock { ...@@ -31,6 +31,12 @@ struct mptcp6_sock {
}; };
#endif #endif
struct mptcp_skb_cb {
u32 offset;
};
#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
* completed yet or has failed, return the subflow socket. * completed yet or has failed, return the subflow socket.
* Otherwise return NULL. * Otherwise return NULL.
...@@ -111,11 +117,88 @@ static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk) ...@@ -111,11 +117,88 @@ static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
return NULL; return NULL;
} }
static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
struct sk_buff *skb,
unsigned int offset, size_t copy_len)
{
struct sock *sk = (struct sock *)msk;
__skb_unlink(skb, &ssk->sk_receive_queue);
skb_orphan(skb);
__skb_queue_tail(&sk->sk_receive_queue, skb);
msk->ack_seq += copy_len;
MPTCP_SKB_CB(skb)->offset = offset;
}
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct sock *ssk,
unsigned int *bytes)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
unsigned int moved = 0;
bool more_data_avail;
struct tcp_sock *tp;
bool done = false;
tp = tcp_sk(ssk);
do {
u32 map_remaining, offset;
u32 seq = tp->copied_seq;
struct sk_buff *skb;
bool fin;
/* try to move as much data as available */
map_remaining = subflow->map_data_len -
mptcp_subflow_get_map_offset(subflow);
skb = skb_peek(&ssk->sk_receive_queue);
if (!skb)
break;
offset = seq - TCP_SKB_CB(skb)->seq;
fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
if (fin) {
done = true;
seq++;
}
if (offset < skb->len) {
size_t len = skb->len - offset;
if (tp->urg_data)
done = true;
__mptcp_move_skb(msk, ssk, skb, offset, len);
seq += len;
moved += len;
if (WARN_ON_ONCE(map_remaining < len))
break;
} else {
WARN_ON_ONCE(!fin);
sk_eat_skb(ssk, skb);
done = true;
}
WRITE_ONCE(tp->copied_seq, seq);
more_data_avail = mptcp_subflow_data_available(ssk);
} while (more_data_avail);
*bytes = moved;
return done;
}
void mptcp_data_ready(struct sock *sk) void mptcp_data_ready(struct sock *sk)
{ {
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
set_bit(MPTCP_DATA_READY, &msk->flags); set_bit(MPTCP_DATA_READY, &msk->flags);
if (schedule_work(&msk->work))
sock_hold((struct sock *)msk);
sk->sk_data_ready(sk); sk->sk_data_ready(sk);
} }
...@@ -373,19 +456,68 @@ static void mptcp_wait_data(struct sock *sk, long *timeo) ...@@ -373,19 +456,68 @@ static void mptcp_wait_data(struct sock *sk, long *timeo)
remove_wait_queue(sk_sleep(sk), &wait); remove_wait_queue(sk_sleep(sk), &wait);
} }
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
struct msghdr *msg,
size_t len)
{
struct sock *sk = (struct sock *)msk;
struct sk_buff *skb;
int copied = 0;
while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
u32 offset = MPTCP_SKB_CB(skb)->offset;
u32 data_len = skb->len - offset;
u32 count = min_t(size_t, len - copied, data_len);
int err;
err = skb_copy_datagram_msg(skb, offset, msg, count);
if (unlikely(err < 0)) {
if (!copied)
return err;
break;
}
copied += count;
if (count < data_len) {
MPTCP_SKB_CB(skb)->offset += count;
break;
}
__skb_unlink(skb, &sk->sk_receive_queue);
__kfree_skb(skb);
if (copied >= len)
break;
}
return copied;
}
static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
unsigned int moved = 0;
bool done;
do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
if (!ssk)
break;
lock_sock(ssk);
done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
release_sock(ssk);
} while (!done);
return moved > 0;
}
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len) int nonblock, int flags, int *addr_len)
{ {
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_subflow_context *subflow;
bool more_data_avail = false;
struct mptcp_read_arg arg;
read_descriptor_t desc;
bool wait_data = false;
struct socket *ssock; struct socket *ssock;
struct tcp_sock *tp;
bool done = false;
struct sock *ssk;
int copied = 0; int copied = 0;
int target; int target;
long timeo; long timeo;
...@@ -403,65 +535,26 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ...@@ -403,65 +535,26 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
return copied; return copied;
} }
arg.msg = msg;
desc.arg.data = &arg;
desc.error = 0;
timeo = sock_rcvtimeo(sk, nonblock); timeo = sock_rcvtimeo(sk, nonblock);
len = min_t(size_t, len, INT_MAX); len = min_t(size_t, len, INT_MAX);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
while (!done) { while (len > (size_t)copied) {
u32 map_remaining;
int bytes_read; int bytes_read;
ssk = mptcp_subflow_recv_lookup(msk); bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
pr_debug("msk=%p ssk=%p", msk, ssk); if (unlikely(bytes_read < 0)) {
if (!ssk) if (!copied)
goto wait_for_data; copied = bytes_read;
goto out_err;
}
subflow = mptcp_subflow_ctx(ssk); copied += bytes_read;
tp = tcp_sk(ssk);
lock_sock(ssk); if (skb_queue_empty(&sk->sk_receive_queue) &&
do { __mptcp_move_skbs(msk))
/* try to read as much data as available */ continue;
map_remaining = subflow->map_data_len -
mptcp_subflow_get_map_offset(subflow);
desc.count = min_t(size_t, len - copied, map_remaining);
pr_debug("reading %zu bytes, copied %d", desc.count,
copied);
bytes_read = tcp_read_sock(ssk, &desc,
mptcp_read_actor);
if (bytes_read < 0) {
if (!copied)
copied = bytes_read;
done = true;
goto next;
}
pr_debug("msk ack_seq=%llx -> %llx", msk->ack_seq,
msk->ack_seq + bytes_read);
msk->ack_seq += bytes_read;
copied += bytes_read;
if (copied >= len) {
done = true;
goto next;
}
if (tp->urg_data && tp->urg_seq == tp->copied_seq) {
pr_err("Urgent data present, cannot proceed");
done = true;
goto next;
}
next:
more_data_avail = mptcp_subflow_data_available(ssk);
} while (more_data_avail && !done);
release_sock(ssk);
continue;
wait_for_data:
more_data_avail = false;
/* only the master socket status is relevant here. The exit /* only the master socket status is relevant here. The exit
* conditions mirror closely tcp_recvmsg() * conditions mirror closely tcp_recvmsg()
...@@ -502,26 +595,25 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ...@@ -502,26 +595,25 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
} }
pr_debug("block timeout %ld", timeo); pr_debug("block timeout %ld", timeo);
wait_data = true;
mptcp_wait_data(sk, &timeo); mptcp_wait_data(sk, &timeo);
if (unlikely(__mptcp_tcp_fallback(msk))) if (unlikely(__mptcp_tcp_fallback(msk)))
goto fallback; goto fallback;
} }
if (more_data_avail) { if (skb_queue_empty(&sk->sk_receive_queue)) {
if (!test_bit(MPTCP_DATA_READY, &msk->flags)) /* entire backlog drained, clear DATA_READY. */
set_bit(MPTCP_DATA_READY, &msk->flags);
} else if (!wait_data) {
clear_bit(MPTCP_DATA_READY, &msk->flags); clear_bit(MPTCP_DATA_READY, &msk->flags);
/* .. race-breaker: ssk might get new data after last /* .. race-breaker: ssk might have gotten new data
* data_available() returns false. * after last __mptcp_move_skbs() returned false.
*/ */
ssk = mptcp_subflow_recv_lookup(msk); if (unlikely(__mptcp_move_skbs(msk)))
if (unlikely(ssk))
set_bit(MPTCP_DATA_READY, &msk->flags); set_bit(MPTCP_DATA_READY, &msk->flags);
} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
/* data to read but mptcp_wait_data() cleared DATA_READY */
set_bit(MPTCP_DATA_READY, &msk->flags);
} }
out_err:
release_sock(sk); release_sock(sk);
return copied; return copied;
} }
...@@ -557,7 +649,7 @@ static void mptcp_worker(struct work_struct *work) ...@@ -557,7 +649,7 @@ static void mptcp_worker(struct work_struct *work)
struct sock *sk = &msk->sk.icsk_inet.sk; struct sock *sk = &msk->sk.icsk_inet.sk;
lock_sock(sk); lock_sock(sk);
__mptcp_move_skbs(msk);
release_sock(sk); release_sock(sk);
sock_put(sk); sock_put(sk);
} }
...@@ -638,6 +730,8 @@ static void mptcp_close(struct sock *sk, long timeout) ...@@ -638,6 +730,8 @@ static void mptcp_close(struct sock *sk, long timeout)
mptcp_cancel_work(sk); mptcp_cancel_work(sk);
__skb_queue_purge(&sk->sk_receive_queue);
sk_common_release(sk); sk_common_release(sk);
} }
...@@ -1204,6 +1298,8 @@ void mptcp_proto_init(void) ...@@ -1204,6 +1298,8 @@ void mptcp_proto_init(void)
panic("Failed to register MPTCP proto.\n"); panic("Failed to register MPTCP proto.\n");
inet_register_protosw(&mptcp_protosw); inet_register_protosw(&mptcp_protosw);
BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
} }
#if IS_ENABLED(CONFIG_MPTCP_IPV6) #if IS_ENABLED(CONFIG_MPTCP_IPV6)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册