tcp: defer SACK compression after DupThresh

[ Upstream commit 86de5921 ] Jean-Louis reported a TCP regression and bisected to recent SACK compression. After a loss episode (receiver not able to keep up and dropping packets because its backlog is full), linux TCP stack is sending a single SACK (DUPACK). Sender waits a full RTO timer before recovering losses. While RFC 6675 says in section 5, "Algorithm Details", (2) If DupAcks < DupThresh but IsLost (HighACK + 1) returns true -- indicating at least three segments have arrived above the current cumulative acknowledgment point, which is taken to indicate loss -- go to step (4). ... (4) Invoke fast retransmit and enter loss recovery as follows: there are old TCP stacks not implementing this strategy, and still counting the dupacks before starting fast retransmit. While these stacks probably perform poorly when receivers implement LRO/GRO, we should be a little more gentle to them. This patch makes sure we do not enable SACK compression unless 3 dupacks have been sent since last rcv_nxt update. Ideally we should even rearm the timer to send one or two more DUPACK if no more packets are coming, but that will be work aiming for linux-4.21. Many thanks to Jean-Louis for bisecting the issue, providing packet captures and testing this patch. Fixes: 5d9f4262 ("tcp: add SACK compression") Reported-by: N Jean-Louis Dupond <jean-louis@dupond.be> Tested-by: N Jean-Louis Dupond <jean-louis@dupond.be> Signed-off-by: N Eric Dumazet <edumazet@google.com> Acked-by: N Neal Cardwell <ncardwell@google.com> Signed-off-by: N David S. Miller <davem@davemloft.net> Signed-off-by: N Greg Kroah-Hartman <gregkh@linuxfoundation.org>

tcp: defer SACK compression after DupThresh
[ Upstream commit 86de5921 ] Jean-Louis reported a TCP regression and bisected to recent SACK compression. After a loss episode (receiver not able to keep up and dropping packets because its backlog is full), linux TCP stack is sending a single SACK (DUPACK). Sender waits a full RTO timer before recovering losses. While RFC 6675 says in section 5, "Algorithm Details", (2) If DupAcks < DupThresh but IsLost (HighACK + 1) returns true -- indicating at least three segments have arrived above the current cumulative acknowledgment point, which is taken to indicate loss -- go to step (4). ... (4) Invoke fast retransmit and enter loss recovery as follows: there are old TCP stacks not implementing this strategy, and still counting the dupacks before starting fast retransmit. While these stacks probably perform poorly when receivers implement LRO/GRO, we should be a little more gentle to them. This patch makes sure we do not enable SACK compression unless 3 dupacks have been sent since last rcv_nxt update. Ideally we should even rearm the timer to send one or two more DUPACK if no more packets are coming, but that will be work aiming for linux-4.21. Many thanks to Jean-Louis for bisecting the issue, providing packet captures and testing this patch. Fixes: 5d9f4262 ("tcp: add SACK compression") Reported-by: N Jean-Louis Dupond <jean-louis@dupond.be> Tested-by: N Jean-Louis Dupond <jean-louis@dupond.be> Signed-off-by: N Eric Dumazet <edumazet@google.com> Acked-by: N Neal Cardwell <ncardwell@google.com> Signed-off-by: N David S. Miller <davem@davemloft.net> Signed-off-by: N Greg Kroah-Hartman <gregkh@linuxfoundation.org>
aaa7e45c · Eric Dumazet · Greg Kroah-Hartman · b8e07695 · aaa7e45c · aaa7e45c
Showing with 17 addition and 6 deletion

include/linux/tcp.h include/linux/tcp.h +1 -0

net/ipv4/tcp_input.c net/ipv4/tcp_input.c +12 -2

net/ipv4/tcp_output.c net/ipv4/tcp_output.c +3 -3

net/ipv4/tcp_timer.c net/ipv4/tcp_timer.c +1 -1

未找到文件。
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -196,6 +196,7 @@ struct tcp_sock {
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
 	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */
 	u32	last_oow_ack_time;  /* timestamp of last out-of-window ACK */
+	u32	compressed_ack_rcv_nxt;

 	u32	tsoffset;	/* timestamp offset */


--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4276,7 +4276,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
 	 * If the sack array is full, forget about the last one.
 	 */
 	if (this_sack >= TCP_NUM_SACKS) {
-		if (tp->compressed_ack)
+		if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
 			tcp_send_ack(sk);
 		this_sack--;
 		tp->rx_opt.num_sacks--;
@@ -5196,7 +5196,17 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	if (!tcp_is_sack(tp) ||
 	    tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
 		goto send_now;
-	tp->compressed_ack++;
+
+	if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
+		tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
+		if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
+			NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
+				      tp->compressed_ack - TCP_FASTRETRANS_THRESH);
+		tp->compressed_ack = 0;
+	}
+
+	if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH)
+		goto send_now;

 	if (hrtimer_is_queued(&tp->compressed_ack_timer))
 		return;

--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -165,10 +165,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
 {
 	struct tcp_sock *tp = tcp_sk(sk);

-	if (unlikely(tp->compressed_ack)) {
+	if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) {
 		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
-			      tp->compressed_ack);
-		tp->compressed_ack = 0;
+			      tp->compressed_ack - TCP_FASTRETRANS_THRESH);
+		tp->compressed_ack = TCP_FASTRETRANS_THRESH;
 		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
 			__sock_put(sk);
 	}

--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -740,7 +740,7 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)

 	bh_lock_sock(sk);
 	if (!sock_owned_by_user(sk)) {
-		if (tp->compressed_ack)
+		if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
 			tcp_send_ack(sk);
 	} else {
 		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,