diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 7dd65c9cf7072661a22f0961753850c1dd7c74b4..aa1bb49f1dc67fbea70812aef6f72ce8663f5680 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -246,21 +246,12 @@ tcp_dsack - BOOLEAN Allows TCP to send "duplicate" SACKs. tcp_early_retrans - INTEGER - Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold - for triggering fast retransmit when the amount of outstanding data is - small and when no previously unsent data can be transmitted (such - that limited transmit could be used). Also controls the use of - Tail loss probe (TLP) that converts RTOs occurring due to tail - losses into fast recovery (draft-dukkipati-tcpm-tcp-loss-probe-01). + Tail loss probe (TLP) converts RTOs occurring due to tail + losses into fast recovery (draft-ietf-tcpm-rack). Note that + TLP requires RACK to function properly (see tcp_recovery below) Possible values: - 0 disables ER - 1 enables ER - 2 enables ER but delays fast recovery and fast retransmit - by a fourth of RTT. This mitigates connection falsely - recovers when network has a small degree of reordering - (less than 3 packets). - 3 enables delayed ER and TLP. - 4 enables TLP only. + 0 disables TLP + 3 or 4 enables TLP Default: 3 tcp_ecn - INTEGER @@ -712,18 +703,6 @@ tcp_thin_linear_timeouts - BOOLEAN Documentation/networking/tcp-thin.txt Default: 0 -tcp_thin_dupack - BOOLEAN - Enable dynamic triggering of retransmissions after one dupACK - for thin streams. If set, a check is performed upon reception - of a dupACK to determine if the stream is thin (less than 4 - packets in flight). As long as the stream is found to be thin, - data is retransmitted on the first received dupACK. This - improves retransmission latency for non-aggressive thin - streams, often found to be time-dependent. - For more information on thin streams, see - Documentation/networking/tcp-thin.txt - Default: 0 - tcp_limit_output_bytes - INTEGER Controls TCP Small Queue limit per tcp socket. TCP bulk sender tends to increase packets in flight until it diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fc5848dad7a43216b3f124c4afdaa6b64b23910c..6c22332afb759a02cb93d5d05135add81f4339eb 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -207,6 +207,8 @@ struct tcp_sock { /* Information of the most recently (s)acked skb */ struct tcp_rack { struct skb_mstamp mstamp; /* (Re)sent time of the skb */ + u32 rtt_us; /* Associated RTT */ + u32 end_seq; /* Ending TCP sequence of the skb */ u8 advanced; /* mstamp advanced since last lost marking */ u8 reord; /* reordering detected */ } rack; @@ -218,12 +220,11 @@ struct tcp_sock { unused:5; u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ - thin_dupack : 1,/* Fast retransmit on first dupack */ + unused1 : 1, repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; - u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ - syn_data:1, /* SYN includes data */ + u8 syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ @@ -305,7 +306,6 @@ struct tcp_sock { */ int lost_cnt_hint; - u32 retransmit_high; /* L-bits may be on up to this seqno */ u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 85ee3879499ebc4ebd63a59b2c425918858154c6..84b2edde09b143bae31b71b0305e55e80962fd75 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -144,6 +144,7 @@ struct inet_connection_sock { #define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ #define ICSK_TIME_EARLY_RETRANS 4 /* Early retransmit timer */ #define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */ +#define ICSK_TIME_REO_TIMEOUT 6 /* Reordering timer */ static inline struct inet_connection_sock *inet_csk(const struct sock *sk) { @@ -234,7 +235,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, } if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || - what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE) { + what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE || + what == ICSK_TIME_REO_TIMEOUT) { icsk->icsk_pending = what; icsk->icsk_timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); diff --git a/include/net/tcp.h b/include/net/tcp.h index 1da0aa7249291f77e35feb2efc2531627c096f93..c55d65f74f7f60a729e9d1321bfca25846c58936 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -143,6 +143,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes * for local resources. */ +#define TCP_REO_TIMEOUT_MIN (2000) /* Min RACK reordering timeout in usec */ #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ @@ -261,6 +262,9 @@ extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; +extern int sysctl_tcp_recovery; +#define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ + extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; extern int sysctl_tcp_min_tso_segs; @@ -397,6 +401,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, int tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); +void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag); void tcp_clear_retrans(struct tcp_sock *tp); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); @@ -541,6 +546,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); void tcp_xmit_retransmit_queue(struct sock *); void tcp_simple_retransmit(struct sock *); +void tcp_enter_recovery(struct sock *sk, bool ece_ack); int tcp_trim_head(struct sock *, struct sk_buff *, u32); int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t); @@ -559,7 +565,6 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb); /* tcp_input.c */ -void tcp_resume_early_retransmit(struct sock *sk); void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk); @@ -1031,23 +1036,6 @@ static inline void tcp_enable_fack(struct tcp_sock *tp) tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; } -/* TCP early-retransmit (ER) is similar to but more conservative than - * the thin-dupack feature. Enable ER only if thin-dupack is disabled. - */ -static inline void tcp_enable_early_retrans(struct tcp_sock *tp) -{ - struct net *net = sock_net((struct sock *)tp); - - tp->do_early_retrans = sysctl_tcp_early_retrans && - sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack && - net->ipv4.sysctl_tcp_reordering == 3; -} - -static inline void tcp_disable_early_retrans(struct tcp_sock *tp) -{ - tp->do_early_retrans = 0; -} - static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; @@ -1856,17 +1844,11 @@ void tcp_v4_init(void); void tcp_init(void); /* tcp_recovery.c */ - -/* Flags to enable various loss recovery features. See below */ -extern int sysctl_tcp_recovery; - -/* Use TCP RACK to detect (some) tail and retransmit losses */ -#define TCP_RACK_LOST_RETRANS 0x1 - -extern int tcp_rack_mark_lost(struct sock *sk); - -extern void tcp_rack_advance(struct tcp_sock *tp, - const struct skb_mstamp *xmit_time, u8 sacked); +extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now); +extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, + const struct skb_mstamp *xmit_time, + const struct skb_mstamp *ack_time); +extern void tcp_rack_reo_timeout(struct sock *sk); /* * Save and compile IPv4 options, return a pointer to it diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 4dea33e5f29572e09c29621ee8eadc4e60a9a9a2..3828b3a805cdeae87f4ad9300f35fff048e23691 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -215,7 +215,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0f2d37e8e983d93245e52b041a146210e657a655..c8d283615c6fb9806e0095445c926195b3e28ec6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -536,13 +536,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_thin_dupack", - .data = &sysctl_tcp_thin_dupack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_early_retrans", .data = &sysctl_tcp_early_retrans, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c8d46c140b4a47ee7c954317977393664e295abf..aba6ea76338e35e36124580dcab3c80e3c638659 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -406,7 +406,6 @@ void tcp_init_sock(struct sock *sk) tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; - tcp_enable_early_retrans(tp); tcp_assign_congestion_control(sk); tp->tsoffset = 0; @@ -2475,11 +2474,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; - else { - tp->thin_dupack = val; - if (tp->thin_dupack) - tcp_disable_early_retrans(tp); - } break; case TCP_REPAIR: @@ -2969,8 +2963,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; + case TCP_THIN_DUPACK: - val = tp->thin_dupack; + val = 0; break; case TCP_REPAIR: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ec6d843630242074421994b17a851429a51a926c..1a34e9278c07a8b679402ef6ec9baa7f24c32fe4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,7 +79,7 @@ int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; -int sysctl_tcp_fack __read_mostly = 1; +int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; @@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; - -int sysctl_tcp_thin_dupack __read_mostly; - int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_early_retrans __read_mostly = 3; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; @@ -904,8 +901,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tcp_disable_fack(tp); } - if (metric > 0) - tcp_disable_early_retrans(tp); tp->rack.reord = 1; } @@ -916,10 +911,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = skb; - - if (!tp->lost_out || - after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high)) - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } /* Sum the number of packets on the wire we have marked as lost. @@ -1135,6 +1126,7 @@ struct tcp_sacktag_state { */ struct skb_mstamp first_sackt; struct skb_mstamp last_sackt; + struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */ struct rate_sample *rate; int flag; }; @@ -1217,7 +1209,8 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { - tcp_rack_advance(tp, xmit_time, sacked); + tcp_rack_advance(tp, sacked, end_seq, + xmit_time, &state->ack_time); if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, @@ -1937,7 +1930,6 @@ void tcp_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sk_buff *skb; - bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ bool mark_lost; @@ -1982,7 +1974,6 @@ void tcp_enter_loss(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } } tcp_verify_left_out(tp); @@ -1998,13 +1989,15 @@ void tcp_enter_loss(struct sock *sk) tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); - /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous - * loss recovery is underway except recurring timeout(s) on - * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing + /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO + * if a previous recovery is underway, otherwise it may incorrectly + * call a timeout spurious if some previously retransmitted packets + * are s/acked (sec 3.2). We do not apply that retriction since + * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS + * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO + * on PTMU discovery to avoid sending new data. */ - tp->frto = sysctl_tcp_frto && - (new_recovery || icsk->icsk_retransmits) && - !inet_csk(sk)->icsk_mtup.probe_size; + tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size; } /* If ACK arrived pointing to a remembered SACK, it means that our @@ -2056,30 +2049,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } -static bool tcp_pause_early_retransmit(struct sock *sk, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - unsigned long delay; - - /* Delay early retransmit and entering fast recovery for - * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples - * available, or RTO is scheduled to fire first. - */ - if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || - (flag & FLAG_ECE) || !tp->srtt_us) - return false; - - delay = max(usecs_to_jiffies(tp->srtt_us >> 5), - msecs_to_jiffies(2)); - - if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) - return false; - - inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, - TCP_RTO_MAX); - return true; -} - /* Linux NewReno/SACK/FACK/ECN state machine. * -------------------------------------- * @@ -2127,10 +2096,26 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * F.e. after RTO, when all the queue is considered as lost, * lost_out = packets_out and in_flight = retrans_out. * - * Essentially, we have now two algorithms counting + * Essentially, we have now a few algorithms detecting * lost packets. * - * FACK: It is the simplest heuristics. As soon as we decided + * If the receiver supports SACK: + * + * RFC6675/3517: It is the conventional algorithm. A packet is + * considered lost if the number of higher sequence packets + * SACKed is greater than or equal the DUPACK thoreshold + * (reordering). This is implemented in tcp_mark_head_lost and + * tcp_update_scoreboard. + * + * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm + * (2017-) that checks timing instead of counting DUPACKs. + * Essentially a packet is considered lost if it's not S/ACKed + * after RTT + reordering_window, where both metrics are + * dynamically measured and adjusted. This is implemented in + * tcp_rack_mark_lost. + * + * FACK (Disabled by default. Subsumbed by RACK): + * It is the simplest heuristics. As soon as we decided * that something is lost, we decide that _all_ not SACKed * packets until the most forward SACK are lost. I.e. * lost_out = fackets_out - sacked_out and left_out = fackets_out. @@ -2139,16 +2124,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * takes place. We use FACK by default until reordering * is suspected on the path to this destination. * - * NewReno: when Recovery is entered, we assume that one segment + * If the receiver does not support SACK: + * + * NewReno (RFC6582): in Recovery we assume that one segment * is lost (classic Reno). While we are in Recovery and * a partial ACK arrives, we assume that one more packet * is lost (NewReno). This heuristics are the same in NewReno * and SACK. * - * Imagine, that's all! Forget about all this shamanism about CWND inflation - * deflation etc. CWND is real congestion window, never inflated, changes - * only according to classic VJ rules. - * * Really tricky (and requiring careful tuning) part of algorithm * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). * The first determines the moment _when_ we should reduce CWND and, @@ -2176,8 +2159,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) static bool tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - __u32 packets_out; - int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; /* Trick#1: The loss is proven. */ if (tp->lost_out) @@ -2187,39 +2168,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) if (tcp_dupack_heuristics(tp) > tp->reordering) return true; - /* Trick#4: It is still not OK... But will it be useful to delay - * recovery more? - */ - packets_out = tp->packets_out; - if (packets_out <= tp->reordering && - tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) && - !tcp_may_send_now(sk)) { - /* We have nothing to send. This connection is limited - * either by receiver window or by application. - */ - return true; - } - - /* If a thin stream is detected, retransmit after first - * received dupack. Employ only if SACK is supported in order - * to avoid possible corner-case series of spurious retransmissions - * Use only if there are no unsent data. - */ - if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && - tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && - tcp_is_sack(tp) && !tcp_send_head(sk)) - return true; - - /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious - * retransmissions due to small network reorderings, we implement - * Mitigation A.3 in the RFC and delay the retransmission for a short - * interval if appropriate. - */ - if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && - (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) && - !tcp_may_send_now(sk)) - return !tcp_pause_early_retransmit(sk, flag); - return false; } @@ -2521,8 +2469,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tcp_ecn_queue_cwr(tp); } -static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, - int flag) +void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; @@ -2690,7 +2637,7 @@ void tcp_simple_retransmit(struct sock *sk) } EXPORT_SYMBOL(tcp_simple_retransmit); -static void tcp_enter_recovery(struct sock *sk, bool ece_ack) +void tcp_enter_recovery(struct sock *sk, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); int mib_idx; @@ -2726,14 +2673,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, tcp_try_undo_loss(sk, false)) return; - if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ - /* Step 3.b. A timeout is spurious if not all data are - * lost, i.e., never-retransmitted data are (s)acked. - */ - if ((flag & FLAG_ORIG_SACK_ACKED) && - tcp_try_undo_loss(sk, true)) - return; + /* The ACK (s)acks some never-retransmitted data meaning not all + * the data packets before the timeout were lost. Therefore we + * undo the congestion window and state. This is essentially + * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since + * a retransmitted skb is permantly marked, we can apply such an + * operation even if F-RTO was not used. + */ + if ((flag & FLAG_ORIG_SACK_ACKED) && + tcp_try_undo_loss(sk, tp->undo_marker)) + return; + if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ if (after(tp->snd_nxt, tp->high_seq)) { if (flag & FLAG_DATA_SACKED || is_dupack) tp->frto = 0; /* Step 3.a. loss was real */ @@ -2800,6 +2751,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) return false; } +static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, + const struct skb_mstamp *ack_time) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Use RACK to detect loss */ + if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { + u32 prior_retrans = tp->retrans_out; + + tcp_rack_mark_lost(sk, ack_time); + if (prior_retrans > tp->retrans_out) + *ack_flag |= FLAG_LOST_RETRANS; + } +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2813,7 +2779,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const int acked, - bool is_dupack, int *ack_flag, int *rexmit) + bool is_dupack, int *ack_flag, int *rexmit, + const struct skb_mstamp *ack_time) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2864,13 +2831,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } } - /* Use RACK to detect loss */ - if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && - tcp_rack_mark_lost(sk)) { - flag |= FLAG_LOST_RETRANS; - *ack_flag |= FLAG_LOST_RETRANS; - } - /* E. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: @@ -2888,11 +2848,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_try_keep_open(sk); return; } + tcp_rack_identify_loss(sk, ack_flag, ack_time); break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack, rexmit); - if (icsk->icsk_ca_state != TCP_CA_Open && - !(flag & FLAG_LOST_RETRANS)) + tcp_rack_identify_loss(sk, ack_flag, ack_time); + if (!(icsk->icsk_ca_state == TCP_CA_Open || + (*ack_flag & FLAG_LOST_RETRANS))) return; /* Change state if cwnd is undone or retransmits are lost */ default: @@ -2906,6 +2868,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); + tcp_rack_identify_loss(sk, ack_flag, ack_time); if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; @@ -3024,7 +2987,7 @@ void tcp_rearm_rto(struct sock *sk) } else { u32 rto = inet_csk(sk)->icsk_rto; /* Offset the time elapsed after installing regular RTO */ - if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); const u32 rto_time_stamp = @@ -3041,24 +3004,6 @@ void tcp_rearm_rto(struct sock *sk) } } -/* This function is called when the delayed ER timer fires. TCP enters - * fast recovery and performs fast-retransmit. - */ -void tcp_resume_early_retransmit(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tcp_rearm_rto(sk); - - /* Stop if ER is disabled after the delayed ER timer is scheduled */ - if (!tp->do_early_retrans) - return; - - tcp_enter_recovery(sk, false); - tcp_update_scoreboard(sk, 1); - tcp_xmit_retransmit_queue(sk); -} - /* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { @@ -3101,11 +3046,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_snd_una, int *acked, - struct tcp_sacktag_state *sack, - struct skb_mstamp *now) + struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); struct skb_mstamp first_ackt, last_ackt; + struct skb_mstamp *now = &sack->ack_time; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; @@ -3165,7 +3110,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } else if (tcp_is_sack(tp)) { tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) - tcp_rack_advance(tp, &skb->skb_mstamp, sacked); + tcp_rack_advance(tp, sacked, scb->end_seq, + &skb->skb_mstamp, + &sack->ack_time); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3595,7 +3542,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ - struct skb_mstamp now; sack_state.first_sackt.v64 = 0; sack_state.rate = &rs; @@ -3621,10 +3567,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - skb_mstamp_get(&now); + skb_mstamp_get(&sack_state.ack_time); - if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) + if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); if (after(ack, prior_snd_una)) { @@ -3689,11 +3634,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state, &now); + &sack_state); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); } if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3708,15 +3654,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, &now, &rs); - tcp_cong_control(sk, ack, delivered, flag, &rs); + tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time, + sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3737,9 +3685,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * If data was DSACKed, see if we can undo a cwnd reduction. */ if (TCP_SKB_CB(skb)->sacked) { + skb_mstamp_get(&sack_state.ack_time); flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); tcp_xmit_recovery(sk, rexmit); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 56d756ecfb597855065299b0f8b14d73ddfcdbf3..63214136cf1c39ef0afbdea7c1fd884dadfd7c08 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2229,7 +2229,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) int state; if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index ba8f02d0f283c6eaaf14ed89103adea135093353..b9ed0d50aead508d03b364b5f3e1665f1ac276a9 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -522,7 +522,6 @@ void tcp_init_metrics(struct sock *sk) val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val && tp->reordering != val) { tcp_disable_fack(tp); - tcp_disable_early_retrans(tp); tp->reordering = val; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 06fde26a82b7fecdabc3fecb0861fe09e4cbaa46..bdb443471c3914cd57c3836c43f73648b452c01e 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -468,7 +468,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->sacked_out = 0; newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; newtp->lsndtime = treq->snt_synack.stamp_jiffies; newsk->sk_txhash = treq->txhash; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1d5331a1b1dc2677316148ba9852c191e7ed0fd4..9a1a1494b9dd40cdf4e99c50a8d7af3096762fa5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -76,10 +76,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tp->packets_out += tcp_skb_pcount(skb); - if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); - } NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); @@ -2289,8 +2287,6 @@ bool tcp_schedule_loss_probe(struct sock *sk) u32 timeout, tlp_time_stamp, rto_time_stamp; u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); - if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) - return false; /* No consecutive loss probes. */ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { tcp_rearm_rto(sk); @@ -2309,8 +2305,9 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || - !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) + if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) || + !tp->packets_out || !tcp_is_sack(tp) || + icsk->icsk_ca_state != TCP_CA_Open) return false; if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && @@ -2831,36 +2828,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) return err; } -/* Check if we forward retransmits are possible in the current - * window/congestion state. - */ -static bool tcp_can_forward_retransmit(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - const struct tcp_sock *tp = tcp_sk(sk); - - /* Forward retransmissions are possible only during Recovery. */ - if (icsk->icsk_ca_state != TCP_CA_Recovery) - return false; - - /* No forward retransmissions in Reno are possible. */ - if (tcp_is_reno(tp)) - return false; - - /* Yeah, we have to make difficult choice between forward transmission - * and retransmission... Both ways have their merits... - * - * For now we do not retransmit anything, while we have some new - * segments to send. In the other cases, follow rule 3 for - * NextSeg() specified in RFC3517. - */ - - if (tcp_may_send_now(sk)) - return false; - - return true; -} - /* This gets called after a retransmit timeout, and the initially * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either @@ -2875,24 +2842,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; struct sk_buff *hole = NULL; - u32 max_segs, last_lost; + u32 max_segs; int mib_idx; - int fwd_rexmitting = 0; if (!tp->packets_out) return; - if (!tp->lost_out) - tp->retransmit_high = tp->snd_una; - if (tp->retransmit_skb_hint) { skb = tp->retransmit_skb_hint; - last_lost = TCP_SKB_CB(skb)->end_seq; - if (after(last_lost, tp->retransmit_high)) - last_lost = tp->retransmit_high; } else { skb = tcp_write_queue_head(sk); - last_lost = tp->snd_una; } max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); @@ -2915,31 +2874,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ segs = min_t(int, segs, max_segs); - if (fwd_rexmitting) { -begin_fwd: - if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) - break; - mib_idx = LINUX_MIB_TCPFORWARDRETRANS; - - } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { - tp->retransmit_high = last_lost; - if (!tcp_can_forward_retransmit(sk)) - break; - /* Backtrack if necessary to non-L'ed skb */ - if (hole) { - skb = hole; - hole = NULL; - } - fwd_rexmitting = 1; - goto begin_fwd; - + if (tp->retrans_out >= tp->lost_out) { + break; } else if (!(sacked & TCPCB_LOST)) { if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; } else { - last_lost = TCP_SKB_CB(skb)->end_seq; if (icsk->icsk_ca_state != TCP_CA_Loss) mib_idx = LINUX_MIB_TCPFASTRETRANS; else @@ -2960,7 +2902,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); - if (skb == tcp_write_queue_head(sk)) + if (skb == tcp_write_queue_head(sk) && + icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index e36df4fcfeba3042f3d84337a18311427f68a418..4ecb38ae85042db7fa59e1aa6c74c9c3da0b1099 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -1,9 +1,32 @@ #include #include -int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; +int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION; -/* Marks a packet lost, if some packet sent later has been (s)acked. +static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_skb_mark_lost_uncond_verify(tp, skb); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { + /* Account for retransmits that are lost again */ + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); + } +} + +static bool tcp_rack_sent_after(const struct skb_mstamp *t1, + const struct skb_mstamp *t2, + u32 seq1, u32 seq2) +{ + return skb_mstamp_after(t1, t2) || + (t1->v64 == t2->v64 && after(seq1, seq2)); +} + +/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): + * + * Marks a packet lost, if some packet sent later has been (s)acked. * The underlying idea is similar to the traditional dupthresh and FACK * but they look at different metrics: * @@ -16,31 +39,26 @@ int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; * is being more resilient to reordering by simply allowing some * "settling delay", instead of tweaking the dupthresh. * - * The current version is only used after recovery starts but can be - * easily extended to detect the first loss. + * When tcp_rack_detect_loss() detects some packets are lost and we + * are not already in the CA_Recovery state, either tcp_rack_reo_timeout() + * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will + * make us enter the CA_Recovery state. */ -int tcp_rack_mark_lost(struct sock *sk) +static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, + u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - u32 reo_wnd, prior_retrans = tp->retrans_out; - - if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced) - return 0; - - /* Reset the advanced flag to avoid unnecessary queue scanning */ - tp->rack.advanced = 0; + u32 reo_wnd; + *reo_timeout = 0; /* To be more reordering resilient, allow min_rtt/4 settling delay * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed * RTT because reordering is often a path property and less related * to queuing or delayed ACKs. - * - * TODO: measure and adapt to the observed reordering delay, and - * use a timer to retransmit like the delayed early retransmit. */ reo_wnd = 1000; - if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) + if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); tcp_for_write_queue(skb, sk) { @@ -54,20 +72,29 @@ int tcp_rack_mark_lost(struct sock *sk) scb->sacked & TCPCB_SACKED_ACKED) continue; - if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) { + if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) { + /* Step 3 in draft-cheng-tcpm-rack-00.txt: + * A packet is lost if its elapsed time is beyond + * the recent RTT plus the reordering window. + */ + u32 elapsed = skb_mstamp_us_delta(now, + &skb->skb_mstamp); + s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; - if (skb_mstamp_us_delta(&tp->rack.mstamp, - &skb->skb_mstamp) <= reo_wnd) + if (remaining < 0) { + tcp_rack_mark_skb_lost(sk, skb); continue; - - /* skb is lost if packet sent later is sacked */ - tcp_skb_mark_lost_uncond_verify(tp, skb); - if (scb->sacked & TCPCB_SACKED_RETRANS) { - scb->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPLOSTRETRANSMIT); } + + /* Skip ones marked lost but not yet retransmitted */ + if ((scb->sacked & TCPCB_LOST) && + !(scb->sacked & TCPCB_SACKED_RETRANS)) + continue; + + /* Record maximum wait time (+1 to avoid 0) */ + *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); + } else if (!(scb->sacked & TCPCB_RETRANS)) { /* Original data are sent sequentially so stop early * b/c the rest are all sent after rack_sent @@ -75,20 +102,43 @@ int tcp_rack_mark_lost(struct sock *sk) break; } } - return prior_retrans - tp->retrans_out; } -/* Record the most recently (re)sent time among the (s)acked packets */ -void tcp_rack_advance(struct tcp_sock *tp, - const struct skb_mstamp *xmit_time, u8 sacked) +void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 timeout; + + if (!tp->rack.advanced) + return; + + /* Reset the advanced flag to avoid unnecessary queue scanning */ + tp->rack.advanced = 0; + tcp_rack_detect_loss(sk, now, &timeout); + if (timeout) { + timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, + timeout, inet_csk(sk)->icsk_rto); + } +} + +/* Record the most recently (re)sent time among the (s)acked packets + * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from + * draft-cheng-tcpm-rack-00.txt + */ +void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, + const struct skb_mstamp *xmit_time, + const struct skb_mstamp *ack_time) { + u32 rtt_us; + if (tp->rack.mstamp.v64 && - !skb_mstamp_after(xmit_time, &tp->rack.mstamp)) + !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp, + end_seq, tp->rack.end_seq)) return; + rtt_us = skb_mstamp_us_delta(ack_time, xmit_time); if (sacked & TCPCB_RETRANS) { - struct skb_mstamp now; - /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior * retransmission) was sacked. @@ -99,11 +149,35 @@ void tcp_rack_advance(struct tcp_sock *tp, * so it's at least one RTT (i.e., retransmission is at least * an RTT later). */ - skb_mstamp_get(&now); - if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp)) + if (rtt_us < tcp_min_rtt(tp)) return; } - + tp->rack.rtt_us = rtt_us; tp->rack.mstamp = *xmit_time; + tp->rack.end_seq = end_seq; tp->rack.advanced = 1; } + +/* We have waited long enough to accommodate reordering. Mark the expired + * packets lost and retransmit them. + */ +void tcp_rack_reo_timeout(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct skb_mstamp now; + u32 timeout, prior_inflight; + + skb_mstamp_get(&now); + prior_inflight = tcp_packets_in_flight(tp); + tcp_rack_detect_loss(sk, &now, &timeout); + if (prior_inflight != tcp_packets_in_flight(tp)) { + if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { + tcp_enter_recovery(sk, false); + if (!inet_csk(sk)->icsk_ca_ops->cong_control) + tcp_cwnd_reduction(sk, 1, 0); + } + tcp_xmit_retransmit_queue(sk); + } + if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) + tcp_rearm_rto(sk); +} diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 29a9bd5f1225e96aef9052d69e19b3407f727e83..40d893556e6701ace6a02903e53c45822d6fa56d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -563,8 +563,8 @@ void tcp_write_timer_handler(struct sock *sk) event = icsk->icsk_pending; switch (event) { - case ICSK_TIME_EARLY_RETRANS: - tcp_resume_early_retransmit(sk); + case ICSK_TIME_REO_TIMEOUT: + tcp_rack_reo_timeout(sk); break; case ICSK_TIME_LOSS_PROBE: tcp_send_loss_probe(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 228965dca3c529b1c80994d254fb59a8a19f3089..fc14e04028bf2ad79b13f27249914ad6f327a539 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1745,7 +1745,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) srcp = ntohs(inet->inet_sport); if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout;