提交 20ff44aa 编写于 作者: D David S. Miller

Merge branch 'tcp'

Yuchung Cheng says:

====================
This patch series improve RTT sampling in three ways:
1. Sample RTT during fast recovery and reordering events.
2. Favor ack-based RTT to timestamps because of broken TS ECR fields
3. Consolidate the RTT measurement logic.
====================
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
上级 c3f51d5f ed08495c
master alk-4.19.24 alk-4.19.30 alk-4.19.34 alk-4.19.36 alk-4.19.43 alk-4.19.48 alk-4.19.57 ck-4.19.67 ck-4.19.81 ck-4.19.91 github/fork/deepanshu1422/fix-typo-in-comment github/fork/haosdent/fix-typo linux-next v4.19.91 v4.19.90 v4.19.89 v4.19.88 v4.19.87 v4.19.86 v4.19.85 v4.19.84 v4.19.83 v4.19.82 v4.19.81 v4.19.80 v4.19.79 v4.19.78 v4.19.77 v4.19.76 v4.19.75 v4.19.74 v4.19.73 v4.19.72 v4.19.71 v4.19.70 v4.19.69 v4.19.68 v4.19.67 v4.19.66 v4.19.65 v4.19.64 v4.19.63 v4.19.62 v4.19.61 v4.19.60 v4.19.59 v4.19.58 v4.19.57 v4.19.56 v4.19.55 v4.19.54 v4.19.53 v4.19.52 v4.19.51 v4.19.50 v4.19.49 v4.19.48 v4.19.47 v4.19.46 v4.19.45 v4.19.44 v4.19.43 v4.19.42 v4.19.41 v4.19.40 v4.19.39 v4.19.38 v4.19.37 v4.19.36 v4.19.35 v4.19.34 v4.19.33 v4.19.32 v4.19.31 v4.19.30 v4.19.29 v4.19.28 v4.19.27 v4.19.26 v4.19.25 v4.19.24 v4.19.23 v4.19.22 v4.19.21 v4.19.20 v4.19.19 v4.19.18 v4.19.17 v4.19.16 v4.19.15 v4.19.14 v4.19.13 v4.19.12 v4.19.11 v4.19.10 v4.19.9 v4.19.8 v4.19.7 v4.19.6 v4.19.5 v4.19.4 v4.19.3 v4.19.2 v4.19.1 v4.19 v4.19-rc8 v4.19-rc7 v4.19-rc6 v4.19-rc5 v4.19-rc4 v4.19-rc3 v4.19-rc2 v4.19-rc1 ck-release-21 ck-release-20 ck-release-19.2 ck-release-19.1 ck-release-19 ck-release-18 ck-release-17.2 ck-release-17.1 ck-release-17 ck-release-16 ck-release-15.1 ck-release-15 ck-release-14 ck-release-13.2 ck-release-13 ck-release-12 ck-release-11 ck-release-10 ck-release-9 ck-release-7 alk-release-15 alk-release-14 alk-release-13.2 alk-release-13 alk-release-12 alk-release-11 alk-release-10 alk-release-9 alk-release-7
无相关合并请求
......@@ -591,7 +591,6 @@ extern void tcp_initialize_rcv_mss(struct sock *sk);
extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
extern int tcp_mss_to_mtu(struct sock *sk, int mss);
extern void tcp_mtup_init(struct sock *sk);
extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
extern void tcp_init_buffer_space(struct sock *sk);
static inline void tcp_bound_rto(const struct sock *sk)
......@@ -1094,15 +1093,6 @@ static inline void tcp_openreq_init(struct request_sock *req,
ireq->loc_port = tcp_hdr(skb)->dest;
}
/* Compute time elapsed between SYNACK and the ACK completing 3WHS */
static inline void tcp_synack_rtt_meas(struct sock *sk,
struct request_sock *req)
{
if (tcp_rsk(req)->snt_synack)
tcp_valid_rtt_meas(sk,
tcp_time_stamp - tcp_rsk(req)->snt_synack);
}
extern void tcp_enter_memory_pressure(struct sock *sk);
static inline int keepalive_intvl_when(const struct tcp_sock *tp)
......
......@@ -1048,6 +1048,7 @@ struct tcp_sacktag_state {
int reord;
int fack_count;
int flag;
s32 rtt; /* RTT measured by SACKing never-retransmitted data */
};
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
......@@ -1108,7 +1109,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq,
bool dup_sack, int pcount)
int dup_sack, int pcount, u32 xmit_time)
{
struct tcp_sock *tp = tcp_sk(sk);
int fack_count = state->fack_count;
......@@ -1148,6 +1149,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
state->reord);
if (!after(end_seq, tp->high_seq))
state->flag |= FLAG_ORIG_SACK_ACKED;
/* Pick the earliest sequence sacked for RTT */
if (state->rtt < 0)
state->rtt = tcp_time_stamp - xmit_time;
}
if (sacked & TCPCB_LOST) {
......@@ -1205,7 +1209,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
* tcp_highest_sack_seq() when skb is highest_sack.
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount);
start_seq, end_seq, dup_sack, pcount,
TCP_SKB_CB(skb)->when);
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
......@@ -1479,7 +1484,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->end_seq,
dup_sack,
tcp_skb_pcount(skb));
tcp_skb_pcount(skb),
TCP_SKB_CB(skb)->when);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
......@@ -1536,7 +1542,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
u32 prior_snd_una)
u32 prior_snd_una, s32 *sack_rtt)
{
struct tcp_sock *tp = tcp_sk(sk);
const unsigned char *ptr = (skb_transport_header(ack_skb) +
......@@ -1554,6 +1560,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
state.flag = 0;
state.reord = tp->packets_out;
state.rtt = -1;
if (!tp->sacked_out) {
if (WARN_ON(tp->fackets_out))
......@@ -1737,6 +1744,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
WARN_ON((int)tp->retrans_out < 0);
WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
*sack_rtt = state.rtt;
return state.flag;
}
......@@ -2792,65 +2800,51 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_xmit_retransmit_queue(sk);
}
void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
s32 seq_rtt, s32 sack_rtt)
{
tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
}
EXPORT_SYMBOL(tcp_valid_rtt_meas);
const struct tcp_sock *tp = tcp_sk(sk);
/* Prefer RTT measured from ACK's timing to TS-ECR. This is because
* broken middle-boxes or peers may corrupt TS-ECR fields. But
* Karn's algorithm forbids taking RTT if some retransmitted data
* is acked (RFC6298).
*/
if (flag & FLAG_RETRANS_DATA_ACKED)
seq_rtt = -1;
if (seq_rtt < 0)
seq_rtt = sack_rtt;
/* Read draft-ietf-tcplw-high-performance before mucking
* with this code. (Supersedes RFC1323)
*/
static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
{
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
* acknowledges some new data, i.e., only if it advances the
* left edge of the send window.
*
* See draft-ietf-tcplw-high-performance-00, section 3.3.
* 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
*
* Changed: reset backoff as soon as we see the first valid sample.
* If we do not, we get strongly overestimated rto. With timestamps
* samples are accepted even from very old segments: f.e., when rtt=1
* increases to 8, we retransmit 5 times and after 8 seconds delayed
* answer arrives rto becomes 120 seconds! If at least one of segments
* in window is lost... Voila. --ANK (010210)
*/
struct tcp_sock *tp = tcp_sk(sk);
tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
}
if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
{
/* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine
* rtt estimates. Also, we must not reset the
* backoff for rto until we get a non-retransmitted
* packet. This allows us to deal with a situation
* where the network delay has increased suddenly.
* I.e. Karn's algorithm. (SIGCOMM '87, p5.)
*/
if (seq_rtt < 0)
return false;
if (flag & FLAG_RETRANS_DATA_ACKED)
return;
tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
tcp_valid_rtt_meas(sk, seq_rtt);
/* RFC6298: only reset backoff on valid RTT measurement. */
inet_csk(sk)->icsk_backoff = 0;
return true;
}
static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
const s32 seq_rtt)
/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tcp_ack_saw_tstamp(sk, flag);
else if (seq_rtt >= 0)
tcp_ack_no_tstamp(sk, seq_rtt, flag);
struct tcp_sock *tp = tcp_sk(sk);
s32 seq_rtt = -1;
if (tp->lsndtime && !tp->total_retrans)
seq_rtt = tcp_time_stamp - tp->lsndtime;
tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
}
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
......@@ -2939,7 +2933,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una)
u32 prior_snd_una, s32 sack_rtt)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
......@@ -2978,8 +2972,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
ca_seq_rtt = -1;
seq_rtt = -1;
} else {
ca_seq_rtt = now - scb->when;
last_ackt = skb->tstamp;
......@@ -3031,6 +3023,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
(flag & FLAG_ACKED))
tcp_rearm_rto(sk);
if (flag & FLAG_ACKED) {
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
......@@ -3040,9 +3036,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_mtup_probe_success(sk);
}
tcp_ack_update_rtt(sk, flag, seq_rtt);
tcp_rearm_rto(sk);
if (tcp_is_reno(tp)) {
tcp_remove_reno_sacks(sk, pkts_acked);
} else {
......@@ -3274,6 +3267,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
int prior_packets = tp->packets_out;
const int prior_unsacked = tp->packets_out - tp->sacked_out;
int acked = 0; /* Number of packets newly acked */
s32 sack_rtt = -1;
/* If the ack is older than previous acks
* then we can probably ignore it.
......@@ -3330,7 +3324,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
if (TCP_SKB_CB(skb)->sacked)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_rtt);
if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
flag |= FLAG_ECE;
......@@ -3349,7 +3344,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
acked = tp->packets_out;
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
acked -= tp->packets_out;
if (tcp_ack_is_dubious(sk, flag)) {
......@@ -3402,7 +3397,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* If data was DSACKed, see if we can undo a cwnd reduction.
*/
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_rtt);
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
}
......@@ -5624,9 +5620,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* so release it.
*/
if (req) {
tcp_synack_rtt_meas(sk, req);
tp->total_retrans = req->num_retrans;
reqsk_fastopen_remove(sk, req, false);
} else {
/* Make sure socket is routed, for correct metrics. */
......@@ -5651,6 +5645,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
tcp_synack_rtt_meas(sk, req);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
......
......@@ -1671,8 +1671,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
tcp_initialize_rcv_mss(newsk);
tcp_synack_rtt_meas(newsk, req);
newtp->total_retrans = req->num_retrans;
#ifdef CONFIG_TCP_MD5SIG
/* Copy over the MD5 key from the original socket */
......
......@@ -411,6 +411,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tcp_enable_early_retrans(newtp);
newtp->tlp_high_seq = 0;
newtp->lsndtime = treq->snt_synack;
newtp->total_retrans = req->num_retrans;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
......@@ -666,12 +668,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!(flg & TCP_FLAG_ACK))
return NULL;
/* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */
tcp_rsk(req)->snt_synack = 0;
/* For Fast Open no more processing is needed (sk is the
* child socket).
*/
......
......@@ -1237,8 +1237,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
tcp_initialize_rcv_mss(newsk);
tcp_synack_rtt_meas(newsk, req);
newtp->total_retrans = req->num_retrans;
newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册
反馈
建议
客服 返回
顶部