diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index b1c6500e7a8df4d7377b291e9afc09363e66cd17..974ab47ae53a81c27b2b57533db813288139fd7b 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -602,6 +602,14 @@ tcp_fastopen - INTEGER Note that that additional client or server features are only effective if the basic support (0x1 and 0x2) are enabled respectively. +tcp_fastopen_blackhole_timeout_sec - INTEGER + Initial time period in seconds to disable Fastopen on active TCP sockets + when a TFO firewall blackhole issue happens. + This time period will grow exponentially when more blackhole issues + get detected right after Fastopen is re-enabled and will reset to + the initial value when the blackhole issue goes away. + By default, it is set to 1hr. + tcp_syn_retries - INTEGER Number of times initial SYNs for an active TCP connection attempt will be retransmitted. Should not be higher than 127. Default value diff --git a/include/linux/tcp.h b/include/linux/tcp.h index cfc2d9506ce8077af1ec92eb7086fd52ce4fe1ac..cbe5b602a2d349fdeb1e878305f37b4da1e6cc86 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -233,6 +233,7 @@ struct tcp_sock { u8 syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ + syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ save_syn:1, /* Save headers of SYN packet */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? 
*/ diff --git a/include/net/tcp.h b/include/net/tcp.h index cc6ae0a95201f0adc52c2c46b429566806da6745..da28bef1d82b6773bbfcf7c7eafebb7a4932f25b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1506,6 +1506,12 @@ struct tcp_fastopen_context { struct rcu_head rcu; }; +extern unsigned int sysctl_tcp_fastopen_blackhole_timeout; /* in seconds */ +void tcp_fastopen_active_disable(struct sock *sk); +bool tcp_fastopen_active_should_disable(struct sock *sk); +void tcp_fastopen_active_disable_ofo_check(struct sock *sk); +void tcp_fastopen_active_timeout_reset(void); + /* Latencies incurred by various limits for a sender. They are * chronograph-like stats that are mutually exclusive. */ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index cec0e171d20caea2f188c06a9924f886b0daaa85..95cffcb21dfdba7c974706131d0f43e21435e82d 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -259,6 +259,7 @@ enum LINUX_MIB_TCPFASTOPENPASSIVEFAIL, /* TCPFastOpenPassiveFail */ LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ + LINUX_MIB_TCPFASTOPENBLACKHOLE, /* TCPFastOpenBlackhole */ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ LINUX_MIB_BUSYPOLLRXPACKETS, /* BusyPollRxPackets */ LINUX_MIB_TCPAUTOCORKING, /* TCPAutoCorking */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 4ccbf464d1acf5f433dd2a0768691f5d22e3033d..fa44e752a9a3f8eb9957314149ae15e6df10465a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -281,6 +281,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), + SNMP_MIB_ITEM("TCPFastOpenBlackhole", LINUX_MIB_TCPFASTOPENBLACKHOLE), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), 
SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ddac9e64b7022452202cdb0697cbfee82ed1727b..86957e9cd6c6748ac00aa0307154bb131c43f1da 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -350,6 +350,19 @@ static int proc_udp_early_demux(struct ctl_table *table, int write, return ret; } +static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, + int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + tcp_fastopen_active_timeout_reset(); /* zero disable count */ + return ret; +} + static struct ctl_table ipv4_table[] = { { .procname = "tcp_timestamps", @@ -399,6 +412,14 @@ static struct ctl_table ipv4_table[] = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), .proc_handler = proc_tcp_fastopen_key, }, + { + .procname = "tcp_fastopen_blackhole_timeout_sec", + .data = &sysctl_tcp_fastopen_blackhole_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tfo_blackhole_detect_timeout, + .extra1 = &zero, + }, { .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 04843ae77b9ecacb3e4f2e81096f11d35ae1915e..efc976ae66ae5b82d496323634c3030fb71c6c92 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2296,6 +2296,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); + tcp_fastopen_active_disable_ofo_check(sk); /* before ofo purge */ skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8ea4e9787f82ba65cd07b4c2b663df76fe4eb143..4af82b914dd4bbdc47e37cf1cf70f206bd186db5 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -341,6 +341,13 @@ bool 
tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, cookie->len = -1; return false; } + + /* Firewall blackhole issue check */ + if (tcp_fastopen_active_should_disable(sk)) { + cookie->len = -1; + return false; + } + if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { cookie->len = -1; return true; @@ -380,3 +387,98 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err) return false; } EXPORT_SYMBOL(tcp_fastopen_defer_connect); + +/* + * The following code block is to deal with middle box issues with TFO: + * Middlebox firewall issues can potentially cause the server's data to be + * blackholed after a successful 3WHS using TFO. + * The proposed solution is to disable active TFO globally under the + * following circumstances: + * 1. client side TFO socket receives out of order FIN + * 2. client side TFO socket receives out of order RST + * We disable active side TFO globally for 1hr at first. Then if it + * happens again, we disable it for 2h, then 4h, 8h, ... capped at 64h. + * And we reset the timeout back to 1hr when we see a successful active + * TFO connection with data exchanges. 
+ */ + +/* Default to 1hr, expressed in seconds */ +unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60; +static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0); +static unsigned long tfo_active_disable_stamp __read_mostly; + +/* Disable active TFO: bump tfo_active_disable_times and record current + * jiffies in tfo_active_disable_stamp; bumps LINUX_MIB_TCPFASTOPENBLACKHOLE + */ +void tcp_fastopen_active_disable(struct sock *sk) +{ + atomic_inc(&tfo_active_disable_times); + tfo_active_disable_stamp = jiffies; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE); +} + +/* Reset tfo_active_disable_times to 0 */ +void tcp_fastopen_active_timeout_reset(void) +{ + atomic_set(&tfo_active_disable_times, 0); +} + +/* Calculate timeout for tfo active disable + * Return true if we are still in the active TFO disable period + * Return false if timeout already expired and we should use active TFO + */ +bool tcp_fastopen_active_should_disable(struct sock *sk) +{ + int tfo_da_times = atomic_read(&tfo_active_disable_times); + int multiplier; + unsigned long timeout; + + if (!tfo_da_times) + return false; + + /* Limit timeout to max: 2^6 * initial timeout */ + multiplier = 1 << min(tfo_da_times - 1, 6); + timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ; + if (time_before(jiffies, tfo_active_disable_stamp + timeout)) + return true; + + /* Mark check bit so we can check for successful active TFO + * condition and reset tfo_active_disable_times + */ + tcp_sk(sk)->syn_fastopen_ch = 1; + return false; +} + +/* Disable active TFO if FIN is the only packet in the ofo queue + * and no data is received. 
+ * Also check if we can reset tfo_active_disable_times if data is + * received successfully on a marked active TFO socket opened on + * a non-loopback interface + */ +void tcp_fastopen_active_disable_ofo_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct rb_node *p; + struct sk_buff *skb; + struct dst_entry *dst; + + if (!tp->syn_fastopen) + return; + + if (!tp->data_segs_in) { + p = rb_first(&tp->out_of_order_queue); + if (p && !rb_next(p)) { + skb = rb_entry(p, struct sk_buff, rbnode); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { + tcp_fastopen_active_disable(sk); + return; + } + } + } else if (tp->syn_fastopen_ch && + atomic_read(&tfo_active_disable_times)) { + dst = sk_dst_get(sk); + if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) + tcp_fastopen_active_timeout_reset(); + dst_release(dst); + } +} diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 341f021f02a2931cd75b2e1e71af9729fc4c7895..5af2f04f885914491a7116c20056b3d2188d2d7d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5300,8 +5300,16 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (rst_seq_match) tcp_reset(sk); - else + else { + /* Disable TFO if RST is out-of-order + * and no data has been received + * for current active TFO socket + */ + if (tp->syn_fastopen && !tp->data_segs_in && + sk->sk_state == TCP_ESTABLISHED) + tcp_fastopen_active_disable(sk); tcp_send_challenge_ack(sk, skb); + } goto discard; } @@ -6044,9 +6052,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; } - if (tp->linger2 < 0 || - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && - after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { + if (tp->linger2 < 0) { + tcp_done(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + return 1; + } + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + /* Receive out of order FIN after close() */ 
+ if (tp->syn_fastopen && th->fin) + tcp_fastopen_active_disable(sk); tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 20cbd2f07f281717c1cb4e901c4c4e22f7c46bd6..cbbafe546c0f5c5f43531eaf24f5b460264785c6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1855,6 +1855,9 @@ void tcp_v4_destroy_sock(struct sock *sk) /* Cleanup up the write buffer. */ tcp_write_queue_purge(sk); + /* Check if we want to disable active TFO; must precede ofo purge */ + tcp_fastopen_active_disable_ofo_check(sk); + /* Cleans up our, hopefully empty, out_of_order_queue. */ skb_rbtree_purge(&tp->out_of_order_queue); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b2ab411c6d3728fa7dbdebde045532a7317f5166..14672543cf0bd27bc59976d5cec38d2d3bbcdd2c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -201,11 +201,10 @@ static int tcp_write_timeout(struct sock *sk) if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) { /* Some middle-boxes may black-hole Fast Open _after_ * the handshake. Therefore we conservatively disable - * Fast Open on this path on recurring timeouts with - * few or zero bytes acked after Fast Open. + * Fast Open on this path on recurring timeouts after + * successful Fast Open. */ - if (tp->syn_data_acked && - tp->bytes_acked <= tp->rx_opt.mss_clamp) { + if (tp->syn_data_acked) { tcp_fastopen_cache_set(sk, 0, NULL, true, 0); if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) NET_INC_STATS(sock_net(sk),