diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index ea304a23c8d72c92a925d0c107bfd2bcfbbb92ec..924bd51327b7a8dff3503d7afccdd54e1eb5c29b 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -525,6 +525,19 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max tcp_sack - BOOLEAN Enable select acknowledgments (SACKS). +tcp_comp_sack_delay_ns - LONG INTEGER + TCP tries to reduce number of SACK sent, using a timer + based on 5% of SRTT, capped by this sysctl, in nano seconds. + The default is 1ms, based on TSO autosizing period. + + Default : 1,000,000 ns (1 ms) + +tcp_comp_sack_nr - INTEGER + Max numer of SACK that can be compressed. + Using 0 disables SACK compression. + + Detault : 44 + tcp_slow_start_after_idle - BOOLEAN If set, provide RFC2861 behavior and time out the congestion window after an idle period. An idle period is defined at diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 807776928cb8610fe97121fbc3c600b08d5d2991..72705eaf4b84060a45bf04d5170f389a18010eac 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -218,6 +218,7 @@ struct tcp_sock { reord:1; /* reordering detected */ } rack; u16 advmss; /* Advertised MSS */ + u8 compressed_ack; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ @@ -297,6 +298,7 @@ struct tcp_sock { u32 sacked_out; /* SACK'd packets */ struct hrtimer pacing_timer; + struct hrtimer compressed_ack_timer; /* from STCP, retrans queue hinting */ struct sk_buff* lost_skb_hint; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8491bc9c86b1553ab603e4363e8e38ca7ff547e0..661348f23ea5a3a9320b2cafcd17e23960214771 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -160,6 +160,8 @@ struct netns_ipv4 { int sysctl_tcp_pacing_ca_ratio; int sysctl_tcp_wmem[3]; int sysctl_tcp_rmem[3]; + int sysctl_tcp_comp_sack_nr; + unsigned long sysctl_tcp_comp_sack_delay_ns; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 6deb540297ccaa1f05ce633efe313d1ca2c15dd9..952d842a604a3ed79e1bf87a712db20a461c35a9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -559,7 +559,10 @@ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) { if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1) - sock_put(sk); + __sock_put(sk); + + if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1) + __sock_put(sk); inet_csk_clear_xmit_timers(sk); } diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index d02e859301ff499dd72a1c0e1b56bed10a9397a6..750d89120335eb489f698191edb6c5110969fa8c 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -278,6 +278,7 @@ enum LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */ LINUX_MIB_TCPDELIVERED, /* TCPDelivered */ LINUX_MIB_TCPDELIVEREDCE, /* TCPDeliveredCE */ + LINUX_MIB_TCPACKCOMPRESSED, /* TCPAckCompressed */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 261b71d0ccc5c17c6032bf67eb8f842006766e64..6c1ff89a60fa0a3485dcc71fafc799e798d5dc11 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -298,6 +298,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), + SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4b195bac8ac0eefe0a224528ad854338c4f8e6e3..d2eed3ddcb0a1ad9778d96d46c685f6c60b93d8d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -46,6 +46,7 @@ static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +static int comp_sack_nr_max = 255; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1151,6 +1152,22 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, + { + .procname = "tcp_comp_sack_delay_ns", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "tcp_comp_sack_nr", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &comp_sack_nr_max, + }, { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 62b776f9003798eaf06992a4eb0914d17646aa61..0a2ea0bbf867271db05aedd7d48b193677664321 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2595,6 +2595,7 @@ int tcp_disconnect(struct sock *sk, int flags) dst_release(sk->sk_rx_dst); sk->sk_rx_dst = NULL; tcp_saved_syn_free(tp); + tp->compressed_ack = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0bf032839548f8dccb7f24a6fb5a7d47ea29208b..aebb29ab2fdf2ceaa182cd11928f145a886149ff 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4249,6 +4249,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { + if (tp->compressed_ack) + tcp_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; sp--; @@ -4715,8 +4717,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) goto out_of_window; - tcp_enter_quickack_mode(sk); - if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", @@ -5083,6 +5083,7 @@ static inline void tcp_data_snd_check(struct sock *sk) static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) { struct tcp_sock *tp = tcp_sk(sk); + unsigned long rtt, delay; /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && @@ -5094,15 +5095,36 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || __tcp_select_window(sk) >= tp->rcv_wnd)) || /* We ACK each frame or... */ - tcp_in_quickack_mode(sk) || - /* We have out of order data. */ - (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { - /* Then ack it now */ + tcp_in_quickack_mode(sk)) { +send_now: tcp_send_ack(sk); - } else { - /* Else, send delayed ack. */ + return; + } + + if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_send_delayed_ack(sk); + return; } + + if (!tcp_is_sack(tp) || + tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) + goto send_now; + tp->compressed_ack++; + + if (hrtimer_is_queued(&tp->compressed_ack_timer)) + return; + + /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */ + + rtt = tp->rcv_rtt_est.rtt_us; + if (tp->srtt_us && tp->srtt_us < rtt) + rtt = tp->srtt_us; + + delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, + rtt * (NSEC_PER_USEC >> 3)/20); + sock_hold(sk); + hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), + HRTIMER_MODE_REL_PINNED_SOFT); } static inline void tcp_ack_snd_check(struct sock *sk) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index caf23de88f8a369c2038cecd34ce42c522487e90..adbdb503db0c983ef4185f83b138aa51bafd15bf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2572,6 +2572,8 @@ static int __net_init tcp_sk_init(struct net *net) init_net.ipv4.sysctl_tcp_wmem, sizeof(init_net.ipv4.sysctl_tcp_wmem)); } + net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; + net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0d8f950a9006598c70dbf51e281a3fe32dfaa234..437bb7ceba7fd388abac1c12f2920b02be77bad9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -162,6 +162,15 @@ static void tcp_event_data_sent(struct tcp_sock *tp, /* Account for an ACK we sent. */ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { + struct tcp_sock *tp = tcp_sk(sk); + + if (unlikely(tp->compressed_ack)) { + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, + tp->compressed_ack); + tp->compressed_ack = 0; + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) + __sock_put(sk); + } tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 92bdf64fffae3a5be291ca419eb21276b4c8cbae..3b3611729928f77934e0298bb248e55c7a7c5def 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -708,6 +708,27 @@ static void tcp_keepalive_timer (struct timer_list *t) sock_put(sk); } +static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) +{ + struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer); + struct sock *sk = (struct sock *)tp; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + if (tp->compressed_ack) + tcp_send_ack(sk); + } else { + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, + &sk->sk_tsq_flags)) + sock_hold(sk); + } + bh_unlock_sock(sk); + + sock_put(sk); + + return HRTIMER_NORESTART; +} + void tcp_init_xmit_timers(struct sock *sk) { inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, @@ -715,4 +736,8 @@ void tcp_init_xmit_timers(struct sock *sk) hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_SOFT); tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; + + hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED_SOFT); + tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick; }