提交 2c47a65b 编写于 作者: D David S. Miller

Merge branch 'tcp-implement-SACK-compression'

Eric Dumazet says:

====================
tcp: implement SACK compression

When TCP receives an out-of-order packet, it immediately sends
a SACK packet, generating network load but also forcing the
receiver to send 1-MSS pathological packets, increasing its
RTX queue length/depth, and thus processing time.

Wifi networks suffer from this aggressive behavior, but generally
speaking, all these SACK packets add fuel to the fire when networks
are under congestion.

This patch series adds SACK compression, but the infrastructure
could be leveraged to also compress ACK in the future.

v2: Addressed Neal feedback.
    Added two sysctls to allow fine tuning, or even disabling the feature.

v3: take rtt = min(srtt, rcv_rtt) as Yuchung suggested, because rcv_rtt
    can be over estimated for RPC (or sender limited)
====================
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
...@@ -525,6 +525,19 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max ...@@ -525,6 +525,19 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
tcp_sack - BOOLEAN tcp_sack - BOOLEAN
Enable select acknowledgments (SACKS). Enable select acknowledgments (SACKS).
tcp_comp_sack_delay_ns - LONG INTEGER
TCP tries to reduce number of SACK sent, using a timer
based on 5% of SRTT, capped by this sysctl, in nano seconds.
The default is 1ms, based on TSO autosizing period.
Default : 1,000,000 ns (1 ms)
tcp_comp_sack_nr - INTEGER
Max numer of SACK that can be compressed.
Using 0 disables SACK compression.
Detault : 44
tcp_slow_start_after_idle - BOOLEAN tcp_slow_start_after_idle - BOOLEAN
If set, provide RFC2861 behavior and time out the congestion If set, provide RFC2861 behavior and time out the congestion
window after an idle period. An idle period is defined at window after an idle period. An idle period is defined at
......
...@@ -218,6 +218,7 @@ struct tcp_sock { ...@@ -218,6 +218,7 @@ struct tcp_sock {
reord:1; /* reordering detected */ reord:1; /* reordering detected */
} rack; } rack;
u16 advmss; /* Advertised MSS */ u16 advmss; /* Advertised MSS */
u8 compressed_ack;
u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_start; /* Start time in jiffies of a TCP chrono */
u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
u8 chrono_type:2, /* current chronograph type */ u8 chrono_type:2, /* current chronograph type */
...@@ -297,6 +298,7 @@ struct tcp_sock { ...@@ -297,6 +298,7 @@ struct tcp_sock {
u32 sacked_out; /* SACK'd packets */ u32 sacked_out; /* SACK'd packets */
struct hrtimer pacing_timer; struct hrtimer pacing_timer;
struct hrtimer compressed_ack_timer;
/* from STCP, retrans queue hinting */ /* from STCP, retrans queue hinting */
struct sk_buff* lost_skb_hint; struct sk_buff* lost_skb_hint;
......
...@@ -160,6 +160,8 @@ struct netns_ipv4 { ...@@ -160,6 +160,8 @@ struct netns_ipv4 {
int sysctl_tcp_pacing_ca_ratio; int sysctl_tcp_pacing_ca_ratio;
int sysctl_tcp_wmem[3]; int sysctl_tcp_wmem[3];
int sysctl_tcp_rmem[3]; int sysctl_tcp_rmem[3];
int sysctl_tcp_comp_sack_nr;
unsigned long sysctl_tcp_comp_sack_delay_ns;
struct inet_timewait_death_row tcp_death_row; struct inet_timewait_death_row tcp_death_row;
int sysctl_max_syn_backlog; int sysctl_max_syn_backlog;
int sysctl_tcp_fastopen; int sysctl_tcp_fastopen;
......
...@@ -559,7 +559,10 @@ void tcp_init_xmit_timers(struct sock *); ...@@ -559,7 +559,10 @@ void tcp_init_xmit_timers(struct sock *);
static inline void tcp_clear_xmit_timers(struct sock *sk) static inline void tcp_clear_xmit_timers(struct sock *sk)
{ {
if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1) if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1)
sock_put(sk); __sock_put(sk);
if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1)
__sock_put(sk);
inet_csk_clear_xmit_timers(sk); inet_csk_clear_xmit_timers(sk);
} }
......
...@@ -278,6 +278,7 @@ enum ...@@ -278,6 +278,7 @@ enum
LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */ LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */
LINUX_MIB_TCPDELIVERED, /* TCPDelivered */ LINUX_MIB_TCPDELIVERED, /* TCPDelivered */
LINUX_MIB_TCPDELIVEREDCE, /* TCPDeliveredCE */ LINUX_MIB_TCPDELIVEREDCE, /* TCPDeliveredCE */
LINUX_MIB_TCPACKCOMPRESSED, /* TCPAckCompressed */
__LINUX_MIB_MAX __LINUX_MIB_MAX
}; };
......
...@@ -298,6 +298,7 @@ static const struct snmp_mib snmp4_net_list[] = { ...@@ -298,6 +298,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
SNMP_MIB_SENTINEL SNMP_MIB_SENTINEL
}; };
......
...@@ -46,6 +46,7 @@ static int tcp_syn_retries_min = 1; ...@@ -46,6 +46,7 @@ static int tcp_syn_retries_min = 1;
static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
static int comp_sack_nr_max = 255;
/* obsolete */ /* obsolete */
static int sysctl_tcp_low_latency __read_mostly; static int sysctl_tcp_low_latency __read_mostly;
...@@ -1151,6 +1152,22 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -1151,6 +1152,22 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_dointvec_minmax,
.extra1 = &one, .extra1 = &one,
}, },
{
.procname = "tcp_comp_sack_delay_ns",
.data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
{
.procname = "tcp_comp_sack_nr",
.data = &init_net.ipv4.sysctl_tcp_comp_sack_nr,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &comp_sack_nr_max,
},
{ {
.procname = "udp_rmem_min", .procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min, .data = &init_net.ipv4.sysctl_udp_rmem_min,
......
...@@ -2595,6 +2595,7 @@ int tcp_disconnect(struct sock *sk, int flags) ...@@ -2595,6 +2595,7 @@ int tcp_disconnect(struct sock *sk, int flags)
dst_release(sk->sk_rx_dst); dst_release(sk->sk_rx_dst);
sk->sk_rx_dst = NULL; sk->sk_rx_dst = NULL;
tcp_saved_syn_free(tp); tcp_saved_syn_free(tp);
tp->compressed_ack = 0;
/* Clean up fastopen related fields */ /* Clean up fastopen related fields */
tcp_free_fastopen_req(tp); tcp_free_fastopen_req(tp);
......
...@@ -4249,6 +4249,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) ...@@ -4249,6 +4249,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
* If the sack array is full, forget about the last one. * If the sack array is full, forget about the last one.
*/ */
if (this_sack >= TCP_NUM_SACKS) { if (this_sack >= TCP_NUM_SACKS) {
if (tp->compressed_ack)
tcp_send_ack(sk);
this_sack--; this_sack--;
tp->rx_opt.num_sacks--; tp->rx_opt.num_sacks--;
sp--; sp--;
...@@ -4715,8 +4717,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) ...@@ -4715,8 +4717,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
goto out_of_window; goto out_of_window;
tcp_enter_quickack_mode(sk);
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */ /* Partial packet, seq < rcv_next < end_seq */
SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
...@@ -5083,6 +5083,7 @@ static inline void tcp_data_snd_check(struct sock *sk) ...@@ -5083,6 +5083,7 @@ static inline void tcp_data_snd_check(struct sock *sk)
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
unsigned long rtt, delay;
/* More than one full frame received... */ /* More than one full frame received... */
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
...@@ -5094,15 +5095,36 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) ...@@ -5094,15 +5095,36 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
__tcp_select_window(sk) >= tp->rcv_wnd)) || __tcp_select_window(sk) >= tp->rcv_wnd)) ||
/* We ACK each frame or... */ /* We ACK each frame or... */
tcp_in_quickack_mode(sk) || tcp_in_quickack_mode(sk)) {
/* We have out of order data. */ send_now:
(ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
/* Then ack it now */
tcp_send_ack(sk); tcp_send_ack(sk);
} else { return;
/* Else, send delayed ack. */ }
if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_send_delayed_ack(sk); tcp_send_delayed_ack(sk);
return;
} }
if (!tcp_is_sack(tp) ||
tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
goto send_now;
tp->compressed_ack++;
if (hrtimer_is_queued(&tp->compressed_ack_timer))
return;
/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
rtt = tp->rcv_rtt_est.rtt_us;
if (tp->srtt_us && tp->srtt_us < rtt)
rtt = tp->srtt_us;
delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
rtt * (NSEC_PER_USEC >> 3)/20);
sock_hold(sk);
hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
HRTIMER_MODE_REL_PINNED_SOFT);
} }
static inline void tcp_ack_snd_check(struct sock *sk) static inline void tcp_ack_snd_check(struct sock *sk)
......
...@@ -2572,6 +2572,8 @@ static int __net_init tcp_sk_init(struct net *net) ...@@ -2572,6 +2572,8 @@ static int __net_init tcp_sk_init(struct net *net)
init_net.ipv4.sysctl_tcp_wmem, init_net.ipv4.sysctl_tcp_wmem,
sizeof(init_net.ipv4.sysctl_tcp_wmem)); sizeof(init_net.ipv4.sysctl_tcp_wmem));
} }
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
net->ipv4.sysctl_tcp_comp_sack_nr = 44;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
......
...@@ -162,6 +162,15 @@ static void tcp_event_data_sent(struct tcp_sock *tp, ...@@ -162,6 +162,15 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
/* Account for an ACK we sent. */ /* Account for an ACK we sent. */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{ {
struct tcp_sock *tp = tcp_sk(sk);
if (unlikely(tp->compressed_ack)) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
tp->compressed_ack);
tp->compressed_ack = 0;
if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
__sock_put(sk);
}
tcp_dec_quickack_mode(sk, pkts); tcp_dec_quickack_mode(sk, pkts);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
} }
......
...@@ -708,6 +708,27 @@ static void tcp_keepalive_timer (struct timer_list *t) ...@@ -708,6 +708,27 @@ static void tcp_keepalive_timer (struct timer_list *t)
sock_put(sk); sock_put(sk);
} }
static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
{
struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer);
struct sock *sk = (struct sock *)tp;
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
if (tp->compressed_ack)
tcp_send_ack(sk);
} else {
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
&sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
sock_put(sk);
return HRTIMER_NORESTART;
}
void tcp_init_xmit_timers(struct sock *sk) void tcp_init_xmit_timers(struct sock *sk)
{ {
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
...@@ -715,4 +736,8 @@ void tcp_init_xmit_timers(struct sock *sk) ...@@ -715,4 +736,8 @@ void tcp_init_xmit_timers(struct sock *sk)
hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_PINNED_SOFT); HRTIMER_MODE_ABS_PINNED_SOFT);
tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL_PINNED_SOFT);
tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick;
} }
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册