diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 2dc7a1d9768638e05ca3495b077064dc772bb773..f147310d9af4d28d5f0be6565129669d638cad9a 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -487,6 +487,18 @@ tcp_dma_copybreak - INTEGER and CONFIG_NET_DMA is enabled. Default: 4096 +tcp_thin_linear_timeouts - BOOLEAN + Enable dynamic triggering of linear timeouts for thin streams. + If set, a check is performed upon retransmission by timeout to + determine if the stream is thin (less than 4 packets in flight). + As long as the stream is found to be thin, up to 6 linear + timeouts may be performed before exponential backoff mode is + initiated. This improves retransmission latency for + non-aggressive thin streams, often found to be time-dependent. + For more information on thin streams, see + Documentation/networking/tcp-thin.txt + Default: 0 + UDP variables: udp_mem - vector of 3 INTEGERs: min, pressure, max diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 7fee8a4df93184145b7659dfd630710e1d6f5f96..3ba8b074612fb3e41ec862c969195a194fd1ce4c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -103,6 +103,7 @@ enum { #define TCP_CONGESTION 13 /* Congestion control algorithm */ #define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ #define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */ +#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ /* for TCP_INFO socket option */ #define TCPI_OPT_TIMESTAMPS 1 @@ -340,7 +341,9 @@ struct tcp_sock { u32 frto_highmark; /* snd_nxt when RTO occurred */ u16 advmss; /* Advertised MSS */ u8 frto_counter; /* Number of new acks after RTO */ - u8 nonagle; /* Disable Nagle algorithm? */ + u8 nonagle : 4,/* Disable Nagle algorithm? */ + thin_lto : 1,/* Use linear timeouts for thin streams */ + unused : 3; /* RTT measurement */ u32 srtt; /* smoothed round trip time << 3 */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 0bdc3f640247fab7c19d3ddb36e20d54599c27fe..6278fc734abd5d5b97c9f0bceaf251d1b2fca542 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_NAGLE_CORK 2 /* Socket is corked */ #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */ +/* TCP thin-stream limits */ +#define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */ + extern struct inet_timewait_death_row tcp_death_row; /* sysctl variables for tcp */ @@ -241,6 +244,7 @@ extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_max_ssthresh; extern int sysctl_tcp_cookie_size; +extern int sysctl_tcp_thin_linear_timeouts; extern atomic_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7e3712ce399476be6cfdbc631654a5863fb68bc4..e6a2460587d4e6755008e28c460829d9363fd9f4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -575,6 +575,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_thin_linear_timeouts", + .data = &sysctl_tcp_thin_linear_timeouts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "udp_mem", .data = &sysctl_udp_mem, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e471d037fcc9fed1e221b25d62e67b212bfd9e20..21bae9afefea5f8858a1744a8f70297d117cff91 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2229,6 +2229,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, } break; + case TCP_THIN_LINEAR_TIMEOUTS: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->thin_lto = val; + break; + case TCP_CORK: /* When set indicates to always queue non-full frames. * Later the user clears this option and we transmit diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index de7d1bf9114fe6d3bf6475be2f9031460c618e95..a17629b8912ee7134eaaa780957222b287bad6da 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL; int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; int sysctl_tcp_orphan_retries __read_mostly; +int sysctl_tcp_thin_linear_timeouts __read_mostly; static void tcp_write_timer(unsigned long); static void tcp_delack_timer(unsigned long); @@ -415,7 +416,25 @@ void tcp_retransmit_timer(struct sock *sk) icsk->icsk_retransmits++; out_reset_timer: - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this + * might be increased if the stream oscillates between thin and thick, + * thus the old value might already be too high compared to the value + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without + * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating + * exponential backoff behaviour to avoid continue hammering + * linear-timeout retransmissions into a black hole + */ + if (sk->sk_state == TCP_ESTABLISHED && + (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && + tcp_stream_is_thin(tp) && + icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { + icsk->icsk_backoff = 0; + icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX); + } else { + /* Use normal (exponential) backoff */ + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) __sk_dst_reset(sk);