From 0e7b6b7f31c9c38c3124aceb472245e496afb661 Mon Sep 17 00:00:00 2001 From: George Zhang Date: Wed, 28 Mar 2018 03:24:14 +0800 Subject: [PATCH] alinux: net/tcp: Support tunable tcp timeout value in TIME-WAIT state By default the tcp_tw_timeout value is 60 seconds. The minimum is 1 second and the maximum is 600 seconds. This setting is useful on systems under heavy TCP load. NOTE: setting tcp_tw_timeout below 60 seconds violates the "quiet time" restriction, and puts your system at risk of causing some old data to be accepted as new, or new data to be rejected as old duplicates, by some receivers. Link: http://web.archive.org/web/20150102003320/http://tools.ietf.org/html/rfc793 Signed-off-by: George Zhang Signed-off-by: Jiufei Xue Acked-by: Joseph Qi --- Documentation/networking/ip-sysctl.txt | 8 +++ fs/proc/proc_sysctl.c | 1 + include/linux/sysctl.h | 2 + include/net/netns/ipv4.h | 1 + include/uapi/linux/sysctl.h | 1 + kernel/sysctl.c | 71 ++++++++++++++++++++++++++ kernel/sysctl_binary.c | 1 + net/ipv4/sysctl_net_ipv4.c | 11 ++++ net/ipv4/tcp.c | 4 +- net/ipv4/tcp_input.c | 5 +- net/ipv4/tcp_ipv4.c | 2 + net/ipv4/tcp_minisocks.c | 9 ++-- net/ipv4/tcp_timer.c | 5 +- 13 files changed, 111 insertions(+), 10 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 7eb9366422f5..1f8c3f408393 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -695,6 +695,14 @@ tcp_tw_reuse - INTEGER experts. Default: 2 +tcp_tw_timeout - INTEGER + The length of time (in seconds) a connection will remain in the + TIME-WAIT state. The maximum value is 600 seconds, the minimum + value is 1 second. + It should not be changed without advice/request of technical + experts. + Default: 60 seconds + tcp_window_scaling - BOOLEAN Enable window scaling as defined in RFC1323. 
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c95f32b83a94..df29ff93c329 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1117,6 +1117,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) (table->proc_handler == proc_douintvec_minmax) || (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || + (table->proc_handler == proc_dointvec_jiffies_minmax) || (table->proc_handler == proc_dointvec_userhz_jiffies) || (table->proc_handler == proc_dointvec_ms_jiffies) || (table->proc_handler == proc_doulongvec_minmax) || diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b769ecfcc3bd..d36e15c6dcf9 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -53,6 +53,8 @@ extern int proc_douintvec_minmax(struct ctl_table *table, int write, loff_t *ppos); extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); +extern int proc_dointvec_jiffies_minmax(struct ctl_table *, int, + void __user *, size_t *, loff_t *); extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 366e2a60010e..2d123eaf46bb 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -168,6 +168,7 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + int sysctl_tcp_tw_timeout; const struct tcp_congestion_ops __rcu *tcp_congestion_control; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index d71013fffaf6..b5cb6ffe251b 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -425,6 +425,7 @@ enum NET_TCP_ALLOWED_CONG_CONTROL=123, NET_TCP_MAX_SSTHRESH=124, 
NET_TCP_FRTO_RESPONSE=125, + NET_IPV4_TCP_TW_TIMEOUT=126, }; enum { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f8576509c7be..b7fd05287475 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2706,6 +2706,70 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write, do_proc_dopipe_max_size_conv, NULL); } +struct do_proc_dointvec_jiffies_minmax_conv_param { + int *min; + int *max; +}; + +static int do_proc_dointvec_jiffies_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + int val = 0; + struct do_proc_dointvec_jiffies_minmax_conv_param *param = + (struct do_proc_dointvec_jiffies_minmax_conv_param *)data; + + if (write) { + if (*lvalp > INT_MAX / HZ) /* product must fit in int */ + return 1; + val = (*negp) ? -(*lvalp*HZ) : (*lvalp*HZ); + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -EINVAL; + *valp = val; + } else { + unsigned long lval; + val = *valp; + if (val < 0) { + *negp = true; + lval = (unsigned long)-val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = lval / HZ; + } + + return 0; +} + +/** + * proc_dointvec_jiffies_minmax - read a vector of integers as seconds with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. 
+ */ +int proc_dointvec_jiffies_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dointvec_jiffies_minmax_conv_param param = { + .min = (int *) table->extra1, + .max = (int *) table->extra2, + }; + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_jiffies_minmax_conv, &param); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -3220,6 +3284,12 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dointvec_jiffies_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3257,6 +3327,7 @@ EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); +EXPORT_SYMBOL(proc_dointvec_jiffies_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 07148b497451..57b9571cc203 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -358,6 +358,7 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, + { CTL_INT, NET_IPV4_TCP_TW_TIMEOUT, "tcp_tw_timeout" }, { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ad132b6e8cfa..08f3b21fd50d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -52,6 +52,8 @@ static int ip_ping_group_range_max[] = { 
GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; +static int tcp_tw_timeout_min = 1 * HZ; +static int tcp_tw_timeout_max = 600 * HZ; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -869,6 +871,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_tw_timeout", + .data = &init_net.ipv4.sysctl_tcp_tw_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies_minmax, + .extra1 = &tcp_tw_timeout_min, + .extra2 = &tcp_tw_timeout_max + }, { .procname = "tcp_orphan_retries", .data = &init_net.ipv4.sysctl_tcp_orphan_retries, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a7a804bece7a..e810b656d460 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2454,9 +2454,9 @@ void tcp_close(struct sock *sk, long timeout) } else { const int tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { + if (tmo > sock_net(sk)->ipv4.sysctl_tcp_tw_timeout) { inet_csk_reset_keepalive_timer(sk, - tmo - TCP_TIMEWAIT_LEN); + tmo - sock_net(sk)->ipv4.sysctl_tcp_tw_timeout); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 57e8dad956ec..0238b43d8f53 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6168,8 +6168,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + if (tmo > sock_net(sk)->ipv4.sysctl_tcp_tw_timeout) { + inet_csk_reset_keepalive_timer(sk, + tmo - sock_net(sk)->ipv4.sysctl_tcp_tw_timeout); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. 
* It is not a big problem, but it looks confusing diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bfec48849735..6a3e2c8b125a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2579,6 +2579,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; + net->ipv4.sysctl_tcp_tw_timeout = TCP_TIMEWAIT_LEN; + if (net != &init_net) { memcpy(net->ipv4.sysctl_tcp_rmem, init_net.ipv4.sysctl_tcp_rmem, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 12affb7864d9..26e1f38f259d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -94,6 +94,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; + struct net *net = sock_net((struct sock *)tw); tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { @@ -148,7 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout); return TCP_TW_ACK; } @@ -185,7 +186,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, return TCP_TW_SUCCESS; } } else { - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout); } if (tmp_opt.saw_tstamp) { @@ -236,7 +237,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * Do not reschedule in the last case. 
*/ if (paws_reject || th->ack) - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -309,7 +310,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) timeo = rto; if (state == TCP_TIME_WAIT) - timeo = TCP_TIMEWAIT_LEN; + timeo = sock_net(sk)->ipv4.sysctl_tcp_tw_timeout; /* tw_timer is pinned, so we need to make sure BH are disabled * in following section, otherwise timer handler could run before diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 681882a40968..d1fa0a5c5653 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -109,7 +109,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ - if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + if ((s32)(tcp_jiffies32 - tp->lsndtime) <= sock_net(sk)->ipv4.sysctl_tcp_tw_timeout || /* 2. Window is closed. */ (!tp->snd_wnd && !tp->packets_out)) do_reset = true; @@ -669,7 +669,8 @@ static void tcp_keepalive_timer (struct timer_list *t) tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { - const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; + const int tmo = tcp_fin_time(sk) - + sock_net(sk)->ipv4.sysctl_tcp_tw_timeout; if (tmo > 0) { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); -- GitLab