From f354337fe74ca09a3d5a020a0ea8a139031d3b7d Mon Sep 17 00:00:00 2001 From: George Zhang Date: Wed, 28 Mar 2018 03:24:14 +0800 Subject: [PATCH] net/tcp: Support tunable tcp timeout value in TIME-WAIT state By default the tcp_tw_timeout value is 60 seconds. The minimum is 1 second and the maximum is 600 seconds. This setting is useful on systems under heavy TCP load. NOTE: setting tcp_tw_timeout below 60 seconds violates the "quiet time" restriction, and puts your system at risk of causing some old data to be accepted as new, or new data to be rejected as old duplicates, by some receivers. Link: http://web.archive.org/web/20150102003320/http://tools.ietf.org/html/rfc793 Signed-off-by: George Zhang Signed-off-by: Jiufei Xue Acked-by: Joseph Qi --- Documentation/networking/ip-sysctl.txt | 8 +++ fs/proc/proc_sysctl.c | 1 + include/linux/sysctl.h | 2 + include/net/netns/ipv4.h | 1 + include/uapi/linux/sysctl.h | 1 + kernel/sysctl.c | 71 ++++++++++++++++++++++++++ kernel/sysctl_binary.c | 1 + net/ipv4/sysctl_net_ipv4.c | 11 ++++ net/ipv4/tcp.c | 4 +- net/ipv4/tcp_input.c | 5 +- net/ipv4/tcp_ipv4.c | 2 + net/ipv4/tcp_minisocks.c | 9 ++-- net/ipv4/tcp_timer.c | 5 +- 13 files changed, 111 insertions(+), 10 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 2c31208528d5..c7c46c6349d2 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -687,6 +687,14 @@ tcp_tw_reuse - INTEGER experts. Default: 2 +tcp_tw_timeout - INTEGER + The length of time (in seconds) a connection will remain in the + TIME-WAIT state. The maximum value is 600 seconds, the minimum + value is 1 second. + It should not be changed without advice/request of technical + experts. + Default: 60 seconds + tcp_window_scaling - BOOLEAN Enable window scaling as defined in RFC1323. 
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 7325baa8f9d4..18b48a46de90 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1113,6 +1113,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) (table->proc_handler == proc_douintvec_minmax) || (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || + (table->proc_handler == proc_dointvec_jiffies_minmax) || (table->proc_handler == proc_dointvec_userhz_jiffies) || (table->proc_handler == proc_dointvec_ms_jiffies) || (table->proc_handler == proc_doulongvec_minmax) || diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b769ecfcc3bd..d36e15c6dcf9 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -53,6 +53,8 @@ extern int proc_douintvec_minmax(struct ctl_table *table, int write, loff_t *ppos); extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); +extern int proc_dointvec_jiffies_minmax(struct ctl_table *, int, + void __user *, size_t *, loff_t *); extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e47503b4e4d1..75b1cfbdea72 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -166,6 +166,7 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + int sysctl_tcp_tw_timeout; const struct tcp_congestion_ops __rcu *tcp_congestion_control; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index d71013fffaf6..b5cb6ffe251b 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -425,6 +425,7 @@ enum NET_TCP_ALLOWED_CONG_CONTROL=123, NET_TCP_MAX_SSTHRESH=124, 
NET_TCP_FRTO_RESPONSE=125, + NET_IPV4_TCP_TW_TIMEOUT=126, }; enum { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9a85c7ae7362..7eee14cc24c0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2706,6 +2706,70 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write, do_proc_dopipe_max_size_conv, NULL); } +struct do_proc_dointvec_jiffies_minmax_conv_param { + int *min; + int *max; +}; + +static int do_proc_dointvec_jiffies_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + int val = 0; + struct do_proc_dointvec_jiffies_minmax_conv_param *param = + (struct do_proc_dointvec_jiffies_minmax_conv_param *)data; + + if (write) { + if (*lvalp > INT_MAX / HZ) /* *lvalp * HZ must fit in the int val */ + return 1; + val = (*negp) ? -(*lvalp*HZ) : (*lvalp*HZ); + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -EINVAL; + *valp = val; + } else { + unsigned long lval; + val = *valp; + if (val < 0) { + *negp = true; + lval = (unsigned long)-val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = lval / HZ; + } + + return 0; +} + +/** + * proc_dointvec_jiffies_minmax - read a vector of integers as seconds with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max), both given in jiffies. + * + * Returns 0 on success. 
+ */ +int proc_dointvec_jiffies_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dointvec_jiffies_minmax_conv_param param = { + .min = (int *) table->extra1, /* lower bound, may be NULL */ + .max = (int *) table->extra2, /* upper bound, may be NULL */ + }; + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_jiffies_minmax_conv, &param); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -3218,6 +3282,12 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dointvec_jiffies_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3255,6 +3325,7 @@ EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); +EXPORT_SYMBOL(proc_dointvec_jiffies_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 07148b497451..57b9571cc203 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -358,6 +358,7 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, + { CTL_INT, NET_IPV4_TCP_TW_TIMEOUT, "tcp_tw_timeout" }, { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ce64453d337d..544887d7304b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -50,6 +50,8 @@ static int ip_ping_group_range_max[] = { 
GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; +static int tcp_tw_timeout_min = 1 * HZ; +static int tcp_tw_timeout_max = 600 * HZ; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -858,6 +860,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_tw_timeout", + .data = &init_net.ipv4.sysctl_tcp_tw_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies_minmax, + .extra1 = &tcp_tw_timeout_min, + .extra2 = &tcp_tw_timeout_max + }, { .procname = "tcp_orphan_retries", .data = &init_net.ipv4.sysctl_tcp_orphan_retries, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 30c6e94b06c4..58622d60f509 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2443,9 +2443,9 @@ void tcp_close(struct sock *sk, long timeout) } else { const int tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { + if (tmo > sock_net(sk)->ipv4.sysctl_tcp_tw_timeout) { inet_csk_reset_keepalive_timer(sk, - tmo - TCP_TIMEWAIT_LEN); + tmo - sock_net(sk)->ipv4.sysctl_tcp_tw_timeout); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cfdd70e32755..ba0ffdbba180 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6175,8 +6175,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + if (tmo > sock_net(sk)->ipv4.sysctl_tcp_tw_timeout) { + inet_csk_reset_keepalive_timer(sk, + tmo - sock_net(sk)->ipv4.sysctl_tcp_tw_timeout); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. 
* It is not a big problem, but it looks confusing diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 11101cf8693b..4622d55ab101 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2578,6 +2578,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; + net->ipv4.sysctl_tcp_tw_timeout = TCP_TIMEWAIT_LEN; + if (net != &init_net) { memcpy(net->ipv4.sysctl_tcp_rmem, init_net.ipv4.sysctl_tcp_rmem, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 12affb7864d9..26e1f38f259d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -94,6 +94,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; + struct net *net = sock_net((struct sock *)tw); tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { @@ -148,7 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout); return TCP_TW_ACK; } @@ -185,7 +186,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, return TCP_TW_SUCCESS; } } else { - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout); } if (tmp_opt.saw_tstamp) { @@ -236,7 +237,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * Do not reschedule in the last case. 
*/ if (paws_reject || th->ack) - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -309,7 +310,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) timeo = rto; if (state == TCP_TIME_WAIT) - timeo = TCP_TIMEWAIT_LEN; + timeo = sock_net(sk)->ipv4.sysctl_tcp_tw_timeout; /* tw_timer is pinned, so we need to make sure BH are disabled * in following section, otherwise timer handler could run before diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b1b5a648def6..b49427dd1136 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -109,7 +109,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ - if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + if ((s32)(tcp_jiffies32 - tp->lsndtime) <= sock_net(sk)->ipv4.sysctl_tcp_tw_timeout || /* 2. Window is closed. */ (!tp->snd_wnd && !tp->packets_out)) do_reset = true; @@ -669,7 +669,8 @@ static void tcp_keepalive_timer (struct timer_list *t) tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { - const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; + const int tmo = tcp_fin_time(sk) - + sock_net(sk)->ipv4.sysctl_tcp_tw_timeout; if (tmo > 0) { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); -- GitLab