diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 7eb9366422f54b451d5818a211b8adc4d5eed1ea..1f8c3f408393d0cf24ec4cc23e193b3f1eb45b2b 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -695,6 +695,14 @@ tcp_tw_reuse - INTEGER experts. Default: 2 +tcp_tw_timeout - INTEGER + The length of time (in seconds) a connection will remain in the + TIME-WAIT state. The maximum value is 600 seconds, the minimum + value is 1 second. + It should not be changed without advice/request of technical + experts. + Default: 60 seconds + tcp_window_scaling - BOOLEAN Enable window scaling as defined in RFC1323. diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c95f32b83a942c3b39350bc76bde54800c4d53fe..df29ff93c329c8bc4790bf6cff1142f7143f7ac9 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1117,6 +1117,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) (table->proc_handler == proc_douintvec_minmax) || (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || + (table->proc_handler == proc_dointvec_jiffies_minmax) || (table->proc_handler == proc_dointvec_userhz_jiffies) || (table->proc_handler == proc_dointvec_ms_jiffies) || (table->proc_handler == proc_doulongvec_minmax) || diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b769ecfcc3bd41aad6fd339ba824c6bb622ac24d..d36e15c6dcf983f697eb184d43973db7e4d6c5c5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -53,6 +53,8 @@ extern int proc_douintvec_minmax(struct ctl_table *table, int write, loff_t *ppos); extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); +extern int proc_dointvec_jiffies_minmax(struct ctl_table *, int, + void __user *, size_t *, loff_t *); extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int 
proc_dointvec_ms_jiffies(struct ctl_table *, int,
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 366e2a60010e817c1275584ebef31eea2ff31096..2d123eaf46bb34ad9f6632e948ee8e78dbb8e218 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -168,6 +168,7 @@ struct netns_ipv4 {
 	struct inet_timewait_death_row tcp_death_row;
 	int sysctl_max_syn_backlog;
 	int sysctl_tcp_fastopen;
+	int sysctl_tcp_tw_timeout;
 	const struct tcp_congestion_ops __rcu *tcp_congestion_control;
 	struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
 	spinlock_t tcp_fastopen_ctx_lock;
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index d71013fffaf65b520420a4d04d2347f450c2f598..b5cb6ffe251b7f9e3b31a49e3bd775049e0db275 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -425,6 +425,7 @@ enum
 	NET_TCP_ALLOWED_CONG_CONTROL=123,
 	NET_TCP_MAX_SSTHRESH=124,
 	NET_TCP_FRTO_RESPONSE=125,
+	NET_IPV4_TCP_TW_TIMEOUT=126,
 };
 
 enum {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f8576509c7bef2c086b60e8b3df0f91d5f1879cf..b7fd0528747504bbe72829abfa63b2942a38b860 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2706,6 +2706,82 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write,
 				 do_proc_dopipe_max_size_conv, NULL);
 }
 
+/*
+ * Bounds for proc_dointvec_jiffies_minmax(), both expressed in jiffies.
+ * A NULL pointer means that side of the range is not checked.
+ */
+struct do_proc_dointvec_jiffies_minmax_conv_param {
+	int *min;
+	int *max;
+};
+
+/*
+ * Convert between seconds (user side) and jiffies (kernel side), rejecting
+ * writes whose jiffies value falls outside [*param->min, *param->max].
+ */
+static int do_proc_dointvec_jiffies_minmax_conv(bool *negp, unsigned long *lvalp,
+						int *valp,
+						int write, void *data)
+{
+	int val = 0;
+	struct do_proc_dointvec_jiffies_minmax_conv_param *param =
+		(struct do_proc_dointvec_jiffies_minmax_conv_param *)data;
+
+	if (write) {
+		/*
+		 * The result is stored in an int: bound by INT_MAX, not
+		 * LONG_MAX, so a huge input cannot wrap into the valid range.
+		 */
+		if (*lvalp > INT_MAX / HZ)
+			return 1;
+		val = (*negp) ? -(*lvalp*HZ) : (*lvalp*HZ);
+		if ((param->min && *param->min > val) ||
+		    (param->max && *param->max < val))
+			return -EINVAL;
+		*valp = val;
+	} else {
+		unsigned long lval;
+		val = *valp;
+		if (val < 0) {
+			*negp = true;
+			lval = (unsigned long)-val;
+		} else {
+			*negp = false;
+			lval = (unsigned long)val;
+		}
+		*lvalp = lval / HZ;
+	}
+
+	return 0;
+}
+
+/**
+ * proc_dointvec_jiffies_minmax - read a vector of integers as seconds with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_jiffies_minmax(struct ctl_table *table, int write,
+			  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct do_proc_dointvec_jiffies_minmax_conv_param param = {
+		.min = (int *) table->extra1,
+		.max = (int *) table->extra2,
+	};
+	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+				do_proc_dointvec_jiffies_minmax_conv, &param);
+}
+
 static void validate_coredump_safety(void)
 {
 #ifdef CONFIG_COREDUMP
@@ -3220,6 +3296,12 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
+int proc_dointvec_jiffies_minmax(struct ctl_table *table, int write,
+		    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -3257,6 +3339,7 @@ EXPORT_SYMBOL(proc_douintvec);
 EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
+EXPORT_SYMBOL(proc_dointvec_jiffies_minmax);
 EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 07148b4974516cec45b934e5ed5881e1554e7af0..57b9571cc2037009f001f195eaae7b096d707b37 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -358,6 +358,7 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, + { CTL_INT, NET_IPV4_TCP_TW_TIMEOUT, "tcp_tw_timeout" }, { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ad132b6e8cfade62e1e8aa46a941fcde3cda9f67..08f3b21fd50dd2fdf1172cea327748597c1550a9 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -52,6 +52,8 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; +static int tcp_tw_timeout_min = 1 * HZ; +static int tcp_tw_timeout_max = 600 * HZ; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -869,6 +871,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_tw_timeout", + .data = &init_net.ipv4.sysctl_tcp_tw_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies_minmax, + .extra1 = &tcp_tw_timeout_min, + .extra2 = &tcp_tw_timeout_max + }, { .procname = "tcp_orphan_retries", .data = &init_net.ipv4.sysctl_tcp_orphan_retries, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a7a804bece7aca273e64ac4b41bcc07654e1df5f..e810b656d460d2349aa866c37d6d0e4a95b76966 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2454,9 +2454,9 @@ void tcp_close(struct sock 
*sk, long timeout) } else { const int tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { + if (tmo > sock_net(sk)->ipv4.sysctl_tcp_tw_timeout) { inet_csk_reset_keepalive_timer(sk, - tmo - TCP_TIMEWAIT_LEN); + tmo - sock_net(sk)->ipv4.sysctl_tcp_tw_timeout); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 57e8dad956ec4269b7f6dc9aca2b5ce28fa60e9a..0238b43d8f5318f83be5f259421fb5193ae0dd25 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6168,8 +6168,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + if (tmo > sock_net(sk)->ipv4.sysctl_tcp_tw_timeout) { + inet_csk_reset_keepalive_timer(sk, + tmo - sock_net(sk)->ipv4.sysctl_tcp_tw_timeout); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bfec4884973534480823a4792e5b58ee6984fc77..6a3e2c8b125a53dc4dfc175bd9659fd7f819fe65 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2579,6 +2579,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; + net->ipv4.sysctl_tcp_tw_timeout = TCP_TIMEWAIT_LEN; + if (net != &init_net) { memcpy(net->ipv4.sysctl_tcp_rmem, init_net.ipv4.sysctl_tcp_rmem, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 12affb7864d981a6494059232c4965aaee756803..26e1f38f259d2f14519c8f802de5a2404cd5507e 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -94,6 +94,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 
bool paws_reject = false; + struct net *net = sock_net((struct sock *)tw); tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { @@ -148,7 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout); return TCP_TW_ACK; } @@ -185,7 +186,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, return TCP_TW_SUCCESS; } } else { - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout); } if (tmp_opt.saw_tstamp) { @@ -236,7 +237,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -309,7 +310,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) timeo = rto; if (state == TCP_TIME_WAIT) - timeo = TCP_TIMEWAIT_LEN; + timeo = sock_net(sk)->ipv4.sysctl_tcp_tw_timeout; /* tw_timer is pinned, so we need to make sure BH are disabled * in following section, otherwise timer handler could run before diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 681882a409686743b18e4eac614b66a1c3cdec37..d1fa0a5c5653915d5c46f7497c89279d84f353bb 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -109,7 +109,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ - if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + if ((s32)(tcp_jiffies32 - tp->lsndtime) <= sock_net(sk)->ipv4.sysctl_tcp_tw_timeout || /* 2. Window is closed. 
*/ (!tp->snd_wnd && !tp->packets_out)) do_reset = true; @@ -669,7 +669,8 @@ static void tcp_keepalive_timer (struct timer_list *t) tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { - const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; + const int tmo = tcp_fin_time(sk) - + sock_net(sk)->ipv4.sysctl_tcp_tw_timeout; if (tmo > 0) { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);