diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 2c31208528d565ef662c997b5f3c71300092264a..c7c46c6349d225f752acb369061f7976600ca0c8 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -687,6 +687,14 @@ tcp_tw_reuse - INTEGER experts. Default: 2 +tcp_tw_timeout - INTEGER + The length of time (in seconds) a connection will remain in the + TIME-WAIT state. The maximum value is 600 seconds, the minimum + value is 1 second. + It should not be changed without advice/request of technical + experts. + Default: 60 seconds + tcp_window_scaling - BOOLEAN Enable window scaling as defined in RFC1323. diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 7325baa8f9d474f166c1bbef54b584a028b287fb..18b48a46de908459060ee81e787ea36c95a90831 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1113,6 +1113,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) (table->proc_handler == proc_douintvec_minmax) || (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || + (table->proc_handler == proc_dointvec_jiffies_minmax) || (table->proc_handler == proc_dointvec_userhz_jiffies) || (table->proc_handler == proc_dointvec_ms_jiffies) || (table->proc_handler == proc_doulongvec_minmax) || diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b769ecfcc3bd41aad6fd339ba824c6bb622ac24d..d36e15c6dcf983f697eb184d43973db7e4d6c5c5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -53,6 +53,8 @@ extern int proc_douintvec_minmax(struct ctl_table *table, int write, loff_t *ppos); extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); +extern int proc_dointvec_jiffies_minmax(struct ctl_table *, int, + void __user *, size_t *, loff_t *); extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int 
proc_dointvec_ms_jiffies(struct ctl_table *, int, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 622db6bc2f02da28e7cb2c1b3283e6fe9f6e375c..436e0a5a0de2e79253302eb6bd9e2d9dbffb83fd 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -167,6 +167,7 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + int sysctl_tcp_tw_timeout; const struct tcp_congestion_ops __rcu *tcp_congestion_control; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index d71013fffaf65b520420a4d04d2347f450c2f598..b5cb6ffe251b7f9e3b31a49e3bd775049e0db275 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -425,6 +425,7 @@ enum NET_TCP_ALLOWED_CONG_CONTROL=123, NET_TCP_MAX_SSTHRESH=124, NET_TCP_FRTO_RESPONSE=125, + NET_IPV4_TCP_TW_TIMEOUT=126, }; enum { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9a85c7ae7362106e4b2b9ffb5290dab09c95cf23..7eee14cc24c04f6ab5255d46df7bbe69df21cfbc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2706,6 +2706,70 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write, do_proc_dopipe_max_size_conv, NULL); } +struct do_proc_dointvec_jiffies_minmax_conv_param { + int *min; + int *max; +}; + +static int do_proc_dointvec_jiffies_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + int val = 0; + struct do_proc_dointvec_jiffies_minmax_conv_param *param = + (struct do_proc_dointvec_jiffies_minmax_conv_param *)data; + + if (write) { + if (*lvalp > LONG_MAX / HZ) + return 1; + val = (*negp) ? 
-(*lvalp*HZ) : (*lvalp*HZ); + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -EINVAL; + *valp = val; + } else { + unsigned long lval; + val = *valp; + if (val < 0) { + *negp = true; + lval = (unsigned long)-val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = lval / HZ; + } + + return 0; +} + +/** + * proc_dointvec_jiffies_minmax - read a vector of integers as seconds with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_dointvec_jiffies_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dointvec_jiffies_minmax_conv_param param = { + .min = (int *) table->extra1, + .max = (int *) table->extra2, + }; + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_jiffies_minmax_conv, &param); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -3218,6 +3282,12 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dointvec_jiffies_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3255,6 +3325,7 @@ EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); +EXPORT_SYMBOL(proc_dointvec_jiffies_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); 
EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 07148b4974516cec45b934e5ed5881e1554e7af0..57b9571cc2037009f001f195eaae7b096d707b37 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -358,6 +358,7 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, + { CTL_INT, NET_IPV4_TCP_TW_TIMEOUT, "tcp_tw_timeout" }, { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ce64453d337d34aab305e07ad598150f242d3126..544887d7304b1b752b1e924f342cd997ddedbec8 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -50,6 +50,8 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; +static int tcp_tw_timeout_min = 1 * HZ; +static int tcp_tw_timeout_max = 600 * HZ; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -858,6 +860,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_tw_timeout", + .data = &init_net.ipv4.sysctl_tcp_tw_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies_minmax, + .extra1 = &tcp_tw_timeout_min, + .extra2 = &tcp_tw_timeout_max + }, { .procname = "tcp_orphan_retries", .data = &init_net.ipv4.sysctl_tcp_orphan_retries, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 30c6e94b06c49d2b5a18d5708560b02cc9f48aae..58622d60f5096d11daad2d44b0e253952d8b82ae 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2443,9 +2443,9 @@ void tcp_close(struct sock 
*sk, long timeout) } else { const int tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { + if (tmo > sock_net(sk)->ipv4.sysctl_tcp_tw_timeout) { inet_csk_reset_keepalive_timer(sk, - tmo - TCP_TIMEWAIT_LEN); + tmo - sock_net(sk)->ipv4.sysctl_tcp_tw_timeout); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cfdd70e32755e6328229ccc02b690189b60290fb..ba0ffdbba180d7ec4436fc9316c4f4dede0bb9dd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6175,8 +6175,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + if (tmo > sock_net(sk)->ipv4.sysctl_tcp_tw_timeout) { + inet_csk_reset_keepalive_timer(sk, + tmo - sock_net(sk)->ipv4.sysctl_tcp_tw_timeout); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 11101cf8693b1dc2fe898e8fef1c05b22dd1cc9d..4622d55ab1015dab85d7315d3976fcb1d3a8de96 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2578,6 +2578,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; + net->ipv4.sysctl_tcp_tw_timeout = TCP_TIMEWAIT_LEN; + if (net != &init_net) { memcpy(net->ipv4.sysctl_tcp_rmem, init_net.ipv4.sysctl_tcp_rmem, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 12affb7864d981a6494059232c4965aaee756803..26e1f38f259d2f14519c8f802de5a2404cd5507e 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -94,6 +94,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 
bool paws_reject = false; + struct net *net = sock_net((struct sock *)tw); tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { @@ -148,7 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout); return TCP_TW_ACK; } @@ -185,7 +186,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, return TCP_TW_SUCCESS; } } else { - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout); } if (tmp_opt.saw_tstamp) { @@ -236,7 +237,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -309,7 +310,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) timeo = rto; if (state == TCP_TIME_WAIT) - timeo = TCP_TIMEWAIT_LEN; + timeo = sock_net(sk)->ipv4.sysctl_tcp_tw_timeout; /* tw_timer is pinned, so we need to make sure BH are disabled * in following section, otherwise timer handler could run before diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b1b5a648def6cd263bd4d8c927488243acfca255..b49427dd1136d40cfa1be3f4d60a115ef4f8bb1f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -109,7 +109,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ - if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + if ((s32)(tcp_jiffies32 - tp->lsndtime) <= sock_net(sk)->ipv4.sysctl_tcp_tw_timeout || /* 2. Window is closed. 
*/ (!tp->snd_wnd && !tp->packets_out)) do_reset = true; @@ -669,7 +669,8 @@ static void tcp_keepalive_timer (struct timer_list *t) tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { - const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; + const int tmo = tcp_fin_time(sk) - + sock_net(sk)->ipv4.sysctl_tcp_tw_timeout; if (tmo > 0) { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);