提交 f354337f 编写于 作者: G George Zhang 提交者: Jiufei Xue

net/tcp: Support tunable tcp timeout value in TIME-WAIT state

By default the tcp_tw_timeout value is 60 seconds. The minimum is
1 second and the maximum is 600 seconds. This setting is useful on
systems under heavy TCP load.

NOTE: setting tcp_tw_timeout below 60 seconds violates the "quiet time"
restriction and puts your system at risk of some receivers accepting old
data as new, or rejecting new data as old duplicates.

Link: http://web.archive.org/web/20150102003320/http://tools.ietf.org/html/rfc793
Signed-off-by: George Zhang <georgezhang@linux.alibaba.com>
Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com>
上级 b8cff60d
...@@ -687,6 +687,14 @@ tcp_tw_reuse - INTEGER ...@@ -687,6 +687,14 @@ tcp_tw_reuse - INTEGER
experts. experts.
Default: 2 Default: 2
tcp_tw_timeout - INTEGER
The length of time (in seconds) a connection will remain in the
TIME-WAIT state. The maximum value is 600 seconds, the minimum
value is 1 second.
It should not be changed without advice/request of technical
experts.
Default: 60 seconds
tcp_window_scaling - BOOLEAN tcp_window_scaling - BOOLEAN
Enable window scaling as defined in RFC1323. Enable window scaling as defined in RFC1323.
......
...@@ -1113,6 +1113,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) ...@@ -1113,6 +1113,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table)
(table->proc_handler == proc_douintvec_minmax) || (table->proc_handler == proc_douintvec_minmax) ||
(table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_minmax) ||
(table->proc_handler == proc_dointvec_jiffies) || (table->proc_handler == proc_dointvec_jiffies) ||
(table->proc_handler == proc_dointvec_jiffies_minmax) ||
(table->proc_handler == proc_dointvec_userhz_jiffies) || (table->proc_handler == proc_dointvec_userhz_jiffies) ||
(table->proc_handler == proc_dointvec_ms_jiffies) || (table->proc_handler == proc_dointvec_ms_jiffies) ||
(table->proc_handler == proc_doulongvec_minmax) || (table->proc_handler == proc_doulongvec_minmax) ||
......
...@@ -53,6 +53,8 @@ extern int proc_douintvec_minmax(struct ctl_table *table, int write, ...@@ -53,6 +53,8 @@ extern int proc_douintvec_minmax(struct ctl_table *table, int write,
loff_t *ppos); loff_t *ppos);
extern int proc_dointvec_jiffies(struct ctl_table *, int, extern int proc_dointvec_jiffies(struct ctl_table *, int,
void __user *, size_t *, loff_t *); void __user *, size_t *, loff_t *);
extern int proc_dointvec_jiffies_minmax(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int,
void __user *, size_t *, loff_t *); void __user *, size_t *, loff_t *);
extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, extern int proc_dointvec_ms_jiffies(struct ctl_table *, int,
......
...@@ -166,6 +166,7 @@ struct netns_ipv4 { ...@@ -166,6 +166,7 @@ struct netns_ipv4 {
struct inet_timewait_death_row tcp_death_row; struct inet_timewait_death_row tcp_death_row;
int sysctl_max_syn_backlog; int sysctl_max_syn_backlog;
int sysctl_tcp_fastopen; int sysctl_tcp_fastopen;
int sysctl_tcp_tw_timeout;
const struct tcp_congestion_ops __rcu *tcp_congestion_control; const struct tcp_congestion_ops __rcu *tcp_congestion_control;
struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
spinlock_t tcp_fastopen_ctx_lock; spinlock_t tcp_fastopen_ctx_lock;
......
...@@ -425,6 +425,7 @@ enum ...@@ -425,6 +425,7 @@ enum
NET_TCP_ALLOWED_CONG_CONTROL=123, NET_TCP_ALLOWED_CONG_CONTROL=123,
NET_TCP_MAX_SSTHRESH=124, NET_TCP_MAX_SSTHRESH=124,
NET_TCP_FRTO_RESPONSE=125, NET_TCP_FRTO_RESPONSE=125,
NET_IPV4_TCP_TW_TIMEOUT=126,
}; };
enum { enum {
......
...@@ -2706,6 +2706,70 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write, ...@@ -2706,6 +2706,70 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write,
do_proc_dopipe_max_size_conv, NULL); do_proc_dopipe_max_size_conv, NULL);
} }
/*
 * Optional bounds for do_proc_dointvec_jiffies_minmax_conv().
 * Either pointer may be NULL to leave that side unchecked. The pointed-to
 * values are compared against the converted (jiffies) value, not seconds.
 */
struct do_proc_dointvec_jiffies_minmax_conv_param {
	int *min;
	int *max;
};

/*
 * Convert a single value between its user-visible form (seconds, carried in
 * *negp / *lvalp) and its stored form (jiffies, in *valp).
 *
 * @negp:  sign of the user-visible value (input on write, output on read)
 * @lvalp: magnitude of the user-visible value in seconds
 * @valp:  stored value in jiffies
 * @write: non-zero to convert seconds -> jiffies, zero for jiffies -> seconds
 * @data:  struct do_proc_dointvec_jiffies_minmax_conv_param with the bounds
 *
 * On write, *valp is only updated when the value passes every check.
 *
 * Returns 0 on success, 1 if the seconds value cannot be represented in
 * jiffies as an int, and -EINVAL if the jiffies value is outside the bounds.
 */
static int do_proc_dointvec_jiffies_minmax_conv(bool *negp, unsigned long *lvalp,
						int *valp,
						int write, void *data)
{
	struct do_proc_dointvec_jiffies_minmax_conv_param *param = data;

	if (write) {
		int val;

		/*
		 * The product is stored in an int, so the limit must be
		 * INT_MAX / HZ. Checking against LONG_MAX / HZ would let a
		 * huge value wrap into the permitted range on 64-bit.
		 */
		if (*lvalp > INT_MAX / HZ)
			return 1;

		val = *negp ? -(int)(*lvalp * HZ) : (int)(*lvalp * HZ);

		if ((param->min && *param->min > val) ||
		    (param->max && *param->max < val))
			return -EINVAL;

		*valp = val;
	} else {
		int val = *valp;
		unsigned long lval;

		if (val < 0) {
			*negp = true;
			/* Negate as unsigned: -val would overflow for INT_MIN. */
			lval = -(unsigned long)val;
		} else {
			*negp = false;
			lval = (unsigned long)val;
		}
		*lvalp = lval / HZ;
	}

	return 0;
}
/**
 * proc_dointvec_jiffies_minmax - seconds <-> jiffies integer handler with bounds
 * @table: the sysctl table
 * @write: %TRUE if this is a write to the sysctl file
 * @buffer: the user buffer
 * @lenp: the size of the user buffer
 * @ppos: file position
 *
 * Hands the request to do_proc_dointvec() using
 * do_proc_dointvec_jiffies_minmax_conv() as the per-value converter, so
 * values are presented to user space in seconds and stored in jiffies.
 *
 * table->extra1 (min) and table->extra2 (max) are passed to the converter
 * as optional bounds. They are compared against the converted value, so
 * they must be expressed in jiffies; a NULL pointer disables that bound.
 *
 * Returns the result of do_proc_dointvec().
 */
int proc_dointvec_jiffies_minmax(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct do_proc_dointvec_jiffies_minmax_conv_param bounds;

	bounds.min = (int *) table->extra1;
	bounds.max = (int *) table->extra2;

	return do_proc_dointvec(table, write, buffer, lenp, ppos,
				do_proc_dointvec_jiffies_minmax_conv, &bounds);
}
static void validate_coredump_safety(void) static void validate_coredump_safety(void)
{ {
#ifdef CONFIG_COREDUMP #ifdef CONFIG_COREDUMP
...@@ -3218,6 +3282,12 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, ...@@ -3218,6 +3282,12 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write,
return -ENOSYS; return -ENOSYS;
} }
/*
 * Stub variant of proc_dointvec_jiffies_minmax(): ignores all of its
 * arguments and always fails with -ENOSYS.
 * NOTE(review): presumably built only when the real sysctl handlers are
 * compiled out -- confirm against the enclosing preprocessor conditional.
 */
int proc_dointvec_jiffies_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos) void __user *buffer, size_t *lenp, loff_t *ppos)
{ {
...@@ -3255,6 +3325,7 @@ EXPORT_SYMBOL(proc_douintvec); ...@@ -3255,6 +3325,7 @@ EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL_GPL(proc_douintvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
EXPORT_SYMBOL(proc_dointvec_jiffies_minmax);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_dostring);
......
...@@ -358,6 +358,7 @@ static const struct bin_table bin_net_ipv4_table[] = { ...@@ -358,6 +358,7 @@ static const struct bin_table bin_net_ipv4_table[] = {
{ CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
{ CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
{ CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
{ CTL_INT, NET_IPV4_TCP_TW_TIMEOUT, "tcp_tw_timeout" },
{ CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" },
{ CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
{ CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
......
...@@ -50,6 +50,8 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; ...@@ -50,6 +50,8 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
static int comp_sack_nr_max = 255; static int comp_sack_nr_max = 255;
static u32 u32_max_div_HZ = UINT_MAX / HZ; static u32 u32_max_div_HZ = UINT_MAX / HZ;
static int one_day_secs = 24 * 3600; static int one_day_secs = 24 * 3600;
/* Lower/upper bound for the tcp_tw_timeout sysctl, in jiffies (1 s and 600 s). */
static int tcp_tw_timeout_min = 1 * HZ;
static int tcp_tw_timeout_max = 600 * HZ;
/* obsolete */ /* obsolete */
static int sysctl_tcp_low_latency __read_mostly; static int sysctl_tcp_low_latency __read_mostly;
...@@ -858,6 +860,15 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -858,6 +860,15 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec .proc_handler = proc_dointvec
}, },
{
.procname = "tcp_tw_timeout",
.data = &init_net.ipv4.sysctl_tcp_tw_timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies_minmax,
.extra1 = &tcp_tw_timeout_min,
.extra2 = &tcp_tw_timeout_max
},
{ {
.procname = "tcp_orphan_retries", .procname = "tcp_orphan_retries",
.data = &init_net.ipv4.sysctl_tcp_orphan_retries, .data = &init_net.ipv4.sysctl_tcp_orphan_retries,
......
...@@ -2443,9 +2443,9 @@ void tcp_close(struct sock *sk, long timeout) ...@@ -2443,9 +2443,9 @@ void tcp_close(struct sock *sk, long timeout)
} else { } else {
const int tmo = tcp_fin_time(sk); const int tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) { if (tmo > sock_net(sk)->ipv4.sysctl_tcp_tw_timeout) {
inet_csk_reset_keepalive_timer(sk, inet_csk_reset_keepalive_timer(sk,
tmo - TCP_TIMEWAIT_LEN); tmo - sock_net(sk)->ipv4.sysctl_tcp_tw_timeout);
} else { } else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto out; goto out;
......
...@@ -6175,8 +6175,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) ...@@ -6175,8 +6175,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
} }
tmo = tcp_fin_time(sk); tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) { if (tmo > sock_net(sk)->ipv4.sysctl_tcp_tw_timeout) {
inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); inet_csk_reset_keepalive_timer(sk,
tmo - sock_net(sk)->ipv4.sysctl_tcp_tw_timeout);
} else if (th->fin || sock_owned_by_user(sk)) { } else if (th->fin || sock_owned_by_user(sk)) {
/* Bad case. We could lose such FIN otherwise. /* Bad case. We could lose such FIN otherwise.
* It is not a big problem, but it looks confusing * It is not a big problem, but it looks confusing
......
...@@ -2578,6 +2578,8 @@ static int __net_init tcp_sk_init(struct net *net) ...@@ -2578,6 +2578,8 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
net->ipv4.sysctl_tcp_tw_timeout = TCP_TIMEWAIT_LEN;
if (net != &init_net) { if (net != &init_net) {
memcpy(net->ipv4.sysctl_tcp_rmem, memcpy(net->ipv4.sysctl_tcp_rmem,
init_net.ipv4.sysctl_tcp_rmem, init_net.ipv4.sysctl_tcp_rmem,
......
...@@ -94,6 +94,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, ...@@ -94,6 +94,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
struct tcp_options_received tmp_opt; struct tcp_options_received tmp_opt;
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
bool paws_reject = false; bool paws_reject = false;
struct net *net = sock_net((struct sock *)tw);
tmp_opt.saw_tstamp = 0; tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
...@@ -148,7 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, ...@@ -148,7 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
tcptw->tw_ts_recent = tmp_opt.rcv_tsval; tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
} }
inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout);
return TCP_TW_ACK; return TCP_TW_ACK;
} }
...@@ -185,7 +186,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, ...@@ -185,7 +186,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
return TCP_TW_SUCCESS; return TCP_TW_SUCCESS;
} }
} else { } else {
inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout);
} }
if (tmp_opt.saw_tstamp) { if (tmp_opt.saw_tstamp) {
...@@ -236,7 +237,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, ...@@ -236,7 +237,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
* Do not reschedule in the last case. * Do not reschedule in the last case.
*/ */
if (paws_reject || th->ack) if (paws_reject || th->ack)
inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); inet_twsk_reschedule(tw, net->ipv4.sysctl_tcp_tw_timeout);
return tcp_timewait_check_oow_rate_limit( return tcp_timewait_check_oow_rate_limit(
tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
...@@ -309,7 +310,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) ...@@ -309,7 +310,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
timeo = rto; timeo = rto;
if (state == TCP_TIME_WAIT) if (state == TCP_TIME_WAIT)
timeo = TCP_TIMEWAIT_LEN; timeo = sock_net(sk)->ipv4.sysctl_tcp_tw_timeout;
/* tw_timer is pinned, so we need to make sure BH are disabled /* tw_timer is pinned, so we need to make sure BH are disabled
* in following section, otherwise timer handler could run before * in following section, otherwise timer handler could run before
......
...@@ -109,7 +109,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) ...@@ -109,7 +109,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
if (tcp_check_oom(sk, shift)) { if (tcp_check_oom(sk, shift)) {
/* Catch exceptional cases, when connection requires reset. /* Catch exceptional cases, when connection requires reset.
* 1. Last segment was sent recently. */ * 1. Last segment was sent recently. */
if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN || if ((s32)(tcp_jiffies32 - tp->lsndtime) <= sock_net(sk)->ipv4.sysctl_tcp_tw_timeout ||
/* 2. Window is closed. */ /* 2. Window is closed. */
(!tp->snd_wnd && !tp->packets_out)) (!tp->snd_wnd && !tp->packets_out))
do_reset = true; do_reset = true;
...@@ -669,7 +669,8 @@ static void tcp_keepalive_timer (struct timer_list *t) ...@@ -669,7 +669,8 @@ static void tcp_keepalive_timer (struct timer_list *t)
tcp_mstamp_refresh(tp); tcp_mstamp_refresh(tp);
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
if (tp->linger2 >= 0) { if (tp->linger2 >= 0) {
const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; const int tmo = tcp_fin_time(sk) -
sock_net(sk)->ipv4.sysctl_tcp_tw_timeout;
if (tmo > 0) { if (tmo > 0) {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册