提交 ca6fb065 编写于 作者: E Eric Dumazet 提交者: David S. Miller

tcp: attach SYNACK messages to request sockets instead of listener

If a listen backlog is very big (to avoid syncookies), then
the listener sk->sk_wmem_alloc is the main source of false
sharing, as we need to touch it twice per SYNACK re-transmit
and TX completion.

(One SYN packet takes listener lock once, but up to 6 SYNACK
are generated)

By attaching the skb to the request socket, we remove this
source of contention.

Tested:

 listen(fd, 10485760); // single listener (no SO_REUSEPORT)
 16 RX/TX queue NIC
 Sustain a SYNFLOOD attack of ~320,000 SYN per second,
 Sending ~1,400,000 SYNACK per second.
 Perf profiles now show listener spinlock being next bottleneck.

    20.29%  [kernel]  [k] queued_spin_lock_slowpath
    10.06%  [kernel]  [k] __inet_lookup_established
     5.12%  [kernel]  [k] reqsk_timer_handler
     3.22%  [kernel]  [k] get_next_timer_interrupt
     3.00%  [kernel]  [k] tcp_make_synack
     2.77%  [kernel]  [k] ipt_do_table
     2.70%  [kernel]  [k] run_timer_softirq
     2.50%  [kernel]  [k] ip_finish_output
     2.04%  [kernel]  [k] cascade
Signed-off-by: NEric Dumazet <edumazet@google.com>
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
上级 1b33bc3e
...@@ -462,7 +462,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); ...@@ -462,7 +462,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int tcp_connect(struct sock *sk); int tcp_connect(struct sock *sk);
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req, struct request_sock *req,
struct tcp_fastopen_cookie *foc); struct tcp_fastopen_cookie *foc,
bool attach_req);
int tcp_disconnect(struct sock *sk, int flags); int tcp_disconnect(struct sock *sk, int flags);
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
...@@ -1715,7 +1716,8 @@ struct tcp_request_sock_ops { ...@@ -1715,7 +1716,8 @@ struct tcp_request_sock_ops {
__u32 (*init_seq)(const struct sk_buff *skb); __u32 (*init_seq)(const struct sk_buff *skb);
int (*send_synack)(const struct sock *sk, struct dst_entry *dst, int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl, struct request_sock *req, struct flowi *fl, struct request_sock *req,
u16 queue_mapping, struct tcp_fastopen_cookie *foc); u16 queue_mapping, struct tcp_fastopen_cookie *foc,
bool attach_req);
}; };
#ifdef CONFIG_SYN_COOKIES #ifdef CONFIG_SYN_COOKIES
......
...@@ -628,7 +628,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, ...@@ -628,7 +628,7 @@ static void reqsk_queue_hash_req(struct request_sock *req,
* are committed to memory and refcnt initialized. * are committed to memory and refcnt initialized.
*/ */
smp_wmb(); smp_wmb();
atomic_set(&req->rsk_refcnt, 2); atomic_set(&req->rsk_refcnt, 2 + 1);
} }
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
......
...@@ -161,13 +161,13 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, ...@@ -161,13 +161,13 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
tp->snd_wnd = ntohs(tcp_hdr(skb)->window); tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
/* Activate the retrans timer so that SYNACK can be retransmitted. /* Activate the retrans timer so that SYNACK can be retransmitted.
* The request socket is not added to the SYN table of the parent * The request socket is not added to the ehash
* because it's been added to the accept queue directly. * because it's been added to the accept queue directly.
*/ */
inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT, TCP_RTO_MAX); TCP_TIMEOUT_INIT, TCP_RTO_MAX);
atomic_set(&req->rsk_refcnt, 1); atomic_set(&req->rsk_refcnt, 2);
/* Add the child socket directly into the accept queue */ /* Add the child socket directly into the accept queue */
inet_csk_reqsk_queue_add(sk, req, child); inet_csk_reqsk_queue_add(sk, req, child);
......
...@@ -6120,8 +6120,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, ...@@ -6120,8 +6120,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
struct request_sock *req; struct request_sock *req;
bool want_cookie = false; bool want_cookie = false;
struct flowi fl; struct flowi fl;
int err;
/* TW buckets are converted to open requests without /* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is * limitations, they conserve resources and peer is
...@@ -6230,21 +6228,24 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, ...@@ -6230,21 +6228,24 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst); tcp_openreq_init_rwin(req, sk, dst);
if (!want_cookie) if (!want_cookie) {
fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
err = af_ops->send_synack(fastopen_sk ?: sk, dst, &fl, req, tcp_reqsk_record_syn(sk, req, skb);
skb_get_queue_mapping(skb), &foc); }
if (fastopen_sk) { if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
skb_get_queue_mapping(skb), &foc, false);
sock_put(fastopen_sk); sock_put(fastopen_sk);
} else { } else {
if (err || want_cookie)
goto drop_and_free;
tcp_rsk(req)->tfo_listener = false; tcp_rsk(req)->tfo_listener = false;
if (!want_cookie)
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
af_ops->send_synack(sk, dst, &fl, req,
skb_get_queue_mapping(skb), &foc, !want_cookie);
if (want_cookie)
goto drop_and_free;
} }
tcp_reqsk_record_syn(sk, req, skb); reqsk_put(req);
return 0; return 0;
drop_and_release: drop_and_release:
......
...@@ -822,7 +822,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -822,7 +822,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl, struct flowi *fl,
struct request_sock *req, struct request_sock *req,
u16 queue_mapping, u16 queue_mapping,
struct tcp_fastopen_cookie *foc) struct tcp_fastopen_cookie *foc,
bool attach_req)
{ {
const struct inet_request_sock *ireq = inet_rsk(req); const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4; struct flowi4 fl4;
...@@ -833,7 +834,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -833,7 +834,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1; return -1;
skb = tcp_make_synack(sk, dst, req, foc); skb = tcp_make_synack(sk, dst, req, foc, attach_req);
if (skb) { if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
......
...@@ -2947,7 +2947,8 @@ int tcp_send_synack(struct sock *sk) ...@@ -2947,7 +2947,8 @@ int tcp_send_synack(struct sock *sk)
*/ */
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req, struct request_sock *req,
struct tcp_fastopen_cookie *foc) struct tcp_fastopen_cookie *foc,
bool attach_req)
{ {
struct inet_request_sock *ireq = inet_rsk(req); struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk); const struct tcp_sock *tp = tcp_sk(sk);
...@@ -2959,11 +2960,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -2959,11 +2960,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
u16 user_mss; u16 user_mss;
int mss; int mss;
/* sk is a const pointer, because we want to express multiple cpus skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
* might call us concurrently.
* sock_wmalloc() will change sk->sk_wmem_alloc in an atomic way.
*/
skb = sock_wmalloc((struct sock *)sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
if (unlikely(!skb)) { if (unlikely(!skb)) {
dst_release(dst); dst_release(dst);
return NULL; return NULL;
...@@ -2971,6 +2968,17 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -2971,6 +2968,17 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
/* Reserve space for headers. */ /* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER); skb_reserve(skb, MAX_TCP_HEADER);
if (attach_req) {
skb->destructor = sock_edemux;
sock_hold(req_to_sk(req));
skb->sk = req_to_sk(req);
} else {
/* sk is a const pointer, because we want to express multiple
* cpu might call us concurrently.
* sk->sk_wmem_alloc in an atomic, we can promote to rw.
*/
skb_set_owner_w(skb, (struct sock *)sk);
}
skb_dst_set(skb, dst); skb_dst_set(skb, dst);
mss = dst_metric_advmss(dst); mss = dst_metric_advmss(dst);
...@@ -3510,7 +3518,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) ...@@ -3510,7 +3518,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
int res; int res;
tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_rsk(req)->txhash = net_tx_rndhash();
res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL, true);
if (!res) { if (!res) {
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
......
...@@ -438,7 +438,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -438,7 +438,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl, struct flowi *fl,
struct request_sock *req, struct request_sock *req,
u16 queue_mapping, u16 queue_mapping,
struct tcp_fastopen_cookie *foc) struct tcp_fastopen_cookie *foc,
bool attach_req)
{ {
struct inet_request_sock *ireq = inet_rsk(req); struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk);
...@@ -451,7 +452,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -451,7 +452,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
IPPROTO_TCP)) == NULL) IPPROTO_TCP)) == NULL)
goto done; goto done;
skb = tcp_make_synack(sk, dst, req, foc); skb = tcp_make_synack(sk, dst, req, foc, attach_req);
if (skb) { if (skb) {
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
......
...@@ -224,13 +224,15 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) ...@@ -224,13 +224,15 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
return &q->internal; return &q->internal;
/* SYNACK messages are attached to a listener socket. /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
* 1) They are not part of a 'flow' yet * 1) request sockets are not full blown,
* 2) We do not want to rate limit them (eg SYNFLOOD attack), * they do not contain sk_pacing_rate
* 2) They are not part of a 'flow' yet
* 3) We do not want to rate limit them (eg SYNFLOOD attack),
* especially if the listener set SO_MAX_PACING_RATE * especially if the listener set SO_MAX_PACING_RATE
* 3) We pretend they are orphaned * 4) We pretend they are orphaned
*/ */
if (!sk || sk->sk_state == TCP_LISTEN) { if (!sk || sk->sk_state == TCP_NEW_SYN_RECV) {
unsigned long hash = skb_get_hash(skb) & q->orphan_mask; unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
/* By forcing low order bit to 1, we make sure to not /* By forcing low order bit to 1, we make sure to not
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册