tcp_fastopen.c
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/tcp.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <net/inetpeer.h>
#include <net/tcp.h>

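/* Install a random per-netns Fast Open key the first time one is needed.
 * If a context has already been published, leave it alone.
 */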
void tcp_fastopen_init_key_once(struct net *net)
{
	u8 key[TCP_FASTOPEN_KEY_LENGTH];
	struct tcp_fastopen_context *ctxt;

	rcu_read_lock();
	ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
	if (ctxt) {
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

	/* tcp_fastopen_reset_cipher publishes the new context
	 * atomically, so we allow this race to happen here.
	 *
	 * All call sites of tcp_fastopen_cookie_gen also check
	 * for a valid cookie, so this is an acceptable risk.
	 */
	get_random_bytes(key, sizeof(key));
	tcp_fastopen_reset_cipher(net, key, sizeof(key));
}

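/* RCU callback: release a retired Fast Open context (the cipher handle and
 * the context itself) once all readers have left their rcu_read_lock()
 * sections.
 */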
static void tcp_fastopen_ctx_free(struct rcu_head *head)
{
	struct tcp_fastopen_context *ctx =
	    container_of(head, struct tcp_fastopen_context, rcu);
	crypto_free_cipher(ctx->tfm);
	kfree(ctx);
}

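/* Unpublish the per-netns Fast Open context. Freeing is deferred to an RCU
 * callback so that concurrent cookie generation can finish safely.
 */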
void tcp_fastopen_ctx_destroy(struct net *net)
{
	struct tcp_fastopen_context *ctxt;

	spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);

	ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
				lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
	rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL);
	spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);

	if (ctxt)
		call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
}

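/* Install a new Fast Open key: allocate an AES cipher handle, set the key,
 * and publish the new context under tcp_fastopen_ctx_lock. The previous
 * context, if any, is freed after an RCU grace period.
 */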
int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len)
{
	int err;
	struct tcp_fastopen_context *ctx, *octx;

	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	ctx->tfm = crypto_alloc_cipher("aes", 0, 0);

	if (IS_ERR(ctx->tfm)) {
		err = PTR_ERR(ctx->tfm);
error:		kfree(ctx);
		pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
		return err;
	}
	err = crypto_cipher_setkey(ctx->tfm, key, len);
	if (err) {
		pr_err("TCP: TFO cipher key error: %d\n", err);
		crypto_free_cipher(ctx->tfm);
		goto error;
	}
	memcpy(ctx->key, key, len);

	spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);

	octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
				lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
	rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx);
	spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);

	if (octx)
		call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
	return err;
}

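/* Encrypt one AES block derived from the connection's addresses ("path")
 * with the current per-netns key to produce the cookie. Returns false if
 * no key has been installed yet.
 */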
static bool __tcp_fastopen_cookie_gen(struct net *net,
				      const void *path,
				      struct tcp_fastopen_cookie *foc)
{
	struct tcp_fastopen_context *ctx;
	bool ok = false;

	rcu_read_lock();
	ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
	if (ctx) {
		crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
		ok = true;
	}
	rcu_read_unlock();
	return ok;
}

/* Generate the fastopen cookie by doing aes128 encryption on both
 * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
 * addresses. For the longer IPv6 addresses use CBC-MAC.
 *
 * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
 */
static bool tcp_fastopen_cookie_gen(struct net *net,
				    struct request_sock *req,
				    struct sk_buff *syn,
				    struct tcp_fastopen_cookie *foc)
{
	if (req->rsk_ops->family == AF_INET) {
		const struct iphdr *iph = ip_hdr(syn);

		__be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
		return __tcp_fastopen_cookie_gen(net, path, foc);
	}

#if IS_ENABLED(CONFIG_IPV6)
	if (req->rsk_ops->family == AF_INET6) {
		const struct ipv6hdr *ip6h = ipv6_hdr(syn);
		struct tcp_fastopen_cookie tmp;

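		/* CBC-MAC over the two 128-bit address blocks:
		 * cookie = E_K(E_K(saddr) ^ daddr)
		 */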
		if (__tcp_fastopen_cookie_gen(net, &ip6h->saddr, &tmp)) {
			struct in6_addr *buf = &tmp.addr;
			int i;

			for (i = 0; i < 4; i++)
				buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
			return __tcp_fastopen_cookie_gen(net, buf, foc);
		}
	}
#endif
	return false;
}
/* If an incoming SYN or SYNACK frame contains a payload and/or FIN,
 * queue this additional data / FIN.
 */
void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
		return;

	skb = skb_clone(skb, GFP_ATOMIC);
	if (!skb)
		return;

	skb_dst_drop(skb);
	/* segs_in has been initialized to 1 in tcp_create_openreq_child().
	 * Hence, reset segs_in to 0 before calling tcp_segs_in()
	 * to avoid double counting.  Also, tcp_segs_in() expects
	 * skb->len to include the tcp_hdrlen.  Hence, it should
	 * be called before __skb_pull().
	 */
	tp->segs_in = 0;
	tcp_segs_in(tp, skb);
	__skb_pull(skb, tcp_hdrlen(skb));
	sk_forced_mem_schedule(sk, skb->truesize);
	skb_set_owner_r(skb, sk);

	TCP_SKB_CB(skb)->seq++;
	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;

	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	tp->syn_data_acked = 1;

	/* u64_stats_update_begin(&tp->syncp) not needed here,
	 * as we certainly are not changing upper 32bit value (0)
	 */
	tp->bytes_received = skb->len;

	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
		tcp_fin(sk);
}

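/* Create a full child socket for a SYN that passed the Fast Open checks.
 * The fastopen queue length is charged here; the caller sends the SYN-ACK
 * and adds the child to the listener's accept queue.
 */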
static struct sock *tcp_fastopen_create_child(struct sock *sk,
					      struct sk_buff *skb,
					      struct request_sock *req)
{
	struct tcp_sock *tp;
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	struct sock *child;
	bool own_req;

	req->num_retrans = 0;
	req->num_timeout = 0;
	req->sk = NULL;

	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
							 NULL, &own_req);
	if (!child)
		return NULL;

	spin_lock(&queue->fastopenq.lock);
	queue->fastopenq.qlen++;
	spin_unlock(&queue->fastopenq.lock);

	/* Initialize the child socket. Have to fix some values to take
	 * into account the child is a Fast Open socket and is created
	 * only out of the bits carried in the SYN packet.
	 */
	tp = tcp_sk(child);

	tp->fastopen_rsk = req;
	tcp_rsk(req)->tfo_listener = true;

	/* RFC1323: The window in SYN & SYN/ACK segments is never
	 * scaled. So correct it appropriately.
	 */
	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
	tp->max_window = tp->snd_wnd;

	/* Activate the retrans timer so that SYNACK can be retransmitted.
	 * The request socket is not added to the ehash
	 * because it's been added to the accept queue directly.
	 */
	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
				  TCP_TIMEOUT_INIT, TCP_RTO_MAX);

	refcount_set(&req->rsk_refcnt, 2);

	/* Now finish processing the fastopen child socket. */
	inet_csk(child)->icsk_af_ops->rebuild_header(child);
	tcp_init_congestion_control(child);
	tcp_mtup_init(child);
	tcp_init_metrics(child);
	tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
	tcp_init_buffer_space(child);

	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;

	tcp_fastopen_add_skb(child, skb);

	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
	tp->rcv_wup = tp->rcv_nxt;
	/* tcp_conn_request() is sending the SYNACK,
	 * and queues the child into the listener's accept queue.
	 */
	return child;
}

static bool tcp_fastopen_queue_check(struct sock *sk)
{
	struct fastopen_queue *fastopenq;

	/* Make sure the listener has enabled fastopen, and we don't
	 * exceed the max # of pending TFO requests allowed before trying
	 * to validate the cookie, in order to avoid burning CPU cycles
	 * unnecessarily.
	 *
	 * XXX (TFO) - The implication of checking the max_qlen before
	 * processing a cookie request is that clients can't differentiate
	 * between qlen overflow causing Fast Open to be disabled
	 * temporarily vs a server not supporting Fast Open at all.
	 */
	fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
	if (fastopenq->max_qlen == 0)
		return false;

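	/* Under pressure, admit another TFO request only if the oldest
	 * RST-marked pending request has already expired; otherwise count
	 * a listen overflow and refuse Fast Open for this SYN.
	 */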
	if (fastopenq->qlen >= fastopenq->max_qlen) {
		struct request_sock *req1;
		spin_lock(&fastopenq->lock);
		req1 = fastopenq->rskq_rst_head;
		if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
			__NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
			spin_unlock(&fastopenq->lock);
			return false;
		}
		fastopenq->rskq_rst_head = req1->dl_next;
		fastopenq->qlen--;
		spin_unlock(&fastopenq->lock);
		reqsk_put(req1);
	}
	return true;
}

/* Returns the child socket if we should perform Fast Open on the SYN,
 * or NULL otherwise. The cookie (foc) may be updated and returned to the
 * client in the SYN-ACK later, e.g. for a Fast Open cookie request
 * (foc->len == 0).
 */
struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc)
{
	bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
	int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
	struct sock *child;

	if (foc->len == 0) /* Client requests a cookie */
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);

	if (!((tcp_fastopen & TFO_SERVER_ENABLE) &&
	      (syn_data || foc->len >= 0) &&
	      tcp_fastopen_queue_check(sk))) {
		foc->len = -1;
		return NULL;
	}

	if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
		goto fastopen;

	if (foc->len >= 0 &&  /* Client presents or requests a cookie */
	    tcp_fastopen_cookie_gen(sock_net(sk), req, skb, &valid_foc) &&
	    foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
	    foc->len == valid_foc.len &&
	    !memcmp(foc->val, valid_foc.val, foc->len)) {
		/* Cookie is valid. Create a (full) child socket to accept
		 * the data in SYN before returning a SYN-ACK to ack the
		 * data. If we fail to create the socket, fall back and
		 * ack the ISN only, but include the same cookie.
		 *
		 * Note: Data-less SYN with valid cookie is allowed to send
		 * data in SYN_RECV state.
		 */
fastopen:
		child = tcp_fastopen_create_child(sk, skb, req);
		if (child) {
			foc->len = -1;
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPFASTOPENPASSIVE);
			return child;
		}
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
	} else if (foc->len > 0) /* Client presents an invalid cookie */
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);

	valid_foc.exp = foc->exp;
	*foc = valid_foc;
	return NULL;
}
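
/* Illustrative server-side setup (not part of this file): a listener opts in
 * to Fast Open by setting a limit on pending TFO requests, provided
 * net.ipv4.tcp_fastopen has TFO_SERVER_ENABLE (0x2) set:
 *
 *	int qlen = 16;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
 *	listen(fd, SOMAXCONN);
 */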

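/* Client side: decide whether a Fast Open attempt is worthwhile for this
 * connection. Fetches the cached cookie and MSS from tcp metrics, backs off
 * after repeated losses of data-bearing SYNs, and honors the
 * blackhole-detection state and the TFO_CLIENT_NO_COOKIE mode.
 */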
bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
			       struct tcp_fastopen_cookie *cookie)
{
	unsigned long last_syn_loss = 0;
	int syn_loss = 0;

	tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);

	/* Recurring FO SYN losses: no cookie or data in SYN */
	if (syn_loss > 1 &&
	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
		cookie->len = -1;
		return false;
	}

	/* Firewall blackhole issue check */
	if (tcp_fastopen_active_should_disable(sk)) {
		cookie->len = -1;
		return false;
	}

	if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
		cookie->len = -1;
		return true;
	}
	return cookie->len > 0;
}

/* This function checks if we want to defer sending SYN until the first
 * write().  We defer under the following conditions:
 * 1. fastopen_connect sockopt is set
 * 2. we have a valid cookie
 * Return value: return true if we want to defer until application writes data
 *               return false if we want to send out SYN immediately
 */
bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
{
	struct tcp_fastopen_cookie cookie = { .len = 0 };
	struct tcp_sock *tp = tcp_sk(sk);
	u16 mss;

	if (tp->fastopen_connect && !tp->fastopen_req) {
		if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
			inet_sk(sk)->defer_connect = 1;
			return true;
		}

		/* Alloc fastopen_req in order for FO option to be included
		 * in SYN
		 */
		tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
					   sk->sk_allocation);
		if (tp->fastopen_req)
			tp->fastopen_req->cookie = cookie;
		else
			*err = -ENOBUFS;
	}
	return false;
}
EXPORT_SYMBOL(tcp_fastopen_defer_connect);
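
/* Illustrative userspace sequence (not part of this file) that exercises the
 * defer-connect path above via the TCP_FASTOPEN_CONNECT socket option:
 *
 *	int one = 1;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_CONNECT, &one, sizeof(one));
 *	connect(fd, &addr, addrlen);	// SYN deferred if a usable cookie exists
 *	send(fd, buf, len, 0);		// first write sends the data-bearing SYN
 */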

/*
 * The following code block deals with middlebox issues with TFO:
 * Middlebox firewall issues can potentially cause the server's data to be
 * blackholed after a successful 3WHS using TFO.
 * The proposed solution is to disable active TFO globally under the
 * following circumstances:
 *   1. client side TFO socket receives out of order FIN
 *   2. client side TFO socket receives out of order RST
 * We disable active side TFO globally for 1hr at first. Then if it
 * happens again, we disable it for 2h, then 4h, 8h, ...
 * And we reset the timeout back to 1hr when we see a successful active
 * TFO connection with data exchanges.
 */

/* Disable active TFO and record current jiffies and
 * tfo_active_disable_times
 */
void tcp_fastopen_active_disable(struct sock *sk)
{
	struct net *net = sock_net(sk);

	atomic_inc(&net->ipv4.tfo_active_disable_times);
	net->ipv4.tfo_active_disable_stamp = jiffies;
	NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE);
}

/* Calculate timeout for tfo active disable
 * Return true if we are still in the active TFO disable period
 * Return false if timeout already expired and we should use active TFO
 */
bool tcp_fastopen_active_should_disable(struct sock *sk)
{
	unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout;
	int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times);
	unsigned long timeout;
	int multiplier;

	if (!tfo_da_times)
		return false;

	/* Limit timeout to max: 2^6 * initial timeout */
	multiplier = 1 << min(tfo_da_times - 1, 6);
	timeout = multiplier * tfo_bh_timeout * HZ;
	if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout))
		return true;

	/* Mark check bit so we can check for successful active TFO
	 * condition and reset tfo_active_disable_times
	 */
	tcp_sk(sk)->syn_fastopen_ch = 1;
	return false;
}

/* Disable active TFO if FIN is the only packet in the ofo queue
 * and no data is received.
 * Also check if we can reset tfo_active_disable_times if data is
 * received successfully on a marked active TFO socket opened on
 * a non-loopback interface.
 */
void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct rb_node *p;
	struct sk_buff *skb;
	struct dst_entry *dst;

	if (!tp->syn_fastopen)
		return;

	if (!tp->data_segs_in) {
		p = rb_first(&tp->out_of_order_queue);
		if (p && !rb_next(p)) {
			skb = rb_entry(p, struct sk_buff, rbnode);
			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
				tcp_fastopen_active_disable(sk);
				return;
			}
		}
	} else if (tp->syn_fastopen_ch &&
		   atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) {
		dst = sk_dst_get(sk);
		if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)))
			atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
		dst_release(dst);
	}
}