#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/tcp.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <net/inetpeer.h>
#include <net/tcp.h>

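/* Install a random per-netns Fast Open key on first use, unless a
 * cipher context has already been configured.
 */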
void tcp_fastopen_init_key_once(struct net *net)
{
	u8 key[TCP_FASTOPEN_KEY_LENGTH];
	struct tcp_fastopen_context *ctxt;

	rcu_read_lock();
	ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
	if (ctxt) {
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

	/* tcp_fastopen_reset_cipher publishes the new context
	 * atomically, so we allow this race to happen here.
	 *
	 * All call sites of tcp_fastopen_cookie_gen also check
	 * for a valid cookie, so this is an acceptable risk.
	 */
	get_random_bytes(key, sizeof(key));
	tcp_fastopen_reset_cipher(net, key, sizeof(key));
}

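/* RCU callback: release the cipher transform and then the context
 * itself once all readers are done with it.
 */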
static void tcp_fastopen_ctx_free(struct rcu_head *head)
{
	struct tcp_fastopen_context *ctx =
	    container_of(head, struct tcp_fastopen_context, rcu);
	crypto_free_cipher(ctx->tfm);
	kfree(ctx);
}

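/* Detach the per-netns Fast Open context and free it after an RCU
 * grace period.
 */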
void tcp_fastopen_ctx_destroy(struct net *net)
{
	struct tcp_fastopen_context *ctxt;

	spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);

	ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
				lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
	rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL);
	spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);

	if (ctxt)
		call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
}

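/* Allocate a new AES cipher context keyed with @key and publish it
 * under the per-netns lock; the previous context, if any, is freed
 * via RCU so concurrent cookie generation always sees a valid key.
 */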
int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len)
{
	int err;
	struct tcp_fastopen_context *ctx, *octx;

	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	ctx->tfm = crypto_alloc_cipher("aes", 0, 0);

	if (IS_ERR(ctx->tfm)) {
		err = PTR_ERR(ctx->tfm);
error:		kfree(ctx);
		pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
		return err;
	}
	err = crypto_cipher_setkey(ctx->tfm, key, len);
	if (err) {
		pr_err("TCP: TFO cipher key error: %d\n", err);
		crypto_free_cipher(ctx->tfm);
		goto error;
	}
	memcpy(ctx->key, key, len);

	spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);

	octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
				lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
	rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx);
	spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);

	if (octx)
		call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
	return err;
}

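/* Encrypt one 16-byte block (@path) with the per-netns key to produce
 * the cookie.  Returns false if no key has been installed yet.
 */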
static bool __tcp_fastopen_cookie_gen(struct net *net,
				      const void *path,
				      struct tcp_fastopen_cookie *foc)
{
	struct tcp_fastopen_context *ctx;
	bool ok = false;

	rcu_read_lock();
	ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
	if (ctx) {
		crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
		ok = true;
	}
	rcu_read_unlock();
	return ok;
}

/* Generate the fastopen cookie by doing aes128 encryption on both
 * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
 * addresses. For the longer IPv6 addresses use CBC-MAC.
 *
 * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
 */
static bool tcp_fastopen_cookie_gen(struct net *net,
				    struct request_sock *req,
				    struct sk_buff *syn,
				    struct tcp_fastopen_cookie *foc)
{
	if (req->rsk_ops->family == AF_INET) {
		const struct iphdr *iph = ip_hdr(syn);

		__be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
		return __tcp_fastopen_cookie_gen(net, path, foc);
	}

#if IS_ENABLED(CONFIG_IPV6)
	if (req->rsk_ops->family == AF_INET6) {
		const struct ipv6hdr *ip6h = ipv6_hdr(syn);
		struct tcp_fastopen_cookie tmp;

		if (__tcp_fastopen_cookie_gen(net, &ip6h->saddr, &tmp)) {
			struct in6_addr *buf = &tmp.addr;
			int i;

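			/* CBC-MAC: xor the encrypted source address with the
			 * destination address before the second encryption.
			 */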
			for (i = 0; i < 4; i++)
				buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
			return __tcp_fastopen_cookie_gen(net, buf, foc);
		}
	}
#endif
	return false;
}


/* If an incoming SYN or SYNACK frame contains a payload and/or FIN,
 * queue this additional data / FIN.
 */
void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
		return;

	skb = skb_clone(skb, GFP_ATOMIC);
	if (!skb)
		return;

	skb_dst_drop(skb);
	/* segs_in has been initialized to 1 in tcp_create_openreq_child().
	 * Hence, reset segs_in to 0 before calling tcp_segs_in()
	 * to avoid double counting.  Also, tcp_segs_in() expects
	 * skb->len to include the tcp_hdrlen.  Hence, it should
	 * be called before __skb_pull().
	 */
	tp->segs_in = 0;
	tcp_segs_in(tp, skb);
	__skb_pull(skb, tcp_hdrlen(skb));
	sk_forced_mem_schedule(sk, skb->truesize);
	skb_set_owner_r(skb, sk);

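	/* The SYN consumes one sequence number; advance seq and clear the
	 * SYN flag so the skb is treated as pure payload from here on.
	 */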
	TCP_SKB_CB(skb)->seq++;
	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;

	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	tp->syn_data_acked = 1;

	/* u64_stats_update_begin(&tp->syncp) not needed here,
	 * as we certainly are not changing upper 32bit value (0)
	 */
	tp->bytes_received = skb->len;

	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
		tcp_fin(sk);
}

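/* Build a full child socket from the data-bearing SYN.  The caller
 * (tcp_conn_request()) sends the SYN-ACK and places the child on the
 * listener's accept queue.
 */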
static struct sock *tcp_fastopen_create_child(struct sock *sk,
					      struct sk_buff *skb,
					      struct request_sock *req)
{
	struct tcp_sock *tp;
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	struct sock *child;
	bool own_req;

	req->num_retrans = 0;
	req->num_timeout = 0;
	req->sk = NULL;

	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
							 NULL, &own_req);
	if (!child)
		return NULL;

	spin_lock(&queue->fastopenq.lock);
	queue->fastopenq.qlen++;
	spin_unlock(&queue->fastopenq.lock);

	/* Initialize the child socket. Have to fix some values to take
	 * into account the child is a Fast Open socket and is created
	 * only out of the bits carried in the SYN packet.
	 */
	tp = tcp_sk(child);

	tp->fastopen_rsk = req;
	tcp_rsk(req)->tfo_listener = true;

	/* RFC1323: The window in SYN & SYN/ACK segments is never
	 * scaled. So correct it appropriately.
	 */
	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
	tp->max_window = tp->snd_wnd;

	/* Activate the retrans timer so that SYNACK can be retransmitted.
	 * The request socket is not added to the ehash
	 * because it's been added to the accept queue directly.
	 */
	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
				  TCP_TIMEOUT_INIT, TCP_RTO_MAX);

	refcount_set(&req->rsk_refcnt, 2);

	/* Now finish processing the fastopen child socket. */
	tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);

	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;

	tcp_fastopen_add_skb(child, skb);

	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
	tp->rcv_wup = tp->rcv_nxt;
	/* tcp_conn_request() sends the SYNACK and queues the child
	 * into the listener's accept queue.
	 */
	return child;
}

static bool tcp_fastopen_queue_check(struct sock *sk)
{
	struct fastopen_queue *fastopenq;

	/* Make sure the listener has enabled fastopen, and we don't
	 * exceed the max # of pending TFO requests allowed before trying
	 * to validate the cookie in order to avoid burning CPU cycles
	 * unnecessarily.
	 *
	 * XXX (TFO) - The implication of checking the max_qlen before
	 * processing a cookie request is that clients can't differentiate
	 * between qlen overflow causing Fast Open to be disabled
	 * temporarily vs a server not supporting Fast Open at all.
	 */
	fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
	if (fastopenq->max_qlen == 0)
		return false;

	if (fastopenq->qlen >= fastopenq->max_qlen) {
		struct request_sock *req1;
		spin_lock(&fastopenq->lock);
		req1 = fastopenq->rskq_rst_head;
		if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
			__NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
			spin_unlock(&fastopenq->lock);
			return false;
		}
		fastopenq->rskq_rst_head = req1->dl_next;
		fastopenq->qlen--;
		spin_unlock(&fastopenq->lock);
		reqsk_put(req1);
	}
	return true;
}

/* Returns the child socket if we should perform Fast Open on the SYN,
 * otherwise NULL.  The cookie (foc) may be updated and returned to the
 * client in the SYN-ACK later, e.g. for a Fast Open cookie request
 * (foc->len == 0).
 */
struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc)
{
	bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
	int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
	struct sock *child;

	if (foc->len == 0) /* Client requests a cookie */
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);

	if (!((tcp_fastopen & TFO_SERVER_ENABLE) &&
	      (syn_data || foc->len >= 0) &&
	      tcp_fastopen_queue_check(sk))) {
		foc->len = -1;
		return NULL;
	}

	if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
		goto fastopen;

	if (foc->len >= 0 &&  /* Client presents or requests a cookie */
	    tcp_fastopen_cookie_gen(sock_net(sk), req, skb, &valid_foc) &&
	    foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
	    foc->len == valid_foc.len &&
	    !memcmp(foc->val, valid_foc.val, foc->len)) {
		/* Cookie is valid. Create a (full) child socket to accept
		 * the data in SYN before returning a SYN-ACK to ack the
		 * data. If we fail to create the socket, fall back and
		 * ack the ISN only but include the same cookie.
		 *
		 * Note: Data-less SYN with valid cookie is allowed to send
		 * data in SYN_RECV state.
		 */
fastopen:
		child = tcp_fastopen_create_child(sk, skb, req);
		if (child) {
			foc->len = -1;
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPFASTOPENPASSIVE);
			return child;
		}
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
	} else if (foc->len > 0) /* Client presents an invalid cookie */
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);

	valid_foc.exp = foc->exp;
	*foc = valid_foc;
	return NULL;
}

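/* Client side: consult the Fast Open metrics cache to decide whether
 * a cookie (or data, with TFO_CLIENT_NO_COOKIE) may be sent in the SYN.
 */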
bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
			       struct tcp_fastopen_cookie *cookie)
{
	unsigned long last_syn_loss = 0;
	int syn_loss = 0;

	tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);

	/* Recurring FO SYN losses: no cookie or data in SYN */
	if (syn_loss > 1 &&
	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
		cookie->len = -1;
		return false;
	}

	/* Firewall blackhole issue check */
	if (tcp_fastopen_active_should_disable(sk)) {
		cookie->len = -1;
		return false;
	}

	if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
		cookie->len = -1;
		return true;
	}
	return cookie->len > 0;
}

/* This function checks if we want to defer sending SYN until the first
 * write().  We defer under the following conditions:
 * 1. fastopen_connect sockopt is set
 * 2. we have a valid cookie
 * Return value: return true if we want to defer until application writes data
 *               return false if we want to send out SYN immediately
 */
bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
{
	struct tcp_fastopen_cookie cookie = { .len = 0 };
	struct tcp_sock *tp = tcp_sk(sk);
	u16 mss;

	if (tp->fastopen_connect && !tp->fastopen_req) {
		if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
			inet_sk(sk)->defer_connect = 1;
			return true;
		}

		/* Alloc fastopen_req in order for FO option to be included
		 * in SYN
		 */
		tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
					   sk->sk_allocation);
		if (tp->fastopen_req)
			tp->fastopen_req->cookie = cookie;
		else
			*err = -ENOBUFS;
	}
	return false;
}
EXPORT_SYMBOL(tcp_fastopen_defer_connect);

/*
 * The following code block is to deal with middlebox issues with TFO:
 * Middlebox firewall issues can potentially cause the server's data to be
 * blackholed after a successful 3WHS using TFO.
 * The proposed solution is to disable active TFO globally under the
 * following circumstances:
 *   1. client side TFO socket receives out of order FIN
 *   2. client side TFO socket receives out of order RST
 * We disable active side TFO globally for 1hr at first. Then if it
 * happens again, we disable it for 2h, then 4h, 8h, ...
 * And we reset the timeout back to 1hr when we see a successful active
 * TFO connection with data exchanges.
 */

/* Disable active TFO and record current jiffies and
 * tfo_active_disable_times
 */
void tcp_fastopen_active_disable(struct sock *sk)
{
	struct net *net = sock_net(sk);

	atomic_inc(&net->ipv4.tfo_active_disable_times);
	net->ipv4.tfo_active_disable_stamp = jiffies;
	NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE);
}

/* Calculate timeout for tfo active disable
 * Return true if we are still in the active TFO disable period
 * Return false if timeout already expired and we should use active TFO
 */
bool tcp_fastopen_active_should_disable(struct sock *sk)
{
	unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout;
	int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times);
	unsigned long timeout;
	int multiplier;

	if (!tfo_da_times)
		return false;

	/* Limit timeout to max: 2^6 * initial timeout */
	multiplier = 1 << min(tfo_da_times - 1, 6);
	timeout = multiplier * tfo_bh_timeout * HZ;
	if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout))
		return true;

	/* Mark check bit so we can check for successful active TFO
	 * condition and reset tfo_active_disable_times
	 */
	tcp_sk(sk)->syn_fastopen_ch = 1;
	return false;
}

/* Disable active TFO if FIN is the only packet in the ofo queue
 * and no data has been received.
 * Also check if we can reset tfo_active_disable_times if data is
 * received successfully on a marked active TFO socket opened on
 * a non-loopback interface.
 */
void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct rb_node *p;
	struct sk_buff *skb;
	struct dst_entry *dst;

	if (!tp->syn_fastopen)
		return;

	if (!tp->data_segs_in) {
		p = rb_first(&tp->out_of_order_queue);
		if (p && !rb_next(p)) {
			skb = rb_entry(p, struct sk_buff, rbnode);
			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
				tcp_fastopen_active_disable(sk);
				return;
			}
		}
	} else if (tp->syn_fastopen_ch &&
		   atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) {
		dst = sk_dst_get(sk);
		if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)))
			atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
		dst_release(dst);
	}
}