tcp_input.c 199.8 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
L
Linus Torvalds 已提交
2 3 4 5 6 7 8
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
9
 * Authors:	Ross Biro
L
Linus Torvalds 已提交
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *		Pedro Roque	:	Fast Retransmit/Recovery.
 *					Two receive queues.
 *					Retransmit queue handled by TCP.
 *					Better retransmit timer handling.
 *					New congestion avoidance.
 *					Header prediction.
 *					Variable renaming.
 *
 *		Eric		:	Fast Retransmit.
 *		Randy Scott	:	MSS option defines.
 *		Eric Schenk	:	Fixes to slow start algorithm.
 *		Eric Schenk	:	Yet another double ACK bug.
 *		Eric Schenk	:	Delayed ACK bug fixes.
 *		Eric Schenk	:	Floyd style fast retrans war avoidance.
 *		David S. Miller	:	Don't allow zero congestion window.
 *		Eric Schenk	:	Fix retransmitter so that it sends
 *					next packet on ack of previous packet.
 *		Andi Kleen	:	Moved open_request checking here
 *					and process RSTs for open_requests.
 *		Andi Kleen	:	Better prune_queue, and other fixes.
S
Stephen Hemminger 已提交
44
 *		Andrey Savochkin:	Fix RTT measurements in the presence of
L
Linus Torvalds 已提交
45 46 47 48 49 50 51
 *					timestamps.
 *		Andrey Savochkin:	Check sequence numbers correctly when
 *					removing SACKs due to in sequence incoming
 *					data segments.
 *		Andi Kleen:		Make sure we never ack data there is not
 *					enough room for. Also make this condition
 *					a fatal error if it might still happen.
52
 *		Andi Kleen:		Add tcp_measure_rcv_mss to make
L
Linus Torvalds 已提交
53
 *					connections with MSS<min(MTU,ann. MSS)
54
 *					work without delayed acks.
L
Linus Torvalds 已提交
55 56 57 58 59 60 61 62 63 64
 *		Andi Kleen:		Process packets with PSH set in the
 *					fast path.
 *		J Hadi Salim:		ECN support
 *	 	Andrei Gurtov,
 *		Pasi Sarolahti,
 *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
 *					engine. Lots of bugs are found.
 *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
 */

65 66
#define pr_fmt(fmt) "TCP: " fmt

L
Linus Torvalds 已提交
67
#include <linux/mm.h>
68
#include <linux/slab.h>
L
Linus Torvalds 已提交
69 70
#include <linux/module.h>
#include <linux/sysctl.h>
71
#include <linux/kernel.h>
72
#include <linux/prefetch.h>
73
#include <net/dst.h>
L
Linus Torvalds 已提交
74 75 76 77
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
78
#include <linux/errqueue.h>
79
#include <trace/events/tcp.h>
80
#include <linux/jump_label_ratelimit.h>
81
#include <net/busy_poll.h>
P
Peter Krystad 已提交
82
#include <net/mptcp.h>
L
Linus Torvalds 已提交
83

84
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
L
Linus Torvalds 已提交
85 86 87 88 89 90 91 92

#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
#define FLAG_ECE		0x40 /* ECE in this ACK				*/
93
#define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
94
#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
Y
Yuchung Cheng 已提交
95
#define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
96
#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
97
#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
98
#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
99
#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
100
#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
101
#define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack()	*/
102
#define FLAG_ACK_MAYBE_DELAYED	0x10000 /* Likely a delayed ACK */
103
#define FLAG_DSACK_TLP		0x20000 /* DSACK for tail loss probe */
L
Linus Torvalds 已提交
104 105 106

#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
107
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
L
Linus Torvalds 已提交
108 109 110
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
111
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
L
Linus Torvalds 已提交
112

113 114 115 116
#define REXMIT_NONE	0 /* no loss recovery to do */
#define REXMIT_LOST	1 /* retransmit packets marked lost */
#define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */

I
Ilya Lesokhin 已提交
117
#if IS_ENABLED(CONFIG_TLS_DEVICE)
118
static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
I
Ilya Lesokhin 已提交
119 120 121 122 123

void clean_acked_data_enable(struct inet_connection_sock *icsk,
			     void (*cad)(struct sock *sk, u32 ack_seq))
{
	icsk->icsk_clean_acked = cad;
124
	static_branch_deferred_inc(&clean_acked_data_enabled);
I
Ilya Lesokhin 已提交
125 126 127 128 129
}
EXPORT_SYMBOL_GPL(clean_acked_data_enable);

void clean_acked_data_disable(struct inet_connection_sock *icsk)
{
130
	static_branch_slow_dec_deferred(&clean_acked_data_enabled);
I
Ilya Lesokhin 已提交
131 132 133
	icsk->icsk_clean_acked = NULL;
}
EXPORT_SYMBOL_GPL(clean_acked_data_disable);
134 135 136 137 138 139

void clean_acked_data_flush(void)
{
	static_key_deferred_flush(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_flush);
I
Ilya Lesokhin 已提交
140 141
#endif

142
#ifdef CONFIG_CGROUP_BPF
143 144 145 146 147 148 149
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
	bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
		BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
				       BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
	bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
						    BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
150
	struct bpf_sock_ops_kern sock_ops;
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165

	if (likely(!unknown_opt && !parse_all_opt))
		return;

	/* The skb will be handled in the
	 * bpf_skops_established() or
	 * bpf_skops_write_hdr_opt().
	 */
	switch (sk->sk_state) {
	case TCP_SYN_RECV:
	case TCP_SYN_SENT:
	case TCP_LISTEN:
		return;
	}

166 167 168 169 170 171 172 173 174
	sock_owned_by_me(sk);

	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
	sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
	sock_ops.is_fullsock = 1;
	sock_ops.sk = sk;
	bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));

	BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
175 176
}

177 178 179 180 181 182 183 184 185 186 187
static void bpf_skops_established(struct sock *sk, int bpf_op,
				  struct sk_buff *skb)
{
	struct bpf_sock_ops_kern sock_ops;

	sock_owned_by_me(sk);

	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
	sock_ops.op = bpf_op;
	sock_ops.is_fullsock = 1;
	sock_ops.sk = sk;
188 189 190
	/* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
	if (skb)
		bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
191 192 193 194

	BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
}
#else
195 196 197 198
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
}

199 200 201 202 203 204
static void bpf_skops_established(struct sock *sk, int bpf_op,
				  struct sk_buff *skb)
{
}
#endif

205 206
static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
			     unsigned int len)
207 208 209 210 211 212 213 214 215 216
{
	static bool __once __read_mostly;

	if (!__once) {
		struct net_device *dev;

		__once = true;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
217 218 219
		if (!dev || len >= dev->mtu)
			pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
				dev ? dev->name : "Unknown driver");
220 221 222 223
		rcu_read_unlock();
	}
}

224
/* Adapt the MSS value used to make delayed ack decision to the
L
Linus Torvalds 已提交
225
 * real world.
226
 */
227
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
L
Linus Torvalds 已提交
228
{
229
	struct inet_connection_sock *icsk = inet_csk(sk);
230
	const unsigned int lss = icsk->icsk_ack.last_seg_size;
231
	unsigned int len;
L
Linus Torvalds 已提交
232

233
	icsk->icsk_ack.last_seg_size = 0;
L
Linus Torvalds 已提交
234 235 236 237

	/* skb->len may jitter because of SACKs, even if peer
	 * sends good full-sized frames.
	 */
238
	len = skb_shinfo(skb)->gso_size ? : skb->len;
239
	if (len >= icsk->icsk_ack.rcv_mss) {
240 241
		icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
					       tcp_sk(sk)->advmss);
242 243 244 245
		/* Account for possibly-removed options */
		if (unlikely(len > icsk->icsk_ack.rcv_mss +
				   MAX_TCP_OPTION_SPACE))
			tcp_gro_dev_warn(sk, skb, len);
L
Linus Torvalds 已提交
246 247 248 249 250 251
	} else {
		/* Otherwise, we make more careful check taking into account,
		 * that SACKs block is variable.
		 *
		 * "len" is invariant segment length, including TCP header.
		 */
252
		len += skb->data - skb_transport_header(skb);
253
		if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
L
Linus Torvalds 已提交
254 255 256 257 258 259
		    /* If PSH is not set, packet should be
		     * full sized, provided peer TCP is not badly broken.
		     * This observation (if it is correct 8)) allows
		     * to handle super-low mtu links fairly.
		     */
		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
260
		     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
L
Linus Torvalds 已提交
261 262 263 264
			/* Subtract also invariant (if peer is RFC compliant),
			 * tcp header plus fixed timestamp option length.
			 * Resulting "len" is MSS free of SACK jitter.
			 */
265 266
			len -= tcp_sk(sk)->tcp_header_len;
			icsk->icsk_ack.last_seg_size = len;
L
Linus Torvalds 已提交
267
			if (len == lss) {
268
				icsk->icsk_ack.rcv_mss = len;
L
Linus Torvalds 已提交
269 270 271
				return;
			}
		}
272 273
		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
274
		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
L
Linus Torvalds 已提交
275 276 277
	}
}

278
static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
L
Linus Torvalds 已提交
279
{
280
	struct inet_connection_sock *icsk = inet_csk(sk);
281
	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
L
Linus Torvalds 已提交
282

283 284
	if (quickacks == 0)
		quickacks = 2;
285
	quickacks = min(quickacks, max_quickacks);
286
	if (quickacks > icsk->icsk_ack.quick)
287
		icsk->icsk_ack.quick = quickacks;
L
Linus Torvalds 已提交
288 289
}

290
void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
L
Linus Torvalds 已提交
291
{
292
	struct inet_connection_sock *icsk = inet_csk(sk);
293 294

	tcp_incr_quickack(sk, max_quickacks);
W
Wei Wang 已提交
295
	inet_csk_exit_pingpong_mode(sk);
296
	icsk->icsk_ack.ato = TCP_ATO_MIN;
L
Linus Torvalds 已提交
297
}
298
EXPORT_SYMBOL(tcp_enter_quickack_mode);
L
Linus Torvalds 已提交
299 300 301 302 303

/* Send ACKs quickly, if "quick" count is not exhausted
 * and the session is not interactive.
 */

304
static bool tcp_in_quickack_mode(struct sock *sk)
L
Linus Torvalds 已提交
305
{
306
	const struct inet_connection_sock *icsk = inet_csk(sk);
307
	const struct dst_entry *dst = __sk_dst_get(sk);
E
Eric Dumazet 已提交
308

309
	return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
W
Wei Wang 已提交
310
		(icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
L
Linus Torvalds 已提交
311 312
}

313
static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
314
{
315
	if (tp->ecn_flags & TCP_ECN_OK)
316 317 318
		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}

319
static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
320
{
321
	if (tcp_hdr(skb)->cwr) {
322
		tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
323 324 325 326 327

		/* If the sender is telling us it has entered CWR, then its
		 * cwnd may be very low (even just 1 packet), so we should ACK
		 * immediately.
		 */
328 329
		if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
			inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
330
	}
331 332
}

333
static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
334
{
335
	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
336 337
}

338
static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
339
{
340 341
	struct tcp_sock *tp = tcp_sk(sk);

342
	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
343
	case INET_ECN_NOT_ECT:
344
		/* Funny extension: if ECT is not set on a segment,
345 346 347 348
		 * and we already seen ECT on a previous segment,
		 * it is probably a retransmit.
		 */
		if (tp->ecn_flags & TCP_ECN_SEEN)
349
			tcp_enter_quickack_mode(sk, 2);
350 351
		break;
	case INET_ECN_CE:
352 353
		if (tcp_ca_needs_ecn(sk))
			tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
354

355 356
		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
			/* Better not delay acks, sender can have a very low cwnd */
357
			tcp_enter_quickack_mode(sk, 2);
358 359
			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
		}
360 361
		tp->ecn_flags |= TCP_ECN_SEEN;
		break;
362
	default:
363 364
		if (tcp_ca_needs_ecn(sk))
			tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
365
		tp->ecn_flags |= TCP_ECN_SEEN;
366
		break;
367 368 369
	}
}

370
static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
371
{
372 373
	if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
		__tcp_ecn_check_ce(sk, skb);
374 375 376
}

static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
377
{
378
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
379 380 381
		tp->ecn_flags &= ~TCP_ECN_OK;
}

382
static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
383
{
384
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
385 386 387
		tp->ecn_flags &= ~TCP_ECN_OK;
}

388
static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
389
{
390
	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
E
Eric Dumazet 已提交
391 392
		return true;
	return false;
393 394
}

L
Linus Torvalds 已提交
395 396 397 398 399
/* Buffer size and advertised window tuning.
 *
 * 1. Tuning sk->sk_sndbuf, when connection enters established state.
 */

E
Eric Dumazet 已提交
400
static void tcp_sndbuf_expand(struct sock *sk)
L
Linus Torvalds 已提交
401
{
E
Eric Dumazet 已提交
402
	const struct tcp_sock *tp = tcp_sk(sk);
403
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
E
Eric Dumazet 已提交
404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421
	int sndmem, per_mss;
	u32 nr_segs;

	/* Worst case is non GSO/TSO : each frame consumes one skb
	 * and skb->head is kmalloced using power of two area of memory
	 */
	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
		  MAX_TCP_HEADER +
		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	per_mss = roundup_pow_of_two(per_mss) +
		  SKB_DATA_ALIGN(sizeof(struct sk_buff));

	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);

	/* Fast Recovery (RFC 5681 3.2) :
	 * Cubic needs 1.7 factor, rounded to 2 to include
422
	 * extra cushion (application might react slowly to EPOLLOUT)
E
Eric Dumazet 已提交
423
	 */
424 425
	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
	sndmem *= nr_segs * per_mss;
L
Linus Torvalds 已提交
426

427
	if (sk->sk_sndbuf < sndmem)
428 429
		WRITE_ONCE(sk->sk_sndbuf,
			   min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]));
L
Linus Torvalds 已提交
430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452
}

/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
 *
 * All tcp_full_space() is split to two parts: "network" buffer, allocated
 * forward and advertised in receiver window (tp->rcv_wnd) and
 * "application buffer", required to isolate scheduling/application
 * latencies from network.
 * window_clamp is maximal advertised window. It can be less than
 * tcp_full_space(), in this case tcp_full_space() - window_clamp
 * is reserved for "application" buffer. The less window_clamp is
 * the smoother our behaviour from viewpoint of network, but the lower
 * throughput and the higher sensitivity of the connection to losses. 8)
 *
 * rcv_ssthresh is more strict window_clamp used at "slow start"
 * phase to predict further behaviour of this connection.
 * It is used for two goals:
 * - to enforce header prediction at sender, even when application
 *   requires some significant "application buffer". It is check #1.
 * - to prevent pruning of receive queue because of misprediction
 *   of receiver window. Check #2.
 *
 * The scheme does not work when sender sends good segments opening
S
Stephen Hemminger 已提交
453
 * window and then starts to feed us spaghetti. But it should work
L
Linus Torvalds 已提交
454 455 456 457
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */

/* Slow part of check#2. */
458 459
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
			     unsigned int skbtruesize)
L
Linus Torvalds 已提交
460
{
461
	struct tcp_sock *tp = tcp_sk(sk);
L
Linus Torvalds 已提交
462
	/* Optimize this! */
463
	int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
464
	int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
L
Linus Torvalds 已提交
465 466 467

	while (tp->rcv_ssthresh <= window) {
		if (truesize <= skb->len)
468
			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
L
Linus Torvalds 已提交
469 470 471 472 473 474 475

		truesize >>= 1;
		window >>= 1;
	}
	return 0;
}

476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496
/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
 * can play nice with us, as sk_buff and skb->head might be either
 * freed or shared with up to MAX_SKB_FRAGS segments.
 * Only give a boost to drivers using page frag(s) to hold the frame(s),
 * and if no payload was pulled in skb->head before reaching us.
 */
static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
{
	u32 truesize = skb->truesize;

	if (adjust && !skb_headlen(skb)) {
		truesize -= SKB_TRUESIZE(skb_end_offset(skb));
		/* paranoid check, some drivers might be buggy */
		if (unlikely((int)truesize < (int)skb->len))
			truesize = skb->truesize;
	}
	return truesize;
}

static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
			    bool adjust)
L
Linus Torvalds 已提交
497
{
498
	struct tcp_sock *tp = tcp_sk(sk);
499 500 501
	int room;

	room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
502

503 504 505
	if (room <= 0)
		return;

L
Linus Torvalds 已提交
506
	/* Check #1 */
507
	if (!tcp_under_memory_pressure(sk)) {
508
		unsigned int truesize = truesize_adjust(adjust, skb);
L
Linus Torvalds 已提交
509 510 511 512 513
		int incr;

		/* Check #2. Increase window, if skb with such overhead
		 * will fit to rcvbuf in future.
		 */
514
		if (tcp_win_from_space(sk, truesize) <= skb->len)
515
			incr = 2 * tp->advmss;
L
Linus Torvalds 已提交
516
		else
517
			incr = __tcp_grow_window(sk, skb, truesize);
L
Linus Torvalds 已提交
518 519

		if (incr) {
520
			incr = max_t(int, incr, 2 * skb->len);
521
			tp->rcv_ssthresh += min(room, incr);
522
			inet_csk(sk)->icsk_ack.quick |= 1;
L
Linus Torvalds 已提交
523
		}
524 525 526 527 528
	} else {
		/* Under pressure:
		 * Adjust rcv_ssthresh according to reserved mem
		 */
		tcp_adjust_rcv_ssthresh(sk);
L
Linus Torvalds 已提交
529 530 531
	}
}

532
/* 3. Try to fixup all. It is made immediately after connection enters
L
Linus Torvalds 已提交
533 534
 *    established state.
 */
535
static void tcp_init_buffer_space(struct sock *sk)
L
Linus Torvalds 已提交
536
{
537
	int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
L
Linus Torvalds 已提交
538 539 540 541
	struct tcp_sock *tp = tcp_sk(sk);
	int maxwin;

	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
E
Eric Dumazet 已提交
542
		tcp_sndbuf_expand(sk);
L
Linus Torvalds 已提交
543

544
	tcp_mstamp_refresh(tp);
545
	tp->rcvq_space.time = tp->tcp_mstamp;
E
Eric Dumazet 已提交
546
	tp->rcvq_space.seq = tp->copied_seq;
L
Linus Torvalds 已提交
547 548 549 550 551 552

	maxwin = tcp_full_space(sk);

	if (tp->window_clamp >= maxwin) {
		tp->window_clamp = maxwin;

553
		if (tcp_app_win && maxwin > 4 * tp->advmss)
L
Linus Torvalds 已提交
554
			tp->window_clamp = max(maxwin -
555
					       (maxwin >> tcp_app_win),
L
Linus Torvalds 已提交
556 557 558 559
					       4 * tp->advmss);
	}

	/* Force reservation of one segment. */
560
	if (tcp_app_win &&
L
Linus Torvalds 已提交
561 562 563 564 565
	    tp->window_clamp > 2 * tp->advmss &&
	    tp->window_clamp + tp->advmss > maxwin)
		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);

	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
566
	tp->snd_cwnd_stamp = tcp_jiffies32;
567 568
	tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
				    (u32)TCP_INIT_CWND * tp->advmss);
L
Linus Torvalds 已提交
569 570
}

571
/* 4. Recalculate window clamp after socket hit its memory bounds. */
572
static void tcp_clamp_window(struct sock *sk)
L
Linus Torvalds 已提交
573
{
574
	struct tcp_sock *tp = tcp_sk(sk);
575
	struct inet_connection_sock *icsk = inet_csk(sk);
576
	struct net *net = sock_net(sk);
L
Linus Torvalds 已提交
577

578
	icsk->icsk_ack.quick = 0;
L
Linus Torvalds 已提交
579

580
	if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
581
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
582
	    !tcp_under_memory_pressure(sk) &&
583
	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
584 585 586
		WRITE_ONCE(sk->sk_rcvbuf,
			   min(atomic_read(&sk->sk_rmem_alloc),
			       net->ipv4.sysctl_tcp_rmem[2]));
L
Linus Torvalds 已提交
587
	}
588
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
589
		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
L
Linus Torvalds 已提交
590 591
}

S
Stephen Hemminger 已提交
592 593 594 595 596 597 598 599 600
/* Initialize RCV_MSS value.
 * RCV_MSS is an our guess about MSS used by the peer.
 * We haven't any direct information about the MSS.
 * It's better to underestimate the RCV_MSS rather than overestimate.
 * Overestimations make us ACKing less frequently than needed.
 * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
601
	const struct tcp_sock *tp = tcp_sk(sk);
S
Stephen Hemminger 已提交
602 603
	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);

604
	hint = min(hint, tp->rcv_wnd / 2);
605
	hint = min(hint, TCP_MSS_DEFAULT);
S
Stephen Hemminger 已提交
606 607 608 609
	hint = max(hint, TCP_MIN_MSS);

	inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
E
Eric Dumazet 已提交
610
EXPORT_SYMBOL(tcp_initialize_rcv_mss);
S
Stephen Hemminger 已提交
611

L
Linus Torvalds 已提交
612 613 614 615
/* Receiver "autotuning" code.
 *
 * The algorithm for RTT estimation w/o timestamps is based on
 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
616
 * <https://public.lanl.gov/radiant/pubs.html#DRS>
L
Linus Torvalds 已提交
617 618
 *
 * More detail on this code can be found at
619
 * <http://staff.psc.edu/jheffner/>,
L
Linus Torvalds 已提交
620 621 622 623 624
 * though this reference is out of date.  A new paper
 * is pending.
 */
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
625
	u32 new_sample = tp->rcv_rtt_est.rtt_us;
L
Linus Torvalds 已提交
626 627 628 629 630 631 632 633 634
	long m = sample;

	if (new_sample != 0) {
		/* If we sample in larger samples in the non-timestamp
		 * case, we could grossly overestimate the RTT especially
		 * with chatty applications or bulk transfer apps which
		 * are stalled on filesystem I/O.
		 *
		 * Also, since we are only going for a minimum in the
S
Stephen Hemminger 已提交
635
		 * non-timestamp case, we do not smooth things out
S
Stephen Hemminger 已提交
636
		 * else with timestamps disabled convergence takes too
L
Linus Torvalds 已提交
637 638 639 640 641
		 * long.
		 */
		if (!win_dep) {
			m -= (new_sample >> 3);
			new_sample += m;
642 643 644 645 646
		} else {
			m <<= 3;
			if (m < new_sample)
				new_sample = m;
		}
L
Linus Torvalds 已提交
647
	} else {
S
Stephen Hemminger 已提交
648
		/* No previous measure. */
L
Linus Torvalds 已提交
649 650 651
		new_sample = m << 3;
	}

652
	tp->rcv_rtt_est.rtt_us = new_sample;
L
Linus Torvalds 已提交
653 654 655 656
}

static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
657 658
	u32 delta_us;

659
	if (tp->rcv_rtt_est.time == 0)
L
Linus Torvalds 已提交
660 661 662
		goto new_measure;
	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
		return;
663
	delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
664 665
	if (!delta_us)
		delta_us = 1;
666
	tcp_rcv_rtt_update(tp, delta_us, 1);
L
Linus Torvalds 已提交
667 668 669

new_measure:
	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
670
	tp->rcv_rtt_est.time = tp->tcp_mstamp;
L
Linus Torvalds 已提交
671 672
}

673 674
static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
					  const struct sk_buff *skb)
L
Linus Torvalds 已提交
675
{
676
	struct tcp_sock *tp = tcp_sk(sk);
677

678 679 680 681 682 683
	if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
		return;
	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;

	if (TCP_SKB_CB(skb)->end_seq -
	    TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
684
		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
685
		u32 delta_us;
686

687 688 689 690 691 692
		if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
			if (!delta)
				delta = 1;
			delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
			tcp_rcv_rtt_update(tp, delta_us, 0);
		}
693
	}
L
Linus Torvalds 已提交
694 695 696 697 698 699 700 701 702
}

/*
 * This function should be called every time data is copied to user space.
 * It calculates the appropriate TCP receive buffer space.
 */
void tcp_rcv_space_adjust(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
703
	u32 copied;
L
Linus Torvalds 已提交
704
	int time;
705

706 707
	trace_tcp_rcv_space_adjust(sk);

708
	tcp_mstamp_refresh(tp);
709
	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
710
	if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
L
Linus Torvalds 已提交
711
		return;
712

E
Eric Dumazet 已提交
713 714 715 716 717 718 719 720 721 722 723 724 725 726
	/* Number of bytes copied to user in last RTT */
	copied = tp->copied_seq - tp->rcvq_space.seq;
	if (copied <= tp->rcvq_space.space)
		goto new_measure;

	/* A bit of theory :
	 * copied = bytes received in previous RTT, our base window
	 * To cope with packet losses, we need a 2x factor
	 * To cope with slow start, and sender growing its cwin by 100 %
	 * every RTT, we need a 4x factor, because the ACK we are sending
	 * now is for the next RTT, not the current one :
	 * <prev RTT . ><current RTT .. ><next RTT .... >
	 */

727
	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
E
Eric Dumazet 已提交
728
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
729
		int rcvmem, rcvbuf;
E
Eric Dumazet 已提交
730
		u64 rcvwin, grow;
L
Linus Torvalds 已提交
731

E
Eric Dumazet 已提交
732 733 734
		/* minimal window to cope with packet losses, assuming
		 * steady state. Add some cushion because of small variations.
		 */
735
		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
L
Linus Torvalds 已提交
736

E
Eric Dumazet 已提交
737 738 739 740
		/* Accommodate for sender rate increase (eg. slow start) */
		grow = rcvwin * (copied - tp->rcvq_space.space);
		do_div(grow, tp->rcvq_space.space);
		rcvwin += (grow << 1);
L
Linus Torvalds 已提交
741

E
Eric Dumazet 已提交
742
		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
743
		while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
E
Eric Dumazet 已提交
744
			rcvmem += 128;
L
Linus Torvalds 已提交
745

746 747 748
		do_div(rcvwin, tp->advmss);
		rcvbuf = min_t(u64, rcvwin * rcvmem,
			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
E
Eric Dumazet 已提交
749
		if (rcvbuf > sk->sk_rcvbuf) {
750
			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
L
Linus Torvalds 已提交
751

E
Eric Dumazet 已提交
752
			/* Make the window clamp follow along.  */
753
			tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
L
Linus Torvalds 已提交
754 755
		}
	}
E
Eric Dumazet 已提交
756
	tp->rcvq_space.space = copied;
757

L
Linus Torvalds 已提交
758 759
new_measure:
	tp->rcvq_space.seq = tp->copied_seq;
760
	tp->rcvq_space.time = tp->tcp_mstamp;
L
Linus Torvalds 已提交
761 762 763 764 765 766 767 768 769 770 771 772
}

/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  The means that until we send the first few ACK's the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
773
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
774
{
775
	struct tcp_sock *tp = tcp_sk(sk);
776
	struct inet_connection_sock *icsk = inet_csk(sk);
L
Linus Torvalds 已提交
777 778
	u32 now;

779
	inet_csk_schedule_ack(sk);
L
Linus Torvalds 已提交
780

781
	tcp_measure_rcv_mss(sk, skb);
L
Linus Torvalds 已提交
782 783

	tcp_rcv_rtt_measure(tp);
784

785
	now = tcp_jiffies32;
L
Linus Torvalds 已提交
786

787
	if (!icsk->icsk_ack.ato) {
L
Linus Torvalds 已提交
788 789 790
		/* The _first_ data packet received, initialize
		 * delayed ACK engine.
		 */
791
		tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
792
		icsk->icsk_ack.ato = TCP_ATO_MIN;
L
Linus Torvalds 已提交
793
	} else {
794
		int m = now - icsk->icsk_ack.lrcvtime;
L
Linus Torvalds 已提交
795

796
		if (m <= TCP_ATO_MIN / 2) {
L
Linus Torvalds 已提交
797
			/* The fastest case is the first. */
798 799 800 801 802 803
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
		} else if (m < icsk->icsk_ack.ato) {
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
			if (icsk->icsk_ack.ato > icsk->icsk_rto)
				icsk->icsk_ack.ato = icsk->icsk_rto;
		} else if (m > icsk->icsk_rto) {
S
Stephen Hemminger 已提交
804
			/* Too long gap. Apparently sender failed to
L
Linus Torvalds 已提交
805 806
			 * restart window, so that we send ACKs quickly.
			 */
807
			tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
808
			sk_mem_reclaim(sk);
L
Linus Torvalds 已提交
809 810
		}
	}
811
	icsk->icsk_ack.lrcvtime = now;
L
Linus Torvalds 已提交
812

813
	tcp_ecn_check_ce(sk, skb);
L
Linus Torvalds 已提交
814 815

	if (skb->len >= 128)
816
		tcp_grow_window(sk, skb, true);
L
Linus Torvalds 已提交
817 818 819 820 821 822 823 824 825 826 827
}

/* Called to compute a smoothed rtt estimate. The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
828
static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
L
Linus Torvalds 已提交
829
{
830
	struct tcp_sock *tp = tcp_sk(sk);
831 832
	long m = mrtt_us; /* RTT */
	u32 srtt = tp->srtt_us;
L
Linus Torvalds 已提交
833 834 835 836

	/*	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88.  Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
837
	 *	This is designed to be as fast as possible
L
Linus Torvalds 已提交
838 839 840 841 842 843 844
	 *	m stands for "measurement".
	 *
	 *	On a 1990 paper the rto value is changed to:
	 *	RTO = rtt + 4 * mdev
	 *
	 * Funny. This algorithm seems to be very broken.
	 * These formulae increase RTO, when it should be decreased, increase
S
Stephen Hemminger 已提交
845
	 * too slowly, when it should be increased quickly, decrease too quickly
L
Linus Torvalds 已提交
846 847 848 849
	 * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
	 * does not matter how to _calculate_ it. Seems, it was trap
	 * that VJ failed to avoid. 8)
	 */
850 851 852
	if (srtt != 0) {
		m -= (srtt >> 3);	/* m is now error in rtt est */
		srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
L
Linus Torvalds 已提交
853 854
		if (m < 0) {
			m = -m;		/* m is now abs(error) */
855
			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
L
Linus Torvalds 已提交
856 857 858 859 860 861 862 863 864 865 866
			/* This is similar to one of Eifel findings.
			 * Eifel blocks mdev updates when rtt decreases.
			 * This solution is a bit different: we use finer gain
			 * for mdev in this case (alpha*beta).
			 * Like Eifel it also prevents growth of rto,
			 * but also it limits too fast rto decreases,
			 * happening in pure Eifel.
			 */
			if (m > 0)
				m >>= 3;
		} else {
867
			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
L
Linus Torvalds 已提交
868
		}
869 870 871 872 873
		tp->mdev_us += m;		/* mdev = 3/4 mdev + 1/4 new */
		if (tp->mdev_us > tp->mdev_max_us) {
			tp->mdev_max_us = tp->mdev_us;
			if (tp->mdev_max_us > tp->rttvar_us)
				tp->rttvar_us = tp->mdev_max_us;
L
Linus Torvalds 已提交
874 875
		}
		if (after(tp->snd_una, tp->rtt_seq)) {
876 877
			if (tp->mdev_max_us < tp->rttvar_us)
				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
L
Linus Torvalds 已提交
878
			tp->rtt_seq = tp->snd_nxt;
879
			tp->mdev_max_us = tcp_rto_min_us(sk);
880 881

			tcp_bpf_rtt(sk);
L
Linus Torvalds 已提交
882 883 884
		}
	} else {
		/* no previous measure. */
885
		srtt = m << 3;		/* take the measured time to be rtt */
886 887 888
		tp->mdev_us = m << 1;	/* make sure rto = 3*rtt */
		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
		tp->mdev_max_us = tp->rttvar_us;
L
Linus Torvalds 已提交
889
		tp->rtt_seq = tp->snd_nxt;
890 891

		tcp_bpf_rtt(sk);
L
Linus Torvalds 已提交
892
	}
893
	tp->srtt_us = max(1U, srtt);
L
Linus Torvalds 已提交
894 895
}

E
Eric Dumazet 已提交
896 897 898 899 900 901
static void tcp_update_pacing_rate(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u64 rate;

	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
902 903 904 905 906 907 908 909 910 911 912
	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);

	/* current rate is (cwnd * mss) / srtt
	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
	 * In Congestion Avoidance phase, set it to 120 % the current rate.
	 *
	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
	 *	 end of slow start and should slow down.
	 */
	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
913
		rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
914
	else
915
		rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;
E
Eric Dumazet 已提交
916 917 918

	rate *= max(tp->snd_cwnd, tp->packets_out);

919 920
	if (likely(tp->srtt_us))
		do_div(rate, tp->srtt_us);
E
Eric Dumazet 已提交
921

922
	/* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
923 924 925
	 * without any lock. We want to make sure compiler wont store
	 * intermediate values in this location.
	 */
926 927
	WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
					     sk->sk_max_pacing_rate));
E
Eric Dumazet 已提交
928 929
}

L
Linus Torvalds 已提交
930 931 932
/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 * routine referred to above.
 */
933
static void tcp_set_rto(struct sock *sk)
L
Linus Torvalds 已提交
934
{
935
	const struct tcp_sock *tp = tcp_sk(sk);
L
Linus Torvalds 已提交
936 937 938 939 940 941 942 943
	/* Old crap is replaced with new one. 8)
	 *
	 * More seriously:
	 * 1. If rtt variance happened to be less 50msec, it is hallucination.
	 *    It cannot be less due to utterly erratic ACK generation made
	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
	 *    to do with delayed acks, because at cwnd>2 true delack timeout
	 *    is invisible. Actually, Linux-2.4 also generates erratic
S
Stephen Hemminger 已提交
944
	 *    ACKs in some circumstances.
L
Linus Torvalds 已提交
945
	 */
946
	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
L
Linus Torvalds 已提交
947 948 949 950

	/* 2. Fixups made earlier cannot be right.
	 *    If we do not estimate RTO correctly without them,
	 *    all the algo is pure shit and should be replaced
S
Stephen Hemminger 已提交
951
	 *    with correct one. It is exactly, which we pretend to do.
L
Linus Torvalds 已提交
952 953
	 */

954 955 956
	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
	 * guarantees that rto is higher.
	 */
957
	tcp_bound_rto(sk);
L
Linus Torvalds 已提交
958 959
}

960
__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
L
Linus Torvalds 已提交
961 962 963
{
	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);

964
	if (!cwnd)
965
		cwnd = TCP_INIT_CWND;
L
Linus Torvalds 已提交
966 967 968
	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}

969 970 971 972 973 974 975 976 977 978 979 980 981 982
struct tcp_sacktag_state {
	/* Timestamps for earliest and latest never-retransmitted segment
	 * that was SACKed. RTO needs the earliest RTT to stay conservative,
	 * but congestion control should still get an accurate delay signal.
	 */
	u64	first_sackt;
	u64	last_sackt;
	u32	reord;
	u32	sack_delivered;
	int	flag;
	unsigned int mss_now;
	struct rate_sample *rate;
};

983 984 985 986 987 988
/* Take a notice that peer is sending D-SACKs. Skip update of data delivery
 * and spurious retransmission information if this DSACK is unlikely caused by
 * sender's action:
 * - DSACKed sequence range is larger than maximum receiver's window.
 * - Total no. of DSACKed segments exceed the total no. of retransmitted segs.
 */
989 990
static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
			  u32 end_seq, struct tcp_sacktag_state *state)
991
{
992 993
	u32 seq_len, dup_segs = 1;

994 995 996 997 998 999 1000 1001 1002
	if (!before(start_seq, end_seq))
		return 0;

	seq_len = end_seq - start_seq;
	/* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
	if (seq_len > tp->max_window)
		return 0;
	if (seq_len > tp->mss_cache)
		dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
1003 1004
	else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq)
		state->flag |= FLAG_DSACK_TLP;
1005 1006 1007 1008 1009

	tp->dsack_dups += dup_segs;
	/* Skip the DSACK if dup segs weren't retransmitted by sender */
	if (tp->dsack_dups > tp->total_retrans)
		return 0;
1010

1011
	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
1012 1013 1014 1015 1016 1017 1018 1019
	/* We increase the RACK ordering window in rounds where we receive
	 * DSACKs that may have been due to reordering causing RACK to trigger
	 * a spurious fast recovery. Thus RACK ignores DSACKs that happen
	 * without having seen reordering, or that match TLP probes (TLP
	 * is timer-driven, not triggered by RACK).
	 */
	if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP))
		tp->rack.dsack_seen = 1;
1020 1021 1022 1023 1024 1025

	state->flag |= FLAG_DSACKING_ACK;
	/* A spurious retransmission is delivered */
	state->sack_delivered += dup_segs;

	return dup_segs;
1026 1027
}

1028 1029 1030 1031 1032 1033
/* It's reordering when higher sequence was delivered (i.e. sacked) before
 * some lower never-retransmitted sequence ("low_seq"). The maximum reordering
 * distance is approximated in full-mss packet distance ("reordering").
 */
static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
				      const int ts)
L
Linus Torvalds 已提交
1034
{
1035
	struct tcp_sock *tp = tcp_sk(sk);
1036 1037
	const u32 mss = tp->mss_cache;
	u32 fack, metric;
1038

1039 1040
	fack = tcp_highest_sack_seq(tp);
	if (!before(low_seq, fack))
1041 1042
		return;

1043 1044
	metric = fack - low_seq;
	if ((metric > tp->reordering * mss) && mss) {
L
Linus Torvalds 已提交
1045
#if FASTRETRANS_DEBUG > 1
1046 1047 1048
		pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
			 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
			 tp->reordering,
1049
			 0,
1050 1051
			 tp->sacked_out,
			 tp->undo_marker ? tp->undo_retrans : 0);
L
Linus Torvalds 已提交
1052
#endif
1053 1054
		tp->reordering = min_t(u32, (metric + mss - 1) / mss,
				       sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
L
Linus Torvalds 已提交
1055
	}
Y
Yuchung Cheng 已提交
1056

1057
	/* This exciting event is worth to be remembered. 8) */
1058
	tp->reord_seen++;
1059 1060
	NET_INC_STATS(sock_net(sk),
		      ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
L
Linus Torvalds 已提交
1061 1062
}

Y
Yuchung Cheng 已提交
1063 1064 1065 1066 1067
 /* This must be called before lost_out or retrans_out are updated
  * on a new loss, because we want to know if all skbs previously
  * known to be lost have already been retransmitted, indicating
  * that this newly lost skb is our next skb to retransmit.
  */
1068 1069
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
1070 1071 1072 1073
	if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
	    (tp->retransmit_skb_hint &&
	     before(TCP_SKB_CB(skb)->seq,
		    TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
1074
		tp->retransmit_skb_hint = skb;
1075 1076
}

1077 1078 1079 1080 1081 1082 1083 1084
/* Sum the number of packets on the wire we have marked as lost, and
 * notify the congestion control module that the given skb was marked lost.
 */
static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
{
	tp->lost += tcp_skb_pcount(skb);
}

Y
Yuchung Cheng 已提交
1085 1086
void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
Y
Yuchung Cheng 已提交
1087
	__u8 sacked = TCP_SKB_CB(skb)->sacked;
Y
Yuchung Cheng 已提交
1088 1089
	struct tcp_sock *tp = tcp_sk(sk);

Y
Yuchung Cheng 已提交
1090 1091
	if (sacked & TCPCB_SACKED_ACKED)
		return;
1092

Y
Yuchung Cheng 已提交
1093 1094 1095 1096 1097 1098 1099 1100
	tcp_verify_retransmit_hint(tp, skb);
	if (sacked & TCPCB_LOST) {
		if (sacked & TCPCB_SACKED_RETRANS) {
			/* Account for retransmits that are lost again */
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
			tp->retrans_out -= tcp_skb_pcount(skb);
			NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
				      tcp_skb_pcount(skb));
1101
			tcp_notify_skb_loss_event(tp, skb);
Y
Yuchung Cheng 已提交
1102 1103 1104 1105
		}
	} else {
		tp->lost_out += tcp_skb_pcount(skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1106
		tcp_notify_skb_loss_event(tp, skb);
Y
Yuchung Cheng 已提交
1107
	}
1108 1109
}

1110 1111 1112 1113 1114 1115 1116 1117 1118
/* Updates the delivered and delivered_ce counts */
static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
				bool ece_ack)
{
	tp->delivered += delivered;
	if (ece_ack)
		tp->delivered_ce += delivered;
}

L
Linus Torvalds 已提交
1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139
/* This procedure tags the retransmission queue when SACKs arrive.
 *
 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
 * Packets in queue with these bits set are counted in variables
 * sacked_out, retrans_out and lost_out, correspondingly.
 *
 * Valid combinations are:
 * Tag  InFlight	Description
 * 0	1		- orig segment is in flight.
 * S	0		- nothing flies, orig reached receiver.
 * L	0		- nothing flies, orig lost by net.
 * R	2		- both orig and retransmit are in flight.
 * L|R	1		- orig is lost, retransmit is in flight.
 * S|R  1		- orig reached receiver, retrans is still in flight.
 * (L|S|R is logically valid, it could occur when L|R is sacked,
 *  but it is equivalent to plain S and code short-curcuits it to S.
 *  L|S is logically invalid, it would mean -1 packet in flight 8))
 *
 * These 6 states form finite state machine, controlled by the following events:
 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
1140
 * 3. Loss detection event of two flavors:
L
Linus Torvalds 已提交
1141 1142
 *	A. Scoreboard estimator decided the packet is lost.
 *	   A'. Reno "three dupacks" marks head of queue lost.
1143
 *	B. SACK arrives sacking SND.NXT at the moment, when the
L
Linus Torvalds 已提交
1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162
 *	   segment was retransmitted.
 * 4. D-SACK added new rule: D-SACK changes any tag to S.
 *
 * It is pleasant to note, that state diagram turns out to be commutative,
 * so that we are allowed not to be bothered by order of our actions,
 * when multiple events arrive simultaneously. (see the function below).
 *
 * Reordering detection.
 * --------------------
 * Reordering metric is maximal distance, which a packet can be displaced
 * in packet stream. With SACKs we can estimate it:
 *
 * 1. SACK fills old hole and the corresponding segment was not
 *    ever retransmitted -> reordering. Alas, we cannot use it
 *    when segment was retransmitted.
 * 2. The last flaw is solved with D-SACK. D-SACK arrives
 *    for retransmitted and already SACKed segment -> reordering..
 * Both of these heuristics are not used in Loss state, when we cannot
 * account for retransmits accurately.
I
Ilpo Järvinen 已提交
1163 1164 1165 1166 1167 1168 1169
 *
 * SACK block validation.
 * ----------------------
 *
 * SACK block range validation checks that the received SACK block fits to
 * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
 * Note that SND.UNA is not included to the range though being valid because
1170 1171 1172 1173 1174 1175 1176 1177 1178
 * it means that the receiver is rather inconsistent with itself reporting
 * SACK reneging when it should advance SND.UNA. Such SACK block this is
 * perfectly valid, however, in light of RFC2018 which explicitly states
 * that "SACK block MUST reflect the newest segment.  Even if the newest
 * segment is going to be discarded ...", not that it looks very clever
 * in case of head skb. Due to potentional receiver driven attacks, we
 * choose to avoid immediate execution of a walk in write queue due to
 * reneging and defer head skb's loss recovery to standard loss recovery
 * procedure that will eventually trigger (nothing forbids us doing this).
I
Ilpo Järvinen 已提交
1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200
 *
 * Implements also blockage to start_seq wrap-around. Problem lies in the
 * fact that though start_seq (s) is before end_seq (i.e., not reversed),
 * there's no guarantee that it will be before snd_nxt (n). The problem
 * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
 * wrap (s_w):
 *
 *         <- outs wnd ->                          <- wrapzone ->
 *         u     e      n                         u_w   e_w  s n_w
 *         |     |      |                          |     |   |  |
 * |<------------+------+----- TCP seqno space --------------+---------->|
 * ...-- <2^31 ->|                                           |<--------...
 * ...---- >2^31 ------>|                                    |<--------...
 *
 * Current code wouldn't be vulnerable but it's better still to discard such
 * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
 * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
 * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
 * equal to the ideal case (infinite seqno space without wrap caused issues).
 *
 * With D-SACK the lower bound is extended to cover sequence space below
 * SND.UNA down to undo_marker, which is the last point of interest. Yet
1201
 * again, D-SACK block must not to go across snd_una (for the same reason as
I
Ilpo Järvinen 已提交
1202 1203 1204 1205 1206 1207 1208 1209 1210
 * for the normal SACK blocks, explained above). But there all simplicity
 * ends, TCP might receive valid D-SACKs below that. As long as they reside
 * fully below undo_marker they do not affect behavior in anyway and can
 * therefore be safely ignored. In rare cases (which are more or less
 * theoretical ones), the D-SACK will nicely cross that boundary due to skb
 * fragmentation and packet reordering past skb's retransmission. To consider
 * them correctly, the acceptable range must be extended even more though
 * the exact amount is rather hard to quantify. However, tp->max_window can
 * be used as an exaggerated estimate.
L
Linus Torvalds 已提交
1211
 */
E
Eric Dumazet 已提交
1212 1213
static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
				   u32 start_seq, u32 end_seq)
I
Ilpo Järvinen 已提交
1214 1215 1216
{
	/* Too far in future, or reversed (interpretation is ambiguous) */
	if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
E
Eric Dumazet 已提交
1217
		return false;
I
Ilpo Järvinen 已提交
1218 1219 1220

	/* Nasty start_seq wrap-around check (see comments above) */
	if (!before(start_seq, tp->snd_nxt))
E
Eric Dumazet 已提交
1221
		return false;
I
Ilpo Järvinen 已提交
1222

1223
	/* In outstanding window? ...This is valid exit for D-SACKs too.
I
Ilpo Järvinen 已提交
1224 1225 1226
	 * start_seq == snd_una is non-sensical (see comments above)
	 */
	if (after(start_seq, tp->snd_una))
E
Eric Dumazet 已提交
1227
		return true;
I
Ilpo Järvinen 已提交
1228 1229

	if (!is_dsack || !tp->undo_marker)
E
Eric Dumazet 已提交
1230
		return false;
I
Ilpo Järvinen 已提交
1231 1232

	/* ...Then it's D-SACK, and must reside below snd_una completely */
Z
Zheng Yan 已提交
1233
	if (after(end_seq, tp->snd_una))
E
Eric Dumazet 已提交
1234
		return false;
I
Ilpo Järvinen 已提交
1235 1236

	if (!before(start_seq, tp->undo_marker))
E
Eric Dumazet 已提交
1237
		return true;
I
Ilpo Järvinen 已提交
1238 1239 1240

	/* Too old */
	if (!after(end_seq, tp->undo_marker))
E
Eric Dumazet 已提交
1241
		return false;
I
Ilpo Järvinen 已提交
1242 1243 1244 1245 1246 1247 1248

	/* Undo_marker boundary crossing (overestimates a lot). Known already:
	 *   start_seq < undo_marker and end_seq >= undo_marker.
	 */
	return !before(start_seq, end_seq - tp->max_window);
}

E
Eric Dumazet 已提交
1249 1250
static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
			    struct tcp_sack_block_wire *sp, int num_sacks,
1251
			    u32 prior_snd_una, struct tcp_sacktag_state *state)
1252
{
1253
	struct tcp_sock *tp = tcp_sk(sk);
1254 1255
	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
1256
	u32 dup_segs;
1257 1258

	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1259
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
1260
	} else if (num_sacks > 1) {
1261 1262
		u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
		u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
1263

1264 1265 1266 1267 1268
		if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
			return false;
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
	} else {
		return false;
1269 1270
	}

1271
	dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
1272 1273 1274 1275 1276
	if (!dup_segs) {	/* Skip dubious DSACK */
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
		return false;
	}

1277
	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
1278

1279
	/* D-SACK for already forgotten data... Do dumb counting. */
1280
	if (tp->undo_marker && tp->undo_retrans > 0 &&
1281 1282
	    !after(end_seq_0, prior_snd_una) &&
	    after(end_seq_0, tp->undo_marker))
1283
		tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
1284

1285
	return true;
1286 1287
}

1288 1289 1290 1291 1292
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
 * the incoming SACK may not exactly match but we can find smaller MSS
 * aligned portion of it that matches. Therefore we might need to fragment
 * which may fail and creates some hassle (caller must handle error case
 * returns).
1293 1294
 *
 * FIXME: this could be merged to shift decision code
1295
 */
1296
static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
E
Eric Dumazet 已提交
1297
				  u32 start_seq, u32 end_seq)
1298
{
E
Eric Dumazet 已提交
1299 1300
	int err;
	bool in_sack;
1301
	unsigned int pkt_len;
1302
	unsigned int mss;
1303 1304 1305 1306 1307 1308

	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);

	if (tcp_skb_pcount(skb) > 1 && !in_sack &&
	    after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
1309
		mss = tcp_skb_mss(skb);
1310 1311
		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);

1312
		if (!in_sack) {
1313
			pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
1314 1315 1316
			if (pkt_len < mss)
				pkt_len = mss;
		} else {
1317
			pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
1318 1319 1320 1321 1322 1323 1324 1325 1326
			if (pkt_len < mss)
				return -EINVAL;
		}

		/* Round if necessary so that SACKs cover only full MSSes
		 * and/or the remaining small portion (if present)
		 */
		if (pkt_len > mss) {
			unsigned int new_len = (pkt_len / mss) * mss;
1327
			if (!in_sack && new_len < pkt_len)
1328 1329 1330
				new_len += mss;
			pkt_len = new_len;
		}
1331 1332 1333 1334

		if (pkt_len >= skb->len && !in_sack)
			return 0;

1335 1336
		err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
				   pkt_len, mss, GFP_ATOMIC);
1337 1338 1339 1340 1341 1342 1343
		if (err < 0)
			return err;
	}

	return in_sack;
}

1344 1345 1346 1347
/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
static u8 tcp_sacktag_one(struct sock *sk,
			  struct tcp_sacktag_state *state, u8 sacked,
			  u32 start_seq, u32 end_seq,
1348
			  int dup_sack, int pcount,
1349
			  u64 xmit_time)
1350
{
1351
	struct tcp_sock *tp = tcp_sk(sk);
1352 1353 1354

	/* Account D-SACK for retransmitted packet. */
	if (dup_sack && (sacked & TCPCB_RETRANS)) {
		if (tp->undo_marker && tp->undo_retrans > 0 &&
		    after(end_seq, tp->undo_marker))
			tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount);
		if ((sacked & TCPCB_SACKED_ACKED) &&
		    before(start_seq, state->reord))
				state->reord = start_seq;
	}

	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
	if (!after(end_seq, tp->snd_una))
		return sacked;

	if (!(sacked & TCPCB_SACKED_ACKED)) {
		tcp_rack_advance(tp, sacked, end_seq, xmit_time);

		if (sacked & TCPCB_SACKED_RETRANS) {
			/* If the segment is not tagged as lost,
			 * we do not clear RETRANS, believing
			 * that retransmission is still in flight.
			 */
			if (sacked & TCPCB_LOST) {
				sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
				tp->lost_out -= pcount;
				tp->retrans_out -= pcount;
			}
		} else {
			if (!(sacked & TCPCB_RETRANS)) {
				/* New sack for not retransmitted frame,
				 * which was in hole. It is reordering.
				 */
				if (before(start_seq,
					   tcp_highest_sack_seq(tp)) &&
				    before(start_seq, state->reord))
					state->reord = start_seq;

				if (!after(end_seq, tp->high_seq))
					state->flag |= FLAG_ORIG_SACK_ACKED;
				if (state->first_sackt == 0)
					state->first_sackt = xmit_time;
				state->last_sackt = xmit_time;
			}

			if (sacked & TCPCB_LOST) {
				sacked &= ~TCPCB_LOST;
				tp->lost_out -= pcount;
			}
		}

		sacked |= TCPCB_SACKED_ACKED;
		state->flag |= FLAG_DATA_SACKED;
		tp->sacked_out += pcount;
		/* Out-of-order packets delivered */
		state->sack_delivered += pcount;

		/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
		if (tp->lost_skb_hint &&
		    before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
			tp->lost_cnt_hint += pcount;
	}

	/* D-SACK. We can detect redundant retransmission in S|R and plain R
	 * frames and clear it. undo_retrans is decreased above, L|R frames
	 * are accounted above as well.
	 */
	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
		sacked &= ~TCPCB_SACKED_RETRANS;
		tp->retrans_out -= pcount;
	}

	return sacked;
}

/* Shift newly-SACKed bytes from this skb to the immediately previous
 * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
 */
static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
			    struct sk_buff *skb,
			    struct tcp_sacktag_state *state,
			    unsigned int pcount, int shifted, int mss,
			    bool dup_sack)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 start_seq = TCP_SKB_CB(skb)->seq;	/* start of newly-SACKed */
	u32 end_seq = start_seq + shifted;	/* end of newly-SACKed */

	BUG_ON(!pcount);

	/* Adjust counters and hints for the newly sacked sequence
	 * range but discard the return value since prev is already
	 * marked. We must tag the range first because the seq
	 * advancement below implicitly advances
	 * tcp_highest_sack_seq() when skb is highest_sack.
	 */
	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
			start_seq, end_seq, dup_sack, pcount,
			tcp_skb_timestamp_us(skb));
	tcp_rate_skb_delivered(sk, skb, state->rate);

	if (skb == tp->lost_skb_hint)
		tp->lost_cnt_hint += pcount;

	TCP_SKB_CB(prev)->end_seq += shifted;
	TCP_SKB_CB(skb)->seq += shifted;

	tcp_skb_pcount_add(prev, pcount);
	WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
	tcp_skb_pcount_add(skb, -pcount);

	/* When we're adding to gso_segs == 1, gso_size will be zero,
	 * in theory this shouldn't be necessary but as long as DSACK
	 * code can come after this skb later on it's better to keep
	 * setting gso_size to something.
	 */
	if (!TCP_SKB_CB(prev)->tcp_gso_size)
		TCP_SKB_CB(prev)->tcp_gso_size = mss;

	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
	if (tcp_skb_pcount(skb) <= 1)
		TCP_SKB_CB(skb)->tcp_gso_size = 0;

	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);

	if (skb->len > 0) {
		BUG_ON(!tcp_skb_pcount(skb));
		NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
		return false;
	}

	/* Whole SKB was eaten :-) */

	if (skb == tp->retransmit_skb_hint)
		tp->retransmit_skb_hint = prev;
	if (skb == tp->lost_skb_hint) {
		tp->lost_skb_hint = prev;
		tp->lost_cnt_hint -= tcp_skb_pcount(prev);
	}

	TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
	TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
		TCP_SKB_CB(prev)->end_seq++;

	if (skb == tcp_highest_sack(sk))
		tcp_advance_highest_sack(sk, skb);

	tcp_skb_collapse_tstamp(prev, skb);
	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
		TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;

	tcp_rtx_queue_unlink_and_free(skb, sk);

	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);

	return true;
}

/* I wish gso_size would have a bit more sane initialization than
 * something-or-zero which complicates things
 */
static int tcp_skb_seglen(const struct sk_buff *skb)
{
	return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
}

/* Shifting pages past head area doesn't work */
static int skb_can_shift(const struct sk_buff *skb)
{
	return !skb_headlen(skb) && skb_is_nonlinear(skb);
}

int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
		  int pcount, int shiftlen)
{
	/* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
	 * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
	 * to make sure not storing more than 65535 * 8 bytes per skb,
	 * even if current MSS is bigger.
	 */
	if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
		return 0;
	if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
		return 0;
	return skb_shift(to, from, shiftlen);
}
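
/* Illustrative arithmetic for the caps above (numbers are examples, not
 * taken from the code): the byte cap is 65535 * TCP_MIN_GSO_SIZE =
 * 524,280 bytes, which with a typical 1460-byte MSS is roughly 359
 * segments, so the byte check trips long before the 16-bit tcp_gso_segs
 * limit does.
 */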

/* Try collapsing SACK blocks spanning across multiple skbs to a single
 * skb.
 */
static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
					  struct tcp_sacktag_state *state,
					  u32 start_seq, u32 end_seq,
					  bool dup_sack)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *prev;
	int mss;
	int pcount = 0;
	int len;
	int in_sack;

	/* Normally R but no L won't result in plain S */
	if (!dup_sack &&
	    (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
		goto fallback;
	if (!skb_can_shift(skb))
		goto fallback;
	/* This frame is about to be dropped (was ACKed). */
	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
		goto fallback;

	/* Can only happen with delayed DSACK + discard craziness */
	prev = skb_rb_prev(skb);
	if (!prev)
		goto fallback;

	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
		goto fallback;

	if (!tcp_skb_can_collapse(prev, skb))
		goto fallback;

	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);

	if (in_sack) {
		len = skb->len;
		pcount = tcp_skb_pcount(skb);
		mss = tcp_skb_seglen(skb);

		/* TODO: Fix DSACKs to not fragment already SACKed and we can
		 * drop this restriction as unnecessary
		 */
		if (mss != tcp_skb_seglen(prev))
			goto fallback;
	} else {
		if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
			goto noop;
		/* CHECKME: This is the non-MSS split case only? Note that
		 * this will cause skipped skbs due to the advancing loop;
		 * the original code had that behaviour too.
		 */
		if (tcp_skb_pcount(skb) <= 1)
			goto noop;

		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
		if (!in_sack) {
			/* TODO: head merge to next could be attempted here
			 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
			 * though it might not be worth the additional hassle.
			 *
			 * ...we can probably just fall back to what was done
			 * previously. We could try merging non-SACKed ones
			 * as well, but it probably isn't going to pay off
			 * because later SACKs might again split them, and
			 * it would make skb timestamp tracking a considerably
			 * harder problem.
			 */
			goto fallback;
		}

		len = end_seq - TCP_SKB_CB(skb)->seq;
		BUG_ON(len < 0);
		BUG_ON(len > skb->len);

		/* MSS boundaries should be honoured or else pcount will
		 * severely break even though it makes things a bit trickier.
		 * Optimize common case to avoid most of the divides
		 */
		mss = tcp_skb_mss(skb);

		/* TODO: Fix DSACKs to not fragment already SACKed and we can
		 * drop this restriction as unnecessary
		 */
		if (mss != tcp_skb_seglen(prev))
			goto fallback;

		if (len == mss) {
			pcount = 1;
		} else if (len < mss) {
			goto noop;
		} else {
			pcount = len / mss;
			len = pcount * mss;
		}
	}

	/* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
		goto fallback;

	if (!tcp_skb_shift(prev, skb, pcount, len))
		goto fallback;
	if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
		goto out;

	/* Hole filled allows collapsing with the next as well, this is very
	 * useful when hole on every nth skb pattern happens
	 */
	skb = skb_rb_next(prev);
	if (!skb)
		goto out;

	if (!skb_can_shift(skb) ||
	    ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
	    (mss != tcp_skb_seglen(skb)))
		goto out;

	if (!tcp_skb_can_collapse(prev, skb))
		goto out;
	len = skb->len;
	pcount = tcp_skb_pcount(skb);
	if (tcp_skb_shift(prev, skb, pcount, len))
		tcp_shifted_skb(sk, prev, skb, state, pcount,
				len, mss, 0);

out:
	return prev;

noop:
	return skb;

fallback:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
	return NULL;
}

static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
					struct tcp_sack_block *next_dup,
					struct tcp_sacktag_state *state,
					u32 start_seq, u32 end_seq,
					bool dup_sack_in)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *tmp;

	skb_rbtree_walk_from(skb) {
		int in_sack = 0;
		bool dup_sack = dup_sack_in;

		/* queue is in-order => we can short-circuit the walk early */
		if (!before(TCP_SKB_CB(skb)->seq, end_seq))
			break;

		if (next_dup  &&
		    before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
			in_sack = tcp_match_skb_to_sack(sk, skb,
							next_dup->start_seq,
							next_dup->end_seq);
			if (in_sack > 0)
				dup_sack = true;
		}

		/* skb reference here is a bit tricky to get right, since
		 * shifting can eat and free both this skb and the next,
		 * so not even _safe variant of the loop is enough.
		 */
		if (in_sack <= 0) {
			tmp = tcp_shift_skb_data(sk, skb, state,
						 start_seq, end_seq, dup_sack);
			if (tmp) {
				if (tmp != skb) {
					skb = tmp;
					continue;
				}

				in_sack = 0;
			} else {
				in_sack = tcp_match_skb_to_sack(sk, skb,
								start_seq,
								end_seq);
			}
		}

		if (unlikely(in_sack < 0))
			break;

		if (in_sack) {
			TCP_SKB_CB(skb)->sacked =
				tcp_sacktag_one(sk,
						state,
						TCP_SKB_CB(skb)->sacked,
						TCP_SKB_CB(skb)->seq,
						TCP_SKB_CB(skb)->end_seq,
						dup_sack,
						tcp_skb_pcount(skb),
						tcp_skb_timestamp_us(skb));
			tcp_rate_skb_delivered(sk, skb, state->rate);
			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
				list_del_init(&skb->tcp_tsorted_anchor);

			if (!before(TCP_SKB_CB(skb)->seq,
				    tcp_highest_sack_seq(tp)))
				tcp_advance_highest_sack(sk, skb);
		}
	}
	return skb;
}

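/* Descriptive note: binary-search the rtx queue rb-tree for the skb that
 * covers @seq, i.e. seq lies in [TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 * returns NULL if no queued skb covers it.
 */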
static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq)
{
	struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
	struct sk_buff *skb;

	while (*p) {
		parent = *p;
		skb = rb_to_skb(parent);
		if (before(seq, TCP_SKB_CB(skb)->seq)) {
			p = &parent->rb_left;
			continue;
		}
		if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
			p = &parent->rb_right;
			continue;
		}
		return skb;
	}
	return NULL;
}

static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
					u32 skip_to_seq)
{
	if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
		return skb;

	return tcp_sacktag_bsearch(sk, skip_to_seq);
}

static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
						struct sock *sk,
						struct tcp_sack_block *next_dup,
						struct tcp_sacktag_state *state,
						u32 skip_to_seq)
{
	if (!next_dup)
		return skb;

	if (before(next_dup->start_seq, skip_to_seq)) {
		skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
		skb = tcp_sacktag_walk(skb, sk, NULL, state,
				       next_dup->start_seq, next_dup->end_seq,
				       1);
	}

	return skb;
}

static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
{
	return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
}

static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
			u32 prior_snd_una, struct tcp_sacktag_state *state)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const unsigned char *ptr = (skb_transport_header(ack_skb) +
				    TCP_SKB_CB(ack_skb)->sacked);
	struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
	struct tcp_sack_block sp[TCP_NUM_SACKS];
	struct tcp_sack_block *cache;
	struct sk_buff *skb;
	int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
	int used_sacks;
	bool found_dup_sack = false;
	int i, j;
	int first_sack_index;

	state->flag = 0;
	state->reord = tp->snd_nxt;

	if (!tp->sacked_out)
		tcp_highest_sack_reset(sk);

	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
					 num_sacks, prior_snd_una, state);

	/* Eliminate too old ACKs, but take into
	 * account more or less fresh ones, they can
	 * contain valid SACK info.
	 */
	if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
		return 0;

	if (!tp->packets_out)
		goto out;

	used_sacks = 0;
	first_sack_index = 0;
	for (i = 0; i < num_sacks; i++) {
		bool dup_sack = !i && found_dup_sack;

		sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
		sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);

		if (!tcp_is_sackblock_valid(tp, dup_sack,
					    sp[used_sacks].start_seq,
					    sp[used_sacks].end_seq)) {
			int mib_idx;

			if (dup_sack) {
				if (!tp->undo_marker)
					mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
				else
					mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
			} else {
				/* Don't count olds caused by ACK reordering */
				if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
				    !after(sp[used_sacks].end_seq, tp->snd_una))
					continue;
				mib_idx = LINUX_MIB_TCPSACKDISCARD;
			}

			NET_INC_STATS(sock_net(sk), mib_idx);
			if (i == 0)
				first_sack_index = -1;
			continue;
		}

		/* Ignore very old stuff early */
		if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
			if (i == 0)
				first_sack_index = -1;
			continue;
		}

		used_sacks++;
	}

	/* order SACK blocks to allow in order walk of the retrans queue */
	for (i = used_sacks - 1; i > 0; i--) {
		for (j = 0; j < i; j++) {
			if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
				swap(sp[j], sp[j + 1]);

				/* Track where the first SACK block goes to */
				if (j == first_sack_index)
					first_sack_index = j + 1;
			}
		}
	}

	state->mss_now = tcp_current_mss(sk);
	skb = NULL;
	i = 0;

	if (!tp->sacked_out) {
		/* It's already past, so skip checking against it */
		cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
	} else {
		cache = tp->recv_sack_cache;
		/* Skip empty blocks at the head of the cache */
		while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
		       !cache->end_seq)
			cache++;
	}

	while (i < used_sacks) {
		u32 start_seq = sp[i].start_seq;
		u32 end_seq = sp[i].end_seq;
		bool dup_sack = (found_dup_sack && (i == first_sack_index));
		struct tcp_sack_block *next_dup = NULL;

		if (found_dup_sack && ((i + 1) == first_sack_index))
			next_dup = &sp[i + 1];

		/* Skip too early cached blocks */
		while (tcp_sack_cache_ok(tp, cache) &&
		       !before(start_seq, cache->end_seq))
			cache++;

		/* Can skip some work by looking recv_sack_cache? */
		if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
		    after(end_seq, cache->start_seq)) {

			/* Head todo? */
			if (before(start_seq, cache->start_seq)) {
				skb = tcp_sacktag_skip(skb, sk, start_seq);
				skb = tcp_sacktag_walk(skb, sk, next_dup,
						       state,
						       start_seq,
						       cache->start_seq,
						       dup_sack);
			}

			/* Rest of the block already fully processed? */
			if (!after(end_seq, cache->end_seq))
				goto advance_sp;

			skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
						       state,
						       cache->end_seq);

			/* ...tail remains todo... */
			if (tcp_highest_sack_seq(tp) == cache->end_seq) {
				/* ...but better entrypoint exists! */
				skb = tcp_highest_sack(sk);
				if (!skb)
					break;
				cache++;
				goto walk;
			}

			skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
			/* Check overlap against next cached too (past this one already) */
			cache++;
			continue;
		}

		if (!before(start_seq, tcp_highest_sack_seq(tp))) {
			skb = tcp_highest_sack(sk);
			if (!skb)
				break;
		}
		skb = tcp_sacktag_skip(skb, sk, start_seq);

walk:
		skb = tcp_sacktag_walk(skb, sk, next_dup, state,
				       start_seq, end_seq, dup_sack);

advance_sp:
		i++;
	}

	/* Clear the head of the cache sack blocks so we can skip it next time */
	for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
		tp->recv_sack_cache[i].start_seq = 0;
		tp->recv_sack_cache[i].end_seq = 0;
	}
	for (j = 0; j < used_sacks; j++)
		tp->recv_sack_cache[i++] = sp[j];

	if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker)
		tcp_check_sack_reordering(sk, state->reord, 0);

	tcp_verify_left_out(tp);
out:

#if FASTRETRANS_DEBUG > 0
	WARN_ON((int)tp->sacked_out < 0);
	WARN_ON((int)tp->lost_out < 0);
	WARN_ON((int)tp->retrans_out < 0);
	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
	return state->flag;
}

/* Limits sacked_out so that sum with lost_out isn't ever larger than
 * packets_out. Returns false if sacked_out adjustment wasn't necessary.
 */
static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
{
	u32 holes;

	holes = max(tp->lost_out, 1U);
	holes = min(holes, tp->packets_out);

	if ((tp->sacked_out + holes) > tp->packets_out) {
		tp->sacked_out = tp->packets_out - holes;
		return true;
	}
	return false;
}

/* If we receive more dupacks than we expected when counting segments
 * under the assumption of no reordering, interpret this as reordering.
 * The only other possible explanation is a bug in the receiver's TCP.
 */
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_limit_reno_sacked(tp))
		return;

	tp->reordering = min_t(u32, tp->packets_out + addend,
			       sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
	tp->reord_seen++;
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
}

/* Emulate SACKs for SACKless connection: account for a new dupack. */

static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
{
	if (num_dupack) {
		struct tcp_sock *tp = tcp_sk(sk);
		u32 prior_sacked = tp->sacked_out;
		s32 delivered;

		tp->sacked_out += num_dupack;
		tcp_check_reno_reordering(sk, 0);
		delivered = tp->sacked_out - prior_sacked;
		if (delivered > 0)
			tcp_count_delivered(tp, delivered, ece_ack);
		tcp_verify_left_out(tp);
	}
}

/* Account for ACK, ACKing some data in Reno Recovery phase. */

static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (acked > 0) {
		/* One ACK acked hole. The rest eat duplicate ACKs. */
		tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1),
				    ece_ack);
		if (acked - 1 >= tp->sacked_out)
			tp->sacked_out = 0;
		else
			tp->sacked_out -= acked - 1;
	}
	tcp_check_reno_reordering(sk, acked);
	tcp_verify_left_out(tp);
}

static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
{
	tp->sacked_out = 0;
}

void tcp_clear_retrans(struct tcp_sock *tp)
{
	tp->retrans_out = 0;
	tp->lost_out = 0;
	tp->undo_marker = 0;
	tp->undo_retrans = -1;
	tp->sacked_out = 0;
}

static inline void tcp_init_undo(struct tcp_sock *tp)
{
	tp->undo_marker = tp->snd_una;
	/* Retransmission still in flight may cause DSACKs later. */
	tp->undo_retrans = tp->retrans_out ? : -1;
}

static bool tcp_is_rack(const struct sock *sk)
{
	return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
}

/* If we detect SACK reneging, forget all SACK information
 * and reset tags completely, otherwise preserve SACKs. If receiver
 * dropped its ofo queue, we will know this due to reneging detection.
 */
static void tcp_timeout_mark_lost(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb, *head;
	bool is_reneg;			/* is receiver reneging on SACKs? */

	head = tcp_rtx_queue_head(sk);
	is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
	if (is_reneg) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
		tp->sacked_out = 0;
		/* Mark SACK reneging until we recover from this loss event. */
		tp->is_sack_reneg = 1;
	} else if (tcp_is_reno(tp)) {
		tcp_reset_reno_sack(tp);
	}

	skb = head;
	skb_rbtree_walk_from(skb) {
		if (is_reneg)
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
		else if (tcp_is_rack(sk) && skb != head &&
			 tcp_rack_skb_timeout(tp, skb, 0) > 0)
			continue; /* Don't mark recently sent ones lost yet */
		tcp_mark_skb_lost(sk, skb);
	}
	tcp_verify_left_out(tp);
	tcp_clear_all_retrans_hints(tp);
}

/* Enter Loss state. */
void tcp_enter_loss(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;

	tcp_timeout_mark_lost(sk);

	/* Reduce ssthresh if it has not yet been made inside this window. */
	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
	    !after(tp->high_seq, tp->snd_una) ||
	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
		tp->prior_ssthresh = tcp_current_ssthresh(sk);
		tp->prior_cwnd = tp->snd_cwnd;
		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
		tcp_ca_event(sk, CA_EVENT_LOSS);
		tcp_init_undo(tp);
	}
	tp->snd_cwnd	   = tcp_packets_in_flight(tp) + 1;
	tp->snd_cwnd_cnt   = 0;
	tp->snd_cwnd_stamp = tcp_jiffies32;

	/* Timeout in disordered state after receiving substantial DUPACKs
	 * suggests that the degree of reordering is over-estimated.
	 */
	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
	    tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
		tp->reordering = min_t(unsigned int, tp->reordering,
				       net->ipv4.sysctl_tcp_reordering);
	tcp_set_ca_state(sk, TCP_CA_Loss);
	tp->high_seq = tp->snd_nxt;
	tcp_ecn_queue_cwr(tp);

	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
	 * loss recovery is underway except recurring timeout(s) on
	 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
	 */
	tp->frto = net->ipv4.sysctl_tcp_frto &&
		   (new_recovery || icsk->icsk_retransmits) &&
		   !inet_csk(sk)->icsk_mtup.probe_size;
}

/* If ACK arrived pointing to a remembered SACK, it means that our
 * remembered SACKs do not reflect real state of receiver i.e.
 * receiver _host_ is heavily congested (or buggy).
 *
 * To avoid big spurious retransmission bursts due to transient SACK
 * scoreboard oddities that look like reneging, we give the receiver a
 * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
 * restore sanity to the SACK scoreboard. If the apparent reneging
 * persists until this RTO then we'll clear the SACK scoreboard.
 */
static bool tcp_check_sack_reneging(struct sock *sk, int flag)
{
	if (flag & FLAG_SACK_RENEGING) {
		struct tcp_sock *tp = tcp_sk(sk);
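		/* tp->srtt_us stores the smoothed RTT scaled by 8 (<< 3),
		 * so srtt_us >> 4 below is roughly RTT / 2, matching the
		 * max(RTT/2, 10ms) delay described above.
		 */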
		unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
					  msecs_to_jiffies(10));

		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  delay, TCP_RTO_MAX);
		return true;
	}
	return false;
}

/* Heuristics to calculate number of duplicate ACKs. There's no dupACKs
 * counter when SACK is enabled (without SACK, sacked_out is used for
 * that purpose).
 *
 * With reordering, holes may still be in flight, so RFC3517 recovery
 * uses pure sacked_out (total number of SACKed segments) even though
 * it violates the RFC that uses duplicate ACKs; often these are equal,
 * but when e.g. out-of-window ACKs or packet duplication occurs,
 * they differ. Since neither occurs due to loss, TCP should really
 * ignore them.
 */
static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
	return tp->sacked_out + 1;
}

/* Linux NewReno/SACK/ECN state machine.
 * --------------------------------------
 *
 * "Open"	Normal state, no dubious events, fast path.
 * "Disorder"   In all the respects it is "Open",
 *		but requires a bit more attention. It is entered when
 *		we see some SACKs or dupacks. It is split of "Open"
 *		mainly to move some processing from fast path to slow one.
 * "CWR"	CWND was reduced due to some Congestion Notification event.
 *		It can be ECN, ICMP source quench, local device congestion.
 * "Recovery"	CWND was reduced, we are fast-retransmitting.
 * "Loss"	CWND was reduced due to RTO timeout or SACK reneging.
 *
 * tcp_fastretrans_alert() is entered:
 * - each incoming ACK, if state is not "Open"
 * - when arrived ACK is unusual, namely:
 *	* SACK
 *	* Duplicate ACK.
 *	* ECN ECE.
 *
 * Counting packets in flight is pretty simple.
 *
 *	in_flight = packets_out - left_out + retrans_out
 *
 *	packets_out is SND.NXT-SND.UNA counted in packets.
 *
 *	retrans_out is number of retransmitted segments.
 *
 *	left_out is number of segments left network, but not ACKed yet.
 *
 *		left_out = sacked_out + lost_out
 *
 *     sacked_out: Packets, which arrived to receiver out of order
 *		   and hence not ACKed. With SACKs this number is simply
 *		   amount of SACKed data. Even without SACKs
 *		   it is easy to give pretty reliable estimate of this number,
 *		   counting duplicate ACKs.
 *
 *       lost_out: Packets lost by network. TCP has no explicit
 *		   "loss notification" feedback from network (for now).
 *		   It means that this number can be only _guessed_.
 *		   Actually, it is the heuristics to predict lossage that
 *		   distinguishes different algorithms.
 *
 *	F.e. after RTO, when all the queue is considered as lost,
 *	lost_out = packets_out and in_flight = retrans_out.
 *
 *		Essentially, we have now a few algorithms detecting
 *		lost packets.
 *
 *		If the receiver supports SACK:
 *
 *		RFC6675/3517: It is the conventional algorithm. A packet is
 *		considered lost if the number of higher sequence packets
 *		SACKed is greater than or equal to the DUPACK threshold
 *		(reordering). This is implemented in tcp_mark_head_lost and
 *		tcp_update_scoreboard.
 *
 *		RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
 *		(2017-) that checks timing instead of counting DUPACKs.
 *		Essentially a packet is considered lost if it's not S/ACKed
 *		after RTT + reordering_window, where both metrics are
 *		dynamically measured and adjusted. This is implemented in
 *		tcp_rack_mark_lost.
 *
 *		If the receiver does not support SACK:
 *
 *		NewReno (RFC6582): in Recovery we assume that one segment
 *		is lost (classic Reno). While we are in Recovery and
 *		a partial ACK arrives, we assume that one more packet
 *		is lost (NewReno). These heuristics are the same in NewReno
 *		and SACK.
 *
 * Really tricky (and requiring careful tuning) part of algorithm
 * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
 * The first determines the moment _when_ we should reduce CWND and,
 * hence, slow down forward transmission. In fact, it determines the moment
 * when we decide that hole is caused by loss, rather than by a reorder.
 *
 * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
 * holes, caused by lost packets.
 *
 * And the most logically complicated part of algorithm is undo
 * heuristics. We detect false retransmits due to both too early
 * fast retransmit (reordering) and underestimated RTO, analyzing
 * timestamps and D-SACKs. When we detect that some segments were
 * retransmitted by mistake and CWND reduction was wrong, we undo
 * window reduction and abort recovery phase. This logic is hidden
 * inside several functions named tcp_try_undo_<something>.
 */
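
/* Worked example of the bookkeeping above (illustrative numbers only):
 * with packets_out = 10, sacked_out = 3, lost_out = 2 and retrans_out = 2,
 *	left_out  = sacked_out + lost_out                = 3 + 2 = 5
 *	in_flight = packets_out - left_out + retrans_out = 10 - 5 + 2 = 7
 * i.e. seven segments are still assumed to be consuming network capacity.
 */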

/* This function decides, when we should leave Disordered state
 * and enter Recovery phase, reducing congestion window.
 *
 * Main question: may we further continue forward transmission
 * with the same cwnd?
 */
static bool tcp_time_to_recover(struct sock *sk, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Trick#1: The loss is proven. */
	if (tp->lost_out)
		return true;

	/* Not-A-Trick#2 : Classic rule... */
	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
		return true;

	return false;
}

/* Detect loss in event "A" above by marking head of queue up as lost.
 * For RFC3517 SACK, a segment is considered lost if it
 * has at least tp->reordering SACKed segments above it; "packets" refers to
 * the maximum SACKed segments to pass before reaching this limit.
 */
static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int cnt;
	/* Use SACK to deduce losses of new sequences sent during recovery */
	const u32 loss_high = tp->snd_nxt;

	WARN_ON(packets > tp->packets_out);
	skb = tp->lost_skb_hint;
	if (skb) {
		/* Head already handled? */
		if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
			return;
		cnt = tp->lost_cnt_hint;
	} else {
		skb = tcp_rtx_queue_head(sk);
		cnt = 0;
	}

	skb_rbtree_walk_from(skb) {
		/* TODO: do this better */
		/* this is not the most efficient way to do this... */
		tp->lost_skb_hint = skb;
		tp->lost_cnt_hint = cnt;

		if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
			break;

		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
			cnt += tcp_skb_pcount(skb);

		if (cnt > packets)
			break;

		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
			tcp_mark_skb_lost(sk, skb);

		if (mark_head)
			break;
	}
	tcp_verify_left_out(tp);
}

/* Account newly detected lost packet(s) */

static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_is_sack(tp)) {
		int sacked_upto = tp->sacked_out - tp->reordering;
		if (sacked_upto >= 0)
			tcp_mark_head_lost(sk, sacked_upto, 0);
		else if (fast_rexmit)
			tcp_mark_head_lost(sk, 1, 1);
	}
}

static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
{
	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
	       before(tp->rx_opt.rcv_tsecr, when);
}

/* skb is spurious retransmitted if the returned timestamp echo
 * reply is prior to the skb transmission time
 */
static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
				     const struct sk_buff *skb)
{
	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
}

/* Nothing was retransmitted or returned timestamp is less
 * than timestamp of the first retransmission.
 */
static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
	return tp->retrans_stamp &&
	       tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
}

/* Undo procedures. */

/* We can clear retrans_stamp when there are no retransmissions in the
 * window. It would seem that it is trivially available for us in
 * tp->retrans_out; however, that kind of assumption doesn't consider
 * what will happen if errors occur when sending a retransmission for the
 * second time. ...It could be that such a segment has only
 * TCPCB_EVER_RETRANS set at the present time. It seems that checking
 * the head skb is enough except for some reneging corner cases that
 * are not worth the effort.
 *
 * Main reason for all this complexity is the fact that connection dying
 * time now depends on the validity of the retrans_stamp, in particular,
 * that successive retransmissions of a segment must not advance
 * retrans_stamp under any conditions.
 */
static bool tcp_any_retrans_done(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (tp->retrans_out)
		return true;

	skb = tcp_rtx_queue_head(sk);
	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
		return true;

	return false;
}

static void DBGUNDO(struct sock *sk, const char *msg)
{
#if FASTRETRANS_DEBUG > 1
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);

	if (sk->sk_family == AF_INET) {
		pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
			 msg,
			 &inet->inet_daddr, ntohs(inet->inet_dport),
			 tp->snd_cwnd, tcp_left_out(tp),
			 tp->snd_ssthresh, tp->prior_ssthresh,
			 tp->packets_out);
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (sk->sk_family == AF_INET6) {
		pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
			 msg,
			 &sk->sk_v6_daddr, ntohs(inet->inet_dport),
			 tp->snd_cwnd, tcp_left_out(tp),
			 tp->snd_ssthresh, tp->prior_ssthresh,
			 tp->packets_out);
	}
#endif
#endif
}
static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (unmark_loss) {
		struct sk_buff *skb;

		skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
		}
		tp->lost_out = 0;
		tcp_clear_all_retrans_hints(tp);
	}

	if (tp->prior_ssthresh) {
		const struct inet_connection_sock *icsk = inet_csk(sk);

		tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);

		if (tp->prior_ssthresh > tp->snd_ssthresh) {
			tp->snd_ssthresh = tp->prior_ssthresh;
			tcp_ecn_withdraw_cwr(tp);
		}
	}
	tp->snd_cwnd_stamp = tcp_jiffies32;
	tp->undo_marker = 0;
	tp->rack.advanced = 1; /* Force RACK to re-exam losses */
}

static inline bool tcp_may_undo(const struct tcp_sock *tp)
{
	return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
}

/* People celebrate: "We love our President!" */
static bool tcp_try_undo_recovery(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_may_undo(tp)) {
		int mib_idx;

		/* Happy end! We did not retransmit anything
		 * or our original transmission succeeded.
		 */
		DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
		tcp_undo_cwnd_reduction(sk, false);
		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
			mib_idx = LINUX_MIB_TCPLOSSUNDO;
		else
			mib_idx = LINUX_MIB_TCPFULLUNDO;

		NET_INC_STATS(sock_net(sk), mib_idx);
	} else if (tp->rack.reo_wnd_persist) {
		tp->rack.reo_wnd_persist--;
	}
	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
		/* Hold old state until something *above* high_seq
		 * is ACKed. For Reno it is MUST to prevent false
		 * fast retransmits (RFC2582). SACK TCP is safe. */
		if (!tcp_any_retrans_done(sk))
			tp->retrans_stamp = 0;
		return true;
	}
	tcp_set_ca_state(sk, TCP_CA_Open);
	tp->is_sack_reneg = 0;
	return false;
}

/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
static bool tcp_try_undo_dsack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->undo_marker && !tp->undo_retrans) {
		tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
					       tp->rack.reo_wnd_persist + 1);
		DBGUNDO(sk, "D-SACK");
		tcp_undo_cwnd_reduction(sk, false);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
		return true;
	}
	return false;
}

/* Undo during loss recovery after partial ACK or using F-RTO. */
static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (frto_undo || tcp_may_undo(tp)) {
		tcp_undo_cwnd_reduction(sk, true);

		DBGUNDO(sk, "partial loss");
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
		if (frto_undo)
			NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPSPURIOUSRTOS);
		inet_csk(sk)->icsk_retransmits = 0;
		if (frto_undo || tcp_is_sack(tp)) {
			tcp_set_ca_state(sk, TCP_CA_Open);
			tp->is_sack_reneg = 0;
		}
		return true;
	}
	return false;
}

/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
 * It computes the number of packets to send (sndcnt) based on packets newly
 * delivered:
 *   1) If the packets in flight is larger than ssthresh, PRR spreads the
 *	cwnd reductions across a full RTT.
 *   2) Otherwise PRR uses packet conservation to send as much as delivered.
 *      But when SND_UNA is acked without further losses,
 *      slow starts cwnd up to ssthresh to speed up the recovery.
 */
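
/* Illustrative PRR arithmetic (example numbers, not taken from the code):
 * with prior_cwnd = 10, ssthresh = 5, prr_delivered = 4 and prr_out = 1
 * while in-flight is still above ssthresh, the proportional branch gives
 *	sndcnt = DIV_ROUND_UP(5 * 4, 10) - 1 = 2 - 1 = 1
 * so roughly one new segment is released for every two delivered, walking
 * cwnd down towards ssthresh over about one RTT.
 */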
static void tcp_init_cwnd_reduction(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->high_seq = tp->snd_nxt;
	tp->tlp_high_seq = 0;
	tp->snd_cwnd_cnt = 0;
	tp->prior_cwnd = tp->snd_cwnd;
	tp->prr_delivered = 0;
	tp->prr_out = 0;
	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
	tcp_ecn_queue_cwr(tp);
}

void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int sndcnt = 0;
	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);

	if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
		return;

	tp->prr_delivered += newly_acked_sacked;
	if (delta < 0) {
		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
			       tp->prior_cwnd - 1;
		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
	} else if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) {
		sndcnt = min_t(int, delta,
			       max_t(int, tp->prr_delivered - tp->prr_out,
				     newly_acked_sacked) + 1);
	} else {
		sndcnt = min(delta, newly_acked_sacked);
	}
	/* Force a fast retransmit upon entering fast recovery */
	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}

static inline void tcp_end_cwnd_reduction(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (inet_csk(sk)->icsk_ca_ops->cong_control)
		return;

	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
	if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
	    (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) {
		tp->snd_cwnd = tp->snd_ssthresh;
		tp->snd_cwnd_stamp = tcp_jiffies32;
	}
	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}

/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
void tcp_enter_cwr(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->prior_ssthresh = 0;
	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
		tp->undo_marker = 0;
		tcp_init_cwnd_reduction(sk);
		tcp_set_ca_state(sk, TCP_CA_CWR);
	}
}
EXPORT_SYMBOL(tcp_enter_cwr);

static void tcp_try_keep_open(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int state = TCP_CA_Open;

	if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
		state = TCP_CA_Disorder;

	if (inet_csk(sk)->icsk_ca_state != state) {
		tcp_set_ca_state(sk, state);
		tp->high_seq = tp->snd_nxt;
	}
}

static void tcp_try_to_open(struct sock *sk, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_verify_left_out(tp);

	if (!tcp_any_retrans_done(sk))
		tp->retrans_stamp = 0;

	if (flag & FLAG_ECE)
		tcp_enter_cwr(sk);

	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
		tcp_try_keep_open(sk);
	}
}

static void tcp_mtup_probe_failed(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
	icsk->icsk_mtup.probe_size = 0;
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
}

static void tcp_mtup_probe_success(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	/* FIXME: breaks with very large cwnd */
	tp->prior_ssthresh = tcp_current_ssthresh(sk);
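	/* Scale cwnd by the ratio of the previous MTU to the newly probed
	 * MTU so that the number of bytes in flight stays roughly unchanged.
	 */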
	tp->snd_cwnd = tp->snd_cwnd *
		       tcp_mss_to_mtu(sk, tp->mss_cache) /
		       icsk->icsk_mtup.probe_size;
	tp->snd_cwnd_cnt = 0;
	tp->snd_cwnd_stamp = tcp_jiffies32;
	tp->snd_ssthresh = tcp_current_ssthresh(sk);

	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
	icsk->icsk_mtup.probe_size = 0;
	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}

/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer. This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int mss;

	/* A fastopen SYN request is stored as two separate packets within
	 * the retransmit queue, this is done by tcp_send_syn_data().
	 * As a result simply checking the MSS of the frames in the queue
	 * will not work for the SYN packet.
	 *
	 * Us being here is an indication of a path MTU issue so we can
	 * assume that the fastopen SYN was lost and just mark all the
	 * frames in the retransmit queue as lost. We will use an MSS of
	 * -1 to mark all frames as lost, otherwise compute the current MSS.
	 */
	if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
		mss = -1;
	else
		mss = tcp_current_mss(sk);

	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
		if (tcp_skb_seglen(skb) > mss)
			tcp_mark_skb_lost(sk, skb);
	}

	tcp_clear_retrans_hints_partial(tp);

	if (!tp->lost_out)
		return;

	if (tcp_is_reno(tp))
		tcp_limit_reno_sacked(tp);

	tcp_verify_left_out(tp);

	/* Don't muck with the congestion window here.
	 * Reason is that we do not increase amount of _data_
	 * in network, but units changed and effective
	 * cwnd/ssthresh really reduced now.
	 */
	if (icsk->icsk_ca_state != TCP_CA_Loss) {
		tp->high_seq = tp->snd_nxt;
		tp->snd_ssthresh = tcp_current_ssthresh(sk);
		tp->prior_ssthresh = 0;
		tp->undo_marker = 0;
		tcp_set_ca_state(sk, TCP_CA_Loss);
	}
	tcp_xmit_retransmit_queue(sk);
}
EXPORT_SYMBOL(tcp_simple_retransmit);

void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mib_idx;

	if (tcp_is_reno(tp))
		mib_idx = LINUX_MIB_TCPRENORECOVERY;
	else
		mib_idx = LINUX_MIB_TCPSACKRECOVERY;

	NET_INC_STATS(sock_net(sk), mib_idx);

	tp->prior_ssthresh = 0;
	tcp_init_undo(tp);

	if (!tcp_in_cwnd_reduction(sk)) {
		if (!ece_ack)
			tp->prior_ssthresh = tcp_current_ssthresh(sk);
		tcp_init_cwnd_reduction(sk);
	}
	tcp_set_ca_state(sk, TCP_CA_Recovery);
}

/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
 * recovered or spurious. Otherwise retransmits more on partial ACKs.
 */
static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
			     int *rexmit)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool recovered = !before(tp->snd_una, tp->high_seq);

	if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
	    tcp_try_undo_loss(sk, false))
		return;

	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
		/* Step 3.b. A timeout is spurious if not all data are
		 * lost, i.e., never-retransmitted data are (s)acked.
		 */
		if ((flag & FLAG_ORIG_SACK_ACKED) &&
		    tcp_try_undo_loss(sk, true))
			return;

		if (after(tp->snd_nxt, tp->high_seq)) {
			if (flag & FLAG_DATA_SACKED || num_dupack)
				tp->frto = 0; /* Step 3.a. loss was real */
		} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
			tp->high_seq = tp->snd_nxt;
			/* Step 2.b. Try send new data (but deferred until cwnd
			 * is updated in tcp_ack()). Otherwise fall back to
			 * the conventional recovery.
			 */
			if (!tcp_write_queue_empty(sk) &&
			    after(tcp_wnd_end(tp), tp->snd_nxt)) {
				*rexmit = REXMIT_NEW;
				return;
			}
			tp->frto = 0;
		}
	}

	if (recovered) {
		/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
		tcp_try_undo_recovery(sk);
		return;
	}
	if (tcp_is_reno(tp)) {
		/* A Reno DUPACK means new data in F-RTO step 2.b above are
		 * delivered. Lower inflight to clock out (re)transmissions.
		 */
		if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
			tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
		else if (flag & FLAG_SND_UNA_ADVANCED)
			tcp_reset_reno_sack(tp);
	}
	*rexmit = REXMIT_LOST;
}

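/* Fast retransmit is warranted once SACKed data reaches more than one
 * reordering window (tp->reordering * tp->mss_cache bytes) beyond SND.UNA.
 */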
static bool tcp_force_fast_retransmit(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	return after(tcp_highest_sack_seq(tp),
		     tp->snd_una + tp->reordering * tp->mss_cache);
}

/* Undo during fast recovery after partial ACK. */
static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
				 bool *do_lost)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->undo_marker && tcp_packet_delayed(tp)) {
		/* Plain luck! Hole is filled with delayed
		 * packet, rather than with a retransmit. Check reordering.
		 */
		tcp_check_sack_reordering(sk, prior_snd_una, 1);

		/* We are getting evidence that the reordering degree is higher
		 * than we realized. If there are no retransmits out then we
		 * can undo. Otherwise we clock out new packets but do not
		 * mark more packets lost or retransmit more.
		 */
		if (tp->retrans_out)
			return true;

		if (!tcp_any_retrans_done(sk))
			tp->retrans_stamp = 0;

		DBGUNDO(sk, "partial recovery");
		tcp_undo_cwnd_reduction(sk, true);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
		tcp_try_keep_open(sk);
	} else {
		/* Partial ACK arrived. Force fast retransmit. */
		*do_lost = tcp_force_fast_retransmit(sk);
	}
	return false;
}

static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_rtx_queue_empty(sk))
		return;

	if (unlikely(tcp_is_reno(tp))) {
		tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
	} else if (tcp_is_rack(sk)) {
		u32 prior_retrans = tp->retrans_out;

		if (tcp_rack_mark_lost(sk))
			*ack_flag &= ~FLAG_SET_XMIT_TIMER;
		if (prior_retrans > tp->retrans_out)
			*ack_flag |= FLAG_LOST_RETRANS;
	}
}

/* Process an event, which can update packets-in-flight not trivially.
 * Main goal of this function is to calculate new estimate for left_out,
 * taking into account both packets sitting in receiver's buffer and
 * packets lost by network.
 *
 * Besides that it updates the congestion state when packet loss or ECN
 * is detected. But it does not reduce the cwnd, it is done by the
 * congestion control later.
 *
 * It does _not_ decide what to send, it is made in function
 * tcp_xmit_retransmit_queue().
 */
static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
				  int num_dupack, int *ack_flag, int *rexmit)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int fast_rexmit = 0, flag = *ack_flag;
	bool ece_ack = flag & FLAG_ECE;
	bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
				      tcp_force_fast_retransmit(sk));

	if (!tp->packets_out && tp->sacked_out)
		tp->sacked_out = 0;

	/* Now state machine starts.
	 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
	if (ece_ack)
		tp->prior_ssthresh = 0;

	/* B. In all the states check for reneging SACKs. */
	if (tcp_check_sack_reneging(sk, flag))
		return;

	/* C. Check consistency of the current state. */
	tcp_verify_left_out(tp);

	/* D. Check state exit conditions. State can be terminated
	 *    when high_seq is ACKed. */
	if (icsk->icsk_ca_state == TCP_CA_Open) {
		WARN_ON(tp->retrans_out != 0 && !tp->syn_data);
		tp->retrans_stamp = 0;
	} else if (!before(tp->snd_una, tp->high_seq)) {
		switch (icsk->icsk_ca_state) {
		case TCP_CA_CWR:
			/* CWR is to be held something *above* high_seq
			 * is ACKed for CWR bit to reach receiver. */
			if (tp->snd_una != tp->high_seq) {
				tcp_end_cwnd_reduction(sk);
				tcp_set_ca_state(sk, TCP_CA_Open);
			}
			break;

		case TCP_CA_Recovery:
			if (tcp_is_reno(tp))
				tcp_reset_reno_sack(tp);
			if (tcp_try_undo_recovery(sk))
				return;
			tcp_end_cwnd_reduction(sk);
			break;
		}
	}

	/* E. Process state. */
	switch (icsk->icsk_ca_state) {
	case TCP_CA_Recovery:
		if (!(flag & FLAG_SND_UNA_ADVANCED)) {
			if (tcp_is_reno(tp))
				tcp_add_reno_sack(sk, num_dupack, ece_ack);
		} else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
			return;

		if (tcp_try_undo_dsack(sk))
			tcp_try_keep_open(sk);

2997
		tcp_identify_packet_loss(sk, ack_flag);
2998 2999 3000 3001 3002 3003 3004 3005
		if (icsk->icsk_ca_state != TCP_CA_Recovery) {
			if (!tcp_time_to_recover(sk, flag))
				return;
			/* Undo reverts the recovery state. If loss is evident,
			 * starts a new recovery (e.g. reordering then loss);
			 */
			tcp_enter_recovery(sk, ece_ack);
		}
L
Linus Torvalds 已提交
3006 3007
		break;
	case TCP_CA_Loss:
3008
		tcp_process_loss(sk, flag, num_dupack, rexmit);
3009
		tcp_identify_packet_loss(sk, ack_flag);
3010 3011
		if (!(icsk->icsk_ca_state == TCP_CA_Open ||
		      (*ack_flag & FLAG_LOST_RETRANS)))
L
Linus Torvalds 已提交
3012
			return;
3013
		/* Change state if cwnd is undone or retransmits are lost */
J
Joe Perches 已提交
3014
		fallthrough;
L
Linus Torvalds 已提交
3015
	default:
3016
		if (tcp_is_reno(tp)) {
3017
			if (flag & FLAG_SND_UNA_ADVANCED)
L
Linus Torvalds 已提交
3018
				tcp_reset_reno_sack(tp);
3019
			tcp_add_reno_sack(sk, num_dupack, ece_ack);
L
Linus Torvalds 已提交
3020 3021
		}

3022
		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
3023
			tcp_try_undo_dsack(sk);
L
Linus Torvalds 已提交
3024

3025
		tcp_identify_packet_loss(sk, ack_flag);
3026
		if (!tcp_time_to_recover(sk, flag)) {
3027
			tcp_try_to_open(sk, flag);
L
Linus Torvalds 已提交
3028 3029 3030
			return;
		}

J
John Heffner 已提交
3031 3032 3033
		/* MTU probe failure: don't reduce cwnd */
		if (icsk->icsk_ca_state < TCP_CA_CWR &&
		    icsk->icsk_mtup.probe_size &&
3034
		    tp->snd_una == tp->mtu_probe.probe_seq_start) {
J
John Heffner 已提交
3035 3036 3037 3038 3039 3040 3041
			tcp_mtup_probe_failed(sk);
			/* Restores the reduction we did in tcp_mtup_probe() */
			tp->snd_cwnd++;
			tcp_simple_retransmit(sk);
			return;
		}

L
Linus Torvalds 已提交
3042
		/* Otherwise enter Recovery state */
3043
		tcp_enter_recovery(sk, ece_ack);
3044
		fast_rexmit = 1;
L
Linus Torvalds 已提交
3045 3046
	}

3047
	if (!tcp_is_rack(sk) && do_lost)
3048
		tcp_update_scoreboard(sk, fast_rexmit);
3049
	*rexmit = REXMIT_LOST;
L
Linus Torvalds 已提交
3050 3051
}

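/* Feed an RTT sample into the windowed min-RTT filter read by tcp_min_rtt(). */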
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
{
	u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
	struct tcp_sock *tp = tcp_sk(sk);

	if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
		/* If the remote keeps returning delayed ACKs, eventually
		 * the min filter would pick it up and overestimate the
		 * prop. delay when it expires. Skip suspected delayed ACKs.
		 */
		return;
	}
	minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
			   rtt_us ? : jiffies_to_usecs(1));
}

static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
			       long seq_rtt_us, long sack_rtt_us,
			       long ca_rtt_us, struct rate_sample *rs)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	/* Prefer RTT measured from ACK's timing to TS-ECR. This is because
	 * broken middle-boxes or peers may corrupt TS-ECR fields. But
	 * Karn's algorithm forbids taking RTT if some retransmitted data
	 * is acked (RFC6298).
	 */
	if (seq_rtt_us < 0)
		seq_rtt_us = sack_rtt_us;

	/* RTTM Rule: A TSecr value received in a segment is used to
	 * update the averaged RTT measurement only if the segment
	 * acknowledges some new data, i.e., only if it advances the
	 * left edge of the send window.
	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
	 */
	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
	    flag & FLAG_ACKED) {
		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;

		if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
			if (!delta)
				delta = 1;
			seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
			ca_rtt_us = seq_rtt_us;
		}
	}
	rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
	if (seq_rtt_us < 0)
		return false;

	/* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
	 * always taken together with ACK, SACK, or TS-opts. Any negative
	 * values will be skipped with the seq_rtt_us < 0 check above.
	 */
	tcp_update_rtt_min(sk, ca_rtt_us, flag);
	tcp_rtt_estimator(sk, seq_rtt_us);
	tcp_set_rto(sk);

	/* RFC6298: only reset backoff on valid RTT measurement. */
	inet_csk(sk)->icsk_backoff = 0;
	return true;
}

/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
{
	struct rate_sample rs;
	long rtt_us = -1L;

	if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
		rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack);

	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs);
}

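/* Ask the congestion control module to grow the congestion window and
 * refresh snd_cwnd_stamp.
 */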
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
	tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32;
}

/* Restart timer after forward progress on connection.
 * RFC2988 recommends to restart timer to now+rto.
 */
void tcp_rearm_rto(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* If the retrans timer is currently being used by Fast Open
	 * for SYN-ACK retrans purpose, stay put.
	 */
	if (rcu_access_pointer(tp->fastopen_rsk))
		return;

	if (!tp->packets_out) {
		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
	} else {
		u32 rto = inet_csk(sk)->icsk_rto;
		/* Offset the time elapsed after installing regular RTO */
		if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
		    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
			s64 delta_us = tcp_rto_delta_us(sk);
			/* delta_us may not be positive if the socket is locked
			 * when the retrans timer fires and is rescheduled.
			 */
			rto = usecs_to_jiffies(max_t(int, delta_us, 1));
		}
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
				     TCP_RTO_MAX);
	}
}

/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
static void tcp_set_xmit_timer(struct sock *sk)
{
	if (!tcp_schedule_loss_probe(sk, true))
		tcp_rearm_rto(sk);
}

/* If we get here, the whole TSO packet has not been acked. */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 packets_acked;

	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));

	packets_acked = tcp_skb_pcount(skb);
	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
		return 0;
	packets_acked -= tcp_skb_pcount(skb);

	if (packets_acked) {
		BUG_ON(tcp_skb_pcount(skb) == 0);
		BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
	}

	return packets_acked;
}

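/* Deliver an SCM_TSTAMP_ACK timestamp if this skb's tskey has just been
 * cumulatively acknowledged.
 */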
static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
			   const struct sk_buff *ack_skb, u32 prior_snd_una)
{
	const struct skb_shared_info *shinfo;

	/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
	if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
		return;

	shinfo = skb_shinfo(skb);
	if (!before(shinfo->tskey, prior_snd_una) &&
	    before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
		tcp_skb_tsorted_save(skb) {
			__skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK);
		} tcp_skb_tsorted_restore(skb);
	}
}

/* Remove acknowledged frames from the retransmission queue. If our packet
 * is before the ack sequence we can discard it as it's confirmed to have
 * arrived at the other end.
 */
static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
			       u32 prior_fack, u32 prior_snd_una,
			       struct tcp_sacktag_state *sack, bool ece_ack)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	u64 first_ackt, last_ackt;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 prior_sacked = tp->sacked_out;
	u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
	struct sk_buff *skb, *next;
	bool fully_acked = true;
	long sack_rtt_us = -1L;
	long seq_rtt_us = -1L;
	long ca_rtt_us = -1L;
	u32 pkts_acked = 0;
	bool rtt_update;
	int flag = 0;

	first_ackt = 0;

	for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
		const u32 start_seq = scb->seq;
		u8 sacked = scb->sacked;
		u32 acked_pcount;

		/* Determine how many packets and what bytes were acked, tso and else */
		if (after(scb->end_seq, tp->snd_una)) {
			if (tcp_skb_pcount(skb) == 1 ||
			    !after(tp->snd_una, scb->seq))
				break;

			acked_pcount = tcp_tso_acked(sk, skb);
			if (!acked_pcount)
				break;
			fully_acked = false;
		} else {
			acked_pcount = tcp_skb_pcount(skb);
		}

		if (unlikely(sacked & TCPCB_RETRANS)) {
			if (sacked & TCPCB_SACKED_RETRANS)
				tp->retrans_out -= acked_pcount;
			flag |= FLAG_RETRANS_DATA_ACKED;
		} else if (!(sacked & TCPCB_SACKED_ACKED)) {
			last_ackt = tcp_skb_timestamp_us(skb);
			WARN_ON_ONCE(last_ackt == 0);
			if (!first_ackt)
				first_ackt = last_ackt;

			if (before(start_seq, reord))
				reord = start_seq;
			if (!after(scb->end_seq, tp->high_seq))
				flag |= FLAG_ORIG_SACK_ACKED;
		}

		if (sacked & TCPCB_SACKED_ACKED) {
			tp->sacked_out -= acked_pcount;
		} else if (tcp_is_sack(tp)) {
			tcp_count_delivered(tp, acked_pcount, ece_ack);
			if (!tcp_skb_spurious_retrans(tp, skb))
				tcp_rack_advance(tp, sacked, scb->end_seq,
						 tcp_skb_timestamp_us(skb));
		}
		if (sacked & TCPCB_LOST)
			tp->lost_out -= acked_pcount;

		tp->packets_out -= acked_pcount;
		pkts_acked += acked_pcount;
		tcp_rate_skb_delivered(sk, skb, sack->rate);

		/* Initial outgoing SYN's get put onto the write_queue
		 * just like anything else we transmit.  It is not
		 * true data, and if we misinform our callers that
		 * this ACK acks real data, we will erroneously exit
		 * connection startup slow start one packet too
		 * quickly.  This is severely frowned upon behavior.
		 */
		if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
			flag |= FLAG_DATA_ACKED;
		} else {
			flag |= FLAG_SYN_ACKED;
			tp->retrans_stamp = 0;
		}

		if (!fully_acked)
			break;

		tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);

		next = skb_rb_next(skb);
		if (unlikely(skb == tp->retransmit_skb_hint))
			tp->retransmit_skb_hint = NULL;
		if (unlikely(skb == tp->lost_skb_hint))
			tp->lost_skb_hint = NULL;
		tcp_highest_sack_replace(sk, skb, next);
		tcp_rtx_queue_unlink_and_free(skb, sk);
	}

	if (!skb)
		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);

	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
		tp->snd_up = tp->snd_una;

	if (skb) {
		tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
			flag |= FLAG_SACK_RENEGING;
	}

	if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
		seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
		ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);

		if (pkts_acked == 1 && fully_acked && !prior_sacked &&
		    (tp->snd_una - prior_snd_una) < tp->mss_cache &&
		    sack->rate->prior_delivered + 1 == tp->delivered &&
		    !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
			/* Conservatively mark a delayed ACK. It's typically
			 * from a lone runt packet over the round trip to
			 * a receiver w/o out-of-order or CE events.
			 */
			flag |= FLAG_ACK_MAYBE_DELAYED;
		}
	}
	if (sack->first_sackt) {
		sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
		ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
	}
	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
					ca_rtt_us, sack->rate);

	if (flag & FLAG_ACKED) {
		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
		if (unlikely(icsk->icsk_mtup.probe_size &&
			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
			tcp_mtup_probe_success(sk);
		}

		if (tcp_is_reno(tp)) {
			tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);

			/* If any of the cumulatively ACKed segments was
			 * retransmitted, non-SACK case cannot confirm that
			 * progress was due to original transmission due to
			 * lack of TCPCB_SACKED_ACKED bits even if some of
			 * the packets may have been never retransmitted.
			 */
			if (flag & FLAG_RETRANS_DATA_ACKED)
				flag &= ~FLAG_ORIG_SACK_ACKED;
		} else {
			int delta;

			/* Non-retransmitted hole got filled? That's reordering */
			if (before(reord, prior_fack))
				tcp_check_sack_reordering(sk, reord, 0);

			delta = prior_sacked - tp->sacked_out;
			tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
		}
	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
		   sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
						    tcp_skb_timestamp_us(skb))) {
		/* Do not re-arm RTO if the sack RTT is measured from data sent
		 * after when the head was last (re)transmitted. Otherwise the
		 * timeout may continue to extend in loss recovery.
		 */
		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
	}

	if (icsk->icsk_ca_ops->pkts_acked) {
		struct ack_sample sample = { .pkts_acked = pkts_acked,
					     .rtt_us = sack->rate->rtt_us };

		sample.in_flight = tp->mss_cache *
			(tp->delivered - sack->rate->prior_delivered);
		icsk->icsk_ca_ops->pkts_acked(sk, &sample);
	}

#if FASTRETRANS_DEBUG > 0
	WARN_ON((int)tp->sacked_out < 0);
	WARN_ON((int)tp->lost_out < 0);
	WARN_ON((int)tp->retrans_out < 0);
	if (!tp->packets_out && tcp_is_sack(tp)) {
		icsk = inet_csk(sk);
		if (tp->lost_out) {
			pr_debug("Leak l=%u %d\n",
				 tp->lost_out, icsk->icsk_ca_state);
			tp->lost_out = 0;
		}
		if (tp->sacked_out) {
			pr_debug("Leak s=%u %d\n",
				 tp->sacked_out, icsk->icsk_ca_state);
			tp->sacked_out = 0;
		}
		if (tp->retrans_out) {
			pr_debug("Leak r=%u %d\n",
				 tp->retrans_out, icsk->icsk_ca_state);
			tp->retrans_out = 0;
		}
	}
#endif
	return flag;
}

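/* An ACK arrived while the zero-window probe timer is in use: stop probing
 * once the peer's window can accept the head of the write queue, otherwise
 * re-arm the probe timer.
 */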
static void tcp_ack_probe(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct sk_buff *head = tcp_send_head(sk);
	const struct tcp_sock *tp = tcp_sk(sk);

	/* Was it a usable window open? */
	if (!head)
		return;
	if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
		icsk->icsk_backoff = 0;
		icsk->icsk_probes_tstamp = 0;
		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
		/* Socket must be woken up by a subsequent tcp_data_snd_check().
		 * This function is not for random use!
		 */
	} else {
		unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);

		when = tcp_clamp_probe0_to_user_timeout(sk, when);
		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
	}
}

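/* An ACK is "dubious" if it is a duplicate, carries a congestion signal,
 * or arrives while the connection is not in the Open state; tcp_ack()
 * runs tcp_fastretrans_alert() for such ACKs.
 */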
static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
	return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
		inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
}

/* Decide whether to run the increase function of congestion control. */
static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
	/* If reordering is high then always grow cwnd whenever data is
	 * delivered regardless of its ordering. Otherwise stay conservative
	 * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
	 * new SACK or ECE mark may first advance cwnd here and later reduce
	 * cwnd in tcp_fastretrans_alert() based on more states.
	 */
	if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
		return flag & FLAG_FORWARD_PROGRESS;

	return flag & FLAG_DATA_ACKED;
}

/* The "ultimate" congestion control function that aims to replace the rigid
 * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
 * It's called toward the end of processing an ACK with precise rate
 * information. All transmission or retransmission are delayed afterwards.
 */
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
			     int flag, const struct rate_sample *rs)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ca_ops->cong_control) {
		icsk->icsk_ca_ops->cong_control(sk, rs);
		return;
	}

	if (tcp_in_cwnd_reduction(sk)) {
		/* Reduce cwnd if state mandates */
		tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag);
	} else if (tcp_may_raise_cwnd(sk, flag)) {
		/* Advance cwnd if state allows */
		tcp_cong_avoid(sk, ack, acked_sacked);
	}
	tcp_update_pacing_rate(sk);
}

/* Check that window update is acceptable.
 * The function assumes that snd_una<=ack<=snd_next.
 */
static inline bool tcp_may_update_window(const struct tcp_sock *tp,
					const u32 ack, const u32 ack_seq,
					const u32 nwin)
{
	return	after(ack, tp->snd_una) ||
		after(ack_seq, tp->snd_wl1) ||
		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}

/* If we update tp->snd_una, also update tp->bytes_acked */
static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
{
	u32 delta = ack - tp->snd_una;

	sock_owned_by_me((struct sock *)tp);
	tp->bytes_acked += delta;
	tp->snd_una = ack;
}

/* If we update tp->rcv_nxt, also update tp->bytes_received */
static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
{
	u32 delta = seq - tp->rcv_nxt;

	sock_owned_by_me((struct sock *)tp);
	tp->bytes_received += delta;
	WRITE_ONCE(tp->rcv_nxt, seq);
}

/* Update our send window.
 *
 * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
 * and in FreeBSD. NetBSD's one is even worse.) is wrong.
 */
static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
				 u32 ack_seq)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int flag = 0;
	u32 nwin = ntohs(tcp_hdr(skb)->window);

	if (likely(!tcp_hdr(skb)->syn))
		nwin <<= tp->rx_opt.snd_wscale;

	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
		flag |= FLAG_WIN_UPDATE;
		tcp_update_wl(tp, ack_seq);

		if (tp->snd_wnd != nwin) {
			tp->snd_wnd = nwin;

			/* Note, it is the only place, where
			 * fast path is recovered for sending TCP.
			 */
			tp->pred_flags = 0;
			tcp_fast_path_check(sk);

			if (!tcp_write_queue_empty(sk))
				tcp_slow_start_after_idle_check(sk);

			if (nwin > tp->max_window) {
				tp->max_window = nwin;
				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
			}
		}
	}

	tcp_snd_una_update(tp, ack);

	return flag;
}

static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
				   u32 *last_oow_ack_time)
{
	if (*last_oow_ack_time) {
		s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);

		if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
			NET_INC_STATS(net, mib_idx);
			return true;	/* rate-limited: don't send yet! */
		}
	}

	*last_oow_ack_time = tcp_jiffies32;

	return false;	/* not rate-limited: go ahead, send dupack now! */
}

/* Return true if we're currently rate-limiting out-of-window ACKs and
 * thus shouldn't send a dupack right now. We rate-limit dupacks in
 * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
 * attacks that send repeated SYNs or ACKs for the same connection. To
 * do this, we do not send a duplicate SYNACK or ACK if the remote
 * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
 */
bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
			  int mib_idx, u32 *last_oow_ack_time)
{
	/* Data packets without SYNs are not likely part of an ACK loop. */
	if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
	    !tcp_hdr(skb)->syn)
		return false;

	return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
}

/* RFC 5961 7 [ACK Throttling] */
static void tcp_send_challenge_ack(struct sock *sk)
{
	/* unprotected vars, we don't care about overwrites */
	static u32 challenge_timestamp;
	static unsigned int challenge_count;
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	u32 count, now;

	/* First check our per-socket dupack rate limit. */
	if (__tcp_oow_rate_limited(net,
				   LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
				   &tp->last_oow_ack_time))
		return;

	/* Then check host-wide RFC 5961 rate limit. */
	now = jiffies / HZ;
	if (now != challenge_timestamp) {
		u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
		u32 half = (ack_limit + 1) >> 1;

		challenge_timestamp = now;
		WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
	}
	count = READ_ONCE(challenge_count);
	if (count > 0) {
		WRITE_ONCE(challenge_count, count - 1);
		NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
		tcp_send_ack(sk);
	}
}

static void tcp_store_ts_recent(struct tcp_sock *tp)
{
	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
	tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
}

static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
{
	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
		/* PAWS bug workaround wrt. ACK frames, the PAWS discard
		 * extra check below makes sure this can only happen
		 * for pure ACK frames.  -DaveM
		 *
		 * Not only that, it also occurs for expired timestamps.
		 */

		if (tcp_paws_check(&tp->rx_opt, 0))
			tcp_store_ts_recent(tp);
	}
}

/* This routine deals with acks during a TLP episode and ends an episode by
 * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
 */
static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (before(ack, tp->tlp_high_seq))
		return;

	if (!tp->tlp_retrans) {
		/* TLP of new data has been acknowledged */
		tp->tlp_high_seq = 0;
	} else if (flag & FLAG_DSACK_TLP) {
		/* This DSACK means original and TLP probe arrived; no loss */
		tp->tlp_high_seq = 0;
	} else if (after(ack, tp->tlp_high_seq)) {
		/* ACK advances: there was a loss, so reduce cwnd. Reset
		 * tlp_high_seq in tcp_init_cwnd_reduction()
		 */
		tcp_init_cwnd_reduction(sk);
		tcp_set_ca_state(sk, TCP_CA_CWR);
		tcp_end_cwnd_reduction(sk);
		tcp_try_keep_open(sk);
		NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPLOSSPROBERECOVERY);
	} else if (!(flag & (FLAG_SND_UNA_ADVANCED |
			     FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
		/* Pure dupack: original and TLP probe arrived; no loss */
		tp->tlp_high_seq = 0;
	}
}

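/* Pass ACK-processing events to the congestion control module, if it
 * implements the in_ack_event() hook.
 */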
static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ca_ops->in_ack_event)
		icsk->icsk_ca_ops->in_ack_event(sk, flags);
}

/* Congestion control has updated the cwnd already. So if we're in
 * loss recovery then now we do any new sends (for FRTO) or
 * retransmits (for CA_Loss or CA_recovery) that make sense.
 */
static void tcp_xmit_recovery(struct sock *sk, int rexmit)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
		return;

	if (unlikely(rexmit == REXMIT_NEW)) {
		__tcp_push_pending_frames(sk, tcp_current_mss(sk),
					  TCP_NAGLE_OFF);
		if (after(tp->snd_nxt, tp->high_seq))
			return;
		tp->frto = 0;
	}
	tcp_xmit_retransmit_queue(sk);
}

/* Returns the number of packets newly acked or sacked by the current ACK */
static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
{
	const struct net *net = sock_net(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	u32 delivered;

	delivered = tp->delivered - prior_delivered;
	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
	if (flag & FLAG_ECE)
		NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);

	return delivered;
}

/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_sacktag_state sack_state;
	struct rate_sample rs = { .prior_delivered = 0 };
	u32 prior_snd_una = tp->snd_una;
	bool is_sack_reneg = tp->is_sack_reneg;
	u32 ack_seq = TCP_SKB_CB(skb)->seq;
	u32 ack = TCP_SKB_CB(skb)->ack_seq;
	int num_dupack = 0;
	int prior_packets = tp->packets_out;
	u32 delivered = tp->delivered;
	u32 lost = tp->lost;
	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
	u32 prior_fack;

	sack_state.first_sackt = 0;
	sack_state.rate = &rs;
	sack_state.sack_delivered = 0;

	/* We very likely will need to access rtx queue. */
	prefetch(sk->tcp_rtx_queue.rb_node);

	/* If the ack is older than previous acks
	 * then we can probably ignore it.
	 */
	if (before(ack, prior_snd_una)) {
		/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
		if (before(ack, prior_snd_una - tp->max_window)) {
			if (!(flag & FLAG_NO_CHALLENGE_ACK))
				tcp_send_challenge_ack(sk);
			return -1;
		}
		goto old_ack;
	}

	/* If the ack includes data we haven't sent yet, discard
	 * this segment (RFC793 Section 3.9).
	 */
	if (after(ack, tp->snd_nxt))
		return -1;

	if (after(ack, prior_snd_una)) {
		flag |= FLAG_SND_UNA_ADVANCED;
		icsk->icsk_retransmits = 0;

#if IS_ENABLED(CONFIG_TLS_DEVICE)
		if (static_branch_unlikely(&clean_acked_data_enabled.key))
			if (icsk->icsk_clean_acked)
				icsk->icsk_clean_acked(sk, ack);
#endif
	}

	prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
	rs.prior_in_flight = tcp_packets_in_flight(tp);

	/* ts_recent update must be made after we are sure that the packet
	 * is in window.
	 */
	if (flag & FLAG_UPDATE_TS_RECENT)
		tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);

	if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
	    FLAG_SND_UNA_ADVANCED) {
		/* Window is constant, pure forward advance.
		 * No more checks are required.
		 * Note, we use the fact that SND.UNA>=SND.WL2.
		 */
		tcp_update_wl(tp, ack_seq);
		tcp_snd_una_update(tp, ack);
		flag |= FLAG_WIN_UPDATE;

		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);

		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
	} else {
		u32 ack_ev_flags = CA_ACK_SLOWPATH;

		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
			flag |= FLAG_DATA;
		else
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);

		flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);

		if (TCP_SKB_CB(skb)->sacked)
			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
							&sack_state);

		if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
			flag |= FLAG_ECE;
			ack_ev_flags |= CA_ACK_ECE;
		}

		if (sack_state.sack_delivered)
			tcp_count_delivered(tp, sack_state.sack_delivered,
					    flag & FLAG_ECE);

		if (flag & FLAG_WIN_UPDATE)
			ack_ev_flags |= CA_ACK_WIN_UPDATE;

		tcp_in_ack_event(sk, ack_ev_flags);
	}

	/* This is a deviation from RFC3168 since it states that:
	 * "When the TCP data sender is ready to set the CWR bit after reducing
	 * the congestion window, it SHOULD set the CWR bit only on the first
	 * new data packet that it transmits."
	 * We accept CWR on pure ACKs to be more robust
	 * with widely-deployed TCP implementations that do this.
	 */
	tcp_ecn_accept_cwr(sk, skb);

	/* We passed data and got it acked, remove any soft error
	 * log. Something worked...
	 */
	sk->sk_err_soft = 0;
	icsk->icsk_probes_out = 0;
	tp->rcv_tstamp = tcp_jiffies32;
	if (!prior_packets)
		goto no_queue;

	/* See if we can take anything off of the retransmit queue. */
	flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una,
				    &sack_state, flag & FLAG_ECE);

	tcp_rack_update_reo_wnd(sk, &rs);

	if (tp->tlp_high_seq)
		tcp_process_tlp_ack(sk, ack, flag);

	if (tcp_ack_is_dubious(sk, flag)) {
		if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) {
			num_dupack = 1;
			/* Consider if pure acks were aggregated in tcp_add_backlog() */
			if (!(flag & FLAG_DATA))
				num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
		}
		tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
				      &rexmit);
	}

	/* If needed, reset the TLP/RTO timer when RACK did not set it. */
	if (flag & FLAG_SET_XMIT_TIMER)
		tcp_set_xmit_timer(sk);

	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
		sk_dst_confirm(sk);

	delivered = tcp_newly_delivered(sk, delivered, flag);
	lost = tp->lost - lost;			/* freshly marked lost */
	rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
	tcp_xmit_recovery(sk, rexmit);
	return 1;

no_queue:
	/* If data was DSACKed, see if we can undo a cwnd reduction. */
	if (flag & FLAG_DSACKING_ACK) {
		tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
				      &rexmit);
		tcp_newly_delivered(sk, delivered, flag);
	}
	/* If this ack opens up a zero window, clear backoff.  It was
	 * being used to time the probes, and is probably far higher than
	 * it needs to be for normal retransmission.
	 */
	tcp_ack_probe(sk);

	if (tp->tlp_high_seq)
		tcp_process_tlp_ack(sk, ack, flag);
	return 1;

old_ack:
	/* If data was SACKed, tag it and see if we should send more data.
	 * If data was DSACKed, see if we can undo a cwnd reduction.
	 */
	if (TCP_SKB_CB(skb)->sacked) {
		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
						&sack_state);
		tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
				      &rexmit);
		tcp_newly_delivered(sk, delivered, flag);
		tcp_xmit_recovery(sk, rexmit);
	}

	return 0;
}

static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
				      bool syn, struct tcp_fastopen_cookie *foc,
				      bool exp_opt)
{
	/* Valid only in SYN or SYN-ACK with an even length.  */
	if (!foc || !syn || len < 0 || (len & 1))
		return;

	if (len >= TCP_FASTOPEN_COOKIE_MIN &&
	    len <= TCP_FASTOPEN_COOKIE_MAX)
		memcpy(foc->val, cookie, len);
	else if (len != 0)
		len = -1;
	foc->len = len;
	foc->exp = exp_opt;
}

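/* Recognise the SMC experimental option carried in a SYN, when SMC support
 * is enabled.
 */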
static bool smc_parse_options(const struct tcphdr *th,
			      struct tcp_options_received *opt_rx,
			      const unsigned char *ptr,
			      int opsize)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc)) {
		if (th->syn && !(opsize & 1) &&
		    opsize >= TCPOLEN_EXP_SMC_BASE &&
		    get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
			opt_rx->smc_ok = 1;
			return true;
		}
	}
#endif
	return false;
}

/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
 * value on success.
 */
static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
{
	const unsigned char *ptr = (const unsigned char *)(th + 1);
	int length = (th->doff * 4) - sizeof(struct tcphdr);
	u16 mss = 0;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return mss;
		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
			length--;
			continue;
		default:
			if (length < 2)
				return mss;
			opsize = *ptr++;
			if (opsize < 2) /* "silly options" */
				return mss;
			if (opsize > length)
				return mss;	/* fail on partial options */
			if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
				u16 in_mss = get_unaligned_be16(ptr);

				if (in_mss) {
					if (user_mss && user_mss < in_mss)
						in_mss = user_mss;
					mss = in_mss;
				}
			}
			ptr += opsize - 2;
			length -= opsize;
		}
	}
	return mss;
}

/* Look for tcp options. Normally only called on SYN and SYNACK packets.
 * But, this can also be called on packets in the established flow when
 * the fast version below fails.
 */
void tcp_parse_options(const struct net *net,
		       const struct sk_buff *skb,
		       struct tcp_options_received *opt_rx, int estab,
		       struct tcp_fastopen_cookie *foc)
{
	const unsigned char *ptr;
	const struct tcphdr *th = tcp_hdr(skb);
	int length = (th->doff * 4) - sizeof(struct tcphdr);

	ptr = (const unsigned char *)(th + 1);
	opt_rx->saw_tstamp = 0;
	opt_rx->saw_unknown = 0;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return;
		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
			length--;
			continue;
		default:
			if (length < 2)
				return;
			opsize = *ptr++;
			if (opsize < 2) /* "silly options" */
				return;
			if (opsize > length)
				return;	/* don't parse partial options */
			switch (opcode) {
			case TCPOPT_MSS:
				if (opsize == TCPOLEN_MSS && th->syn && !estab) {
					u16 in_mss = get_unaligned_be16(ptr);
					if (in_mss) {
						if (opt_rx->user_mss &&
						    opt_rx->user_mss < in_mss)
							in_mss = opt_rx->user_mss;
						opt_rx->mss_clamp = in_mss;
					}
				}
				break;
			case TCPOPT_WINDOW:
				if (opsize == TCPOLEN_WINDOW && th->syn &&
				    !estab && net->ipv4.sysctl_tcp_window_scaling) {
					__u8 snd_wscale = *(__u8 *)ptr;
					opt_rx->wscale_ok = 1;
					if (snd_wscale > TCP_MAX_WSCALE) {
						net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
								     __func__,
								     snd_wscale,
								     TCP_MAX_WSCALE);
						snd_wscale = TCP_MAX_WSCALE;
					}
					opt_rx->snd_wscale = snd_wscale;
				}
				break;
			case TCPOPT_TIMESTAMP:
				if ((opsize == TCPOLEN_TIMESTAMP) &&
				    ((estab && opt_rx->tstamp_ok) ||
				     (!estab && net->ipv4.sysctl_tcp_timestamps))) {
					opt_rx->saw_tstamp = 1;
					opt_rx->rcv_tsval = get_unaligned_be32(ptr);
					opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
				}
				break;
			case TCPOPT_SACK_PERM:
				if (opsize == TCPOLEN_SACK_PERM && th->syn &&
				    !estab && net->ipv4.sysctl_tcp_sack) {
					opt_rx->sack_ok = TCP_SACK_SEEN;
					tcp_sack_reset(opt_rx);
				}
				break;

			case TCPOPT_SACK:
				if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
				   !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
				   opt_rx->sack_ok) {
					TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
				}
				break;
#ifdef CONFIG_TCP_MD5SIG
			case TCPOPT_MD5SIG:
				/*
				 * The MD5 Hash has already been
				 * checked (see tcp_v{4,6}_do_rcv()).
				 */
				break;
#endif
			case TCPOPT_FASTOPEN:
				tcp_parse_fastopen_option(
					opsize - TCPOLEN_FASTOPEN_BASE,
					ptr, th->syn, foc, false);
				break;

			case TCPOPT_EXP:
				/* Fast Open option shares code 254 using a
				 * 16 bits magic number.
				 */
				if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
				    get_unaligned_be16(ptr) ==
				    TCPOPT_FASTOPEN_MAGIC) {
					tcp_parse_fastopen_option(opsize -
						TCPOLEN_EXP_FASTOPEN_BASE,
						ptr + 2, th->syn, foc, true);
					break;
				}

				if (smc_parse_options(th, opt_rx, ptr, opsize))
					break;

				opt_rx->saw_unknown = 1;
				break;

			default:
				opt_rx->saw_unknown = 1;
			}
			ptr += opsize-2;
			length -= opsize;
		}
	}
}
EXPORT_SYMBOL(tcp_parse_options);

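/* Fast path helper: recognise the common NOP-NOP-TIMESTAMP option layout
 * without walking the full option list.
 */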
static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
{
	const __be32 *ptr = (const __be32 *)(th + 1);

	if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
			  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
		tp->rx_opt.saw_tstamp = 1;
		++ptr;
		tp->rx_opt.rcv_tsval = ntohl(*ptr);
		++ptr;
		if (*ptr)
			tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
		else
			tp->rx_opt.rcv_tsecr = 0;
		return true;
	}
	return false;
}

/* Fast parse options. This hopes to only see timestamps.
 * If it is wrong it falls back on tcp_parse_options().
 */
static bool tcp_fast_parse_options(const struct net *net,
				   const struct sk_buff *skb,
				   const struct tcphdr *th, struct tcp_sock *tp)
{
	/* In the spirit of fast parsing, compare doff directly to constant
	 * values.  Because equality is used, short doff can be ignored here.
	 */
	if (th->doff == (sizeof(*th) / 4)) {
		tp->rx_opt.saw_tstamp = 0;
		return false;
	} else if (tp->rx_opt.tstamp_ok &&
		   th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
		if (tcp_parse_aligned_timestamp(tp, th))
			return true;
	}

	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
		tp->rx_opt.rcv_tsecr -= tp->tsoffset;

	return true;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * Parse MD5 Signature option
 */
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
{
	int length = (th->doff << 2) - sizeof(*th);
	const u8 *ptr = (const u8 *)(th + 1);

	/* If not enough data remaining, we can short cut */
	while (length >= TCPOLEN_MD5SIG) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return NULL;
		case TCPOPT_NOP:
			length--;
			continue;
		default:
			opsize = *ptr++;
			if (opsize < 2 || opsize > length)
				return NULL;
			if (opcode == TCPOPT_MD5SIG)
				return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
		}
		ptr += opsize - 2;
		length -= opsize;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif

/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
 *
 * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
 * it can pass through stack. So, the following predicate verifies that
 * this segment is not used for anything but congestion avoidance or
 * fast retransmit. Moreover, we even are able to eliminate most of such
 * second order effects, if we apply some small "replay" window (~RTO)
 * to timestamp space.
 *
 * All these measures still do not guarantee that we reject wrapped ACKs
 * on networks with high bandwidth, when sequence space is recycled fastly,
 * but it guarantees that such events will be very rare and do not affect
 * connection seriously. This doesn't look nice, but alas, PAWS is really
 * buggy extension.
 *
 * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
 * states that events when retransmit arrives after original data are rare.
 * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
 * the biggest problem on large power networks even with minor reordering.
 * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
 * up to bandwidth of 18Gigabit/sec. 8) ]
 */

static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct tcphdr *th = tcp_hdr(skb);
	u32 seq = TCP_SKB_CB(skb)->seq;
	u32 ack = TCP_SKB_CB(skb)->ack_seq;

	return (/* 1. Pure ACK with correct sequence number. */
		(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&

		/* 2. ... and duplicate ACK. */
		ack == tp->snd_una &&

		/* 3. ... and does not update window. */
		!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&

		/* 4. ... and sits in replay window. */
		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
}

static inline bool tcp_paws_discard(const struct sock *sk,
				   const struct sk_buff *skb)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
	       !tcp_disordered_ack(sk, skb);
}

/* Check segment sequence number for validity.
 *
 * Segment controls are considered valid, if the segment
 * fits to the window after truncation to the window. Acceptability
 * of data (and SYN, FIN, of course) is checked separately.
 * See tcp_data_queue(), for example.
 *
 * Also, controls (RST is main one) are accepted using RCV.WUP instead
 * of RCV.NXT. Peer still did not advance his SND.UNA when we
 * delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
 * (borrowed from freebsd)
 */

static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
{
	return	!before(end_seq, tp->rcv_wup) &&
		!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
}

/* When we get a reset we do this. */
void tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	trace_tcp_receive_reset(sk);

	/* mptcp can't tell us to ignore reset pkts,
	 * so just ignore the return value of mptcp_incoming_options().
	 */
	if (sk_is_mptcp(sk))
		mptcp_incoming_options(sk, skb);

	/* We want the right error as BSD sees it (and indeed as we do). */
	switch (sk->sk_state) {
	case TCP_SYN_SENT:
		sk->sk_err = ECONNREFUSED;
		break;
	case TCP_CLOSE_WAIT:
		sk->sk_err = EPIPE;
		break;
	case TCP_CLOSE:
		return;
	default:
		sk->sk_err = ECONNRESET;
	}
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();

	tcp_write_queue_purge(sk);
	tcp_done(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		sk_error_report(sk);
}

/*
 * 	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */
void tcp_fin(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	inet_csk_schedule_ack(sk);

	sk->sk_shutdown |= RCV_SHUTDOWN;
	sock_set_flag(sk, SOCK_DONE);

	switch (sk->sk_state) {
	case TCP_SYN_RECV:
	case TCP_ESTABLISHED:
		/* Move to CLOSE_WAIT */
		tcp_set_state(sk, TCP_CLOSE_WAIT);
		inet_csk_enter_pingpong_mode(sk);
		break;

	case TCP_CLOSE_WAIT:
	case TCP_CLOSING:
		/* Received a retransmission of the FIN, do
		 * nothing.
		 */
		break;
	case TCP_LAST_ACK:
		/* RFC793: Remain in the LAST-ACK state. */
		break;

	case TCP_FIN_WAIT1:
		/* This case occurs when a simultaneous close
		 * happens, we must ack the received FIN and
		 * enter the CLOSING state.
		 */
		tcp_send_ack(sk);
		tcp_set_state(sk, TCP_CLOSING);
		break;
	case TCP_FIN_WAIT2:
		/* Received a FIN -- send ACK and enter TIME_WAIT. */
		tcp_send_ack(sk);
		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
		break;
	default:
		/* Only TCP_LISTEN and TCP_CLOSE are left, in these
		 * cases we should never reach this piece of code.
		 */
		pr_err("%s: Impossible, sk->sk_state=%d\n",
		       __func__, sk->sk_state);
		break;
	}

	/* It _is_ possible, that we have something out-of-order _after_ FIN.
	 * Probably, we should reset in this case. For now drop them.
	 */
	skb_rbtree_purge(&tp->out_of_order_queue);
	if (tcp_is_sack(tp))
		tcp_sack_reset(&tp->rx_opt);
	sk_mem_reclaim(sk);

	if (!sock_flag(sk, SOCK_DEAD)) {
		sk->sk_state_change(sk);

		/* Do not send POLL_HUP for half duplex close. */
		if (sk->sk_shutdown == SHUTDOWN_MASK ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
		else
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	}
}

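/* If [seq, end_seq] overlaps or abuts the given SACK block, widen the block
 * to cover it and return true; otherwise return false.
 */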
static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
				  u32 end_seq)
{
	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
		if (before(seq, sp->start_seq))
			sp->start_seq = seq;
		if (after(end_seq, sp->end_seq))
			sp->end_seq = end_seq;
		return true;
	}
	return false;
}

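/* Record a D-SACK block covering the duplicate segment [seq, end_seq), when
 * SACK and D-SACK are enabled.
 */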
static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
		int mib_idx;

		if (before(seq, tp->rcv_nxt))
			mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
		else
			mib_idx = LINUX_MIB_TCPDSACKOFOSENT;

		NET_INC_STATS(sock_net(sk), mib_idx);

		tp->rx_opt.dsack = 1;
		tp->duplicate_sack[0].start_seq = seq;
		tp->duplicate_sack[0].end_seq = end_seq;
	}
}

static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->rx_opt.dsack)
		tcp_dsack_set(sk, seq, end_seq);
	else
		tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}

static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
{
	/* When the ACK path fails or drops most ACKs, the sender would
	 * timeout and spuriously retransmit the same segment repeatedly.
	 * The receiver remembers and reflects via DSACKs. Leverage the
	 * DSACK state and change the txhash to re-route speculatively.
	 */
	if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
	    sk_rethink_txhash(sk))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
}

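/* Send an immediate duplicate ACK; if the segment duplicates data we already
 * received and D-SACK is enabled, record a D-SACK block for it first.
 */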
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);

		if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
			u32 end_seq = TCP_SKB_CB(skb)->end_seq;

			tcp_rcv_spurious_retrans(sk, skb);
			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
				end_seq = tp->rcv_nxt;
			tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
		}
	}

	tcp_send_ack(sk);
}

/* These routines update the SACK block as out-of-order packets arrive or
 * in-order packets close up the sequence space.
 */
static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
{
	int this_sack;
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	struct tcp_sack_block *swalk = sp + 1;

	/* See if the recent change to the first SACK eats into
	 * or hits the sequence space of other SACK blocks, if so coalesce.
	 */
	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
		if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
			int i;

			/* Zap SWALK, by moving every further SACK up by one slot.
			 * Decrease num_sacks.
			 */
			tp->rx_opt.num_sacks--;
			for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
				sp[i] = sp[i + 1];
			continue;
		}
		this_sack++;
		swalk++;
	}
}

static void tcp_sack_compress_send_ack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->compressed_ack)
		return;

	if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
		__sock_put(sk);

	/* Since we have to send one ack finally,
	 * subtract one from tp->compressed_ack to keep
	 * LINUX_MIB_TCPACKCOMPRESSED accurate.
	 */
	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
		      tp->compressed_ack - 1);

	tp->compressed_ack = 0;
	tcp_send_ack(sk);
}

/* Reasonable amount of sack blocks included in TCP SACK option
 * The max is 4, but this becomes 3 if TCP timestamps are there.
 * Given that SACK packets might be lost, be conservative and use 2.
 */
#define TCP_SACK_BLOCKS_EXPECTED 2

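/* Fold the newly received out-of-order range [seq, end_seq) into the SACK
 * blocks we advertise: extend an adjacent block when possible, otherwise
 * install a new block at the head of selective_acks[].
 */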
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	int cur_sacks = tp->rx_opt.num_sacks;
	int this_sack;

	if (!cur_sacks)
		goto new_sack;

	for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
		if (tcp_sack_extend(sp, seq, end_seq)) {
			if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
				tcp_sack_compress_send_ack(sk);

			/* Rotate this_sack to the first one. */
			for (; this_sack > 0; this_sack--, sp--)
				swap(*sp, *(sp - 1));
			if (cur_sacks > 1)
				tcp_sack_maybe_coalesce(tp);
			return;
		}
	}

	if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
		tcp_sack_compress_send_ack(sk);

	/* Could not find an adjacent existing SACK, build a new one,
	 * put it at the front, and shift everyone else down.  We
	 * always know there is at least one SACK present already here.
	 *
	 * If the sack array is full, forget about the last one.
	 */
	if (this_sack >= TCP_NUM_SACKS) {
		this_sack--;
		tp->rx_opt.num_sacks--;
		sp--;
	}
	for (; this_sack > 0; this_sack--, sp--)
		*sp = *(sp - 1);

new_sack:
	/* Build the new head SACK, and we're done. */
	sp->start_seq = seq;
	sp->end_seq = end_seq;
	tp->rx_opt.num_sacks++;
}

/* RCV.NXT advances, some SACKs should be eaten. */

static void tcp_sack_remove(struct tcp_sock *tp)
{
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	int num_sacks = tp->rx_opt.num_sacks;
	int this_sack;

	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
		tp->rx_opt.num_sacks = 0;
		return;
	}

	for (this_sack = 0; this_sack < num_sacks;) {
		/* Check if the start of the sack is covered by RCV.NXT. */
		if (!before(tp->rcv_nxt, sp->start_seq)) {
			int i;

			/* RCV.NXT must cover all the block! */
			WARN_ON(before(tp->rcv_nxt, sp->end_seq));

			/* Zap this SACK, by moving forward any other SACKS. */
			for (i = this_sack+1; i < num_sacks; i++)
				tp->selective_acks[i-1] = tp->selective_acks[i];
			num_sacks--;
			continue;
		}
		this_sack++;
		sp++;
	}
	tp->rx_opt.num_sacks = num_sacks;
}

/**
 * tcp_try_coalesce - try to merge skb to prior one
 * @sk: socket
 * @to: prior buffer
 * @from: buffer to add in queue
 * @fragstolen: pointer to boolean
 *
 * Before queueing skb @from after @to, try to merge them
 * to reduce overall memory use and queue lengths, if cost is small.
 * Packets in ofo or receive queues can stay a long time.
 * Better try to coalesce them right now to avoid future collapses.
 * Returns true if caller should free @from instead of queueing it
 */
static bool tcp_try_coalesce(struct sock *sk,
			     struct sk_buff *to,
			     struct sk_buff *from,
			     bool *fragstolen)
{
	int delta;

	*fragstolen = false;

	/* It's possible this segment overlaps with prior segment in queue */
	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
		return false;

	if (!mptcp_skb_can_collapse(to, from))
		return false;

#ifdef CONFIG_TLS_DEVICE
	if (from->decrypted != to->decrypted)
		return false;
#endif

	if (!skb_try_coalesce(to, from, fragstolen, &delta))
		return false;

	atomic_add(delta, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, delta);
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
	TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;

	if (TCP_SKB_CB(from)->has_rxtstamp) {
		TCP_SKB_CB(to)->has_rxtstamp = true;
		to->tstamp = from->tstamp;
		skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp;
	}

	return true;
}

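/* Out-of-order queue variant of tcp_try_coalesce(): on success, also
 * accumulate gso_segs so a later tcp_drop() accounts for every coalesced
 * segment.
 */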
static bool tcp_ooo_try_coalesce(struct sock *sk,
			     struct sk_buff *to,
			     struct sk_buff *from,
			     bool *fragstolen)
{
	bool res = tcp_try_coalesce(sk, to, from, fragstolen);

	/* In case tcp_drop() is called later, update to->gso_segs */
	if (res) {
		u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
			       max_t(u16, 1, skb_shinfo(from)->gso_segs);

		skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
	}
	return res;
}

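/* Charge a dropped skb to the socket's drop counter and free it with the
 * given drop reason; tcp_drop() uses SKB_DROP_REASON_NOT_SPECIFIED.
 */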
static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb,
			    enum skb_drop_reason reason)
{
	sk_drops_add(sk, skb);
	kfree_skb_reason(skb, reason);
}

static void tcp_drop(struct sock *sk, struct sk_buff *skb)
{
	tcp_drop_reason(sk, skb, SKB_DROP_REASON_NOT_SPECIFIED);
}

/* This one checks to see if we can put data from the
 * out_of_order queue into the receive_queue.
 */
static void tcp_ofo_queue(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	__u32 dsack_high = tp->rcv_nxt;
	bool fin, fragstolen, eaten;
	struct sk_buff *skb, *tail;
	struct rb_node *p;

	p = rb_first(&tp->out_of_order_queue);
	while (p) {
		skb = rb_to_skb(p);
		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
			break;

		if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
			__u32 dsack = dsack_high;
			if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
				dsack_high = TCP_SKB_CB(skb)->end_seq;
			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
		}
		p = rb_next(p);
		rb_erase(&skb->rbnode, &tp->out_of_order_queue);

		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
			tcp_drop(sk, skb);
			continue;
		}

		tail = skb_peek_tail(&sk->sk_receive_queue);
		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
		tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
		if (!eaten)
			__skb_queue_tail(&sk->sk_receive_queue, skb);
		else
			kfree_skb_partial(skb, fragstolen);

		if (unlikely(fin)) {
			tcp_fin(sk);
			/* tcp_fin() purges tp->out_of_order_queue,
			 * so we must end this loop right now.
			 */
			break;
		}
	}
}

static bool tcp_prune_ofo_queue(struct sock *sk);
static int tcp_prune_queue(struct sock *sk);

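/* Try to make room for @skb: if receive memory is over budget, prune the
 * queues until the charge succeeds, or return -1 to tell the caller to
 * drop the segment.
 */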
static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
				 unsigned int size)
{
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
	    !sk_rmem_schedule(sk, skb, size)) {

		if (tcp_prune_queue(sk) < 0)
			return -1;

		while (!sk_rmem_schedule(sk, skb, size)) {
			if (!tcp_prune_ofo_queue(sk))
				return -1;
		}
	}
	return 0;
}

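/* Queue an out-of-order segment into the RB-tree ordered by sequence number,
 * coalescing or dropping overlapping segments, and update the SACK blocks
 * advertised back to the sender.
 */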
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct rb_node **p, *parent;
	struct sk_buff *skb1;
	u32 seq, end_seq;
	bool fragstolen;

	tcp_ecn_check_ce(sk, skb);

	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
		sk->sk_data_ready(sk);
		tcp_drop(sk, skb);
		return;
	}

	/* Disable header prediction. */
	tp->pred_flags = 0;
	inet_csk_schedule_ack(sk);

	tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
	seq = TCP_SKB_CB(skb)->seq;
	end_seq = TCP_SKB_CB(skb)->end_seq;

	p = &tp->out_of_order_queue.rb_node;
	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
		/* Initial out of order segment, build 1 SACK. */
		if (tcp_is_sack(tp)) {
			tp->rx_opt.num_sacks = 1;
			tp->selective_acks[0].start_seq = seq;
			tp->selective_acks[0].end_seq = end_seq;
		}
		rb_link_node(&skb->rbnode, NULL, p);
		rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
		tp->ooo_last_skb = skb;
		goto end;
	}

	/* In the typical case, we are adding an skb to the end of the list.
	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
	 */
	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
				 skb, &fragstolen)) {
coalesce_done:
		/* For non sack flows, do not grow window to force DUPACK
		 * and trigger fast retransmit.
		 */
		if (tcp_is_sack(tp))
			tcp_grow_window(sk, skb, true);
		kfree_skb_partial(skb, fragstolen);
		skb = NULL;
		goto add_sack;
	}
	/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
	if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
		parent = &tp->ooo_last_skb->rbnode;
		p = &parent->rb_right;
		goto insert;
	}

	/* Find place to insert this segment. Handle overlaps on the way. */
	parent = NULL;
	while (*p) {
		parent = *p;
		skb1 = rb_to_skb(parent);
		if (before(seq, TCP_SKB_CB(skb1)->seq)) {
			p = &parent->rb_left;
			continue;
		}
		if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				/* All the bits are present. Drop. */
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPOFOMERGE);
				tcp_drop(sk, skb);
				skb = NULL;
				tcp_dsack_set(sk, seq, end_seq);
				goto add_sack;
			}
			if (after(seq, TCP_SKB_CB(skb1)->seq)) {
				/* Partial overlap. */
				tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
			} else {
				/* skb's seq == skb1's seq and skb covers skb1.
				 * Replace skb1 with skb.
				 */
				rb_replace_node(&skb1->rbnode, &skb->rbnode,
						&tp->out_of_order_queue);
				tcp_dsack_extend(sk,
						 TCP_SKB_CB(skb1)->seq,
						 TCP_SKB_CB(skb1)->end_seq);
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPOFOMERGE);
				tcp_drop(sk, skb1);
				goto merge_right;
			}
		} else if (tcp_ooo_try_coalesce(sk, skb1,
						skb, &fragstolen)) {
			goto coalesce_done;
		}
		p = &parent->rb_right;
	}
insert:
	/* Insert segment into RB tree. */
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);

merge_right:
	/* Remove other segments covered by skb. */
	while ((skb1 = skb_rb_next(skb)) != NULL) {
		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
			break;
		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
					 end_seq);
			break;
		}
		rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				 TCP_SKB_CB(skb1)->end_seq);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
		tcp_drop(sk, skb1);
	}
	/* If there is no skb after us, we are the last_skb ! */
	if (!skb1)
		tp->ooo_last_skb = skb;

add_sack:
	if (tcp_is_sack(tp))
		tcp_sack_new_ofo_skb(sk, seq, end_seq);
end:
	if (skb) {
		/* For non sack flows, do not grow window to force DUPACK
		 * and trigger fast retransmit.
		 */
		if (tcp_is_sack(tp))
			tcp_grow_window(sk, skb, false);
		skb_condense(skb);
		skb_set_owner_r(skb, sk);
	}
}

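/* Append an in-sequence skb to the receive queue, coalescing it with the
 * tail skb when possible, and advance rcv_nxt. Returns nonzero if the skb
 * was merged and the caller should free it.
 */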
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
				      bool *fragstolen)
{
	int eaten;
	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);

	eaten = (tail &&
		 tcp_try_coalesce(sk, tail,
				  skb, fragstolen)) ? 1 : 0;
	tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
	if (!eaten) {
		__skb_queue_tail(&sk->sk_receive_queue, skb);
		skb_set_owner_r(skb, sk);
	}
	return eaten;
}

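/* Build an skb from the user data in @msg and feed it directly into the
 * receive queue as if it had arrived from the network (used, e.g., by the
 * TCP repair path).
 */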
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
	struct sk_buff *skb;
	int err = -ENOMEM;
	int data_len = 0;
	bool fragstolen;

	if (size == 0)
		return 0;

	if (size > PAGE_SIZE) {
		int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);

		data_len = npages << PAGE_SHIFT;
		size = data_len + (size & ~PAGE_MASK);
	}
	skb = alloc_skb_with_frags(size - data_len, data_len,
				   PAGE_ALLOC_COSTLY_ORDER,
				   &err, sk->sk_allocation);
	if (!skb)
		goto err;

	skb_put(skb, size - data_len);
	skb->data_len = data_len;
	skb->len = size;

	if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
		goto err_free;
	}

	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
	if (err)
		goto err_free;

	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;

	if (tcp_queue_rcv(sk, skb, &fragstolen)) {
		WARN_ON_ONCE(fragstolen); /* should not happen */
		__kfree_skb(skb);
	}
	return size;

err_free:
	kfree_skb(skb);
err:
	return err;

}

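/* Wake up the reader once enough data is queued to satisfy SO_RCVLOWAT,
 * or unconditionally once the socket is done.
 */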
void tcp_data_ready(struct sock *sk)
{
	if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE))
		sk->sk_data_ready(sk);
}

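/* Receive-path entry point for segment payload: in-sequence data goes to the
 * receive queue, out-of-sequence data to the out_of_order_queue, and old or
 * out-of-window segments are acknowledged and dropped.
 */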
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	enum skb_drop_reason reason;
	bool fragstolen;
	int eaten;

	/* If a subflow has been reset, the packet should not continue
	 * to be processed, drop the packet.
	 */
	if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
		__kfree_skb(skb);
		return;
	}

	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
		__kfree_skb(skb);
		return;
	}
	skb_dst_drop(skb);
	__skb_pull(skb, tcp_hdr(skb)->doff * 4);

	reason = SKB_DROP_REASON_NOT_SPECIFIED;
	tp->rx_opt.dsack = 0;

	/*  Queue data for delivery to the user.
	 *  Packets in sequence go to the receive queue.
	 *  Out of sequence packets to the out_of_order_queue.
	 */
	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
		if (tcp_receive_window(tp) == 0) {
			reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
			goto out_of_window;
		}

		/* Ok. In sequence. In window. */
queue_and_out:
		if (skb_queue_len(&sk->sk_receive_queue) == 0)
			sk_forced_mem_schedule(sk, skb->truesize);
		else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
			reason = SKB_DROP_REASON_PROTO_MEM;
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
			sk->sk_data_ready(sk);
			goto drop;
		}

		eaten = tcp_queue_rcv(sk, skb, &fragstolen);
		if (skb->len)
			tcp_event_data_recv(sk, skb);
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			tcp_fin(sk);

		if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
			tcp_ofo_queue(sk);

			/* RFC5681. 4.2. SHOULD send immediate ACK, when
			 * gap in queue is filled.
			 */
			if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
				inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
		}

		if (tp->rx_opt.num_sacks)
			tcp_sack_remove(tp);

		tcp_fast_path_check(sk);

		if (eaten > 0)
			kfree_skb_partial(skb, fragstolen);
		if (!sock_flag(sk, SOCK_DEAD))
			tcp_data_ready(sk);
		return;
	}

	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
		tcp_rcv_spurious_retrans(sk, skb);
		/* A retransmit, 2nd most common case.  Force an immediate ack. */
		reason = SKB_DROP_REASON_TCP_OLD_DATA;
		NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

out_of_window:
		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
		inet_csk_schedule_ack(sk);
drop:
		tcp_drop_reason(sk, skb, reason);
		return;
	}

	/* Out of window. F.e. zero window probe. */
	if (!before(TCP_SKB_CB(skb)->seq,
		    tp->rcv_nxt + tcp_receive_window(tp))) {
		reason = SKB_DROP_REASON_TCP_OVERWINDOW;
		goto out_of_window;
	}

	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
		/* Partial packet, seq < rcv_next < end_seq */
		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);

		/* If window is closed, drop tail of packet. But after
		 * remembering D-SACK for its head made in previous line.
		 */
		if (!tcp_receive_window(tp)) {
			reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
			goto out_of_window;
		}
		goto queue_and_out;
	}

	tcp_data_queue_ofo(sk, skb);
}

static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
{
	if (list)
		return !skb_queue_is_last(list, skb) ? skb->next : NULL;

	return skb_rb_next(skb);
}

static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
					struct sk_buff_head *list,
					struct rb_root *root)
{
	struct sk_buff *next = tcp_skb_next(skb, list);

	if (list)
		__skb_unlink(skb, list);
	else
		rb_erase(&skb->rbnode, root);

	__kfree_skb(skb);
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);

	return next;
}

/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct sk_buff *skb1;

	while (*p) {
		parent = *p;
		skb1 = rb_to_skb(parent);
		if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, root);
}

L
Linus Torvalds 已提交
5147 5148
/* Collapse contiguous sequence of skbs head..tail with
 * sequence numbers start..end.
5149
 *
5150
 * If tail is NULL, this means until the end of the queue.
5151
 *
L
Linus Torvalds 已提交
5152 5153 5154 5155
 * Segments with FIN/SYN are not collapsed (only because this
 * simplifies code)
 */
static void
5156 5157
tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
	     struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
L
Linus Torvalds 已提交
5158
{
5159 5160
	struct sk_buff *skb = head, *n;
	struct sk_buff_head tmp;
5161
	bool end_of_skbs;
L
Linus Torvalds 已提交
5162

S
Stephen Hemminger 已提交
5163
	/* First, check that queue is collapsible and find
5164 5165
	 * the point where collapsing can be useful.
	 */
5166
restart:
5167 5168 5169
	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
		n = tcp_skb_next(skb, list);

L
Linus Torvalds 已提交
5170 5171
		/* No new bits? It is possible on ofo queue. */
		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
5172
			skb = tcp_collapse_one(sk, skb, list, root);
5173 5174 5175
			if (!skb)
				break;
			goto restart;
L
Linus Torvalds 已提交
5176 5177 5178 5179 5180
		}

		/* The first skb to collapse is:
		 * - not SYN/FIN and
		 * - bloated or contains data before "start" or
5181
		 *   overlaps to the next one and mptcp allow collapsing.
L
Linus Torvalds 已提交
5182
		 */
5183
		if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
5184
		    (tcp_win_from_space(sk, skb->truesize) > skb->len ||
5185 5186
		     before(TCP_SKB_CB(skb)->seq, start))) {
			end_of_skbs = false;
L
Linus Torvalds 已提交
5187
			break;
5188 5189
		}

5190
		if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
5191 5192 5193
		    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
			end_of_skbs = false;
			break;
5194
		}
L
Linus Torvalds 已提交
5195 5196 5197 5198

		/* Decided to skip this, advance start seq. */
		start = TCP_SKB_CB(skb)->end_seq;
	}
5199 5200
	if (end_of_skbs ||
	    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
L
Linus Torvalds 已提交
5201 5202
		return;

5203 5204
	__skb_queue_head_init(&tmp);

L
Linus Torvalds 已提交
5205
	while (before(start, end)) {
5206
		int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
L
Linus Torvalds 已提交
5207 5208
		struct sk_buff *nskb;

5209
		nskb = alloc_skb(copy, GFP_ATOMIC);
L
Linus Torvalds 已提交
5210
		if (!nskb)
5211
			break;
5212

L
Linus Torvalds 已提交
5213
		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
5214 5215 5216
#ifdef CONFIG_TLS_DEVICE
		nskb->decrypted = skb->decrypted;
#endif
L
Linus Torvalds 已提交
5217
		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
5218 5219 5220 5221
		if (list)
			__skb_queue_before(list, skb, nskb);
		else
			__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
5222
		skb_set_owner_r(nskb, sk);
5223
		mptcp_skb_ext_move(nskb, skb);
L
Linus Torvalds 已提交
5224 5225 5226 5227 5228 5229

		/* Copy data, releasing collapsed skbs. */
		while (copy > 0) {
			int offset = start - TCP_SKB_CB(skb)->seq;
			int size = TCP_SKB_CB(skb)->end_seq - start;

5230
			BUG_ON(offset < 0);
L
Linus Torvalds 已提交
5231 5232 5233 5234 5235 5236 5237 5238 5239
			if (size > 0) {
				size = min(copy, size);
				if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
					BUG();
				TCP_SKB_CB(nskb)->end_seq += size;
				copy -= size;
				start += size;
			}
			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
5240
				skb = tcp_collapse_one(sk, skb, list, root);
5241 5242
				if (!skb ||
				    skb == tail ||
5243
				    !mptcp_skb_can_collapse(nskb, skb) ||
5244
				    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
5245
					goto end;
5246 5247 5248 5249
#ifdef CONFIG_TLS_DEVICE
				if (skb->decrypted != nskb->decrypted)
					goto end;
#endif
L
Linus Torvalds 已提交
5250 5251 5252
			}
		}
	}
5253 5254 5255
end:
	skb_queue_walk_safe(&tmp, skb, n)
		tcp_rbtree_insert(root, skb);
L
Linus Torvalds 已提交
5256 5257 5258 5259 5260 5261 5262 5263
}

/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
 * and tcp_collapse() them until all the queue is collapsed.
 */
static void tcp_collapse_ofo_queue(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
5264
	u32 range_truesize, sum_tiny = 0;
5265
	struct sk_buff *skb, *head;
L
Linus Torvalds 已提交
5266 5267
	u32 start, end;

5268
	skb = skb_rb_first(&tp->out_of_order_queue);
5269 5270
new_range:
	if (!skb) {
5271
		tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
L
Linus Torvalds 已提交
5272
		return;
5273
	}
L
Linus Torvalds 已提交
5274 5275
	start = TCP_SKB_CB(skb)->seq;
	end = TCP_SKB_CB(skb)->end_seq;
5276
	range_truesize = skb->truesize;
5277

5278
	for (head = skb;;) {
5279
		skb = skb_rb_next(skb);
L
Linus Torvalds 已提交
5280

5281 5282 5283
		/* Range is terminated when we see a gap or when
		 * we are at the queue end.
		 */
5284
		if (!skb ||
L
Linus Torvalds 已提交
5285 5286
		    after(TCP_SKB_CB(skb)->seq, end) ||
		    before(TCP_SKB_CB(skb)->end_seq, start)) {
5287 5288 5289 5290 5291 5292 5293 5294 5295 5296
			/* Do not attempt collapsing tiny skbs */
			if (range_truesize != head->truesize ||
			    end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
				tcp_collapse(sk, NULL, &tp->out_of_order_queue,
					     head, skb, start, end);
			} else {
				sum_tiny += range_truesize;
				if (sum_tiny > sk->sk_rcvbuf >> 3)
					return;
			}
5297 5298 5299
			goto new_range;
		}

5300
		range_truesize += skb->truesize;
5301
		if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
L
Linus Torvalds 已提交
5302
			start = TCP_SKB_CB(skb)->seq;
5303
		if (after(TCP_SKB_CB(skb)->end_seq, end))
L
Linus Torvalds 已提交
5304 5305 5306 5307
			end = TCP_SKB_CB(skb)->end_seq;
	}
}

/*
 * Clean the out-of-order queue to make room.
 * We drop high-sequence packets to:
 * 1) Give holes a chance to be filled.
 * 2) Not add too much latency if thousands of packets sit there.
 *    (But if application shrinks SO_RCVBUF, we could still end up
 *     freeing whole queue here)
 * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
 *
 * Return true if queue has shrunk.
 */
E
Eric Dumazet 已提交
5319
static bool tcp_prune_ofo_queue(struct sock *sk)
5320 5321
{
	struct tcp_sock *tp = tcp_sk(sk);
5322
	struct rb_node *node, *prev;
5323
	int goal;
5324

5325
	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
5326
		return false;
5327

5328
	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
5329
	goal = sk->sk_rcvbuf >> 3;
5330 5331 5332 5333
	node = &tp->ooo_last_skb->rbnode;
	do {
		prev = rb_prev(node);
		rb_erase(node, &tp->out_of_order_queue);
5334
		goal -= rb_to_skb(node)->truesize;
5335
		tcp_drop(sk, rb_to_skb(node));
5336 5337 5338 5339 5340 5341 5342
		if (!prev || goal <= 0) {
			sk_mem_reclaim(sk);
			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
			    !tcp_under_memory_pressure(sk))
				break;
			goal = sk->sk_rcvbuf >> 3;
		}
5343 5344
		node = prev;
	} while (node);
5345
	tp->ooo_last_skb = rb_to_skb(prev);
5346 5347 5348 5349 5350 5351 5352 5353 5354

	/* Reset SACK state.  A conforming SACK implementation will
	 * do the same at a timeout based retransmit.  When a connection
	 * is in a sad state like this, we care only about integrity
	 * of the connection not performance.
	 */
	if (tp->rx_opt.sack_ok)
		tcp_sack_reset(&tp->rx_opt);
	return true;
5355 5356
}

L
Linus Torvalds 已提交
5357 5358 5359 5360 5361 5362 5363 5364 5365
/* Reduce allocated memory if we can, trying to get
 * the socket within its memory limits again.
 *
 * Return less than zero if we should start dropping frames
 * until the socket owning process reads some of the data
 * to stabilize the situation.
 */
static int tcp_prune_queue(struct sock *sk)
{
5366
	struct tcp_sock *tp = tcp_sk(sk);
L
Linus Torvalds 已提交
5367

5368
	NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
L
Linus Torvalds 已提交
5369 5370

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
5371
		tcp_clamp_window(sk);
5372
	else if (tcp_under_memory_pressure(sk))
5373
		tcp_adjust_rcv_ssthresh(sk);
L
Linus Torvalds 已提交
5374

5375 5376 5377
	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
		return 0;

L
Linus Torvalds 已提交
5378
	tcp_collapse_ofo_queue(sk);
5379
	if (!skb_queue_empty(&sk->sk_receive_queue))
5380
		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
5381 5382 5383
			     skb_peek(&sk->sk_receive_queue),
			     NULL,
			     tp->copied_seq, tp->rcv_nxt);
5384
	sk_mem_reclaim(sk);
L
Linus Torvalds 已提交
5385 5386 5387 5388 5389 5390 5391

	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
		return 0;

	/* Collapsing did not help, destructive actions follow.
	 * This must not ever occur. */

5392
	tcp_prune_ofo_queue(sk);
L
Linus Torvalds 已提交
5393 5394 5395 5396 5397 5398 5399 5400

	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
		return 0;

	/* If we are really being abused, tell the caller to silently
	 * drop receive data on the floor.  It will get retransmitted
	 * and hopefully then we'll have sufficient space.
	 */
5401
	NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
L
Linus Torvalds 已提交
5402 5403

	/* Massive buffer overcommit. */
5404
	tp->pred_flags = 0;
L
Linus Torvalds 已提交
5405 5406 5407
	return -1;
}

5408
static bool tcp_should_expand_sndbuf(struct sock *sk)
5409
{
5410
	const struct tcp_sock *tp = tcp_sk(sk);
5411

5412 5413 5414 5415
	/* If the user specified a specific send buffer setting, do
	 * not modify it.
	 */
	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
E
Eric Dumazet 已提交
5416
		return false;
5417 5418

	/* If we are under global TCP memory pressure, do not expand.  */
5419 5420 5421 5422 5423 5424 5425 5426 5427 5428
	if (tcp_under_memory_pressure(sk)) {
		int unused_mem = sk_unused_reserved_mem(sk);

		/* Adjust sndbuf according to reserved mem. But make sure
		 * it never goes below SOCK_MIN_SNDBUF.
		 * See sk_stream_moderate_sndbuf() for more details.
		 */
		if (unused_mem > SOCK_MIN_SNDBUF)
			WRITE_ONCE(sk->sk_sndbuf, unused_mem);

E
Eric Dumazet 已提交
5429
		return false;
5430
	}
5431 5432

	/* If we are under soft global TCP memory pressure, do not expand.  */
5433
	if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
E
Eric Dumazet 已提交
5434
		return false;
5435 5436

	/* If we filled the congestion window, do not expand.  */
5437
	if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
E
Eric Dumazet 已提交
5438
		return false;
5439

E
Eric Dumazet 已提交
5440
	return true;
5441
}
L
Linus Torvalds 已提交
5442 5443 5444 5445 5446

static void tcp_new_space(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

5447
	if (tcp_should_expand_sndbuf(sk)) {
E
Eric Dumazet 已提交
5448
		tcp_sndbuf_expand(sk);
5449
		tp->snd_cwnd_stamp = tcp_jiffies32;
L
Linus Torvalds 已提交
5450 5451
	}

5452
	INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
L
Linus Torvalds 已提交
5453 5454
}

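/* If a writer is blocked on SOCK_NOSPACE, see whether the send buffer can be
 * expanded and wake up the write path.
 */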
static void tcp_check_space(struct sock *sk)
L
Linus Torvalds 已提交
5456
{
E
Eric Dumazet 已提交
5457 5458 5459 5460 5461 5462 5463
	/* pairs with tcp_poll() */
	smp_mb();
	if (sk->sk_socket &&
	    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
		tcp_new_space(sk);
		if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
			tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
L
Linus Torvalds 已提交
5464 5465 5466
	}
}

5467
static inline void tcp_data_snd_check(struct sock *sk)
L
Linus Torvalds 已提交
5468
{
5469
	tcp_push_pending_frames(sk);
L
Linus Torvalds 已提交
5470 5471 5472 5473 5474 5475 5476 5477 5478
	tcp_check_space(sk);
}

/*
 * Check if sending an ack is needed.
 */
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
	struct tcp_sock *tp = tcp_sk(sk);
E
Eric Dumazet 已提交
5479
	unsigned long rtt, delay;
L
Linus Torvalds 已提交
5480 5481

	    /* More than one full frame received... */
5482
	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
L
Linus Torvalds 已提交
5483
	     /* ... and right edge of window advances far enough.
5484 5485 5486
	      * (tcp_recvmsg() will send ACK otherwise).
	      * If application uses SO_RCVLOWAT, we want send ack now if
	      * we have not received enough bytes to satisfy the condition.
L
Linus Torvalds 已提交
5487
	      */
5488 5489
	    (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
	     __tcp_select_window(sk) >= tp->rcv_wnd)) ||
L
Linus Torvalds 已提交
5490
	    /* We ACK each frame or... */
5491 5492 5493
	    tcp_in_quickack_mode(sk) ||
	    /* Protocol state mandates a one-time immediate ACK */
	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
E
Eric Dumazet 已提交
5494
send_now:
L
Linus Torvalds 已提交
5495
		tcp_send_ack(sk);
E
Eric Dumazet 已提交
5496 5497 5498 5499
		return;
	}

	if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
L
Linus Torvalds 已提交
5500
		tcp_send_delayed_ack(sk);
E
Eric Dumazet 已提交
5501
		return;
L
Linus Torvalds 已提交
5502
	}
E
Eric Dumazet 已提交
5503

E
Eric Dumazet 已提交
5504 5505
	if (!tcp_is_sack(tp) ||
	    tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
E
Eric Dumazet 已提交
5506
		goto send_now;
5507 5508 5509

	if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
		tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
E
Eric Dumazet 已提交
5510
		tp->dup_ack_counter = 0;
5511
	}
E
Eric Dumazet 已提交
5512 5513
	if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
		tp->dup_ack_counter++;
5514
		goto send_now;
E
Eric Dumazet 已提交
5515 5516
	}
	tp->compressed_ack++;
E
Eric Dumazet 已提交
5517 5518 5519
	if (hrtimer_is_queued(&tp->compressed_ack_timer))
		return;

5520
	/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
E
Eric Dumazet 已提交
5521 5522 5523 5524 5525

	rtt = tp->rcv_rtt_est.rtt_us;
	if (tp->srtt_us && tp->srtt_us < rtt)
		rtt = tp->srtt_us;

5526
	delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
E
Eric Dumazet 已提交
5527 5528
		      rtt * (NSEC_PER_USEC >> 3)/20);
	sock_hold(sk);
5529 5530 5531
	hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
			       sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns,
			       HRTIMER_MODE_REL_PINNED_SOFT);
L
Linus Torvalds 已提交
5532 5533
}

S
Stephen Hemminger 已提交
5534
static inline void tcp_ack_snd_check(struct sock *sk)
L
Linus Torvalds 已提交
5535
{
5536
	if (!inet_csk_ack_scheduled(sk)) {
L
Linus Torvalds 已提交
5537 5538 5539 5540 5541 5542 5543 5544
		/* We sent a data segment already. */
		return;
	}
	__tcp_ack_snd_check(sk, 1);
}

/*
 *	This routine is only called when we have urgent data
 *	signaled. It's the 'slow' part of tcp_urg. It could be
L
Linus Torvalds 已提交
5546 5547 5548 5549 5550 5551
 *	moved inline now as tcp_urg is only called from one
 *	place. We handle URGent data wrong. We have to - as
 *	BSD still doesn't use the correction from RFC961.
 *	For 1003.1g we should support a new option TCP_STDURG to permit
 *	either form (or just set the sysctl tcp_stdurg).
 */
5552

5553
static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
L
Linus Torvalds 已提交
5554 5555 5556 5557
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 ptr = ntohs(th->urg_ptr);

5558
	if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
L
Linus Torvalds 已提交
5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589
		ptr--;
	ptr += ntohl(th->seq);

	/* Ignore urgent data that we've already seen and read. */
	if (after(tp->copied_seq, ptr))
		return;

	/* Do not replay urg ptr.
	 *
	 * NOTE: interesting situation not covered by specs.
	 * Misbehaving sender may send urg ptr, pointing to segment,
	 * which we already have in ofo queue. We are not able to fetch
	 * such data and will stay in TCP_URG_NOTYET until will be eaten
	 * by recvmsg(). Seems, we are not obliged to handle such wicked
	 * situations. But it is worth to think about possibility of some
	 * DoSes using some hypothetical application level deadlock.
	 */
	if (before(ptr, tp->rcv_nxt))
		return;

	/* Do we already have a newer (or duplicate) urgent pointer? */
	if (tp->urg_data && !after(ptr, tp->urg_seq))
		return;

	/* Tell the world about our new urgent pointer. */
	sk_send_sigurg(sk);

	/* We may be adding urgent data when the last byte read was
	 * urgent. To do this requires some care. We cannot just ignore
	 * tp->copied_seq since we would read the last urgent byte again
	 * as data, nor can we alter copied_seq until this data arrives
S
Stephen Hemminger 已提交
5590
	 * or we break the semantics of SIOCATMARK (and thus sockatmark())
L
Linus Torvalds 已提交
5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601
	 *
	 * NOTE. Double Dutch. Rendering to plain English: author of comment
	 * above did something sort of 	send("A", MSG_OOB); send("B", MSG_OOB);
	 * and expect that both A and B disappear from stream. This is _wrong_.
	 * Though this happens in BSD with high probability, this is occasional.
	 * Any application relying on this is buggy. Note also, that fix "works"
	 * only in this artificial test. Insert some normal data between A and B and we will
	 * decline of BSD again. Verdict: it is better to remove to trap
	 * buggy users.
	 */
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
5602
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
L
Linus Torvalds 已提交
5603 5604 5605
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
		tp->copied_seq++;
		if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
D
David S. Miller 已提交
5606
			__skb_unlink(skb, &sk->sk_receive_queue);
L
Linus Torvalds 已提交
5607 5608 5609 5610
			__kfree_skb(skb);
		}
	}

5611
	WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET);
5612
	WRITE_ONCE(tp->urg_seq, ptr);
5613 5614 5615

	/* Disable header prediction. */
	tp->pred_flags = 0;
L
Linus Torvalds 已提交
5616 5617 5618
}

/* This is the 'fast' part of urgent handling. */
5619
static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
L
Linus Torvalds 已提交
5620 5621 5622 5623
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Check if we get a new urgent pointer - normally not. */
5624
	if (unlikely(th->urg))
5625
		tcp_check_urg(sk, th);
L
Linus Torvalds 已提交
5626 5627

	/* Do we wait for any urgent data? - normally not... */
5628
	if (unlikely(tp->urg_data == TCP_URG_NOTYET)) {
L
Linus Torvalds 已提交
5629 5630 5631
		u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
			  th->syn;

5632
		/* Is the urgent pointer pointing into this packet? */
L
Linus Torvalds 已提交
5633 5634 5635 5636
		if (ptr < skb->len) {
			u8 tmp;
			if (skb_copy_bits(skb, ptr, &tmp, 1))
				BUG();
5637
			WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp);
L
Linus Torvalds 已提交
5638
			if (!sock_flag(sk, SOCK_DEAD))
5639
				sk->sk_data_ready(sk);
L
Linus Torvalds 已提交
5640 5641 5642 5643
		}
	}
}

5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660
/* Accept RST for rcv_nxt - 1 after a FIN.
 * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
 * FIN is sent followed by a RST packet. The RST is sent with the same
 * sequence number as the FIN, and thus according to RFC 5961 a challenge
 * ACK should be sent. However, Mac OSX rate limits replies to challenge
 * ACKs on the closed socket. In addition middleboxes can drop either the
 * challenge ACK or a subsequent RST.
 */
static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
			(1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
					       TCPF_CLOSING));
}

5661 5662 5663
/* Does PAWS and seqno based validation of an incoming segment, flags will
 * play significant role here.
 */
E
Eric Dumazet 已提交
5664 5665
static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
				  const struct tcphdr *th, int syn_inerr)
5666 5667
{
	struct tcp_sock *tp = tcp_sk(sk);
5668
	bool rst_seq_match = false;
5669 5670

	/* RFC1323: H1. Apply PAWS check first. */
5671 5672
	if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
	    tp->rx_opt.saw_tstamp &&
5673 5674
	    tcp_paws_discard(sk, skb)) {
		if (!th->rst) {
5675
			NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
5676 5677 5678 5679
			if (!tcp_oow_rate_limited(sock_net(sk), skb,
						  LINUX_MIB_TCPACKSKIPPEDPAWS,
						  &tp->last_oow_ack_time))
				tcp_send_dupack(sk, skb);
5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692
			goto discard;
		}
		/* Reset is accepted even if it did not pass PAWS. */
	}

	/* Step 1: check sequence number */
	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
		/* RFC793, page 37: "In all states except SYN-SENT, all reset
		 * (RST) segments are validated by checking their SEQ-fields."
		 * And page 69: "If an incoming segment is not acceptable,
		 * an acknowledgment should be sent in reply (unless the RST
		 * bit is set, if so drop the segment and return)".
		 */
5693 5694 5695
		if (!th->rst) {
			if (th->syn)
				goto syn_challenge;
5696 5697 5698 5699
			if (!tcp_oow_rate_limited(sock_net(sk), skb,
						  LINUX_MIB_TCPACKSKIPPEDSEQ,
						  &tp->last_oow_ack_time))
				tcp_send_dupack(sk, skb);
5700
		} else if (tcp_reset_check(sk, skb)) {
5701
			tcp_reset(sk, skb);
5702
		}
5703 5704 5705 5706 5707
		goto discard;
	}

	/* Step 2: check RST bit */
	if (th->rst) {
5708 5709 5710 5711
		/* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
		 * FIN and SACK too if available):
		 * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
		 * the right-most SACK block,
5712
		 * then
E
Eric Dumazet 已提交
5713 5714 5715 5716
		 *     RESET the connection
		 * else
		 *     Send a challenge ACK
		 */
5717 5718
		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
		    tcp_reset_check(sk, skb)) {
5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736
			rst_seq_match = true;
		} else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
			struct tcp_sack_block *sp = &tp->selective_acks[0];
			int max_sack = sp[0].end_seq;
			int this_sack;

			for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
			     ++this_sack) {
				max_sack = after(sp[this_sack].end_seq,
						 max_sack) ?
					sp[this_sack].end_seq : max_sack;
			}

			if (TCP_SKB_CB(skb)->seq == max_sack)
				rst_seq_match = true;
		}

		if (rst_seq_match)
5737
			tcp_reset(sk, skb);
5738 5739 5740 5741 5742 5743 5744
		else {
			/* Disable TFO if RST is out-of-order
			 * and no data has been received
			 * for current active TFO socket
			 */
			if (tp->syn_fastopen && !tp->data_segs_in &&
			    sk->sk_state == TCP_ESTABLISHED)
5745
				tcp_fastopen_active_disable(sk);
5746
			tcp_send_challenge_ack(sk);
5747
		}
5748 5749 5750 5751 5752
		goto discard;
	}

	/* step 3: check security and precedence [ignored] */

E
Eric Dumazet 已提交
5753
	/* step 4: Check for a SYN
5754
	 * RFC 5961 4.2 : Send a challenge ack
E
Eric Dumazet 已提交
5755 5756
	 */
	if (th->syn) {
5757
syn_challenge:
5758
		if (syn_inerr)
5759 5760
			TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
5761
		tcp_send_challenge_ack(sk);
E
Eric Dumazet 已提交
5762
		goto discard;
5763 5764
	}

5765 5766
	bpf_skops_parse_hdr(sk, skb);

E
Eric Dumazet 已提交
5767
	return true;
5768 5769

discard:
5770
	tcp_drop(sk, skb);
E
Eric Dumazet 已提交
5771
	return false;
5772 5773
}

L
Linus Torvalds 已提交
5774
/*
5775
 *	TCP receive function for the ESTABLISHED state.
5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795
 *
 *	It is split into a fast path and a slow path. The fast path is
 * 	disabled when:
 *	- A zero window was announced from us - zero window probing
 *        is only handled properly in the slow path.
 *	- Out of order segments arrived.
 *	- Urgent data is expected.
 *	- There is no buffer space left
 *	- Unexpected TCP flags/window values/header lengths are received
 *	  (detected by checking the TCP header against pred_flags)
 *	- Data is sent in both directions. Fast path only supports pure senders
 *	  or pure receivers (this means either the sequence number or the ack
 *	  value must stay constant)
 *	- Unexpected TCP option.
 *
 *	When these conditions are not satisfied it drops into a standard
 *	receive procedure patterned after RFC793 to handle all cases.
 *	The first three cases are guaranteed by proper pred_flags setting,
 *	the rest is checked inline. Fast processing is turned on in
 *	tcp_data_queue when everything is OK.
L
Linus Torvalds 已提交
5796
 */
5797
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
5798
{
5799
	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
5800
	const struct tcphdr *th = (const struct tcphdr *)skb->data;
L
Linus Torvalds 已提交
5801
	struct tcp_sock *tp = tcp_sk(sk);
5802
	unsigned int len = skb->len;
L
Linus Torvalds 已提交
5803

5804 5805 5806
	/* TCP congestion window tracking */
	trace_tcp_probe(sk, skb);

5807
	tcp_mstamp_refresh(tp);
5808
	if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
E
Eric Dumazet 已提交
5809
		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823
	/*
	 *	Header prediction.
	 *	The code loosely follows the one in the famous
	 *	"30 instruction TCP receive" Van Jacobson mail.
	 *
	 *	Van's trick is to deposit buffers into socket queue
	 *	on a device interrupt, to call tcp_recv function
	 *	on the receive process context and checksum and copy
	 *	the buffer to user space. smart...
	 *
	 *	Our current scheme is not silly either but we take the
	 *	extra cost of the net_bh soft interrupt processing...
	 *	We do checksum and copy also but from device to kernel.
	 */
L
Linus Torvalds 已提交
5824 5825 5826

	tp->rx_opt.saw_tstamp = 0;

5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880
	/*	pred_flags is 0xS?10 << 16 + snd_wnd
	 *	if header_prediction is to be made
	 *	'S' will always be tp->tcp_header_len >> 2
	 *	'?' will be 0 for the fast path, otherwise pred_flags is 0 to
	 *  turn it off	(when there are holes in the receive
	 *	 space for instance)
	 *	PSH flag is ignored.
	 */

	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
	    TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
	    !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
		int tcp_header_len = tp->tcp_header_len;

		/* Timestamp header prediction: tcp_header_len
		 * is automatically equal to th->doff*4 due to pred_flags
		 * match.
		 */

		/* Check timestamp */
		if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
			/* No? Slow path! */
			if (!tcp_parse_aligned_timestamp(tp, th))
				goto slow_path;

			/* If PAWS failed, check it more carefully in slow path */
			if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
				goto slow_path;

			/* DO NOT update ts_recent here, if checksum fails
			 * and timestamp was corrupted part, it will result
			 * in a hung connection since we will drop all
			 * future packets due to the PAWS test.
			 */
		}

		if (len <= tcp_header_len) {
			/* Bulk data transfer: sender */
			if (len == tcp_header_len) {
				/* Predicted packet is in window by definition.
				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
				 * Hence, check seq<=rcv_wup reduces to:
				 */
				if (tcp_header_len ==
				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				    tp->rcv_nxt == tp->rcv_wup)
					tcp_store_ts_recent(tp);

				/* We know that such packets are checksummed
				 * on entry.
				 */
				tcp_ack(sk, skb, 0);
				__kfree_skb(skb);
				tcp_data_snd_check(sk);
5881 5882 5883 5884 5885
				/* When receiving pure ack in fast path, update
				 * last ts ecr directly instead of calling
				 * tcp_rcv_rtt_measure_ts()
				 */
				tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
5886 5887
				return;
			} else { /* Header too small */
5888
				reason = SKB_DROP_REASON_PKT_TOO_SMALL;
5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915
				TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
				goto discard;
			}
		} else {
			int eaten = 0;
			bool fragstolen = false;

			if (tcp_checksum_complete(skb))
				goto csum_error;

			if ((int)skb->truesize > sk->sk_forward_alloc)
				goto step5;

			/* Predicted packet is in window by definition.
			 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
			 * Hence, check seq<=rcv_wup reduces to:
			 */
			if (tcp_header_len ==
			    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
			    tp->rcv_nxt == tp->rcv_wup)
				tcp_store_ts_recent(tp);

			tcp_rcv_rtt_measure_ts(sk, skb);

			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);

			/* Bulk data transfer: receiver */
5916 5917
			__skb_pull(skb, tcp_header_len);
			eaten = tcp_queue_rcv(sk, skb, &fragstolen);
5918 5919 5920 5921 5922 5923 5924 5925 5926

			tcp_event_data_recv(sk, skb);

			if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
				/* Well, only one small jumplet in fast path... */
				tcp_ack(sk, skb, FLAG_DATA);
				tcp_data_snd_check(sk);
				if (!inet_csk_ack_scheduled(sk))
					goto no_ack;
5927 5928
			} else {
				tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
5929 5930 5931 5932 5933 5934
			}

			__tcp_ack_snd_check(sk, 0);
no_ack:
			if (eaten)
				kfree_skb_partial(skb, fragstolen);
5935
			tcp_data_ready(sk);
5936 5937 5938 5939 5940
			return;
		}
	}

slow_path:
5941
	if (len < (th->doff << 2) || tcp_checksum_complete(skb))
L
Linus Torvalds 已提交
5942 5943
		goto csum_error;

5944 5945
	if (!th->ack && !th->rst && !th->syn) {
		reason = SKB_DROP_REASON_TCP_FLAGS;
5946
		goto discard;
5947
	}
5948

5949 5950 5951 5952
	/*
	 *	Standard slow path.
	 */

E
Eric Dumazet 已提交
5953
	if (!tcp_validate_incoming(sk, skb, th, 1))
5954
		return;
L
Linus Torvalds 已提交
5955

5956 5957
step5:
	if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
5958
		goto discard;
L
Linus Torvalds 已提交
5959

5960
	tcp_rcv_rtt_measure_ts(sk, skb);
L
Linus Torvalds 已提交
5961 5962 5963 5964 5965 5966 5967

	/* Process urgent data. */
	tcp_urg(sk, skb, th);

	/* step 7: process the segment text */
	tcp_data_queue(sk, skb);

5968
	tcp_data_snd_check(sk);
L
Linus Torvalds 已提交
5969
	tcp_ack_snd_check(sk);
5970
	return;
L
Linus Torvalds 已提交
5971 5972

csum_error:
5973
	reason = SKB_DROP_REASON_TCP_CSUM;
5974
	trace_tcp_bad_csum(skb);
5975 5976
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
L
Linus Torvalds 已提交
5977 5978

discard:
5979
	tcp_drop_reason(sk, skb, reason);
L
Linus Torvalds 已提交
5980
}
EXPORT_SYMBOL(tcp_rcv_established);

void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_mtup_init(sk);
	icsk->icsk_af_ops->rebuild_header(sk);
	tcp_init_metrics(sk);

	/* Initialize the congestion window to start the transfer.
	 * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
	 * retransmitted. In light of RFC6298 more aggressive 1sec
	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
	 * retransmission has occurred.
	 */
	if (tp->total_retrans > 1 && tp->undo_marker)
		tp->snd_cwnd = 1;
	else
		tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
	tp->snd_cwnd_stamp = tcp_jiffies32;

6004
	bpf_skops_established(sk, bpf_op, skb);
6005
	/* Initialize congestion control unless BPF initialized it already: */
6006 6007
	if (!icsk->icsk_ca_initialized)
		tcp_init_congestion_control(sk);
6008 6009 6010
	tcp_init_buffer_space(sk);
}

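/* The active-open handshake has completed: move the socket to ESTABLISHED,
 * initialize the transfer state and arm keepalive if requested.
 */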
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_set_state(sk, TCP_ESTABLISHED);
	icsk->icsk_ack.lrcvtime = tcp_jiffies32;

6019
	if (skb) {
E
Eric Dumazet 已提交
6020
		icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
P
Pavel Emelyanov 已提交
6021
		security_inet_conn_established(sk, skb);
6022
		sk_mark_napi_id(sk, skb);
D
David S. Miller 已提交
6023
	}
P
Pavel Emelyanov 已提交
6024

6025
	tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
P
Pavel Emelyanov 已提交
6026 6027 6028 6029

	/* Prevent spurious tcp_cwnd_restart() on first data
	 * packet.
	 */
6030
	tp->lsndtime = tcp_jiffies32;
P
Pavel Emelyanov 已提交
6031 6032 6033

	if (sock_flag(sk, SOCK_KEEPOPEN))
		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
6034 6035 6036 6037 6038

	if (!tp->rx_opt.snd_wscale)
		__tcp_fast_path_on(tp, tp->snd_wnd);
	else
		tp->pred_flags = 0;
P
Pavel Emelyanov 已提交
6039 6040
}

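/* Process the SYN-ACK of an active TCP Fast Open connection: record the
 * received cookie state and retransmit any SYN data the peer did not
 * acknowledge. Returns true if SYN data had to be retransmitted.
 */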
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
				    struct tcp_fastopen_cookie *cookie)
{
	struct tcp_sock *tp = tcp_sk(sk);
6045
	struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
6046 6047
	u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
	bool syn_drop = false;
6048 6049 6050 6051 6052 6053 6054

	if (mss == tp->rx_opt.user_mss) {
		struct tcp_options_received opt;

		/* Get original SYNACK MSS value if user MSS sets mss_clamp */
		tcp_clear_options(&opt);
		opt.user_mss = opt.mss_clamp = 0;
6055
		tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
6056 6057 6058
		mss = opt.mss_clamp;
	}

6059 6060
	if (!tp->syn_fastopen) {
		/* Ignore an unsolicited cookie */
6061
		cookie->len = -1;
6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075
	} else if (tp->total_retrans) {
		/* SYN timed out and the SYN-ACK neither has a cookie nor
		 * acknowledges data. Presumably the remote received only
		 * the retransmitted (regular) SYNs: either the original
		 * SYN-data or the corresponding SYN-ACK was dropped.
		 */
		syn_drop = (cookie->len < 0 && data);
	} else if (cookie->len < 0 && !tp->syn_data) {
		/* We requested a cookie but didn't get it. If we did not use
		 * the (old) exp opt format then try so next time (try_exp=1).
		 * Otherwise we go back to use the RFC7413 opt (try_exp=2).
		 */
		try_exp = tp->syn_fastopen_exp ? 2 : 1;
	}
6076

6077
	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
6078 6079

	if (data) { /* Retransmit unacked data in SYN */
6080 6081 6082 6083
		if (tp->total_retrans)
			tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
		else
			tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
6084 6085 6086
		skb_rbtree_walk_from(data)
			 tcp_mark_skb_lost(sk, data);
		tcp_xmit_retransmit_queue(sk);
6087
		NET_INC_STATS(sock_net(sk),
6088
				LINUX_MIB_TCPFASTOPENACTIVEFAIL);
6089 6090
		return true;
	}
Y
Yuchung Cheng 已提交
6091
	tp->syn_data_acked = tp->syn_data;
6092 6093 6094 6095 6096 6097
	if (tp->syn_data_acked) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
		/* SYN-data is counted as two separate packets in tcp_ack() */
		if (tp->delivered > 1)
			--tp->delivered;
	}
6098 6099 6100

	tcp_fastopen_add_skb(sk, synack);

6101 6102 6103
	return false;
}

6104 6105 6106 6107 6108 6109 6110 6111 6112 6113
static void smc_check_reset_syn(struct tcp_sock *tp)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc)) {
		if (tp->syn_smc && !tp->rx_opt.smc_ok)
			tp->syn_smc = 0;
	}
#endif
}

6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128
static void tcp_try_undo_spurious_syn(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 syn_stamp;

	/* undo_marker is set when SYN or SYNACK times out. The timeout is
	 * spurious if the ACK's timestamp option echo value matches the
	 * original SYN timestamp.
	 */
	syn_stamp = tp->retrans_stamp;
	if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
	    syn_stamp == tp->rx_opt.rcv_tsecr)
		tp->undo_marker = 0;
}

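/* Handle an incoming segment in SYN-SENT state: validate the ACK, process
 * RST/SYN as described in RFC 793, and on a valid SYN-ACK set up sequence
 * numbers and negotiated options before moving towards ESTABLISHED.
 */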
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
					 const struct tcphdr *th)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_fastopen_cookie foc = { .len = -1 };
	int saved_clamp = tp->rx_opt.mss_clamp;
	bool fastopen_fail;

6138
	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
6139
	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
6140
		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
L
Linus Torvalds 已提交
6141 6142 6143 6144 6145 6146 6147 6148 6149 6150

	if (th->ack) {
		/* rfc793:
		 * "If the state is SYN-SENT then
		 *    first check the ACK bit
		 *      If the ACK bit is set
		 *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
		 *        a reset (unless the RST bit is set, if so drop
		 *        the segment and return)"
		 */
		if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
		    after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
			/* Previous FIN/ACK or RST/ACK might be ignored. */
			if (icsk->icsk_retransmits == 0)
				inet_csk_reset_xmit_timer(sk,
						ICSK_TIME_RETRANS,
						TCP_TIMEOUT_MIN, TCP_RTO_MAX);
			goto reset_and_undo;
		}

		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
		    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
			     tcp_time_stamp(tp))) {
			NET_INC_STATS(sock_net(sk),
					LINUX_MIB_PAWSACTIVEREJECTED);
			goto reset_and_undo;
		}

		/* Now ACK is acceptable.
		 *
		 * "If the RST bit is set
		 *    If the ACK was acceptable then signal the user "error:
		 *    connection reset", drop the segment, enter CLOSED state,
		 *    delete TCB, and return."
		 */

		if (th->rst) {
			tcp_reset(sk, skb);
			goto discard;
		}

		/* rfc793:
		 *   "fifth, if neither of the SYN or RST bits is set then
		 *    drop the segment and return."
		 *
		 *    See note below!
		 *                                        --ANK(990513)
		 */
		if (!th->syn)
			goto discard_and_undo;

		/* rfc793:
		 *   "If the SYN bit is on ...
		 *    are acceptable then ...
		 *    (our SYN has been ACKed), change the connection
		 *    state to ESTABLISHED..."
		 */

		tcp_ecn_rcv_synack(tp, th);

		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
		tcp_try_undo_spurious_syn(sk);
		tcp_ack(sk, skb, FLAG_SLOWPATH);

		/* Ok.. it's good. Set up sequence numbers and
		 * move to established.
		 */
		WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

		/* RFC1323: The window in SYN & SYN/ACK segments is
		 * never scaled.
		 */
		tp->snd_wnd = ntohs(th->window);

		if (!tp->rx_opt.wscale_ok) {
			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
			tp->window_clamp = min(tp->window_clamp, 65535U);
		}

		if (tp->rx_opt.saw_tstamp) {
			tp->rx_opt.tstamp_ok	   = 1;
			tp->tcp_header_len =
				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
			tp->advmss	    -= TCPOLEN_TSTAMP_ALIGNED;
			tcp_store_ts_recent(tp);
		} else {
			tp->tcp_header_len = sizeof(struct tcphdr);
		}

		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
		tcp_initialize_rcv_mss(sk);

		/* Remember, tcp_poll() does not lock socket!
		 * Change state from SYN-SENT only after copied_seq
		 * is initialized. */
		WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);

		smc_check_reset_syn(tp);

		smp_mb();

		tcp_finish_connect(sk, skb);

		fastopen_fail = (tp->syn_fastopen || tp->syn_data) &&
				tcp_rcv_fastopen_synack(sk, skb, &foc);

		if (!sock_flag(sk, SOCK_DEAD)) {
			sk->sk_state_change(sk);
			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
		}
		if (fastopen_fail)
			return -1;
		if (sk->sk_write_pending ||
		    icsk->icsk_accept_queue.rskq_defer_accept ||
		    inet_csk_in_pingpong_mode(sk)) {
			/* Save one ACK. Data will be ready after
			 * several ticks, if write_pending is set.
			 *
			 * It may be deleted, but with this feature tcpdumps
			 * look so _wonderfully_ clever, that I was not able
			 * to stand against the temptation 8)     --ANK
			 */
			inet_csk_schedule_ack(sk);
			tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  TCP_DELACK_MAX, TCP_RTO_MAX);

discard:
			tcp_drop(sk, skb);
			return 0;
		} else {
			tcp_send_ack(sk);
		}
		return -1;
	}

	/* No ACK in the segment */

	if (th->rst) {
		/* rfc793:
		 * "If the RST bit is set
		 *
		 *      Otherwise (no ACK) drop the segment and return."
		 */

		goto discard_and_undo;
	}

	/* PAWS check. */
	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
	    tcp_paws_reject(&tp->rx_opt, 0))
		goto discard_and_undo;

	if (th->syn) {
		/* We see SYN without ACK. It is attempt of
		 * simultaneous connect with crossed SYNs.
		 * Particularly, it can be connect to self.
		 */
		tcp_set_state(sk, TCP_SYN_RECV);

		if (tp->rx_opt.saw_tstamp) {
			tp->rx_opt.tstamp_ok = 1;
			tcp_store_ts_recent(tp);
			tp->tcp_header_len =
				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		} else {
			tp->tcp_header_len = sizeof(struct tcphdr);
		}

		WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
		WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

		/* RFC1323: The window in SYN & SYN/ACK segments is
		 * never scaled.
		 */
		tp->snd_wnd    = ntohs(th->window);
		tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
		tp->max_window = tp->snd_wnd;

		tcp_ecn_rcv_syn(tp, th);

		tcp_mtup_init(sk);
		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
		tcp_initialize_rcv_mss(sk);

		tcp_send_synack(sk);
#if 0
		/* Note, we could accept data and URG from this segment.
		 * There are no obstacles to make this (except that we must
		 * either change tcp_recvmsg() to prevent it from returning data
		 * before 3WHS completes per RFC793, or employ TCP Fast Open).
		 *
		 * However, if we ignore data in ACKless segments sometimes,
		 * we have no reasons to accept it sometimes.
		 * Also, seems the code doing it in step6 of tcp_rcv_state_process
		 * is not flawless. So, discard packet for sanity.
		 * Uncomment this return to process the data.
		 */
		return -1;
#else
		goto discard;
#endif
	}
	/* "fifth, if neither of the SYN or RST bits is set then
	 * drop the segment and return."
	 */

discard_and_undo:
	tcp_clear_options(&tp->rx_opt);
	tp->rx_opt.mss_clamp = saved_clamp;
	goto discard;

reset_and_undo:
	tcp_clear_options(&tp->rx_opt);
	tp->rx_opt.mss_clamp = saved_clamp;
	return 1;
}

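/* A Fast Open socket is leaving TCP_SYN_RECV (or TCP_FIN_WAIT1): undo any
 * spurious SYNACK-retransmission state, release the request sock and
 * re-arm the retransmission timer for data sent during the handshake.
 */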
static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
{
	struct request_sock *req;

	/* If we are still handling the SYNACK RTO, see if timestamp ECR allows
	 * undo. If peer SACKs triggered fast recovery, we can't undo here.
	 */
	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
		tcp_try_undo_loss(sk, false);

	/* Reset rtx states to prevent spurious retransmits_timed_out() */
	tcp_sk(sk)->retrans_stamp = 0;
	inet_csk(sk)->icsk_retransmits = 0;

	/* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
	 * we no longer need req so release it.
	 */
	req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
					lockdep_sock_is_held(sk));
	reqsk_fastopen_remove(sk, req, false);

	/* Re-arm the timer because data may have been sent out.
	 * This is similar to the regular data transmission case
	 * when new data has just been ack'ed.
	 *
	 * (TFO) - we could try to be more aggressive and
	 * retransmitting any data sooner based on when they
	 * are sent out.
	 */
	tcp_rearm_rto(sk);
}

/*
 *	This function implements the receiving procedure of RFC 793 for
 *	all states except ESTABLISHED and TIME_WAIT.
 *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
 *	address independent.
 */

int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcphdr *th = tcp_hdr(skb);
	struct request_sock *req;
	int queued = 0;
	bool acceptable;

	switch (sk->sk_state) {
	case TCP_CLOSE:
		goto discard;

	case TCP_LISTEN:
		if (th->ack)
			return 1;

		if (th->rst)
			goto discard;

		if (th->syn) {
			if (th->fin)
				goto discard;
			/* It is possible that we process SYN packets from backlog,
			 * so we need to make sure to disable BH and RCU right there.
			 */
			rcu_read_lock();
			local_bh_disable();
			acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
			local_bh_enable();
			rcu_read_unlock();

			if (!acceptable)
				return 1;
			consume_skb(skb);
			return 0;
		}
		goto discard;

	case TCP_SYN_SENT:
		tp->rx_opt.saw_tstamp = 0;
		tcp_mstamp_refresh(tp);
		queued = tcp_rcv_synsent_state_process(sk, skb, th);
		if (queued >= 0)
			return queued;

		/* Do step6 onward by hand. */
		tcp_urg(sk, skb, th);
		__kfree_skb(skb);
		tcp_data_snd_check(sk);
		return 0;
	}

	tcp_mstamp_refresh(tp);
	tp->rx_opt.saw_tstamp = 0;
	req = rcu_dereference_protected(tp->fastopen_rsk,
					lockdep_sock_is_held(sk));
	if (req) {
		bool req_stolen;

		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
		    sk->sk_state != TCP_FIN_WAIT1);

		if (!tcp_check_req(sk, skb, req, true, &req_stolen))
			goto discard;
	}

	if (!th->ack && !th->rst && !th->syn)
		goto discard;

	if (!tcp_validate_incoming(sk, skb, th, 0))
		return 0;

	/* step 5: check the ACK field */
	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
				      FLAG_UPDATE_TS_RECENT |
				      FLAG_NO_CHALLENGE_ACK) > 0;

	if (!acceptable) {
		if (sk->sk_state == TCP_SYN_RECV)
			return 1;	/* send one RST */
		tcp_send_challenge_ack(sk);
		goto discard;
	}
	switch (sk->sk_state) {
	case TCP_SYN_RECV:
		tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
		if (!tp->srtt_us)
			tcp_synack_rtt_meas(sk, req);

		if (req) {
			tcp_rcv_synrecv_state_fastopen(sk);
		} else {
			tcp_try_undo_spurious_syn(sk);
			tp->retrans_stamp = 0;
			tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
					  skb);
			WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
		}
		smp_mb();
		tcp_set_state(sk, TCP_ESTABLISHED);
		sk->sk_state_change(sk);

		/* Note, that this wakeup is only for marginal crossed SYN case.
		 * Passively open sockets are not waked up, because
		 * sk->sk_sleep == NULL and sk->sk_socket == NULL.
		 */
		if (sk->sk_socket)
			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);

		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

		if (tp->rx_opt.tstamp_ok)
			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

		if (!inet_csk(sk)->icsk_ca_ops->cong_control)
			tcp_update_pacing_rate(sk);

		/* Prevent spurious tcp_cwnd_restart() on first data packet */
		tp->lsndtime = tcp_jiffies32;

		tcp_initialize_rcv_mss(sk);
		tcp_fast_path_on(tp);
		break;

	case TCP_FIN_WAIT1: {
		int tmo;

		if (req)
			tcp_rcv_synrecv_state_fastopen(sk);

		if (tp->snd_una != tp->write_seq)
			break;

		tcp_set_state(sk, TCP_FIN_WAIT2);
		sk->sk_shutdown |= SEND_SHUTDOWN;

		sk_dst_confirm(sk);

		if (!sock_flag(sk, SOCK_DEAD)) {
			/* Wake up lingering close() */
			sk->sk_state_change(sk);
			break;
		}

		if (tp->linger2 < 0) {
			tcp_done(sk);
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
			return 1;
		}
		if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
		    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
			/* Receive out of order FIN after close() */
			if (tp->syn_fastopen && th->fin)
				tcp_fastopen_active_disable(sk);
			tcp_done(sk);
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
			return 1;
		}

		tmo = tcp_fin_time(sk);
		if (tmo > TCP_TIMEWAIT_LEN) {
			inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
		} else if (th->fin || sock_owned_by_user(sk)) {
			/* Bad case. We could lose such FIN otherwise.
			 * It is not a big problem, but it looks confusing
			 * and not so rare event. We still can lose it now,
			 * if it spins in bh_lock_sock(), but it is really
			 * marginal case.
			 */
			inet_csk_reset_keepalive_timer(sk, tmo);
		} else {
			tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
			goto discard;
		}
		break;
	}

	case TCP_CLOSING:
		if (tp->snd_una == tp->write_seq) {
			tcp_time_wait(sk, TCP_TIME_WAIT, 0);
			goto discard;
		}
		break;

	case TCP_LAST_ACK:
		if (tp->snd_una == tp->write_seq) {
			tcp_update_metrics(sk);
			tcp_done(sk);
			goto discard;
		}
		break;
	}

	/* step 6: check the URG bit */
	tcp_urg(sk, skb, th);

	/* step 7: process the segment text */
	switch (sk->sk_state) {
	case TCP_CLOSE_WAIT:
	case TCP_CLOSING:
	case TCP_LAST_ACK:
		if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
			/* If a subflow has been reset, the packet should not
			 * continue to be processed, drop the packet.
			 */
			if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
				goto discard;
			break;
		}
		fallthrough;
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
		/* RFC 793 says to queue data in these states,
		 * RFC 1122 says we MUST send a reset.
		 * BSD 4.4 also does reset.
		 */
		if (sk->sk_shutdown & RCV_SHUTDOWN) {
			if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
				NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				tcp_reset(sk, skb);
				return 1;
			}
		}
		fallthrough;
	case TCP_ESTABLISHED:
		tcp_data_queue(sk, skb);
		queued = 1;
		break;
	}

	/* tcp_data could move socket to TIME-WAIT */
	if (sk->sk_state != TCP_CLOSE) {
		tcp_data_snd_check(sk);
		tcp_ack_snd_check(sk);
	}

	if (!queued) {
discard:
		tcp_drop(sk, skb);
	}
	return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);

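/* Rate-limited debug message emitted when an open request is dropped. */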
static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	if (family == AF_INET)
		net_dbg_ratelimited("drop open request from %pI4/%u\n",
				    &ireq->ir_rmt_addr, port);
#if IS_ENABLED(CONFIG_IPV6)
	else if (family == AF_INET6)
		net_dbg_ratelimited("drop open request from %pI6/%u\n",
				    &ireq->ir_v6_rmt_addr, port);
#endif
}

/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
 *
 * If we receive a SYN packet with these bits set, it means a
 * network is playing bad games with TOS bits. In order to
 * avoid possible false congestion notifications, we disable
 * TCP ECN negotiation.
 *
 * Exception: tcp_ca wants ECN. This is required for DCTCP
 * congestion control: Linux DCTCP asserts ECT on all packets,
 * including SYN, which is most optimal solution; however,
 * others, such as FreeBSD do not.
 *
 * Exception: At least one of the reserved bits of the TCP header (th->res1) is
 * set, indicating the use of a future TCP extension (such as AccECN). See
 * RFC8311 §4.3 which updates RFC3168 to allow the development of such
 * extensions.
 */
static void tcp_ecn_create_request(struct request_sock *req,
				   const struct sk_buff *skb,
				   const struct sock *listen_sk,
				   const struct dst_entry *dst)
{
	const struct tcphdr *th = tcp_hdr(skb);
	const struct net *net = sock_net(listen_sk);
	bool th_ecn = th->ece && th->cwr;
	bool ect, ecn_ok;
	u32 ecn_ok_dst;

	if (!th_ecn)
		return;

	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
	ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;

	if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
	    (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
	    tcp_bpf_ca_needs_ecn((struct sock *)req))
		inet_rsk(req)->ecn_ok = 1;
}

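/* Initialize a freshly allocated request sock from the incoming SYN and
 * its parsed TCP options.
 */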
static void tcp_openreq_init(struct request_sock *req,
			     const struct tcp_options_received *rx_opt,
			     struct sk_buff *skb, const struct sock *sk)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	req->rsk_rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
	tcp_rsk(req)->snt_synack = 0;
	tcp_rsk(req)->last_oow_ack_time = 0;
	req->mss = rx_opt->mss_clamp;
	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
	ireq->tstamp_ok = rx_opt->tstamp_ok;
	ireq->sack_ok = rx_opt->sack_ok;
	ireq->snd_wscale = rx_opt->snd_wscale;
	ireq->wscale_ok = rx_opt->wscale_ok;
	ireq->acked = 0;
	ireq->ecn_ok = 0;
	ireq->ir_rmt_port = tcp_hdr(skb)->source;
	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
	ireq->ir_mark = inet_request_mark(sk, skb);
#if IS_ENABLED(CONFIG_SMC)
	ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
			tcp_sk(sk)->smc_hs_congested(sk));
#endif
}

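/* Allocate a request sock for a listener and preset the fields common to
 * all inet request socks.
 */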
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
				      struct sock *sk_listener,
				      bool attach_listener)
{
	struct request_sock *req = reqsk_alloc(ops, sk_listener,
					       attach_listener);

	if (req) {
		struct inet_request_sock *ireq = inet_rsk(req);

		ireq->ireq_opt = NULL;
#if IS_ENABLED(CONFIG_IPV6)
		ireq->pktopts = NULL;
#endif
		atomic64_set(&ireq->ir_cookie, 0);
		ireq->ireq_state = TCP_NEW_SYN_RECV;
		write_pnet(&ireq->ireq_net, sock_net(sk_listener));
		ireq->ireq_family = sk_listener->sk_family;
		req->timeout = TCP_TIMEOUT_INIT;
	}

	return req;
}
EXPORT_SYMBOL(inet_reqsk_alloc);

/*
 * Return true if a syncookie should be sent
 */
static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
{
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct net *net = sock_net(sk);

#ifdef CONFIG_SYN_COOKIES
	if (net->ipv4.sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	if (!queue->synflood_warned &&
	    net->ipv4.sysctl_tcp_syncookies != 2 &&
	    xchg(&queue->synflood_warned, 1) == 0)
		net_info_ratelimited("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
				     proto, sk->sk_num, msg);

	return want_cookie;
}

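/* If the listener has TCP_SAVE_SYN enabled, stash a copy of the SYN's
 * headers on the request sock so that user space can retrieve them later
 * (optionally including the MAC header when save_syn == 2).
 */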
static void tcp_reqsk_record_syn(const struct sock *sk,
				 struct request_sock *req,
				 const struct sk_buff *skb)
{
	if (tcp_sk(sk)->save_syn) {
		u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
		struct saved_syn *saved_syn;
		u32 mac_hdrlen;
		void *base;

		if (tcp_sk(sk)->save_syn == 2) {  /* Save full header. */
			base = skb_mac_header(skb);
			mac_hdrlen = skb_mac_header_len(skb);
			len += mac_hdrlen;
		} else {
			base = skb_network_header(skb);
			mac_hdrlen = 0;
		}

		saved_syn = kmalloc(struct_size(saved_syn, data, len),
				    GFP_ATOMIC);
		if (saved_syn) {
			saved_syn->mac_hdrlen = mac_hdrlen;
			saved_syn->network_hdrlen = skb_network_header_len(skb);
			saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
			memcpy(saved_syn->data, base, len);
			req->saved_syn = saved_syn;
		}
	}
}

/* If a SYN cookie is required and supported, returns a clamped MSS value to be
 * used for SYN cookie generation.
 */
u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
			  const struct tcp_request_sock_ops *af_ops,
			  struct sock *sk, struct tcphdr *th)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u16 mss;

	if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 &&
	    !inet_csk_reqsk_queue_is_full(sk))
		return 0;

	if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
		return 0;

	if (sk_acceptq_is_full(sk)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		return 0;
	}

	mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
	if (!mss)
		mss = af_ops->mss_clamp;

	return mss;
}
EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);

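/* Core SYN processing for a listening socket, shared by IPv4 and IPv6 via
 * af_ops: allocate and initialize a request sock, decide whether to answer
 * with a syncookie, and send the SYN-ACK (creating a Fast Open child
 * socket right away when possible).
 */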
int tcp_conn_request(struct request_sock_ops *rsk_ops,
		     const struct tcp_request_sock_ops *af_ops,
		     struct sock *sk, struct sk_buff *skb)
{
	struct tcp_fastopen_cookie foc = { .len = -1 };
	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
	struct tcp_options_received tmp_opt;
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct sock *fastopen_sk = NULL;
	struct request_sock *req;
	bool want_cookie = false;
	struct dst_entry *dst;
	struct flowi fl;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
		if (!want_cookie)
			goto drop;
	}

	if (sk_acceptq_is_full(sk)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		goto drop;
	}

	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
	if (!req)
		goto drop;

	req->syncookie = want_cookie;
	tcp_rsk(req)->af_specific = af_ops;
	tcp_rsk(req)->ts_off = 0;
#if IS_ENABLED(CONFIG_MPTCP)
	tcp_rsk(req)->is_mptcp = 0;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = af_ops->mss_clamp;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
			  want_cookie ? NULL : &foc);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	if (IS_ENABLED(CONFIG_SMC) && want_cookie)
		tmp_opt.smc_ok = 0;

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb, sk);
	inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;

	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);

	dst = af_ops->route_req(sk, skb, &fl, req);
	if (!dst)
		goto drop_and_free;

	if (tmp_opt.tstamp_ok)
		tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);

	if (!want_cookie && !isn) {
		/* Kill the following clause, if you dislike this way. */
		if (!net->ipv4.sysctl_tcp_syncookies &&
		    (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
		     (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
		    !tcp_peer_is_proven(req, dst)) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
				    rsk_ops->family);
			goto drop_and_release;
		}

		isn = af_ops->init_seq(skb);
	}

	tcp_ecn_create_request(req, skb, sk, dst);

	if (want_cookie) {
		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
		if (!tmp_opt.tstamp_ok)
			inet_rsk(req)->ecn_ok = 0;
	}

	tcp_rsk(req)->snt_isn = isn;
	tcp_rsk(req)->txhash = net_tx_rndhash();
	tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
	tcp_openreq_init_rwin(req, sk, dst);
	sk_rx_queue_set(req_to_sk(req), skb);
	if (!want_cookie) {
		tcp_reqsk_record_syn(sk, req, skb);
		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
	}
	if (fastopen_sk) {
		af_ops->send_synack(fastopen_sk, dst, &fl, req,
				    &foc, TCP_SYNACK_FASTOPEN, skb);
		/* Add the child socket directly into the accept queue */
		if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
			reqsk_fastopen_remove(fastopen_sk, req, false);
			bh_unlock_sock(fastopen_sk);
			sock_put(fastopen_sk);
			goto drop_and_free;
		}
		sk->sk_data_ready(sk);
		bh_unlock_sock(fastopen_sk);
		sock_put(fastopen_sk);
	} else {
		tcp_rsk(req)->tfo_listener = false;
		if (!want_cookie) {
			req->timeout = tcp_timeout_init((struct sock *)req);
			inet_csk_reqsk_queue_hash_add(sk, req, req->timeout);
		}
		af_ops->send_synack(sk, dst, &fl, req, &foc,
				    !want_cookie ? TCP_SYNACK_NORMAL :
						   TCP_SYNACK_COOKIE,
				    skb);
		if (want_cookie) {
			reqsk_free(req);
			return 0;
		}
	}
	reqsk_put(req);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	__reqsk_free(req);
drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_conn_request);