// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *	     				Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

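/* Per-destination timestamp offset for a new connection, derived from the
 * address pair (see secure_tcp_ts_off()).
 */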
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * in case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

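/* Handle an ICMP error reported against a TCP socket.  Returns 0 once the
 * error has been processed, or -ENOENT when no matching socket is found.
 */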
int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build the reply based only on parameters
 *		that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
#define OPTION_BYTES sizeof(__be32)
#endif

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not losing security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

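	/* Seed arg.csum with the IPv4 pseudo-header sum; the reply path stores
	 * the folded TCP checksum at arg.csumoffset.
	 */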
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

944
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
945
				  struct request_sock *req)
L
Linus Torvalds 已提交
946
{
947
	const union tcp_md5_addr *addr;
948
	int l3index;
949

950 951 952
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
953 954 955
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

956 957 958 959 960
	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
961
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
962
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
963
	tcp_v4_send_ack(sk, skb, seq,
964 965
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
966
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
967 968
			req->ts_recent,
			0,
969
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
970 971
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
L
Linus Torvalds 已提交
972 973 974
}

/*
975
 *	Send a SYN-ACK after having received a SYN.
976
 *	This still operates on a request_sock only, not on a big
L
Linus Torvalds 已提交
977 978
 *	socket.
 */
979
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
980
			      struct flowi *fl,
981
			      struct request_sock *req,
982
			      struct tcp_fastopen_cookie *foc,
983 984
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
L
Linus Torvalds 已提交
985
{
986
	const struct inet_request_sock *ireq = inet_rsk(req);
987
	struct flowi4 fl4;
L
Linus Torvalds 已提交
988
	int err = -1;
989
	struct sk_buff *skb;
990
	u8 tos;
L
Linus Torvalds 已提交
991 992

	/* First, grab a route. */
993
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
994
		return -1;
L
Linus Torvalds 已提交
995

996
	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
L
Linus Torvalds 已提交
997 998

	if (skb) {
999
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
L
Linus Torvalds 已提交
1000

1001
		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
W
Wei Wang 已提交
1002 1003
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
1004 1005 1006 1007 1008 1009
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

1010
		rcu_read_lock();
1011 1012
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
1013
					    rcu_dereference(ireq->ireq_opt),
1014
					    tos);
1015
		rcu_read_unlock();
1016
		err = net_xmit_eval(err);
L
Linus Torvalds 已提交
1017 1018 1019 1020 1021 1022
	}

	return err;
}

/*
1023
 *	IPv4 request_sock destructor.
L
Linus Torvalds 已提交
1024
 */
1025
static void tcp_v4_reqsk_destructor(struct request_sock *req)
L
Linus Torvalds 已提交
1026
{
E
Eric Dumazet 已提交
1027
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
L
Linus Torvalds 已提交
1028 1029
}

1030 1031 1032 1033 1034 1035 1036
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

1037
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1038 1039
EXPORT_SYMBOL(tcp_md5_needed);

1040
/* Find the Key structure for an address.  */
1041
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1042 1043
					   const union tcp_md5_addr *addr,
					   int family)
1044
{
1045
	const struct tcp_sock *tp = tcp_sk(sk);
E
Eric Dumazet 已提交
1046
	struct tcp_md5sig_key *key;
1047
	const struct tcp_md5sig_info *md5sig;
1048 1049 1050
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;
1051

1052 1053
	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
1054
				       lockdep_sock_is_held(sk));
1055
	if (!md5sig)
1056
		return NULL;
A
Arnd Bergmann 已提交
1057

1058 1059
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
E
Eric Dumazet 已提交
1060 1061
		if (key->family != family)
			continue;
1062 1063
		if (key->l3index && key->l3index != l3index)
			continue;
1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
1083
EXPORT_SYMBOL(__tcp_md5_do_lookup);
1084

1085 1086
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
1087 1088
						      int family, u8 prefixlen,
						      int l3index)
1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
1104 1105
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
1106 1107
		if (key->family != family)
			continue;
1108 1109
		if (key->l3index && key->l3index != l3index)
			continue;
1110 1111
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
E
Eric Dumazet 已提交
1112
			return key;
1113 1114 1115 1116
	}
	return NULL;
}

1117
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1118
					 const struct sock *addr_sk)
1119
{
1120
	const union tcp_md5_addr *addr;
1121
	int l3index;
E
Eric Dumazet 已提交
1122

1123 1124
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
1125
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1126
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1127 1128 1129 1130
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
E
Eric Dumazet 已提交
1131
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1132 1133
		   int family, u8 prefixlen, int l3index,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1134 1135
{
	/* Add Key to the list */
1136
	struct tcp_md5sig_key *key;
1137
	struct tcp_sock *tp = tcp_sk(sk);
E
Eric Dumazet 已提交
1138
	struct tcp_md5sig_info *md5sig;
1139

1140
	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1141
	if (key) {
1142 1143 1144 1145 1146 1147 1148
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));
1149

1150 1151 1152 1153 1154 1155
		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);
1156

E
Eric Dumazet 已提交
1157 1158
		return 0;
	}
1159

1160
	md5sig = rcu_dereference_protected(tp->md5sig_info,
1161
					   lockdep_sock_is_held(sk));
E
Eric Dumazet 已提交
1162 1163 1164
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
1165 1166
			return -ENOMEM;

E
Eric Dumazet 已提交
1167 1168
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
1169
		rcu_assign_pointer(tp->md5sig_info, md5sig);
E
Eric Dumazet 已提交
1170
	}
1171

1172
	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
E
Eric Dumazet 已提交
1173 1174
	if (!key)
		return -ENOMEM;
1175
	if (!tcp_alloc_md5sig_pool()) {
1176
		sock_kfree_s(sk, key, sizeof(*key));
E
Eric Dumazet 已提交
1177
		return -ENOMEM;
1178
	}
E
Eric Dumazet 已提交
1179 1180 1181 1182

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
1183
	key->prefixlen = prefixlen;
1184
	key->l3index = l3index;
E
Eric Dumazet 已提交
1185 1186 1187 1188
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
1189 1190
	return 0;
}
E
Eric Dumazet 已提交
1191
EXPORT_SYMBOL(tcp_md5_do_add);
1192

1193
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1194
		   u8 prefixlen, int l3index)
1195
{
E
Eric Dumazet 已提交
1196 1197
	struct tcp_md5sig_key *key;

1198
	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
E
Eric Dumazet 已提交
1199 1200 1201
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
1202
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
E
Eric Dumazet 已提交
1203 1204
	kfree_rcu(key, rcu);
	return 0;
1205
}
E
Eric Dumazet 已提交
1206
EXPORT_SYMBOL(tcp_md5_do_del);
1207

1208
static void tcp_clear_md5_list(struct sock *sk)
1209 1210
{
	struct tcp_sock *tp = tcp_sk(sk);
E
Eric Dumazet 已提交
1211
	struct tcp_md5sig_key *key;
1212
	struct hlist_node *n;
1213
	struct tcp_md5sig_info *md5sig;
1214

1215 1216
	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

1217
	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
E
Eric Dumazet 已提交
1218
		hlist_del_rcu(&key->node);
1219
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
E
Eric Dumazet 已提交
1220
		kfree_rcu(key, rcu);
1221 1222 1223
	}
}

1224
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1225
				 sockptr_t optval, int optlen)
1226 1227 1228
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1229
	const union tcp_md5_addr *addr;
1230
	u8 prefixlen = 32;
1231
	int l3index = 0;
1232 1233 1234 1235

	if (optlen < sizeof(cmd))
		return -EINVAL;

1236
	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1237 1238 1239 1240 1241
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

1242 1243 1244 1245 1246 1247 1248
	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266
	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

1267 1268
	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

1269
	if (!cmd.tcpm_keylen)
1270
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1271 1272 1273 1274

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

1275
	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1276
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1277 1278
}

1279 1280 1281
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
1282 1283
{
	struct tcp4_pseudohdr *bp;
1284
	struct scatterlist sg;
1285
	struct tcphdr *_th;
1286

1287
	bp = hp->scratch;
1288 1289 1290
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
1291
	bp->protocol = IPPROTO_TCP;
1292
	bp->len = cpu_to_be16(nbytes);
1293

1294 1295 1296 1297 1298 1299 1300
	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
H
Herbert Xu 已提交
1301
	return crypto_ahash_update(hp->md5_req);
1302 1303
}

E
Eric Dumazet 已提交
1304
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
E
Eric Dumazet 已提交
1305
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1306 1307
{
	struct tcp_md5sig_pool *hp;
H
Herbert Xu 已提交
1308
	struct ahash_request *req;
1309 1310 1311 1312

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
H
Herbert Xu 已提交
1313
	req = hp->md5_req;
1314

H
Herbert Xu 已提交
1315
	if (crypto_ahash_init(req))
1316
		goto clear_hash;
1317
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1318 1319 1320
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
H
Herbert Xu 已提交
1321 1322
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
1323 1324 1325 1326
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;
1327

1328 1329 1330 1331
clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
1332
	return 1;
1333 1334
}

1335 1336
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
E
Eric Dumazet 已提交
1337
			const struct sk_buff *skb)
1338
{
1339
	struct tcp_md5sig_pool *hp;
H
Herbert Xu 已提交
1340
	struct ahash_request *req;
E
Eric Dumazet 已提交
1341
	const struct tcphdr *th = tcp_hdr(skb);
1342 1343
	__be32 saddr, daddr;

1344 1345 1346
	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
1347
	} else {
1348 1349 1350
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
1351
	}
1352 1353 1354 1355

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
H
Herbert Xu 已提交
1356
	req = hp->md5_req;
1357

H
Herbert Xu 已提交
1358
	if (crypto_ahash_init(req))
1359 1360
		goto clear_hash;

1361
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1362 1363 1364 1365 1366
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
H
Herbert Xu 已提交
1367 1368
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
1369 1370 1371 1372 1373 1374 1375 1376 1377 1378
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
1379
}
1380
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1381

1382 1383
#endif

1384
/* Called with rcu_read_lock() */
1385
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1386 1387
				    const struct sk_buff *skb,
				    int dif, int sdif)
1388
{
1389
#ifdef CONFIG_TCP_MD5SIG
1390 1391 1392 1393 1394 1395 1396 1397
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and its wrong.
	 */
1398
	const __u8 *hash_location = NULL;
1399
	struct tcp_md5sig_key *hash_expected;
1400
	const struct iphdr *iph = ip_hdr(skb);
1401
	const struct tcphdr *th = tcp_hdr(skb);
1402
	const union tcp_md5_addr *addr;
1403
	unsigned char newhash[16];
1404 1405 1406 1407 1408 1409
	int genhash, l3index;

	/* sdif set, means packet ingressed via a device
	 * in an L3 domain and dif is set to the l3mdev
	 */
	l3index = sdif ? dif : 0;
1410

1411
	addr = (union tcp_md5_addr *)&iph->saddr;
1412
	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1413
	hash_location = tcp_parse_md5sig_option(th);
1414 1415 1416

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
E
Eric Dumazet 已提交
1417
		return false;
1418 1419

	if (hash_expected && !hash_location) {
1420
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
E
Eric Dumazet 已提交
1421
		return true;
1422 1423 1424
	}

	if (!hash_expected && hash_location) {
1425
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
E
Eric Dumazet 已提交
1426
		return true;
1427 1428 1429 1430 1431
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
1432 1433
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
1434
				      NULL, skb);
1435 1436

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1437
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1438
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1439 1440 1441
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
1442
				     : "", l3index);
E
Eric Dumazet 已提交
1443
		return true;
1444
	}
E
Eric Dumazet 已提交
1445
	return false;
1446
#endif
1447 1448
	return false;
}
1449

1450 1451
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
1452 1453 1454
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
E
Eric Dumazet 已提交
1455
	struct net *net = sock_net(sk_listener);
1456

1457 1458
	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
E
Eric Dumazet 已提交
1459
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1460 1461
}

1462
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1463
					  struct sk_buff *skb,
1464
					  struct flowi *fl,
1465
					  struct request_sock *req)
1466
{
1467 1468 1469 1470 1471
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

1472
	return inet_csk_route_req(sk, &fl->u.ip4, req);
1473 1474
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs send to broadcast or multicast */
E
Eric Dumazet 已提交
1503
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
L
Linus Torvalds 已提交
1504 1505
		goto drop;

O
Octavian Purdila 已提交
1506 1507
	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);
L
Linus Torvalds 已提交
1508 1509

drop:
1510
	tcp_listendrop(sk);
L
Linus Torvalds 已提交
1511 1512
	return 0;
}
E
Eric Dumazet 已提交
1513
EXPORT_SYMBOL(tcp_v4_conn_request);
L
Linus Torvalds 已提交
1514 1515 1516 1517 1518 1519


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
1520
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1521
				  struct request_sock *req,
1522 1523 1524
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
L
Linus Torvalds 已提交
1525
{
1526
	struct inet_request_sock *ireq;
1527
	bool found_dup_sk = false;
L
Linus Torvalds 已提交
1528 1529 1530
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
1531
#ifdef CONFIG_TCP_MD5SIG
1532
	const union tcp_md5_addr *addr;
1533
	struct tcp_md5sig_key *key;
1534
	int l3index;
1535
#endif
1536
	struct ip_options_rcu *inet_opt;
L
Linus Torvalds 已提交
1537 1538 1539 1540 1541 1542

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
1543
		goto exit_nonewsk;
L
Linus Torvalds 已提交
1544

1545
	newsk->sk_gso_type = SKB_GSO_TCPV4;
1546
	inet_sk_rx_dst_set(newsk, skb);
L
Linus Torvalds 已提交
1547 1548 1549

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
1550
	ireq		      = inet_rsk(req);
1551 1552
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1553
	newsk->sk_bound_dev_if = ireq->ir_iif;
E
Eric Dumazet 已提交
1554 1555 1556
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1557
	newinet->mc_index     = inet_iif(skb);
1558
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1559
	newinet->rcv_tos      = ip_hdr(skb)->tos;
1560
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1561 1562
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1563
	newinet->inet_id = prandom_u32();
L
Linus Torvalds 已提交
1564

W
Wei Wang 已提交
1565 1566 1567
	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
1568 1569 1570
	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

E
Eric Dumazet 已提交
1571 1572 1573 1574 1575 1576 1577
	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
1578 1579
	sk_setup_caps(newsk, dst);

1580 1581
	tcp_ca_openreq_child(newsk, dst);

L
Linus Torvalds 已提交
1582
	tcp_sync_mss(newsk, dst_mtu(dst));
E
Eric Dumazet 已提交
1583
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1584

L
Linus Torvalds 已提交
1585 1586
	tcp_initialize_rcv_mss(newsk);

1587
#ifdef CONFIG_TCP_MD5SIG
1588
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1589
	/* Copy over the MD5 key from the original socket */
1590
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1591
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1592
	if (key) {
1593 1594 1595 1596 1597 1598
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
1599
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1600
			       key->key, key->keylen, GFP_ATOMIC);
E
Eric Dumazet 已提交
1601
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1602 1603 1604
	}
#endif

1605 1606
	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
1607 1608
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
E
Eric Dumazet 已提交
1609
	if (likely(*own_req)) {
1610
		tcp_move_syn(newtp, req);
E
Eric Dumazet 已提交
1611 1612
		ireq->ireq_opt = NULL;
	} else {
1613 1614
		newinet->inet_opt = NULL;

1615 1616 1617 1618 1619 1620 1621 1622
		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
E
Eric Dumazet 已提交
1623
	}
L
Linus Torvalds 已提交
1624 1625 1626
	return newsk;

exit_overflow:
1627
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1628 1629
exit_nonewsk:
	dst_release(dst);
L
Linus Torvalds 已提交
1630
exit:
1631
	tcp_listendrop(sk);
L
Linus Torvalds 已提交
1632
	return NULL;
1633
put_and_exit:
E
Eric Dumazet 已提交
1634
	newinet->inet_opt = NULL;
1635 1636
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
1637
	goto exit;
L
Linus Torvalds 已提交
1638
}
E
Eric Dumazet 已提交
1639
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

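/* Used when a SYN is answered without going through the normal listener
 * path (e.g. by the bpf_tcp_gen_syncookie() helper): returns the MSS to
 * encode (0 if syncookies are unavailable) and stores the cookie ISN in
 * *cookie.
 */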
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

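/* Queue an skb on the backlog of a socket currently owned by user context.
 * Coalescing with the backlog tail is attempted first.  Returns true if the
 * skb could not be queued (bad checksum or backlog limit exceeded); in that
 * case the socket spinlock has already been released and the caller must
 * still free the skb.
 */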
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64*1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

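/* Run the socket's filter on an incoming segment; if the filter trims the
 * skb, never trim below the TCP header itself.
 */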
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

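/* IPCB() and TCP_SKB_CB() share skb->cb[]: tcp_v4_fill_cb() saves the IP
 * control block and fills in the TCP fields, while tcp_v4_restore_cb() puts
 * the IP control block back when the skb has to be fed to another lookup.
 */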
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 *	From tcp_input.c
 */

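/* Main IPv4 receive path: validate the header and checksum, look up the
 * owning socket and hand the segment to the state machine, either directly,
 * via the backlog, or through the NEW_SYN_RECV/TIME_WAIT special cases.
 */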
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sk_buff *skb_to_free;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
		skb_to_free = NULL;
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
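/* Cache the input route (and the interface it arrived on) on the socket so
 * that tcp_v4_early_demux() can reuse it for subsequent packets.
 */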
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static unsigned short seq_file_family(const struct seq_file *seq);

static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
{
	unsigned short family = seq_file_family(seq);

	/* AF_UNSPEC is used as a match all */
	return ((family == AF_UNSPEC || family == sk->sk_family) &&
		net_eq(sock_net(sk), seq_file_net(seq)));
}

/* Find a non empty bucket (starting from st->bucket)
 * and return the first sk from it.
 */
static void *listening_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
		struct inet_listen_hashbucket *ilb2;
		struct inet_connection_sock *icsk;
		struct sock *sk;

		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
		if (hlist_empty(&ilb2->head))
			continue;

		spin_lock(&ilb2->lock);
		inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
			sk = (struct sock *)icsk;
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock(&ilb2->lock);
	}

	return NULL;
}

/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
 * If "cur" is the last one in the st->bucket,
 * call listening_get_first() to return the first sk of the next
 * non empty bucket.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct inet_listen_hashbucket *ilb2;
	struct inet_connection_sock *icsk;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	icsk = inet_csk(sk);
	inet_lhash2_for_each_icsk_continue(icsk) {
		sk = (struct sock *)icsk;
		if (seq_sk_match(seq, sk))
			return sk;
	}

	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
	spin_unlock(&ilb2->lock);
	++st->bucket;
	return listening_get_first(seq);
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_first(seq);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

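/* Resume the /proc iteration at the bucket and in-bucket offset remembered
 * from the previous read, so a sequential dump does not rescan the whole
 * hash table on every syscall.
 */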
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > tcp_hashinfo.lhash2_mask)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);

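/* Format one SYN_RECV request socket as a line of /proc/net/tcp. */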
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

#ifdef CONFIG_BPF_SYSCALL
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};

struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};

static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}

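/* Grow the batch array to new_batch_sz entries.  References still held on
 * the current batch are dropped first, so the caller must re-fill the batch
 * after a successful resize.
 */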
static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
				      unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_tcp_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}

static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct inet_connection_sock *icsk;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	icsk = inet_csk(start_sk);
	inet_lhash2_for_each_icsk_continue(icsk) {
		sk = (struct sock *)icsk;
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);

	return expected;
}

static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));

	return expected;
}

static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of having the tcp_seek_last_pos() to skip
	 * one by one in the current bucket and eventually find out
	 * it has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > tcp_hashinfo.lhash2_mask) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}

static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* bpf iter does not support lseek, so it always
	 * continue from where it was stop()-ped.
	 */
	if (*pos)
		return bpf_iter_tcp_batch(seq);

	return SEQ_START_TOKEN;
}

static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		/* Move st->offset to the next sk in the bucket such that
		 * the future start() will resume at st->offset in
		 * st->bucket.  See tcp_seek_last_pos().
		 */
		st->offset++;
		sock_put(iter->batch[iter->cur_sk++]);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}

static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	bool slow;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
2928 2929 2930 2931 2932 2933 2934
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		unlock_sock_fast(sk, slow);
	return ret;

2935 2936 2937 2938
}

static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
2939
	struct bpf_tcp_iter_state *iter = seq->private;
2940 2941 2942 2943 2944 2945 2946 2947 2948 2949
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

2950 2951 2952 2953
	if (iter->cur_sk < iter->end_sk) {
		bpf_iter_tcp_put_batch(iter);
		iter->st_bucket_done = false;
	}
2954 2955 2956 2957
}

static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
2958 2959
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
2960 2961 2962
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif
2963 2964
static unsigned short seq_file_family(const struct seq_file *seq)
{
2965
	const struct tcp_seq_afinfo *afinfo;
2966

2967
#ifdef CONFIG_BPF_SYSCALL
2968
	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
2969
	if (seq->op == &bpf_iter_tcp_seq_ops)
2970
		return AF_UNSPEC;
2971
#endif
2972 2973 2974 2975 2976

	/* Iterated from proc fs */
	afinfo = PDE_DATA(file_inode(seq->file));
	return afinfo->family;
}
2977

2978 2979 2980 2981 2982 2983 2984
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

/* @wake is one when sk_stream_write_space() calls us.
 * This sends EPOLLOUT only if notsent_bytes is half the limit.
 * This mimics the strategy used in sock_def_write_space().
 */
bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
			    READ_ONCE(tp->snd_nxt);

	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
}
EXPORT_SYMBOL(tcp_stream_memory_free);

L
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

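/* Per-netns setup: create the per-CPU control sockets used for sending
 * resets/ACKs and initialise the IPv4 TCP sysctl defaults for this namespace.
 */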
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

#define INIT_BATCH_SZ 16

static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}

static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}

static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};

static const struct bpf_func_proto *
bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
			    const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}

static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};

static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}

#endif

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}