// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <crypto/sha2.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"

static void mptcp_subflow_ops_undo_override(struct sock *ssk);

static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->msk)
		sock_put((struct sock *)subflow_req->msk);

	mptcp_token_destroy_request(req);
	tcp_request_sock_ops.destructor(req);
}

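/* compute an HMAC-SHA256 over both nonces, keyed with key1/key2; callers
 * truncate the digest as needed, e.g. to 64 bits for the join thmac
 */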
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

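/* additional subflows are acceptable only if the MPTCP connection is fully
 * established and the PM currently allows new subflows
 */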
static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
	return mptcp_is_fully_established((void *)msk) &&
	       READ_ONCE(msk->pm.accept_subflow);
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req)
{
	struct mptcp_sock *msk = subflow_req->msk;
	u8 hmac[SHA256_DIGEST_SIZE];

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);
}

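/* look up the msk owning the token carried by an MP_JOIN request and
 * reserve a local address id for it; returns the msk with an extra
 * reference held, or NULL
 */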
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return NULL;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return NULL;
	}
	subflow_req->local_id = local_id;

	return msk;
}

static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->msk = NULL;
	mptcp_token_init_request(req);

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return -EINVAL;
#endif

	return 0;
}

/* Init mptcp request socket.
 *
 * Returns an error code if a JOIN has failed and a TCP reset
 * should be sent.
 */
static int subflow_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	int ret;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

	ret = __subflow_init_req(req, sk_listener);
	if (ret)
		return 0;

	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (mp_opt.mp_join)
			return 0;
	} else if (mp_opt.mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (mp_opt.mp_capable && listener->request_mptcp) {
		int err, retries = 4;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
again:
		do {
			get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
		} while (subflow_req->local_key == 0);

		if (unlikely(req->syncookie)) {
			mptcp_crypto_key_sha(subflow_req->local_key,
					     &subflow_req->token,
					     &subflow_req->idsn);
			if (mptcp_token_exists(subflow_req->token)) {
				if (retries-- > 0)
					goto again;
			} else {
				subflow_req->mp_capable = 1;
			}
			return 0;
		}

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;
		else if (retries-- > 0)
			goto again;

	} else if (mp_opt.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = mp_opt.backup;
		subflow_req->remote_id = mp_opt.join_id;
		subflow_req->token = mp_opt.token;
		subflow_req->remote_nonce = mp_opt.nonce;
		subflow_req->msk = subflow_token_join_request(req);

		/* Can't fall back to TCP in this case. */
		if (!subflow_req->msk)
			return -EPERM;

		subflow_req_create_thmac(subflow_req);

		if (unlikely(req->syncookie)) {
			if (mptcp_can_accept_new_subflow(subflow_req->msk))
				subflow_init_req_cookie_join_save(subflow_req, skb);
		}

		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
			 subflow_req->remote_nonce, subflow_req->msk);
	}

	return 0;
}

int mptcp_subflow_init_cookie_req(struct request_sock *req,
				  const struct sock *sk_listener,
				  struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	int err;

	err = __subflow_init_req(req, sk_listener);
	if (err)
		return err;

	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable && mp_opt.mp_join)
		return -EINVAL;

	if (mp_opt.mp_capable && listener->request_mptcp) {
		if (mp_opt.sndr_key == 0)
			return -EINVAL;

		subflow_req->local_key = mp_opt.rcvr_key;
		err = mptcp_token_new_request(req);
		if (err)
			return err;

		subflow_req->mp_capable = 1;
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	} else if (mp_opt.mp_join && listener->request_mptcp) {
		if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
			return -EINVAL;

		if (mptcp_can_accept_new_subflow(subflow_req->msk))
			subflow_req->mp_join = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);

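/* route_req hook: perform the plain TCP routing first, then parse and
 * validate the MPTCP options; on failure release the dst and, unless
 * syncookies are in use, send a reset
 */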
static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;

	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
	if (!dst)
		return NULL;

	err = subflow_init_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp_request_sock_ops.send_reset(sk, skb);
	return NULL;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;

	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
	if (!dst)
		return NULL;

	err = subflow_init_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp6_request_sock_ops.send_reset(sk, skb);
	return NULL;
}
#endif

/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}

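/* abort a single subflow: send an active reset on the ssk and let the msk
 * worker, if it could be scheduled, release the parent reference
 */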
void mptcp_subflow_reset(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;

	/* must hold: tcp_done() could drop last reference on parent */
	sock_hold(sk);

	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	tcp_done(ssk);
	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
	    schedule_work(&mptcp_sk(sk)->work))
		return; /* worker will put sk for us */

	sock_put(sk);
}

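/* client side SYN-ACK processing: complete the MP_CAPABLE or MP_JOIN
 * handshake, falling back to plain TCP or resetting the subflow on failure
 */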
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_options_received mp_opt;
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	/* be sure no special action on any packet other than syn-ack */
	if (subflow->conn_finished)
		return;

	mptcp_propagate_sndbuf(parent, sk);
	subflow->rel_write_seq = 1;
	subflow->conn_finished = 1;
	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);

	mptcp_get_options(skb, &mp_opt);
	if (subflow->request_mptcp) {
		if (!mp_opt.mp_capable) {
			MPTCP_INC_STATS(sock_net(sk),
					MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
			mptcp_do_fallback(sk);
			pr_fallback(mptcp_sk(subflow->conn));
			goto fallback;
		}

		subflow->mp_capable = 1;
		subflow->can_ack = 1;
		subflow->remote_key = mp_opt.sndr_key;
		pr_debug("subflow=%p, remote_key=%llu", subflow,
			 subflow->remote_key);
		mptcp_finish_connect(sk);
	} else if (subflow->request_join) {
		u8 hmac[SHA256_DIGEST_SIZE];

		if (!mp_opt.mp_join)
			goto do_reset;

		subflow->thmac = mp_opt.thmac;
		subflow->remote_nonce = mp_opt.nonce;
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
			 subflow->thmac, subflow->remote_nonce);

		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			goto do_reset;
		}

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      hmac);
		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

		if (!mptcp_finish_join(sk))
			goto do_reset;

		subflow->mp_join = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
	} else if (mptcp_check_fallback(sk)) {
fallback:
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
	}
	return;

do_reset:
	mptcp_subflow_reset(sk);
}

struct request_sock_ops mptcp_subflow_request_sock_ops;
EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops);
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

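/* conn_request hook: discard SYNs to broadcast/multicast destinations and
 * process the others via tcp_conn_request() with the MPTCP-aware ops
 */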
static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;
static struct proto tcpv6_prot_override;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct mptcp_options_received *mp_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;

	subflow_req = mptcp_subflow_rsk(req);
	msk = subflow_req->msk;
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}

static void mptcp_sock_destruct(struct sock *sk)
{
	/* if new mptcp socket isn't accepted, it is free'd
	 * from the tcp listener sockets request queue, linked
	 * from req->sk.  The tcp socket is released.
	 * This calls the ULP release function which will
	 * also remove the mptcp socket, via
	 * sock_put(ctx->conn).
	 *
	 * Problem is that the mptcp socket will be in
	 * ESTABLISHED state and will not have the SOCK_DEAD flag.
	 * Both result in warnings from inet_sock_destruct.
	 */

	if (sk->sk_state == TCP_ESTABLISHED) {
		sk->sk_state = TCP_CLOSE;
		WARN_ON_ONCE(sk->sk_socket);
		sock_orphan(sk);
	}

	mptcp_destroy_common(mptcp_sk(sk));
	inet_sock_destruct(sk);
}

static void mptcp_force_close(struct sock *sk)
{
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}

static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;

	mptcp_subflow_ops_undo_override(sk);
}

static void subflow_drop_ctx(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);

	if (!ctx)
		return;

	subflow_ulp_fallback(ssk, ctx);
	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

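/* mark the subflow and the owning msk as fully established once the peer
 * key has been received
 */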
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
				     struct mptcp_options_received *mp_opt)
{
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	subflow->remote_key = mp_opt->sndr_key;
	subflow->fully_established = 1;
	subflow->can_ack = 1;
	WRITE_ONCE(msk->fully_established, true);
}

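/* syn_recv_sock hook: create the child subflow and, for MP_CAPABLE
 * connections, clone the new msk; for MP_JOIN, attach the child to the
 * existing msk or reset it if the join can't be accepted
 */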
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct mptcp_options_received mp_opt;
	bool fallback, fallback_is_fatal;
	struct sock *new_msk = NULL;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	/* After child creation we must look for 'mp_capable' even when options
	 * are not parsed
	 */
	mp_opt.mp_capable = 0;

	/* hopefully temporary handling for MP_JOIN+syncookie */
	subflow_req = mptcp_subflow_rsk(req);
	fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
	fallback = !tcp_rsk(req)->is_mptcp;
	if (fallback)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_capable) {
			fallback = true;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
		if (!new_msk)
			fallback = true;
	} else if (subflow_req->mp_join) {
		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) ||
		    !mptcp_can_accept_new_subflow(subflow_req->msk)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			fallback = true;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		tcp_rsk(req)->drop_req = false;

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal)
				goto dispose_child;

			subflow_drop_ctx(child);
			goto out;
		}

		if (ctx->mp_capable) {
			/* this can't race with mptcp_close(), as the msk is
			 * not yet exposed to user-space
			 */
			inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);

			/* record the newly created socket as the first msk
			 * subflow, but don't link it yet into conn_list
			 */
			WRITE_ONCE(mptcp_sk(new_msk)->first, child);

			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
			ctx->conn = new_msk;
			new_msk = NULL;

			/* with OoO packets we can reach here without ingress
			 * mpc option
			 */
			if (mp_opt.mp_capable)
				mptcp_subflow_fully_established(ctx, &mp_opt);
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = subflow_req->msk;
			if (!owner)
				goto dispose_child;

			/* move the msk reference ownership to the subflow */
			subflow_req->msk = NULL;
			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto dispose_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
			tcp_rsk(req)->drop_req = true;
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		mptcp_force_close(new_msk);

	/* check for expected invariant - should never trigger, just help
	 * catching earlier subtle bugs
	 */
	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

dispose_child:
	subflow_drop_ctx(child);
	tcp_rsk(req)->drop_req = true;
	inet_csk_prepare_for_destroy_sock(child);
	tcp_done(child);
	req->rsk_ops->send_reset(sk, skb);

	/* The last child reference will be released by the caller */
	return child;
}

static struct inet_connection_sock_af_ops subflow_specific;
static struct proto tcp_prot_override;

enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN,
	MAPPING_DUMMY
};

static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping only covers past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}

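/* parse the DSS mapping carried by the skb at the head of the ssk receive
 * queue and validate it against the current subflow state
 */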
static enum mapping_status get_mapping_status(struct sock *ssk,
					      struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	if (mptcp_check_fallback(ssk))
		return MAPPING_DUMMY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0 len FIN pkts to the receive
			 * queue, those are the only 0 len pkts ever expected here,
			 * and we can admit no mapping only for 0 len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
								 mpext->dsn64);
			pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				if (updated && schedule_work(&msk->work))
					sock_hold((struct sock *)msk);

				return MAPPING_DATA_FIN;
			}
		} else {
			u64 data_fin_seq = mpext->data_seq + data_len - 1;

			/* If mpext->data_seq is a 32-bit value, data_fin_seq
			 * must also be limited to 32 bits.
			 */
			if (!mpext->dsn64)
				data_fin_seq &= GENMASK_ULL(31, 0);

			mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
			pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
				 data_fin_seq, mpext->dsn64);
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
	}
	WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}

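/* skip data already received at the MPTCP level: advance the subflow
 * copied_seq and drop the skb once it has been fully consumed
 */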
static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
				       u64 limit)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
	u32 incr;

	incr = limit >= skb->len ? skb->len + fin : limit;

	pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
		 subflow->map_subflow_seq);
	MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
	tcp_sk(ssk)->copied_seq += incr;
	if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
		sk_eat_skb(ssk, skb);
	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
		subflow->map_valid = 0;
}

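/* core of the subflow receive path: validate the current mapping and
 * classify the pending data as in-sequence, out-of-order or a protocol
 * error
 */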
static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (!skb_peek(&ssk->sk_receive_queue))
		subflow->data_avail = 0;
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk, msk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}
		if (status == MAPPING_DUMMY) {
			__mptcp_do_fallback(msk);
			skb = skb_peek(&ssk->sk_receive_queue);
			subflow->map_valid = 1;
			subflow->map_seq = READ_ONCE(msk->ack_seq);
			subflow->map_data_len = skb->len;
			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
						   subflow->ssn_offset;
			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
			return true;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack) {
			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
			break;
		} else if (after64(ack_seq, old_ack)) {
			subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
			break;
		}

		/* only accept in-sequence mapping. Old values are spurious
		 * retransmission
		 */
		mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	subflow->data_avail = 0;
	return false;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	return subflow_check_data_avail(sk);
}

/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
 * not the ssk one.
 *
 * In mptcp, rwin is about the mptcp-level connection data.
 *
 * Data that is still on the ssk rx queue can thus be ignored,
 * as far as the mptcp peer is concerned, that data is still inflight.
 * DSS ACK is updated when skb is moved to the mptcp rx queue.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space)
{
	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	const struct sock *sk = subflow->conn;

	*space = __mptcp_space(sk);
	*full_space = tcp_full_space(sk);
}

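/* data_ready hook: on listener sockets just wake the msk, otherwise check
 * for newly available mappings and propagate the event to the parent
 */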
static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	u16 state = 1 << inet_sk_state_load(sk);
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	msk = mptcp_sk(parent);
	if (state & TCPF_LISTEN) {
		set_bit(MPTCP_DATA_READY, &msk->flags);
		parent->sk_data_ready(parent);
		return;
	}

	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
		     !subflow->mp_join && !(state & TCPF_CLOSE));

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *ssk)
{
	struct sock *sk = mptcp_subflow_ctx(ssk)->conn;

	mptcp_propagate_sndbuf(sk, ssk);
	mptcp_write_space(sk);
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
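/* switch a v6 subflow between the native v6 af_ops and the v4-mapped
 * variant, depending on the address family actually in use
 */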
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

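/* convert a PM address into a sockaddr of the given family, handling
 * v4-mapped-v6 conversion in both directions
 */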
void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
			 struct sockaddr_storage *addr,
			 unsigned short family)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		if (info->family == AF_INET)
			in_addr->sin_addr = info->addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
		else if (ipv6_addr_v4mapped(&info->addr6))
			in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3];
#endif
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		if (info->family == AF_INET)
			ipv6_addr_set_v4mapped(info->addr.s_addr,
					       &in6_addr->sin6_addr);
		else
			in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}

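/* create and connect an additional MP_JOIN subflow from the local address
 * @loc to @remote on behalf of the given msk
 */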
int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	int remote_id = remote->id;
	int local_id = loc->id;
	struct socket *sf;
	struct sock *ssk;
	u32 remote_token;
	int addrlen;
	int err;

	if (!mptcp_is_fully_established(sk))
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	ssk = sf->sk;
	subflow = mptcp_subflow_ctx(ssk);
	do {
		get_random_bytes(&subflow->local_nonce, sizeof(u32));
	} while (!subflow->local_nonce);

	if (!local_id) {
		err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
		if (err < 0)
			goto failed;

		local_id = err;
	}

	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr, ssk->sk_family);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (addr.ss_family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	ssk->sk_bound_dev_if = loc->ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
		 remote_token, local_id, remote_id);
	subflow->remote_token = remote_token;
	subflow->local_id = local_id;
	subflow->remote_id = remote_id;
	subflow->request_join = 1;
	subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
	mptcp_info2sockaddr(remote, &addr, ssk->sk_family);

	mptcp_add_pending_subflow(msk, subflow);
	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed_unlink;

	/* discard the subflow socket */
	mptcp_sock_graft(ssk, sk->sk_socket);
	iput(SOCK_INODE(sf));
	return err;

failed_unlink:
	spin_lock_bh(&msk->join_list_lock);
	list_del(&subflow->node);
	spin_unlock_bh(&msk->join_list_lock);

failed:
	subflow->disposable = 1;
	sock_release(sf);
	return err;
}

static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
	struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
				*child_skcd = &child->sk_cgrp_data;

	/* only the additional subflows created by kworkers have to be modified */
	if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
	    cgroup_id(sock_cgroup_ptr(child_skcd))) {
#ifdef CONFIG_MEMCG
		struct mem_cgroup *memcg = parent->sk_memcg;

		mem_cgroup_sk_free(child);
		if (memcg && css_tryget(&memcg->css))
			child->sk_memcg = memcg;
#endif /* CONFIG_MEMCG */

		cgroup_sk_free(child_skcd);
		*child_skcd = *parent_skcd;
		cgroup_sk_clone(child_skcd);
	}
#endif /* CONFIG_SOCK_CGROUP_DATA */
}

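/* swap the subflow sk_prot with a copy whose release_cb also processes the
 * MPTCP delegated actions, and back when the override is undone
 */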
static void mptcp_subflow_ops_override(struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (ssk->sk_prot == &tcpv6_prot)
		ssk->sk_prot = &tcpv6_prot_override;
	else
#endif
		ssk->sk_prot = &tcp_prot_override;
}

static void mptcp_subflow_ops_undo_override(struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (ssk->sk_prot == &tcpv6_prot_override)
		ssk->sk_prot = &tcpv6_prot;
	else
#endif
		ssk->sk_prot = &tcp_prot;
}
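
/* create a kernel TCP socket acting as MPTCP subflow: attach the "mptcp"
 * ULP, inherit cgroup and inode identity from the msk owner and install
 * the overridden subflow ops
 */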
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	/* un-accepted server sockets can reach here - on bad configuration
	 * bail early to avoid greater trouble later
	 */
	if (unlikely(!sk->sk_socket))
		return -EINVAL;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* the newly created socket has to be in the same cgroup as its parent */
	mptcp_attach_cgroup(sk, sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err) {
		sock_release(sf);
		return err;
	}

	/* the newly created socket really belongs to the owning MPTCP master
	 * socket, even if for additional subflows the allocation is performed
	 * by a kernel workqueue. Adjust inode references, so that the
	 * procfs/diag interfaces really show this one belonging to the correct
	 * user.
	 */
	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;
	mptcp_subflow_ops_override(sf->sk);

	return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);
	INIT_LIST_HEAD(&ctx->delegated_node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

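/* state_change hook: handle the simultaneous connect fallback, kick the
 * data available machinery and report EOF to the msk on fallback sockets
 */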
static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	if (subflow_simultaneous_connect(sk)) {
		mptcp_propagate_sndbuf(parent, sk);
		mptcp_do_fallback(sk);
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
		pr_fallback(mptcp_sk(parent));
		subflow->conn_finished = 1;
		if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
			inet_sk_state_store(parent, TCP_ESTABLISHED);
			parent->sk_state_change(parent);
		}
	}

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (__mptcp_check_fallback(mptcp_sk(parent)) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		mptcp_subflow_eof(parent);
	}
}

static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}

static void subflow_ulp_release(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
	bool release = true;
	struct sock *sk;

	if (!ctx)
		return;

	sk = ctx->conn;
	if (sk) {
		/* if the msk has been orphaned, keep the ctx
		 * alive, will be freed by __mptcp_close_ssk(),
		 * when the subflow is still unaccepted
		 */
		release = ctx->disposable || list_empty(&ctx->node);
		sock_put(sk);
	}

	mptcp_subflow_ops_undo_override(ssk);
	if (release)
		kfree_rcu(ctx, rcu);
}

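/* ULP clone hook, invoked for each child created by syn_recv_sock(): move
 * the request socket state into a freshly allocated subflow context, or
 * fall back to plain TCP when the handshake was not MPTCP
 */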
static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->remote_id = subflow_req->remote_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}

static void tcp_release_cb_override(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);

	if (mptcp_subflow_has_delegated_action(subflow))
		mptcp_subflow_process_delegated(ssk);

	tcp_release_cb(ssk);
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}

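/* register the request sock ops, the per-family af_ops/proto overrides and
 * the "mptcp" ULP used by every subflow
 */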
void __init mptcp_subflow_init(void)
{
	mptcp_subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;

	tcp_prot_override = tcp_prot;
	tcp_prot_override.release_cb = tcp_release_cb_override;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;

	tcpv6_prot_override = tcpv6_prot;
	tcpv6_prot_override.release_cb = tcp_release_cb_override;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}