// SPDX-License-Identifier: GPL-2.0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP to API glue.
 *
 * Authors:	see ip.c
 *
 * Fixes:
 *		Many		:	Split from ip.c , see ip.c for history.
 *		Martin Mares	:	TOS setting fixed.
 *		Alan Cox	:	Fixed a couple of oopses in Martin's
 *					TOS tweaks.
 *		Mike McLagan	:	Routing by source
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/icmp.h>
25
#include <linux/inetdevice.h>
L
Linus Torvalds 已提交
26
#include <linux/netdevice.h>
27
#include <linux/slab.h>
L
Linus Torvalds 已提交
28 29 30
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
31
#include <net/tcp_states.h>
L
Linus Torvalds 已提交
32 33 34 35 36
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/netfilter.h>
#include <linux/route.h>
#include <linux/mroute.h>
37
#include <net/inet_ecn.h>
L
Linus Torvalds 已提交
38 39
#include <net/route.h>
#include <net/xfrm.h>
40
#include <net/compat.h>
41
#include <net/checksum.h>
E
Eric Dumazet 已提交
42
#if IS_ENABLED(CONFIG_IPV6)
L
Linus Torvalds 已提交
43 44
#include <net/transp_v6.h>
#endif
45
#include <net/ip_fib.h>
L
Linus Torvalds 已提交
46 47

#include <linux/errqueue.h>
48
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
49

50 51
#include <linux/bpfilter.h>

L
Linus Torvalds 已提交
52 53 54 55 56 57
/*
 *	SOL_IP control messages.
 */

static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
58
	struct in_pktinfo info = *PKTINFO_SKB_CB(skb);
L
Linus Torvalds 已提交
59

60
	info.ipi_addr.s_addr = ip_hdr(skb)->daddr;
L
Linus Torvalds 已提交
61 62 63 64 65 66

	put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
}

static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
{
67
	int ttl = ip_hdr(skb)->ttl;
L
Linus Torvalds 已提交
68 69 70 71 72
	put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl);
}

static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
{
73
	put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos);
L
Linus Torvalds 已提交
74 75 76 77 78 79 80
}

static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
{
	if (IPCB(skb)->opt.optlen == 0)
		return;

81 82
	put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen,
		 ip_hdr(skb) + 1);
L
Linus Torvalds 已提交
83 84 85
}


86 87
static void ip_cmsg_recv_retopts(struct net *net, struct msghdr *msg,
				 struct sk_buff *skb)
L
Linus Torvalds 已提交
88 89
{
	unsigned char optbuf[sizeof(struct ip_options) + 40];
D
Daniel Baluta 已提交
90
	struct ip_options *opt = (struct ip_options *)optbuf;
L
Linus Torvalds 已提交
91 92 93 94

	if (IPCB(skb)->opt.optlen == 0)
		return;

95
	if (ip_options_echo(net, opt, skb)) {
L
Linus Torvalds 已提交
96 97 98 99 100 101 102 103
		msg->msg_flags |= MSG_CTRUNC;
		return;
	}
	ip_options_undo(opt);

	put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
}

W
Willem de Bruijn 已提交
104 105 106 107 108 109 110 111 112 113 114
/*
 *	Append an IP_RECVFRAGSIZE control message to @msg carrying the
 *	frag_max_size recorded in the skb control block, as an int.
 *	Nothing is emitted when that value is zero.
 */
static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb)
{
	int val;

	if (IPCB(skb)->frag_max_size == 0)
		return;

	val = IPCB(skb)->frag_max_size;
	put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(val), &val);
}

115
static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
E
Eric Dumazet 已提交
116
				  int tlen, int offset)
117 118 119 120 121 122
{
	__wsum csum = skb->csum;

	if (skb->ip_summed != CHECKSUM_COMPLETE)
		return;

P
Paolo Abeni 已提交
123 124 125 126
	if (offset != 0) {
		int tend_off = skb_transport_offset(skb) + tlen;
		csum = csum_sub(csum, skb_checksum(skb, tend_off, offset, 0));
	}
127 128 129 130

	put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
}

C
Catherine Zhang 已提交
131 132 133
static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
{
	char *secdata;
134
	u32 seclen, secid;
C
Catherine Zhang 已提交
135 136
	int err;

137 138 139 140 141
	err = security_socket_getpeersec_dgram(NULL, skb, &secid);
	if (err)
		return;

	err = security_secid_to_secctx(secid, &secdata, &seclen);
C
Catherine Zhang 已提交
142 143 144 145
	if (err)
		return;

	put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata);
146
	security_release_secctx(secdata, seclen);
C
Catherine Zhang 已提交
147 148
}

149
static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
150 151
{
	struct sockaddr_in sin;
152
	const struct iphdr *iph = ip_hdr(skb);
153
	__be16 *ports = (__be16 *)skb_transport_header(skb);
154

155
	if (skb_transport_offset(skb) + 4 > (int)skb->len)
156 157 158 159 160 161 162 163 164 165 166 167 168 169
		return;

	/* All current transport protocols have the port numbers in the
	 * first four bytes of the transport header and this function is
	 * written with this assumption in mind.
	 */

	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = iph->daddr;
	sin.sin_port = ports[1];
	memset(sin.sin_zero, 0, sizeof(sin.sin_zero));

	put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
}
L
Linus Torvalds 已提交
170

171 172
void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
			 struct sk_buff *skb, int tlen, int offset)
L
Linus Torvalds 已提交
173
{
174
	struct inet_sock *inet = inet_sk(sk);
175
	unsigned int flags = inet->cmsg_flags;
L
Linus Torvalds 已提交
176 177

	/* Ordered by supposed usage frequency */
T
Tom Herbert 已提交
178
	if (flags & IP_CMSG_PKTINFO) {
L
Linus Torvalds 已提交
179 180
		ip_cmsg_recv_pktinfo(msg, skb);

T
Tom Herbert 已提交
181 182 183 184 185 186
		flags &= ~IP_CMSG_PKTINFO;
		if (!flags)
			return;
	}

	if (flags & IP_CMSG_TTL) {
L
Linus Torvalds 已提交
187 188
		ip_cmsg_recv_ttl(msg, skb);

T
Tom Herbert 已提交
189 190 191 192 193 194
		flags &= ~IP_CMSG_TTL;
		if (!flags)
			return;
	}

	if (flags & IP_CMSG_TOS) {
L
Linus Torvalds 已提交
195 196
		ip_cmsg_recv_tos(msg, skb);

T
Tom Herbert 已提交
197 198 199 200 201 202
		flags &= ~IP_CMSG_TOS;
		if (!flags)
			return;
	}

	if (flags & IP_CMSG_RECVOPTS) {
L
Linus Torvalds 已提交
203 204
		ip_cmsg_recv_opts(msg, skb);

T
Tom Herbert 已提交
205 206 207 208 209 210
		flags &= ~IP_CMSG_RECVOPTS;
		if (!flags)
			return;
	}

	if (flags & IP_CMSG_RETOPTS) {
211
		ip_cmsg_recv_retopts(sock_net(sk), msg, skb);
C
Catherine Zhang 已提交
212

T
Tom Herbert 已提交
213 214 215 216 217 218
		flags &= ~IP_CMSG_RETOPTS;
		if (!flags)
			return;
	}

	if (flags & IP_CMSG_PASSSEC) {
C
Catherine Zhang 已提交
219
		ip_cmsg_recv_security(msg, skb);
220

T
Tom Herbert 已提交
221 222 223 224 225
		flags &= ~IP_CMSG_PASSSEC;
		if (!flags)
			return;
	}

226
	if (flags & IP_CMSG_ORIGDSTADDR) {
227 228
		ip_cmsg_recv_dstaddr(msg, skb);

229 230 231 232 233 234
		flags &= ~IP_CMSG_ORIGDSTADDR;
		if (!flags)
			return;
	}

	if (flags & IP_CMSG_CHECKSUM)
E
Eric Dumazet 已提交
235
		ip_cmsg_recv_checksum(msg, skb, tlen, offset);
W
Willem de Bruijn 已提交
236 237 238

	if (flags & IP_CMSG_RECVFRAGSIZE)
		ip_cmsg_recv_fragsize(msg, skb);
L
Linus Torvalds 已提交
239
}
240
EXPORT_SYMBOL(ip_cmsg_recv_offset);
L
Linus Torvalds 已提交
241

242
int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
243
		 bool allow_ipv6)
L
Linus Torvalds 已提交
244
{
245
	int err, val;
L
Linus Torvalds 已提交
246
	struct cmsghdr *cmsg;
247
	struct net *net = sock_net(sk);
L
Linus Torvalds 已提交
248

249
	for_each_cmsghdr(cmsg, msg) {
L
Linus Torvalds 已提交
250 251
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
252
#if IS_ENABLED(CONFIG_IPV6)
253 254 255 256 257 258 259 260 261 262
		if (allow_ipv6 &&
		    cmsg->cmsg_level == SOL_IPV6 &&
		    cmsg->cmsg_type == IPV6_PKTINFO) {
			struct in6_pktinfo *src_info;

			if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info)))
				return -EINVAL;
			src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
			if (!ipv6_addr_v4mapped(&src_info->ipi6_addr))
				return -EINVAL;
263 264
			if (src_info->ipi6_ifindex)
				ipc->oif = src_info->ipi6_ifindex;
265 266 267 268
			ipc->addr = src_info->ipi6_addr.s6_addr32[3];
			continue;
		}
#endif
269
		if (cmsg->cmsg_level == SOL_SOCKET) {
270 271 272
			err = __sock_cmsg_send(sk, msg, cmsg, &ipc->sockc);
			if (err)
				return err;
273 274 275
			continue;
		}

L
Linus Torvalds 已提交
276 277 278 279
		if (cmsg->cmsg_level != SOL_IP)
			continue;
		switch (cmsg->cmsg_type) {
		case IP_RETOPTS:
280
			err = cmsg->cmsg_len - sizeof(struct cmsghdr);
281 282

			/* Our caller is responsible for freeing ipc->opt */
E
Eric Dumazet 已提交
283 284
			err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
					     err < 40 ? err : 40);
L
Linus Torvalds 已提交
285 286 287 288 289 290 291 292 293
			if (err)
				return err;
			break;
		case IP_PKTINFO:
		{
			struct in_pktinfo *info;
			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
				return -EINVAL;
			info = (struct in_pktinfo *)CMSG_DATA(cmsg);
294 295
			if (info->ipi_ifindex)
				ipc->oif = info->ipi_ifindex;
L
Linus Torvalds 已提交
296 297 298
			ipc->addr = info->ipi_spec_dst.s_addr;
			break;
		}
299 300 301 302 303 304 305 306 307
		case IP_TTL:
			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
				return -EINVAL;
			val = *(int *)CMSG_DATA(cmsg);
			if (val < 1 || val > 255)
				return -EINVAL;
			ipc->ttl = val;
			break;
		case IP_TOS:
308 309 310 311 312
			if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)))
				val = *(int *)CMSG_DATA(cmsg);
			else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8)))
				val = *(u8 *)CMSG_DATA(cmsg);
			else
313 314 315 316 317 318 319
				return -EINVAL;
			if (val < 0 || val > 255)
				return -EINVAL;
			ipc->tos = val;
			ipc->priority = rt_tos2priority(ipc->tos);
			break;

L
Linus Torvalds 已提交
320 321 322 323 324 325 326
		default:
			return -EINVAL;
		}
	}
	return 0;
}

E
Eric Dumazet 已提交
327
static void ip_ra_destroy_rcu(struct rcu_head *head)
E
Eric Dumazet 已提交
328
{
E
Eric Dumazet 已提交
329 330 331 332
	struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);

	sock_put(ra->saved_sk);
	kfree(ra);
E
Eric Dumazet 已提交
333
}
L
Linus Torvalds 已提交
334

E
Eric Dumazet 已提交
335 336
int ip_ra_control(struct sock *sk, unsigned char on,
		  void (*destructor)(struct sock *))
L
Linus Torvalds 已提交
337
{
338 339
	struct ip_ra_chain *ra, *new_ra;
	struct ip_ra_chain __rcu **rap;
340
	struct net *net = sock_net(sk);
L
Linus Torvalds 已提交
341

E
Eric Dumazet 已提交
342
	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
L
Linus Torvalds 已提交
343 344 345 346
		return -EINVAL;

	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;

347
	mutex_lock(&net->ipv4.ra_mutex);
348
	for (rap = &net->ipv4.ra_chain;
349
	     (ra = rcu_dereference_protected(*rap,
350
			lockdep_is_held(&net->ipv4.ra_mutex))) != NULL;
351
	     rap = &ra->next) {
L
Linus Torvalds 已提交
352 353
		if (ra->sk == sk) {
			if (on) {
354
				mutex_unlock(&net->ipv4.ra_mutex);
J
Jesper Juhl 已提交
355
				kfree(new_ra);
L
Linus Torvalds 已提交
356 357
				return -EADDRINUSE;
			}
E
Eric Dumazet 已提交
358 359
			/* dont let ip_call_ra_chain() use sk again */
			ra->sk = NULL;
360
			RCU_INIT_POINTER(*rap, ra->next);
361
			mutex_unlock(&net->ipv4.ra_mutex);
L
Linus Torvalds 已提交
362 363 364

			if (ra->destructor)
				ra->destructor(sk);
E
Eric Dumazet 已提交
365 366 367 368 369 370 371
			/*
			 * Delay sock_put(sk) and kfree(ra) after one rcu grace
			 * period. This guarantee ip_call_ra_chain() dont need
			 * to mess with socket refcounts.
			 */
			ra->saved_sk = sk;
			call_rcu(&ra->rcu, ip_ra_destroy_rcu);
L
Linus Torvalds 已提交
372 373 374
			return 0;
		}
	}
375
	if (!new_ra) {
376
		mutex_unlock(&net->ipv4.ra_mutex);
L
Linus Torvalds 已提交
377
		return -ENOBUFS;
378
	}
L
Linus Torvalds 已提交
379 380 381
	new_ra->sk = sk;
	new_ra->destructor = destructor;

382
	RCU_INIT_POINTER(new_ra->next, ra);
E
Eric Dumazet 已提交
383
	rcu_assign_pointer(*rap, new_ra);
L
Linus Torvalds 已提交
384
	sock_hold(sk);
385
	mutex_unlock(&net->ipv4.ra_mutex);
L
Linus Torvalds 已提交
386 387 388 389

	return 0;
}

390
void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
A
Al Viro 已提交
391
		   __be16 port, u32 info, u8 *payload)
L
Linus Torvalds 已提交
392 393 394 395 396 397 398
{
	struct sock_exterr_skb *serr;

	skb = skb_clone(skb, GFP_ATOMIC);
	if (!skb)
		return;

399
	serr = SKB_EXT_ERR(skb);
L
Linus Torvalds 已提交
400 401
	serr->ee.ee_errno = err;
	serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
402 403
	serr->ee.ee_type = icmp_hdr(skb)->type;
	serr->ee.ee_code = icmp_hdr(skb)->code;
L
Linus Torvalds 已提交
404 405 406
	serr->ee.ee_pad = 0;
	serr->ee.ee_info = info;
	serr->ee.ee_data = 0;
407
	serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) -
408
				   skb_network_header(skb);
L
Linus Torvalds 已提交
409 410
	serr->port = port;

411
	if (skb_pull(skb, payload - skb->data)) {
412 413 414 415 416
		skb_reset_transport_header(skb);
		if (sock_queue_err_skb(sk, skb) == 0)
			return;
	}
	kfree_skb(skb);
L
Linus Torvalds 已提交
417 418
}

A
Al Viro 已提交
419
/*
 *	Queue a locally generated error on the error queue of @sk.
 *
 *	Does nothing unless the socket has recverr set. A fresh skb holding
 *	only an IP header is allocated (GFP_ATOMIC); @daddr is stored in it
 *	so that addr_offset can point at it. The sock_exterr_skb area gets
 *	@err, origin SO_EE_ORIGIN_LOCAL, @info and @port. All data is then
 *	pulled, so the queued skb has no payload. The skb is freed if
 *	queueing fails; a failed allocation is silently ignored.
 */
void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sock_exterr_skb *serr;
	struct iphdr *iph;
	struct sk_buff *skb;

	if (!inet->recverr)
		return;

	skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
	if (!skb)
		return;

	skb_put(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->daddr = daddr;

	serr = SKB_EXT_ERR(skb);
	serr->ee.ee_errno = err;
	serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
	serr->ee.ee_type = 0;
	serr->ee.ee_code = 0;
	serr->ee.ee_pad = 0;
	serr->ee.ee_info = info;
	serr->ee.ee_data = 0;
	serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb);
	serr->port = port;

	__skb_pull(skb, skb_tail_pointer(skb) - skb->data);
	skb_reset_transport_header(skb);

	if (sock_queue_err_skb(sk, skb))
		kfree_skb(skb);
}

456 457 458 459 460 461 462 463 464
/* For some errors we have valid addr_offset even with zero payload and
 * zero port. Also, addr_offset should be supported if port is set.
 */
static inline bool ipv4_datagram_support_addr(struct sock_exterr_skb *serr)
{
	return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
	       serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port;
}

465 466 467 468 469 470 471 472
/* IPv4 supports cmsg on all imcp errors and some timestamps
 *
 * Timestamp code paths do not initialize the fields expected by cmsg:
 * the PKTINFO fields in skb->cb[]. Fill those in here.
 */
static bool ipv4_datagram_support_cmsg(const struct sock *sk,
				       struct sk_buff *skb,
				       int ee_origin)
473
{
474 475 476 477
	struct in_pktinfo *info;

	if (ee_origin == SO_EE_ORIGIN_ICMP)
		return true;
478

479 480 481 482
	if (ee_origin == SO_EE_ORIGIN_LOCAL)
		return false;

	/* Support IP_PKTINFO on tstamp packets if requested, to correlate
483
	 * timestamp with egress dev. Not possible for packets without iif
484 485
	 * or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
	 */
486 487 488
	info = PKTINFO_SKB_CB(skb);
	if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) ||
	    !info->ipi_ifindex)
489 490 491 492 493 494
		return false;

	info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr;
	return true;
}

495
/*
L
Linus Torvalds 已提交
496 497
 *	Handle MSG_ERRQUEUE
 */
498
int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
L
Linus Torvalds 已提交
499 500
{
	struct sock_exterr_skb *serr;
501
	struct sk_buff *skb;
502
	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
L
Linus Torvalds 已提交
503 504 505 506 507 508 509
	struct {
		struct sock_extended_err ee;
		struct sockaddr_in	 offender;
	} errhdr;
	int err;
	int copied;

510 511
	WARN_ON_ONCE(sk->sk_family == AF_INET6);

L
Linus Torvalds 已提交
512
	err = -EAGAIN;
513
	skb = sock_dequeue_err_skb(sk);
514
	if (!skb)
L
Linus Torvalds 已提交
515 516 517 518 519 520 521
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
522
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
523 524 525 526
	if (unlikely(err)) {
		kfree_skb(skb);
		return err;
	}
L
Linus Torvalds 已提交
527 528 529 530
	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);

531
	if (sin && ipv4_datagram_support_addr(serr)) {
L
Linus Torvalds 已提交
532
		sin->sin_family = AF_INET;
533 534
		sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
						   serr->addr_offset);
L
Linus Torvalds 已提交
535 536
		sin->sin_port = serr->port;
		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
537
		*addr_len = sizeof(*sin);
L
Linus Torvalds 已提交
538 539 540 541
	}

	memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
	sin = &errhdr.offender;
542
	memset(sin, 0, sizeof(*sin));
543

544
	if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) {
L
Linus Torvalds 已提交
545
		sin->sin_family = AF_INET;
546
		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
547
		if (inet_sk(sk)->cmsg_flags)
L
Linus Torvalds 已提交
548 549 550 551 552 553 554 555 556 557
			ip_cmsg_recv(msg, skb);
	}

	put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr);

	/* Now we could try to dump offended packet options */

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

558
	consume_skb(skb);
L
Linus Torvalds 已提交
559 560 561 562 563 564
out:
	return err;
}


/*
 *	Socket option code for IP. This is the end of the line after any
 *	TCP,UDP etc options on an IP socket.
 */
568 569 570 571 572
static bool setsockopt_needs_rtnl(int optname)
{
	switch (optname) {
	case IP_ADD_MEMBERSHIP:
	case IP_ADD_SOURCE_MEMBERSHIP:
573
	case IP_BLOCK_SOURCE:
574
	case IP_DROP_MEMBERSHIP:
575 576 577 578 579
	case IP_DROP_SOURCE_MEMBERSHIP:
	case IP_MSFILTER:
	case IP_UNBLOCK_SOURCE:
	case MCAST_BLOCK_SOURCE:
	case MCAST_MSFILTER:
580
	case MCAST_JOIN_GROUP:
581
	case MCAST_JOIN_SOURCE_GROUP:
582
	case MCAST_LEAVE_GROUP:
583 584
	case MCAST_LEAVE_SOURCE_GROUP:
	case MCAST_UNBLOCK_SOURCE:
585 586 587 588
		return true;
	}
	return false;
}
L
Linus Torvalds 已提交
589

590
static int do_ip_setsockopt(struct sock *sk, int level,
591
			    int optname, char __user *optval, unsigned int optlen)
L
Linus Torvalds 已提交
592 593
{
	struct inet_sock *inet = inet_sk(sk);
594
	struct net *net = sock_net(sk);
595
	int val = 0, err;
596
	bool needs_rtnl = setsockopt_needs_rtnl(optname);
L
Linus Torvalds 已提交
597

598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614
	switch (optname) {
	case IP_PKTINFO:
	case IP_RECVTTL:
	case IP_RECVOPTS:
	case IP_RECVTOS:
	case IP_RETOPTS:
	case IP_TOS:
	case IP_TTL:
	case IP_HDRINCL:
	case IP_MTU_DISCOVER:
	case IP_RECVERR:
	case IP_ROUTER_ALERT:
	case IP_FREEBIND:
	case IP_PASSSEC:
	case IP_TRANSPARENT:
	case IP_MINTTL:
	case IP_NODEFRAG:
615
	case IP_BIND_ADDRESS_NO_PORT:
616 617 618 619 620
	case IP_UNICAST_IF:
	case IP_MULTICAST_TTL:
	case IP_MULTICAST_ALL:
	case IP_MULTICAST_LOOP:
	case IP_RECVORIGDSTADDR:
621
	case IP_CHECKSUM:
W
Willem de Bruijn 已提交
622
	case IP_RECVFRAGSIZE:
L
Linus Torvalds 已提交
623 624 625 626 627 628 629 630 631 632 633 634 635 636
		if (optlen >= sizeof(int)) {
			if (get_user(val, (int __user *) optval))
				return -EFAULT;
		} else if (optlen >= sizeof(char)) {
			unsigned char ucval;

			if (get_user(ucval, (unsigned char __user *) optval))
				return -EFAULT;
			val = (int) ucval;
		}
	}

	/* If optlen==0, it is equivalent to val == 0 */

637 638
	if (optname == IP_ROUTER_ALERT)
		return ip_ra_control(sk, val ? 1 : 0, NULL);
639
	if (ip_mroute_opt(optname))
640
		return ip_mroute_setsockopt(sk, optname, optval, optlen);
L
Linus Torvalds 已提交
641 642

	err = 0;
643 644
	if (needs_rtnl)
		rtnl_lock();
L
Linus Torvalds 已提交
645 646 647
	lock_sock(sk);

	switch (optname) {
S
Stephen Hemminger 已提交
648 649
	case IP_OPTIONS:
	{
650 651
		struct ip_options_rcu *old, *opt = NULL;

652
		if (optlen > 40)
S
Stephen Hemminger 已提交
653
			goto e_inval;
654
		err = ip_options_get_from_user(sock_net(sk), &opt,
655
					       optval, optlen);
S
Stephen Hemminger 已提交
656 657
		if (err)
			break;
658
		old = rcu_dereference_protected(inet->inet_opt,
659
						lockdep_sock_is_held(sk));
S
Stephen Hemminger 已提交
660 661
		if (inet->is_icsk) {
			struct inet_connection_sock *icsk = inet_csk(sk);
E
Eric Dumazet 已提交
662
#if IS_ENABLED(CONFIG_IPV6)
S
Stephen Hemminger 已提交
663 664 665
			if (sk->sk_family == PF_INET ||
			    (!((1 << sk->sk_state) &
			       (TCPF_LISTEN | TCPF_CLOSE)) &&
E
Eric Dumazet 已提交
666
			     inet->inet_daddr != LOOPBACK4_IPV6)) {
L
Linus Torvalds 已提交
667
#endif
668 669
				if (old)
					icsk->icsk_ext_hdr_len -= old->opt.optlen;
S
Stephen Hemminger 已提交
670
				if (opt)
671
					icsk->icsk_ext_hdr_len += opt->opt.optlen;
S
Stephen Hemminger 已提交
672
				icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
E
Eric Dumazet 已提交
673
#if IS_ENABLED(CONFIG_IPV6)
L
Linus Torvalds 已提交
674
			}
S
Stephen Hemminger 已提交
675
#endif
L
Linus Torvalds 已提交
676
		}
677 678
		rcu_assign_pointer(inet->inet_opt, opt);
		if (old)
679
			kfree_rcu(old, rcu);
S
Stephen Hemminger 已提交
680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717
		break;
	}
	case IP_PKTINFO:
		if (val)
			inet->cmsg_flags |= IP_CMSG_PKTINFO;
		else
			inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
		break;
	case IP_RECVTTL:
		if (val)
			inet->cmsg_flags |=  IP_CMSG_TTL;
		else
			inet->cmsg_flags &= ~IP_CMSG_TTL;
		break;
	case IP_RECVTOS:
		if (val)
			inet->cmsg_flags |=  IP_CMSG_TOS;
		else
			inet->cmsg_flags &= ~IP_CMSG_TOS;
		break;
	case IP_RECVOPTS:
		if (val)
			inet->cmsg_flags |=  IP_CMSG_RECVOPTS;
		else
			inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
		break;
	case IP_RETOPTS:
		if (val)
			inet->cmsg_flags |= IP_CMSG_RETOPTS;
		else
			inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
		break;
	case IP_PASSSEC:
		if (val)
			inet->cmsg_flags |= IP_CMSG_PASSSEC;
		else
			inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
		break;
718 719 720 721 722 723
	case IP_RECVORIGDSTADDR:
		if (val)
			inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR;
		else
			inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR;
		break;
724 725 726 727 728 729 730 731 732 733 734 735 736
	case IP_CHECKSUM:
		if (val) {
			if (!(inet->cmsg_flags & IP_CMSG_CHECKSUM)) {
				inet_inc_convert_csum(sk);
				inet->cmsg_flags |= IP_CMSG_CHECKSUM;
			}
		} else {
			if (inet->cmsg_flags & IP_CMSG_CHECKSUM) {
				inet_dec_convert_csum(sk);
				inet->cmsg_flags &= ~IP_CMSG_CHECKSUM;
			}
		}
		break;
W
Willem de Bruijn 已提交
737 738 739 740 741 742 743 744
	case IP_RECVFRAGSIZE:
		if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM)
			goto e_inval;
		if (val)
			inet->cmsg_flags |= IP_CMSG_RECVFRAGSIZE;
		else
			inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE;
		break;
S
Stephen Hemminger 已提交
745 746
	case IP_TOS:	/* This sets both TOS and Precedence */
		if (sk->sk_type == SOCK_STREAM) {
747 748
			val &= ~INET_ECN_MASK;
			val |= inet->tos & INET_ECN_MASK;
S
Stephen Hemminger 已提交
749 750 751 752 753 754 755 756
		}
		if (inet->tos != val) {
			inet->tos = val;
			sk->sk_priority = rt_tos2priority(val);
			sk_dst_reset(sk);
		}
		break;
	case IP_TTL:
E
Eric Dumazet 已提交
757
		if (optlen < 1)
S
Stephen Hemminger 已提交
758
			goto e_inval;
759
		if (val != -1 && (val < 1 || val > 255))
S
Stephen Hemminger 已提交
760 761 762 763 764 765
			goto e_inval;
		inet->uc_ttl = val;
		break;
	case IP_HDRINCL:
		if (sk->sk_type != SOCK_RAW) {
			err = -ENOPROTOOPT;
C
Catherine Zhang 已提交
766
			break;
S
Stephen Hemminger 已提交
767 768 769
		}
		inet->hdrincl = val ? 1 : 0;
		break;
770 771 772 773 774 775 776
	case IP_NODEFRAG:
		if (sk->sk_type != SOCK_RAW) {
			err = -ENOPROTOOPT;
			break;
		}
		inet->nodefrag = val ? 1 : 0;
		break;
777 778 779
	case IP_BIND_ADDRESS_NO_PORT:
		inet->bind_address_no_port = val ? 1 : 0;
		break;
S
Stephen Hemminger 已提交
780
	case IP_MTU_DISCOVER:
781
		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
S
Stephen Hemminger 已提交
782 783 784 785 786 787 788 789 790 791 792
			goto e_inval;
		inet->pmtudisc = val;
		break;
	case IP_RECVERR:
		inet->recverr = !!val;
		if (!val)
			skb_queue_purge(&sk->sk_error_queue);
		break;
	case IP_MULTICAST_TTL:
		if (sk->sk_type == SOCK_STREAM)
			goto e_inval;
E
Eric Dumazet 已提交
793
		if (optlen < 1)
S
Stephen Hemminger 已提交
794
			goto e_inval;
795
		if (val == -1)
S
Stephen Hemminger 已提交
796 797 798 799 800 801
			val = 1;
		if (val < 0 || val > 255)
			goto e_inval;
		inet->mc_ttl = val;
		break;
	case IP_MULTICAST_LOOP:
E
Eric Dumazet 已提交
802
		if (optlen < 1)
S
Stephen Hemminger 已提交
803 804 805
			goto e_inval;
		inet->mc_loop = !!val;
		break;
806 807 808 809
	case IP_UNICAST_IF:
	{
		struct net_device *dev = NULL;
		int ifindex;
810
		int midx;
811 812 813 814 815 816 817 818 819 820 821 822 823 824 825

		if (optlen != sizeof(int))
			goto e_inval;

		ifindex = (__force int)ntohl((__force __be32)val);
		if (ifindex == 0) {
			inet->uc_index = 0;
			err = 0;
			break;
		}

		dev = dev_get_by_index(sock_net(sk), ifindex);
		err = -EADDRNOTAVAIL;
		if (!dev)
			break;
826 827

		midx = l3mdev_master_ifindex(dev);
828 829 830
		dev_put(dev);

		err = -EINVAL;
831 832
		if (sk->sk_bound_dev_if &&
		    (!midx || midx != sk->sk_bound_dev_if))
833 834 835 836 837 838
			break;

		inet->uc_index = ifindex;
		err = 0;
		break;
	}
S
Stephen Hemminger 已提交
839 840 841 842
	case IP_MULTICAST_IF:
	{
		struct ip_mreqn mreq;
		struct net_device *dev = NULL;
843
		int midx;
S
Stephen Hemminger 已提交
844 845 846 847 848 849 850

		if (sk->sk_type == SOCK_STREAM)
			goto e_inval;
		/*
		 *	Check the arguments are allowable
		 */

851 852 853
		if (optlen < sizeof(struct in_addr))
			goto e_inval;

S
Stephen Hemminger 已提交
854 855
		err = -EFAULT;
		if (optlen >= sizeof(struct ip_mreqn)) {
856
			if (copy_from_user(&mreq, optval, sizeof(mreq)))
L
Linus Torvalds 已提交
857
				break;
S
Stephen Hemminger 已提交
858 859
		} else {
			memset(&mreq, 0, sizeof(mreq));
860 861 862 863 864 865 866 867 868
			if (optlen >= sizeof(struct ip_mreq)) {
				if (copy_from_user(&mreq, optval,
						   sizeof(struct ip_mreq)))
					break;
			} else if (optlen >= sizeof(struct in_addr)) {
				if (copy_from_user(&mreq.imr_address, optval,
						   sizeof(struct in_addr)))
					break;
			}
S
Stephen Hemminger 已提交
869 870 871
		}

		if (!mreq.imr_ifindex) {
A
Al Viro 已提交
872
			if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) {
S
Stephen Hemminger 已提交
873 874 875
				inet->mc_index = 0;
				inet->mc_addr  = 0;
				err = 0;
L
Linus Torvalds 已提交
876 877
				break;
			}
878
			dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr);
E
Eric Dumazet 已提交
879
			if (dev)
S
Stephen Hemminger 已提交
880 881
				mreq.imr_ifindex = dev->ifindex;
		} else
E
Eric Dumazet 已提交
882
			dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex);
L
Linus Torvalds 已提交
883 884


S
Stephen Hemminger 已提交
885 886 887
		err = -EADDRNOTAVAIL;
		if (!dev)
			break;
888 889 890

		midx = l3mdev_master_ifindex(dev);

E
Eric Dumazet 已提交
891
		dev_put(dev);
S
Stephen Hemminger 已提交
892 893 894

		err = -EINVAL;
		if (sk->sk_bound_dev_if &&
895 896
		    mreq.imr_ifindex != sk->sk_bound_dev_if &&
		    (!midx || midx != sk->sk_bound_dev_if))
S
Stephen Hemminger 已提交
897
			break;
L
Linus Torvalds 已提交
898

S
Stephen Hemminger 已提交
899 900 901 902 903
		inet->mc_index = mreq.imr_ifindex;
		inet->mc_addr  = mreq.imr_address.s_addr;
		err = 0;
		break;
	}
L
Linus Torvalds 已提交
904

S
Stephen Hemminger 已提交
905 906 907 908
	case IP_ADD_MEMBERSHIP:
	case IP_DROP_MEMBERSHIP:
	{
		struct ip_mreqn mreq;
L
Linus Torvalds 已提交
909

910 911 912 913
		err = -EPROTO;
		if (inet_sk(sk)->is_icsk)
			break;

S
Stephen Hemminger 已提交
914 915 916 917
		if (optlen < sizeof(struct ip_mreq))
			goto e_inval;
		err = -EFAULT;
		if (optlen >= sizeof(struct ip_mreqn)) {
918
			if (copy_from_user(&mreq, optval, sizeof(mreq)))
L
Linus Torvalds 已提交
919
				break;
S
Stephen Hemminger 已提交
920 921
		} else {
			memset(&mreq, 0, sizeof(mreq));
922
			if (copy_from_user(&mreq, optval, sizeof(struct ip_mreq)))
L
Linus Torvalds 已提交
923
				break;
S
Stephen Hemminger 已提交
924
		}
L
Linus Torvalds 已提交
925

S
Stephen Hemminger 已提交
926
		if (optname == IP_ADD_MEMBERSHIP)
927
			err = ip_mc_join_group(sk, &mreq);
S
Stephen Hemminger 已提交
928
		else
929
			err = ip_mc_leave_group(sk, &mreq);
S
Stephen Hemminger 已提交
930 931 932 933 934 935 936 937 938 939
		break;
	}
	case IP_MSFILTER:
	{
		struct ip_msfilter *msf;

		if (optlen < IP_MSFILTER_SIZE(0))
			goto e_inval;
		if (optlen > sysctl_optmem_max) {
			err = -ENOBUFS;
L
Linus Torvalds 已提交
940 941
			break;
		}
942 943 944
		msf = memdup_user(optval, optlen);
		if (IS_ERR(msf)) {
			err = PTR_ERR(msf);
S
Stephen Hemminger 已提交
945 946 947 948
			break;
		}
		/* numsrc >= (1G-4) overflow in 32 bits */
		if (msf->imsf_numsrc >= 0x3ffffffcU ||
949
		    msf->imsf_numsrc > net->ipv4.sysctl_igmp_max_msf) {
S
Stephen Hemminger 已提交
950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969
			kfree(msf);
			err = -ENOBUFS;
			break;
		}
		if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
			kfree(msf);
			err = -EINVAL;
			break;
		}
		err = ip_mc_msfilter(sk, msf, 0);
		kfree(msf);
		break;
	}
	case IP_BLOCK_SOURCE:
	case IP_UNBLOCK_SOURCE:
	case IP_ADD_SOURCE_MEMBERSHIP:
	case IP_DROP_SOURCE_MEMBERSHIP:
	{
		struct ip_mreq_source mreqs;
		int omode, add;
L
Linus Torvalds 已提交
970

S
Stephen Hemminger 已提交
971 972 973
		if (optlen != sizeof(struct ip_mreq_source))
			goto e_inval;
		if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
L
Linus Torvalds 已提交
974 975 976
			err = -EFAULT;
			break;
		}
S
Stephen Hemminger 已提交
977 978 979 980 981 982 983 984
		if (optname == IP_BLOCK_SOURCE) {
			omode = MCAST_EXCLUDE;
			add = 1;
		} else if (optname == IP_UNBLOCK_SOURCE) {
			omode = MCAST_EXCLUDE;
			add = 0;
		} else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
			struct ip_mreqn mreq;
L
Linus Torvalds 已提交
985

S
Stephen Hemminger 已提交
986 987 988
			mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
			mreq.imr_address.s_addr = mreqs.imr_interface;
			mreq.imr_ifindex = 0;
989
			err = ip_mc_join_group(sk, &mreq);
S
Stephen Hemminger 已提交
990
			if (err && err != -EADDRINUSE)
L
Linus Torvalds 已提交
991
				break;
S
Stephen Hemminger 已提交
992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020
			omode = MCAST_INCLUDE;
			add = 1;
		} else /* IP_DROP_SOURCE_MEMBERSHIP */ {
			omode = MCAST_INCLUDE;
			add = 0;
		}
		err = ip_mc_source(add, omode, sk, &mreqs, 0);
		break;
	}
	case MCAST_JOIN_GROUP:
	case MCAST_LEAVE_GROUP:
	{
		struct group_req greq;
		struct sockaddr_in *psin;
		struct ip_mreqn mreq;

		if (optlen < sizeof(struct group_req))
			goto e_inval;
		err = -EFAULT;
		if (copy_from_user(&greq, optval, sizeof(greq)))
			break;
		psin = (struct sockaddr_in *)&greq.gr_group;
		if (psin->sin_family != AF_INET)
			goto e_inval;
		memset(&mreq, 0, sizeof(mreq));
		mreq.imr_multiaddr = psin->sin_addr;
		mreq.imr_ifindex = greq.gr_interface;

		if (optname == MCAST_JOIN_GROUP)
1021
			err = ip_mc_join_group(sk, &mreq);
S
Stephen Hemminger 已提交
1022
		else
1023
			err = ip_mc_leave_group(sk, &mreq);
S
Stephen Hemminger 已提交
1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038
		break;
	}
	case MCAST_JOIN_SOURCE_GROUP:
	case MCAST_LEAVE_SOURCE_GROUP:
	case MCAST_BLOCK_SOURCE:
	case MCAST_UNBLOCK_SOURCE:
	{
		struct group_source_req greqs;
		struct ip_mreq_source mreqs;
		struct sockaddr_in *psin;
		int omode, add;

		if (optlen != sizeof(struct group_source_req))
			goto e_inval;
		if (copy_from_user(&greqs, optval, sizeof(greqs))) {
L
Linus Torvalds 已提交
1039 1040 1041
			err = -EFAULT;
			break;
		}
S
Stephen Hemminger 已提交
1042 1043 1044
		if (greqs.gsr_group.ss_family != AF_INET ||
		    greqs.gsr_source.ss_family != AF_INET) {
			err = -EADDRNOTAVAIL;
L
Linus Torvalds 已提交
1045 1046
			break;
		}
S
Stephen Hemminger 已提交
1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059
		psin = (struct sockaddr_in *)&greqs.gsr_group;
		mreqs.imr_multiaddr = psin->sin_addr.s_addr;
		psin = (struct sockaddr_in *)&greqs.gsr_source;
		mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
		mreqs.imr_interface = 0; /* use index for mc_source */

		if (optname == MCAST_BLOCK_SOURCE) {
			omode = MCAST_EXCLUDE;
			add = 1;
		} else if (optname == MCAST_UNBLOCK_SOURCE) {
			omode = MCAST_EXCLUDE;
			add = 0;
		} else if (optname == MCAST_JOIN_SOURCE_GROUP) {
L
Linus Torvalds 已提交
1060 1061
			struct ip_mreqn mreq;

S
Stephen Hemminger 已提交
1062
			psin = (struct sockaddr_in *)&greqs.gsr_group;
L
Linus Torvalds 已提交
1063
			mreq.imr_multiaddr = psin->sin_addr;
S
Stephen Hemminger 已提交
1064 1065
			mreq.imr_address.s_addr = 0;
			mreq.imr_ifindex = greqs.gsr_interface;
1066
			err = ip_mc_join_group(sk, &mreq);
S
Stephen Hemminger 已提交
1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090
			if (err && err != -EADDRINUSE)
				break;
			greqs.gsr_interface = mreq.imr_ifindex;
			omode = MCAST_INCLUDE;
			add = 1;
		} else /* MCAST_LEAVE_SOURCE_GROUP */ {
			omode = MCAST_INCLUDE;
			add = 0;
		}
		err = ip_mc_source(add, omode, sk, &mreqs,
				   greqs.gsr_interface);
		break;
	}
	case MCAST_MSFILTER:
	{
		struct sockaddr_in *psin;
		struct ip_msfilter *msf = NULL;
		struct group_filter *gsf = NULL;
		int msize, i, ifindex;

		if (optlen < GROUP_FILTER_SIZE(0))
			goto e_inval;
		if (optlen > sysctl_optmem_max) {
			err = -ENOBUFS;
L
Linus Torvalds 已提交
1091 1092
			break;
		}
1093 1094 1095
		gsf = memdup_user(optval, optlen);
		if (IS_ERR(gsf)) {
			err = PTR_ERR(gsf);
L
Linus Torvalds 已提交
1096 1097
			break;
		}
E
Eric Dumazet 已提交
1098

S
Stephen Hemminger 已提交
1099 1100
		/* numsrc >= (4G-140)/128 overflow in 32 bits */
		if (gsf->gf_numsrc >= 0x1ffffff ||
1101
		    gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) {
S
Stephen Hemminger 已提交
1102 1103 1104 1105 1106 1107 1108 1109
			err = -ENOBUFS;
			goto mc_msf_out;
		}
		if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
			err = -EINVAL;
			goto mc_msf_out;
		}
		msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
1110
		msf = kmalloc(msize, GFP_KERNEL);
S
Stephen Hemminger 已提交
1111
		if (!msf) {
S
Stephen Hemminger 已提交
1112 1113 1114 1115 1116 1117
			err = -ENOBUFS;
			goto mc_msf_out;
		}
		ifindex = gsf->gf_interface;
		psin = (struct sockaddr_in *)&gsf->gf_group;
		if (psin->sin_family != AF_INET) {
L
Linus Torvalds 已提交
1118
			err = -EADDRNOTAVAIL;
S
Stephen Hemminger 已提交
1119
			goto mc_msf_out;
L
Linus Torvalds 已提交
1120
		}
S
Stephen Hemminger 已提交
1121 1122 1123 1124 1125
		msf->imsf_multiaddr = psin->sin_addr.s_addr;
		msf->imsf_interface = 0;
		msf->imsf_fmode = gsf->gf_fmode;
		msf->imsf_numsrc = gsf->gf_numsrc;
		err = -EADDRNOTAVAIL;
E
Eric Dumazet 已提交
1126
		for (i = 0; i < gsf->gf_numsrc; ++i) {
S
Stephen Hemminger 已提交
1127
			psin = (struct sockaddr_in *)&gsf->gf_slist[i];
1128

S
Stephen Hemminger 已提交
1129 1130 1131 1132 1133 1134 1135 1136
			if (psin->sin_family != AF_INET)
				goto mc_msf_out;
			msf->imsf_slist[i] = psin->sin_addr.s_addr;
		}
		kfree(gsf);
		gsf = NULL;

		err = ip_mc_msfilter(sk, msf, ifindex);
E
Eric Dumazet 已提交
1137
mc_msf_out:
S
Stephen Hemminger 已提交
1138 1139 1140 1141
		kfree(msf);
		kfree(gsf);
		break;
	}
1142 1143 1144 1145 1146 1147 1148
	case IP_MULTICAST_ALL:
		if (optlen < 1)
			goto e_inval;
		if (val != 0 && val != 1)
			goto e_inval;
		inet->mc_all = val;
		break;
S
Stephen Hemminger 已提交
1149 1150

	case IP_FREEBIND:
E
Eric Dumazet 已提交
1151
		if (optlen < 1)
S
Stephen Hemminger 已提交
1152 1153 1154 1155 1156 1157 1158
			goto e_inval;
		inet->freebind = !!val;
		break;

	case IP_IPSEC_POLICY:
	case IP_XFRM_POLICY:
		err = -EPERM;
1159
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
L
Linus Torvalds 已提交
1160
			break;
S
Stephen Hemminger 已提交
1161 1162
		err = xfrm_user_policy(sk, optname, optval, optlen);
		break;
L
Linus Torvalds 已提交
1163

1164
	case IP_TRANSPARENT:
1165 1166
		if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1167 1168 1169 1170 1171 1172 1173 1174
			err = -EPERM;
			break;
		}
		if (optlen < 1)
			goto e_inval;
		inet->transparent = !!val;
		break;

1175 1176 1177 1178 1179 1180 1181 1182
	case IP_MINTTL:
		if (optlen < 1)
			goto e_inval;
		if (val < 0 || val > 255)
			goto e_inval;
		inet->min_ttl = val;
		break;

S
Stephen Hemminger 已提交
1183 1184 1185
	default:
		err = -ENOPROTOOPT;
		break;
L
Linus Torvalds 已提交
1186 1187
	}
	release_sock(sk);
1188 1189
	if (needs_rtnl)
		rtnl_unlock();
L
Linus Torvalds 已提交
1190 1191 1192 1193
	return err;

e_inval:
	release_sock(sk);
1194 1195
	if (needs_rtnl)
		rtnl_unlock();
L
Linus Torvalds 已提交
1196 1197 1198
	return -EINVAL;
}

E
Eric Dumazet 已提交
1199
/**
1200
 * ipv4_pktinfo_prepare - transfer some info from rtable to skb
E
Eric Dumazet 已提交
1201 1202 1203
 * @sk: socket
 * @skb: buffer
 *
1204 1205
 * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
 * destination in skb->cb[] before dst drop.
S
stephen hemminger 已提交
1206
 * This way, receiver doesn't make cache line misses to read rtable.
E
Eric Dumazet 已提交
1207
 */
1208
void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
E
Eric Dumazet 已提交
1209
{
1210
	struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
1211 1212
	bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) ||
		       ipv6_sk_rxinfo(sk);
1213

1214
	if (prepare && skb_rtable(skb)) {
1215 1216 1217 1218
		/* skb->cb is overloaded: prior to this point it is IP{6}CB
		 * which has interface index (iif) as the first member of the
		 * underlying inet{6}_skb_parm struct. This code then overlays
		 * PKTINFO_SKB_CB and in_pktinfo also has iif as the first
1219 1220 1221 1222
		 * element so the iif is picked up from the prior IPCB. If iif
		 * is the loopback interface, then return the sending interface
		 * (e.g., process binds socket to eth0 for Tx which is
		 * redirected to loopback in the rtable/dst).
1223
		 */
1224 1225 1226 1227
		struct rtable *rt = skb_rtable(skb);
		bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags);

		if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
1228
			pktinfo->ipi_ifindex = inet_iif(skb);
1229 1230
		else if (l3slave && rt && rt->rt_iif)
			pktinfo->ipi_ifindex = rt->rt_iif;
1231

1232
		pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
1233 1234 1235 1236
	} else {
		pktinfo->ipi_ifindex = 0;
		pktinfo->ipi_spec_dst.s_addr = 0;
	}
1237
	skb_dst_drop(skb);
E
Eric Dumazet 已提交
1238 1239
}

1240
/*
 *	Public SOL_IP setsockopt entry point.
 *
 *	Returns -ENOPROTOOPT for any level other than SOL_IP.  Otherwise the
 *	request goes to do_ip_setsockopt().  With CONFIG_BPFILTER, an optname
 *	inside the bpfilter "set" range has its result replaced by the one
 *	from bpfilter_ip_set_sockopt().  With CONFIG_NETFILTER, a remaining
 *	-ENOPROTOOPT is handed to nf_setsockopt() unless the option is one of
 *	the ones explicitly excluded below.
 */
int ip_setsockopt(struct sock *sk, int level,
		int optname, char __user *optval, unsigned int optlen)
{
	int err;

	if (level != SOL_IP)
		return -ENOPROTOOPT;

	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
#ifdef CONFIG_BPFILTER
	if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
	    optname < BPFILTER_IPT_SET_MAX)
		err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
#endif
#ifdef CONFIG_NETFILTER
	/* we need to exclude all possible ENOPROTOOPTs except default case */
	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
			optname != IP_IPSEC_POLICY &&
			optname != IP_XFRM_POLICY &&
			!ip_mroute_opt(optname))
		err = nf_setsockopt(sk, PF_INET, optname, optval, optlen);
#endif
	return err;
}
EXPORT_SYMBOL(ip_setsockopt);
1265 1266

#ifdef CONFIG_COMPAT
1267
int compat_ip_setsockopt(struct sock *sk, int level, int optname,
1268
			 char __user *optval, unsigned int optlen)
1269 1270 1271 1272 1273 1274
{
	int err;

	if (level != SOL_IP)
		return -ENOPROTOOPT;

1275 1276 1277 1278
	if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER)
		return compat_mc_setsockopt(sk, level, optname, optval, optlen,
			ip_setsockopt);

1279 1280 1281 1282
	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
#ifdef CONFIG_NETFILTER
	/* we need to exclude all possible ENOPROTOOPTs except default case */
	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
1283 1284
			optname != IP_IPSEC_POLICY &&
			optname != IP_XFRM_POLICY &&
1285 1286 1287
			!ip_mroute_opt(optname))
		err = compat_nf_setsockopt(sk, PF_INET, optname, optval,
					   optlen);
1288 1289 1290
#endif
	return err;
}
1291
EXPORT_SYMBOL(compat_ip_setsockopt);
1292 1293
#endif

L
Linus Torvalds 已提交
1294
/*
 *	Get the options. Note for future reference: the GET of IP options
 *	returns the _received_ ones, whereas the SET sets the _sent_ ones.
 */

1299 1300 1301 1302 1303 1304 1305 1306 1307 1308
/*
 *	Tell whether a getsockopt option has to be served with the RTNL lock
 *	held.  Only IP_MSFILTER and MCAST_MSFILTER do.
 */
static bool getsockopt_needs_rtnl(int optname)
{
	return optname == IP_MSFILTER || optname == MCAST_MSFILTER;
}

1309
/*
 *	Worker behind ip_getsockopt() and compat_ip_getsockopt().
 *
 *	@sk:      socket being queried
 *	@level:   must be SOL_IP, otherwise -EOPNOTSUPP is returned
 *	@optname: option to read
 *	@optval:  user buffer receiving the value
 *	@optlen:  user pointer; read for the buffer size, written with the
 *		  number of bytes produced
 *	@flags:   stored into msg_flags for IP_PKTOPTIONS (callers in this
 *		  file pass 0 or MSG_CMSG_COMPAT)
 *
 *	Multicast-routing options are delegated to ip_mroute_getsockopt()
 *	before any lock is taken.  A negative user-supplied length yields
 *	-EINVAL.  Unknown options yield -ENOPROTOOPT.
 *
 *	Locking: the socket lock is held across the switch.  RTNL is taken
 *	only for the options reported by getsockopt_needs_rtnl(); those
 *	cases always leave through the "out" label, which is the only exit
 *	that drops RTNL.  All other exits release just the socket lock.
 *
 *	Most options produce an int in @val which is copied out at the end:
 *	as a single byte when the caller's buffer is shorter than an int
 *	(but not empty) and the value is in 0..255, otherwise as up to
 *	sizeof(int) bytes.
 */
static int do_ip_getsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int __user *optlen, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	bool needs_rtnl = getsockopt_needs_rtnl(optname);
	int val, err = 0;
	int len;

	if (level != SOL_IP)
		return -EOPNOTSUPP;

	if (ip_mroute_opt(optname))
		return ip_mroute_getsockopt(sk, optname, optval, optlen);

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	if (needs_rtnl)
		rtnl_lock();
	lock_sock(sk);

	switch (optname) {
	case IP_OPTIONS:
	{
		unsigned char optbuf[sizeof(struct ip_options)+40];
		struct ip_options *opt = (struct ip_options *)optbuf;
		struct ip_options_rcu *inet_opt;

		/* Snapshot the options under the socket lock, then work on
		 * the local copy after the lock has been released.
		 */
		inet_opt = rcu_dereference_protected(inet->inet_opt,
						     lockdep_sock_is_held(sk));
		opt->optlen = 0;
		if (inet_opt)
			memcpy(optbuf, &inet_opt->opt,
			       sizeof(struct ip_options) +
			       inet_opt->opt.optlen);
		release_sock(sk);

		if (opt->optlen == 0)
			return put_user(0, optlen);

		ip_options_undo(opt);

		len = min_t(unsigned int, len, opt->optlen);
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, opt->__data, len))
			return -EFAULT;
		return 0;
	}
	case IP_PKTINFO:
		val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
		break;
	case IP_RECVTTL:
		val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
		break;
	case IP_RECVTOS:
		val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
		break;
	case IP_RECVOPTS:
		val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
		break;
	case IP_RETOPTS:
		val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
		break;
	case IP_PASSSEC:
		val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
		break;
	case IP_RECVORIGDSTADDR:
		val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0;
		break;
	case IP_CHECKSUM:
		val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0;
		break;
	case IP_RECVFRAGSIZE:
		val = (inet->cmsg_flags & IP_CMSG_RECVFRAGSIZE) != 0;
		break;
	case IP_TOS:
		val = inet->tos;
		break;
	case IP_TTL:
	{
		struct net *net = sock_net(sk);

		/* uc_ttl == -1 means "unset": report the namespace default. */
		val = (inet->uc_ttl == -1 ?
		       net->ipv4.sysctl_ip_default_ttl :
		       inet->uc_ttl);
		break;
	}
	case IP_HDRINCL:
		val = inet->hdrincl;
		break;
	case IP_NODEFRAG:
		val = inet->nodefrag;
		break;
	case IP_BIND_ADDRESS_NO_PORT:
		val = inet->bind_address_no_port;
		break;
	case IP_MTU_DISCOVER:
		val = inet->pmtudisc;
		break;
	case IP_MTU:
	{
		struct dst_entry *dst;
		val = 0;
		dst = sk_dst_get(sk);
		if (dst) {
			val = dst_mtu(dst);
			dst_release(dst);
		}
		/* No cached route (or a zero MTU) is reported as -ENOTCONN. */
		if (!val) {
			release_sock(sk);
			return -ENOTCONN;
		}
		break;
	}
	case IP_RECVERR:
		val = inet->recverr;
		break;
	case IP_MULTICAST_TTL:
		val = inet->mc_ttl;
		break;
	case IP_MULTICAST_LOOP:
		val = inet->mc_loop;
		break;
	case IP_UNICAST_IF:
		val = (__force int)htonl((__u32) inet->uc_index);
		break;
	case IP_MULTICAST_IF:
	{
		struct in_addr addr;
		len = min_t(unsigned int, len, sizeof(struct in_addr));
		addr.s_addr = inet->mc_addr;
		release_sock(sk);

		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, &addr, len))
			return -EFAULT;
		return 0;
	}
	case IP_MSFILTER:
	{
		struct ip_msfilter msf;

		/* RTNL is held here, so every exit must go through "out". */
		if (len < IP_MSFILTER_SIZE(0)) {
			err = -EINVAL;
			goto out;
		}
		if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
			err = -EFAULT;
			goto out;
		}
		err = ip_mc_msfget(sk, &msf,
				   (struct ip_msfilter __user *)optval, optlen);
		goto out;
	}
	case MCAST_MSFILTER:
	{
		struct group_filter gsf;

		/* RTNL is held here, so every exit must go through "out". */
		if (len < GROUP_FILTER_SIZE(0)) {
			err = -EINVAL;
			goto out;
		}
		if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
			err = -EFAULT;
			goto out;
		}
		err = ip_mc_gsfget(sk, &gsf,
				   (struct group_filter __user *)optval,
				   optlen);
		goto out;
	}
	case IP_MULTICAST_ALL:
		val = inet->mc_all;
		break;
	case IP_PKTOPTIONS:
	{
		struct msghdr msg;

		release_sock(sk);

		if (sk->sk_type != SOCK_STREAM)
			return -ENOPROTOOPT;

		/* Only the three fields set here are initialised; the user
		 * buffer is used as the control-message area for put_cmsg().
		 */
		msg.msg_control = (__force void *) optval;
		msg.msg_controllen = len;
		msg.msg_flags = flags;

		if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
			struct in_pktinfo info;

			info.ipi_addr.s_addr = inet->inet_rcv_saddr;
			info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
			info.ipi_ifindex = inet->mc_index;
			put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
		}
		if (inet->cmsg_flags & IP_CMSG_TTL) {
			int hlim = inet->mc_ttl;
			put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
		}
		if (inet->cmsg_flags & IP_CMSG_TOS) {
			int tos = inet->rcv_tos;
			put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
		}
		/* NOTE(review): presumably put_cmsg() shrinks msg_controllen
		 * by what it wrote, making this the number of bytes used --
		 * confirm against put_cmsg().
		 */
		len -= msg.msg_controllen;
		return put_user(len, optlen);
	}
	case IP_FREEBIND:
		val = inet->freebind;
		break;
	case IP_TRANSPARENT:
		val = inet->transparent;
		break;
	case IP_MINTTL:
		val = inet->min_ttl;
		break;
	default:
		release_sock(sk);
		return -ENOPROTOOPT;
	}
	release_sock(sk);

	/* Short buffer and a byte-sized value: return a single byte. */
	if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) {
		unsigned char ucval = (unsigned char)val;
		len = 1;
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, &ucval, 1))
			return -EFAULT;
	} else {
		len = min_t(unsigned int, sizeof(int), len);
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, &val, len))
			return -EFAULT;
	}
	return 0;

out:
	release_sock(sk);
	if (needs_rtnl)
		rtnl_unlock();
	return err;
}

1556
/*
 *	Public SOL_IP getsockopt entry point.
 *
 *	Calls do_ip_getsockopt() with flags == 0.  With CONFIG_BPFILTER, an
 *	optname inside the bpfilter "get" range has its result replaced by
 *	the one from bpfilter_ip_get_sockopt().  With CONFIG_NETFILTER, a
 *	remaining -ENOPROTOOPT (other than for IP_PKTOPTIONS or a multicast
 *	routing option) is handed to nf_getsockopt(); on success the length
 *	it reports is written back to @optlen.
 */
int ip_getsockopt(struct sock *sk, int level,
		  int optname, char __user *optval, int __user *optlen)
{
	int err;

	err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
#ifdef CONFIG_BPFILTER
	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
	    optname < BPFILTER_IPT_GET_MAX)
		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
#endif
#ifdef CONFIG_NETFILTER
	/* we need to exclude all possible ENOPROTOOPTs except default case */
	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
			!ip_mroute_opt(optname)) {
		int len;

		if (get_user(len, optlen))
			return -EFAULT;

		err = nf_getsockopt(sk, PF_INET, optname, optval, &len);
		if (err >= 0)
			err = put_user(len, optlen);
		return err;
	}
#endif
	return err;
}
EXPORT_SYMBOL(ip_getsockopt);
1585 1586

#ifdef CONFIG_COMPAT
1587 1588
/*
 *	Compat (CONFIG_COMPAT) variant of ip_getsockopt().
 *
 *	MCAST_MSFILTER is routed through compat_mc_getsockopt(), which is
 *	given ip_getsockopt() as the handler to call.  Everything else goes
 *	to do_ip_getsockopt() with MSG_CMSG_COMPAT as flags, followed by the
 *	same bpfilter override as ip_getsockopt() and a netfilter fallback
 *	that uses compat_nf_getsockopt().
 */
int compat_ip_getsockopt(struct sock *sk, int level, int optname,
			 char __user *optval, int __user *optlen)
{
	int err;

	if (optname == MCAST_MSFILTER)
		return compat_mc_getsockopt(sk, level, optname, optval, optlen,
			ip_getsockopt);

	err = do_ip_getsockopt(sk, level, optname, optval, optlen,
		MSG_CMSG_COMPAT);

#ifdef CONFIG_BPFILTER
	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
	    optname < BPFILTER_IPT_GET_MAX)
		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
#endif
#ifdef CONFIG_NETFILTER
	/* we need to exclude all possible ENOPROTOOPTs except default case */
	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
			!ip_mroute_opt(optname)) {
		int len;

		if (get_user(len, optlen))
			return -EFAULT;

		err = compat_nf_getsockopt(sk, PF_INET, optname, optval, &len);
		if (err >= 0)
			err = put_user(len, optlen);
		return err;
	}
#endif
	return err;
}
EXPORT_SYMBOL(compat_ip_getsockopt);
1622
#endif