// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		RAW - implementation of IP "raw" sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() fixed up
 *		Alan Cox	:	ICMP error handling
 *		Alan Cox	:	EMSGSIZE if you send too big a packet
 *		Alan Cox	: 	Now uses generic datagrams and shared
 *					skbuff library. No more peek crashes,
 *					no more backlogs
 *		Alan Cox	:	Checks sk->broadcast.
 *		Alan Cox	:	Uses skb_free_datagram/skb_copy_datagram
 *		Alan Cox	:	Raw passes ip options too
 *		Alan Cox	:	Setsocketopt added
 *		Alan Cox	:	Fixed error return for broadcasts
 *		Alan Cox	:	Removed wake_up calls
 *		Alan Cox	:	Use ttl/tos
 *		Alan Cox	:	Cleaned up old debugging
 *		Alan Cox	:	Use new kernel side addresses
 *	Arnt Gulbrandsen	:	Fixed MSG_DONTROUTE in raw sockets.
 *		Alan Cox	:	BSD style RAW socket demultiplexing.
 *		Alan Cox	:	Beginnings of mrouted support.
 *		Alan Cox	:	Added IP_HDRINCL option.
 *		Alan Cox	:	Skip broadcast check if BSDism set.
 *		David S. Miller	:	New socket lookup architecture.
 */
35

A
Alan Cox 已提交
36
#include <linux/types.h>
A
Arun Sharma 已提交
37
#include <linux/atomic.h>
L
Linus Torvalds 已提交
38 39
#include <asm/byteorder.h>
#include <asm/current.h>
40
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
41 42 43 44 45
#include <asm/ioctls.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/kernel.h>
46
#include <linux/export.h>
L
Linus Torvalds 已提交
47 48 49 50 51 52 53 54 55
#include <linux/spinlock.h>
#include <linux/sockios.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/mroute.h>
#include <linux/netdevice.h>
#include <linux/in_route.h>
#include <linux/route.h>
#include <linux/skbuff.h>
56
#include <linux/igmp.h>
57
#include <net/net_namespace.h>
L
Linus Torvalds 已提交
58 59 60 61 62 63 64 65 66
#include <net/dst.h>
#include <net/sock.h>
#include <linux/ip.h>
#include <linux/net.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <net/snmp.h>
67
#include <net/tcp_states.h>
L
Linus Torvalds 已提交
68 69 70 71 72 73 74 75
#include <net/inet_common.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
76
#include <linux/compat.h>
77 78 79
#include <linux/uio.h>

struct raw_frag_vec {
A
Al Viro 已提交
80
	struct msghdr *msg;
81 82 83 84 85 86
	union {
		struct icmphdr icmph;
		char c[1];
	} hdr;
	int hlen;
};
L
Linus Torvalds 已提交
87

88
struct raw_hashinfo raw_v4_hashinfo = {
89
	.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
90
};
91
EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
L
Linus Torvalds 已提交
92

93
int raw_hash_sk(struct sock *sk)
L
Linus Torvalds 已提交
94
{
95
	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
96
	struct hlist_head *head;
L
Linus Torvalds 已提交
97

E
Eric Dumazet 已提交
98
	head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
99 100

	write_lock_bh(&h->lock);
L
Linus Torvalds 已提交
101
	sk_add_node(sk, head);
102
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
103
	write_unlock_bh(&h->lock);
104 105

	return 0;
106 107 108
}
EXPORT_SYMBOL_GPL(raw_hash_sk);

109
void raw_unhash_sk(struct sock *sk)
110
{
111 112
	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;

113 114
	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
115
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
116 117 118 119
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(raw_unhash_sk);

120
/*
 * Walk the hash chain starting at @sk (inclusive) and return the first
 * socket that matches, or NULL when the chain is exhausted.
 *
 * A socket matches when it belongs to @net, its inet_num equals @num,
 * its remote/local addresses are either unset (zero) or equal to
 * @raddr/@laddr, and raw_sk_bound_dev_eq() accepts its bound device for
 * @dif/@sdif.
 */
struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
			     unsigned short num, __be32 raddr, __be32 laddr,
			     int dif, int sdif)
{
	sk_for_each_from(sk) {
		struct inet_sock *inet = inet_sk(sk);

		if (!net_eq(sock_net(sk), net) || inet->inet_num != num)
			continue;
		if (inet->inet_daddr && inet->inet_daddr != raddr)
			continue;
		if (inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr)
			continue;
		if (raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
			return sk; /* gotcha */
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(__raw_v4_lookup);
L
Linus Torvalds 已提交
138 139 140 141 142

/*
 *	0 - deliver
 *	1 - block
 */
E
Eric Dumazet 已提交
143
static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
L
Linus Torvalds 已提交
144
{
E
Eric Dumazet 已提交
145 146
	struct icmphdr _hdr;
	const struct icmphdr *hdr;
L
Linus Torvalds 已提交
147

E
Eric Dumazet 已提交
148 149 150
	hdr = skb_header_pointer(skb, skb_transport_offset(skb),
				 sizeof(_hdr), &_hdr);
	if (!hdr)
L
Linus Torvalds 已提交
151 152
		return 1;

E
Eric Dumazet 已提交
153
	if (hdr->type < 32) {
L
Linus Torvalds 已提交
154 155
		__u32 data = raw_sk(sk)->filter.data;

E
Eric Dumazet 已提交
156
		return ((1U << hdr->type) & data) != 0;
L
Linus Torvalds 已提交
157 158 159 160 161 162 163 164 165 166 167 168
	}

	/* Do not block unknown ICMP types */
	return 0;
}

/* IP input processing comes here for RAW socket delivery.
 * Caller owns SKB, so we must make clones.
 *
 * RFC 1122: SHOULD pass TOS value up to the transport layer.
 * -> It does. And not only TOS, but all IP header.
 */
169
static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
L
Linus Torvalds 已提交
170
{
171
	int sdif = inet_sdif(skb);
172
	int dif = inet_iif(skb);
L
Linus Torvalds 已提交
173 174
	struct sock *sk;
	struct hlist_head *head;
175
	int delivered = 0;
176
	struct net *net;
L
Linus Torvalds 已提交
177

178 179
	read_lock(&raw_v4_hashinfo.lock);
	head = &raw_v4_hashinfo.ht[hash];
L
Linus Torvalds 已提交
180 181
	if (hlist_empty(head))
		goto out;
182

183
	net = dev_net(skb->dev);
184
	sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
185
			     iph->saddr, iph->daddr, dif, sdif);
L
Linus Torvalds 已提交
186 187

	while (sk) {
188
		delivered = 1;
189 190
		if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
		    ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
191
				   skb->dev->ifindex, sdif)) {
L
Linus Torvalds 已提交
192 193 194 195 196 197
			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

			/* Not releasing hash table! */
			if (clone)
				raw_rcv(sk, clone);
		}
198
		sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
L
Linus Torvalds 已提交
199
				     iph->saddr, iph->daddr,
200
				     dif, sdif);
L
Linus Torvalds 已提交
201 202
	}
out:
203
	read_unlock(&raw_v4_hashinfo.lock);
204
	return delivered;
L
Linus Torvalds 已提交
205 206
}

207 208 209 210 211
int raw_local_deliver(struct sk_buff *skb, int protocol)
{
	int hash;
	struct sock *raw_sk;

212 213
	hash = protocol & (RAW_HTABLE_SIZE - 1);
	raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
214 215 216 217 218 219 220 221 222 223 224 225

	/* If there maybe a raw socket we must check - if not we
	 * don't care less
	 */
	if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
		raw_sk = NULL;

	return raw_sk != NULL;

}

/*
 * Process an ICMP error (@skb, with extra data @info) for one raw socket.
 *
 * Redirects and "fragmentation needed" update the socket's routing state.
 * Beyond that, the error is reported only if the socket enabled recverr
 * or is in TCP_ESTABLISHED state.
 */
static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
{
	struct inet_sock *inet = inet_sk(sk);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	int harderr = 0;
	int err = 0;

	if (type == ICMP_REDIRECT) {
		ipv4_sk_redirect(skb, sk);
		return;
	}
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
		ipv4_sk_update_pmtu(skb, sk, info);

	/* Report error on raw socket, if:
	   1. User requested ip_recverr.
	   2. Socket is connected (otherwise the error indication
	      is useless without ip_recverr and error is hard).
	 */
	if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED)
		return;

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		return;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		harderr = 1;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH) {
			/* Code outside the conversion table. */
			err = EHOSTUNREACH;
		} else if (code == ICMP_FRAG_NEEDED) {
			err = EMSGSIZE;
			harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
		} else {
			err = icmp_err_convert[code].errno;
			harderr = icmp_err_convert[code].fatal;
		}
		break;
	case ICMP_TIME_EXCEEDED:
	default:
		err = EHOSTUNREACH;
		break;
	}

	if (inet->recverr) {
		const struct iphdr *iph = (const struct iphdr *)skb->data;
		u8 *payload;

		/* With hdrincl the reported payload starts at the IP header. */
		if (inet->hdrincl)
			payload = skb->data;
		else
			payload = skb->data + (iph->ihl << 2);
		ip_icmp_error(sk, skb, err, 0, info, payload);
	}

	if (inet->recverr || harderr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	}
}

286 287 288 289
/*
 * Deliver an ICMP error to every raw socket of @protocol that matches the
 * IP header found at skb->data.  Lookup uses that header's daddr as the
 * remote address and its saddr as the local address.
 */
void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
{
	const int hash = protocol & (RAW_HTABLE_SIZE - 1);
	const struct iphdr *iph;
	struct sock *sk;
	struct net *net;
	int dif, sdif;

	read_lock(&raw_v4_hashinfo.lock);

	sk = sk_head(&raw_v4_hashinfo.ht[hash]);
	if (!sk)
		goto unlock;

	dif = skb->dev->ifindex;
	sdif = inet_sdif(skb);
	net = dev_net(skb->dev);

	for (;;) {
		/* skb->data is re-read on every pass, as raw_err() ran. */
		iph = (const struct iphdr *)skb->data;

		sk = __raw_v4_lookup(net, sk, protocol,
				     iph->daddr, iph->saddr, dif, sdif);
		if (!sk)
			break;

		raw_err(sk, skb, info);
		sk = sk_next(sk);
	}

unlock:
	read_unlock(&raw_v4_hashinfo.lock);
}

D
Daniel Baluta 已提交
315
static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
316 317
{
	/* Charge it to the socket. */
318

319
	ipv4_pktinfo_prepare(sk, skb);
320
	if (sock_queue_rcv_skb(sk, skb) < 0) {
L
Linus Torvalds 已提交
321 322 323 324 325 326 327 328 329 330
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	return NET_RX_SUCCESS;
}

int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
W
Wang Chen 已提交
331
		atomic_inc(&sk->sk_drops);
L
Linus Torvalds 已提交
332 333 334
		kfree_skb(skb);
		return NET_RX_DROP;
	}
335
	nf_reset_ct(skb);
L
Linus Torvalds 已提交
336

337
	skb_push(skb, skb->data - skb_network_header(skb));
L
Linus Torvalds 已提交
338 339 340 341 342

	raw_rcv_skb(sk, skb);
	return 0;
}

343
/*
 * Transmit a datagram whose IP header is supplied by the caller in @msg.
 *
 * @length bytes are copied from @msg into a new skb routed via *@rtp.
 * When the skb takes over the route reference, *@rtp is set to NULL.
 * Returns 0 on success (also for MSG_PROBE, where nothing is sent) or a
 * negative errno.  -ENOBUFS is reported as 0 unless the socket has
 * recverr set.
 */
static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
			   struct msghdr *msg, size_t length,
			   struct rtable **rtp, unsigned int flags,
			   const struct sockcm_cookie *sockc)
{
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct rtable *rt = *rtp;
	struct sk_buff *skb;
	struct iphdr *iph;
	unsigned int iphlen;
	int hlen, tlen;
	int err;

	if (length > rt->dst.dev->mtu) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       rt->dst.dev->mtu);
		return -EMSGSIZE;
	}
	if (length < sizeof(struct iphdr))
		return -EINVAL;

	if (flags & MSG_PROBE)
		return 0;

	hlen = LL_RESERVED_SPACE(rt->dst.dev);
	tlen = rt->dst.dev->needed_tailroom;
	skb = sock_alloc_send_skb(sk, length + hlen + tlen + 15,
				  flags & MSG_DONTWAIT, &err);
	if (!skb)
		goto error;
	skb_reserve(skb, hlen);

	skb->priority = sk->sk_priority;
	skb->mark = sockc->mark;
	skb->tstamp = sockc->transmit_time;
	/* The skb now owns the route reference. */
	skb_dst_set(skb, &rt->dst);
	*rtp = NULL;

	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	skb_put(skb, length);

	skb->ip_summed = CHECKSUM_NONE;

	skb_setup_tx_timestamp(skb, sockc->tsflags);

	if (flags & MSG_CONFIRM)
		skb_set_dst_pending_confirm(skb, 1);

	skb->transport_header = skb->network_header;
	if (memcpy_from_msg(iph, msg, length)) {
		err = -EFAULT;
		goto error_free;
	}

	iphlen = iph->ihl * 4;

	/*
	 * We don't want to modify the ip header, but we do need to
	 * be sure that it won't cause problems later along the network
	 * stack.  Specifically we want to make sure that iph->ihl is a
	 * sane value.  If ihl points beyond the length of the buffer passed
	 * in, reject the frame as invalid
	 */
	if (iphlen > length) {
		err = -EINVAL;
		goto error_free;
	}

	if (iphlen >= sizeof(*iph)) {
		if (!iph->saddr)
			iph->saddr = fl4->saddr;
		iph->check = 0;
		iph->tot_len = htons(length);
		if (!iph->id)
			ip_select_ident(net, skb, NULL);

		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
		skb->transport_header += iphlen;
		if (iph->protocol == IPPROTO_ICMP &&
		    length >= iphlen + sizeof(struct icmphdr))
			icmp_out_count(net, icmp_hdr(skb)->type);
	}

	err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL,
		      rt->dst.dev, dst_output);
	if (err > 0)
		err = net_xmit_errno(err);
	if (err)
		goto error;

	return 0;

error_free:
	kfree_skb(skb);
error:
	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	if (err == -ENOBUFS && !inet->recverr)
		err = 0;
	return err;
}

447
static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4)
L
Linus Torvalds 已提交
448
{
449
	int err;
L
Linus Torvalds 已提交
450

451
	if (fl4->flowi4_proto != IPPROTO_ICMP)
H
Heiko Carstens 已提交
452
		return 0;
L
Linus Torvalds 已提交
453

454
	/* We only need the first two bytes. */
455 456
	rfv->hlen = 2;

A
Al Viro 已提交
457
	err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen);
458 459 460
	if (err)
		return err;

461 462
	fl4->fl4_icmp_type = rfv->hdr.icmph.type;
	fl4->fl4_icmp_code = rfv->hdr.icmph.code;
463

H
Heiko Carstens 已提交
464
	return 0;
L
Linus Torvalds 已提交
465 466
}

467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494
static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
		       struct sk_buff *skb)
{
	struct raw_frag_vec *rfv = from;

	if (offset < rfv->hlen) {
		int copy = min(rfv->hlen - offset, len);

		if (skb->ip_summed == CHECKSUM_PARTIAL)
			memcpy(to, rfv->hdr.c + offset, copy);
		else
			skb->csum = csum_block_add(
				skb->csum,
				csum_partial_copy_nocheck(rfv->hdr.c + offset,
							  to, copy, 0),
				odd);

		odd = 0;
		offset += copy;
		to += copy;
		len -= copy;

		if (!len)
			return 0;
	}

	offset -= rfv->hlen;

495
	return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb);
496 497
}

498
/*
 * sendmsg() handler for IPv4 raw sockets.
 *
 * Returns @len on success or a negative errno.  Messages longer than
 * 0xFFFF bytes fail with -EMSGSIZE, MSG_OOB with -EOPNOTSUPP.  With the
 * socket's hdrincl flag set the data goes out through raw_send_hdrinc();
 * otherwise it is appended with ip_append_data() using raw_getfrag().
 */
static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options_data opt_copy;
	struct raw_frag_vec rfv;
	struct rtable *rt = NULL;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	__be32 daddr;
	__be32 saddr;
	int free_opt = 0;
	int hdrincl;
	int err;
	u8 tos;

	if (len > 0xFFFF) {
		err = -EMSGSIZE;
		goto out;
	}

	/* hdrincl should be READ_ONCE(inet->hdrincl)
	 * but READ_ONCE() doesn't work with bit fields.
	 * Doing this indirectly yields the same result.
	 */
	hdrincl = inet->hdrincl;
	hdrincl = READ_ONCE(hdrincl);

	/*
	 *	Check the flags.  Mirror BSD error message compatibility.
	 */
	if (msg->msg_flags & MSG_OOB) {
		err = -EOPNOTSUPP;
		goto out;
	}

	/*
	 *	Get and verify the address.
	 */
	if (msg->msg_namelen) {
		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);

		if (msg->msg_namelen < sizeof(*usin)) {
			err = -EINVAL;
			goto out;
		}
		if (usin->sin_family != AF_INET) {
			pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n",
				     __func__, current->comm);
			/* A zero family is tolerated; anything else is not. */
			if (usin->sin_family) {
				err = -EAFNOSUPPORT;
				goto out;
			}
		}
		daddr = usin->sin_addr.s_addr;
		/* ANK: I did not forget to get protocol from port field.
		 * I just do not know, who uses this weirdness.
		 * IP_HDRINCL is much more convenient.
		 */
	} else {
		if (sk->sk_state != TCP_ESTABLISHED) {
			err = -EDESTADDRREQ;
			goto out;
		}
		daddr = inet->inet_daddr;
	}

	ipcm_init_sk(&ipc, inet);

	if (msg->msg_controllen) {
		err = ip_cmsg_send(sk, msg, &ipc, false);
		if (unlikely(err)) {
			kfree(ipc.opt);
			goto out;
		}
		/* Options set by the control message are freed at "done". */
		if (ipc.opt)
			free_opt = 1;
	}

	saddr = ipc.addr;
	ipc.addr = daddr;

	if (!ipc.opt) {
		struct ip_options_rcu *inet_opt;

		/* Fall back to a private copy of the socket's options. */
		rcu_read_lock();
		inet_opt = rcu_dereference(inet->inet_opt);
		if (inet_opt) {
			memcpy(&opt_copy, inet_opt,
			       sizeof(*inet_opt) + inet_opt->opt.optlen);
			ipc.opt = &opt_copy.opt;
		}
		rcu_read_unlock();
	}

	if (ipc.opt) {
		/* Linux does not mangle headers on raw sockets,
		 * so that IP options + IP_HDRINCL is non-sense.
		 */
		if (hdrincl) {
			err = -EINVAL;
			goto done;
		}
		if (ipc.opt->opt.srr) {
			if (!daddr) {
				err = -EINVAL;
				goto done;
			}
			daddr = ipc.opt->opt.faddr;
		}
	}

	tos = get_rtconn_flags(&ipc, sk);
	if (msg->msg_flags & MSG_DONTROUTE)
		tos |= RTO_ONLINK;

	if (ipv4_is_multicast(daddr)) {
		if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
			ipc.oif = inet->mc_index;
		if (!saddr)
			saddr = inet->mc_addr;
	} else if (!ipc.oif) {
		ipc.oif = inet->uc_index;
	} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
		/* oif is set, packet is to local broadcast and
		 * and uc_index is set. oif is most likely set
		 * by sk_bound_dev_if. If uc_index != oif check if the
		 * oif is an L3 master and uc_index is an L3 slave.
		 * If so, we want to allow the send using the uc_index.
		 */
		if (ipc.oif != inet->uc_index &&
		    ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
							      inet->uc_index)) {
			ipc.oif = inet->uc_index;
		}
	}

	flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos,
			   RT_SCOPE_UNIVERSE,
			   hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk) |
			    (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
			   daddr, saddr, 0, 0, sk->sk_uid);

	if (!hdrincl) {
		rfv.msg = msg;
		rfv.hlen = 0;

		err = raw_probe_proto_opt(&rfv, &fl4);
		if (err)
			goto done;
	}

	security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
	rt = ip_route_output_flow(net, &fl4, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto done;
	}

	if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) {
		err = -EACCES;
		goto done;
	}

	if (msg->msg_flags & MSG_CONFIRM) {
		if (msg->msg_flags & MSG_PROBE) {
			dst_confirm_neigh(&rt->dst, &fl4.daddr);
			/* Pure probe with no data: nothing left to send. */
			if (!len) {
				err = 0;
				goto done;
			}
		}
	}

	if (hdrincl) {
		err = raw_send_hdrinc(sk, &fl4, msg, len,
				      &rt, msg->msg_flags, &ipc.sockc);
	} else {
		if (!ipc.addr)
			ipc.addr = fl4.daddr;
		lock_sock(sk);
		err = ip_append_data(sk, &fl4, raw_getfrag,
				     &rfv, len, 0,
				     &ipc, &rt, msg->msg_flags);
		if (err) {
			ip_flush_pending_frames(sk);
		} else if (!(msg->msg_flags & MSG_MORE)) {
			err = ip_push_pending_frames(sk, &fl4);
			if (err == -ENOBUFS && !inet->recverr)
				err = 0;
		}
		release_sock(sk);
	}

done:
	if (free_opt)
		kfree(ipc.opt);
	ip_rt_put(rt);

out:
	if (err < 0)
		return err;
	return len;
}

static void raw_close(struct sock *sk, long timeout)
{
699
	/*
L
Lucas De Marchi 已提交
700
	 * Raw sockets may have direct kernel references. Kill them.
L
Linus Torvalds 已提交
701 702 703 704 705 706
	 */
	ip_ra_control(sk, 0, NULL);

	sk_common_release(sk);
}

707
/* destroy handler: discard pending frames while holding the socket lock. */
static void raw_destroy(struct sock *sk)
{
	lock_sock(sk);
	ip_flush_pending_frames(sk);
	release_sock(sk);
}

L
Linus Torvalds 已提交
714 715 716 717 718
/* This gets rid of all the nasties in af_inet. -DaveM */
static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
719
	u32 tb_id = RT_TABLE_LOCAL;
L
Linus Torvalds 已提交
720 721 722 723 724
	int ret = -EINVAL;
	int chk_addr_ret;

	if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
		goto out;
725 726 727 728 729 730 731 732

	if (sk->sk_bound_dev_if)
		tb_id = l3mdev_fib_table_by_index(sock_net(sk),
						 sk->sk_bound_dev_if) ? : tb_id;

	chk_addr_ret = inet_addr_type_table(sock_net(sk), addr->sin_addr.s_addr,
					    tb_id);

L
Linus Torvalds 已提交
733 734 735 736
	ret = -EADDRNOTAVAIL;
	if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
	    chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
		goto out;
E
Eric Dumazet 已提交
737
	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
L
Linus Torvalds 已提交
738
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
E
Eric Dumazet 已提交
739
		inet->inet_saddr = 0;  /* Use device */
L
Linus Torvalds 已提交
740 741 742 743 744 745 746 747 748 749
	sk_dst_reset(sk);
	ret = 0;
out:	return ret;
}

/*
 *	This should be easy, if there is something there
 *	we return it, otherwise we block.
 */

750 751
/*
 * recvmsg() handler for IPv4 raw sockets.
 *
 * Returns the number of bytes copied (or the full datagram length when
 * MSG_TRUNC was requested in @flags), or a non-zero error/result code:
 * -EOPNOTSUPP for MSG_OOB, the result of ip_recv_error() for
 * MSG_ERRQUEUE, or the error from dequeueing/copying the datagram.
 */
static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
		       int noblock, int flags, int *addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
	int err = -EOPNOTSUPP;
	struct sk_buff *skb;
	size_t copied;

	if (flags & MSG_OOB)
		return err;

	if (flags & MSG_ERRQUEUE)
		return ip_recv_error(sk, msg, len, addr_len);

	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (!skb)
		return err;

	/* Truncate to the caller's buffer and flag it. */
	copied = skb->len;
	if (len < copied) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}

	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (!err) {
		sock_recv_ts_and_drops(msg, sk, skb);

		/* Copy the address. */
		if (sin) {
			sin->sin_family = AF_INET;
			sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
			sin->sin_port = 0;
			memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
			*addr_len = sizeof(*sin);
		}
		if (inet->cmsg_flags)
			ip_cmsg_recv(msg, skb);
		if (flags & MSG_TRUNC)
			copied = skb->len;
	}

	skb_free_datagram(sk, skb);

	if (err)
		return err;
	return copied;
}

803
static int raw_sk_init(struct sock *sk)
L
Linus Torvalds 已提交
804 805 806
{
	struct raw_sock *rp = raw_sk(sk);

E
Eric Dumazet 已提交
807
	if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
L
Linus Torvalds 已提交
808 809 810 811
		memset(&rp->filter, 0, sizeof(rp->filter));
	return 0;
}

812
/*
 * Copy at most sizeof(struct icmp_filter) bytes from @optval into the
 * socket's ICMP filter.  Returns 0, or -EFAULT if the copy fails.
 */
static int raw_seticmpfilter(struct sock *sk, sockptr_t optval, int optlen)
{
	/* Longer values are silently truncated to the filter size. */
	if (optlen > sizeof(struct icmp_filter))
		optlen = sizeof(struct icmp_filter);

	return copy_from_sockptr(&raw_sk(sk)->filter, optval, optlen) ?
		-EFAULT : 0;
}

/*
 * Copy the socket's ICMP filter to user space.
 *
 * The length read from @optlen is clamped to sizeof(struct icmp_filter)
 * and written back.  Returns 0, -EINVAL for a negative length, or -EFAULT
 * if any user-space access fails.
 */
static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen)
{
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	if (len > sizeof(struct icmp_filter))
		len = sizeof(struct icmp_filter);

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &raw_sk(sk)->filter, len))
		return -EFAULT;

	return 0;
}

840
static int do_raw_setsockopt(struct sock *sk, int level, int optname,
841
			     sockptr_t optval, unsigned int optlen)
L
Linus Torvalds 已提交
842 843
{
	if (optname == ICMP_FILTER) {
E
Eric Dumazet 已提交
844
		if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
L
Linus Torvalds 已提交
845 846 847 848 849 850 851
			return -EOPNOTSUPP;
		else
			return raw_seticmpfilter(sk, optval, optlen);
	}
	return -ENOPROTOOPT;
}

852
static int raw_setsockopt(struct sock *sk, int level, int optname,
853
			  sockptr_t optval, unsigned int optlen)
L
Linus Torvalds 已提交
854 855
{
	if (level != SOL_RAW)
856 857 858
		return ip_setsockopt(sk, level, optname, optval, optlen);
	return do_raw_setsockopt(sk, level, optname, optval, optlen);
}
L
Linus Torvalds 已提交
859

860 861 862
/*
 * Read SOL_RAW socket options.  Only ICMP_FILTER is known, and only on
 * sockets whose inet_num is IPPROTO_ICMP (-EOPNOTSUPP otherwise).
 * Unknown options yield -ENOPROTOOPT.
 */
static int do_raw_getsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	if (optname != ICMP_FILTER)
		return -ENOPROTOOPT;

	if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
		return -EOPNOTSUPP;

	return raw_geticmpfilter(sk, optval, optlen);
}

872 873 874 875 876 877 878 879
/* getsockopt entry point: SOL_RAW is handled here, the rest by IP. */
static int raw_getsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	if (level == SOL_RAW)
		return do_raw_getsockopt(sk, level, optname, optval, optlen);

	return ip_getsockopt(sk, level, optname, optval, optlen);
}

L
Linus Torvalds 已提交
880 881 882
static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	switch (cmd) {
J
Joe Perches 已提交
883 884
	case SIOCOUTQ: {
		int amount = sk_wmem_alloc_get(sk);
885

J
Joe Perches 已提交
886 887 888 889 890 891 892 893
		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ: {
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
894
		if (skb)
J
Joe Perches 已提交
895 896 897 898
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
L
Linus Torvalds 已提交
899

J
Joe Perches 已提交
900
	default:
L
Linus Torvalds 已提交
901
#ifdef CONFIG_IP_MROUTE
J
Joe Perches 已提交
902
		return ipmr_ioctl(sk, cmd, (void __user *)arg);
L
Linus Torvalds 已提交
903
#else
J
Joe Perches 已提交
904
		return -ENOIOCTLCMD;
L
Linus Torvalds 已提交
905 906 907 908
#endif
	}
}

909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl handler.  SIOCOUTQ and SIOCINQ return -ENOIOCTLCMD here;
 * other commands go to ipmr_compat_ioctl() when CONFIG_IP_MROUTE is
 * enabled and are rejected with -ENOIOCTLCMD otherwise.
 */
static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
{
	if (cmd == SIOCOUTQ || cmd == SIOCINQ)
		return -ENOIOCTLCMD;

#ifdef CONFIG_IP_MROUTE
	return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg));
#else
	return -ENOIOCTLCMD;
#endif
}
#endif

926 927 928 929 930 931
/*
 * Abort @sk with error @err: record and report the error, then
 * disconnect the socket, all under the socket lock.  Always returns 0.
 */
int raw_abort(struct sock *sk, int err)
{
	lock_sock(sk);

	sk->sk_err = err;
	sk->sk_error_report(sk);
	__udp_disconnect(sk, 0);

	release_sock(sk);

	return 0;
}
EXPORT_SYMBOL_GPL(raw_abort);

L
Linus Torvalds 已提交
940
struct proto raw_prot = {
941 942 943
	.name		   = "RAW",
	.owner		   = THIS_MODULE,
	.close		   = raw_close,
D
Denis V. Lunev 已提交
944
	.destroy	   = raw_destroy,
945
	.connect	   = ip4_datagram_connect,
946
	.disconnect	   = __udp_disconnect,
947
	.ioctl		   = raw_ioctl,
948
	.init		   = raw_sk_init,
949 950 951 952 953 954
	.setsockopt	   = raw_setsockopt,
	.getsockopt	   = raw_getsockopt,
	.sendmsg	   = raw_sendmsg,
	.recvmsg	   = raw_recvmsg,
	.bind		   = raw_bind,
	.backlog_rcv	   = raw_rcv_skb,
955
	.release_cb	   = ip4_datagram_release_cb,
956 957
	.hash		   = raw_hash_sk,
	.unhash		   = raw_unhash_sk,
958
	.obj_size	   = sizeof(struct raw_sock),
959 960
	.useroffset	   = offsetof(struct raw_sock, filter),
	.usersize	   = sizeof_field(struct raw_sock, filter),
961
	.h.raw_hash	   = &raw_v4_hashinfo,
962
#ifdef CONFIG_COMPAT
963
	.compat_ioctl	   = compat_raw_ioctl,
964
#endif
965
	.diag_destroy	   = raw_abort,
L
Linus Torvalds 已提交
966 967 968 969 970 971
};

#ifdef CONFIG_PROC_FS
static struct sock *raw_get_first(struct seq_file *seq)
{
	struct sock *sk;
972
	struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
973
	struct raw_iter_state *state = raw_seq_private(seq);
L
Linus Torvalds 已提交
974

975 976
	for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
			++state->bucket) {
977
		sk_for_each(sk, &h->ht[state->bucket])
978
			if (sock_net(sk) == seq_file_net(seq))
L
Linus Torvalds 已提交
979 980 981 982 983 984 985 986 987
				goto found;
	}
	sk = NULL;
found:
	return sk;
}

static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
{
988
	struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
989
	struct raw_iter_state *state = raw_seq_private(seq);
L
Linus Torvalds 已提交
990 991 992 993 994

	do {
		sk = sk_next(sk);
try_again:
		;
995
	} while (sk && sock_net(sk) != seq_file_net(seq));
L
Linus Torvalds 已提交
996

997
	if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
998
		sk = sk_head(&h->ht[state->bucket]);
L
Linus Torvalds 已提交
999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013
		goto try_again;
	}
	return sk;
}

/*
 * Return the socket at zero-based position @pos of the iteration, or NULL
 * if the iteration has fewer than @pos + 1 entries.
 */
static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
{
	struct sock *sk = raw_get_first(seq);

	while (sk && pos) {
		sk = raw_get_next(seq, sk);
		if (sk)
			--pos;
	}

	return pos ? NULL : sk;
}

1014
/*
 * seq_file start callback: take the hash table read lock and return
 * SEQ_START_TOKEN for position 0, or the socket at index *@pos - 1.
 */
void *raw_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(&h->lock)
{
	struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));

	read_lock(&h->lock);

	if (!*pos)
		return SEQ_START_TOKEN;

	return raw_get_idx(seq, *pos - 1);
}
EXPORT_SYMBOL_GPL(raw_seq_start);
L
Linus Torvalds 已提交
1023

1024
/*
 * seq_file next callback: return the entry following @v (the first socket
 * if @v is SEQ_START_TOKEN) and advance *@pos.
 */
void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct sock *sk;

	sk = (v == SEQ_START_TOKEN) ? raw_get_first(seq) :
				      raw_get_next(seq, v);
	++*pos;

	return sk;
}
EXPORT_SYMBOL_GPL(raw_seq_next);
L
Linus Torvalds 已提交
1036

1037
void raw_seq_stop(struct seq_file *seq, void *v)
1038
	__releases(&h->lock)
L
Linus Torvalds 已提交
1039
{
1040
	struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
1041

1042
	read_unlock(&h->lock);
L
Linus Torvalds 已提交
1043
}
1044
EXPORT_SYMBOL_GPL(raw_seq_stop);
L
Linus Torvalds 已提交
1045

1046
static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
L
Linus Torvalds 已提交
1047 1048
{
	struct inet_sock *inet = inet_sk(sp);
E
Eric Dumazet 已提交
1049 1050
	__be32 dest = inet->inet_daddr,
	       src = inet->inet_rcv_saddr;
L
Linus Torvalds 已提交
1051
	__u16 destp = 0,
E
Eric Dumazet 已提交
1052
	      srcp  = inet->inet_num;
L
Linus Torvalds 已提交
1053

1054
	seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
1055
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n",
1056
		i, src, srcp, dest, destp, sp->sk_state,
1057 1058
		sk_wmem_alloc_get(sp),
		sk_rmem_alloc_get(sp),
1059 1060 1061
		0, 0L, 0,
		from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
		0, sock_i_ino(sp),
1062
		refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
L
Linus Torvalds 已提交
1063 1064 1065 1066 1067
}

static int raw_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
1068 1069
		seq_printf(seq, "  sl  local_address rem_address   st tx_queue "
				"rx_queue tr tm->when retrnsmt   uid  timeout "
E
Eric Dumazet 已提交
1070
				"inode ref pointer drops\n");
1071 1072
	else
		raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket);
L
Linus Torvalds 已提交
1073 1074 1075
	return 0;
}

1076
static const struct seq_operations raw_seq_ops = {
L
Linus Torvalds 已提交
1077 1078 1079 1080 1081 1082
	.start = raw_seq_start,
	.next  = raw_seq_next,
	.stop  = raw_seq_stop,
	.show  = raw_seq_show,
};

1083
static __net_init int raw_init_net(struct net *net)
L
Linus Torvalds 已提交
1084
{
1085 1086
	if (!proc_create_net_data("raw", 0444, net->proc_net, &raw_seq_ops,
			sizeof(struct raw_iter_state), &raw_v4_hashinfo))
L
Linus Torvalds 已提交
1087
		return -ENOMEM;
1088

L
Linus Torvalds 已提交
1089 1090 1091
	return 0;
}

1092 1093
static __net_exit void raw_exit_net(struct net *net)
{
1094
	remove_proc_entry("raw", net->proc_net);
1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106
}

/* Per-namespace hooks that create and remove the "raw" proc entry. */
static __net_initdata struct pernet_operations raw_net_ops = {
	.init	= raw_init_net,
	.exit	= raw_exit_net,
};

/* Register the per-namespace proc hooks; returns the registration result. */
int __init raw_proc_init(void)
{
	int err = register_pernet_subsys(&raw_net_ops);

	return err;
}

L
Linus Torvalds 已提交
1107 1108
/* Unregister the per-namespace proc hooks. */
void __init raw_proc_exit(void)
{
	unregister_pernet_subsys(&raw_net_ops);
}
1111
#endif /* CONFIG_PROC_FS */
1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135

/*
 * Set the namespace's raw sysctl defaults: sysctl_raw_l3mdev_accept is
 * set to 1 when CONFIG_NET_L3_MASTER_DEV is enabled; otherwise this is a
 * no-op.
 */
static void raw_sysctl_init_net(struct net *net)
{
#ifdef CONFIG_NET_L3_MASTER_DEV
	net->ipv4.sysctl_raw_l3mdev_accept = 1;
#endif
}

/* Per-namespace init hook: apply the raw sysctl defaults.  Returns 0. */
static int __net_init raw_sysctl_init(struct net *net)
{
	raw_sysctl_init_net(net);

	return 0;
}

/* Per-namespace hook that applies the raw sysctl defaults. */
static struct pernet_operations __net_initdata raw_sysctl_ops = {
	.init = raw_sysctl_init,
};

/*
 * Boot-time init: apply the sysctl defaults to init_net, then register
 * the per-namespace hook.  A registration failure panics.
 */
void __init raw_init(void)
{
	int err;

	raw_sysctl_init_net(&init_net);

	err = register_pernet_subsys(&raw_sysctl_ops);
	if (err)
		panic("RAW: failed to init sysctl parameters.\n");
}