// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The User Datagram Protocol (UDP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() calls
 *		Alan Cox	: 	stopped close while in use off icmp
 *					messages. Not a fix but a botch that
 *					for udp at least is 'valid'.
 *		Alan Cox	:	Fixed icmp handling properly
 *		Alan Cox	: 	Correct error for oversized datagrams
 *		Alan Cox	:	Tidied select() semantics.
 *		Alan Cox	:	udp_err() fixed properly, also now
 *					select and read wake correctly on errors
 *		Alan Cox	:	udp_send verify_area moved to avoid mem leak
 *		Alan Cox	:	UDP can count its memory
 *		Alan Cox	:	send to an unknown connection causes
 *					an ECONNREFUSED off the icmp, but
 *					does NOT close.
 *		Alan Cox	:	Switched to new sk_buff handlers. No more backlog!
 *		Alan Cox	:	Using generic datagram code. Even smaller and the PEEK
 *					bug no longer crashes it.
 *		Fred Van Kempen	: 	Net2e support for sk->broadcast.
 *		Alan Cox	:	Uses skb_free_datagram
 *		Alan Cox	:	Added get/set sockopt support.
 *		Alan Cox	:	Broadcasting without option set returns EACCES.
 *		Alan Cox	:	No wakeup calls. Instead we now use the callbacks.
 *		Alan Cox	:	Use ip_tos and ip_ttl
 *		Alan Cox	:	SNMP Mibs
 *		Alan Cox	:	MSG_DONTROUTE, and 0.0.0.0 support.
 *		Matt Dillon	:	UDP length checks.
 *		Alan Cox	:	Smarter af_inet used properly.
 *		Alan Cox	:	Use new kernel side addressing.
 *		Alan Cox	:	Incorrect return on truncated datagram receive.
 *	Arnt Gulbrandsen 	:	New udp_send and stuff
 *		Alan Cox	:	Cache last socket
 *		Alan Cox	:	Route cache
 *		Jon Peatfield	:	Minor efficiency fix to sendto().
 *		Mike Shaver	:	RFC1122 checks.
 *		Alan Cox	:	Nonblocking error fix.
 *	Willy Konynenberg	:	Transparent proxying support.
 *		Mike McLagan	:	Routing by source
 *		David S. Miller	:	New socket lookup architecture.
 *					Last socket cache retained as it
 *					does have a high hit rate.
 *		Olaf Kirch	:	Don't linearise iovec on sendmsg.
 *		Andi Kleen	:	Some cleanups, cache destination entry
 *					for connect.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Melvin Smith	:	Check msg_name not msg_namelen in sendto(),
 *					return ENOTCONN for unconnected sockets (POSIX)
 *		Janos Farkas	:	don't deliver multi/broadcasts to a different
 *					bound-to-device socket
 *	Hirokazu Takahashi	:	HW checksumming for outgoing UDP
 *					datagrams.
 *	Hirokazu Takahashi	:	sendfile() on UDP works now.
 *		Arnaldo C. Melo :	convert /proc/net/udp to seq_file
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov:		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 *	Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
 *	James Chapman		:	Add L2TP encapsulation type.
 */

#define pr_fmt(fmt) "UDP: " fmt

#include <linux/bpf-cgroup.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/igmp.h>
#include <linux/inetdevice.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/ip_tunnels.h>
#include <net/route.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <trace/events/udp.h>
#include <linux/static_key.h>
#include <linux/btf_ids.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>
#include "udp_impl.h"
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
#include <net/udp_tunnel.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6_stubs.h>
#endif

struct udp_table udp_table __read_mostly;
EXPORT_SYMBOL(udp_table);

long sysctl_udp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_udp_mem);

atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(udp_memory_allocated);
DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);

#define MAX_UDP_PORTS 65536
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)

static int udp_lib_lport_inuse(struct net *net, __u16 num,
			       const struct udp_hslot *hslot,
			       unsigned long *bitmap,
			       struct sock *sk, unsigned int log)
{
	struct sock *sk2;
	kuid_t uid = sock_i_uid(sk);

	sk_for_each(sk2, &hslot->head) {
		if (net_eq(sock_net(sk2), net) &&
		    sk2 != sk &&
		    (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
		    (!sk2->sk_reuse || !sk->sk_reuse) &&
		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
		    inet_rcv_saddr_equal(sk, sk2, true)) {
			if (sk2->sk_reuseport && sk->sk_reuseport &&
			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
			    uid_eq(uid, sock_i_uid(sk2))) {
				if (!bitmap)
					return 0;
			} else {
				if (!bitmap)
					return 1;
				__set_bit(udp_sk(sk2)->udp_port_hash >> log,
					  bitmap);
			}
		}
	}
	return 0;
}

/*
 * Note: we still hold spinlock of primary hash chain, so no other writer
 * can insert/delete a socket with local_port == num
 */
static int udp_lib_lport_inuse2(struct net *net, __u16 num,
				struct udp_hslot *hslot2,
				struct sock *sk)
{
	struct sock *sk2;
	kuid_t uid = sock_i_uid(sk);
	int res = 0;

	spin_lock(&hslot2->lock);
	udp_portaddr_for_each_entry(sk2, &hslot2->head) {
		if (net_eq(sock_net(sk2), net) &&
		    sk2 != sk &&
		    (udp_sk(sk2)->udp_port_hash == num) &&
		    (!sk2->sk_reuse || !sk->sk_reuse) &&
		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
		    inet_rcv_saddr_equal(sk, sk2, true)) {
			if (sk2->sk_reuseport && sk->sk_reuseport &&
			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
			    uid_eq(uid, sock_i_uid(sk2))) {
				res = 0;
			} else {
				res = 1;
			}
			break;
		}
	}
	spin_unlock(&hslot2->lock);
	return res;
}

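/* Join the reuseport group of a matching socket already bound on this hash
 * slot, or allocate a new group if none matches.
 */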
static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
{
	struct net *net = sock_net(sk);
	kuid_t uid = sock_i_uid(sk);
	struct sock *sk2;

	sk_for_each(sk2, &hslot->head) {
		if (net_eq(sock_net(sk2), net) &&
		    sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
		    (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
		    inet_rcv_saddr_equal(sk, sk2, false)) {
			return reuseport_add_sock(sk, sk2,
						  inet_rcv_saddr_any(sk));
		}
	}

	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}

/**
 *  udp_lib_get_port  -  UDP/-Lite port lookup for IPv4 and IPv6
 *
 *  @sk:          socket struct in question
 *  @snum:        port number to look up
 *  @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
 *                   with NULL address
 */
int udp_lib_get_port(struct sock *sk, unsigned short snum,
		     unsigned int hash2_nulladdr)
{
	struct udp_hslot *hslot, *hslot2;
	struct udp_table *udptable = sk->sk_prot->h.udp_table;
	int    error = 1;
	struct net *net = sock_net(sk);

	if (!snum) {
		int low, high, remaining;
		unsigned int rand;
		unsigned short first, last;
		DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);

		inet_get_local_port_range(net, &low, &high);
		remaining = (high - low) + 1;

		rand = get_random_u32();
		first = reciprocal_scale(rand, remaining) + low;
		/*
		 * force rand to be an odd multiple of UDP_HTABLE_SIZE
		 */
		rand = (rand | 1) * (udptable->mask + 1);
		last = first + udptable->mask + 1;
		do {
			hslot = udp_hashslot(udptable, net, first);
			bitmap_zero(bitmap, PORTS_PER_CHAIN);
			spin_lock_bh(&hslot->lock);
			udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
					    udptable->log);

			snum = first;
			/*
			 * Iterate on all possible values of snum for this hash.
			 * Using steps of an odd multiple of UDP_HTABLE_SIZE
			 * give us randomization and full range coverage.
			 */
			do {
				if (low <= snum && snum <= high &&
				    !test_bit(snum >> udptable->log, bitmap) &&
				    !inet_is_local_reserved_port(net, snum))
					goto found;
				snum += rand;
			} while (snum != first);
			spin_unlock_bh(&hslot->lock);
			cond_resched();
		} while (++first != last);
		goto fail;
	} else {
		hslot = udp_hashslot(udptable, net, snum);
		spin_lock_bh(&hslot->lock);
		if (hslot->count > 10) {
			int exist;
			unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;

			slot2          &= udptable->mask;
			hash2_nulladdr &= udptable->mask;

			hslot2 = udp_hashslot2(udptable, slot2);
			if (hslot->count < hslot2->count)
				goto scan_primary_hash;

			exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
			if (!exist && (hash2_nulladdr != slot2)) {
				hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
				exist = udp_lib_lport_inuse2(net, snum, hslot2,
							     sk);
			}
			if (exist)
				goto fail_unlock;
			else
				goto found;
		}
scan_primary_hash:
		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
			goto fail_unlock;
	}
found:
	inet_sk(sk)->inet_num = snum;
	udp_sk(sk)->udp_port_hash = snum;
	udp_sk(sk)->udp_portaddr_hash ^= snum;
	if (sk_unhashed(sk)) {
		if (sk->sk_reuseport &&
		    udp_reuseport_add_sock(sk, hslot)) {
			inet_sk(sk)->inet_num = 0;
			udp_sk(sk)->udp_port_hash = 0;
			udp_sk(sk)->udp_portaddr_hash ^= snum;
			goto fail_unlock;
		}

		sk_add_node_rcu(sk, &hslot->head);
		hslot->count++;
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
		spin_lock(&hslot2->lock);
		if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
		    sk->sk_family == AF_INET6)
			hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
					   &hslot2->head);
		else
			hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
					   &hslot2->head);
		hslot2->count++;
		spin_unlock(&hslot2->lock);
	}
	sock_set_flag(sk, SOCK_RCU_FREE);
	error = 0;
fail_unlock:
	spin_unlock_bh(&hslot->lock);
fail:
	return error;
}
EXPORT_SYMBOL(udp_lib_get_port);

int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
	unsigned int hash2_nulladdr =
		ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
	unsigned int hash2_partial =
		ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);

	/* precompute partial secondary hash */
	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
	return udp_lib_get_port(sk, snum, hash2_nulladdr);
}

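/* Score how specifically @sk matches the received 4-tuple; a higher score
 * wins, -1 means the socket cannot accept the packet.
 */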
static int compute_score(struct sock *sk, struct net *net,
			 __be32 saddr, __be16 sport,
			 __be32 daddr, unsigned short hnum,
			 int dif, int sdif)
{
	int score;
	struct inet_sock *inet;
	bool dev_match;

	if (!net_eq(sock_net(sk), net) ||
	    udp_sk(sk)->udp_port_hash != hnum ||
	    ipv6_only_sock(sk))
		return -1;

	if (sk->sk_rcv_saddr != daddr)
		return -1;

	score = (sk->sk_family == PF_INET) ? 2 : 1;

	inet = inet_sk(sk);
	if (inet->inet_daddr) {
		if (inet->inet_daddr != saddr)
			return -1;
		score += 4;
	}

	if (inet->inet_dport) {
		if (inet->inet_dport != sport)
			return -1;
		score += 4;
	}

	dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
					dif, sdif);
	if (!dev_match)
		return -1;
	if (sk->sk_bound_dev_if)
		score += 4;

	if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
		score++;
	return score;
}

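/* Flow hash used to pick a member socket within a SO_REUSEPORT group. */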
static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
		       const __u16 lport, const __be32 faddr,
		       const __be16 fport)
{
	static u32 udp_ehash_secret __read_mostly;

	net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));

	return __inet_ehashfn(laddr, lport, faddr, fport,
			      udp_ehash_secret + net_hash_mix(net));
}

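/* For an unconnected reuseport socket, select the group member that should
 * receive this flow; returns NULL when reuseport selection does not apply.
 */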
static struct sock *lookup_reuseport(struct net *net, struct sock *sk,
				     struct sk_buff *skb,
				     __be32 saddr, __be16 sport,
				     __be32 daddr, unsigned short hnum)
{
	struct sock *reuse_sk = NULL;
	u32 hash;

	if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) {
		hash = udp_ehashfn(net, daddr, hnum, saddr, sport);
		reuse_sk = reuseport_select_sock(sk, hash, skb,
						 sizeof(struct udphdr));
	}
	return reuse_sk;
}

/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
				     __be32 saddr, __be16 sport,
				     __be32 daddr, unsigned int hnum,
				     int dif, int sdif,
				     struct udp_hslot *hslot2,
				     struct sk_buff *skb)
{
	struct sock *sk, *result;
	int score, badness;

	result = NULL;
	badness = 0;
	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
		score = compute_score(sk, net, saddr, sport,
				      daddr, hnum, dif, sdif);
		if (score > badness) {
			result = lookup_reuseport(net, sk, skb,
						  saddr, sport, daddr, hnum);
			/* Fall back to scoring if group has connections */
			if (result && !reuseport_has_conns(sk, false))
				return result;

			result = result ? : sk;
			badness = score;
		}
	}
	return result;
}

static struct sock *udp4_lookup_run_bpf(struct net *net,
					struct udp_table *udptable,
					struct sk_buff *skb,
					__be32 saddr, __be16 sport,
					__be32 daddr, u16 hnum, const int dif)
{
	struct sock *sk, *reuse_sk;
	bool no_reuseport;

	if (udptable != &udp_table)
		return NULL; /* only UDP is supported */

	no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, saddr, sport,
					    daddr, hnum, dif, &sk);
	if (no_reuseport || IS_ERR_OR_NULL(sk))
		return sk;

	reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum);
	if (reuse_sk)
		sk = reuse_sk;
	return sk;
}

/* UDP is nearly always wildcarded out the wazoo, it makes no sense to try
 * harder than this. -DaveM
 */
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
		__be16 sport, __be32 daddr, __be16 dport, int dif,
		int sdif, struct udp_table *udptable, struct sk_buff *skb)
{
	unsigned short hnum = ntohs(dport);
	unsigned int hash2, slot2;
	struct udp_hslot *hslot2;
	struct sock *result, *sk;

	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	slot2 = hash2 & udptable->mask;
	hslot2 = &udptable->hash2[slot2];

	/* Lookup connected or non-wildcard socket */
	result = udp4_lib_lookup2(net, saddr, sport,
				  daddr, hnum, dif, sdif,
				  hslot2, skb);
	if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
		goto done;

	/* Lookup redirect from BPF */
	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
		sk = udp4_lookup_run_bpf(net, udptable, skb,
					 saddr, sport, daddr, hnum, dif);
		if (sk) {
			result = sk;
			goto done;
		}
	}

	/* Got non-wildcard socket or error on first lookup */
	if (result)
		goto done;

	/* Lookup wildcard sockets */
	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	slot2 = hash2 & udptable->mask;
	hslot2 = &udptable->hash2[slot2];

	result = udp4_lib_lookup2(net, saddr, sport,
				  htonl(INADDR_ANY), hnum, dif, sdif,
				  hslot2, skb);
done:
	if (IS_ERR(result))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(__udp4_lib_lookup);

static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
						 __be16 sport, __be16 dport,
						 struct udp_table *udptable)
{
	const struct iphdr *iph = ip_hdr(skb);

	return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
				 iph->daddr, dport, inet_iif(skb),
				 inet_sdif(skb), udptable, skb);
}

struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
				 __be16 sport, __be16 dport)
{
	const struct iphdr *iph = ip_hdr(skb);

	return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
				 iph->daddr, dport, inet_iif(skb),
				 inet_sdif(skb), &udp_table, NULL);
}

/* Must be called under rcu_read_lock().
 * Does increment socket refcount.
 */
#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
			     __be32 daddr, __be16 dport, int dif)
{
	struct sock *sk;

	sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
			       dif, 0, &udp_table, NULL);
	if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
		sk = NULL;
	return sk;
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
#endif

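/* Check whether @sk may receive a multicast/broadcast datagram with the
 * given addresses, ports and incoming interface.
 */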
static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
				       __be16 loc_port, __be32 loc_addr,
				       __be16 rmt_port, __be32 rmt_addr,
				       int dif, int sdif, unsigned short hnum)
{
	struct inet_sock *inet = inet_sk(sk);

	if (!net_eq(sock_net(sk), net) ||
	    udp_sk(sk)->udp_port_hash != hnum ||
	    (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
	    (inet->inet_dport != rmt_port && inet->inet_dport) ||
	    (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
	    ipv6_only_sock(sk) ||
	    !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
		return false;
	if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
		return false;
	return true;
}

595 596 597
DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
void udp_encap_enable(void)
{
	static_branch_inc(&udp_encap_needed_key);
}
EXPORT_SYMBOL(udp_encap_enable);

void udp_encap_disable(void)
{
	static_branch_dec(&udp_encap_needed_key);
}
EXPORT_SYMBOL(udp_encap_disable);

/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
 * through error handlers in encapsulations looking for a match.
 */
static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
{
	int i;

	for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
		int (*handler)(struct sk_buff *skb, u32 info);
		const struct ip_tunnel_encap_ops *encap;

		encap = rcu_dereference(iptun_encaps[i]);
		if (!encap)
			continue;
		handler = encap->err_handler;
		if (handler && !handler(skb, info))
			return 0;
	}

	return -ENOENT;
}

/* Try to match ICMP errors to UDP tunnels by looking up a socket without
 * reversing source and destination port: this will match tunnels that force the
 * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
 * lwtunnels might actually break this assumption by being configured with
 * different destination ports on endpoints, in this case we won't be able to
 * trace ICMP messages back to them.
 *
 * If this doesn't match any socket, probe tunnels with arbitrary destination
 * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
 * we've sent packets to won't necessarily match the local destination port.
 *
 * Then ask the tunnel implementation to match the error against a valid
 * association.
 *
 * Return an error if we can't find a match, the socket if we need further
 * processing, zero otherwise.
 */
static struct sock *__udp4_lib_err_encap(struct net *net,
					 const struct iphdr *iph,
					 struct udphdr *uh,
					 struct udp_table *udptable,
					 struct sock *sk,
					 struct sk_buff *skb, u32 info)
{
	int (*lookup)(struct sock *sk, struct sk_buff *skb);
	int network_offset, transport_offset;
	struct udp_sock *up;

	network_offset = skb_network_offset(skb);
	transport_offset = skb_transport_offset(skb);

	/* Network header needs to point to the outer IPv4 header inside ICMP */
	skb_reset_network_header(skb);

	/* Transport header needs to point to the UDP header */
	skb_set_transport_header(skb, iph->ihl << 2);

	if (sk) {
		up = udp_sk(sk);

		lookup = READ_ONCE(up->encap_err_lookup);
		if (lookup && lookup(sk, skb))
			sk = NULL;

		goto out;
	}

	sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
			       iph->saddr, uh->dest, skb->dev->ifindex, 0,
			       udptable, NULL);
	if (sk) {
		up = udp_sk(sk);

		lookup = READ_ONCE(up->encap_err_lookup);
		if (!lookup || lookup(sk, skb))
			sk = NULL;
	}

out:
	if (!sk)
		sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));

	skb_set_transport_header(skb, transport_offset);
	skb_set_network_header(skb, network_offset);

	return sk;
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.
 * Header points to the ip header of the error packet. We move
 * on past this. Then (as it used to claim before adjustment)
 * header points to the first 8 bytes of the udp header.  We need
 * to find the appropriate port.
 */

int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
{
	struct inet_sock *inet;
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	bool tunnel = false;
	struct sock *sk;
	int harderr;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
			       iph->saddr, uh->source, skb->dev->ifindex,
			       inet_sdif(skb), udptable, NULL);

	if (!sk || udp_sk(sk)->encap_type) {
		/* No socket for error: try tunnels before discarding */
		if (static_branch_unlikely(&udp_encap_needed_key)) {
			sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb,
						  info);
			if (!sk)
				return 0;
		} else
			sk = ERR_PTR(-ENOENT);

		if (IS_ERR(sk)) {
			__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
			return PTR_ERR(sk);
		}

		tunnel = true;
	}

	err = 0;
	harderr = 0;
	inet = inet_sk(sk);

	switch (type) {
	default:
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	case ICMP_SOURCE_QUENCH:
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		harderr = 1;
		break;
	case ICMP_DEST_UNREACH:
		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
			ipv4_sk_update_pmtu(skb, sk, info);
			if (inet->pmtudisc != IP_PMTUDISC_DONT) {
				err = EMSGSIZE;
				harderr = 1;
				break;
			}
			goto out;
		}
		err = EHOSTUNREACH;
		if (code <= NR_ICMP_UNREACH) {
			harderr = icmp_err_convert[code].fatal;
			err = icmp_err_convert[code].errno;
		}
		break;
	case ICMP_REDIRECT:
		ipv4_sk_redirect(skb, sk);
		goto out;
	}

	/*
	 *      RFC1122: OK.  Passes ICMP errors back to application, as per
	 *	4.1.3.3.
	 */
	if (tunnel) {
		/* ...not for tunnels though: we don't have a sending socket */
		if (udp_sk(sk)->encap_err_rcv)
			udp_sk(sk)->encap_err_rcv(sk, skb, iph->ihl << 2);
		goto out;
	}
	if (!inet->recverr) {
		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
			goto out;
	} else
		ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));

	sk->sk_err = err;
	sk_error_report(sk);
out:
	return 0;
}

int udp_err(struct sk_buff *skb, u32 info)
{
	return __udp4_lib_err(skb, info, &udp_table);
}

/*
 * Throw away all pending data and cancel the corking. Socket is locked.
 */
void udp_flush_pending_frames(struct sock *sk)
{
	struct udp_sock *up = udp_sk(sk);

	if (up->pending) {
		up->len = 0;
		up->pending = 0;
		ip_flush_pending_frames(sk);
	}
}
EXPORT_SYMBOL(udp_flush_pending_frames);

/**
 * 	udp4_hwcsum  -  handle outgoing HW checksumming
 * 	@skb: 	sk_buff containing the filled-in UDP header
 * 	        (checksum field must be zeroed out)
 *	@src:	source IP address
 *	@dst:	destination IP address
 */
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
{
	struct udphdr *uh = udp_hdr(skb);
	int offset = skb_transport_offset(skb);
	int len = skb->len - offset;
	int hlen = len;
	__wsum csum = 0;

	if (!skb_has_frag_list(skb)) {
		/*
		 * Only one fragment on the socket.
		 */
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct udphdr, check);
		uh->check = ~csum_tcpudp_magic(src, dst, len,
					       IPPROTO_UDP, 0);
	} else {
		struct sk_buff *frags;

		/*
		 * HW checksum won't work as there are two or more
		 * fragments on the socket, so the csums of all the
		 * sk_buffs must be combined
		 */
		skb_walk_frags(skb, frags) {
			csum = csum_add(csum, frags->csum);
			hlen -= frags->len;
		}

		csum = skb_checksum(skb, offset, hlen, csum);
		skb->ip_summed = CHECKSUM_NONE;

		uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
		if (uh->check == 0)
			uh->check = CSUM_MANGLED_0;
	}
}
EXPORT_SYMBOL_GPL(udp4_hwcsum);

/* Function to set UDP checksum for an IPv4 UDP packet. This is intended
 * for the simple case like when setting the checksum for a UDP tunnel.
 */
void udp_set_csum(bool nocheck, struct sk_buff *skb,
		  __be32 saddr, __be32 daddr, int len)
{
	struct udphdr *uh = udp_hdr(skb);

	if (nocheck) {
		uh->check = 0;
	} else if (skb_is_gso(skb)) {
		uh->check = ~udp_v4_check(len, saddr, daddr, 0);
	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
		uh->check = 0;
		uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
		if (uh->check == 0)
			uh->check = CSUM_MANGLED_0;
	} else {
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct udphdr, check);
		uh->check = ~udp_v4_check(len, saddr, daddr, 0);
	}
}
EXPORT_SYMBOL(udp_set_csum);

static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
			struct inet_cork *cork)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct udphdr *uh;
	int err;
	int is_udplite = IS_UDPLITE(sk);
	int offset = skb_transport_offset(skb);
	int len = skb->len - offset;
	int datalen = len - sizeof(*uh);
	__wsum csum = 0;

	/*
	 * Create a UDP header
	 */
	uh = udp_hdr(skb);
	uh->source = inet->inet_sport;
	uh->dest = fl4->fl4_dport;
	uh->len = htons(len);
	uh->check = 0;

	if (cork->gso_size) {
		const int hlen = skb_network_header_len(skb) +
				 sizeof(struct udphdr);

		if (hlen + cork->gso_size > cork->fragsize) {
			kfree_skb(skb);
			return -EINVAL;
		}
		if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
			kfree_skb(skb);
			return -EINVAL;
		}
		if (sk->sk_no_check_tx) {
			kfree_skb(skb);
			return -EINVAL;
		}
		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
		    dst_xfrm(skb_dst(skb))) {
			kfree_skb(skb);
			return -EIO;
		}

		if (datalen > cork->gso_size) {
			skb_shinfo(skb)->gso_size = cork->gso_size;
			skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
			skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
								 cork->gso_size);
		}
		goto csum_partial;
	}

	if (is_udplite)  				 /*     UDP-Lite      */
		csum = udplite_csum(skb);

	else if (sk->sk_no_check_tx) {			 /* UDP csum off */

		skb->ip_summed = CHECKSUM_NONE;
		goto send;

	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
csum_partial:

		udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
		goto send;

	} else
		csum = udp_csum(skb);

	/* add protocol-dependent pseudo-header */
	uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
				      sk->sk_protocol, csum);
	if (uh->check == 0)
		uh->check = CSUM_MANGLED_0;

send:
	err = ip_send_skb(sock_net(sk), skb);
	if (err) {
		if (err == -ENOBUFS && !inet->recverr) {
			UDP_INC_STATS(sock_net(sk),
				      UDP_MIB_SNDBUFERRORS, is_udplite);
			err = 0;
		}
	} else
		UDP_INC_STATS(sock_net(sk),
			      UDP_MIB_OUTDATAGRAMS, is_udplite);
	return err;
}

/*
 * Push out all pending data as one UDP datagram. Socket is locked.
 */
int udp_push_pending_frames(struct sock *sk)
{
	struct udp_sock  *up = udp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);
	struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
	struct sk_buff *skb;
	int err = 0;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		goto out;

	err = udp_send_skb(skb, fl4, &inet->cork.base);

out:
	up->len = 0;
	up->pending = 0;
	return err;
}
EXPORT_SYMBOL(udp_push_pending_frames);

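/* Parse a SOL_UDP control message; only UDP_SEGMENT (the GSO segment size)
 * is currently recognized.
 */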
static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
{
	switch (cmsg->cmsg_type) {
	case UDP_SEGMENT:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
			return -EINVAL;
		*gso_size = *(__u16 *)CMSG_DATA(cmsg);
		return 0;
	default:
		return -EINVAL;
	}
}

int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
{
	struct cmsghdr *cmsg;
	bool need_ip = false;
	int err;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;

		if (cmsg->cmsg_level != SOL_UDP) {
			need_ip = true;
			continue;
		}

		err = __udp_cmsg_send(cmsg, gso_size);
		if (err)
			return err;
	}

	return need_ip;
}
EXPORT_SYMBOL_GPL(udp_cmsg_send);

int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct udp_sock *up = udp_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
	struct flowi4 fl4_stack;
	struct flowi4 *fl4;
	int ulen = len;
	struct ipcm_cookie ipc;
	struct rtable *rt = NULL;
	int free = 0;
	int connected = 0;
	__be32 daddr, faddr, saddr;
	__be16 dport;
	u8  tos;
	int err, is_udplite = IS_UDPLITE(sk);
	int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
	struct sk_buff *skb;
	struct ip_options_data opt_copy;

	if (len > 0xFFFF)
		return -EMSGSIZE;

	/*
	 *	Check the flags.
	 */

	if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
		return -EOPNOTSUPP;

	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;

	fl4 = &inet->cork.fl.u.ip4;
	if (up->pending) {
		/*
		 * There are pending frames.
		 * The socket lock must be held while it's corked.
		 */
		lock_sock(sk);
		if (likely(up->pending)) {
			if (unlikely(up->pending != AF_INET)) {
				release_sock(sk);
				return -EINVAL;
			}
			goto do_append_data;
		}
		release_sock(sk);
	}
	ulen += sizeof(struct udphdr);

	/*
	 *	Get and verify the address.
	 */
	if (usin) {
		if (msg->msg_namelen < sizeof(*usin))
			return -EINVAL;
		if (usin->sin_family != AF_INET) {
			if (usin->sin_family != AF_UNSPEC)
				return -EAFNOSUPPORT;
		}

		daddr = usin->sin_addr.s_addr;
		dport = usin->sin_port;
		if (dport == 0)
			return -EINVAL;
	} else {
		if (sk->sk_state != TCP_ESTABLISHED)
			return -EDESTADDRREQ;
		daddr = inet->inet_daddr;
		dport = inet->inet_dport;
		/* Open fast path for connected socket.
		   Route will not be used, if at least one option is set.
		 */
		connected = 1;
	}

	ipcm_init_sk(&ipc, inet);
	ipc.gso_size = READ_ONCE(up->gso_size);

	if (msg->msg_controllen) {
		err = udp_cmsg_send(sk, msg, &ipc.gso_size);
		if (err > 0)
			err = ip_cmsg_send(sk, msg, &ipc,
					   sk->sk_family == AF_INET6);
		if (unlikely(err < 0)) {
			kfree(ipc.opt);
			return err;
		}
		if (ipc.opt)
			free = 1;
		connected = 0;
	}
	if (!ipc.opt) {
		struct ip_options_rcu *inet_opt;

		rcu_read_lock();
		inet_opt = rcu_dereference(inet->inet_opt);
		if (inet_opt) {
			memcpy(&opt_copy, inet_opt,
			       sizeof(*inet_opt) + inet_opt->opt.optlen);
			ipc.opt = &opt_copy.opt;
		}
		rcu_read_unlock();
	}

	if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) {
		err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
					    (struct sockaddr *)usin, &ipc.addr);
		if (err)
			goto out_free;
		if (usin) {
			if (usin->sin_port == 0) {
				/* BPF program set invalid port. Reject it. */
				err = -EINVAL;
				goto out_free;
			}
			daddr = usin->sin_addr.s_addr;
			dport = usin->sin_port;
		}
	}

	saddr = ipc.addr;
	ipc.addr = faddr = daddr;

	if (ipc.opt && ipc.opt->opt.srr) {
		if (!daddr) {
			err = -EINVAL;
			goto out_free;
		}
		faddr = ipc.opt->opt.faddr;
		connected = 0;
	}
	tos = get_rttos(&ipc, inet);
	if (sock_flag(sk, SOCK_LOCALROUTE) ||
	    (msg->msg_flags & MSG_DONTROUTE) ||
	    (ipc.opt && ipc.opt->opt.is_strictroute)) {
		tos |= RTO_ONLINK;
		connected = 0;
	}

	if (ipv4_is_multicast(daddr)) {
		if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
			ipc.oif = inet->mc_index;
		if (!saddr)
			saddr = inet->mc_addr;
		connected = 0;
	} else if (!ipc.oif) {
		ipc.oif = inet->uc_index;
	} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
		/* oif is set, packet is to local broadcast and
		 * uc_index is set. oif is most likely set
		 * by sk_bound_dev_if. If uc_index != oif check if the
		 * oif is an L3 master and uc_index is an L3 slave.
		 * If so, we want to allow the send using the uc_index.
		 */
		if (ipc.oif != inet->uc_index &&
		    ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
							      inet->uc_index)) {
			ipc.oif = inet->uc_index;
		}
	}

	if (connected)
		rt = (struct rtable *)sk_dst_check(sk, 0);

	if (!rt) {
		struct net *net = sock_net(sk);
		__u8 flow_flags = inet_sk_flowi_flags(sk);

		fl4 = &fl4_stack;

		flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos,
				   RT_SCOPE_UNIVERSE, sk->sk_protocol,
				   flow_flags,
				   faddr, saddr, dport, inet->inet_sport,
				   sk->sk_uid);

		security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
		rt = ip_route_output_flow(net, fl4, sk);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			if (err == -ENETUNREACH)
				IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
			goto out;
		}

		err = -EACCES;
		if ((rt->rt_flags & RTCF_BROADCAST) &&
		    !sock_flag(sk, SOCK_BROADCAST))
			goto out;
		if (connected)
			sk_dst_set(sk, dst_clone(&rt->dst));
	}

	if (msg->msg_flags&MSG_CONFIRM)
		goto do_confirm;
back_from_confirm:

	saddr = fl4->saddr;
	if (!ipc.addr)
		daddr = ipc.addr = fl4->daddr;

	/* Lockless fast path for the non-corking case. */
	if (!corkreq) {
		struct inet_cork cork;

		skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
				  sizeof(struct udphdr), &ipc, &rt,
				  &cork, msg->msg_flags);
		err = PTR_ERR(skb);
		if (!IS_ERR_OR_NULL(skb))
			err = udp_send_skb(skb, fl4, &cork);
		goto out;
	}

	lock_sock(sk);
	if (unlikely(up->pending)) {
		/* The socket is already corked while preparing it. */
		/* ... which is an evident application bug. --ANK */
		release_sock(sk);

		net_dbg_ratelimited("socket already corked\n");
		err = -EINVAL;
		goto out;
	}
	/*
	 *	Now cork the socket to pend data.
	 */
	fl4 = &inet->cork.fl.u.ip4;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->fl4_dport = dport;
	fl4->fl4_sport = inet->inet_sport;
	up->pending = AF_INET;

do_append_data:
	up->len += ulen;
	err = ip_append_data(sk, fl4, getfrag, msg, ulen,
			     sizeof(struct udphdr), &ipc, &rt,
			     corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
	if (err)
		udp_flush_pending_frames(sk);
	else if (!corkreq)
		err = udp_push_pending_frames(sk);
	else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
		up->pending = 0;
	release_sock(sk);

out:
	ip_rt_put(rt);
out_free:
	if (free)
		kfree(ipc.opt);
	if (!err)
		return len;
	/*
	 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
	 * ENOBUFS might not be good (it's not tunable per se), but otherwise
	 * we don't have a good statistic (IpOutDiscards but it can be too many
	 * things).  We could add another new stat but at least for now that
	 * seems like overkill.
	 */
	if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
		UDP_INC_STATS(sock_net(sk),
			      UDP_MIB_SNDBUFERRORS, is_udplite);
	}
	return err;

do_confirm:
	if (msg->msg_flags & MSG_PROBE)
		dst_confirm_neigh(&rt->dst, &fl4->daddr);
	if (!(msg->msg_flags&MSG_PROBE) || len)
		goto back_from_confirm;
	err = 0;
	goto out;
}
EXPORT_SYMBOL(udp_sendmsg);

int udp_sendpage(struct sock *sk, struct page *page, int offset,
		 size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct udp_sock *up = udp_sk(sk);
	int ret;

	if (flags & MSG_SENDPAGE_NOTLAST)
		flags |= MSG_MORE;

	if (!up->pending) {
		struct msghdr msg = {	.msg_flags = flags|MSG_MORE };

		/* Call udp_sendmsg to specify destination address which
		 * sendpage interface can't pass.
		 * This will succeed only when the socket is connected.
		 */
		ret = udp_sendmsg(sk, &msg, 0);
		if (ret < 0)
			return ret;
	}

	lock_sock(sk);

	if (unlikely(!up->pending)) {
		release_sock(sk);

		net_dbg_ratelimited("cork failed\n");
		return -EINVAL;
	}

	ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
			     page, offset, size, flags);
	if (ret == -EOPNOTSUPP) {
		release_sock(sk);
		return sock_no_sendpage(sk->sk_socket, page, offset,
					size, flags);
	}
	if (ret < 0) {
		udp_flush_pending_frames(sk);
		goto out;
	}

	up->len += size;
	if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE)))
		ret = udp_push_pending_frames(sk);
	if (!ret)
		ret = size;
out:
	release_sock(sk);
	return ret;
}

#define UDP_SKB_IS_STATELESS 0x80000000

/* all head states (dst, sk, nf conntrack) except skb extensions are
 * cleared by udp_rcv().
 *
 * We need to preserve secpath, if present, to eventually process
 * IP_CMSG_PASSSEC at recvmsg() time.
 *
 * Other extensions can be cleared.
 */
static bool udp_try_make_stateless(struct sk_buff *skb)
{
	if (!skb_has_extensions(skb))
		return true;

	if (!secpath_exists(skb)) {
		skb_ext_reset(skb);
		return true;
	}

	return false;
}

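/* Cache len, truesize and checksum state in skb->dev_scratch so the reader
 * can consult them without touching cold cache lines under the queue lock.
 */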
static void udp_set_dev_scratch(struct sk_buff *skb)
{
	struct udp_dev_scratch *scratch = udp_skb_scratch(skb);

	BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long));
	scratch->_tsize_state = skb->truesize;
#if BITS_PER_LONG == 64
	scratch->len = skb->len;
	scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
	scratch->is_linear = !skb_is_nonlinear(skb);
#endif
	if (udp_try_make_stateless(skb))
		scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
}

static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
{
	/* We come here after udp_lib_checksum_complete() returned 0.
	 * This means that __skb_checksum_complete() might have
	 * set skb->csum_valid to 1.
	 * On 64bit platforms, we can set csum_unnecessary
	 * to true, but only if the skb is not shared.
	 */
#if BITS_PER_LONG == 64
	if (!skb_shared(skb))
		udp_skb_scratch(skb)->csum_unnecessary = true;
#endif
}

static int udp_skb_truesize(struct sk_buff *skb)
{
	return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS;
}

static bool udp_skb_has_head_state(struct sk_buff *skb)
{
	return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS);
}

/* fully reclaim rmem/fwd memory allocated for skb */
static void udp_rmem_release(struct sock *sk, int size, int partial,
			     bool rx_queue_lock_held)
{
	struct udp_sock *up = udp_sk(sk);
	struct sk_buff_head *sk_queue;
	int amt;

	if (likely(partial)) {
		up->forward_deficit += size;
		size = up->forward_deficit;
		if (size < (sk->sk_rcvbuf >> 2) &&
		    !skb_queue_empty(&up->reader_queue))
			return;
	} else {
		size += up->forward_deficit;
	}
	up->forward_deficit = 0;

	/* acquire the sk_receive_queue for fwd allocated memory scheduling,
	 * if the caller doesn't hold it already
	 */
	sk_queue = &sk->sk_receive_queue;
	if (!rx_queue_lock_held)
		spin_lock(&sk_queue->lock);


	sk->sk_forward_alloc += size;
	amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
	sk->sk_forward_alloc -= amt;

	if (amt)
		__sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);

	atomic_sub(size, &sk->sk_rmem_alloc);

	/* this can save us from acquiring the rx queue lock on next receive */
	skb_queue_splice_tail_init(sk_queue, &up->reader_queue);

	if (!rx_queue_lock_held)
		spin_unlock(&sk_queue->lock);
}

/* Note: called with reader_queue.lock held.
 * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch
 * This avoids a cache line miss while receive_queue lock is held.
 * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
 */
void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
{
	prefetch(&skb->data);
	udp_rmem_release(sk, udp_skb_truesize(skb), 1, false);
}
EXPORT_SYMBOL(udp_skb_destructor);

/* as above, but the caller held the rx queue lock, too */
static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
{
	prefetch(&skb->data);
	udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
}

/* Idea of busylocks is to let producers grab an extra spinlock
 * to relieve pressure on the receive_queue spinlock shared by consumer.
 * Under flood, this means that only one producer can be in line
 * trying to acquire the receive_queue spinlock.
 * These busylock can be allocated on a per cpu manner, instead of a
 * per socket one (that would consume a cache line per socket)
 */
static int udp_busylocks_log __read_mostly;
static spinlock_t *udp_busylocks __read_mostly;

static spinlock_t *busylock_acquire(void *ptr)
{
	spinlock_t *busy;

	busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
	spin_lock(busy);
	return busy;
}

static void busylock_release(spinlock_t *busy)
{
	if (busy)
		spin_unlock(busy);
}

int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff_head *list = &sk->sk_receive_queue;
	int rmem, delta, amt, err = -ENOMEM;
	spinlock_t *busy = NULL;
	int size;

	/* try to avoid the costly atomic add/sub pair when the receive
	 * queue is full; always allow at least a packet
	 */
	rmem = atomic_read(&sk->sk_rmem_alloc);
	if (rmem > sk->sk_rcvbuf)
		goto drop;

	/* Under mem pressure, it might be helpful to help udp_recvmsg()
	 * having linear skbs :
	 * - Reduce memory overhead and thus increase receive queue capacity
	 * - Less cache line misses at copyout() time
	 * - Less work at consume_skb() (less alien page frag freeing)
	 */
	if (rmem > (sk->sk_rcvbuf >> 1)) {
		skb_condense(skb);

		busy = busylock_acquire(sk);
	}
	size = skb->truesize;
	udp_set_dev_scratch(skb);

	/* we drop only if the receive buf is full and the receive
	 * queue contains some other skb
	 */
	rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
	if (rmem > (size + (unsigned int)sk->sk_rcvbuf))
		goto uncharge_drop;

	spin_lock(&list->lock);
	if (size >= sk->sk_forward_alloc) {
		amt = sk_mem_pages(size);
		delta = amt << PAGE_SHIFT;
		if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
			err = -ENOBUFS;
			spin_unlock(&list->lock);
			goto uncharge_drop;
		}

		sk->sk_forward_alloc += delta;
	}

	sk->sk_forward_alloc -= size;

	/* no need to set up a destructor, we will explicitly release the
	 * forward allocated memory on dequeue
	 */
	sock_skb_set_dropcount(sk, skb);

	__skb_queue_tail(list, skb);
	spin_unlock(&list->lock);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);

	busylock_release(busy);
	return 0;

uncharge_drop:
	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);

drop:
	atomic_inc(&sk->sk_drops);
	busylock_release(busy);
	return err;
}
EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);

void udp_destruct_common(struct sock *sk)
{
	/* reclaim completely the forward allocated memory */
	struct udp_sock *up = udp_sk(sk);
	unsigned int total = 0;
	struct sk_buff *skb;

	skb_queue_splice_tail_init(&sk->sk_receive_queue, &up->reader_queue);
	while ((skb = __skb_dequeue(&up->reader_queue)) != NULL) {
		total += skb->truesize;
		kfree_skb(skb);
	}
	udp_rmem_release(sk, total, 0, true);
}
EXPORT_SYMBOL_GPL(udp_destruct_common);

static void udp_destruct_sock(struct sock *sk)
{
	udp_destruct_common(sk);
	inet_sock_destruct(sk);
}

int udp_init_sock(struct sock *sk)
{
	skb_queue_head_init(&udp_sk(sk)->reader_queue);
	sk->sk_destruct = udp_destruct_sock;
	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
	return 0;
}

void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
{
	if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) {
		bool slow = lock_sock_fast(sk);

		sk_peek_offset_bwd(sk, len);
		unlock_sock_fast(sk, slow);
	}

	if (!skb_unref(skb))
		return;

	/* In the more common cases we cleared the head states previously,
	 * see __udp_queue_rcv_skb().
	 */
	if (unlikely(udp_skb_has_head_state(skb)))
		skb_release_head_state(skb);
	__consume_stateless_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_consume_udp);

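/* Peek at the head of @rcvq, discarding packets with bad checksums and
 * adding their truesize to *total; returns the first valid skb or NULL.
 */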
static struct sk_buff *__first_packet_length(struct sock *sk,
					     struct sk_buff_head *rcvq,
					     int *total)
{
	struct sk_buff *skb;

	while ((skb = skb_peek(rcvq)) != NULL) {
		if (udp_lib_checksum_complete(skb)) {
			__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
					IS_UDPLITE(sk));
			__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
					IS_UDPLITE(sk));
			atomic_inc(&sk->sk_drops);
			__skb_unlink(skb, rcvq);
			*total += skb->truesize;
			kfree_skb(skb);
		} else {
			udp_skb_csum_unnecessary_set(skb);
			break;
		}
	}
	return skb;
}

/**
 *	first_packet_length	- return length of first packet in receive queue
 *	@sk: socket
 *
 *	Drops all bad checksum frames, until a valid one is found.
 *	Returns the length of found skb, or -1 if none is found.
 */
static int first_packet_length(struct sock *sk)
{
	struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue;
	struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
	struct sk_buff *skb;
	int total = 0;
	int res;

	spin_lock_bh(&rcvq->lock);
	skb = __first_packet_length(sk, rcvq, &total);
	if (!skb && !skb_queue_empty_lockless(sk_queue)) {
		spin_lock(&sk_queue->lock);
		skb_queue_splice_tail_init(sk_queue, rcvq);
		spin_unlock(&sk_queue->lock);

		skb = __first_packet_length(sk, rcvq, &total);
	}
	res = skb ? skb->len : -1;
	if (total)
		udp_rmem_release(sk, total, 1, false);
	spin_unlock_bh(&rcvq->lock);
	return res;
}

/*
 *	IOCTL requests applicable to the UDP protocol
 */

int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}

	case SIOCINQ:
	{
		int amount = max_t(int, 0, first_packet_length(sk));

		return put_user(amount, (int __user *)arg);
	}

	default:
		return -ENOIOCTLCMD;
	}

	return 0;
}
EXPORT_SYMBOL(udp_ioctl);

struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
			       int *off, int *err)
{
	struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
	struct sk_buff_head *queue;
	struct sk_buff *last;
	long timeo;
	int error;

	queue = &udp_sk(sk)->reader_queue;
	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	do {
		struct sk_buff *skb;

		error = sock_error(sk);
		if (error)
			break;

		error = -EAGAIN;
		do {
			spin_lock_bh(&queue->lock);
			skb = __skb_try_recv_from_queue(sk, queue, flags, off,
							err, &last);
			if (skb) {
				if (!(flags & MSG_PEEK))
					udp_skb_destructor(sk, skb);
				spin_unlock_bh(&queue->lock);
				return skb;
			}

			if (skb_queue_empty_lockless(sk_queue)) {
				spin_unlock_bh(&queue->lock);
				goto busy_check;
			}

			/* refill the reader queue and walk it again
			 * keep both queues locked to avoid re-acquiring
			 * the sk_receive_queue lock if fwd memory scheduling
			 * is needed.
			 */
			spin_lock(&sk_queue->lock);
			skb_queue_splice_tail_init(sk_queue, queue);

			skb = __skb_try_recv_from_queue(sk, queue, flags, off,
							err, &last);
			if (skb && !(flags & MSG_PEEK))
				udp_skb_dtor_locked(sk, skb);
			spin_unlock(&sk_queue->lock);
			spin_unlock_bh(&queue->lock);
			if (skb)
				return skb;

busy_check:
			if (!sk_can_busy_loop(sk))
				break;

			sk_busy_loop(sk, flags & MSG_DONTWAIT);
		} while (!skb_queue_empty_lockless(sk_queue));

		/* sk_queue is empty, reader_queue may contain peeked packets */
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &error, &timeo,
					      (struct sk_buff *)sk_queue));

	*err = error;
	return NULL;
}
EXPORT_SYMBOL(__skb_recv_udp);

int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct sk_buff *skb;
	int err, copied;

try_again:
	skb = skb_recv_udp(sk, MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	if (udp_lib_checksum_complete(skb)) {
		int is_udplite = IS_UDPLITE(sk);
		struct net *net = sock_net(sk);

		__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite);
		__UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite);
		atomic_inc(&sk->sk_drops);
		kfree_skb(skb);
		goto try_again;
	}

	WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
	copied = recv_actor(sk, skb);
	kfree_skb(skb);

	return copied;
}
EXPORT_SYMBOL(udp_read_skb);

/*
 * 	This should be easy: if there is something there we
 * 	return it, otherwise we block.
 */

1840 1841
int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
		int *addr_len)
1842 1843
{
	struct inet_sock *inet = inet_sk(sk);
1844
	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
1845
	struct sk_buff *skb;
1846
	unsigned int ulen, copied;
1847
	int off, err, peeking = flags & MSG_PEEK;
1848
	int is_udplite = IS_UDPLITE(sk);
1849
	bool checksum_valid = false;
1850 1851

	if (flags & MSG_ERRQUEUE)
1852
		return ip_recv_error(sk, msg, len, addr_len);
1853 1854

try_again:
1855
	off = sk_peek_offset(sk, flags);
1856
	skb = __skb_recv_udp(sk, flags, &off, &err);
1857
	if (!skb)
1858
		return err;
1859

1860
	ulen = udp_skb_len(skb);
1861
	copied = len;
1862 1863
	if (copied > ulen - off)
		copied = ulen - off;
1864
	else if (copied < ulen)
		msg->msg_flags |= MSG_TRUNC;

	/*
	 * If checksum is needed at all, try to do it while copying the
	 * data.  If the data is truncated, or if we only want a partial
	 * coverage checksum (UDP-Lite), do it before the copy.
	 */

1873 1874
	if (copied < ulen || peeking ||
	    (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
1875 1876
		checksum_valid = udp_skb_csum_unnecessary(skb) ||
				!__udp_lib_checksum_complete(skb);
1877
		if (!checksum_valid)
			goto csum_copy_err;
	}

	if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
		if (udp_skb_is_linear(skb))
			err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
		else
			err = skb_copy_datagram_msg(skb, off, msg, copied);
	} else {
1887
		err = skb_copy_and_csum_datagram_msg(skb, off, msg);

		if (err == -EINVAL)
			goto csum_copy_err;
	}

1893
	if (unlikely(err)) {
1894
		if (!peeking) {
1895
			atomic_inc(&sk->sk_drops);
1896 1897
			UDP_INC_STATS(sock_net(sk),
				      UDP_MIB_INERRORS, is_udplite);
1898
		}
1899
		kfree_skb(skb);
1900
		return err;
1901
	}
1902

1903
	if (!peeking)
1904 1905
		UDP_INC_STATS(sock_net(sk),
			      UDP_MIB_INDATAGRAMS, is_udplite);
1906

1907
	sock_recv_cmsgs(msg, sk, skb);
1908 1909

	/* Copy the address. */
	if (sin) {
		sin->sin_family = AF_INET;
		sin->sin_port = udp_hdr(skb)->source;
		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
1915
		*addr_len = sizeof(*sin);

1917 1918
		BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
						      (struct sockaddr *)sin);
1919
	}

	if (udp_sk(sk)->gro_enabled)
		udp_cmsg_recv(msg, sk, skb);

1924
	if (inet->cmsg_flags)
1925
		ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
1926

1927
	err = copied;
	if (flags & MSG_TRUNC)
		err = ulen;

1931
	skb_consume_udp(sk, skb, peeking ? -err : err);
	return err;

csum_copy_err:
1935 1936
	if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
				 udp_skb_destructor)) {
1937 1938
		UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
		UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1939
	}
1940
	kfree_skb(skb);
1941

1942 1943
	/* starting over for a new packet, but check if we need to yield */
	cond_resched();
1944
	msg->msg_flags &= ~MSG_TRUNC;
	goto try_again;
}

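/* Usage note (illustrative, not part of the kernel sources): as implemented
 * above, a too-small buffer sets MSG_TRUNC in msg_flags, and passing MSG_TRUNC
 * in flags makes recvmsg() return the full datagram length instead of the
 * copied byte count.  A hypothetical userspace helper:
 *
 *	ssize_t recv_dgram(int fd, void *buf, size_t len, int *truncated)
 *	{
 *		struct iovec iov = { .iov_base = buf, .iov_len = len };
 *		struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
 *		ssize_t n = recvmsg(fd, &msg, 0);
 *
 *		if (n >= 0)
 *			*truncated = !!(msg.msg_flags & MSG_TRUNC);
 *		return n;
 *	}
 */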
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	/* This check is replicated from __ip4_datagram_connect() and
	 * intended to prevent BPF program called below from accessing bytes
	 * that are out of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
}
EXPORT_SYMBOL(udp_pre_connect);

1961
int __udp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	/*
	 *	1003.1g - break association.
	 */
1967

L
Linus Torvalds 已提交
1968
	sk->sk_state = TCP_CLOSE;
E
Eric Dumazet 已提交
1969 1970
	inet->inet_daddr = 0;
	inet->inet_dport = 0;
1971
	sock_rps_reset_rxhash(sk);
L
Linus Torvalds 已提交
1972
	sk->sk_bound_dev_if = 0;
W
Willem de Bruijn 已提交
1973
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
L
Linus Torvalds 已提交
1974
		inet_reset_saddr(sk);
W
Willem de Bruijn 已提交
1975 1976 1977 1978
		if (sk->sk_prot->rehash &&
		    (sk->sk_userlocks & SOCK_BINDPORT_LOCK))
			sk->sk_prot->rehash(sk);
	}
L
Linus Torvalds 已提交
1979 1980 1981

	if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
		sk->sk_prot->unhash(sk);
E
Eric Dumazet 已提交
1982
		inet->inet_sport = 0;
L
Linus Torvalds 已提交
1983 1984 1985 1986
	}
	sk_dst_reset(sk);
	return 0;
}
EXPORT_SYMBOL(__udp_disconnect);

int udp_disconnect(struct sock *sk, int flags)
{
	lock_sock(sk);
	__udp_disconnect(sk, flags);
	release_sock(sk);
	return 0;
}
EXPORT_SYMBOL(udp_disconnect);
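/* Usage note (illustrative, not part of the kernel sources): udp_disconnect()
 * is reached from userspace by calling connect() with an AF_UNSPEC address,
 * which drops the peer association while keeping an explicitly bound port.
 * A hypothetical helper:
 *
 *	int udp_unconnect(int fd)
 *	{
 *		struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *		return connect(fd, &sa, sizeof(sa));
 *	}
 */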

1998 1999
void udp_lib_unhash(struct sock *sk)
{
2000 2001
	if (sk_hashed(sk)) {
		struct udp_table *udptable = sk->sk_prot->h.udp_table;
		struct udp_hslot *hslot, *hslot2;

		hslot  = udp_hashslot(udptable, sock_net(sk),
				      udp_sk(sk)->udp_port_hash);
		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
2007

2008
		spin_lock_bh(&hslot->lock);
2009 2010
		if (rcu_access_pointer(sk->sk_reuseport_cb))
			reuseport_detach_sock(sk);
2011
		if (sk_del_node_init_rcu(sk)) {
E
Eric Dumazet 已提交
2012
			hslot->count--;
E
Eric Dumazet 已提交
2013
			inet_sk(sk)->inet_num = 0;
2014
			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
2015 2016

			spin_lock(&hslot2->lock);
2017
			hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
2018 2019
			hslot2->count--;
			spin_unlock(&hslot2->lock);
2020 2021
		}
		spin_unlock_bh(&hslot->lock);
	}
}
EXPORT_SYMBOL(udp_lib_unhash);

/*
 * inet_rcv_saddr was changed, we must rehash secondary hash
 */
void udp_lib_rehash(struct sock *sk, u16 newhash)
{
	if (sk_hashed(sk)) {
		struct udp_table *udptable = sk->sk_prot->h.udp_table;
		struct udp_hslot *hslot, *hslot2, *nhslot2;

		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
		nhslot2 = udp_hashslot2(udptable, newhash);
		udp_sk(sk)->udp_portaddr_hash = newhash;
2038 2039 2040

		if (hslot2 != nhslot2 ||
		    rcu_access_pointer(sk->sk_reuseport_cb)) {
			hslot = udp_hashslot(udptable, sock_net(sk),
					     udp_sk(sk)->udp_port_hash);
			/* we must lock primary chain too */
			spin_lock_bh(&hslot->lock);
			if (rcu_access_pointer(sk->sk_reuseport_cb))
				reuseport_detach_sock(sk);

			if (hslot2 != nhslot2) {
				spin_lock(&hslot2->lock);
2050
				hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
				hslot2->count--;
				spin_unlock(&hslot2->lock);

				spin_lock(&nhslot2->lock);
2055
				hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
							 &nhslot2->head);
				nhslot2->count++;
				spin_unlock(&nhslot2->lock);
			}

			spin_unlock_bh(&hslot->lock);
		}
	}
}
EXPORT_SYMBOL(udp_lib_rehash);

2067
void udp_v4_rehash(struct sock *sk)
E
Eric Dumazet 已提交
2068
{
2069
	u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
E
Eric Dumazet 已提交
2070 2071 2072 2073 2074
					  inet_sk(sk)->inet_rcv_saddr,
					  inet_sk(sk)->inet_num);
	udp_lib_rehash(sk, new_hash);
}

2075
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
H
Herbert Xu 已提交
2076
{
T
Tom Herbert 已提交
2077
	int rc;
E
Eric Dumazet 已提交
2078

2079
	if (inet_sk(sk)->inet_daddr) {
2080
		sock_rps_save_rxhash(sk, skb);
2081
		sk_mark_napi_id(sk, skb);
E
Eric Dumazet 已提交
2082
		sk_incoming_cpu_update(sk);
2083 2084
	} else {
		sk_mark_napi_id_once(sk, skb);
2085
	}
T
Tom Herbert 已提交
2086

2087
	rc = __udp_enqueue_schedule_skb(sk, skb);
E
Eric Dumazet 已提交
2088 2089
	if (rc < 0) {
		int is_udplite = IS_UDPLITE(sk);
2090
		int drop_reason;
H
Herbert Xu 已提交
2091 2092

		/* Note that an ENOMEM error is charged twice */
2093
		if (rc == -ENOMEM) {
2094
			UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
2095
					is_udplite);
2096 2097
			drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
		} else {
2098 2099
			UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS,
				      is_udplite);
2100 2101
			drop_reason = SKB_DROP_REASON_PROTO_MEM;
		}
2102
		UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
2103
		kfree_skb_reason(skb, drop_reason);
2104
		trace_udp_fail_queue_rcv_skb(rc, sk);
E
Eric Dumazet 已提交
2105
		return -1;
	}

	return 0;
}

/* returns:
 *  -1: error
 *   0: success
 *  >0: "udp encap" protocol resubmission
 *
 * Note that in the success and error cases, the skb is assumed to
 * have either been requeued or freed.
 */
2119
static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
2120
{
2121
	int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	struct udp_sock *up = udp_sk(sk);
	int is_udplite = IS_UDPLITE(sk);

	/*
	 *	Charge it to the socket, dropping if the queue is full.
	 */
2128 2129
	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2130
		goto drop;
2131
	}
2132
	nf_reset_ct(skb);
2133

2134
	if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
2135 2136
		int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);

		/*
		 * This is an encapsulation socket so pass the skb to
		 * the socket's udp_encap_rcv() hook. Otherwise, just
		 * fall through and pass this up the UDP socket.
		 * up->encap_rcv() returns the following value:
		 * =0 if skb was successfully passed to the encap
		 *    handler or was discarded by it.
		 * >0 if skb should be passed on to UDP.
		 * <0 if skb should be resubmitted as proto -N
		 */

		/* if we're overly short, let UDP handle it */
2149
		encap_rcv = READ_ONCE(up->encap_rcv);
2150
		if (encap_rcv) {
2151 2152
			int ret;

			/* Verify checksum before giving to encap */
			if (udp_lib_checksum_complete(skb))
				goto csum_error;

2157
			ret = encap_rcv(sk, skb);
2158
			if (ret <= 0) {
				__UDP_INC_STATS(sock_net(sk),
						UDP_MIB_INDATAGRAMS,
						is_udplite);
				return -ret;
			}
		}

		/* FALLTHROUGH -- it's a UDP Packet */
	}

	/*
	 * 	UDP-Lite specific tests, ignored on UDP sockets
	 */
2172
	if ((up->pcflag & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {

		/*
		 * MIB statistics other than incrementing the error count are
		 * disabled for the following two types of errors: these depend
		 * on the application settings, not on the functioning of the
		 * protocol stack as such.
		 *
		 * RFC 3828 here recommends (sec 3.3): "There should also be a
		 * way ... to ... at least let the receiving application block
		 * delivery of packets with coverage values less than a value
		 * provided by the application."
		 */
		if (up->pcrlen == 0) {          /* full coverage was set  */
2186 2187
			net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
					    UDP_SKB_CB(skb)->cscov, skb->len);
			goto drop;
		}
		/* The next case involves violating the min. coverage requested
		 * by the receiver. This is subtle: if receiver wants x and x is
		 * greater than the buffersize/MTU then receiver will complain
		 * that it wants x while sender emits packets of smaller size y.
		 * Therefore the above ...()->partial_cov statement is essential.
		 */
		if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) {
2197 2198
			net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
					    UDP_SKB_CB(skb)->cscov, up->pcrlen);
			goto drop;
		}
	}

2203
	prefetch(&sk->sk_rmem_alloc);
2204 2205
	if (rcu_access_pointer(sk->sk_filter) &&
	    udp_lib_checksum_complete(skb))
2206
			goto csum_error;
2207

2208 2209
	if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2210
		goto drop;
2211
	}
2212

2213
	udp_csum_pull_header(skb);
2214

2215
	ipv4_pktinfo_prepare(sk, skb);
2216
	return __udp_queue_rcv_skb(sk, skb);
2217

2218
csum_error:
2219
	drop_reason = SKB_DROP_REASON_UDP_CSUM;
2220
	__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
2221
drop:
2222
	__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
	atomic_inc(&sk->sk_drops);
	kfree_skb_reason(skb, drop_reason);
	return -1;
}

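/* In-kernel users normally install the encap_rcv hook consumed above through
 * the udp_tunnel helpers rather than writing udp_sk() fields directly.  A
 * minimal sketch, assuming the udp_tunnel_sock_cfg layout from
 * include/net/udp_tunnel.h; my_encap_rcv() and my_tunnel_setup() are
 * hypothetical:
 *
 *	static int my_encap_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		// return 0 if consumed, >0 to hand the skb back to UDP
 *		return 1;
 *	}
 *
 *	static void my_tunnel_setup(struct net *net, struct socket *sock)
 *	{
 *		struct udp_tunnel_sock_cfg cfg = {
 *			.encap_type = UDP_ENCAP_L2TPINUDP,
 *			.encap_rcv  = my_encap_rcv,
 *		};
 *
 *		setup_udp_tunnel_sock(net, sock, &cfg);
 *	}
 */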
static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff *next, *segs;
	int ret;

	if (likely(!udp_unexpected_gso(sk, skb)))
		return udp_queue_rcv_one_skb(sk, skb);

	BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET);
2237 2238
	__skb_push(skb, -skb_mac_offset(skb));
	segs = udp_rcv_segment(sk, skb, true);
2239
	skb_list_walk_safe(segs, skb, next) {
2240
		__skb_pull(skb, skb_transport_offset(skb));
2241 2242

		udp_post_segment_fix_csum(skb);
2243 2244
		ret = udp_queue_rcv_one_skb(sk, skb);
		if (ret > 0)
2245
			ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
	}
	return 0;
}

2250
/* For TCP sockets, sk_rx_dst is protected by socket lock
2251
 * For UDP, we use xchg() to guard against concurrent changes.
2252
 */
2253
bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
{
2255 2256
	struct dst_entry *old;

2257
	if (dst_hold_safe(dst)) {
2258
		old = xchg((__force struct dst_entry **)&sk->sk_rx_dst, dst);
2259
		dst_release(old);
2260
		return old != dst;
2261
	}
2262
	return false;
}
2264
EXPORT_SYMBOL(udp_sk_rx_dst_set);

/*
 *	Multicasts and broadcasts go to each listener.
 *
2269
 *	Note: called only from the BH handler context.
2270
 */
2271
static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
2272 2273
				    struct udphdr  *uh,
				    __be32 saddr, __be32 daddr,
2274 2275
				    struct udp_table *udptable,
				    int proto)
2276
{
2277
	struct sock *sk, *first = NULL;
2278 2279
	unsigned short hnum = ntohs(uh->dest);
	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
2280
	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
2281 2282
	unsigned int offset = offsetof(typeof(*sk), sk_node);
	int dif = skb->dev->ifindex;
2283
	int sdif = inet_sdif(skb);
2284 2285
	struct hlist_node *node;
	struct sk_buff *nskb;
2286 2287

	if (use_hash2) {
2288
		hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
2289
			    udptable->mask;
2290
		hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
2291
start_lookup:
2292
		hslot = &udptable->hash2[hash2];
2293 2294
		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
	}
2295

2296 2297
	sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
		if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
2298
					 uh->source, saddr, dif, sdif, hnum))
			continue;

		if (!first) {
			first = sk;
			continue;
2304
		}
2305
		nskb = skb_clone(skb, GFP_ATOMIC);
2306

2307 2308
		if (unlikely(!nskb)) {
			atomic_inc(&sk->sk_drops);
			__UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
					IS_UDPLITE(sk));
			__UDP_INC_STATS(net, UDP_MIB_INERRORS,
					IS_UDPLITE(sk));
			continue;
		}
		if (udp_queue_rcv_skb(sk, nskb) > 0)
			consume_skb(nskb);
	}
2318

	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
	if (use_hash2 && hash2 != hash2_any) {
		hash2 = hash2_any;
		goto start_lookup;
	}

	if (first) {
		if (udp_queue_rcv_skb(first, skb) > 0)
			consume_skb(skb);
2328
	} else {
2329
		kfree_skb(skb);
2330 2331
		__UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
				proto == IPPROTO_UDPLITE);
2332
	}
	return 0;
}

/* Initialize UDP checksum. If exited with zero value (success),
 * CHECKSUM_UNNECESSARY means, that no more checks are required.
 * Otherwise, csum completion requires checksumming packet body,
 * including udp header and folding it to skb->csum.
 */
static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
				 int proto)
{
	int err;

	UDP_SKB_CB(skb)->partial_cov = 0;
	UDP_SKB_CB(skb)->cscov = skb->len;

	if (proto == IPPROTO_UDPLITE) {
		err = udplite_checksum_init(skb, uh);
		if (err)
			return err;

		if (UDP_SKB_CB(skb)->partial_cov) {
			skb->csum = inet_compute_pseudo(skb, proto);
			return 0;
		}
2358 2359
	}

	/* Note, we are only interested in != 0 or == 0, thus the
	 * force to int.
	 */
	err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
							inet_compute_pseudo);
	if (err)
		return err;

	if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) {
		/* If SW calculated the value, we know it's bad */
		if (skb->csum_complete_sw)
			return 1;

		/* HW says the value is bad. Let's validate that.
		 * skb->csum is no longer the full packet checksum,
		 * so don't treat it as such.
		 */
		skb_checksum_complete_unset(skb);
	}

	return 0;
}

/* wrapper for udp_queue_rcv_skb taking care of csum conversion and
 * return code conversion for ip layer consumption
 */
static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
			       struct udphdr *uh)
{
	int ret;

	if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
2392
		skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);

	ret = udp_queue_rcv_skb(sk, skb);

	/* a return value > 0 means to resubmit the input, but
	 * it wants the return to be -protocol, or 0
	 */
	if (ret > 0)
		return -ret;
	return 0;
}

/*
 *	All we need to do is get the socket, and then do a checksum.
 */

2408
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
2409 2410 2411
		   int proto)
{
	struct sock *sk;
2412
	struct udphdr *uh;
2413
	unsigned short ulen;
E
Eric Dumazet 已提交
2414
	struct rtable *rt = skb_rtable(skb);
2415
	__be32 saddr, daddr;
2416
	struct net *net = dev_net(skb->dev);
2417
	bool refcounted;
	int drop_reason;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;

	/*
	 *  Validate the packet.
	 */
	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
		goto drop;		/* No space for header. */

2428
	uh   = udp_hdr(skb);
2429
	ulen = ntohs(uh->len);
	saddr = ip_hdr(skb)->saddr;
	daddr = ip_hdr(skb)->daddr;

	if (ulen > skb->len)
		goto short_packet;

	if (proto == IPPROTO_UDP) {
		/* UDP validates ulen. */
		if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
			goto short_packet;
		uh = udp_hdr(skb);
	}

	if (udp4_csum_init(skb, uh, proto))
		goto csum_error;

2446
	sk = skb_steal_sock(skb, &refcounted);
2447
	if (sk) {
2448
		struct dst_entry *dst = skb_dst(skb);
		int ret;

2451
		if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
2452
			udp_sk_rx_dst_set(sk, dst);
2453

2454
		ret = udp_unicast_rcv_skb(sk, skb, uh);
2455 2456
		if (refcounted)
			sock_put(sk);
2457
		return ret;
	}
2459

	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
		return __udp4_lib_mcast_deliver(net, skb, uh,
2462
						saddr, daddr, udptable, proto);

	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
2465 2466
	if (sk)
		return udp_unicast_rcv_skb(sk, skb, uh);

	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto drop;
2470
	nf_reset_ct(skb);

	/* No socket. Drop packet silently, if checksum is wrong */
	if (udp_lib_checksum_complete(skb))
		goto csum_error;

2476
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2477
	__UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	/*
	 * Hmm.  We got a UDP packet to a port to which we
	 * don't wanna listen.  Ignore it.
	 */
2484
	kfree_skb_reason(skb, drop_reason);
	return 0;

short_packet:
2488
	drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
	net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
			    proto == IPPROTO_UDPLITE ? "Lite" : "",
			    &saddr, ntohs(uh->source),
			    ulen, skb->len,
			    &daddr, ntohs(uh->dest));
	goto drop;

csum_error:
	/*
	 * RFC1122: OK.  Discards the bad packet silently (as far as
	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
	 */
2501
	drop_reason = SKB_DROP_REASON_UDP_CSUM;
	net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
			    proto == IPPROTO_UDPLITE ? "Lite" : "",
			    &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
			    ulen);
2506
	__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
2507
drop:
2508
	__UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
2509
	kfree_skb_reason(skb, drop_reason);
	return 0;
}

/* We can only early demux multicast if there is a single matching socket.
 * If more than one socket found returns NULL
 */
static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
						  __be16 loc_port, __be32 loc_addr,
						  __be16 rmt_port, __be32 rmt_addr,
2519
						  int dif, int sdif)
{
	struct sock *sk, *result;
	unsigned short hnum = ntohs(loc_port);
2523
	unsigned int slot = udp_hashfn(net, hnum, udp_table.mask);
S
Shawn Bohrer 已提交
2524 2525
	struct udp_hslot *hslot = &udp_table.hash[slot];

	/* Do not bother scanning a too big list */
	if (hslot->count > 10)
		return NULL;

	result = NULL;
2531 2532
	sk_for_each_rcu(sk, &hslot->head) {
		if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
2533
					rmt_port, rmt_addr, dif, sdif, hnum)) {
2534 2535
			if (result)
				return NULL;
			result = sk;
		}
	}
2539

	return result;
}

/* For unicast we should only early demux connected sockets or we can
 * break forwarding setups.  The chains here can be long so only check
 * if the first socket is an exact match and if not move on.
 */
static struct sock *__udp4_lib_demux_lookup(struct net *net,
					    __be16 loc_port, __be32 loc_addr,
					    __be16 rmt_port, __be32 rmt_addr,
2550
					    int dif, int sdif)
S
Shawn Bohrer 已提交
2551 2552
{
	unsigned short hnum = ntohs(loc_port);
2553
	unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
S
Shawn Bohrer 已提交
2554 2555
	unsigned int slot2 = hash2 & udp_table.mask;
	struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
2556
	INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
S
Shawn Bohrer 已提交
2557
	const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
2558
	struct sock *sk;
S
Shawn Bohrer 已提交
2559

2560
	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
E
Eric Dumazet 已提交
2561
		if (inet_match(net, sk, acookie, ports, dif, sdif))
2562
			return sk;
S
Shawn Bohrer 已提交
2563 2564 2565
		/* Only check first socket in chain */
		break;
	}
2566
	return NULL;
S
Shawn Bohrer 已提交
2567 2568
}

2569
int udp_v4_early_demux(struct sk_buff *skb)
S
Shawn Bohrer 已提交
2570
{
2571
	struct net *net = dev_net(skb->dev);
2572
	struct in_device *in_dev = NULL;
2573 2574
	const struct iphdr *iph;
	const struct udphdr *uh;
2575
	struct sock *sk = NULL;
S
Shawn Bohrer 已提交
2576 2577
	struct dst_entry *dst;
	int dif = skb->dev->ifindex;
2578
	int sdif = inet_sdif(skb);
2579
	int ours;
S
Shawn Bohrer 已提交
2580 2581 2582

	/* validate the packet */
	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
2583
		return 0;
S
Shawn Bohrer 已提交
2584

2585 2586 2587
	iph = ip_hdr(skb);
	uh = udp_hdr(skb);

P
Paolo Abeni 已提交
2588
	if (skb->pkt_type == PACKET_MULTICAST) {
2589
		in_dev = __in_dev_get_rcu(skb->dev);
2590 2591

		if (!in_dev)
2592
			return 0;
2593

P
Paolo Abeni 已提交
2594 2595 2596 2597
		ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
				       iph->protocol);
		if (!ours)
			return 0;
2598

S
Shawn Bohrer 已提交
2599
		sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
2600 2601
						   uh->source, iph->saddr,
						   dif, sdif);
2602
	} else if (skb->pkt_type == PACKET_HOST) {
S
Shawn Bohrer 已提交
2603
		sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
2604
					     uh->source, iph->saddr, dif, sdif);
2605
	}
S
Shawn Bohrer 已提交
2606

2607
	if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
2608
		return 0;
S
Shawn Bohrer 已提交
2609 2610

	skb->sk = sk;
2611
	skb->destructor = sock_efree;
2612
	dst = rcu_dereference(sk->sk_rx_dst);
S
Shawn Bohrer 已提交
2613 2614 2615

	if (dst)
		dst = dst_check(dst, 0);
2616
	if (dst) {
2617 2618
		u32 itag = 0;

		/* set noref for now.
		 * any place which wants to hold dst has to call
		 * dst_hold_safe()
		 */
		skb_dst_set_noref(skb, dst);

		/* for unconnected multicast sockets we need to validate
		 * the source on each packet
		 */
		if (!inet_sk(sk)->inet_daddr && in_dev)
			return ip_mc_validate_source(skb, iph->daddr,
2630 2631
						     iph->saddr,
						     iph->tos & IPTOS_RT_MASK,
2632
						     skb->dev, in_dev, &itag);
2633
	}
2634
	return 0;
S
Shawn Bohrer 已提交
2635 2636
}

2637 2638
int udp_rcv(struct sk_buff *skb)
{
2639
	return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
2640 2641
}

2642
void udp_destroy_sock(struct sock *sk)
2643
{
T
Tom Parkin 已提交
2644
	struct udp_sock *up = udp_sk(sk);
2645
	bool slow = lock_sock_fast(sk);
2646 2647 2648

	/* protects from races with udp_abort() */
	sock_set_flag(sk, SOCK_DEAD);
2649
	udp_flush_pending_frames(sk);
2650
	unlock_sock_fast(sk, slow);
	if (static_branch_unlikely(&udp_encap_needed_key)) {
		if (up->encap_type) {
			void (*encap_destroy)(struct sock *sk);
			encap_destroy = READ_ONCE(up->encap_destroy);
			if (encap_destroy)
				encap_destroy(sk);
		}
		if (up->encap_enabled)
P
Paolo Abeni 已提交
2659
			static_branch_dec(&udp_encap_needed_key);
T
Tom Parkin 已提交
2660
	}
2661 2662
}

L
 *	Socket option code for UDP
 */
2666
int udp_lib_setsockopt(struct sock *sk, int level, int optname,
2667
		       sockptr_t optval, unsigned int optlen,
2668
		       int (*push_pending_frames)(struct sock *))
L
Linus Torvalds 已提交
2669 2670
{
	struct udp_sock *up = udp_sk(sk);
2671
	int val, valbool;
L
Linus Torvalds 已提交
2672
	int err = 0;
W
Wang Chen 已提交
2673
	int is_udplite = IS_UDPLITE(sk);
L
Linus Torvalds 已提交
2674

E
Eric Dumazet 已提交
2675
	if (optlen < sizeof(int))
L
Linus Torvalds 已提交
2676 2677
		return -EINVAL;

2678
	if (copy_from_sockptr(&val, optval, sizeof(val)))
L
Linus Torvalds 已提交
2679 2680
		return -EFAULT;

2681 2682
	valbool = val ? 1 : 0;

2683
	switch (optname) {
L
Linus Torvalds 已提交
2684 2685
	case UDP_CORK:
		if (val != 0) {
2686
			WRITE_ONCE(up->corkflag, 1);
L
Linus Torvalds 已提交
2687
		} else {
2688
			WRITE_ONCE(up->corkflag, 0);
L
Linus Torvalds 已提交
2689
			lock_sock(sk);
2690
			push_pending_frames(sk);
L
Linus Torvalds 已提交
2691 2692 2693
			release_sock(sk);
		}
		break;
2694

L
Linus Torvalds 已提交
2695 2696 2697
	case UDP_ENCAP:
		switch (val) {
		case 0:
2698
#ifdef CONFIG_XFRM
L
Linus Torvalds 已提交
2699 2700
		case UDP_ENCAP_ESPINUDP:
		case UDP_ENCAP_ESPINUDP_NON_IKE:
2701 2702 2703 2704 2705 2706
#if IS_ENABLED(CONFIG_IPV6)
			if (sk->sk_family == AF_INET6)
				up->encap_rcv = ipv6_stub->xfrm6_udp_encap_rcv;
			else
#endif
				up->encap_rcv = xfrm4_udp_encap_rcv;
2707
#endif
J
Joe Perches 已提交
2708
			fallthrough;
2709
		case UDP_ENCAP_L2TPINUDP:
L
Linus Torvalds 已提交
2710
			up->encap_type = val;
2711 2712 2713
			lock_sock(sk);
			udp_tunnel_encap_enable(sk->sk_socket);
			release_sock(sk);
L
		default:
			err = -ENOPROTOOPT;
			break;
		}
		break;

2721 2722 2723 2724 2725 2726 2727 2728
	case UDP_NO_CHECK6_TX:
		up->no_check6_tx = valbool;
		break;

	case UDP_NO_CHECK6_RX:
		up->no_check6_rx = valbool;
		break;

2729 2730 2731
	case UDP_SEGMENT:
		if (val < 0 || val > USHRT_MAX)
			return -EINVAL;
2732
		WRITE_ONCE(up->gso_size, val);
2733 2734
		break;

2735 2736
	case UDP_GRO:
		lock_sock(sk);
2737 2738

		/* when enabling GRO, accept the related GSO packet type */
2739 2740 2741
		if (valbool)
			udp_tunnel_encap_enable(sk->sk_socket);
		up->gro_enabled = valbool;
2742
		up->accept_udp_l4 = valbool;
2743 2744 2745
		release_sock(sk);
		break;

2746 2747 2748 2749 2750 2751
	/*
	 * 	UDP-Lite's partial checksum coverage (RFC 3828).
	 */
	/* The sender sets actual checksum coverage length via this option.
	 * The case coverage > packet length is handled by send module. */
	case UDPLITE_SEND_CSCOV:
W
Wang Chen 已提交
2752
		if (!is_udplite)         /* Disable the option on UDP sockets */
2753 2754 2755
			return -ENOPROTOOPT;
		if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
			val = 8;
2756 2757
		else if (val > USHRT_MAX)
			val = USHRT_MAX;
2758 2759 2760 2761
		up->pcslen = val;
		up->pcflag |= UDPLITE_SEND_CC;
		break;

2762 2763
	/* The receiver specifies a minimum checksum coverage value. To make
	 * sense, this should be set to at least 8 (as done below). If zero is
2764 2765
	 * used, this again means full checksum coverage.                     */
	case UDPLITE_RECV_CSCOV:
W
Wang Chen 已提交
2766
		if (!is_udplite)         /* Disable the option on UDP sockets */
2767 2768 2769
			return -ENOPROTOOPT;
		if (val != 0 && val < 8) /* Avoid silly minimal values.       */
			val = 8;
2770 2771
		else if (val > USHRT_MAX)
			val = USHRT_MAX;
2772 2773 2774 2775
		up->pcrlen = val;
		up->pcflag |= UDPLITE_RECV_CC;
		break;

L
Linus Torvalds 已提交
2776 2777 2778
	default:
		err = -ENOPROTOOPT;
		break;
2779
	}
L
Linus Torvalds 已提交
2780 2781 2782

	return err;
}
E
L
2785 2786
int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
		   unsigned int optlen)
2787 2788
{
	if (level == SOL_UDP  ||  level == SOL_UDPLITE)
2789
		return udp_lib_setsockopt(sk, level, optname,
2790
					  optval, optlen,
2791 2792 2793 2794
					  udp_push_pending_frames);
	return ip_setsockopt(sk, level, optname, optval, optlen);
}

2795 2796
int udp_lib_getsockopt(struct sock *sk, int level, int optname,
		       char __user *optval, int __user *optlen)
L
Linus Torvalds 已提交
2797 2798 2799 2800
{
	struct udp_sock *up = udp_sk(sk);
	int val, len;

E
Eric Dumazet 已提交
2801
	if (get_user(len, optlen))
L
Linus Torvalds 已提交
2802 2803 2804
		return -EFAULT;

	len = min_t(unsigned int, len, sizeof(int));
2805

2806
	if (len < 0)
L
Linus Torvalds 已提交
2807 2808
		return -EINVAL;

2809
	switch (optname) {
L
Linus Torvalds 已提交
2810
	case UDP_CORK:
2811
		val = READ_ONCE(up->corkflag);
L

	case UDP_ENCAP:
		val = up->encap_type;
		break;

2818 2819 2820 2821 2822 2823 2824 2825
	case UDP_NO_CHECK6_TX:
		val = up->no_check6_tx;
		break;

	case UDP_NO_CHECK6_RX:
		val = up->no_check6_rx;
		break;

2826
	case UDP_SEGMENT:
2827
		val = READ_ONCE(up->gso_size);
2828 2829
		break;

2830 2831 2832 2833
	case UDP_GRO:
		val = up->gro_enabled;
		break;

2834 2835 2836 2837 2838 2839 2840 2841 2842 2843
	/* The following two cannot be changed on UDP sockets, the return is
	 * always 0 (which corresponds to the full checksum coverage of UDP). */
	case UDPLITE_SEND_CSCOV:
		val = up->pcslen;
		break;

	case UDPLITE_RECV_CSCOV:
		val = up->pcrlen;
		break;

L
Linus Torvalds 已提交
2844 2845
	default:
		return -ENOPROTOOPT;
2846
	}
L
Linus Torvalds 已提交
2847

2848
	if (put_user(len, optlen))
2849
		return -EFAULT;
E
Eric Dumazet 已提交
2850
	if (copy_to_user(optval, &val, len))
L
Linus Torvalds 已提交
2851
		return -EFAULT;
2852
	return 0;
L
Linus Torvalds 已提交
2853
}
E
L
2856 2857 2858 2859 2860 2861 2862 2863
int udp_getsockopt(struct sock *sk, int level, int optname,
		   char __user *optval, int __user *optlen)
{
	if (level == SOL_UDP  ||  level == SOL_UDPLITE)
		return udp_lib_getsockopt(sk, level, optname, optval, optlen);
	return ip_getsockopt(sk, level, optname, optval, optlen);
}

L
Linus Torvalds 已提交
2864 2865
/**
 * 	udp_poll - wait for a UDP event.
A
 *	@sock: - socket
 *	@wait: - poll table
L
2870
 *	This is same as datagram poll, except for the special case of
L
 *	blocking sockets. If an application is using a blocking fd
 *	and a packet with a checksum error is in the queue,
 *	then it could get a return from select indicating data available
 *	to work around these arguably broken applications.
 */
2877
__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
L
Linus Torvalds 已提交
2878
{
2879
	__poll_t mask = datagram_poll(file, sock, wait);
L
Linus Torvalds 已提交
2880
	struct sock *sk = sock->sk;
2881

2882
	if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
2883
		mask |= EPOLLIN | EPOLLRDNORM;
2884

L
Linus Torvalds 已提交
2885
	/* Check for false positives due to checksum errors */
2886
	if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
2887
	    !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
2888
		mask &= ~(EPOLLIN | EPOLLRDNORM);
L
Linus Torvalds 已提交
2889

2890 2891 2892
	/* psock ingress_msg queue should not contain any bad checksum frames */
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
L
Linus Torvalds 已提交
2893
	return mask;
2894

L
Linus Torvalds 已提交
2895
}
2896
EXPORT_SYMBOL(udp_poll);
L
2898 2899 2900 2901
int udp_abort(struct sock *sk, int err)
{
	lock_sock(sk);

2902 2903 2904 2905 2906 2907
	/* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
	 * with close()
	 */
	if (sock_flag(sk, SOCK_DEAD))
		goto out;

2908
	sk->sk_err = err;
2909
	sk_error_report(sk);
2910
	__udp_disconnect(sk, 0);
2911

2912
out:
	release_sock(sk);

	return 0;
}
EXPORT_SYMBOL_GPL(udp_abort);

2919
struct proto udp_prot = {
2920 2921 2922
	.name			= "UDP",
	.owner			= THIS_MODULE,
	.close			= udp_lib_close,
A
Andrey Ignatov 已提交
2923
	.pre_connect		= udp_pre_connect,
	.connect		= ip4_datagram_connect,
	.disconnect		= udp_disconnect,
	.ioctl			= udp_ioctl,
	.init			= udp_init_sock,
	.destroy		= udp_destroy_sock,
	.setsockopt		= udp_setsockopt,
	.getsockopt		= udp_getsockopt,
	.sendmsg		= udp_sendmsg,
	.recvmsg		= udp_recvmsg,
	.sendpage		= udp_sendpage,
	.release_cb		= ip4_datagram_release_cb,
	.hash			= udp_lib_hash,
	.unhash			= udp_lib_unhash,
	.rehash			= udp_v4_rehash,
	.get_port		= udp_v4_get_port,
2939
	.put_port		= udp_lib_unhash,
2940 2941 2942
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= udp_bpf_update_proto,
#endif
2943
	.memory_allocated	= &udp_memory_allocated,
2944 2945
	.per_cpu_fw_alloc	= &udp_memory_per_cpu_fw_alloc,

	.sysctl_mem		= sysctl_udp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_udp_wmem_min),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_udp_rmem_min),
	.obj_size		= sizeof(struct udp_sock),
	.h.udp_table		= &udp_table,
	.diag_destroy		= udp_abort,
2952
};
E
Eric Dumazet 已提交
2953
EXPORT_SYMBOL(udp_prot);
L
Linus Torvalds 已提交
2954 2955 2956 2957

/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS

2958
static struct sock *udp_get_first(struct seq_file *seq, int start)
L
Linus Torvalds 已提交
2959 2960
{
	struct sock *sk;
2961
	struct udp_seq_afinfo *afinfo;
L
Linus Torvalds 已提交
2962
	struct udp_iter_state *state = seq->private;
2963
	struct net *net = seq_file_net(seq);
L
Linus Torvalds 已提交
2964

2965 2966 2967
	if (state->bpf_seq_afinfo)
		afinfo = state->bpf_seq_afinfo;
	else
M
Muchun Song 已提交
2968
		afinfo = pde_data(file_inode(seq->file));
2969

2970
	for (state->bucket = start; state->bucket <= afinfo->udp_table->mask;
2971
	     ++state->bucket) {
2972
		struct udp_hslot *hslot = &afinfo->udp_table->hash[state->bucket];
2973

2974
		if (hlist_empty(&hslot->head))
2975 2976
			continue;

2977
		spin_lock_bh(&hslot->lock);
2978
		sk_for_each(sk, &hslot->head) {
2979
			if (!net_eq(sock_net(sk), net))
2980
				continue;
2981 2982
			if (afinfo->family == AF_UNSPEC ||
			    sk->sk_family == afinfo->family)
L
Linus Torvalds 已提交
2983 2984
				goto found;
		}
2985
		spin_unlock_bh(&hslot->lock);
	}
	sk = NULL;
found:
	return sk;
}

static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
{
2994
	struct udp_seq_afinfo *afinfo;
L
Linus Torvalds 已提交
2995
	struct udp_iter_state *state = seq->private;
2996
	struct net *net = seq_file_net(seq);
L
Linus Torvalds 已提交
2997

2998 2999 3000
	if (state->bpf_seq_afinfo)
		afinfo = state->bpf_seq_afinfo;
	else
M
Muchun Song 已提交
3001
		afinfo = pde_data(file_inode(seq->file));
3002

L
Linus Torvalds 已提交
3003
	do {
3004
		sk = sk_next(sk);
3005 3006 3007
	} while (sk && (!net_eq(sock_net(sk), net) ||
			(afinfo->family != AF_UNSPEC &&
			 sk->sk_family != afinfo->family)));
L
Linus Torvalds 已提交
3008

3009
	if (!sk) {
3010 3011
		if (state->bucket <= afinfo->udp_table->mask)
			spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
3012
		return udp_get_first(seq, state->bucket + 1);
	}
	return sk;
}

static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
{
3019
	struct sock *sk = udp_get_first(seq, 0);
L
Linus Torvalds 已提交
3020 3021

	if (sk)
3022
		while (pos && (sk = udp_get_next(seq, sk)) != NULL)
			--pos;
	return pos ? NULL : sk;
}

3027
void *udp_seq_start(struct seq_file *seq, loff_t *pos)
L
Linus Torvalds 已提交
3028
{
3029
	struct udp_iter_state *state = seq->private;
3030
	state->bucket = MAX_UDP_PORTS;
3031

3032
	return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
L
Linus Torvalds 已提交
3033
}
3034
EXPORT_SYMBOL(udp_seq_start);
L
Linus Torvalds 已提交
3035

3036
void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
L
Linus Torvalds 已提交
3037 3038 3039
{
	struct sock *sk;

3040
	if (v == SEQ_START_TOKEN)
		sk = udp_get_idx(seq, 0);
	else
		sk = udp_get_next(seq, v);

	++*pos;
	return sk;
}
3048
EXPORT_SYMBOL(udp_seq_next);
L
Linus Torvalds 已提交
3049

3050
void udp_seq_stop(struct seq_file *seq, void *v)
L
Linus Torvalds 已提交
3051
{
3052
	struct udp_seq_afinfo *afinfo;
3053 3054
	struct udp_iter_state *state = seq->private;

3055 3056 3057
	if (state->bpf_seq_afinfo)
		afinfo = state->bpf_seq_afinfo;
	else
M
Muchun Song 已提交
3058
		afinfo = pde_data(file_inode(seq->file));
3059

3060 3061
	if (state->bucket <= afinfo->udp_table->mask)
		spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
L
Linus Torvalds 已提交
3062
}
3063
EXPORT_SYMBOL(udp_seq_stop);
3064 3065

/* ------------------------------------------------------------------------ */
3066
static void udp4_format_sock(struct sock *sp, struct seq_file *f,
3067
		int bucket)
3068 3069
{
	struct inet_sock *inet = inet_sk(sp);
	__be32 dest = inet->inet_daddr;
	__be32 src  = inet->inet_rcv_saddr;
	__u16 destp	  = ntohs(inet->inet_dport);
	__u16 srcp	  = ntohs(inet->inet_sport);
3074

3075
	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
3076
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u",
3077
		bucket, src, srcp, dest, destp, sp->sk_state,
3078
		sk_wmem_alloc_get(sp),
3079
		udp_rqueue_get(sp),
3080 3081 3082
		0, 0L, 0,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
		0, sock_i_ino(sp),
3083
		refcount_read(&sp->sk_refcnt), sp,
3084
		atomic_read(&sp->sk_drops));
}

int udp4_seq_show(struct seq_file *seq, void *v)
{
3089
	seq_setwidth(seq, 127);
3090
	if (v == SEQ_START_TOKEN)
3091
		seq_puts(seq, "   sl  local_address rem_address   st tx_queue "
3092
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
E
Eric Dumazet 已提交
3093
			   "inode ref pointer drops");
3094 3095 3096
	else {
		struct udp_iter_state *state = seq->private;

3097
		udp4_format_sock(v, seq, state->bucket);
3098
	}
3099
	seq_pad(seq, '\n');
3100 3101 3102
	return 0;
}

#ifdef CONFIG_BPF_SYSCALL
struct bpf_iter__udp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct udp_sock *, udp_sk);
	uid_t uid __aligned(8);
	int bucket __aligned(8);
};

static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct udp_sock *udp_sk, uid_t uid, int bucket)
{
	struct bpf_iter__udp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.udp_sk = udp_sk;
	ctx.uid = uid;
	ctx.bucket = bucket;
	return bpf_iter_run_prog(prog, &ctx);
}

static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
{
	struct udp_iter_state *state = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;

	if (v == SEQ_START_TOKEN)
		return 0;

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	return udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
}

static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
	}

	udp_seq_stop(seq, v);
}

static const struct seq_operations bpf_iter_udp_seq_ops = {
	.start		= udp_seq_start,
	.next		= udp_seq_next,
	.stop		= bpf_iter_udp_seq_stop,
	.show		= bpf_iter_udp_seq_show,
};
#endif

3164
const struct seq_operations udp_seq_ops = {
	.start		= udp_seq_start,
	.next		= udp_seq_next,
	.stop		= udp_seq_stop,
	.show		= udp4_seq_show,
};
3170
EXPORT_SYMBOL(udp_seq_ops);
3171

3172 3173
static struct udp_seq_afinfo udp4_seq_afinfo = {
	.family		= AF_INET,
3174
	.udp_table	= &udp_table,
3175 3176
};

3177
static int __net_init udp4_proc_init_net(struct net *net)
3178
{
3179 3180
	if (!proc_create_net_data("udp", 0444, net->proc_net, &udp_seq_ops,
			sizeof(struct udp_iter_state), &udp4_seq_afinfo))
3181 3182
		return -ENOMEM;
	return 0;
3183 3184
}

3185
static void __net_exit udp4_proc_exit_net(struct net *net)
3186
{
3187
	remove_proc_entry("udp", net->proc_net);
}

static struct pernet_operations udp4_net_ops = {
	.init = udp4_proc_init_net,
	.exit = udp4_proc_exit_net,
};

3195 3196
int __init udp4_proc_init(void)
{
3197
	return register_pernet_subsys(&udp4_net_ops);
}

void udp4_proc_exit(void)
{
3202
	unregister_pernet_subsys(&udp4_net_ops);
3203
}
L
Linus Torvalds 已提交
3204 3205
#endif /* CONFIG_PROC_FS */

3206 3207
static __initdata unsigned long uhash_entries;
static int __init set_uhash_entries(char *str)
3208
{
3209 3210
	ssize_t ret;

3211 3212
	if (!str)
		return 0;

	ret = kstrtoul(str, 0, &uhash_entries);
	if (ret)
		return 0;

	if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
		uhash_entries = UDP_HTABLE_SIZE_MIN;
	return 1;
}
__setup("uhash_entries=", set_uhash_entries);
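/* The table sizing above can be pinned at boot time: for example (hypothetical
 * value), booting with "uhash_entries=65536" on the kernel command line
 * requests a 64k-slot table instead of the memory-scaled default, clamped to
 * at least UDP_HTABLE_SIZE_MIN slots by udp_table_init() below.
 */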

void __init udp_table_init(struct udp_table *table, const char *name)
{
	unsigned int i;

	table->hash = alloc_large_system_hash(name,
					      2 * sizeof(struct udp_hslot),
					      uhash_entries,
					      21, /* one slot per 2 MB */
					      0,
					      &table->log,
					      &table->mask,
					      UDP_HTABLE_SIZE_MIN,
					      64 * 1024);

3238
	table->hash2 = table->hash + (table->mask + 1);
3239
	for (i = 0; i <= table->mask; i++) {
3240
		INIT_HLIST_HEAD(&table->hash[i].head);
E
Eric Dumazet 已提交
3241
		table->hash[i].count = 0;
3242 3243
		spin_lock_init(&table->hash[i].lock);
	}
3244
	for (i = 0; i <= table->mask; i++) {
3245
		INIT_HLIST_HEAD(&table->hash2[i].head);
3246 3247 3248
		table->hash2[i].count = 0;
		spin_lock_init(&table->hash2[i].lock);
	}
3249 3250
}

u32 udp_flow_hashrnd(void)
{
	static u32 hashrnd __read_mostly;

	net_get_random_once(&hashrnd, sizeof(hashrnd));

	return hashrnd;
}
EXPORT_SYMBOL(udp_flow_hashrnd);

3261
static int __net_init udp_sysctl_init(struct net *net)
3262
{
3263 3264
	net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE;
	net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE;

#ifdef CONFIG_NET_L3_MASTER_DEV
	net->ipv4.sysctl_udp_l3mdev_accept = 0;
#endif

	return 0;
}

static struct pernet_operations __net_initdata udp_sysctl_ops = {
K
Kirill Tkhai 已提交
3274
	.init	= udp_sysctl_init,
3275 3276
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
		     struct udp_sock *udp_sk, uid_t uid, int bucket)

3281
static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct udp_iter_state *st = priv_data;
	struct udp_seq_afinfo *afinfo;
	int ret;

	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
	if (!afinfo)
		return -ENOMEM;

	afinfo->family = AF_UNSPEC;
	afinfo->udp_table = &udp_table;
	st->bpf_seq_afinfo = afinfo;
3294
	ret = bpf_iter_init_seq_net(priv_data, aux);
	if (ret)
		kfree(afinfo);
	return ret;
}

static void bpf_iter_fini_udp(void *priv_data)
{
	struct udp_iter_state *st = priv_data;

	kfree(st->bpf_seq_afinfo);
	bpf_iter_fini_seq_net(priv_data);
}

3308
static const struct bpf_iter_seq_info udp_seq_info = {
3309 3310 3311 3312
	.seq_ops		= &bpf_iter_udp_seq_ops,
	.init_seq_private	= bpf_iter_init_udp,
	.fini_seq_private	= bpf_iter_fini_udp,
	.seq_priv_size		= sizeof(struct udp_iter_state),
3313 3314 3315 3316
};

static struct bpf_iter_reg udp_reg_info = {
	.target			= "udp",
3317 3318 3319 3320 3321
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__udp, udp_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
3322
	.seq_info		= &udp_seq_info,
3323 3324 3325 3326
};

static void __init bpf_iter_register(void)
{
3327
	udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP];
	if (bpf_iter_reg_target(&udp_reg_info))
		pr_warn("Warning: could not register bpf iterator udp\n");
}
#endif

H
Hideo Aoki 已提交
3333 3334
void __init udp_init(void)
{
3335
	unsigned long limit;
E
Eric Dumazet 已提交
3336
	unsigned int i;
H
Hideo Aoki 已提交
3337

3338
	udp_table_init(&udp_table, "UDP");
3339
	limit = nr_free_buffer_pages() / 8;
	limit = max(limit, 128UL);
	sysctl_udp_mem[0] = limit / 4 * 3;
	sysctl_udp_mem[1] = limit;
	sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;

	/* 16 spinlocks per cpu */
	udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
	udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
				GFP_KERNEL);
	if (!udp_busylocks)
		panic("UDP: failed to alloc udp_busylocks\n");
	for (i = 0; i < (1U << udp_busylocks_log); i++)
		spin_lock_init(udp_busylocks + i);

	if (register_pernet_subsys(&udp_sysctl_ops))
		panic("UDP: failed to init sysctl parameters.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}