/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>
#include <net/tcp.h>
#include <net/sock_reuseport.h>

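/* Hash an IPv4 4-tuple into the established table.  The random secret
 * is initialized on first use and perturbed by net_hash_mix() so that
 * hash values differ between network namespaces.
 */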
static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
			const __u16 lport, const __be32 faddr,
			const __be16 fport)
{
	static u32 inet_ehash_secret __read_mostly;

	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

	return __inet_ehashfn(laddr, lport, faddr, fport,
			      inet_ehash_secret + net_hash_mix(net));
}

/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 */
static u32 sk_ehashfn(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		return inet6_ehashfn(sock_net(sk),
				     &sk->sk_v6_rcv_saddr, sk->sk_num,
				     &sk->sk_v6_daddr, sk->sk_dport);
#endif
	return inet_ehashfn(sock_net(sk),
			    sk->sk_rcv_saddr, sk->sk_num,
			    sk->sk_daddr, sk->sk_dport);
}

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum,
						 int l3mdev)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb) {
		write_pnet(&tb->ib_net, net);
		tb->l3mdev    = l3mdev;
		tb->port      = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	inet_sk(sk)->inet_num = snum;
	sk_add_bind_node(sk, &tb->owners);
	inet_csk(sk)->icsk_bind_hash = tb;
}

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

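/* Make the child socket inherit the listener's local port: look up (or,
 * per the tproxy note below, create) the bind bucket matching the
 * child's port and hash the child onto it.
 */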
int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;
	int l3mdev;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (unlikely(!tb)) {
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		l3mdev = inet_sk_bound_l3mdev(sk);

		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->l3mdev == l3mdev && tb->port == port)
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port,
						     l3mdev);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

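/* Pick the lhash2 bucket for @sk, keyed by netns, local address and
 * local port via ipv{4,6}_portaddr_hash().
 */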
static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(sock_net(sk),
					  &sk->sk_v6_rcv_saddr,
					  inet_sk(sk)->inet_num);
	else
#endif
		hash = ipv4_portaddr_hash(sock_net(sk),
					  inet_sk(sk)->inet_rcv_saddr,
					  inet_sk(sk)->inet_num);
	return inet_lhash2_bucket(h, hash);
}

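/* Link @sk into its lhash2 bucket; IPv6 reuseport listeners go to the
 * tail of the chain, everything else to the head.
 */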
static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
{
	struct inet_listen_hashbucket *ilb2;

	if (!h->lhash2)
		return;

	ilb2 = inet_lhash2_bucket_sk(h, sk);

	spin_lock(&ilb2->lock);
	if (sk->sk_reuseport && sk->sk_family == AF_INET6)
		hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
				   &ilb2->head);
	else
		hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
				   &ilb2->head);
	ilb2->count++;
	spin_unlock(&ilb2->lock);
}

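/* Remove @sk from its lhash2 bucket, if it was hashed there. */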
static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
{
	struct inet_listen_hashbucket *ilb2;

	if (!h->lhash2 ||
	    WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
		return;

	ilb2 = inet_lhash2_bucket_sk(h, sk);

	spin_lock(&ilb2->lock);
	hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
	ilb2->count--;
	spin_unlock(&ilb2->lock);
}

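/* Score a listening socket against an incoming IPv4 packet; -1 means
 * no match.  An exact-family (PF_INET) socket beats a v4-mapped IPv6
 * one, and a socket whose incoming CPU matches scores one more.
 */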
static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif, const int sdif, bool exact_dif)
{
	int score = -1;

	if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
			!ipv6_only_sock(sk)) {
		if (sk->sk_rcv_saddr != daddr)
			return -1;

		if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
			return -1;

		score = sk->sk_family == PF_INET ? 2 : 1;
		if (sk->sk_incoming_cpu == raw_smp_processor_id())
			score++;
	}
	return score;
}

/*
 * There are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */

/* called with rcu_read_lock() : No refcount taken on the socket */
static struct sock *inet_lhash2_lookup(struct net *net,
				struct inet_listen_hashbucket *ilb2,
				struct sk_buff *skb, int doff,
				const __be32 saddr, __be16 sport,
				const __be32 daddr, const unsigned short hnum,
				const int dif, const int sdif)
{
	bool exact_dif = inet_exact_dif_match(net, skb);
	struct inet_connection_sock *icsk;
	struct sock *sk, *result = NULL;
	int score, hiscore = 0;
	u32 phash = 0;

	inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
		sk = (struct sock *)icsk;
		score = compute_score(sk, net, hnum, daddr,
				      dif, sdif, exact_dif);
		if (score > hiscore) {
			if (sk->sk_reuseport) {
				phash = inet_ehashfn(net, daddr, hnum,
						     saddr, sport);
				result = reuseport_select_sock(sk, phash,
							       skb, doff);
				if (result)
					return result;
			}
			result = sk;
			hiscore = score;
		}
	}

	return result;
}

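/* Two-pass listener lookup: first the bucket for the exact destination
 * address, then the INADDR_ANY bucket for wildcard listeners.
 */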
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif, const int sdif)
{
	struct inet_listen_hashbucket *ilb2;
	struct sock *result = NULL;
	unsigned int hash2;

	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, daddr, hnum,
				    dif, sdif);
	if (result)
		goto done;

	/* Lookup lhash2 with INADDR_ANY */
	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, htonl(INADDR_ANY), hnum,
				    dif, sdif);
done:
	if (unlikely(IS_ERR(result)))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/* All sockets share common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
	if (!refcount_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);

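/* Established/timewait lookup: walk one RCU nulls chain, take a
 * reference on a match and re-check it, and restart if the terminating
 * nulls value shows the walk ended on another chain.
 */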
struct sock *__inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif, const int sdif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyway.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(INET_MATCH(sk, net, acookie,
				      saddr, daddr, ports, dif, sdif))) {
			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			if (unlikely(!INET_MATCH(sk, net, acookie,
						 saddr, daddr, ports,
						 dif, sdif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	struct net *net = sock_net(sk);
	int sdif = l3mdev_master_ifindex_by_index(net, dif);
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;

	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(INET_MATCH(sk2, net, acookie,
					 saddr, daddr, ports, dif, sdif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now. Otherwise we will see
	 * in the hash table a socket with a funny identity.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		sk_nulls_del_node_init_rcu((struct sock *)tw);
		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

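/* Per-destination offset that randomizes the ephemeral port search. */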
static u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

/* Insert a socket into ehash, and eventually remove another one
 * (the other one can be a SYN_RECV or TIMEWAIT socket).
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	struct inet_ehash_bucket *head;
	spinlock_t *lock;
	bool ret = true;

	WARN_ON_ONCE(!sk_unhashed(sk));

	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	if (osk) {
		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
		ret = sk_nulls_del_node_init_rcu(osk);
	}
	if (ret)
		__sk_nulls_add_node_rcu(sk, list);
	spin_unlock(lock);
	return ret;
}

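/* Insert @sk into ehash, replacing @osk if given; if the replacement
 * race is lost, @sk is closed and destroyed instead.
 */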
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
{
	bool ok = inet_ehash_insert(sk, osk);

	if (ok) {
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	} else {
		percpu_counter_inc(sk->sk_prot->orphan_count);
		inet_sk_set_state(sk, TCP_CLOSE);
		sock_set_flag(sk, SOCK_DEAD);
		inet_csk_destroy_sock(sk);
	}
	return ok;
}
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);

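/* Join an existing SO_REUSEPORT group on the same bind bucket (same
 * family, device, uid and local address), or allocate a new one.
 */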
static int inet_reuseport_add_sock(struct sock *sk,
				   struct inet_listen_hashbucket *ilb)
{
	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
	struct sock *sk2;
	kuid_t uid = sock_i_uid(sk);

	sk_for_each_rcu(sk2, &ilb->head) {
		if (sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
		    inet_csk(sk2)->icsk_bind_hash == tb &&
		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
		    inet_rcv_saddr_equal(sk, sk2, false))
			return reuseport_add_sock(sk, sk2,
						  inet_rcv_saddr_any(sk));
	}

	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}

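/* Hash @sk: non-listeners go straight to ehash; listeners are added to
 * both listening_hash and lhash2 under the listening bucket lock.
 */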
int __inet_hash(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;
	int err = 0;

	if (sk->sk_state != TCP_LISTEN) {
		inet_ehash_nolisten(sk, osk);
		return 0;
	}
	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	if (sk->sk_reuseport) {
		err = inet_reuseport_add_sock(sk, ilb);
		if (err)
			goto unlock;
	}
	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
		sk->sk_family == AF_INET6)
		hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
	else
		hlist_add_head_rcu(&sk->sk_node, &ilb->head);
	inet_hash2(hashinfo, sk);
	ilb->count++;
	sock_set_flag(sk, SOCK_RCU_FREE);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
	spin_unlock(&ilb->lock);

	return err;
}
EXPORT_SYMBOL(__inet_hash);

int inet_hash(struct sock *sk)
{
	int err = 0;

	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		err = __inet_hash(sk, NULL);
		local_bh_enable();
	}

	return err;
}
EXPORT_SYMBOL_GPL(inet_hash);

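/* Undo inet_hash(): detach @sk from its reuseport group, if any, and
 * unlink it from the listening or established table as appropriate.
 */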
void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb = NULL;
	spinlock_t *lock;

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN) {
		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
		lock = &ilb->lock;
	} else {
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
	}
	spin_lock_bh(lock);
	if (sk_unhashed(sk))
		goto unlock;

	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);
	if (ilb) {
		inet_unhash2(hashinfo, sk);
		__sk_del_node_init(sk);
		ilb->count--;
	} else {
		__sk_nulls_del_node_init_rcu(sk);
	}
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
unlock:
	spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);

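/* Find a local ephemeral port for a connect()ing socket and hash the
 * socket.  The search starts at a pseudo-random, per-destination offset
 * and first scans ports of @low's parity; inet_csk_get_port() makes the
 * opposite choice, which keeps the two allocators out of each other's
 * way.  Typical entry point is inet_hash_connect() below.
 */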
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_timewait_sock *tw = NULL;
	struct inet_bind_hashbucket *head;
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	u32 remaining, offset;
	int ret, i, low, high;
	static u32 hint;
	int l3mdev;

	if (port) {
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		tb = inet_csk(sk)->icsk_bind_hash;
		spin_lock_bh(&head->lock);
		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
			inet_ehash_nolisten(sk, NULL);
			spin_unlock_bh(&head->lock);
			return 0;
		}
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, port, NULL);
		local_bh_enable();
		return ret;
	}

	l3mdev = inet_sk_bound_l3mdev(sk);

	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	offset = (hint + port_offset) % remaining;
	/* In first pass we try ports of @low parity.
	 * inet_csk_get_port() does the opposite choice.
	 */
	offset &= ~1U;
other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);

		/* Does not bother with rcv_saddr checks, because
		 * the established check is already unique enough.
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
			    tb->port == port) {
				if (tb->fastreuse >= 0 ||
				    tb->fastreuseport >= 0)
					goto next_port;
				WARN_ON(hlist_empty(&tb->owners));
				if (!check_established(death_row, sk,
						       port, &tw))
					goto ok;
				goto next_port;
			}
		}

		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					     net, head, port, l3mdev);
		if (!tb) {
			spin_unlock_bh(&head->lock);
			return -ENOMEM;
		}
		tb->fastreuse = -1;
		tb->fastreuseport = -1;
		goto ok;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	offset++;
	if ((offset & 1) && remaining > 1)
		goto other_parity_scan;

	return -EADDRNOTAVAIL;

ok:
	hint += i + 2;

	/* Head lock still held and bh's disabled */
	inet_bind_hash(sk, tb, port);
	if (sk_unhashed(sk)) {
		inet_sk(sk)->inet_sport = htons(port);
		inet_ehash_nolisten(sk, (struct sock *)tw);
	}
	if (tw)
		inet_twsk_bind_unhash(tw, hinfo);
	spin_unlock(&head->lock);
	if (tw)
		inet_twsk_deschedule_put(tw);
	local_bh_enable();
	return 0;
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	u32 port_offset = 0;

	if (!inet_sk(sk)->inet_num)
		port_offset = inet_sk_port_offset(sk);
	return __inet_hash_connect(death_row, sk, port_offset,
				   __inet_check_established);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

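/* Initialize the fixed-size listening hash; lhash2 is allocated later
 * by inet_hashinfo2_init().
 */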
void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
		spin_lock_init(&h->listening_hash[i].lock);
		INIT_HLIST_HEAD(&h->listening_hash[i].head);
		h->listening_hash[i].count = 0;
	}

	h->lhash2 = NULL;
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);

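/* Allocate and initialize the second listener hash (lhash2) at boot
 * (e.g. from tcp_init()), sized by alloc_large_system_hash() from
 * @numentries and the given limits.
 */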
void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
				unsigned long numentries, int scale,
				unsigned long low_limit,
				unsigned long high_limit)
{
	unsigned int i;

	h->lhash2 = alloc_large_system_hash(name,
					    sizeof(*h->lhash2),
					    numentries,
					    scale,
					    0,
					    NULL,
					    &h->lhash2_mask,
					    low_limit,
					    high_limit);

	for (i = 0; i <= h->lhash2_mask; i++) {
		spin_lock_init(&h->lhash2[i].lock);
		INIT_HLIST_HEAD(&h->lhash2[i].head);
		h->lhash2[i].count = 0;
	}
}

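/* Size the ehash lock array: about two cache lines worth of spinlocks
 * per possible CPU, rounded up to a power of two and capped at one lock
 * per hash bucket.
 */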
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
	unsigned int locksz = sizeof(spinlock_t);
	unsigned int i, nblocks = 1;

	if (locksz != 0) {
		/* allocate 2 cache lines or at least one spinlock per cpu */
		nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
		nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());

		/* no more locks than number of hash buckets */
		nblocks = min(nblocks, hashinfo->ehash_mask + 1);

		hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
		if (!hashinfo->ehash_locks)
			return -ENOMEM;

		for (i = 0; i < nblocks; i++)
			spin_lock_init(&hashinfo->ehash_locks[i]);
	}
	hashinfo->ehash_locks_mask = nblocks - 1;
	return 0;
}
EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);