/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>
#include <net/sock_reuseport.h>

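/* Established-table hash for an IPv4 four-tuple.  The per-boot secret is
 * initialized lazily on first use and mixed with the netns hash so that
 * chain layout is not predictable across network namespaces.
 */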
static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
			const __u16 lport, const __be32 faddr,
			const __be16 fport)
{
	static u32 inet_ehash_secret __read_mostly;

	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

	return __inet_ehashfn(laddr, lport, faddr, fport,
			      inet_ehash_secret + net_hash_mix(net));
}

/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 */
u32 sk_ehashfn(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		return inet6_ehashfn(sock_net(sk),
				     &sk->sk_v6_rcv_saddr, sk->sk_num,
				     &sk->sk_v6_daddr, sk->sk_dport);
#endif
	return inet_ehashfn(sock_net(sk),
			    sk->sk_rcv_saddr, sk->sk_num,
			    sk->sk_daddr, sk->sk_dport);
}

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb) {
		write_pnet(&tb->ib_net, net);
		tb->port      = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		tb->num_owners = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

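/* Record that @sk owns local port @snum: add it to the bucket's owner
 * list and cache the bucket in icsk_bind_hash for later release.
 */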
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	inet_sk(sk)->inet_num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tb->num_owners++;
	inet_csk(sk)->icsk_bind_hash = tb;
}

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	tb->num_owners--;
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (unlikely(!tb)) {
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->port == port)
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

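/* Score a listening socket against an incoming packet.  A socket in the
 * right netns, bound to the right port and not ipv6-only starts at 1
 * (2 for a true AF_INET socket); an exact local address match and a
 * matching bound device each add 4, and a CPU affinity hit adds 1.
 * -1 means "does not match".
 */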
static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif)
{
	int score = -1;
	struct inet_sock *inet = inet_sk(sk);

	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
			!ipv6_only_sock(sk)) {
		__be32 rcv_saddr = inet->inet_rcv_saddr;

		score = sk->sk_family == PF_INET ? 2 : 1;
		if (rcv_saddr) {
			if (rcv_saddr != daddr)
				return -1;
			score += 4;
		}
		if (sk->sk_bound_dev_if) {
			if (sk->sk_bound_dev_if != dif)
				return -1;
			score += 4;
		}
		if (sk->sk_incoming_cpu == raw_smp_processor_id())
			score++;
	}
	return score;
}

/*
 * Don't inline this cruft. There are some nice properties to exploit here.
 * The BSD API does not allow a listening sock to specify the remote port nor
 * the remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	int score, hiscore, matches = 0, reuseport = 0;
	bool select_ok = true;
	u32 phash = 0;

begin:
	result = NULL;
	hiscore = 0;
	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr, dif);
		if (score > hiscore) {
			result = sk;
			hiscore = score;
			reuseport = sk->sk_reuseport;
			if (reuseport) {
				phash = inet_ehashfn(net, daddr, hnum,
						     saddr, sport);
				if (select_ok) {
					struct sock *sk2;

					sk2 = reuseport_select_sock(sk, phash,
								    skb, doff);
					if (sk2) {
						result = sk2;
						goto found;
					}
				}
				matches = 1;
			}
		} else if (score == hiscore && reuseport) {
			matches++;
			if (reciprocal_scale(phash, matches) == 0)
				result = sk;
			phash = next_pseudo_random32(phash);
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
		goto begin;
	if (result) {
found:
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		else if (unlikely(compute_score(result, net, hnum, daddr,
				  dif) < hiscore)) {
			sock_put(result);
			select_ok = false;
			goto begin;
		}
	}
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/* All sockets share common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
	if (!atomic_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);

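/* RCU lookup in the established hash.  Chains are nulls-terminated: if a
 * socket moves to another chain while we walk, the final nulls value
 * will not match this slot and the lookup is restarted.
 */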
struct sock *__inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(INET_MATCH(sk, net, acookie,
				      saddr, daddr, ports, dif))) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			if (unlikely(!INET_MATCH(sk, net, acookie,
						 saddr, daddr, ports, dif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;

	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(INET_MATCH(sk2, net, acookie,
				      saddr, daddr, ports, dif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now. Otherwise we will see in the
	 * hash table a socket with a funny identity.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		sk_nulls_del_node_init_rcu((struct sock *)tw);
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

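/* Starting offset into the ephemeral port space, keyed on the
 * connection's addresses and destination port so that different peers
 * probe ports in different orders.
 */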
static u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

/* Insert a socket into ehash, and eventually remove another one.
 * (The other one can be a SYN_RECV or TIMEWAIT socket.)
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	struct inet_ehash_bucket *head;
	spinlock_t *lock;
	bool ret = true;

	WARN_ON_ONCE(!sk_unhashed(sk));

	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	if (osk) {
		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
		ret = sk_nulls_del_node_init_rcu(osk);
	}
	if (ret)
		__sk_nulls_add_node_rcu(sk, list);
	spin_unlock(lock);
	return ret;
}

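/* Hash a non-listening socket.  On success the protocol inuse count is
 * bumped; if @osk was already unhashed by another CPU (we lost the
 * race), the new socket is marked dead and destroyed instead.
 */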
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
{
	bool ok = inet_ehash_insert(sk, osk);

	if (ok) {
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	} else {
		percpu_counter_inc(sk->sk_prot->orphan_count);
		sk->sk_state = TCP_CLOSE;
		sock_set_flag(sk, SOCK_DEAD);
		inet_csk_destroy_sock(sk);
	}
	return ok;
}
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);

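/* Attach @sk to the reuseport group of an existing listener matching on
 * family, ipv6-only flag, bound device, owning uid and local address,
 * or allocate a fresh group if none exists yet.
 */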
static int inet_reuseport_add_sock(struct sock *sk,
				   struct inet_listen_hashbucket *ilb,
				   int (*saddr_same)(const struct sock *sk1,
						     const struct sock *sk2,
						     bool match_wildcard))
{
	struct sock *sk2;
	struct hlist_nulls_node *node;
	kuid_t uid = sock_i_uid(sk);

	sk_nulls_for_each_rcu(sk2, node, &ilb->head) {
		if (sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
		    saddr_same(sk, sk2, false))
			return reuseport_add_sock(sk, sk2);
	}

	/* Initial allocation may have already happened via setsockopt */
	if (!rcu_access_pointer(sk->sk_reuseport_cb))
		return reuseport_alloc(sk);
	return 0;
}

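/* Hash @sk with local BH disabled by the caller.  Non-listening sockets
 * go straight to the established hash; listeners join their reuseport
 * group (if any) before being added to the listening hash.
 */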
int __inet_hash(struct sock *sk, struct sock *osk,
		 int (*saddr_same)(const struct sock *sk1,
				   const struct sock *sk2,
				   bool match_wildcard))
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;
	int err = 0;

	if (sk->sk_state != TCP_LISTEN) {
		inet_ehash_nolisten(sk, osk);
		return 0;
	}
	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	if (sk->sk_reuseport) {
		err = inet_reuseport_add_sock(sk, ilb, saddr_same);
		if (err)
			goto unlock;
	}
	__sk_nulls_add_node_rcu(sk, &ilb->head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
	spin_unlock(&ilb->lock);

	return err;
}
EXPORT_SYMBOL(__inet_hash);

int inet_hash(struct sock *sk)
{
	int err = 0;

	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
		local_bh_enable();
	}

	return err;
}
EXPORT_SYMBOL_GPL(inet_hash);

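/* Remove @sk from whichever table it is hashed in, listening or
 * established, detaching it from its reuseport group first.
 */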
void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	spinlock_t *lock;
	int done;

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN)
		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
	else
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock_bh(lock);
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);
	done = __sk_nulls_del_node_init_rcu(sk);
	if (done)
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);

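/* Find and take an ephemeral port for a connecting socket.  Ports of
 * one parity are scanned first (inet_csk_get_port() prefers the other),
 * which reduces collisions between connect() and bind()/listen() users;
 * each candidate is validated by @check_established against the
 * established hash for four-tuple uniqueness.
 */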
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_timewait_sock *tw = NULL;
	struct inet_bind_hashbucket *head;
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	u32 remaining, offset;
	int ret, i, low, high;
	static u32 hint;

	if (port) {
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		tb = inet_csk(sk)->icsk_bind_hash;
		spin_lock_bh(&head->lock);
		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
			inet_ehash_nolisten(sk, NULL);
			spin_unlock_bh(&head->lock);
			return 0;
		}
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, port, NULL);
		local_bh_enable();
		return ret;
	}

	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	offset = (hint + port_offset) % remaining;
	/* In first pass we try ports of @low parity.
	 * inet_csk_get_port() does the opposite choice.
	 */
	offset &= ~1U;
other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);

		/* Does not bother with rcv_saddr checks, because
		 * the established check is already unique enough.
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), net) && tb->port == port) {
				if (tb->fastreuse >= 0 ||
				    tb->fastreuseport >= 0)
					goto next_port;
				WARN_ON(hlist_empty(&tb->owners));
				if (!check_established(death_row, sk,
						       port, &tw))
					goto ok;
				goto next_port;
			}
		}

		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					     net, head, port);
		if (!tb) {
			spin_unlock_bh(&head->lock);
			return -ENOMEM;
		}
		tb->fastreuse = -1;
		tb->fastreuseport = -1;
		goto ok;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	offset++;
	if ((offset & 1) && remaining > 1)
		goto other_parity_scan;

	return -EADDRNOTAVAIL;

ok:
	hint += i + 2;

	/* Head lock still held and bh's disabled */
	inet_bind_hash(sk, tb, port);
	if (sk_unhashed(sk)) {
		inet_sk(sk)->inet_sport = htons(port);
		inet_ehash_nolisten(sk, (struct sock *)tw);
	}
	if (tw)
		inet_twsk_bind_unhash(tw, hinfo);
	spin_unlock(&head->lock);
	if (tw)
		inet_twsk_deschedule_put(tw);
	local_bh_enable();
	return 0;
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	u32 port_offset = 0;

	if (!inet_sk(sk)->inet_num)
		port_offset = inet_sk_port_offset(sk);
	return __inet_hash_connect(death_row, sk, port_offset,
				   __inet_check_established);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

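/* Set up the listening hash: one lock per bucket, and a distinct nulls
 * value per chain so lockless walkers can detect a chain switch.
 */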
void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
		spin_lock_init(&h->listening_hash[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
				      i + LISTENING_NULLS_BASE);
	}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);

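/* Size and allocate the ehash chain locks: roughly two cache lines of
 * spinlocks per possible CPU, rounded up to a power of two and capped
 * at one lock per bucket, with a vmalloc() fallback for large arrays.
 */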
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
	unsigned int locksz = sizeof(spinlock_t);
	unsigned int i, nblocks = 1;

	if (locksz != 0) {
		/* allocate 2 cache lines or at least one spinlock per cpu */
		nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
		nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());

		/* no more locks than number of hash buckets */
		nblocks = min(nblocks, hashinfo->ehash_mask + 1);

		hashinfo->ehash_locks = kmalloc_array(nblocks, locksz,
						      GFP_KERNEL | __GFP_NOWARN);
		if (!hashinfo->ehash_locks)
			hashinfo->ehash_locks = vmalloc(nblocks * locksz);

		if (!hashinfo->ehash_locks)
			return -ENOMEM;

		for (i = 0; i < nblocks; i++)
			spin_lock_init(&hashinfo->ehash_locks[i]);
	}
	hashinfo->ehash_locks_mask = nblocks - 1;
	return 0;
}
EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);