/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/ip.h>
#include <net/secure_seq.h>
#include <net/sock_reuseport.h>
30 31 32
static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
			const __u16 lport, const __be32 faddr,
			const __be16 fport)
33
{
34 35 36 37
	static u32 inet_ehash_secret __read_mostly;

	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

38 39 40 41
	return __inet_ehashfn(laddr, lport, faddr, fport,
			      inet_ehash_secret + net_hash_mix(net));
}

42 43 44
/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 */
45
u32 sk_ehashfn(const struct sock *sk)
46
{
47 48 49 50 51 52 53
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		return inet6_ehashfn(sock_net(sk),
				     &sk->sk_v6_rcv_saddr, sk->sk_num,
				     &sk->sk_v6_daddr, sk->sk_dport);
#endif
54 55 56
	return inet_ehashfn(sock_net(sk),
			    sk->sk_rcv_saddr, sk->sk_num,
			    sk->sk_daddr, sk->sk_dport);
57 58
}

59 60 61 62
/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
63
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
64
						 struct net *net,
65 66 67
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
68
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
69

70
	if (tb) {
71
		write_pnet(&tb->ib_net, net);
72 73
		tb->port      = snum;
		tb->fastreuse = 0;
74
		tb->fastreuseport = 0;
75
		tb->num_owners = 0;
76 77 78 79 80 81 82 83 84
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
85
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
86 87 88 89 90 91
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}
92 93 94 95

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
E
Eric Dumazet 已提交
96
	inet_sk(sk)->inet_num = snum;
97
	sk_add_bind_node(sk, &tb->owners);
98
	tb->num_owners++;
99
	inet_csk(sk)->icsk_bind_hash = tb;
100 101 102 103 104
}

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	/* Caller has BHs disabled (see inet_put_port), so a plain spin_lock
	 * is sufficient here.
	 */
	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	tb->num_owners--;
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	/* Frees the bucket if this socket was its last owner. */
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

/* Public wrapper: release sk's local port with BHs disabled, as required
 * by the bind hash locking scheme.
 */
void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);
/* Make @child share the bind bucket that corresponds to its local port,
 * normally the listener's own bucket.  Returns 0 on success, -ENOENT if
 * the listener has no bucket, -ENOMEM if a new bucket was needed but
 * could not be allocated.
 */
int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (unlikely(!tb)) {
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->port == port)
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

173 174 175 176 177 178 179
static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif)
{
	int score = -1;
	struct inet_sock *inet = inet_sk(sk);

E
Eric Dumazet 已提交
180
	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
181
			!ipv6_only_sock(sk)) {
E
Eric Dumazet 已提交
182
		__be32 rcv_saddr = inet->inet_rcv_saddr;
183
		score = sk->sk_family == PF_INET ? 2 : 1;
184 185 186
		if (rcv_saddr) {
			if (rcv_saddr != daddr)
				return -1;
187
			score += 4;
188 189 190 191
		}
		if (sk->sk_bound_dev_if) {
			if (sk->sk_bound_dev_if != dif)
				return -1;
192
			score += 4;
193
		}
194 195
		if (sk->sk_incoming_cpu == raw_smp_processor_id())
			score++;
196 197 198 199
	}
	return score;
}

/*
 * Don't inline this cruft. Here are some nice properties to exploit here. The
 * BSD API does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 *
 * Lockless lookup: walks the listening hash chain under RCU, picking the
 * best-scoring socket (see compute_score).  SO_REUSEPORT groups are
 * resolved either via reuseport_select_sock() or, as a fallback, by a
 * pseudo-random pick among equal-score members.  On success the returned
 * socket holds an extra reference.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	int score, hiscore, matches = 0, reuseport = 0;
	bool select_ok = true;
	u32 phash = 0;

	rcu_read_lock();
begin:
	result = NULL;
	hiscore = 0;
	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr, dif);
		if (score > hiscore) {
			result = sk;
			hiscore = score;
			reuseport = sk->sk_reuseport;
			if (reuseport) {
				/* Flow hash decides which reuseport group
				 * member receives this connection.
				 */
				phash = inet_ehashfn(net, daddr, hnum,
						     saddr, sport);
				if (select_ok) {
					struct sock *sk2;
					sk2 = reuseport_select_sock(sk, phash,
								    skb, doff);
					if (sk2) {
						result = sk2;
						goto found;
					}
				}
				matches = 1;
			}
		} else if (score == hiscore && reuseport) {
			/* Equal-score reuseport member: reservoir-style
			 * pseudo-random selection among the group.
			 */
			matches++;
			if (reciprocal_scale(phash, matches) == 0)
				result = sk;
			phash = next_pseudo_random32(phash);
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
		goto begin;
	if (result) {
found:
		/* Socket may be dying; only keep it if we can still take
		 * a reference, then re-verify it still matches (it could
		 * have been reused for something else while unlocked).
		 */
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		else if (unlikely(compute_score(result, net, hnum, daddr,
				  dif) < hiscore)) {
			sock_put(result);
			/* Don't trust reuseport_select_sock() again after
			 * an inconsistency; fall back to plain scoring.
			 */
			select_ok = false;
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
E
Eric Dumazet 已提交
277 278 279 280 281 282 283 284
/* All sockets share common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
	if (!atomic_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
285 286
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
E
Eric Dumazet 已提交
287 288 289 290 291
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

/* skb destructor used by early demux: releases the socket reference
 * stashed in skb->sk via the state-aware sock_gen_put().
 */
void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);

/* Lockless (RCU) lookup in the established hash table for an exact
 * 4-tuple match.  On success the returned socket carries an extra
 * reference taken with atomic_inc_not_zero(); returns NULL when no
 * live match exists.
 */
struct sock *__inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

	rcu_read_lock();
begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		/* Cheap full-hash filter before the full tuple compare. */
		if (sk->sk_hash != hash)
			continue;
		if (likely(INET_MATCH(sk, net, acookie,
				      saddr, daddr, ports, dif))) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			/* Re-check after taking the reference: the socket
			 * could have been recycled for another tuple while
			 * we were not holding any lock.
			 */
			if (unlikely(!INET_MATCH(sk, net, acookie,
						 saddr, daddr, ports, dif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	rcu_read_unlock();
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled
 *
 * Verify that the 4-tuple sk wants to use (local port @lport) is not
 * already taken in the established table, and if unique, hash sk there
 * under the ehash chain lock.  A conflicting TIME_WAIT socket may be
 * recycled (twsk_unique); it is either handed back via *twp for the
 * caller to dispose of, or descheduled here.  Returns 0 on success,
 * -EADDRNOTAVAIL if the tuple is in use.
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	/* Note the swap: our local addr is the peer's destination. */
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;

	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(INET_MATCH(sk2, net, acookie,
					 saddr, daddr, ports, dif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				/* TIME_WAIT slot may be safely reused. */
				if (twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		/* Evict the recycled TIME_WAIT entry from the chain. */
		sk_nulls_del_node_init_rcu((struct sock *)tw);
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

413
static u32 inet_sk_port_offset(const struct sock *sk)
414 415
{
	const struct inet_sock *inet = inet_sk(sk);
416

E
Eric Dumazet 已提交
417 418 419
	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
420 421
}

/* insert a socket into ehash, and eventually remove another one
 * (The another one can be a SYN_RECV or TIMEWAIT
 *
 * Returns true if sk was inserted.  When @osk is given, sk only goes in
 * if osk was successfully unlinked from the same chain, making the
 * replacement atomic with respect to lookups.
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	struct inet_ehash_bucket *head;
	spinlock_t *lock;
	bool ret = true;

	WARN_ON_ONCE(!sk_unhashed(sk));

	/* Hash must be computed before picking the bucket/lock below. */
	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	if (osk) {
		/* sk replaces osk; both must live on the same chain. */
		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
		ret = sk_nulls_del_node_init_rcu(osk);
	}
	if (ret)
		__sk_nulls_add_node_rcu(sk, list);
	spin_unlock(lock);
	return ret;
}

451
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
452
{
453 454 455 456 457 458 459 460 461 462 463
	bool ok = inet_ehash_insert(sk, osk);

	if (ok) {
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	} else {
		percpu_counter_inc(sk->sk_prot->orphan_count);
		sk->sk_state = TCP_CLOSE;
		sock_set_flag(sk, SOCK_DEAD);
		inet_csk_destroy_sock(sk);
	}
	return ok;
464
}
465
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
466

/* Attach @sk to an existing SO_REUSEPORT group on this listening bucket,
 * or allocate a fresh group if none matches.  A match requires same
 * family, v6-only flag, bound device, owning uid, and (per @saddr_same)
 * same bound local address.  Called with the bucket lock held.
 */
static int inet_reuseport_add_sock(struct sock *sk,
				   struct inet_listen_hashbucket *ilb,
				   int (*saddr_same)(const struct sock *sk1,
						     const struct sock *sk2,
						     bool match_wildcard))
{
	struct sock *sk2;
	struct hlist_nulls_node *node;
	kuid_t uid = sock_i_uid(sk);

	sk_nulls_for_each_rcu(sk2, node, &ilb->head) {
		if (sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
		    saddr_same(sk, sk2, false))
			return reuseport_add_sock(sk, sk2);
	}

	/* Initial allocation may have already happened via setsockopt */
	if (!rcu_access_pointer(sk->sk_reuseport_cb))
		return reuseport_alloc(sk);
	return 0;
}

/* Hash @sk into the proper table: non-listeners go straight to ehash
 * (possibly replacing @osk); listeners go into the listening hash under
 * its bucket lock, joining a SO_REUSEPORT group first when enabled.
 * Caller must have BHs disabled.  Returns 0 or a reuseport error.
 */
int __inet_hash(struct sock *sk, struct sock *osk,
		 int (*saddr_same)(const struct sock *sk1,
				   const struct sock *sk2,
				   bool match_wildcard))
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;
	int err = 0;

	if (sk->sk_state != TCP_LISTEN) {
		inet_ehash_nolisten(sk, osk);
		return 0;
	}
	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	if (sk->sk_reuseport) {
		/* Must join (or create) the reuseport group before the
		 * socket becomes visible to lookups.
		 */
		err = inet_reuseport_add_sock(sk, ilb, saddr_same);
		if (err)
			goto unlock;
	}
	__sk_nulls_add_node_rcu(sk, &ilb->head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
	spin_unlock(&ilb->lock);

	return err;
}
EXPORT_SYMBOL(__inet_hash);
524
int inet_hash(struct sock *sk)
525
{
526 527
	int err = 0;

528 529
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
530
		err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
531 532
		local_bh_enable();
	}
533

534
	return err;
535 536 537 538 539
}
EXPORT_SYMBOL_GPL(inet_hash);

/* Remove @sk from whichever hash table it is in (listening or ehash),
 * detaching it from its SO_REUSEPORT group first, and decrement the
 * protocol inuse counter if it was actually unlinked.
 */
void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	spinlock_t *lock;
	int done;

	if (sk_unhashed(sk))
		return;

	/* Pick the lock matching the table the socket lives in. */
	if (sk->sk_state == TCP_LISTEN)
		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
	else
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock_bh(lock);
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);
	done = __sk_nulls_del_node_init_rcu(sk);
	if (done)
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);

/* Bind @sk to a local port for an outgoing connection and hash it into
 * ehash.  If the socket already has a port (snum != 0) we only need to
 * confirm the 4-tuple is unique; otherwise we search the ephemeral port
 * range, starting at a per-connection @port_offset, for a port whose
 * 4-tuple passes @check_established.  A conflicting TIME_WAIT socket
 * may be recycled along the way.  Returns 0 or a negative errno.
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->inet_num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);

	if (!snum) {
		int i, remaining, low, high, port;
		/* Rotates the search start so successive connects don't
		 * all hammer the same ports.
		 */
		static u32 hint;
		u32 offset = hint + port_offset;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(net, &low, &high);
		remaining = (high - low) + 1;

		/* By starting with offset being an even number,
		 * we tend to leave about 50% of ports for other uses,
		 * like bind(0).
		 */
		offset &= ~1;

		local_bh_disable();
		for (i = 0; i < remaining; i++) {
			port = low + (i + offset) % remaining;
			if (inet_is_local_reserved_port(net, port))
				continue;
			head = &hinfo->bhash[inet_bhashfn(net, port,
					hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, &head->chain) {
				if (net_eq(ib_net(tb), net) &&
				    tb->port == port) {
					/* fastreuse >= 0 means the bucket
					 * was created by bind(); skip it,
					 * its owners may share the port.
					 */
					if (tb->fastreuse >= 0 ||
					    tb->fastreuseport >= 0)
						goto next_port;
					WARN_ON(hlist_empty(&tb->owners));
					if (!check_established(death_row, sk,
								port, &tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			/* -1 marks a connect()-owned bucket: never shared. */
			tb->fastreuse = -1;
			tb->fastreuseport = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		/* Advance the hint past the port we used, kept even. */
		hint += (i + 2) & ~1;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->inet_sport = htons(port);
			inet_ehash_nolisten(sk, (struct sock *)tw);
		}
		if (tw)
			inet_twsk_bind_unhash(tw, hinfo);
		spin_unlock(&head->lock);

		if (tw)
			inet_twsk_deschedule_put(tw);

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
	tb  = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		/* Sole owner of the bucket: the tuple must be unique. */
		inet_ehash_nolisten(sk, NULL);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
669 670 671 672 673 674 675

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
676 677 678 679 680
	u32 port_offset = 0;

	if (!inet_sk(sk)->inet_num)
		port_offset = inet_sk_port_offset(sk);
	return __inet_hash_connect(death_row, sk, port_offset,
681
				   __inet_check_established);
682
}
683
EXPORT_SYMBOL_GPL(inet_hash_connect);
684 685 686 687 688

void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

689
	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
690
		spin_lock_init(&h->listening_hash[i].lock);
691 692 693
		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
				      i + LISTENING_NULLS_BASE);
		}
694 695
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
696 697 698

/* Allocate the array of spinlocks protecting the ehash chains, sized by
 * CPU count and cache-line geometry, and set ehash_locks_mask.
 * Returns 0 or -ENOMEM.
 */
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
	unsigned int locksz = sizeof(spinlock_t);
	unsigned int i, nblocks = 1;

	/* spinlock_t can be zero-sized (e.g. UP without debug); then a
	 * single implicit "lock" suffices and nothing is allocated.
	 */
	if (locksz != 0) {
		/* allocate 2 cache lines or at least one spinlock per cpu */
		nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
		nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());

		/* no more locks than number of hash buckets */
		nblocks = min(nblocks, hashinfo->ehash_mask + 1);

		/* Try kmalloc first (__GFP_NOWARN: large orders may fail),
		 * fall back to vmalloc.
		 */
		hashinfo->ehash_locks =	kmalloc_array(nblocks, locksz,
						      GFP_KERNEL | __GFP_NOWARN);
		if (!hashinfo->ehash_locks)
			hashinfo->ehash_locks = vmalloc(nblocks * locksz);

		if (!hashinfo->ehash_locks)
			return -ENOMEM;

		for (i = 0; i < nblocks; i++)
			spin_lock_init(&hashinfo->ehash_locks[i]);
	}
	/* nblocks is a power of two, so mask arithmetic works. */
	hashinfo->ehash_locks_mask = nblocks - 1;
	return 0;
}
EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);