inet_hashtables.c 21.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

16
#include <linux/module.h>
17
#include <linux/random.h>
18
#include <linux/sched.h>
19
#include <linux/slab.h>
20
#include <linux/wait.h>
21
#include <linux/vmalloc.h>
22
#include <linux/bootmem.h>
23

24
#include <net/addrconf.h>
25
#include <net/inet_connection_sock.h>
26
#include <net/inet_hashtables.h>
27
#include <net/secure_seq.h>
28
#include <net/ip.h>
29
#include <net/tcp.h>
30
#include <net/sock_reuseport.h>
31

/* Hash the IPv4 4-tuple into the established-connections hash table.
 * A boot-time random secret plus the netns mix keeps bucket placement
 * unpredictable (hash-flood hardening) and distinct per namespace.
 */
static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
			const __u16 lport, const __be32 faddr,
			const __be16 fport)
{
	static u32 inet_ehash_secret __read_mostly;

	/* Lazily initialize the secret exactly once, on first use. */
	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

	return __inet_ehashfn(laddr, lport, faddr, fport,
			      inet_ehash_secret + net_hash_mix(net));
}

/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 */
static u32 sk_ehashfn(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	/* Genuine IPv6 flows use the IPv6 hash; v4-mapped addresses fall
	 * through so they hash identically to plain IPv4 sockets.
	 */
	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		return inet6_ehashfn(sock_net(sk),
				     &sk->sk_v6_rcv_saddr, sk->sk_num,
				     &sk->sk_v6_daddr, sk->sk_dport);
#endif
	return inet_ehashfn(sock_net(sk),
			    sk->sk_rcv_saddr, sk->sk_num,
			    sk->sk_daddr, sk->sk_dport);
}

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 *
 * Returns the new bucket (already linked into @head->chain) or NULL on
 * allocation failure.  GFP_ATOMIC is required because the caller holds
 * the bucket spinlock.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb) {
		write_pnet(&tb->ib_net, net);
		tb->port      = snum;
		/* fastreuse/fastreuseport start "unknown"; bind() updates them */
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 *
 * Frees the bucket only once no socket owns the port anymore; a bucket
 * with remaining owners is left untouched.
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}
/* Record that @sk now owns local port @snum in bucket @tb: set the port
 * on the socket, link it on the bucket's owner list, and cache the
 * bucket pointer for fast release in __inet_put_port().
 * NOTE(review): callers appear to hold the bhash bucket lock — confirm.
 */
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	inet_sk(sk)->inet_num = snum;
	sk_add_bind_node(sk, &tb->owners);
	inet_csk(sk)->icsk_bind_hash = tb;
}

/*
 * Get rid of any references to a local port held by the given sock.
 *
 * Unlinks @sk from its bind bucket, clears inet_num, and frees the
 * bucket if it became ownerless.  Caller must have BHs disabled
 * (inet_put_port() below does local_bh_disable() around this).
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	/* Frees tb only if sk was its last owner. */
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

/* BH-safe wrapper around __inet_put_port(). */
void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);
129

/* Make a child (accepted) socket inherit the listener's bind bucket.
 *
 * Normally the child's port equals the listener's and we can share the
 * listener's icsk_bind_hash directly; with tproxy the ports may differ,
 * in which case we look up or create a bucket for the child's port.
 *
 * Returns 0 on success, -ENOENT if the listener lost its bucket (raced
 * with close), or -ENOMEM if a new bucket could not be allocated.
 */
int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (unlikely(!tb)) {
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->port == port)
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

/* Pick the lhash2 (port+address hashed) listener bucket for @sk,
 * using the IPv6 or IPv4 bound address depending on socket family.
 */
static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(sock_net(sk),
					  &sk->sk_v6_rcv_saddr,
					  inet_sk(sk)->inet_num);
	else
#endif
		hash = ipv4_portaddr_hash(sock_net(sk),
					  inet_sk(sk)->inet_rcv_saddr,
					  inet_sk(sk)->inet_num);
	return inet_lhash2_bucket(h, hash);
}

/* Add a listening socket to the secondary (port+addr) listener hash.
 * No-op when lhash2 was never allocated.  IPv6 reuseport sockets are
 * appended at the tail so that lookup order favors earlier binders.
 */
static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
{
	struct inet_listen_hashbucket *ilb2;

	if (!h->lhash2)
		return;

	ilb2 = inet_lhash2_bucket_sk(h, sk);

	spin_lock(&ilb2->lock);
	if (sk->sk_reuseport && sk->sk_family == AF_INET6)
		hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
				   &ilb2->head);
	else
		hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
				   &ilb2->head);
	ilb2->count++;
	spin_unlock(&ilb2->lock);
}

/* Remove a listening socket from the secondary (port+addr) hash.
 * Warns (and bails) if the socket is not actually hashed there.
 */
static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
{
	struct inet_listen_hashbucket *ilb2;

	if (!h->lhash2 ||
	    WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
		return;

	ilb2 = inet_lhash2_bucket_sk(h, sk);

	spin_lock(&ilb2->lock);
	hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
	ilb2->count--;
	spin_unlock(&ilb2->lock);
}

226 227
static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
228
				const int dif, const int sdif, bool exact_dif)
229 230 231 232
{
	int score = -1;
	struct inet_sock *inet = inet_sk(sk);

E
Eric Dumazet 已提交
233
	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
234
			!ipv6_only_sock(sk)) {
E
Eric Dumazet 已提交
235
		__be32 rcv_saddr = inet->inet_rcv_saddr;
236
		score = sk->sk_family == PF_INET ? 2 : 1;
237 238 239
		if (rcv_saddr) {
			if (rcv_saddr != daddr)
				return -1;
240
			score += 4;
241
		}
242
		if (sk->sk_bound_dev_if || exact_dif) {
243 244 245
			bool dev_match = (sk->sk_bound_dev_if == dif ||
					  sk->sk_bound_dev_if == sdif);

246
			if (!dev_match)
247
				return -1;
248
			if (sk->sk_bound_dev_if)
249
				score += 4;
250
		}
251
		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
252
			score++;
253 254 255 256
	}
	return score;
}

/*
 * Here are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */

/* called with rcu_read_lock() : No refcount taken on the socket */
static struct sock *inet_lhash2_lookup(struct net *net,
				struct inet_listen_hashbucket *ilb2,
				struct sk_buff *skb, int doff,
				const __be32 saddr, __be16 sport,
				const __be32 daddr, const unsigned short hnum,
				const int dif, const int sdif)
{
	bool exact_dif = inet_exact_dif_match(net, skb);
	struct inet_connection_sock *icsk;
	struct sock *sk, *result = NULL;
	int score, hiscore = 0;
	u32 phash = 0;

	/* Walk the port+addr bucket, keeping the best-scoring listener.
	 * For reuseport groups, a flow-hash pick from the group wins
	 * immediately so all members see a stable distribution.
	 */
	inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
		sk = (struct sock *)icsk;
		score = compute_score(sk, net, hnum, daddr,
				      dif, sdif, exact_dif);
		if (score > hiscore) {
			if (sk->sk_reuseport) {
				phash = inet_ehashfn(net, daddr, hnum,
						     saddr, sport);
				result = reuseport_select_sock(sk, phash,
							       skb, doff);
				if (result)
					return result;
			}
			result = sk;
			hiscore = score;
		}
	}

	return result;
}

/* Find the best listening socket for an incoming IPv4 packet.
 *
 * Strategy: if the classic port-hashed bucket (ilb) is small, scan it
 * directly.  Otherwise prefer the lhash2 port+addr buckets (specific
 * address first, then INADDR_ANY), falling back to the port scan when
 * an lhash2 bucket would be no smaller.  Called under RCU; no refcount
 * is taken on the returned socket.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif, const int sdif)
{
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	bool exact_dif = inet_exact_dif_match(net, skb);
	struct inet_listen_hashbucket *ilb2;
	struct sock *sk, *result = NULL;
	int score, hiscore = 0;
	unsigned int hash2;
	u32 phash = 0;

	if (ilb->count <= 10 || !hashinfo->lhash2)
		goto port_lookup;

	/* Too many sk in the ilb bucket (which is hashed by port alone).
	 * Try lhash2 (which is hashed by port and addr) instead.
	 */

	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
	if (ilb2->count > ilb->count)
		goto port_lookup;

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, daddr, hnum,
				    dif, sdif);
	if (result)
		goto done;

	/* Lookup lhash2 with INADDR_ANY */

	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
	if (ilb2->count > ilb->count)
		goto port_lookup;

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, daddr, hnum,
				    dif, sdif);
	goto done;

port_lookup:
	sk_for_each_rcu(sk, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr,
				      dif, sdif, exact_dif);
		if (score > hiscore) {
			if (sk->sk_reuseport) {
				phash = inet_ehashfn(net, daddr, hnum,
						     saddr, sport);
				result = reuseport_select_sock(sk, phash,
							       skb, doff);
				if (result)
					goto done;
			}
			result = sk;
			hiscore = score;
		}
	}
done:
	/* reuseport_select_sock() may return an ERR_PTR; treat as miss. */
	if (unlikely(IS_ERR(result)))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
368

E
Eric Dumazet 已提交
369 370 371
/* All sockets share common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
372
	if (!refcount_dec_and_test(&sk->sk_refcnt))
E
Eric Dumazet 已提交
373 374 375 376
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
377 378
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
E
Eric Dumazet 已提交
379 380 381 382 383
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

/* skb destructor used by early demux: drop the reference that early
 * demux took on skb->sk.
 */
void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);

/* Lockless (RCU) lookup of an established/timewait socket by 4-tuple.
 *
 * On a match we take a reference; because the socket may be recycled
 * between the match and the refcount bump, the match is re-checked
 * afterwards and the walk restarted on failure.  A wrong nulls value
 * at chain end means the item migrated chains mid-walk: restart.
 * Returns the socket with a reference held, or NULL.
 */
struct sock *__inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif, const int sdif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(INET_MATCH(sk, net, acookie,
				      saddr, daddr, ports, dif, sdif))) {
			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			/* Re-validate after taking the reference: the slot
			 * may have been reused for another connection.
			 */
			if (unlikely(!INET_MATCH(sk, net, acookie,
						 saddr, daddr, ports,
						 dif, sdif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled */
/* Check that the 4-tuple (saddr:sport -> daddr:lport) is unique in the
 * established hash, and if so insert @sk there under the bucket lock.
 *
 * A TIME_WAIT occupant of the same tuple may be recycled when
 * twsk_unique() allows it; the timewait socket is then unhashed and,
 * if the caller did not ask for it back via @twp, scheduled for
 * destruction.  Returns 0 on success, -EADDRNOTAVAIL if the tuple is
 * already in use.
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	struct net *net = sock_net(sk);
	int sdif = l3mdev_master_ifindex_by_index(net, dif);
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;

	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(INET_MATCH(sk2, net, acookie,
					 saddr, daddr, ports, dif, sdif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		sk_nulls_del_node_init_rcu((struct sock *)tw);
		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

505
static u32 inet_sk_port_offset(const struct sock *sk)
506 507
{
	const struct inet_sock *inet = inet_sk(sk);
508

E
Eric Dumazet 已提交
509 510 511
	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
512 513
}

/* insert a socket into ehash, and eventually remove another one
 * (The another one can be a SYN_RECV or TIMEWAIT
 */
/* Returns true if @sk was inserted; false when @osk was expected to be
 * replaced but was no longer on the chain (lost a race), in which case
 * @sk is NOT inserted.
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	struct inet_ehash_bucket *head;
	spinlock_t *lock;
	bool ret = true;

	WARN_ON_ONCE(!sk_unhashed(sk));

	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	if (osk) {
		/* osk hashes the same 4-tuple, so it must share the bucket */
		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
		ret = sk_nulls_del_node_init_rcu(osk);
	}
	if (ret)
		__sk_nulls_add_node_rcu(sk, list);
	spin_unlock(lock);
	return ret;
}

/* Insert a non-listening socket into ehash (optionally replacing @osk).
 * On failure the socket lost the race for its identity and is torn
 * down here: counted as orphan, moved to TCP_CLOSE, marked dead and
 * destroyed.  Returns whether the insert succeeded.
 */
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
{
	bool ok = inet_ehash_insert(sk, osk);

	if (ok) {
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	} else {
		percpu_counter_inc(sk->sk_prot->orphan_count);
		inet_sk_set_state(sk, TCP_CLOSE);
		sock_set_flag(sk, SOCK_DEAD);
		inet_csk_destroy_sock(sk);
	}
	return ok;
}
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
558

559
static int inet_reuseport_add_sock(struct sock *sk,
560
				   struct inet_listen_hashbucket *ilb)
561
{
562
	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
563 564 565
	struct sock *sk2;
	kuid_t uid = sock_i_uid(sk);

566
	sk_for_each_rcu(sk2, &ilb->head) {
567 568 569 570
		if (sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
571
		    inet_csk(sk2)->icsk_bind_hash == tb &&
572
		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
573
		    inet_rcv_saddr_equal(sk, sk2, false))
574 575
			return reuseport_add_sock(sk, sk2,
						  inet_rcv_saddr_any(sk));
576 577
	}

578
	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
579 580
}

/* Hash @sk into the appropriate table.  Non-listeners go straight to
 * ehash (optionally replacing @osk).  Listeners are added to the
 * port-hashed listening bucket (joining a reuseport group first if
 * requested) and mirrored into lhash2.  IPv6 reuseport listeners are
 * appended at the tail of the chain.  Caller must have BHs disabled
 * (see inet_hash()).  Returns 0 or a reuseport errno.
 */
int __inet_hash(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;
	int err = 0;

	if (sk->sk_state != TCP_LISTEN) {
		inet_ehash_nolisten(sk, osk);
		return 0;
	}
	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	if (sk->sk_reuseport) {
		err = inet_reuseport_add_sock(sk, ilb);
		if (err)
			goto unlock;
	}
	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
		sk->sk_family == AF_INET6)
		hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
	else
		hlist_add_head_rcu(&sk->sk_node, &ilb->head);
	inet_hash2(hashinfo, sk);
	ilb->count++;
	/* Listener memory is freed after an RCU grace period. */
	sock_set_flag(sk, SOCK_RCU_FREE);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
	spin_unlock(&ilb->lock);

	return err;
}
EXPORT_SYMBOL(__inet_hash);
615

616
int inet_hash(struct sock *sk)
617
{
618 619
	int err = 0;

620 621
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
622
		err = __inet_hash(sk, NULL);
623 624
		local_bh_enable();
	}
625

626
	return err;
627 628 629 630 631
}
EXPORT_SYMBOL_GPL(inet_hash);

/* Remove @sk from whichever hash table it occupies: the listening
 * bucket (and lhash2 mirror) for TCP_LISTEN sockets, otherwise ehash.
 * Also detaches it from any reuseport group.  The unhashed state is
 * re-checked under the lock to tolerate concurrent unhash.
 */
void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb = NULL;
	spinlock_t *lock;

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN) {
		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
		lock = &ilb->lock;
	} else {
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
	}
	spin_lock_bh(lock);
	if (sk_unhashed(sk))
		goto unlock;

	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);
	if (ilb) {
		inet_unhash2(hashinfo, sk);
		__sk_del_node_init(sk);
		ilb->count--;
	} else {
		__sk_nulls_del_node_init_rcu(sk);
	}
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
unlock:
	spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
663

664
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
665
		struct sock *sk, u32 port_offset,
666
		int (*check_established)(struct inet_timewait_death_row *,
667
			struct sock *, __u16, struct inet_timewait_sock **))
668 669
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
670
	struct inet_timewait_sock *tw = NULL;
671
	struct inet_bind_hashbucket *head;
672
	int port = inet_sk(sk)->inet_num;
673
	struct net *net = sock_net(sk);
674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694
	struct inet_bind_bucket *tb;
	u32 remaining, offset;
	int ret, i, low, high;
	static u32 hint;

	if (port) {
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		tb = inet_csk(sk)->icsk_bind_hash;
		spin_lock_bh(&head->lock);
		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
			inet_ehash_nolisten(sk, NULL);
			spin_unlock_bh(&head->lock);
			return 0;
		}
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, port, NULL);
		local_bh_enable();
		return ret;
	}
695

696 697 698 699 700
	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;
701

702 703 704 705 706 707 708 709 710 711 712 713 714 715 716
	offset = (hint + port_offset) % remaining;
	/* In first pass we try ports of @low parity.
	 * inet_csk_get_port() does the opposite choice.
	 */
	offset &= ~1U;
other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
717

718 719
		/* Does not bother with rcv_saddr checks, because
		 * the established check is already unique enough.
720
		 */
721 722 723 724
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), net) && tb->port == port) {
				if (tb->fastreuse >= 0 ||
				    tb->fastreuseport >= 0)
725
					goto next_port;
726 727 728 729 730
				WARN_ON(hlist_empty(&tb->owners));
				if (!check_established(death_row, sk,
						       port, &tw))
					goto ok;
				goto next_port;
731 732
			}
		}
733

734 735 736 737 738
		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					     net, head, port);
		if (!tb) {
			spin_unlock_bh(&head->lock);
			return -ENOMEM;
739
		}
740 741 742 743 744 745 746
		tb->fastreuse = -1;
		tb->fastreuseport = -1;
		goto ok;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}
747

748 749 750
	offset++;
	if ((offset & 1) && remaining > 1)
		goto other_parity_scan;
751

752
	return -EADDRNOTAVAIL;
753

754 755 756 757 758 759 760 761
ok:
	hint += i + 2;

	/* Head lock still held and bh's disabled */
	inet_bind_hash(sk, tb, port);
	if (sk_unhashed(sk)) {
		inet_sk(sk)->inet_sport = htons(port);
		inet_ehash_nolisten(sk, (struct sock *)tw);
762
	}
763 764 765 766 767 768 769
	if (tw)
		inet_twsk_bind_unhash(tw, hinfo);
	spin_unlock(&head->lock);
	if (tw)
		inet_twsk_deschedule_put(tw);
	local_bh_enable();
	return 0;
770
}
/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	u32 port_offset = 0;

	/* Only randomize the search start when no port is bound yet. */
	if (!inet_sk(sk)->inet_num)
		port_offset = inet_sk_port_offset(sk);
	return __inet_hash_connect(death_row, sk, port_offset,
				   __inet_check_established);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);
/* Initialize the primary (port-hashed) listening table buckets; the
 * secondary lhash2 table stays unset until inet_hashinfo2_init().
 */
void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
		spin_lock_init(&h->listening_hash[i].lock);
		INIT_HLIST_HEAD(&h->listening_hash[i].head);
		h->listening_hash[i].count = 0;
	}

	h->lhash2 = NULL;
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
800

801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824
void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
				unsigned long numentries, int scale,
				unsigned long low_limit,
				unsigned long high_limit)
{
	unsigned int i;

	h->lhash2 = alloc_large_system_hash(name,
					    sizeof(*h->lhash2),
					    numentries,
					    scale,
					    0,
					    NULL,
					    &h->lhash2_mask,
					    low_limit,
					    high_limit);

	for (i = 0; i <= h->lhash2_mask; i++) {
		spin_lock_init(&h->lhash2[i].lock);
		INIT_HLIST_HEAD(&h->lhash2[i].head);
		h->lhash2[i].count = 0;
	}
}

/* Allocate the array of spinlocks protecting ehash chains.
 * sizeof(spinlock_t) can be 0 (!SMP && !DEBUG builds), in which case a
 * single dummy "lock" suffices and nothing is allocated.  The count is
 * a power of two so a mask can select the lock; it is capped at the
 * number of hash buckets.  Returns 0 or -ENOMEM.
 */
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
	unsigned int locksz = sizeof(spinlock_t);
	unsigned int i, nblocks = 1;

	if (locksz != 0) {
		/* allocate 2 cache lines or at least one spinlock per cpu */
		nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
		nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());

		/* no more locks than number of hash buckets */
		nblocks = min(nblocks, hashinfo->ehash_mask + 1);

		hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
		if (!hashinfo->ehash_locks)
			return -ENOMEM;

		for (i = 0; i < nblocks; i++)
			spin_lock_init(&hashinfo->ehash_locks[i]);
	}
	hashinfo->ehash_locks_mask = nblocks - 1;
	return 0;
}
EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);