inet_hashtables.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/ip.h>

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash lock for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		tb->ib_net       = net;
		tb->port      = snum;
		tb->fastreuse = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

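/*
 * Record that @sk owns local port @snum: link the sock onto the bucket's
 * owner list and point icsk_bind_hash back at the bucket.  The bucket's
 * bind hash chain lock must be held by the caller.
 */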
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	inet_csk(sk)->icsk_bind_hash = tb;
}

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
	const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}

EXPORT_SYMBOL(inet_put_port);

/*
 * Taking this lock without WQ_FLAG_EXCLUSIVE is fine on UP but can be very
 * bad on SMP: when several writers sleep and the reader wakes them up, all
 * but one immediately hit the write lock and grab all the CPUs. An exclusive
 * sleep solves this, _but_ it adds useless work on UP machines (a wake-up on
 * each exclusive lock release). It really should be ifdefed.
 */
void inet_listen_wlock(struct inet_hashinfo *hashinfo)
	__acquires(hashinfo->lhash_lock)
{
	write_lock(&hashinfo->lhash_lock);

	if (atomic_read(&hashinfo->lhash_users)) {
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&hashinfo->lhash_wait,
						  &wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&hashinfo->lhash_users))
				break;
			write_unlock_bh(&hashinfo->lhash_lock);
			schedule();
			write_lock_bh(&hashinfo->lhash_lock);
		}

		finish_wait(&hashinfo->lhash_wait, &wait);
	}
}

EXPORT_SYMBOL(inet_listen_wlock);

/*
 * Don't inline this cruft. There are some nice properties to exploit here:
 * the BSD API does not allow a listening sock to specify the remote port nor
 * the remote address for the connection, so always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */
static struct sock *inet_lookup_listener_slow(struct net *net,
					      const struct hlist_head *head,
					      const __be32 daddr,
					      const unsigned short hnum,
					      const int dif)
{
	struct sock *result = NULL, *sk;
	const struct hlist_node *node;
	int hiscore = -1;

	sk_for_each(sk, node, head) {
		const struct inet_sock *inet = inet_sk(sk);

		if (sk->sk_net == net && inet->num == hnum &&
				!ipv6_only_sock(sk)) {
			const __be32 rcv_saddr = inet->rcv_saddr;
			int score = sk->sk_family == PF_INET ? 1 : 0;

			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score += 2;
			}
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore	= score;
				result	= sk;
			}
		}
	}
	return result;
}

/* Optimize the common listener case. */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk = NULL;
	const struct hlist_head *head;

	read_lock(&hashinfo->lhash_lock);
	head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    !sk->sk_bound_dev_if && sk->sk_net == net)
			goto sherry_cache;
		sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&hashinfo->lhash_lock);
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
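
/*
 * Typical use (sketch, not taken from this file; "net", "skb" and "th" stand
 * for the caller's namespace, packet and TCP header): a protocol's receive
 * path looks up a listener for an incoming SYN roughly like
 *
 *	sk = __inet_lookup_listener(net, &tcp_hashinfo, ip_hdr(skb)->daddr,
 *				    ntohs(th->dest), inet_iif(skb));
 *
 * and must drop the reference with sock_put() once it is done with the sock.
 */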

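/*
 * Look up a fully identified (non-wildcard) socket: search the established
 * chain and then the TIME-WAIT chain of the bucket selected by the 4-tuple
 * hash.  Returns the sock with a reference held, or NULL if nothing matches.
 */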
struct sock * __inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);

	prefetch(head->chain.first);
	read_lock(lock);
	sk_for_each(sk, node, &head->chain) {
		if (INET_MATCH(sk, net, hash, acookie,
					saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &head->twchain) {
		if (INET_TW_MATCH(sk, net, hash, acookie,
					saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/*
 * Called with local BH disabled.  Check whether the four-tuple @sk would use
 * with local port @lport is unique in the established hash: a matching
 * TIME-WAIT socket may be recycled, while a live established match means the
 * port cannot be used.  On success the socket is inserted into the hash.
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->rcv_saddr;
	__be32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_node *node;
	struct inet_timewait_sock *tw;
	struct net *net = sk->sk_net;

	prefetch(head->chain.first);
	write_lock(lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &head->twchain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, net, hash, acookie,
					saddr, daddr, ports, dif)) {
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, net, hash, acookie,
					saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * a socket with a funny identity in the hash table. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hash = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inuse_add(sk->sk_prot, 1);
	write_unlock(lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, death_row);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		inet_twsk_put(tw);
	}

	return 0;

not_unique:
	write_unlock(lock);
	return -EADDRNOTAVAIL;
}

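/*
 * Per-connection starting offset into the local port range, derived from the
 * addresses and destination port so that different peers probe the ephemeral
 * ports in different orders.
 */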
static inline u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
					  inet->dport);
}

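/*
 * Insert a non-listening socket into the established hash.  The slot comes
 * from inet_sk_ehashfn() and the per-bucket rwlock serializes writers.
 */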
void __inet_hash_nolisten(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
	struct hlist_head *list;
	rwlock_t *lock;
	struct inet_ehash_bucket *head;

	BUG_TRAP(sk_unhashed(sk));

	sk->sk_hash = inet_sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	write_lock(lock);
	__sk_add_node(sk, list);
	sock_prot_inuse_add(sk->sk_prot, 1);
	write_unlock(lock);
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);

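/*
 * Hash a socket into the appropriate table: listeners go into the listening
 * hash under lhash_lock, everything else is handed to __inet_hash_nolisten().
 */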
static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
	struct hlist_head *list;
	rwlock_t *lock;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(sk);
		return;
	}

	BUG_TRAP(sk_unhashed(sk));
	list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
	lock = &hashinfo->lhash_lock;

	inet_listen_wlock(hashinfo);
	__sk_add_node(sk, list);
	sock_prot_inuse_add(sk->sk_prot, 1);
	write_unlock(lock);
	wake_up(&hashinfo->lhash_wait);
}

void inet_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__inet_hash(sk);
		local_bh_enable();
	}
}
EXPORT_SYMBOL_GPL(inet_hash);

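/*
 * Remove a socket from whichever hash table it is on, adjusting the
 * protocol's inuse count only if it was actually hashed.
 */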
void inet_unhash(struct sock *sk)
{
	rwlock_t *lock;
	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;

	if (sk_unhashed(sk))
		goto out;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		inet_listen_wlock(hashinfo);
		lock = &hashinfo->lhash_lock;
	} else {
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
		write_lock_bh(lock);
	}

	if (__sk_del_node_init(sk))
		sock_prot_inuse_add(sk->sk_prot, -1);
	write_unlock_bh(lock);
out:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&hashinfo->lhash_wait);
}
EXPORT_SYMBOL_GPL(inet_unhash);

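/*
 * Assign the socket a source port (if it does not already have one) and hash
 * it.  For an unbound socket the ephemeral range is walked starting at a
 * per-destination offset; a candidate port is accepted once
 * check_established() proves the resulting four-tuple unique, possibly
 * recycling a TIME-WAIT socket.  A socket already bound to a port is hashed
 * directly when it is the bucket's only owner, otherwise the established
 * check is run for that port.
 */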
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		void (*hash)(struct sock *sk))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sk->sk_net;

	if (!snum) {
		int i, remaining, low, high, port;
		static u32 hint;
		u32 offset = hint + inet_sk_port_offset(sk);
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (tb->ib_net == net && tb->port == port) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!check_established(death_row, sk,
								port, &tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			hash(sk);
		}
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			inet_twsk_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
	tb  = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		hash(sk);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
EXPORT_SYMBOL_GPL(__inet_hash_connect);

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	return __inet_hash_connect(death_row, sk,
			__inet_check_established, __inet_hash_nolisten);
}

EXPORT_SYMBOL_GPL(inet_hash_connect);
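
/*
 * Example caller (sketch, not from this file): an IPv4 connect() path would
 * typically perform the route lookup, fill in the destination, and then call
 *
 *	err = inet_hash_connect(&tcp_death_row, sk);
 *
 * (as tcp_v4_connect() does) before sending the initial segment.
 */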