/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>

#define NF_CONNTRACK_VERSION	"0.5.0"

int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
				      enum nf_nat_manip_type manip,
				      const struct nlattr *attr) __read_mostly;
EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);

DEFINE_SPINLOCK(nf_conntrack_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);

struct nf_conn nf_conntrack_untracked __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_untracked);

static int nf_conntrack_hash_rnd_initted;
static unsigned int nf_conntrack_hash_rnd;

static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
				  unsigned int size, unsigned int rnd)
{
	unsigned int n;
	u_int32_t h;

	/* The direction must be ignored, so we hash everything up to the
	 * destination ports (which is a multiple of 4) and treat the last
	 * three bytes manually.
	 */
	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
	h = jhash2((u32 *)tuple, n,
		   rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
			  tuple->dst.protonum));

	return ((u64)h * size) >> 32;
}

static inline u_int32_t hash_conntrack(const struct net *net,
				       const struct nf_conntrack_tuple *tuple)
{
	return __hash_conntrack(tuple, net->ct.htable_size,
				nf_conntrack_hash_rnd);
}

bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_l3proto *l3proto,
		const struct nf_conntrack_l4proto *l4proto)
{
	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
		return false;

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	return l4proto->pkt_to_tuple(skb, dataoff, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuple);

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num, struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_l4proto *l4proto;
	unsigned int protoff;
	u_int8_t protonum;
	int ret;

	rcu_read_lock();

	l3proto = __nf_ct_l3proto_find(l3num);
	ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
	if (ret != NF_ACCEPT) {
		rcu_read_unlock();
		return false;
	}

	l4proto = __nf_ct_l4proto_find(l3num, protonum);

	ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple,
			      l3proto, l4proto);

	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
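
/*
 * Illustrative sketch (assumed caller, not part of this file): a module that
 * only needs the tuple of an IPv4 skb could use the wrapper above roughly as
 *
 *	struct nf_conntrack_tuple tuple;
 *
 *	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
 *			       NFPROTO_IPV4, &tuple))
 *		return;		(header too short or untrackable protocol)
 *
 * The l3/l4 protocol handlers are looked up under rcu_read_lock() internally.
 */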

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig,
		   const struct nf_conntrack_l3proto *l3proto,
		   const struct nf_conntrack_l4proto *l4proto)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;
	if (l3proto->invert_tuple(inverse, orig) == 0)
		return false;

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;
	return l4proto->invert_tuple(inverse, orig);
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

static void
clean_from_lists(struct nf_conn *ct)
{
	pr_debug("clean_from_lists(%p)\n", ct);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_l4proto *l4proto;

	pr_debug("destroy_conntrack(%p)\n", ct);
	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
	NF_CT_ASSERT(!timer_pending(&ct->timeout));

	/* To make sure we don't get any weird locking issues here:
	 * destroy_conntrack() MUST NOT be called with a write lock
	 * to nf_conntrack_lock!!! -HW */
	rcu_read_lock();
	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	if (l4proto && l4proto->destroy)
		l4proto->destroy(ct);

	rcu_read_unlock();

	spin_lock_bh(&nf_conntrack_lock);
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too. */
	nf_ct_remove_expectations(ct);

	/* We overload first tuple to link into unconfirmed list. */
	if (!nf_ct_is_confirmed(ct)) {
		BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
		hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	}

	NF_CT_STAT_INC(net, delete);
	spin_unlock_bh(&nf_conntrack_lock);

	if (ct->master)
		nf_ct_put(ct->master);

	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
	nf_conntrack_free(ct);
}

void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	nf_ct_helper_destroy(ct);
	spin_lock_bh(&nf_conntrack_lock);
	/* Inside lock so preempt is disabled on module removal path.
	 * Otherwise we can get spurious warnings. */
	NF_CT_STAT_INC(net, delete_list);
	clean_from_lists(ct);
	spin_unlock_bh(&nf_conntrack_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);

static void death_by_event(unsigned long ul_conntrack)
{
	struct nf_conn *ct = (void *)ul_conntrack;
	struct net *net = nf_ct_net(ct);

	if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
		/* bad luck, let's retry again */
		ct->timeout.expires = jiffies +
			(random32() % net->ct.sysctl_events_retry_timeout);
		add_timer(&ct->timeout);
		return;
	}
	/* we've got the event delivered, now it's dying */
	set_bit(IPS_DYING_BIT, &ct->status);
	spin_lock(&nf_conntrack_lock);
	hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	spin_unlock(&nf_conntrack_lock);
	nf_ct_put(ct);
}

void nf_ct_insert_dying_list(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	/* add this conntrack to the dying list */
	spin_lock_bh(&nf_conntrack_lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &net->ct.dying);
	spin_unlock_bh(&nf_conntrack_lock);
	/* set a new timer to retry event delivery */
	setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
	ct->timeout.expires = jiffies +
		(random32() % net->ct.sysctl_events_retry_timeout);
	add_timer(&ct->timeout);
}
EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);

static void death_by_timeout(unsigned long ul_conntrack)
{
	struct nf_conn *ct = (void *)ul_conntrack;

	if (!test_bit(IPS_DYING_BIT, &ct->status) &&
	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
		/* destroy event was not delivered */
		nf_ct_delete_from_lists(ct);
		nf_ct_insert_dying_list(ct);
		return;
	}
	set_bit(IPS_DYING_BIT, &ct->status);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
}

/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 * OR
 * - Caller must lock nf_conntrack_lock before calling this function
 */
struct nf_conntrack_tuple_hash *
__nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int hash = hash_conntrack(net, tuple);

	/* Disable BHs the entire time since we normally need to disable them
	 * at least once for the stats anyway.
	 */
	local_bh_disable();
begin:
	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
		if (nf_ct_tuple_equal(tuple, &h->tuple)) {
			NF_CT_STAT_INC(net, found);
			local_bh_enable();
			return h;
		}
		NF_CT_STAT_INC(net, searched);
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != hash)
		goto begin;
	local_bh_enable();

	return NULL;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_find);

/* Find a connection corresponding to a tuple. */
struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	rcu_read_lock();
begin:
	h = __nf_conntrack_find(net, tuple);
	if (h) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (unlikely(nf_ct_is_dying(ct) ||
			     !atomic_inc_not_zero(&ct->ct_general.use)))
			h = NULL;
		else {
			if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple))) {
				nf_ct_put(ct);
				goto begin;
			}
		}
	}
	rcu_read_unlock();

	return h;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

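/*
 * Illustrative sketch of the lookup contract (assumed caller, not part of
 * this file): nf_conntrack_find_get() already takes the reference and
 * re-checks the tuple after the lockless walk, so a typical user only has
 * to drop the reference again:
 *
 *	struct nf_conntrack_tuple_hash *h;
 *
 *	h = nf_conntrack_find_get(net, &tuple);
 *	if (h) {
 *		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 *		... use ct ...
 *		nf_ct_put(ct);
 *	}
 *
 * Callers of __nf_conntrack_find() must instead follow the rules in the
 * comment above it (take their own reference or hold nf_conntrack_lock).
 */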
static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int repl_hash)
{
	struct net *net = nf_ct_net(ct);

	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			   &net->ct.hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
			   &net->ct.hash[repl_hash]);
}

void nf_conntrack_hash_insert(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, repl_hash;

	hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	__nf_conntrack_hash_insert(ct, hash, repl_hash);
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert);

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int hash, repl_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* We're not in hash table, and we refuse to set up related
	   connections for unconfirmed conns.  But packet copies and
	   REJECT will give spurious warnings here. */
	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
	   confirmed us. */
	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
	pr_debug("Confirming conntrack %p\n", ct);

	spin_lock_bh(&nf_conntrack_lock);

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost the race. */
	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      &h->tuple))
			goto out;
	hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				      &h->tuple))
			goto out;

	/* Remove from unconfirmed list */
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout.expires += jiffies;
	add_timer(&ct->timeout);
	atomic_inc(&ct->ct_general.use);
	set_bit(IPS_CONFIRMED_BIT, &ct->status);

	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, repl_hash);
	NF_CT_STAT_INC(net, insert);
	spin_unlock_bh(&nf_conntrack_lock);

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	NF_CT_STAT_INC(net, insert_failed);
	spin_unlock_bh(&nf_conntrack_lock);
	return NF_DROP;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int hash = hash_conntrack(net, tuple);

	/* Disable BHs the entire time since we need to disable them at
	 * least once for the stats anyway.
	 */
	rcu_read_lock_bh();
	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
		if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack &&
		    nf_ct_tuple_equal(tuple, &h->tuple)) {
			NF_CT_STAT_INC(net, found);
			rcu_read_unlock_bh();
			return 1;
		}
		NF_CT_STAT_INC(net, searched);
	}
	rcu_read_unlock_bh();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static noinline int early_drop(struct net *net, unsigned int hash)
{
	/* Use oldest entry, which is roughly LRU */
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct = NULL, *tmp;
	struct hlist_nulls_node *n;
	unsigned int i, cnt = 0;
	int dropped = 0;

	rcu_read_lock();
	for (i = 0; i < net->ct.htable_size; i++) {
		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
					 hnnode) {
			tmp = nf_ct_tuplehash_to_ctrack(h);
			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
				ct = tmp;
			cnt++;
		}

		if (ct != NULL) {
			if (likely(!nf_ct_is_dying(ct) &&
				   atomic_inc_not_zero(&ct->ct_general.use)))
				break;
			else
				ct = NULL;
		}

		if (cnt >= NF_CT_EVICTION_RANGE)
			break;

		hash = (hash + 1) % net->ct.htable_size;
	}
	rcu_read_unlock();

	if (!ct)
		return dropped;

	if (del_timer(&ct->timeout)) {
		death_by_timeout((unsigned long)ct);
		dropped = 1;
		NF_CT_STAT_INC_ATOMIC(net, early_drop);
	}
	nf_ct_put(ct);
	return dropped;
}

struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	struct nf_conn *ct;

	if (unlikely(!nf_conntrack_hash_rnd_initted)) {
		get_random_bytes(&nf_conntrack_hash_rnd,
				sizeof(nf_conntrack_hash_rnd));
		nf_conntrack_hash_rnd_initted = 1;
	}

	/* We don't want any race condition at early drop stage */
	atomic_inc(&net->ct.count);

	if (nf_conntrack_max &&
	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
		unsigned int hash = hash_conntrack(net, orig);
		if (!early_drop(net, hash)) {
			atomic_dec(&net->ct.count);
			if (net_ratelimit())
				printk(KERN_WARNING
				       "nf_conntrack: table full, dropping"
				       " packet.\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_DESTROY_BY_RCU.
	 */
	ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
	if (ct == NULL) {
		pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
		atomic_dec(&net->ct.count);
		return ERR_PTR(-ENOMEM);
	}
	/*
	 * Let ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next
	 * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
	 */
	memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
	       sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
	/* Don't set timer yet: wait for confirmation */
	setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
#ifdef CONFIG_NET_NS
	ct->ct_net = net;
#endif

	/*
	 * changes to lookup keys must be done before setting refcnt to 1
	 */
	smp_wmb();
	atomic_set(&ct->ct_general.use, 1);
	return ct;
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	nf_ct_ext_destroy(ct);
	atomic_dec(&net->ct.count);
	nf_ct_ext_free(ct);
	kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);
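
/*
 * Lifecycle note (summary of the code above, not new behaviour): a conntrack
 * obtained from nf_conntrack_alloc() starts with a reference count of 1 and
 * no running timer; it only becomes visible to other CPUs once
 * __nf_conntrack_confirm() hashes it.  An allocation that is abandoned before
 * confirmation is handed back with nf_conntrack_free(), as init_conntrack()
 * below does when the L4 protocol refuses to track the packet.
 */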

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net,
	       const struct nf_conntrack_tuple *tuple,
	       struct nf_conntrack_l3proto *l3proto,
	       struct nf_conntrack_l4proto *l4proto,
	       struct sk_buff *skb,
	       unsigned int dataoff)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
	struct nf_conntrack_expect *exp;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
		pr_debug("Can't invert tuple.\n");
		return NULL;
	}

	ct = nf_conntrack_alloc(net, tuple, &repl_tuple, GFP_ATOMIC);
	if (IS_ERR(ct)) {
		pr_debug("Can't allocate conntrack.\n");
		return (struct nf_conntrack_tuple_hash *)ct;
	}

	if (!l4proto->new(ct, skb, dataoff)) {
		nf_conntrack_free(ct);
		pr_debug("init conntrack: can't track with proto module\n");
		return NULL;
	}

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_ecache_ext_add(ct, GFP_ATOMIC);

	spin_lock_bh(&nf_conntrack_lock);
	exp = nf_ct_find_expectation(net, tuple);
	if (exp) {
		pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
			 ct, exp);
		/* Welcome, Mr. Bond.  We've been expecting you... */
		__set_bit(IPS_EXPECTED_BIT, &ct->status);
		ct->master = exp->master;
		if (exp->helper) {
			help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
			if (help)
				rcu_assign_pointer(help->helper, exp->helper);
		}

#ifdef CONFIG_NF_CONNTRACK_MARK
		ct->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
		ct->secmark = exp->master->secmark;
#endif
		nf_conntrack_get(&ct->master->ct_general);
		NF_CT_STAT_INC(net, expect_new);
	} else {
		__nf_ct_try_assign_helper(ct, GFP_ATOMIC);
		NF_CT_STAT_INC(net, new);
	}

	/* Overload tuple linked list to put us in unconfirmed list. */
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
		       &net->ct.unconfirmed);

	spin_unlock_bh(&nf_conntrack_lock);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct nf_conn *
resolve_normal_ct(struct net *net,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int16_t l3num,
		  u_int8_t protonum,
		  struct nf_conntrack_l3proto *l3proto,
		  struct nf_conntrack_l4proto *l4proto,
		  int *set_reply,
		  enum ip_conntrack_info *ctinfo)
{
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, l3num, protonum, &tuple, l3proto,
			     l4proto)) {
		pr_debug("resolve_normal_ct: Can't get tuple\n");
		return NULL;
	}

	/* look for tuple match */
	h = nf_conntrack_find_get(net, &tuple);
	if (!h) {
		h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff);
		if (!h)
			return NULL;
		if (IS_ERR(h))
			return (void *)h;
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
		/* Please set reply bit if this packet OK */
		*set_reply = 1;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
			*ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			pr_debug("nf_conntrack_in: related packet for %p\n",
				 ct);
			*ctinfo = IP_CT_RELATED;
		} else {
			pr_debug("nf_conntrack_in: new packet for %p\n", ct);
			*ctinfo = IP_CT_NEW;
		}
		*set_reply = 0;
	}
	skb->nfct = &ct->ct_general;
	skb->nfctinfo = *ctinfo;
	return ct;
}

unsigned int
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
		struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_l4proto *l4proto;
	unsigned int dataoff;
	u_int8_t protonum;
	int set_reply = 0;
	int ret;

	/* Previously seen (loopback or untracked)?  Ignore. */
	if (skb->nfct) {
		NF_CT_STAT_INC_ATOMIC(net, ignore);
		return NF_ACCEPT;
	}

	/* rcu_read_lock()ed by nf_hook_slow */
	l3proto = __nf_ct_l3proto_find(pf);
	ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
				   &dataoff, &protonum);
	if (ret <= 0) {
		pr_debug("not prepared to track yet or error occurred\n");
		NF_CT_STAT_INC_ATOMIC(net, error);
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		return -ret;
	}

	l4proto = __nf_ct_l4proto_find(pf, protonum);

	/* It may be a special packet, error, unclean...
	 * inverse of the return code tells the netfilter
	 * core what to do with the packet. */
	if (l4proto->error != NULL) {
		ret = l4proto->error(net, skb, dataoff, &ctinfo, pf, hooknum);
		if (ret <= 0) {
			NF_CT_STAT_INC_ATOMIC(net, error);
			NF_CT_STAT_INC_ATOMIC(net, invalid);
			return -ret;
		}
	}

	ct = resolve_normal_ct(net, skb, dataoff, pf, protonum,
			       l3proto, l4proto, &set_reply, &ctinfo);
	if (!ct) {
		/* Not valid part of a connection */
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		return NF_ACCEPT;
	}

	if (IS_ERR(ct)) {
		/* Too stressed to deal. */
		NF_CT_STAT_INC_ATOMIC(net, drop);
		return NF_DROP;
	}

	NF_CT_ASSERT(skb->nfct);

	ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
	if (ret <= 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		pr_debug("nf_conntrack_in: Can't track with proto module\n");
		nf_conntrack_put(skb->nfct);
		skb->nfct = NULL;
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		if (ret == -NF_DROP)
			NF_CT_STAT_INC_ATOMIC(net, drop);
		return -ret;
	}

	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_STATUS, ct);

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);

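/*
 * Return-value convention used above (stated here for clarity, derived from
 * the code itself): l3proto->get_l4proto(), l4proto->error() and
 * l4proto->packet() return a netfilter verdict negated when the packet should
 * not (or can no longer) be tracked, so "return -ret" hands NF_ACCEPT/NF_DROP
 * back to the netfilter core unchanged, while ret > 0 means tracking went
 * ahead normally.
 */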
bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
			  const struct nf_conntrack_tuple *orig)
{
	bool ret;

	rcu_read_lock();
	ret = nf_ct_invert_tuple(inverse, orig,
				 __nf_ct_l3proto_find(orig->src.l3num),
				 __nf_ct_l4proto_find(orig->src.l3num,
						      orig->dst.protonum));
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);

/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __nf_conntrack_confirm */
void nf_conntrack_alter_reply(struct nf_conn *ct,
			      const struct nf_conntrack_tuple *newreply)
{
	struct nf_conn_help *help = nfct_help(ct);

	/* Should be unconfirmed, so not in hash table yet */
	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));

	pr_debug("Altering reply tuple of %p to ", ct);
	nf_ct_dump_tuple(newreply);

	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (ct->master || (help && !hlist_empty(&help->expectations)))
		return;

	rcu_read_lock();
	__nf_ct_try_assign_helper(ct, GFP_ATOMIC);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);

/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct sk_buff *skb,
			  unsigned long extra_jiffies,
			  int do_acct)
{
	NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
	NF_CT_ASSERT(skb);

	/* Only update if this is not a fixed timeout */
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		goto acct;

	/* If not in hash table, timer will not be active yet */
	if (!nf_ct_is_confirmed(ct)) {
		ct->timeout.expires = extra_jiffies;
	} else {
		unsigned long newtime = jiffies + extra_jiffies;

		/* Only update the timeout if the new timeout is at least
		   HZ jiffies from the old timeout. Need del_timer for race
		   avoidance (may already be dying). */
		if (newtime - ct->timeout.expires >= HZ)
			mod_timer_pending(&ct->timeout, newtime);
	}

acct:
	if (do_acct) {
		struct nf_conn_counter *acct;

		acct = nf_conn_acct_find(ct);
		if (acct) {
			spin_lock_bh(&ct->lock);
			acct[CTINFO2DIR(ctinfo)].packets++;
			acct[CTINFO2DIR(ctinfo)].bytes +=
				skb->len - skb_network_offset(skb);
			spin_unlock_bh(&ct->lock);
		}
	}
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);

bool __nf_ct_kill_acct(struct nf_conn *ct,
		       enum ip_conntrack_info ctinfo,
		       const struct sk_buff *skb,
		       int do_acct)
{
	if (do_acct) {
		struct nf_conn_counter *acct;

		acct = nf_conn_acct_find(ct);
		if (acct) {
			spin_lock_bh(&ct->lock);
			acct[CTINFO2DIR(ctinfo)].packets++;
			acct[CTINFO2DIR(ctinfo)].bytes +=
				skb->len - skb_network_offset(skb);
			spin_unlock_bh(&ct->lock);
		}
	}

	if (del_timer(&ct->timeout)) {
		ct->timeout.function((unsigned long)ct);
		return true;
	}
	return false;
}
EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);

#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
 * in ip_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
			       const struct nf_conntrack_tuple *tuple)
{
	NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port);
	NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port);
	return 0;

nla_put_failure:
	return -1;
}
EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);

const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
	[CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
	[CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
};
EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);

int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
			       struct nf_conntrack_tuple *t)
{
	if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
		return -EINVAL;

	t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
	t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);

int nf_ct_port_nlattr_tuple_size(void)
{
	return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
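
/*
 * Illustrative sketch (assumed usage, not part of this file): an L4 protocol
 * that keys on ports round-trips its tuple through ctnetlink with the three
 * generic helpers above, roughly
 *
 *	nf_ct_port_tuple_to_nlattr(skb, tuple);     on the dump side
 *	nf_ct_port_nlattr_to_tuple(tb, tuple);      on the parse side, after
 *	                                            the attributes were parsed
 *	                                            against nf_ct_port_nla_policy
 *
 * which is why tcp/udp/sctp/dccp simply plug these functions into their
 * struct nf_conntrack_l4proto instead of providing their own.
 */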
#endif

/* Used by ipt_REJECT and ip6t_REJECT. */
static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = nf_ct_get(skb, &ctinfo);
	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nskb->nfct = &ct->ct_general;
	nskb->nfctinfo = ctinfo;
	nf_conntrack_get(nskb->nfct);
}

/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
		void *data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;

	spin_lock_bh(&nf_conntrack_lock);
	for (; *bucket < net->ct.htable_size; (*bucket)++) {
		hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
			ct = nf_ct_tuplehash_to_ctrack(h);
			if (iter(ct, data))
				goto found;
		}
	}
	hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (iter(ct, data))
			set_bit(IPS_DYING_BIT, &ct->status);
	}
	spin_unlock_bh(&nf_conntrack_lock);
	return NULL;
found:
	atomic_inc(&ct->ct_general.use);
	spin_unlock_bh(&nf_conntrack_lock);
	return ct;
}

void nf_ct_iterate_cleanup(struct net *net,
			   int (*iter)(struct nf_conn *i, void *data),
			   void *data)
{
	struct nf_conn *ct;
	unsigned int bucket = 0;

	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
		/* Time to push up daises... */
		if (del_timer(&ct->timeout))
			death_by_timeout((unsigned long)ct);
		/* ... else the timer will get him soon. */

		nf_ct_put(ct);
	}
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);

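/*
 * Illustrative sketch (assumed caller, not part of this file): the iterator
 * accepts any predicate that returns non-zero for entries to kill.  For
 * example, flushing every conntrack carrying a given mark (assuming
 * CONFIG_NF_CONNTRACK_MARK) could look roughly like
 *
 *	static int kill_by_mark(struct nf_conn *i, void *data)
 *	{
 *		return i->mark == *(u_int32_t *)data;
 *	}
 *
 *	nf_ct_iterate_cleanup(net, kill_by_mark, &mark);
 *
 * kill_report() and kill_all() below are the two predicates used locally.
 */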
struct __nf_ct_flush_report {
	u32 pid;
	int report;
};

static int kill_report(struct nf_conn *i, void *data)
{
	struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;

	/* If we fail to deliver the event, death_by_timeout() will retry */
	if (nf_conntrack_event_report(IPCT_DESTROY, i,
				      fr->pid, fr->report) < 0)
		return 1;

	/* Avoid the delivery of the destroy event in death_by_timeout(). */
	set_bit(IPS_DYING_BIT, &i->status);
	return 1;
}

static int kill_all(struct nf_conn *i, void *data)
{
	return 1;
}

void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size)
{
	if (vmalloced)
		vfree(hash);
	else
		free_pages((unsigned long)hash,
			   get_order(sizeof(struct hlist_head) * size));
}
EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);

void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
{
	struct __nf_ct_flush_report fr = {
		.pid 	= pid,
		.report = report,
	};
	nf_ct_iterate_cleanup(net, kill_report, &fr);
}
EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);

static void nf_ct_release_dying_list(struct net *net)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;

	spin_lock_bh(&nf_conntrack_lock);
	hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		/* never fails to remove them, no listeners at this point */
		nf_ct_kill(ct);
	}
	spin_unlock_bh(&nf_conntrack_lock);
}

static void nf_conntrack_cleanup_init_net(void)
{
	/* wait until all references to nf_conntrack_untracked are dropped */
	while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
		schedule();

	nf_conntrack_helper_fini();
	nf_conntrack_proto_fini();
}

static void nf_conntrack_cleanup_net(struct net *net)
{
 i_see_dead_people:
	nf_ct_iterate_cleanup(net, kill_all, NULL);
	nf_ct_release_dying_list(net);
	if (atomic_read(&net->ct.count) != 0) {
		schedule();
		goto i_see_dead_people;
	}

	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
			     net->ct.htable_size);
	nf_conntrack_ecache_fini(net);
	nf_conntrack_acct_fini(net);
	nf_conntrack_expect_fini(net);
	kmem_cache_destroy(net->ct.nf_conntrack_cachep);
	kfree(net->ct.slabname);
	free_percpu(net->ct.stat);
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void nf_conntrack_cleanup(struct net *net)
{
	if (net_eq(net, &init_net))
		rcu_assign_pointer(ip_ct_attach, NULL);

	/* This makes sure all current packets have passed through
	   netfilter framework.  Roll on, two-stage module
	   delete... */
	synchronize_net();

	nf_conntrack_cleanup_net(net);

	if (net_eq(net, &init_net)) {
		rcu_assign_pointer(nf_ct_destroy, NULL);
		nf_conntrack_cleanup_init_net();
	}
}

void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls)
{
	struct hlist_nulls_head *hash;
	unsigned int nr_slots, i;
	size_t sz;

	*vmalloced = 0;

	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
	sz = nr_slots * sizeof(struct hlist_nulls_head);
	hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
					get_order(sz));
	if (!hash) {
		*vmalloced = 1;
		printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
		hash = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
	}

	if (hash && nulls)
		for (i = 0; i < nr_slots; i++)
			INIT_HLIST_NULLS_HEAD(&hash[i], i);

	return hash;
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);

int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
{
	int i, bucket, vmalloced, old_vmalloced;
	unsigned int hashsize, old_size;
	struct hlist_nulls_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;

	if (current->nsproxy->net_ns != &init_net)
		return -EOPNOTSUPP;

	/* On boot, we can set this without any fancy locking. */
	if (!nf_conntrack_htable_size)
		return param_set_uint(val, kp);

	hashsize = simple_strtoul(val, NULL, 0);
	if (!hashsize)
		return -EINVAL;

	hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1);
	if (!hash)
		return -ENOMEM;

	/* Lookups in the old hash might happen in parallel, which means we
	 * might get false negatives during connection lookup. New connections
	 * created because of a false negative won't make it into the hash
	 * though since that required taking the lock.
	 */
	spin_lock_bh(&nf_conntrack_lock);
	for (i = 0; i < init_net.ct.htable_size; i++) {
		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
			h = hlist_nulls_entry(init_net.ct.hash[i].first,
					struct nf_conntrack_tuple_hash, hnnode);
			hlist_nulls_del_rcu(&h->hnnode);
			bucket = __hash_conntrack(&h->tuple, hashsize,
						  nf_conntrack_hash_rnd);
			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
		}
	}
	old_size = init_net.ct.htable_size;
	old_vmalloced = init_net.ct.hash_vmalloc;
	old_hash = init_net.ct.hash;

	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
	init_net.ct.hash_vmalloc = vmalloced;
	init_net.ct.hash = hash;
	spin_unlock_bh(&nf_conntrack_lock);

	nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);

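/*
 * The module_param_call() below exposes the table size as a writable module
 * parameter, so on a typical build the hash can be resized at runtime with
 * something like (illustrative only; the exact path depends on how the
 * module is named and built):
 *
 *	echo 65536 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * which ends up in nf_conntrack_set_hashsize() above.
 */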
module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
		  &nf_conntrack_htable_size, 0600);

static int nf_conntrack_init_init_net(void)
{
	int max_factor = 8;
	int ret;

	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
	if (!nf_conntrack_htable_size) {
		nf_conntrack_htable_size
			= (((totalram_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 16384;
		if (nf_conntrack_htable_size < 32)
			nf_conntrack_htable_size = 32;

		/* Use a max. factor of four by default to get the same max as
		 * with the old struct list_heads. When a table size is given
		 * we use the old value of 8 to avoid reducing the max.
		 * entries. */
		max_factor = 4;
	}
	nf_conntrack_max = max_factor * nf_conntrack_htable_size;

	printk("nf_conntrack version %s (%u buckets, %d max)\n",
	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
	       nf_conntrack_max);

	ret = nf_conntrack_proto_init();
	if (ret < 0)
		goto err_proto;

	ret = nf_conntrack_helper_init();
	if (ret < 0)
		goto err_helper;

	/* Set up fake conntrack: to never be deleted, not in any hashes */
#ifdef CONFIG_NET_NS
	nf_conntrack_untracked.ct_net = &init_net;
#endif
	atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
	/*  - and make it look like a confirmed connection */
	set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);

	return 0;

err_helper:
	nf_conntrack_proto_fini();
err_proto:
	return ret;
}

/*
 * We need to use special "null" values, not used in hash table
 */
#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)
#define DYING_NULLS_VAL		((1<<30)+1)

static int nf_conntrack_init_net(struct net *net)
{
	int ret;

	atomic_set(&net->ct.count, 0);
	INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
	INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat) {
		ret = -ENOMEM;
		goto err_stat;
	}

	net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
	if (!net->ct.slabname) {
		ret = -ENOMEM;
		goto err_slabname;
	}

	net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
							sizeof(struct nf_conn), 0,
							SLAB_DESTROY_BY_RCU, NULL);
	if (!net->ct.nf_conntrack_cachep) {
		printk(KERN_ERR "Unable to create nf_conn slab cache\n");
		ret = -ENOMEM;
		goto err_cache;
	}

	net->ct.htable_size = nf_conntrack_htable_size;
	net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size,
					     &net->ct.hash_vmalloc, 1);
	if (!net->ct.hash) {
		ret = -ENOMEM;
		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
		goto err_hash;
	}
	ret = nf_conntrack_expect_init(net);
	if (ret < 0)
		goto err_expect;
	ret = nf_conntrack_acct_init(net);
	if (ret < 0)
		goto err_acct;
	ret = nf_conntrack_ecache_init(net);
	if (ret < 0)
		goto err_ecache;

	return 0;

err_ecache:
	nf_conntrack_acct_fini(net);
err_acct:
	nf_conntrack_expect_fini(net);
err_expect:
	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
			     net->ct.htable_size);
err_hash:
	kmem_cache_destroy(net->ct.nf_conntrack_cachep);
err_cache:
	kfree(net->ct.slabname);
err_slabname:
	free_percpu(net->ct.stat);
err_stat:
	return ret;
}

s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
			enum ip_conntrack_dir dir,
			u32 seq);
EXPORT_SYMBOL_GPL(nf_ct_nat_offset);

int nf_conntrack_init(struct net *net)
{
	int ret;

	if (net_eq(net, &init_net)) {
		ret = nf_conntrack_init_init_net();
		if (ret < 0)
			goto out_init_net;
	}
	ret = nf_conntrack_init_net(net);
	if (ret < 0)
		goto out_net;

	if (net_eq(net, &init_net)) {
		/* For use by REJECT target */
		rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
		rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);

		/* Howto get NAT offsets */
		rcu_assign_pointer(nf_ct_nat_offset, NULL);
	}
	return 0;

out_net:
	if (net_eq(net, &init_net))
		nf_conntrack_cleanup_init_net();
out_init_net:
	return ret;
}