/*
 * (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2011 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
#include <linux/jhash.h>
#include <linux/rtnetlink.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_l3proto.h>
#include <net/netfilter/nf_nat_l4proto.h>
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h>

static DEFINE_SPINLOCK(nf_nat_lock);

static DEFINE_MUTEX(nf_nat_proto_mutex);
static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
						__read_mostly;
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
						__read_mostly;


inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
{
	return rcu_dereference(nf_nat_l3protos[family]);
}

inline const struct nf_nat_l4proto *
__nf_nat_l4proto_find(u8 family, u8 protonum)
{
	return rcu_dereference(nf_nat_l4protos[family][protonum]);
}
EXPORT_SYMBOL_GPL(__nf_nat_l4proto_find);

#ifdef CONFIG_XFRM
static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
	const struct nf_nat_l3proto *l3proto;
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	unsigned long statusbit;
	u8 family;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return;

	family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
	rcu_read_lock();
	l3proto = __nf_nat_l3proto_find(family);
	if (l3proto == NULL)
		goto out;

	dir = CTINFO2DIR(ctinfo);
	if (dir == IP_CT_DIR_ORIGINAL)
		statusbit = IPS_DST_NAT;
	else
		statusbit = IPS_SRC_NAT;

	l3proto->decode_session(skb, ct, dir, statusbit, fl);
out:
	rcu_read_unlock();
}

int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family)
{
	struct flowi fl;
	unsigned int hh_len;
	struct dst_entry *dst;
	int err;

	err = xfrm_decode_session(skb, &fl, family);
	if (err < 0)
		return err;

	dst = skb_dst(skb);
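	/* If the current dst is already an xfrm bundle, restart the lookup
	 * from the plain route underneath it rather than from the bundle.
	 */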
	if (dst->xfrm)
		dst = ((struct xfrm_dst *)dst)->route;
	dst_hold(dst);

	dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
	if (IS_ERR(dst))
		return PTR_ERR(dst);

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	/* Change in oif may mean change in hh_len. */
	hh_len = skb_dst(skb)->dev->hard_header_len;
	if (skb_headroom(skb) < hh_len &&
	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
		return -ENOMEM;
	return 0;
}
EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */

/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
hash_by_src(const struct net *net, u16 zone,
	    const struct nf_conntrack_tuple *tuple)
{
	unsigned int hash;

	/* Original src, to ensure we map it consistently if poss. */
	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
		      tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd);
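	/* Scale the 32-bit hash onto [0, nat_htable_size) with a
	 * multiply-and-shift rather than a modulo: no division is needed
	 * and the result stays evenly distributed for any table size.
	 */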
	return ((u64)hash * net->ct.nat_htable_size) >> 32;
}

/* Is this tuple already taken? (not by us) */
int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
		  const struct nf_conn *ignored_conntrack)
{
	/* Conntrack tracking doesn't keep track of outgoing tuples; only
	 * incoming ones.  NAT means they don't have a fixed mapping,
	 * so we invert the tuple and look for the incoming reply.
	 *
	 * We could keep a separate hash if this proves too slow.
	 */
	struct nf_conntrack_tuple reply;

	nf_ct_invert_tuplepr(&reply, tuple);
	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}
EXPORT_SYMBOL(nf_nat_used_tuple);

/* If we source map this tuple so reply looks like reply_tuple, will
 * that meet the constraints of range?
 */
static int in_range(const struct nf_nat_l3proto *l3proto,
		    const struct nf_nat_l4proto *l4proto,
		    const struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range *range)
{
	/* If we are supposed to map IPs, then we must be in the
	 * range specified, otherwise let this drag us onto a new src IP.
	 */
	if (range->flags & NF_NAT_RANGE_MAP_IPS &&
	    !l3proto->in_range(tuple, range))
		return 0;

	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) ||
	    l4proto->in_range(tuple, NF_NAT_MANIP_SRC,
			      &range->min_proto, &range->max_proto))
		return 1;

	return 0;
}

static inline int
same_src(const struct nf_conn *ct,
	 const struct nf_conntrack_tuple *tuple)
{
	const struct nf_conntrack_tuple *t;

	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
	return (t->dst.protonum == tuple->dst.protonum &&
		nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
		t->src.u.all == tuple->src.u.all);
}

/* Only called for SRC manip */
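/* If another conntrack with the same original source (address, port and
 * protocol) has already been source-mapped, reuse that mapping: copy the
 * mapped source from its reply tuple and keep our own destination, so one
 * client keeps one external identity.
 */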
static int
find_appropriate_src(struct net *net, u16 zone,
		     const struct nf_nat_l3proto *l3proto,
		     const struct nf_nat_l4proto *l4proto,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range *range)
{
	unsigned int h = hash_by_src(net, zone, tuple);
	const struct nf_conn_nat *nat;
	const struct nf_conn *ct;

	hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) {
		ct = nat->ct;
		if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
			/* Copy source part from reply tuple. */
			nf_ct_invert_tuplepr(result,
				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
			result->dst = tuple->dst;

			if (in_range(l3proto, l4proto, result, range))
				return 1;
		}
	}
	return 0;
}

/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 */
static void
find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	union nf_inet_addr *var_ipp;
	unsigned int i, max;
	/* Host order */
	u32 minip, maxip, j, dist;
	bool full_range;

	/* No IP mapping?  Do nothing. */
	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
		return;

	if (maniptype == NF_NAT_MANIP_SRC)
		var_ipp = &tuple->src.u3;
	else
		var_ipp = &tuple->dst.u3;

	/* Fast path: only one choice. */
	if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
		*var_ipp = range->min_addr;
		return;
	}

	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
		max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
	else
		max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway).  The consistency means that servers see the same
	 * client coming from the same IP (some Internet Banking sites
	 * like this), even across reboots.
	 */
	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
		   range->flags & NF_NAT_RANGE_PERSISTENT ?
			0 : (__force u32)tuple->dst.u3.all[max] ^ zone);

	full_range = false;
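	/* Pick the address one 32-bit word at a time: while every word
	 * chosen so far sits at the range maximum, scale j into
	 * [minip, maxip] with a multiply-and-shift; once a word falls below
	 * the maximum, the remaining words may use the full 32-bit range.
	 */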
	for (i = 0; i <= max; i++) {
		/* If first bytes of the address are at the maximum, use the
		 * distance. Otherwise use the full range.
		 */
		if (!full_range) {
			minip = ntohl((__force __be32)range->min_addr.all[i]);
			maxip = ntohl((__force __be32)range->max_addr.all[i]);
			dist  = maxip - minip + 1;
		} else {
			minip = 0;
			dist  = ~0;
		}

		var_ipp->all[i] = (__force __u32)
			htonl(minip + (((u64)j * dist) >> 32));
		if (var_ipp->all[i] != range->max_addr.all[i])
			full_range = true;

		if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
			j ^= (__force u32)tuple->dst.u3.all[i];
	}
}

/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __ip_conntrack_confirm and drop the packet. */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_tuple *orig_tuple,
		 const struct nf_nat_range *range,
		 struct nf_conn *ct,
		 enum nf_nat_manip_type maniptype)
{
	const struct nf_nat_l3proto *l3proto;
	const struct nf_nat_l4proto *l4proto;
	struct net *net = nf_ct_net(ct);
	u16 zone = nf_ct_zone(ct);

	rcu_read_lock();
	l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
	l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
					orig_tuple->dst.protonum);

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	 * and that same mapping gives a unique tuple within the given
	 * range, use that.
	 *
	 * This is only required for source (ie. NAT/masq) mappings.
	 * So far, we don't do local source mappings, so multiple
	 * manips are not an issue.
	 */
	if (maniptype == NF_NAT_MANIP_SRC &&
	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
		/* try the original tuple first */
		if (in_range(l3proto, l4proto, orig_tuple, range)) {
			if (!nf_nat_used_tuple(orig_tuple, ct)) {
				*tuple = *orig_tuple;
				goto out;
			}
		} else if (find_appropriate_src(net, zone, l3proto, l4proto,
						orig_tuple, tuple, range)) {
			pr_debug("get_unique_tuple: Found current src map\n");
			if (!nf_nat_used_tuple(tuple, ct))
				goto out;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given range */
	*tuple = *orig_tuple;
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/* 3) The per-protocol part of the manip is made to map into
	 * the range to make a unique tuple.
	 */

	/* Only bother mapping if it's not already in range and unique */
	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
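			/* A single-value range (min == max) is taken even
			 * if the tuple is in use; there is nothing else to
			 * try, and a real clash is dropped at confirmation.
			 */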
			if (l4proto->in_range(tuple, maniptype,
					      &range->min_proto,
					      &range->max_proto) &&
			    (range->min_proto.all == range->max_proto.all ||
			     !nf_nat_used_tuple(tuple, ct)))
				goto out;
		} else if (!nf_nat_used_tuple(tuple, ct)) {
			goto out;
		}
	}

	/* Last chance: get protocol to try to obtain unique tuple. */
	l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
out:
	rcu_read_unlock();
}

unsigned int
nf_nat_setup_info(struct nf_conn *ct,
		  const struct nf_nat_range *range,
		  enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple curr_tuple, new_tuple;
	struct nf_conn_nat *nat;

	/* nat helper or nfctnetlink may also set up the binding */
	nat = nfct_nat(ct);
	if (!nat) {
		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
		if (nat == NULL) {
			pr_debug("failed to add NAT extension\n");
			return NF_ACCEPT;
		}
	}

	NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
		     maniptype == NF_NAT_MANIP_DST);
	BUG_ON(nf_nat_initialized(ct, maniptype));

	/* What we've got will look like inverse of reply. Normally
	 * this is what is in the conntrack, except for prior
	 * manipulations (future optimization: if num_manips == 0,
	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
	 */
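	/* e.g. for a fresh connection a:p -> b:q the reply tuple is
	 * b:q -> a:p, so the inversion below yields a:p -> b:q again; only a
	 * previously installed manip makes curr_tuple differ from the
	 * original tuple.
	 */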
	nf_ct_invert_tuplepr(&curr_tuple,
			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Alter conntrack table so will recognize replies. */
		nf_ct_invert_tuplepr(&reply, &new_tuple);
		nf_conntrack_alter_reply(ct, &reply);

		/* Non-atomic: we own this at the moment. */
		if (maniptype == NF_NAT_MANIP_SRC)
			ct->status |= IPS_SRC_NAT;
		else
			ct->status |= IPS_DST_NAT;

		if (nfct_help(ct))
			nfct_seqadj_ext_add(ct);
	}

	if (maniptype == NF_NAT_MANIP_SRC) {
		unsigned int srchash;

		srchash = hash_by_src(net, nf_ct_zone(ct),
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		spin_lock_bh(&nf_nat_lock);
	/* nf_conntrack_alter_reply might re-allocate extension area */
		nat = nfct_nat(ct);
		nat->ct = ct;
		hlist_add_head_rcu(&nat->bysource,
				   &net->ct.nat_bysource[srchash]);
		spin_unlock_bh(&nf_nat_lock);
	}

	/* It's done. */
	if (maniptype == NF_NAT_MANIP_DST)
		ct->status |= IPS_DST_NAT_DONE;
	else
		ct->status |= IPS_SRC_NAT_DONE;

	return NF_ACCEPT;
}
EXPORT_SYMBOL(nf_nat_setup_info);

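/* Set up an identity binding: force the address to stay what it already is
 * (taken from the reply tuple) and let get_unique_tuple() adjust only the
 * per-proto part if needed, so the conntrack is still marked as NAT-handled.
 */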
unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
	/* Force range to this IP; let proto decide mapping for
	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
	 * Use reply in case it's already been mangled (eg local packet).
	 */
	union nf_inet_addr ip =
		(HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
	struct nf_nat_range range = {
		.flags		= NF_NAT_RANGE_MAP_IPS,
		.min_addr	= ip,
		.max_addr	= ip,
	};
	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);

/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
			   enum ip_conntrack_info ctinfo,
			   unsigned int hooknum,
			   struct sk_buff *skb)
{
	const struct nf_nat_l3proto *l3proto;
	const struct nf_nat_l4proto *l4proto;
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	unsigned long statusbit;
	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);

	if (mtype == NF_NAT_MANIP_SRC)
		statusbit = IPS_SRC_NAT;
	else
		statusbit = IPS_DST_NAT;

	/* Invert if this is reply dir. */
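	/* IPS_SRC_NAT and IPS_DST_NAT together form IPS_NAT_MASK, so this
	 * XOR swaps one status bit for the other.
	 */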
	if (dir == IP_CT_DIR_REPLY)
		statusbit ^= IPS_NAT_MASK;

	/* Non-atomic: these bits don't change. */
	if (ct->status & statusbit) {
		struct nf_conntrack_tuple target;

		/* We are aiming to look like inverse of other direction. */
		nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);

		l3proto = __nf_nat_l3proto_find(target.src.l3num);
		l4proto = __nf_nat_l4proto_find(target.src.l3num,
						target.dst.protonum);
		if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
			return NF_DROP;
	}
	return NF_ACCEPT;
}
EXPORT_SYMBOL_GPL(nf_nat_packet);

struct nf_nat_proto_clean {
	u8	l3proto;
	u8	l4proto;
};

/* kill conntracks with affected NAT section */
static int nf_nat_proto_remove(struct nf_conn *i, void *data)
{
	const struct nf_nat_proto_clean *clean = data;
	struct nf_conn_nat *nat = nfct_nat(i);

	if (!nat)
		return 0;

	if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
	    (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
		return 0;

	return i->status & IPS_NAT_MASK ? 1 : 0;
}

static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto)
{
	struct nf_nat_proto_clean clean = {
		.l3proto = l3proto,
		.l4proto = l4proto,
	};
	struct net *net;

	rtnl_lock();
	for_each_net(net)
		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
	rtnl_unlock();
}

static void nf_nat_l3proto_clean(u8 l3proto)
{
	struct nf_nat_proto_clean clean = {
		.l3proto = l3proto,
	};
	struct net *net;

	rtnl_lock();

	for_each_net(net)
		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
	rtnl_unlock();
}

/* Protocol registration. */
int nf_nat_l4proto_register(u8 l3proto, const struct nf_nat_l4proto *l4proto)
{
	const struct nf_nat_l4proto **l4protos;
	unsigned int i;
	int ret = 0;

	mutex_lock(&nf_nat_proto_mutex);
	if (nf_nat_l4protos[l3proto] == NULL) {
		l4protos = kmalloc(IPPROTO_MAX * sizeof(struct nf_nat_l4proto *),
				   GFP_KERNEL);
		if (l4protos == NULL) {
			ret = -ENOMEM;
			goto out;
		}

		for (i = 0; i < IPPROTO_MAX; i++)
			RCU_INIT_POINTER(l4protos[i], &nf_nat_l4proto_unknown);

		/* Before making proto_array visible to lockless readers,
		 * we must make sure its content is committed to memory.
		 */
		smp_wmb();

		nf_nat_l4protos[l3proto] = l4protos;
	}

	if (rcu_dereference_protected(
			nf_nat_l4protos[l3proto][l4proto->l4proto],
			lockdep_is_held(&nf_nat_proto_mutex)
			) != &nf_nat_l4proto_unknown) {
		ret = -EBUSY;
		goto out;
	}
	RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], l4proto);
 out:
	mutex_unlock(&nf_nat_proto_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(nf_nat_l4proto_register);

/* No one stores the protocol anywhere; simply delete it. */
void nf_nat_l4proto_unregister(u8 l3proto, const struct nf_nat_l4proto *l4proto)
{
	mutex_lock(&nf_nat_proto_mutex);
	RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto],
			 &nf_nat_l4proto_unknown);
	mutex_unlock(&nf_nat_proto_mutex);
	synchronize_rcu();

	nf_nat_l4proto_clean(l3proto, l4proto->l4proto);
}
EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister);

int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
{
	int err;

	err = nf_ct_l3proto_try_module_get(l3proto->l3proto);
	if (err < 0)
		return err;

	mutex_lock(&nf_nat_proto_mutex);
	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP],
			 &nf_nat_l4proto_tcp);
	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
			 &nf_nat_l4proto_udp);
	mutex_unlock(&nf_nat_proto_mutex);

	RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_nat_l3proto_register);

void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
{
	mutex_lock(&nf_nat_proto_mutex);
	RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], NULL);
	mutex_unlock(&nf_nat_proto_mutex);
	synchronize_rcu();

	nf_nat_l3proto_clean(l3proto->l3proto);
	nf_ct_l3proto_module_put(l3proto->l3proto);
}
EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);

/* No one is using the conntrack by the time this is called. */
static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
	struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);

	if (nat == NULL || nat->ct == NULL)
		return;

	NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);

	spin_lock_bh(&nf_nat_lock);
	hlist_del_rcu(&nat->bysource);
	spin_unlock_bh(&nf_nat_lock);
}

static void nf_nat_move_storage(void *new, void *old)
{
	struct nf_conn_nat *new_nat = new;
	struct nf_conn_nat *old_nat = old;
	struct nf_conn *ct = old_nat->ct;

	if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
		return;

	spin_lock_bh(&nf_nat_lock);
	hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
	spin_unlock_bh(&nf_nat_lock);
}

static struct nf_ct_ext_type nat_extend __read_mostly = {
	.len		= sizeof(struct nf_conn_nat),
	.align		= __alignof__(struct nf_conn_nat),
	.destroy	= nf_nat_cleanup_conntrack,
	.move		= nf_nat_move_storage,
	.id		= NF_CT_EXT_NAT,
	.flags		= NF_CT_EXT_F_PREALLOC,
};

#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
};

static int nfnetlink_parse_nat_proto(struct nlattr *attr,
				     const struct nf_conn *ct,
				     struct nf_nat_range *range)
{
	struct nlattr *tb[CTA_PROTONAT_MAX+1];
	const struct nf_nat_l4proto *l4proto;
	int err;

	err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
	if (err < 0)
		return err;

	l4proto = __nf_nat_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	if (l4proto->nlattr_to_range)
		err = l4proto->nlattr_to_range(tb, range);

	return err;
}

static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
	[CTA_NAT_V4_MINIP]	= { .type = NLA_U32 },
	[CTA_NAT_V4_MAXIP]	= { .type = NLA_U32 },
	[CTA_NAT_V6_MINIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_V6_MAXIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_PROTO]		= { .type = NLA_NESTED },
};

static int
nfnetlink_parse_nat(const struct nlattr *nat,
		    const struct nf_conn *ct, struct nf_nat_range *range)
{
	const struct nf_nat_l3proto *l3proto;
	struct nlattr *tb[CTA_NAT_MAX+1];
	int err;

	memset(range, 0, sizeof(*range));

	err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
	if (err < 0)
		return err;

	rcu_read_lock();
	l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
	if (l3proto == NULL) {
		err = -EAGAIN;
		goto out;
	}
	err = l3proto->nlattr_to_range(tb, range);
	if (err < 0)
		goto out;

	if (!tb[CTA_NAT_PROTO])
		goto out;

	err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
out:
	rcu_read_unlock();
	return err;
}

static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	struct nf_nat_range range;
	int err;

	err = nfnetlink_parse_nat(attr, ct, &range);
	if (err < 0)
		return err;
	if (nf_nat_initialized(ct, manip))
		return -EEXIST;

	return nf_nat_setup_info(ct, &range, manip);
}
#else
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	return -EOPNOTSUPP;
}
#endif

static int __net_init nf_nat_net_init(struct net *net)
{
	/* Leave them the same for the moment. */
	net->ct.nat_htable_size = net->ct.htable_size;
	net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0);
	if (!net->ct.nat_bysource)
		return -ENOMEM;
	return 0;
}

static void __net_exit nf_nat_net_exit(struct net *net)
{
	struct nf_nat_proto_clean clean = {};

	nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean, 0, 0);
	synchronize_rcu();
	nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
}

static struct pernet_operations nf_nat_net_ops = {
	.init = nf_nat_net_init,
	.exit = nf_nat_net_exit,
};

static struct nf_ct_helper_expectfn follow_master_nat = {
	.name		= "nat-follow-master",
	.expectfn	= nf_nat_follow_master,
};

static int __init nf_nat_init(void)
{
	int ret;

	ret = nf_ct_extend_register(&nat_extend);
	if (ret < 0) {
		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
		return ret;
	}

	ret = register_pernet_subsys(&nf_nat_net_ops);
	if (ret < 0)
		goto cleanup_extend;

	nf_ct_helper_expectfn_register(&follow_master_nat);

	/* Initialize fake conntrack so that NAT will skip it */
	nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);

	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
			   nfnetlink_parse_nat_setup);
#ifdef CONFIG_XFRM
	BUG_ON(nf_nat_decode_session_hook != NULL);
	RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session);
#endif
	return 0;

 cleanup_extend:
	nf_ct_extend_unregister(&nat_extend);
	return ret;
}

static void __exit nf_nat_cleanup(void)
{
	unsigned int i;

	unregister_pernet_subsys(&nf_nat_net_ops);
	nf_ct_extend_unregister(&nat_extend);
	nf_ct_helper_expectfn_unregister(&follow_master_nat);
	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
#ifdef CONFIG_XFRM
	RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
#endif
	for (i = 0; i < NFPROTO_NUMPROTO; i++)
		kfree(nf_nat_l4protos[i]);
	synchronize_net();
}

MODULE_LICENSE("GPL");

module_init(nf_nat_init);
module_exit(nf_nat_cleanup);