ip_fragment.c 17.2 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima          :       ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 *		Alexey Kuznetsov:	SMP races, threading, cleanup.
 *		Patrick McHardy :	LRU queue of frag heads for evictor.
 */

24 25
#define pr_fmt(fmt) "IPv4: " fmt

H
Herbert Xu 已提交
26
#include <linux/compiler.h>
L
Linus Torvalds 已提交
27 28 29 30 31 32 33 34 35 36 37
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
38
#include <linux/slab.h>
39 40
#include <net/route.h>
#include <net/dst.h>
L
Linus Torvalds 已提交
41 42 43 44
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
H
Herbert Xu 已提交
45
#include <net/inetpeer.h>
46
#include <net/inet_frag.h>
L
Linus Torvalds 已提交
47 48 49 50
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
51
#include <net/inet_ecn.h>
52
#include <net/l3mdev.h>
L
Linus Torvalds 已提交
53 54 55 56 57

/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
 * as well. Or notify me, at least. --ANK
 */
58
/* Cache name handed to ip4_frags.frags_cache_name in ipfrag_init(). */
static const char ip_frag_cache_name[] = "ip4-frags";
H
Herbert Xu 已提交
59

L
Linus Torvalds 已提交
60 61
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
62 63
	struct inet_frag_queue q;

64
	u8		ecn; /* RFC3168 support */
65
	u16		max_df_size; /* largest frag with DF set seen */
H
Herbert Xu 已提交
66 67 68
	int             iif;
	unsigned int    rid;
	struct inet_peer *peer;
L
Linus Torvalds 已提交
69 70
};

71
/* Turn the ECN field of a TOS byte into a single-bit flag.
 *
 * The masked ECN value selects which bit is set, so the flags of several
 * fragments can be OR-ed together (see the "qp->ecn |= ecn" accumulation in
 * ip_frag_queue()) and later used as an index into ip_frag_ecn_table.
 */
static u8 ip4_frag_ecn(u8 tos)
{
	return 1 << (tos & INET_ECN_MASK);
}

76
/* IPv4 fragment-queue descriptor; its callbacks and parameters are
 * filled in by ipfrag_init().
 */
static struct inet_frags ip4_frags;

/* Forward declaration: ip_frag_queue() calls this before its definition. */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
			 struct sk_buff *prev_tail, struct net_device *dev);
80

81

82
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
83 84
{
	struct ipq *qp = container_of(q, struct ipq, q);
85
	struct net *net = q->fqdir->net;
86

87
	const struct frag_v4_compare_key *key = a;
88

89 90
	q->key.v4 = *key;
	qp->ecn = 0;
E
Eric Dumazet 已提交
91
	qp->peer = q->fqdir->max_dist ?
92
		inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
93
		NULL;
94 95
}

96
static void ip4_frag_free(struct inet_frag_queue *q)
L
Linus Torvalds 已提交
97
{
98 99 100 101 102
	struct ipq *qp;

	qp = container_of(q, struct ipq, q);
	if (qp->peer)
		inet_putpeer(qp->peer);
L
Linus Torvalds 已提交
103 104 105 106 107
}


/* Destruction primitives. */

108
static void ipq_put(struct ipq *ipq)
L
Linus Torvalds 已提交
109
{
110
	inet_frag_put(&ipq->q);
L
Linus Torvalds 已提交
111 112 113 114 115 116 117
}

/* Kill ipq entry. It is not destroyed immediately,
 * because caller (and someone more) holds reference count.
 */
static void ipq_kill(struct ipq *ipq)
{
118
	inet_frag_kill(&ipq->q);
L
Linus Torvalds 已提交
119 120
}

121 122 123 124
/* Tell whether @user is one of the defrag users for which ip_expire()
 * suppresses the ICMP error unless the route is local: AF_PACKET, or any
 * user inside the conntrack or bridge-conntrack input ranges.
 */
static bool frag_expire_skip_icmp(u32 user)
{
	return user == IP_DEFRAG_AF_PACKET ||
	       ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
					 __IP_DEFRAG_CONNTRACK_IN_END) ||
	       ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN,
					 __IP_DEFRAG_CONNTRACK_BRIDGE_IN);
}

L
Linus Torvalds 已提交
130 131 132
/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 */
133
static void ip_expire(struct timer_list *t)
L
Linus Torvalds 已提交
134
{
135
	struct inet_frag_queue *frag = from_timer(frag, t, timer);
136
	const struct iphdr *iph;
137
	struct sk_buff *head = NULL;
138
	struct net *net;
139 140
	struct ipq *qp;
	int err;
141

142
	qp = container_of(frag, struct ipq, q);
143
	net = qp->q.fqdir->net;
L
Linus Torvalds 已提交
144

145
	rcu_read_lock();
146
	spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
147

148
	if (qp->q.flags & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
149 150 151
		goto out;

	ipq_kill(qp);
E
Eric Dumazet 已提交
152
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
153
	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
154

155
	if (!(qp->q.flags & INET_FRAG_FIRST_IN))
156
		goto out;
157

158 159 160 161
	/* sk_buff::dev and sk_buff::rbnode are unionized. So we
	 * pull the head out of the tree in order to be able to
	 * deal with head->dev.
	 */
162 163 164
	head = inet_frag_pull_head(&qp->q);
	if (!head)
		goto out;
165 166 167
	head->dev = dev_get_by_index_rcu(net, qp->iif);
	if (!head->dev)
		goto out;
168

169

170 171 172
	/* skb has no dst, perform route lookup again */
	iph = ip_hdr(head);
	err = ip_route_input_noref(head, iph->daddr, iph->saddr,
173
					   iph->tos, head->dev);
174 175 176 177 178 179 180 181 182 183
	if (err)
		goto out;

	/* Only an end host needs to send an ICMP
	 * "Fragment Reassembly Timeout" message, per RFC792.
	 */
	if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
	    (skb_rtable(head)->rt_type != RTN_LOCAL))
		goto out;

184 185 186
	spin_unlock(&qp->q.lock);
	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
	goto out_rcu_unlock;
187

L
Linus Torvalds 已提交
188
out:
189
	spin_unlock(&qp->q.lock);
190 191
out_rcu_unlock:
	rcu_read_unlock();
192
	kfree_skb(head);
193
	ipq_put(qp);
L
Linus Torvalds 已提交
194 195
}

196 197 198
/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 */
199 200
static struct ipq *ip_find(struct net *net, struct iphdr *iph,
			   u32 user, int vif)
L
Linus Torvalds 已提交
201
{
202 203 204 205 206 207 208 209
	struct frag_v4_compare_key key = {
		.saddr = iph->saddr,
		.daddr = iph->daddr,
		.user = user,
		.vif = vif,
		.id = iph->id,
		.protocol = iph->protocol,
	};
210
	struct inet_frag_queue *q;
211

212
	q = inet_frag_find(net->ipv4.fqdir, &key);
213
	if (!q)
214
		return NULL;
215

216
	return container_of(q, struct ipq, q);
L
Linus Torvalds 已提交
217 218
}

H
Herbert Xu 已提交
219
/* Is the fragment too far ahead to be part of ipq? */
220
static int ip_frag_too_far(struct ipq *qp)
H
Herbert Xu 已提交
221 222
{
	struct inet_peer *peer = qp->peer;
E
Eric Dumazet 已提交
223
	unsigned int max = qp->q.fqdir->max_dist;
H
Herbert Xu 已提交
224 225 226 227 228 229 230 231 232 233 234
	unsigned int start, end;

	int rc;

	if (!peer || !max)
		return 0;

	start = qp->rid;
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

235
	rc = qp->q.fragments_tail && (end - start) > max;
H
Herbert Xu 已提交
236

237 238
	if (rc)
		__IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS);
H
Herbert Xu 已提交
239 240 241 242 243 244

	return rc;
}

static int ip_frag_reinit(struct ipq *qp)
{
245
	unsigned int sum_truesize = 0;
H
Herbert Xu 已提交
246

E
Eric Dumazet 已提交
247
	if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
248
		refcount_inc(&qp->q.refcnt);
H
Herbert Xu 已提交
249 250 251
		return -ETIMEDOUT;
	}

252
	sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
E
Eric Dumazet 已提交
253
	sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
H
Herbert Xu 已提交
254

255
	qp->q.flags = 0;
256 257
	qp->q.len = 0;
	qp->q.meat = 0;
258
	qp->q.rb_fragments = RB_ROOT;
259
	qp->q.fragments_tail = NULL;
260
	qp->q.last_run_head = NULL;
H
Herbert Xu 已提交
261
	qp->iif = 0;
262
	qp->ecn = 0;
H
Herbert Xu 已提交
263 264 265 266

	return 0;
}

L
Linus Torvalds 已提交
267
/* Add new segment to existing queue. */
268
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
L
Linus Torvalds 已提交
269
{
270
	struct net *net = qp->q.fqdir->net;
271 272
	int ihl, end, flags, offset;
	struct sk_buff *prev_tail;
273
	struct net_device *dev;
274
	unsigned int fragsize;
275
	int err = -ENOENT;
276
	u8 ecn;
L
Linus Torvalds 已提交
277

278
	if (qp->q.flags & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
279 280
		goto err;

H
Herbert Xu 已提交
281
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
282 283
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
H
Herbert Xu 已提交
284 285 286 287
		ipq_kill(qp);
		goto err;
	}

288
	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
289
	offset = ntohs(ip_hdr(skb)->frag_off);
L
Linus Torvalds 已提交
290 291 292
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
293
	ihl = ip_hdrlen(skb);
L
Linus Torvalds 已提交
294 295

	/* Determine the position of this fragment. */
296
	end = offset + skb->len - skb_network_offset(skb) - ihl;
297
	err = -EINVAL;
L
Linus Torvalds 已提交
298 299 300 301

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
302
		 * or have different end, the segment is corrupted.
L
Linus Torvalds 已提交
303
		 */
304
		if (end < qp->q.len ||
305
		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
306
			goto discard_qp;
307
		qp->q.flags |= INET_FRAG_LAST_IN;
308
		qp->q.len = end;
L
Linus Torvalds 已提交
309 310 311 312 313 314
	} else {
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
315
		if (end > qp->q.len) {
L
Linus Torvalds 已提交
316
			/* Some bits beyond end -> corruption. */
317
			if (qp->q.flags & INET_FRAG_LAST_IN)
318
				goto discard_qp;
319
			qp->q.len = end;
L
Linus Torvalds 已提交
320 321 322
		}
	}
	if (end == offset)
323
		goto discard_qp;
L
Linus Torvalds 已提交
324

325
	err = -ENOMEM;
326
	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
327
		goto discard_qp;
328 329 330

	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
331
		goto discard_qp;
L
Linus Torvalds 已提交
332

333 334 335 336
	/* Note : skb->rbnode and skb->dev share the same location. */
	dev = skb->dev;
	/* Makes sure compiler wont do silly aliasing games */
	barrier();
L
Linus Torvalds 已提交
337

338
	prev_tail = qp->q.fragments_tail;
339 340 341
	err = inet_frag_queue_insert(&qp->q, skb, offset, end);
	if (err)
		goto insert_error;
L
Linus Torvalds 已提交
342

343 344
	if (dev)
		qp->iif = dev->ifindex;
L
Linus Torvalds 已提交
345

346 347
	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
348
	qp->ecn |= ecn;
E
Eric Dumazet 已提交
349
	add_frag_mem_limit(qp->q.fqdir, skb->truesize);
L
Linus Torvalds 已提交
350
	if (offset == 0)
351
		qp->q.flags |= INET_FRAG_FIRST_IN;
L
Linus Torvalds 已提交
352

353 354 355 356 357
	fragsize = skb->len + ihl;

	if (fragsize > qp->q.max_size)
		qp->q.max_size = fragsize;

358
	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
359 360
	    fragsize > qp->max_df_size)
		qp->max_df_size = fragsize;
361

362
	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
363 364
	    qp->q.meat == qp->q.len) {
		unsigned long orefdst = skb->_skb_refdst;
365

366
		skb->_skb_refdst = 0UL;
367
		err = ip_frag_reasm(qp, skb, prev_tail, dev);
368
		skb->_skb_refdst = orefdst;
369 370
		if (err)
			inet_frag_kill(&qp->q);
371 372 373 374
		return err;
	}

	skb_dst_drop(skb);
375
	return -EINPROGRESS;
L
Linus Torvalds 已提交
376

377 378 379 380 381 382
insert_error:
	if (err == IPFRAG_DUP) {
		kfree_skb(skb);
		return -EINVAL;
	}
	err = -EINVAL;
383
	__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
384 385
discard_qp:
	inet_frag_kill(&qp->q);
386
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
387 388
err:
	kfree_skb(skb);
389
	return err;
L
Linus Torvalds 已提交
390 391 392
}

/* Build a new IP datagram from all its fragments. */
393
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
394
			 struct sk_buff *prev_tail, struct net_device *dev)
L
Linus Torvalds 已提交
395
{
396
	struct net *net = qp->q.fqdir->net;
L
Linus Torvalds 已提交
397
	struct iphdr *iph;
398 399
	void *reasm_data;
	int len, err;
400
	u8 ecn;
L
Linus Torvalds 已提交
401 402 403

	ipq_kill(qp);

404
	ecn = ip_frag_ecn_table[qp->ecn];
405 406 407 408
	if (unlikely(ecn == 0xff)) {
		err = -EINVAL;
		goto out_fail;
	}
409

410 411 412 413
	/* Make the one we just received the head. */
	reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail);
	if (!reasm_data)
		goto out_nomem;
L
Linus Torvalds 已提交
414

415
	len = ip_hdrlen(skb) + qp->q.len;
416
	err = -E2BIG;
S
Stephen Hemminger 已提交
417
	if (len > 65535)
L
Linus Torvalds 已提交
418 419
		goto out_oversize;

420
	inet_frag_reasm_finish(&qp->q, skb, reasm_data);
L
Linus Torvalds 已提交
421

422 423
	skb->dev = dev;
	IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
L
Linus Torvalds 已提交
424

425
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
426
	iph->tot_len = htons(len);
427
	iph->tos |= ecn;
428 429 430 431 432 433 434 435 436 437

	/* When we set IP_DF on a refragmented skb we must also force a
	 * call to ip_fragment to avoid forwarding a DF-skb of size s while
	 * original sender only sent fragments of size f (where f < s).
	 *
	 * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
	 * frag seen to avoid sending tiny DF-fragments in case skb was built
	 * from one very small df-fragment and one large non-df frag.
	 */
	if (qp->max_df_size == qp->q.max_size) {
438
		IPCB(skb)->flags |= IPSKB_FRAG_PMTU;
439 440 441 442 443
		iph->frag_off = htons(IP_DF);
	} else {
		iph->frag_off = 0;
	}

444 445
	ip_send_check(iph);

E
Eric Dumazet 已提交
446
	__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
447
	qp->q.rb_fragments = RB_ROOT;
448
	qp->q.fragments_tail = NULL;
449
	qp->q.last_run_head = NULL;
450
	return 0;
L
Linus Torvalds 已提交
451 452

out_nomem:
453
	net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
454
	err = -ENOMEM;
L
Linus Torvalds 已提交
455 456
	goto out_fail;
out_oversize:
457
	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
L
Linus Torvalds 已提交
458
out_fail:
E
Eric Dumazet 已提交
459
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
460
	return err;
L
Linus Torvalds 已提交
461 462 463
}

/* Process an incoming IP datagram fragment. */
464
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
L
Linus Torvalds 已提交
465
{
466
	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
467
	int vif = l3mdev_master_ifindex_rcu(dev);
L
Linus Torvalds 已提交
468
	struct ipq *qp;
469

E
Eric Dumazet 已提交
470
	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
471
	skb_orphan(skb);
L
Linus Torvalds 已提交
472 473

	/* Lookup (or create) queue header */
474
	qp = ip_find(net, ip_hdr(skb), user, vif);
475
	if (qp) {
476
		int ret;
L
Linus Torvalds 已提交
477

478
		spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
479

480
		ret = ip_frag_queue(qp, skb);
L
Linus Torvalds 已提交
481

482
		spin_unlock(&qp->q.lock);
483
		ipq_put(qp);
484
		return ret;
L
Linus Torvalds 已提交
485 486
	}

E
Eric Dumazet 已提交
487
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
488
	kfree_skb(skb);
489
	return -ENOMEM;
L
Linus Torvalds 已提交
490
}
E
Eric Dumazet 已提交
491
EXPORT_SYMBOL(ip_defrag);
L
Linus Torvalds 已提交
492

493
/* Defragment @skb if, and only if, it is a well-formed IPv4 fragment.
 *
 * The IP header is copied out with skb_copy_bits(), so it does not have to
 * be in the linear area yet.  The skb is returned untouched when it is not
 * ETH_P_IP, the header cannot be copied, the version/ihl/tot_len fields are
 * inconsistent with the skb, or the packet is not a fragment.
 *
 * For a fragment the skb is unshared, the header is pulled, the skb is
 * trimmed to tot_len, the IP control block is zeroed and the skb is handed
 * to ip_defrag().
 *
 * Returns:
 *   the original skb          not a fragment / not handled here;
 *   the skb with hash cleared ip_defrag() returned 0;
 *   NULL                      the fragment was queued, or an error occurred
 *                             (unshare, pull, trim or ip_defrag() failure).
 */
struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
	struct iphdr iph;
	int netoff;
	u32 len;

	if (skb->protocol != htons(ETH_P_IP))
		return skb;

	netoff = skb_network_offset(skb);

	if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
		return skb;

	if (iph.ihl < 5 || iph.version != 4)
		return skb;

	len = ntohs(iph.tot_len);
	if (skb->len < netoff + len || len < (iph.ihl * 4))
		return skb;

	if (ip_is_fragment(&iph)) {
		/* skb becomes NULL here if the unshare fails; it is then
		 * returned as NULL by the final return statement.
		 */
		skb = skb_share_check(skb, GFP_ATOMIC);
		if (skb) {
			if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) {
				kfree_skb(skb);
				return NULL;
			}
			if (pskb_trim_rcsum(skb, netoff + len)) {
				kfree_skb(skb);
				return NULL;
			}
			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
			if (ip_defrag(net, skb, user))
				return NULL;
			skb_clear_hash(skb);
		}
	}
	return skb;
}
EXPORT_SYMBOL(ip_check_defrag);

535
#ifdef CONFIG_SYSCTL
536
static int dist_min;
537

538
static struct ctl_table ip4_frags_ns_ctl_table[] = {
539 540
	{
		.procname	= "ipfrag_high_thresh",
541
		.maxlen		= sizeof(unsigned long),
542
		.mode		= 0644,
543
		.proc_handler	= proc_doulongvec_minmax,
544 545 546
	},
	{
		.procname	= "ipfrag_low_thresh",
547
		.maxlen		= sizeof(unsigned long),
548
		.mode		= 0644,
549
		.proc_handler	= proc_doulongvec_minmax,
550 551 552 553 554
	},
	{
		.procname	= "ipfrag_time",
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
555
		.proc_handler	= proc_dointvec_jiffies,
556
	},
557 558 559 560 561
	{
		.procname	= "ipfrag_max_dist",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
562
		.extra1		= &dist_min,
563
	},
564 565 566
	{ }
};

567 568
/* secret interval has been deprecated */
static int ip4_frags_secret_interval_unused;
569
static struct ctl_table ip4_frags_ctl_table[] = {
570 571
	{
		.procname	= "ipfrag_secret_interval",
572
		.data		= &ip4_frags_secret_interval_unused,
573 574
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
575
		.proc_handler	= proc_dointvec_jiffies,
576 577 578 579
	},
	{ }
};

580
/* Register the per-namespace fragment sysctls under "net/ipv4".
 *
 * init_net uses the static template directly; every other namespace gets
 * its own kmemdup()'d copy so the .data pointers can reference that
 * namespace's fqdir.
 *
 * Returns 0 on success, -ENOMEM when the copy or the registration fails
 * (the copy is freed again in the latter case).
 */
static int __net_init ip4_frags_ns_ctl_register(struct net *net)
{
	struct ctl_table *table;
	struct ctl_table_header *hdr;

	table = ip4_frags_ns_ctl_table;
	if (!net_eq(net, &init_net)) {
		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
		if (!table)
			goto err_alloc;

	}
	/* Indices follow the entry order of ip4_frags_ns_ctl_table; each
	 * threshold serves as the other's bound.
	 */
	table[0].data	= &net->ipv4.fqdir->high_thresh;
	table[0].extra1	= &net->ipv4.fqdir->low_thresh;
	table[1].data	= &net->ipv4.fqdir->low_thresh;
	table[1].extra2	= &net->ipv4.fqdir->high_thresh;
	table[2].data	= &net->ipv4.fqdir->timeout;
	table[3].data	= &net->ipv4.fqdir->max_dist;

	hdr = register_net_sysctl(net, "net/ipv4", table);
	if (!hdr)
		goto err_reg;

	net->ipv4.frags_hdr = hdr;
	return 0;

err_reg:
	/* Only the duplicated table is heap-allocated. */
	if (!net_eq(net, &init_net))
		kfree(table);
err_alloc:
	return -ENOMEM;
}

613
/* Undo ip4_frags_ns_ctl_register(): unregister the namespace's sysctl
 * header and free the table it was registered with.
 *
 * NOTE(review): the table is kfree()'d unconditionally, while init_net is
 * registered with the static template -- presumably this is never reached
 * for init_net; confirm against the pernet exit rules.
 */
static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
{
	struct ctl_table *table;

	/* Fetch the table pointer before the header is torn down. */
	table = net->ipv4.frags_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.frags_hdr);
	kfree(table);
}
621

622
/* Register the global (init_net only) fragment sysctl table under
 * "net/ipv4".  The returned header is not kept or checked.
 */
static void __init ip4_frags_ctl_register(void)
{
	register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
}
626
#else
627
/* !CONFIG_SYSCTL stub: nothing to register, always succeeds. */
static int ip4_frags_ns_ctl_register(struct net *net)
{
	return 0;
}
631

632
/* !CONFIG_SYSCTL stub: nothing was registered, nothing to undo. */
static void ip4_frags_ns_ctl_unregister(struct net *net)
{
}
635

636
/* !CONFIG_SYSCTL stub: no global sysctl table to register. */
static void __init ip4_frags_ctl_register(void)
{
}
639 640
#endif

641
/* Per-namespace init: create the IPv4 fqdir, set its default limits and
 * register the namespace's sysctls.
 *
 * Returns 0 on success or the negative error from fqdir_init() or
 * ip4_frags_ns_ctl_register(); in the latter case the fqdir is torn down
 * again with fqdir_exit().
 */
static int __net_init ipv4_frags_init_net(struct net *net)
{
	int res;

	res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net);
	if (res < 0)
		return res;
	/* Fragment cache limits.
	 *
	 * The fragment memory accounting code, (tries to) account for
	 * the real memory usage, by measuring both the size of frag
	 * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
	 * and the SKB's truesize.
	 *
	 * A 64K fragment consumes 129736 bytes (44*2944)+200
	 * (1500 truesize == 2944, sizeof(struct ipq) == 200)
	 *
	 * We will commit 4MB at one time. Should we cross that limit
	 * we will prune down to 3MB, making room for approx 8 big 64K
	 * fragments 8x128k.
	 */
	net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024;
	net->ipv4.fqdir->low_thresh  = 3 * 1024 * 1024;
	/*
	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
	 * RFC791 is wrong in proposing to prolong the timer by TTL on each
	 * fragment arrival.
	 */
	net->ipv4.fqdir->timeout = IP_FRAG_TIME;

	/* Default for the ipfrag_max_dist sysctl. */
	net->ipv4.fqdir->max_dist = 64;

	res = ip4_frags_ns_ctl_register(net);
	if (res < 0)
		fqdir_exit(net->ipv4.fqdir);
	return res;
}

679
/* Per-namespace exit: unregister the namespace's sysctls first, then
 * tear down its fqdir.
 */
static void __net_exit ipv4_frags_exit_net(struct net *net)
{
	ip4_frags_ns_ctl_unregister(net);
	fqdir_exit(net->ipv4.fqdir);
}

/* Per-network-namespace hooks, registered from ipfrag_init(). */
static struct pernet_operations ip4_frags_ops = {
	.init	= ipv4_frags_init_net,	/* namespace creation */
	.exit	= ipv4_frags_exit_net,	/* namespace teardown */
};

690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722

/* rhashtable key hash: jhash2() over a struct frag_v4_compare_key taken as
 * an array of u32 words.  @len is ignored; the key size is fixed.
 */
static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
{
	const u32 nwords = sizeof(struct frag_v4_compare_key) / sizeof(u32);

	return jhash2(data, nwords, seed);
}

/* rhashtable object hash: hashes the IPv4 key stored inside the queue the
 * same way ip4_key_hashfn() hashes a bare key.  @len is ignored.
 */
static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
{
	const struct inet_frag_queue *queue = data;
	const u32 *words = (const u32 *)&queue->key.v4;
	const u32 nwords = sizeof(struct frag_v4_compare_key) / sizeof(u32);

	return jhash2(words, nwords, seed);
}

/* rhashtable compare callback.
 *
 * Returns 0 when the key stored in the queue @ptr is byte-for-byte equal
 * to the lookup key in @arg, 1 otherwise.
 */
static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
	const struct inet_frag_queue *queue = ptr;
	const struct frag_v4_compare_key *wanted = arg->key;

	return memcmp(&queue->key, wanted, sizeof(*wanted)) != 0;
}

/* rhashtable parameters for IPv4 fragment queues; copied into
 * ip4_frags.rhash_params by ipfrag_init().
 */
static const struct rhashtable_params ip4_rhash_params = {
	/* where the hash node and the key live inside inet_frag_queue */
	.head_offset = offsetof(struct inet_frag_queue, node),
	.key_offset = offsetof(struct inet_frag_queue, key),
	.key_len = sizeof(struct frag_v4_compare_key),
	/* hashing and comparison callbacks */
	.hashfn = ip4_key_hashfn,
	.obj_hashfn = ip4_obj_hashfn,
	.obj_cmpfn = ip4_obj_cmpfn,
	.automatic_shrinking = true,
};

723
/* Boot-time initialisation of IPv4 fragment reassembly.
 *
 * Fills in the ip4_frags descriptor (constructor, destructor, queue size,
 * expiry callback, cache name, hash parameters), initialises it with
 * inet_frags_init() -- panicking if that fails -- and then registers the
 * global sysctl table and the per-namespace operations.
 */
void __init ipfrag_init(void)
{
	ip4_frags.constructor = ip4_frag_init;
	ip4_frags.destructor = ip4_frag_free;
	ip4_frags.qsize = sizeof(struct ipq);
	ip4_frags.frag_expire = ip_expire;
	ip4_frags.frags_cache_name = ip_frag_cache_name;
	ip4_frags.rhash_params = ip4_rhash_params;
	if (inet_frags_init(&ip4_frags))
		panic("IP: failed to allocate ip4_frags cache\n");
	ip4_frags_ctl_register();
	register_pernet_subsys(&ip4_frags_ops);
}