// SPDX-License-Identifier: GPL-2.0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima          :       ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 *		Alexey Kuznetsov:	SMP races, threading, cleanup.
 *		Patrick McHardy :	LRU queue of frag heads for evictor.
 */

24 25
#define pr_fmt(fmt) "IPv4: " fmt

H
Herbert Xu 已提交
26
#include <linux/compiler.h>
L
Linus Torvalds 已提交
27 28 29 30 31 32 33 34 35 36 37
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
38
#include <linux/slab.h>
39 40
#include <net/route.h>
#include <net/dst.h>
L
Linus Torvalds 已提交
41 42 43 44
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
H
Herbert Xu 已提交
45
#include <net/inetpeer.h>
46
#include <net/inet_frag.h>
L
Linus Torvalds 已提交
47 48 49 50
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
51
#include <net/inet_ecn.h>
52
#include <net/l3mdev.h>
L
Linus Torvalds 已提交
53 54 55 56 57

/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
 * as well. Or notify me, at least. --ANK
 */
58
/* Cache name handed to the frag core via ip4_frags.frags_cache_name. */
static const char ip_frag_cache_name[] = "ip4-frags";
H
Herbert Xu 已提交
59

L
Linus Torvalds 已提交
60 61
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
62 63
	struct inet_frag_queue q;

64
	u8		ecn; /* RFC3168 support */
65
	u16		max_df_size; /* largest frag with DF set seen */
H
Herbert Xu 已提交
66 67 68
	int             iif;
	unsigned int    rid;
	struct inet_peer *peer;
L
Linus Torvalds 已提交
69 70
};

71
/* Turn the ECN field of @tos into a single-bit mask (1 << ECN value).
 * ip_frag_queue() ORs these masks into ipq::ecn, and ip_frag_reasm()
 * uses the accumulated value as an index into ip_frag_ecn_table.
 */
static u8 ip4_frag_ecn(u8 tos)
{
	return 1 << (tos & INET_ECN_MASK);
}

76
/* IPv4 fragment handling descriptor; filled in by ipfrag_init(). */
static struct inet_frags ip4_frags;

/* Defined further down; needed by ip_frag_queue(). */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
			 struct sk_buff *prev_tail, struct net_device *dev);
80

81

82
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
83 84
{
	struct ipq *qp = container_of(q, struct ipq, q);
85
	struct net *net = q->fqdir->net;
86

87
	const struct frag_v4_compare_key *key = a;
88

89 90
	q->key.v4 = *key;
	qp->ecn = 0;
E
Eric Dumazet 已提交
91
	qp->peer = q->fqdir->max_dist ?
92
		inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
93
		NULL;
94 95
}

96
static void ip4_frag_free(struct inet_frag_queue *q)
L
Linus Torvalds 已提交
97
{
98 99 100 101 102
	struct ipq *qp;

	qp = container_of(q, struct ipq, q);
	if (qp->peer)
		inet_putpeer(qp->peer);
L
Linus Torvalds 已提交
103 104 105 106 107
}


/* Destruction primitives. */

108
static void ipq_put(struct ipq *ipq)
L
Linus Torvalds 已提交
109
{
110
	inet_frag_put(&ipq->q);
L
Linus Torvalds 已提交
111 112 113 114 115 116 117
}

/* Kill ipq entry. It is not destroyed immediately,
 * because caller (and someone more) holds reference count.
 */
static void ipq_kill(struct ipq *ipq)
{
118
	inet_frag_kill(&ipq->q);
L
Linus Torvalds 已提交
119 120
}

121 122 123 124
/* Return true when @user is IP_DEFRAG_AF_PACKET or lies in one of the
 * conntrack defrag-user ranges.  ip_expire() suppresses the ICMP
 * "time exceeded" reply for such users unless the route is RTN_LOCAL.
 */
static bool frag_expire_skip_icmp(u32 user)
{
	return user == IP_DEFRAG_AF_PACKET ||
	       ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
					 __IP_DEFRAG_CONNTRACK_IN_END) ||
	       ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN,
					 __IP_DEFRAG_CONNTRACK_BRIDGE_IN);
}

L
Linus Torvalds 已提交
130 131 132
/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 */
133
static void ip_expire(struct timer_list *t)
L
Linus Torvalds 已提交
134
{
135
	struct inet_frag_queue *frag = from_timer(frag, t, timer);
136
	const struct iphdr *iph;
137
	struct sk_buff *head = NULL;
138
	struct net *net;
139 140
	struct ipq *qp;
	int err;
141

142
	qp = container_of(frag, struct ipq, q);
143
	net = qp->q.fqdir->net;
L
Linus Torvalds 已提交
144

145
	rcu_read_lock();
146 147 148 149

	if (qp->q.fqdir->dead)
		goto out_rcu_unlock;

150
	spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
151

152
	if (qp->q.flags & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
153 154 155
		goto out;

	ipq_kill(qp);
E
Eric Dumazet 已提交
156
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
157
	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
158

159
	if (!(qp->q.flags & INET_FRAG_FIRST_IN))
160
		goto out;
161

162 163 164 165
	/* sk_buff::dev and sk_buff::rbnode are unionized. So we
	 * pull the head out of the tree in order to be able to
	 * deal with head->dev.
	 */
166 167 168
	head = inet_frag_pull_head(&qp->q);
	if (!head)
		goto out;
169 170 171
	head->dev = dev_get_by_index_rcu(net, qp->iif);
	if (!head->dev)
		goto out;
172

173

174 175 176
	/* skb has no dst, perform route lookup again */
	iph = ip_hdr(head);
	err = ip_route_input_noref(head, iph->daddr, iph->saddr,
177
					   iph->tos, head->dev);
178 179 180 181 182 183 184 185 186 187
	if (err)
		goto out;

	/* Only an end host needs to send an ICMP
	 * "Fragment Reassembly Timeout" message, per RFC792.
	 */
	if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
	    (skb_rtable(head)->rt_type != RTN_LOCAL))
		goto out;

188 189 190
	spin_unlock(&qp->q.lock);
	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
	goto out_rcu_unlock;
191

L
Linus Torvalds 已提交
192
out:
193
	spin_unlock(&qp->q.lock);
194 195
out_rcu_unlock:
	rcu_read_unlock();
196
	kfree_skb(head);
197
	ipq_put(qp);
L
Linus Torvalds 已提交
198 199
}

200 201 202
/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 */
203 204
static struct ipq *ip_find(struct net *net, struct iphdr *iph,
			   u32 user, int vif)
L
Linus Torvalds 已提交
205
{
206 207 208 209 210 211 212 213
	struct frag_v4_compare_key key = {
		.saddr = iph->saddr,
		.daddr = iph->daddr,
		.user = user,
		.vif = vif,
		.id = iph->id,
		.protocol = iph->protocol,
	};
214
	struct inet_frag_queue *q;
215

216
	q = inet_frag_find(net->ipv4.fqdir, &key);
217
	if (!q)
218
		return NULL;
219

220
	return container_of(q, struct ipq, q);
L
Linus Torvalds 已提交
221 222
}

H
Herbert Xu 已提交
223
/* Is the fragment too far ahead to be part of ipq? */
224
static int ip_frag_too_far(struct ipq *qp)
H
Herbert Xu 已提交
225 226
{
	struct inet_peer *peer = qp->peer;
E
Eric Dumazet 已提交
227
	unsigned int max = qp->q.fqdir->max_dist;
H
Herbert Xu 已提交
228 229 230 231 232 233 234 235 236 237 238
	unsigned int start, end;

	int rc;

	if (!peer || !max)
		return 0;

	start = qp->rid;
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

239
	rc = qp->q.fragments_tail && (end - start) > max;
H
Herbert Xu 已提交
240

241 242
	if (rc)
		__IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS);
H
Herbert Xu 已提交
243 244 245 246 247 248

	return rc;
}

static int ip_frag_reinit(struct ipq *qp)
{
249
	unsigned int sum_truesize = 0;
H
Herbert Xu 已提交
250

E
Eric Dumazet 已提交
251
	if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
252
		refcount_inc(&qp->q.refcnt);
H
Herbert Xu 已提交
253 254 255
		return -ETIMEDOUT;
	}

256
	sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
E
Eric Dumazet 已提交
257
	sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
H
Herbert Xu 已提交
258

259
	qp->q.flags = 0;
260 261
	qp->q.len = 0;
	qp->q.meat = 0;
262
	qp->q.rb_fragments = RB_ROOT;
263
	qp->q.fragments_tail = NULL;
264
	qp->q.last_run_head = NULL;
H
Herbert Xu 已提交
265
	qp->iif = 0;
266
	qp->ecn = 0;
H
Herbert Xu 已提交
267 268 269 270

	return 0;
}

L
Linus Torvalds 已提交
271
/* Add new segment to existing queue. */
272
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
L
Linus Torvalds 已提交
273
{
274
	struct net *net = qp->q.fqdir->net;
275 276
	int ihl, end, flags, offset;
	struct sk_buff *prev_tail;
277
	struct net_device *dev;
278
	unsigned int fragsize;
279
	int err = -ENOENT;
280
	u8 ecn;
L
Linus Torvalds 已提交
281

282
	if (qp->q.flags & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
283 284
		goto err;

H
Herbert Xu 已提交
285
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
286 287
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
H
Herbert Xu 已提交
288 289 290 291
		ipq_kill(qp);
		goto err;
	}

292
	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
293
	offset = ntohs(ip_hdr(skb)->frag_off);
L
Linus Torvalds 已提交
294 295 296
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
297
	ihl = ip_hdrlen(skb);
L
Linus Torvalds 已提交
298 299

	/* Determine the position of this fragment. */
300
	end = offset + skb->len - skb_network_offset(skb) - ihl;
301
	err = -EINVAL;
L
Linus Torvalds 已提交
302 303 304 305

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
306
		 * or have different end, the segment is corrupted.
L
Linus Torvalds 已提交
307
		 */
308
		if (end < qp->q.len ||
309
		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
310
			goto discard_qp;
311
		qp->q.flags |= INET_FRAG_LAST_IN;
312
		qp->q.len = end;
L
Linus Torvalds 已提交
313 314 315 316 317 318
	} else {
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
319
		if (end > qp->q.len) {
L
Linus Torvalds 已提交
320
			/* Some bits beyond end -> corruption. */
321
			if (qp->q.flags & INET_FRAG_LAST_IN)
322
				goto discard_qp;
323
			qp->q.len = end;
L
Linus Torvalds 已提交
324 325 326
		}
	}
	if (end == offset)
327
		goto discard_qp;
L
Linus Torvalds 已提交
328

329
	err = -ENOMEM;
330
	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
331
		goto discard_qp;
332 333 334

	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
335
		goto discard_qp;
L
Linus Torvalds 已提交
336

337 338 339 340
	/* Note : skb->rbnode and skb->dev share the same location. */
	dev = skb->dev;
	/* Makes sure compiler wont do silly aliasing games */
	barrier();
L
Linus Torvalds 已提交
341

342
	prev_tail = qp->q.fragments_tail;
343 344 345
	err = inet_frag_queue_insert(&qp->q, skb, offset, end);
	if (err)
		goto insert_error;
L
Linus Torvalds 已提交
346

347 348
	if (dev)
		qp->iif = dev->ifindex;
L
Linus Torvalds 已提交
349

350 351
	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
352
	qp->ecn |= ecn;
E
Eric Dumazet 已提交
353
	add_frag_mem_limit(qp->q.fqdir, skb->truesize);
L
Linus Torvalds 已提交
354
	if (offset == 0)
355
		qp->q.flags |= INET_FRAG_FIRST_IN;
L
Linus Torvalds 已提交
356

357 358 359 360 361
	fragsize = skb->len + ihl;

	if (fragsize > qp->q.max_size)
		qp->q.max_size = fragsize;

362
	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
363 364
	    fragsize > qp->max_df_size)
		qp->max_df_size = fragsize;
365

366
	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
367 368
	    qp->q.meat == qp->q.len) {
		unsigned long orefdst = skb->_skb_refdst;
369

370
		skb->_skb_refdst = 0UL;
371
		err = ip_frag_reasm(qp, skb, prev_tail, dev);
372
		skb->_skb_refdst = orefdst;
373 374
		if (err)
			inet_frag_kill(&qp->q);
375 376 377 378
		return err;
	}

	skb_dst_drop(skb);
379
	return -EINPROGRESS;
L
Linus Torvalds 已提交
380

381 382 383 384 385 386
insert_error:
	if (err == IPFRAG_DUP) {
		kfree_skb(skb);
		return -EINVAL;
	}
	err = -EINVAL;
387
	__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
388 389
discard_qp:
	inet_frag_kill(&qp->q);
390
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
391 392
err:
	kfree_skb(skb);
393
	return err;
L
Linus Torvalds 已提交
394 395
}

396 397 398 399 400
static bool ip_frag_coalesce_ok(const struct ipq *qp)
{
	return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER;
}

L
Linus Torvalds 已提交
401
/* Build a new IP datagram from all its fragments. */
402
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
403
			 struct sk_buff *prev_tail, struct net_device *dev)
L
Linus Torvalds 已提交
404
{
405
	struct net *net = qp->q.fqdir->net;
L
Linus Torvalds 已提交
406
	struct iphdr *iph;
407 408
	void *reasm_data;
	int len, err;
409
	u8 ecn;
L
Linus Torvalds 已提交
410 411 412

	ipq_kill(qp);

413
	ecn = ip_frag_ecn_table[qp->ecn];
414 415 416 417
	if (unlikely(ecn == 0xff)) {
		err = -EINVAL;
		goto out_fail;
	}
418

419 420 421 422
	/* Make the one we just received the head. */
	reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail);
	if (!reasm_data)
		goto out_nomem;
L
Linus Torvalds 已提交
423

424
	len = ip_hdrlen(skb) + qp->q.len;
425
	err = -E2BIG;
S
Stephen Hemminger 已提交
426
	if (len > 65535)
L
Linus Torvalds 已提交
427 428
		goto out_oversize;

429 430
	inet_frag_reasm_finish(&qp->q, skb, reasm_data,
			       ip_frag_coalesce_ok(qp));
L
Linus Torvalds 已提交
431

432 433
	skb->dev = dev;
	IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
L
Linus Torvalds 已提交
434

435
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
436
	iph->tot_len = htons(len);
437
	iph->tos |= ecn;
438 439 440 441 442 443 444 445 446 447

	/* When we set IP_DF on a refragmented skb we must also force a
	 * call to ip_fragment to avoid forwarding a DF-skb of size s while
	 * original sender only sent fragments of size f (where f < s).
	 *
	 * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
	 * frag seen to avoid sending tiny DF-fragments in case skb was built
	 * from one very small df-fragment and one large non-df frag.
	 */
	if (qp->max_df_size == qp->q.max_size) {
448
		IPCB(skb)->flags |= IPSKB_FRAG_PMTU;
449 450 451 452 453
		iph->frag_off = htons(IP_DF);
	} else {
		iph->frag_off = 0;
	}

454 455
	ip_send_check(iph);

E
Eric Dumazet 已提交
456
	__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
457
	qp->q.rb_fragments = RB_ROOT;
458
	qp->q.fragments_tail = NULL;
459
	qp->q.last_run_head = NULL;
460
	return 0;
L
Linus Torvalds 已提交
461 462

out_nomem:
463
	net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
464
	err = -ENOMEM;
L
Linus Torvalds 已提交
465 466
	goto out_fail;
out_oversize:
467
	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
L
Linus Torvalds 已提交
468
out_fail:
E
Eric Dumazet 已提交
469
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
470
	return err;
L
Linus Torvalds 已提交
471 472 473
}

/* Process an incoming IP datagram fragment. */
474
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
L
Linus Torvalds 已提交
475
{
476
	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
477
	int vif = l3mdev_master_ifindex_rcu(dev);
L
Linus Torvalds 已提交
478
	struct ipq *qp;
479

E
Eric Dumazet 已提交
480
	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
481
	skb_orphan(skb);
L
Linus Torvalds 已提交
482 483

	/* Lookup (or create) queue header */
484
	qp = ip_find(net, ip_hdr(skb), user, vif);
485
	if (qp) {
486
		int ret;
L
Linus Torvalds 已提交
487

488
		spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
489

490
		ret = ip_frag_queue(qp, skb);
L
Linus Torvalds 已提交
491

492
		spin_unlock(&qp->q.lock);
493
		ipq_put(qp);
494
		return ret;
L
Linus Torvalds 已提交
495 496
	}

E
Eric Dumazet 已提交
497
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
498
	kfree_skb(skb);
499
	return -ENOMEM;
L
Linus Torvalds 已提交
500
}
E
Eric Dumazet 已提交
501
EXPORT_SYMBOL(ip_defrag);
L
Linus Torvalds 已提交
502

503
/* Defragment @skb if it carries an IPv4 fragment.
 *
 * Returns @skb untouched when it is not ETH_P_IP, when the IP header
 * cannot be copied or fails the version/length sanity checks, or when
 * it is not a fragment.  For a fragment, returns NULL when the skb
 * could not be unshared, was freed on a pull/trim failure, or when
 * ip_defrag() returned non-zero; returns the skb (with its hash
 * cleared) when ip_defrag() returned 0.
 */
struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
	struct iphdr iph;
	int netoff;
	u32 len;

	if (skb->protocol != htons(ETH_P_IP))
		return skb;

	netoff = skb_network_offset(skb);

	/* Work on a copy of the header; it may not be in the linear area. */
	if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
		return skb;

	if (iph.ihl < 5 || iph.version != 4)
		return skb;

	len = ntohs(iph.tot_len);
	if (skb->len < netoff + len || len < (iph.ihl * 4))
		return skb;

	if (ip_is_fragment(&iph)) {
		skb = skb_share_check(skb, GFP_ATOMIC);
		if (skb) {
			if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) {
				kfree_skb(skb);
				return NULL;
			}
			if (pskb_trim_rcsum(skb, netoff + len)) {
				kfree_skb(skb);
				return NULL;
			}
			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
			if (ip_defrag(net, skb, user))
				return NULL;
			skb_clear_hash(skb);
		}
	}
	return skb;
}
EXPORT_SYMBOL(ip_check_defrag);

545
#ifdef CONFIG_SYSCTL
546
static int dist_min;
547

548
static struct ctl_table ip4_frags_ns_ctl_table[] = {
549 550
	{
		.procname	= "ipfrag_high_thresh",
551
		.maxlen		= sizeof(unsigned long),
552
		.mode		= 0644,
553
		.proc_handler	= proc_doulongvec_minmax,
554 555 556
	},
	{
		.procname	= "ipfrag_low_thresh",
557
		.maxlen		= sizeof(unsigned long),
558
		.mode		= 0644,
559
		.proc_handler	= proc_doulongvec_minmax,
560 561 562 563 564
	},
	{
		.procname	= "ipfrag_time",
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
565
		.proc_handler	= proc_dointvec_jiffies,
566
	},
567 568 569 570 571
	{
		.procname	= "ipfrag_max_dist",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
572
		.extra1		= &dist_min,
573
	},
574 575 576
	{ }
};

577 578
/* secret interval has been deprecated */
static int ip4_frags_secret_interval_unused;
579
static struct ctl_table ip4_frags_ctl_table[] = {
580 581
	{
		.procname	= "ipfrag_secret_interval",
582
		.data		= &ip4_frags_secret_interval_unused,
583 584
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
585
		.proc_handler	= proc_dointvec_jiffies,
586 587 588 589
	},
	{ }
};

590
/* Register the per-namespace fragment sysctls under "net/ipv4".
 *
 * init_net uses the static template directly; every other namespace
 * gets its own kmemdup()'ed copy.  Returns 0 on success or -ENOMEM if
 * the copy or the registration fails.
 */
static int __net_init ip4_frags_ns_ctl_register(struct net *net)
{
	struct ctl_table *table;
	struct ctl_table_header *hdr;

	table = ip4_frags_ns_ctl_table;
	if (!net_eq(net, &init_net)) {
		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
		if (!table)
			goto err_alloc;

	}
	/* Indices follow the entry order of ip4_frags_ns_ctl_table. */
	table[0].data	= &net->ipv4.fqdir->high_thresh;
	table[0].extra1	= &net->ipv4.fqdir->low_thresh;
	table[1].data	= &net->ipv4.fqdir->low_thresh;
	table[1].extra2	= &net->ipv4.fqdir->high_thresh;
	table[2].data	= &net->ipv4.fqdir->timeout;
	table[3].data	= &net->ipv4.fqdir->max_dist;

	hdr = register_net_sysctl(net, "net/ipv4", table);
	if (!hdr)
		goto err_reg;

	net->ipv4.frags_hdr = hdr;
	return 0;

err_reg:
	/* Only a duplicated table may be freed, never the static one. */
	if (!net_eq(net, &init_net))
		kfree(table);
err_alloc:
	return -ENOMEM;
}

623
/* Unregister the per-namespace fragment sysctls and release the table
 * that ip4_frags_ns_ctl_register() set up for @net.
 */
static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
{
	struct ctl_table *table;

	table = net->ipv4.frags_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.frags_hdr);
	/* init_net registered the static ip4_frags_ns_ctl_table; only the
	 * kmemdup()'ed copies of other namespaces may be freed.  This
	 * mirrors the error path of ip4_frags_ns_ctl_register().
	 */
	if (!net_eq(net, &init_net))
		kfree(table);
}
631

632
/* Register the global (init_net only) sysctl table under "net/ipv4".
 * The return value of register_net_sysctl() is not checked.
 */
static void __init ip4_frags_ctl_register(void)
{
	register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
}
636
#else
637
/* !CONFIG_SYSCTL stub: nothing to register, always succeeds. */
static int ip4_frags_ns_ctl_register(struct net *net)
{
	return 0;
}
641

642
/* !CONFIG_SYSCTL stub: nothing to unregister. */
static void ip4_frags_ns_ctl_unregister(struct net *net)
{
}
645

646
/* !CONFIG_SYSCTL stub: no global sysctl table to register. */
static void __init ip4_frags_ctl_register(void)
{
}
649 650
#endif

651
/* Per-namespace init: create the fqdir, set the default limits and
 * register the namespace sysctls.  Returns 0 or a negative error; the
 * fqdir is torn down again if sysctl registration fails.
 */
static int __net_init ipv4_frags_init_net(struct net *net)
{
	int res;

	res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net);
	if (res < 0)
		return res;
	/* Fragment cache limits.
	 *
	 * The fragment memory accounting code, (tries to) account for
	 * the real memory usage, by measuring both the size of frag
	 * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
	 * and the SKB's truesize.
	 *
	 * A 64K fragment consumes 129736 bytes (44*2944)+200
	 * (1500 truesize == 2944, sizeof(struct ipq) == 200)
	 *
	 * We will commit 4MB at one time. Should we cross that limit
	 * we will prune down to 3MB, making room for approx 8 big 64K
	 * fragments 8x128k.
	 */
	net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024;
	net->ipv4.fqdir->low_thresh  = 3 * 1024 * 1024;
	/*
	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
	 * RFC791 is wrong proposing to prolongate timer each fragment arrival
	 * by TTL.
	 */
	net->ipv4.fqdir->timeout = IP_FRAG_TIME;

	net->ipv4.fqdir->max_dist = 64;

	res = ip4_frags_ns_ctl_register(net);
	if (res < 0)
		fqdir_exit(net->ipv4.fqdir);
	return res;
}

689 690 691 692 693
/* pernet .pre_exit hook: forward to fqdir_pre_exit() for this namespace. */
static void __net_exit ipv4_frags_pre_exit_net(struct net *net)
{
	struct fqdir *fqdir = net->ipv4.fqdir;

	fqdir_pre_exit(fqdir);
}

694
/* pernet .exit hook: unregister the namespace sysctls, then release
 * the namespace's fqdir.
 */
static void __net_exit ipv4_frags_exit_net(struct net *net)
{
	ip4_frags_ns_ctl_unregister(net);
	fqdir_exit(net->ipv4.fqdir);
}

static struct pernet_operations ip4_frags_ops = {
701 702 703
	.init		= ipv4_frags_init_net,
	.pre_exit	= ipv4_frags_pre_exit_net,
	.exit		= ipv4_frags_exit_net,
704 705
};

706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738

/* rhashtable .hashfn: hash a lookup key.  @data points at a
 * struct frag_v4_compare_key; @len is ignored and the full key size
 * (in 32-bit words) is always hashed.
 */
static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
{
	const u32 nwords = sizeof(struct frag_v4_compare_key) / sizeof(u32);

	return jhash2(data, nwords, seed);
}

/* rhashtable .obj_hashfn: hash the IPv4 key stored inside a queue.
 * @data points at a struct inet_frag_queue; @len is ignored.
 */
static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
{
	const struct inet_frag_queue *queue = data;
	const u32 *words = (const u32 *)&queue->key.v4;
	const u32 nwords = sizeof(struct frag_v4_compare_key) / sizeof(u32);

	return jhash2(words, nwords, seed);
}

/* rhashtable .obj_cmpfn: byte-compare the lookup key in @arg with the
 * key of the queue at @ptr.  Returns 0 on a match and 1 otherwise.
 */
static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
	const struct inet_frag_queue *queue = ptr;
	const struct frag_v4_compare_key *wanted = arg->key;

	if (memcmp(&queue->key, wanted, sizeof(*wanted)))
		return 1;

	return 0;
}

/* Hash table parameters for IPv4 fragment queues; copied into
 * ip4_frags.rhash_params by ipfrag_init().
 */
static const struct rhashtable_params ip4_rhash_params = {
	.head_offset		= offsetof(struct inet_frag_queue, node),
	.key_offset		= offsetof(struct inet_frag_queue, key),
	/* The key is the IPv4 compare key. */
	.key_len		= sizeof(struct frag_v4_compare_key),
	.hashfn			= ip4_key_hashfn,
	.obj_hashfn		= ip4_obj_hashfn,
	.obj_cmpfn		= ip4_obj_cmpfn,
	.automatic_shrinking	= true,
};

739
/* Boot-time setup of IPv4 reassembly: fill in the ip4_frags descriptor,
 * initialise it (panicking on failure), then register the global
 * sysctl table and the per-namespace operations.
 */
void __init ipfrag_init(void)
{
	ip4_frags.constructor = ip4_frag_init;
	ip4_frags.destructor = ip4_frag_free;
	ip4_frags.qsize = sizeof(struct ipq);
	ip4_frags.frag_expire = ip_expire;
	ip4_frags.frags_cache_name = ip_frag_cache_name;
	ip4_frags.rhash_params = ip4_rhash_params;
	if (inet_frags_init(&ip4_frags))
		panic("IP: failed to allocate ip4_frags cache\n");
	ip4_frags_ctl_register();
	register_pernet_subsys(&ip4_frags_ops);
}