// SPDX-License-Identifier: GPL-2.0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima          :       ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 *		Alexey Kuznetsov:	SMP races, threading, cleanup.
 *		Patrick McHardy :	LRU queue of frag heads for evictor.
 */

24 25
#define pr_fmt(fmt) "IPv4: " fmt

H
Herbert Xu 已提交
26
#include <linux/compiler.h>
L
Linus Torvalds 已提交
27 28 29 30 31 32 33 34 35 36 37
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
38
#include <linux/slab.h>
39 40
#include <net/route.h>
#include <net/dst.h>
L
Linus Torvalds 已提交
41 42 43 44
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
H
Herbert Xu 已提交
45
#include <net/inetpeer.h>
46
#include <net/inet_frag.h>
L
Linus Torvalds 已提交
47 48 49 50
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
51
#include <net/inet_ecn.h>
52
#include <net/l3mdev.h>
L
Linus Torvalds 已提交
53 54 55 56 57

/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
 * as well. Or notify me, at least. --ANK
 */
58
static const char ip_frag_cache_name[] = "ip4-frags";
H
Herbert Xu 已提交
59

L
Linus Torvalds 已提交
60 61
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
62 63
	struct inet_frag_queue q;

64
	u8		ecn; /* RFC3168 support */
65
	u16		max_df_size; /* largest frag with DF set seen */
H
Herbert Xu 已提交
66 67 68
	int             iif;
	unsigned int    rid;
	struct inet_peer *peer;
L
Linus Torvalds 已提交
69 70
};

71
static u8 ip4_frag_ecn(u8 tos)
72
{
73
	return 1 << (tos & INET_ECN_MASK);
74 75
}

76
static struct inet_frags ip4_frags;
L
Linus Torvalds 已提交
77

78 79 80
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev);

81

82
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
83 84
{
	struct ipq *qp = container_of(q, struct ipq, q);
85 86 87 88
	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
					       frags);
	struct net *net = container_of(ipv4, struct net, ipv4);

89
	const struct frag_v4_compare_key *key = a;
90

91 92
	q->key.v4 = *key;
	qp->ecn = 0;
93
	qp->peer = q->net->max_dist ?
94
		inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
95
		NULL;
96 97
}

98
static void ip4_frag_free(struct inet_frag_queue *q)
L
Linus Torvalds 已提交
99
{
100 101 102 103 104
	struct ipq *qp;

	qp = container_of(q, struct ipq, q);
	if (qp->peer)
		inet_putpeer(qp->peer);
L
Linus Torvalds 已提交
105 106 107 108 109
}


/* Destruction primitives. */

110
static void ipq_put(struct ipq *ipq)
L
Linus Torvalds 已提交
111
{
112
	inet_frag_put(&ipq->q);
L
Linus Torvalds 已提交
113 114 115 116 117 118 119
}

/* Kill ipq entry. It is not destroyed immediately,
 * because caller (and someone more) holds reference count.
 */
static void ipq_kill(struct ipq *ipq)
{
120
	inet_frag_kill(&ipq->q);
L
Linus Torvalds 已提交
121 122
}

123 124 125 126
static bool frag_expire_skip_icmp(u32 user)
{
	return user == IP_DEFRAG_AF_PACKET ||
	       ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
127 128 129
					 __IP_DEFRAG_CONNTRACK_IN_END) ||
	       ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN,
					 __IP_DEFRAG_CONNTRACK_BRIDGE_IN);
130 131
}

L
Linus Torvalds 已提交
132 133 134
/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 */
135
static void ip_expire(struct timer_list *t)
L
Linus Torvalds 已提交
136
{
137
	struct inet_frag_queue *frag = from_timer(frag, t, timer);
138
	const struct iphdr *iph;
139
	struct sk_buff *head = NULL;
140
	struct net *net;
141 142
	struct ipq *qp;
	int err;
143

144
	qp = container_of(frag, struct ipq, q);
145
	net = container_of(qp->q.net, struct net, ipv4.frags);
L
Linus Torvalds 已提交
146

147
	rcu_read_lock();
148
	spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
149

150
	if (qp->q.flags & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
151 152 153
		goto out;

	ipq_kill(qp);
E
Eric Dumazet 已提交
154
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
155
	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
156

157
	if (!qp->q.flags & INET_FRAG_FIRST_IN)
158
		goto out;
159

160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
	/* sk_buff::dev and sk_buff::rbnode are unionized. So we
	 * pull the head out of the tree in order to be able to
	 * deal with head->dev.
	 */
	if (qp->q.fragments) {
		head = qp->q.fragments;
		qp->q.fragments = head->next;
	} else {
		head = skb_rb_first(&qp->q.rb_fragments);
		if (!head)
			goto out;
		rb_erase(&head->rbnode, &qp->q.rb_fragments);
		memset(&head->rbnode, 0, sizeof(head->rbnode));
		barrier();
	}
	if (head == qp->q.fragments_tail)
		qp->q.fragments_tail = NULL;

	sub_frag_mem_limit(qp->q.net, head->truesize);

180 181 182
	head->dev = dev_get_by_index_rcu(net, qp->iif);
	if (!head->dev)
		goto out;
183

184

185 186 187
	/* skb has no dst, perform route lookup again */
	iph = ip_hdr(head);
	err = ip_route_input_noref(head, iph->daddr, iph->saddr,
188
					   iph->tos, head->dev);
189 190 191 192 193 194 195 196 197 198
	if (err)
		goto out;

	/* Only an end host needs to send an ICMP
	 * "Fragment Reassembly Timeout" message, per RFC792.
	 */
	if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
	    (skb_rtable(head)->rt_type != RTN_LOCAL))
		goto out;

199 200 201
	spin_unlock(&qp->q.lock);
	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
	goto out_rcu_unlock;
202

L
Linus Torvalds 已提交
203
out:
204
	spin_unlock(&qp->q.lock);
205 206
out_rcu_unlock:
	rcu_read_unlock();
207 208
	if (head)
		kfree_skb(head);
209
	ipq_put(qp);
L
Linus Torvalds 已提交
210 211
}

212 213 214
/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 */
215 216
static struct ipq *ip_find(struct net *net, struct iphdr *iph,
			   u32 user, int vif)
L
Linus Torvalds 已提交
217
{
218 219 220 221 222 223 224 225
	struct frag_v4_compare_key key = {
		.saddr = iph->saddr,
		.daddr = iph->daddr,
		.user = user,
		.vif = vif,
		.id = iph->id,
		.protocol = iph->protocol,
	};
226
	struct inet_frag_queue *q;
227

228
	q = inet_frag_find(&net->ipv4.frags, &key);
229
	if (!q)
230
		return NULL;
231

232
	return container_of(q, struct ipq, q);
L
Linus Torvalds 已提交
233 234
}

H
Herbert Xu 已提交
235
/* Is the fragment too far ahead to be part of ipq? */
236
static int ip_frag_too_far(struct ipq *qp)
H
Herbert Xu 已提交
237 238
{
	struct inet_peer *peer = qp->peer;
239
	unsigned int max = qp->q.net->max_dist;
H
Herbert Xu 已提交
240 241 242 243 244 245 246 247 248 249 250
	unsigned int start, end;

	int rc;

	if (!peer || !max)
		return 0;

	start = qp->rid;
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

251
	rc = qp->q.fragments_tail && (end - start) > max;
H
Herbert Xu 已提交
252 253

	if (rc) {
254 255 256
		struct net *net;

		net = container_of(qp->q.net, struct net, ipv4.frags);
E
Eric Dumazet 已提交
257
		__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
H
Herbert Xu 已提交
258 259 260 261 262 263 264
	}

	return rc;
}

static int ip_frag_reinit(struct ipq *qp)
{
265
	unsigned int sum_truesize = 0;
H
Herbert Xu 已提交
266

267
	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
268
		refcount_inc(&qp->q.refcnt);
H
Herbert Xu 已提交
269 270 271
		return -ETIMEDOUT;
	}

272
	sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
273
	sub_frag_mem_limit(qp->q.net, sum_truesize);
H
Herbert Xu 已提交
274

275
	qp->q.flags = 0;
276 277 278
	qp->q.len = 0;
	qp->q.meat = 0;
	qp->q.fragments = NULL;
279
	qp->q.rb_fragments = RB_ROOT;
280
	qp->q.fragments_tail = NULL;
H
Herbert Xu 已提交
281
	qp->iif = 0;
282
	qp->ecn = 0;
H
Herbert Xu 已提交
283 284 285 286

	return 0;
}

L
Linus Torvalds 已提交
287
/* Add new segment to existing queue. */
288
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
L
Linus Torvalds 已提交
289
{
290
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
291 292
	struct rb_node **rbn, *parent;
	struct sk_buff *skb1;
293
	struct net_device *dev;
294
	unsigned int fragsize;
L
Linus Torvalds 已提交
295 296
	int flags, offset;
	int ihl, end;
297
	int err = -ENOENT;
298
	u8 ecn;
L
Linus Torvalds 已提交
299

300
	if (qp->q.flags & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
301 302
		goto err;

H
Herbert Xu 已提交
303
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
304 305
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
H
Herbert Xu 已提交
306 307 308 309
		ipq_kill(qp);
		goto err;
	}

310
	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
311
	offset = ntohs(ip_hdr(skb)->frag_off);
L
Linus Torvalds 已提交
312 313 314
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
315
	ihl = ip_hdrlen(skb);
L
Linus Torvalds 已提交
316 317

	/* Determine the position of this fragment. */
318
	end = offset + skb->len - skb_network_offset(skb) - ihl;
319
	err = -EINVAL;
L
Linus Torvalds 已提交
320 321 322 323

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
324
		 * or have different end, the segment is corrupted.
L
Linus Torvalds 已提交
325
		 */
326
		if (end < qp->q.len ||
327
		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
L
Linus Torvalds 已提交
328
			goto err;
329
		qp->q.flags |= INET_FRAG_LAST_IN;
330
		qp->q.len = end;
L
Linus Torvalds 已提交
331 332 333 334 335 336
	} else {
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
337
		if (end > qp->q.len) {
L
Linus Torvalds 已提交
338
			/* Some bits beyond end -> corruption. */
339
			if (qp->q.flags & INET_FRAG_LAST_IN)
L
Linus Torvalds 已提交
340
				goto err;
341
			qp->q.len = end;
L
Linus Torvalds 已提交
342 343 344 345 346
		}
	}
	if (end == offset)
		goto err;

347
	err = -ENOMEM;
348
	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
L
Linus Torvalds 已提交
349
		goto err;
350 351 352

	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
L
Linus Torvalds 已提交
353 354
		goto err;

355 356 357 358
	/* Note : skb->rbnode and skb->dev share the same location. */
	dev = skb->dev;
	/* Makes sure compiler wont do silly aliasing games */
	barrier();
L
Linus Torvalds 已提交
359

360 361 362 363 364 365
	/* RFC5722, Section 4, amended by Errata ID : 3089
	 *                          When reassembling an IPv6 datagram, if
	 *   one or more its constituent fragments is determined to be an
	 *   overlapping fragment, the entire datagram (and any constituent
	 *   fragments) MUST be silently discarded.
	 *
366
	 * We do the same here for IPv4 (and increment an snmp counter).
L
Linus Torvalds 已提交
367 368
	 */

369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401
	/* Find out where to put this fragment.  */
	skb1 = qp->q.fragments_tail;
	if (!skb1) {
		/* This is the first fragment we've received. */
		rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
		qp->q.fragments_tail = skb;
	} else if ((skb1->ip_defrag_offset + skb1->len) < end) {
		/* This is the common/special case: skb goes to the end. */
		/* Detect and discard overlaps. */
		if (offset < (skb1->ip_defrag_offset + skb1->len))
			goto discard_qp;
		/* Insert after skb1. */
		rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
		qp->q.fragments_tail = skb;
	} else {
		/* Binary search. Note that skb can become the first fragment, but
		 * not the last (covered above). */
		rbn = &qp->q.rb_fragments.rb_node;
		do {
			parent = *rbn;
			skb1 = rb_to_skb(parent);
			if (end <= skb1->ip_defrag_offset)
				rbn = &parent->rb_left;
			else if (offset >= skb1->ip_defrag_offset + skb1->len)
				rbn = &parent->rb_right;
			else /* Found an overlap with skb1. */
				goto discard_qp;
		} while (*rbn);
		/* Here we have parent properly set, and rbn pointing to
		 * one of its NULL left/right children. Insert skb. */
		rb_link_node(&skb->rbnode, parent, rbn);
	}
	rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
L
Linus Torvalds 已提交
402

403 404 405
	if (dev)
		qp->iif = dev->ifindex;
	skb->ip_defrag_offset = offset;
L
Linus Torvalds 已提交
406

407 408
	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
409
	qp->ecn |= ecn;
410
	add_frag_mem_limit(qp->q.net, skb->truesize);
L
Linus Torvalds 已提交
411
	if (offset == 0)
412
		qp->q.flags |= INET_FRAG_FIRST_IN;
L
Linus Torvalds 已提交
413

414 415 416 417 418
	fragsize = skb->len + ihl;

	if (fragsize > qp->q.max_size)
		qp->q.max_size = fragsize;

419
	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
420 421
	    fragsize > qp->max_df_size)
		qp->max_df_size = fragsize;
422

423
	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
424 425
	    qp->q.meat == qp->q.len) {
		unsigned long orefdst = skb->_skb_refdst;
426

427
		skb->_skb_refdst = 0UL;
428
		err = ip_frag_reasm(qp, skb, dev);
429 430 431 432 433
		skb->_skb_refdst = orefdst;
		return err;
	}

	skb_dst_drop(skb);
434
	return -EINPROGRESS;
L
Linus Torvalds 已提交
435

436 437 438 439
discard_qp:
	inet_frag_kill(&qp->q);
	err = -EINVAL;
	__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
L
Linus Torvalds 已提交
440 441
err:
	kfree_skb(skb);
442
	return err;
L
Linus Torvalds 已提交
443 444 445
}

/* Build a new IP datagram from all its fragments. */
446
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
447
			 struct net_device *dev)
L
Linus Torvalds 已提交
448
{
449
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
L
Linus Torvalds 已提交
450
	struct iphdr *iph;
451 452 453
	struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
	struct sk_buff **nextp; /* To build frag_list. */
	struct rb_node *rbn;
L
Linus Torvalds 已提交
454 455
	int len;
	int ihlen;
456
	int err;
457
	u8 ecn;
L
Linus Torvalds 已提交
458 459 460

	ipq_kill(qp);

461
	ecn = ip_frag_ecn_table[qp->ecn];
462 463 464 465
	if (unlikely(ecn == 0xff)) {
		err = -EINVAL;
		goto out_fail;
	}
466
	/* Make the one we just received the head. */
467 468
	if (head != skb) {
		fp = skb_clone(skb, GFP_ATOMIC);
469 470
		if (!fp)
			goto out_nomem;
471 472
		rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
		if (qp->q.fragments_tail == skb)
473
			qp->q.fragments_tail = fp;
474 475 476 477 478
		skb_morph(skb, head);
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&qp->q.rb_fragments);
		consume_skb(head);
		head = skb;
479 480
	}

481
	WARN_ON(head->ip_defrag_offset != 0);
L
Linus Torvalds 已提交
482 483

	/* Allocate a new buffer for the datagram. */
484
	ihlen = ip_hdrlen(head);
485
	len = ihlen + qp->q.len;
L
Linus Torvalds 已提交
486

487
	err = -E2BIG;
S
Stephen Hemminger 已提交
488
	if (len > 65535)
L
Linus Torvalds 已提交
489 490 491
		goto out_oversize;

	/* Head of list must not be cloned. */
492
	if (skb_unclone(head, GFP_ATOMIC))
L
Linus Torvalds 已提交
493 494 495 496 497
		goto out_nomem;

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments. */
498
	if (skb_has_frag_list(head)) {
L
Linus Torvalds 已提交
499 500 501
		struct sk_buff *clone;
		int i, plen = 0;

502 503
		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
L
Linus Torvalds 已提交
504 505
			goto out_nomem;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
506
		skb_frag_list_init(head);
E
Eric Dumazet 已提交
507 508
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
L
Linus Torvalds 已提交
509
		clone->len = clone->data_len = head->data_len - plen;
510
		skb->truesize += clone->truesize;
L
Linus Torvalds 已提交
511 512
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
513
		add_frag_mem_limit(qp->q.net, clone->truesize);
514 515 516 517
		skb_shinfo(head)->frag_list = clone;
		nextp = &clone->next;
	} else {
		nextp = &skb_shinfo(head)->frag_list;
L
Linus Torvalds 已提交
518 519
	}

520
	skb_push(head, head->data - skb_network_header(head));
L
Linus Torvalds 已提交
521

522 523 524 525 526 527 528 529 530 531 532 533
	/* Traverse the tree in order, to build frag_list. */
	rbn = rb_next(&head->rbnode);
	rb_erase(&head->rbnode, &qp->q.rb_fragments);
	while (rbn) {
		struct rb_node *rbnext = rb_next(rbn);
		fp = rb_to_skb(rbn);
		rb_erase(rbn, &qp->q.rb_fragments);
		rbn = rbnext;
		*nextp = fp;
		nextp = &fp->next;
		fp->prev = NULL;
		memset(&fp->rbnode, 0, sizeof(fp->rbnode));
534 535
		head->data_len += fp->len;
		head->len += fp->len;
L
Linus Torvalds 已提交
536 537
		if (head->ip_summed != fp->ip_summed)
			head->ip_summed = CHECKSUM_NONE;
538
		else if (head->ip_summed == CHECKSUM_COMPLETE)
L
Linus Torvalds 已提交
539
			head->csum = csum_add(head->csum, fp->csum);
540
		head->truesize += fp->truesize;
L
Linus Torvalds 已提交
541
	}
542
	sub_frag_mem_limit(qp->q.net, head->truesize);
L
Linus Torvalds 已提交
543

544
	*nextp = NULL;
L
Linus Torvalds 已提交
545
	head->next = NULL;
546
	head->prev = NULL;
L
Linus Torvalds 已提交
547
	head->dev = dev;
548
	head->tstamp = qp->q.stamp;
549
	IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
L
Linus Torvalds 已提交
550

551
	iph = ip_hdr(head);
L
Linus Torvalds 已提交
552
	iph->tot_len = htons(len);
553
	iph->tos |= ecn;
554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569

	/* When we set IP_DF on a refragmented skb we must also force a
	 * call to ip_fragment to avoid forwarding a DF-skb of size s while
	 * original sender only sent fragments of size f (where f < s).
	 *
	 * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
	 * frag seen to avoid sending tiny DF-fragments in case skb was built
	 * from one very small df-fragment and one large non-df frag.
	 */
	if (qp->max_df_size == qp->q.max_size) {
		IPCB(head)->flags |= IPSKB_FRAG_PMTU;
		iph->frag_off = htons(IP_DF);
	} else {
		iph->frag_off = 0;
	}

570 571
	ip_send_check(iph);

E
Eric Dumazet 已提交
572
	__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
573
	qp->q.fragments = NULL;
574
	qp->q.rb_fragments = RB_ROOT;
575
	qp->q.fragments_tail = NULL;
576
	return 0;
L
Linus Torvalds 已提交
577 578

out_nomem:
579
	net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
580
	err = -ENOMEM;
L
Linus Torvalds 已提交
581 582
	goto out_fail;
out_oversize:
583
	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
L
Linus Torvalds 已提交
584
out_fail:
E
Eric Dumazet 已提交
585
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
586
	return err;
L
Linus Torvalds 已提交
587 588 589
}

/* Process an incoming IP datagram fragment. */
590
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
L
Linus Torvalds 已提交
591
{
592
	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
593
	int vif = l3mdev_master_ifindex_rcu(dev);
L
Linus Torvalds 已提交
594
	struct ipq *qp;
595

E
Eric Dumazet 已提交
596
	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
597
	skb_orphan(skb);
L
Linus Torvalds 已提交
598 599

	/* Lookup (or create) queue header */
600
	qp = ip_find(net, ip_hdr(skb), user, vif);
601
	if (qp) {
602
		int ret;
L
Linus Torvalds 已提交
603

604
		spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
605

606
		ret = ip_frag_queue(qp, skb);
L
Linus Torvalds 已提交
607

608
		spin_unlock(&qp->q.lock);
609
		ipq_put(qp);
610
		return ret;
L
Linus Torvalds 已提交
611 612
	}

E
Eric Dumazet 已提交
613
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
614
	kfree_skb(skb);
615
	return -ENOMEM;
L
Linus Torvalds 已提交
616
}
E
Eric Dumazet 已提交
617
EXPORT_SYMBOL(ip_defrag);
L
Linus Torvalds 已提交
618

619
struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
620
{
621
	struct iphdr iph;
622
	int netoff;
623 624 625 626 627
	u32 len;

	if (skb->protocol != htons(ETH_P_IP))
		return skb;

628 629 630
	netoff = skb_network_offset(skb);

	if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
631 632
		return skb;

633
	if (iph.ihl < 5 || iph.version != 4)
634
		return skb;
635 636

	len = ntohs(iph.tot_len);
637
	if (skb->len < netoff + len || len < (iph.ihl * 4))
638 639
		return skb;

640
	if (ip_is_fragment(&iph)) {
641 642
		skb = skb_share_check(skb, GFP_ATOMIC);
		if (skb) {
643
			if (!pskb_may_pull(skb, netoff + iph.ihl * 4))
644
				return skb;
645
			if (pskb_trim_rcsum(skb, netoff + len))
646 647
				return skb;
			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
648
			if (ip_defrag(net, skb, user))
649
				return NULL;
650
			skb_clear_hash(skb);
651 652 653 654 655 656
		}
	}
	return skb;
}
EXPORT_SYMBOL(ip_check_defrag);

657
#ifdef CONFIG_SYSCTL
658
static int dist_min;
659

660
static struct ctl_table ip4_frags_ns_ctl_table[] = {
661 662
	{
		.procname	= "ipfrag_high_thresh",
663
		.data		= &init_net.ipv4.frags.high_thresh,
664
		.maxlen		= sizeof(unsigned long),
665
		.mode		= 0644,
666
		.proc_handler	= proc_doulongvec_minmax,
667
		.extra1		= &init_net.ipv4.frags.low_thresh
668 669 670
	},
	{
		.procname	= "ipfrag_low_thresh",
671
		.data		= &init_net.ipv4.frags.low_thresh,
672
		.maxlen		= sizeof(unsigned long),
673
		.mode		= 0644,
674
		.proc_handler	= proc_doulongvec_minmax,
675
		.extra2		= &init_net.ipv4.frags.high_thresh
676 677 678
	},
	{
		.procname	= "ipfrag_time",
679
		.data		= &init_net.ipv4.frags.timeout,
680 681
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
682
		.proc_handler	= proc_dointvec_jiffies,
683
	},
684 685 686 687 688 689
	{
		.procname	= "ipfrag_max_dist",
		.data		= &init_net.ipv4.frags.max_dist,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
690
		.extra1		= &dist_min,
691
	},
692 693 694
	{ }
};

695 696
/* secret interval has been deprecated */
static int ip4_frags_secret_interval_unused;
697
static struct ctl_table ip4_frags_ctl_table[] = {
698 699
	{
		.procname	= "ipfrag_secret_interval",
700
		.data		= &ip4_frags_secret_interval_unused,
701 702
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
703
		.proc_handler	= proc_dointvec_jiffies,
704 705 706 707
	},
	{ }
};

708
static int __net_init ip4_frags_ns_ctl_register(struct net *net)
709
{
710
	struct ctl_table *table;
711 712
	struct ctl_table_header *hdr;

713
	table = ip4_frags_ns_ctl_table;
O
Octavian Purdila 已提交
714
	if (!net_eq(net, &init_net)) {
715
		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
716
		if (!table)
717 718
			goto err_alloc;

719
		table[0].data = &net->ipv4.frags.high_thresh;
720 721
		table[0].extra1 = &net->ipv4.frags.low_thresh;
		table[0].extra2 = &init_net.ipv4.frags.high_thresh;
722
		table[1].data = &net->ipv4.frags.low_thresh;
723
		table[1].extra2 = &net->ipv4.frags.high_thresh;
724
		table[2].data = &net->ipv4.frags.timeout;
725
		table[3].data = &net->ipv4.frags.max_dist;
726 727
	}

728
	hdr = register_net_sysctl(net, "net/ipv4", table);
729
	if (!hdr)
730 731 732 733 734 735
		goto err_reg;

	net->ipv4.frags_hdr = hdr;
	return 0;

err_reg:
O
Octavian Purdila 已提交
736
	if (!net_eq(net, &init_net))
737 738 739 740 741
		kfree(table);
err_alloc:
	return -ENOMEM;
}

742
static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
743 744 745 746 747 748
{
	struct ctl_table *table;

	table = net->ipv4.frags_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.frags_hdr);
	kfree(table);
749
}
750

751
static void __init ip4_frags_ctl_register(void)
752
{
753
	register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
754
}
755
#else
756
static int ip4_frags_ns_ctl_register(struct net *net)
757 758 759
{
	return 0;
}
760

761
static void ip4_frags_ns_ctl_unregister(struct net *net)
762 763
{
}
764

765
static void __init ip4_frags_ctl_register(void)
766 767
{
}
768 769
#endif

770
static int __net_init ipv4_frags_init_net(struct net *net)
771
{
772 773
	int res;

774 775 776 777 778 779 780 781 782 783 784 785 786
	/* Fragment cache limits.
	 *
	 * The fragment memory accounting code, (tries to) account for
	 * the real memory usage, by measuring both the size of frag
	 * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
	 * and the SKB's truesize.
	 *
	 * A 64K fragment consumes 129736 bytes (44*2944)+200
	 * (1500 truesize == 2944, sizeof(struct ipq) == 200)
	 *
	 * We will commit 4MB at one time. Should we cross that limit
	 * we will prune down to 3MB, making room for approx 8 big 64K
	 * fragments 8x128k.
787
	 */
788 789
	net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
	net->ipv4.frags.low_thresh  = 3 * 1024 * 1024;
790 791 792 793 794 795 796
	/*
	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
	 * RFC791 is wrong proposing to prolongate timer each fragment arrival
	 * by TTL.
	 */
	net->ipv4.frags.timeout = IP_FRAG_TIME;

797
	net->ipv4.frags.max_dist = 64;
798
	net->ipv4.frags.f = &ip4_frags;
799

800 801 802 803 804
	res = inet_frags_init_net(&net->ipv4.frags);
	if (res < 0)
		return res;
	res = ip4_frags_ns_ctl_register(net);
	if (res < 0)
805
		inet_frags_exit_net(&net->ipv4.frags);
806
	return res;
807 808
}

809
static void __net_exit ipv4_frags_exit_net(struct net *net)
810
{
811
	ip4_frags_ns_ctl_unregister(net);
812
	inet_frags_exit_net(&net->ipv4.frags);
813 814 815 816 817 818 819
}

static struct pernet_operations ip4_frags_ops = {
	.init = ipv4_frags_init_net,
	.exit = ipv4_frags_exit_net,
};

820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852

static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
{
	return jhash2(data,
		      sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
}

static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
{
	const struct inet_frag_queue *fq = data;

	return jhash2((const u32 *)&fq->key.v4,
		      sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
}

static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
	const struct frag_v4_compare_key *key = arg->key;
	const struct inet_frag_queue *fq = ptr;

	return !!memcmp(&fq->key, key, sizeof(*key));
}

static const struct rhashtable_params ip4_rhash_params = {
	.head_offset		= offsetof(struct inet_frag_queue, node),
	.key_offset		= offsetof(struct inet_frag_queue, key),
	.key_len		= sizeof(struct frag_v4_compare_key),
	.hashfn			= ip4_key_hashfn,
	.obj_hashfn		= ip4_obj_hashfn,
	.obj_cmpfn		= ip4_obj_cmpfn,
	.automatic_shrinking	= true,
};

853
void __init ipfrag_init(void)
L
Linus Torvalds 已提交
854
{
855
	ip4_frags.constructor = ip4_frag_init;
856 857
	ip4_frags.destructor = ip4_frag_free;
	ip4_frags.qsize = sizeof(struct ipq);
858
	ip4_frags.frag_expire = ip_expire;
859
	ip4_frags.frags_cache_name = ip_frag_cache_name;
860
	ip4_frags.rhash_params = ip4_rhash_params;
861 862
	if (inet_frags_init(&ip4_frags))
		panic("IP: failed to allocate ip4_frags cache\n");
863 864
	ip4_frags_ctl_register();
	register_pernet_subsys(&ip4_frags_ops);
L
Linus Torvalds 已提交
865
}