// SPDX-License-Identifier: GPL-2.0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima          :       ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 *		Alexey Kuznetsov:	SMP races, threading, cleanup.
 *		Patrick McHardy :	LRU queue of frag heads for evictor.
 */

24 25
#define pr_fmt(fmt) "IPv4: " fmt

H
Herbert Xu 已提交
26
#include <linux/compiler.h>
L
Linus Torvalds 已提交
27 28 29 30 31 32 33 34 35 36 37
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
38
#include <linux/slab.h>
39 40
#include <net/route.h>
#include <net/dst.h>
L
Linus Torvalds 已提交
41 42 43 44
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
H
Herbert Xu 已提交
45
#include <net/inetpeer.h>
46
#include <net/inet_frag.h>
L
Linus Torvalds 已提交
47 48 49 50
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
51
#include <net/inet_ecn.h>
52
#include <net/l3mdev.h>
L
Linus Torvalds 已提交
53 54 55 56 57

/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
 * as well. Or notify me, at least. --ANK
 */
58
static const char ip_frag_cache_name[] = "ip4-frags";
H
Herbert Xu 已提交
59

60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
	/* Regular IPv4 control block; IPCB(skb) is used on the same skbs in
	 * this file, so this member presumably has to stay first - confirm.
	 */
	struct inet_skb_parm	h;
	/* Next fragment of the same "run", NULL at the tail of the run. */
	struct sk_buff		*next_frag;
	/* Sum of the lengths of all fragments in the run; maintained on the
	 * head of the run only.
	 */
	int			frag_run_len;
};

/* View skb->cb as the fragment-run bookkeeping structure above. */
#define FRAG_CB(skb)		((struct ipfrag_skb_cb *)((skb)->cb))

static void ip4_frag_init_run(struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));

	FRAG_CB(skb)->next_frag = NULL;
	FRAG_CB(skb)->frag_run_len = skb->len;
}

/* Append skb to the last "run". */
static void ip4_frag_append_to_last_run(struct inet_frag_queue *q,
					struct sk_buff *skb)
{
	RB_CLEAR_NODE(&skb->rbnode);
	FRAG_CB(skb)->next_frag = NULL;

	FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
	FRAG_CB(q->fragments_tail)->next_frag = skb;
	q->fragments_tail = skb;
}

/* Create a new "run" with the skb. */
static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb)
{
	if (q->last_run_head)
		rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
			     &q->last_run_head->rbnode.rb_right);
	else
		rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
	rb_insert_color(&skb->rbnode, &q->rb_fragments);

	ip4_frag_init_run(skb);
	q->fragments_tail = skb;
	q->last_run_head = skb;
}

L
Linus Torvalds 已提交
111 112
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
113 114
	struct inet_frag_queue q;

115
	u8		ecn; /* RFC3168 support */
116
	u16		max_df_size; /* largest frag with DF set seen */
H
Herbert Xu 已提交
117 118 119
	int             iif;
	unsigned int    rid;
	struct inet_peer *peer;
L
Linus Torvalds 已提交
120 121
};

122
/* Map the ECN field of @tos (tos & INET_ECN_MASK) to a single-bit flag.
 * The per-fragment flags are OR-ed into ipq::ecn by ip_frag_queue() and the
 * result is used as an index into ip_frag_ecn_table[] at reassembly time.
 */
static u8 ip4_frag_ecn(u8 tos)
{
	return 1 << (tos & INET_ECN_MASK);
}

127
/* IPv4 fragment-queue descriptor; its members are filled in by ipfrag_init(). */
static struct inet_frags ip4_frags;
L
Linus Torvalds 已提交
128

129 130
/* Forward declaration: ip_frag_queue() calls this before it is defined. */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
			 struct sk_buff *prev_tail, struct net_device *dev);
131

132

133
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
134 135
{
	struct ipq *qp = container_of(q, struct ipq, q);
136 137 138 139
	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
					       frags);
	struct net *net = container_of(ipv4, struct net, ipv4);

140
	const struct frag_v4_compare_key *key = a;
141

142 143
	q->key.v4 = *key;
	qp->ecn = 0;
144
	qp->peer = q->net->max_dist ?
145
		inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
146
		NULL;
147 148
}

149
static void ip4_frag_free(struct inet_frag_queue *q)
L
Linus Torvalds 已提交
150
{
151 152 153 154 155
	struct ipq *qp;

	qp = container_of(q, struct ipq, q);
	if (qp->peer)
		inet_putpeer(qp->peer);
L
Linus Torvalds 已提交
156 157 158 159 160
}


/* Destruction primitives. */

161
static void ipq_put(struct ipq *ipq)
L
Linus Torvalds 已提交
162
{
163
	inet_frag_put(&ipq->q);
L
Linus Torvalds 已提交
164 165 166 167 168 169 170
}

/* Kill ipq entry. It is not destroyed immediately,
 * because caller (and someone more) holds reference count.
 */
static void ipq_kill(struct ipq *ipq)
{
171
	inet_frag_kill(&ipq->q);
L
Linus Torvalds 已提交
172 173
}

174 175 176 177
/* Tell whether @user is one of the defrag users for which ip_expire() only
 * sends the ICMP "time exceeded" message when the route is local:
 * AF_PACKET, the conntrack-in range and the conntrack-bridge-in range.
 */
static bool frag_expire_skip_icmp(u32 user)
{
	return user == IP_DEFRAG_AF_PACKET ||
	       ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
					 __IP_DEFRAG_CONNTRACK_IN_END) ||
	       ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN,
					 __IP_DEFRAG_CONNTRACK_BRIDGE_IN);
}

L
Linus Torvalds 已提交
183 184 185
/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 */
186
static void ip_expire(struct timer_list *t)
L
Linus Torvalds 已提交
187
{
188
	struct inet_frag_queue *frag = from_timer(frag, t, timer);
189
	const struct iphdr *iph;
190
	struct sk_buff *head = NULL;
191
	struct net *net;
192 193
	struct ipq *qp;
	int err;
194

195
	qp = container_of(frag, struct ipq, q);
196
	net = container_of(qp->q.net, struct net, ipv4.frags);
L
Linus Torvalds 已提交
197

198
	rcu_read_lock();
199
	spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
200

201
	if (qp->q.flags & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
202 203 204
		goto out;

	ipq_kill(qp);
E
Eric Dumazet 已提交
205
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
206
	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
207

208
	if (!(qp->q.flags & INET_FRAG_FIRST_IN))
209
		goto out;
210

211 212 213 214 215 216 217 218 219 220 221
	/* sk_buff::dev and sk_buff::rbnode are unionized. So we
	 * pull the head out of the tree in order to be able to
	 * deal with head->dev.
	 */
	if (qp->q.fragments) {
		head = qp->q.fragments;
		qp->q.fragments = head->next;
	} else {
		head = skb_rb_first(&qp->q.rb_fragments);
		if (!head)
			goto out;
222 223 224 225 226 227
		if (FRAG_CB(head)->next_frag)
			rb_replace_node(&head->rbnode,
					&FRAG_CB(head)->next_frag->rbnode,
					&qp->q.rb_fragments);
		else
			rb_erase(&head->rbnode, &qp->q.rb_fragments);
228 229 230 231 232 233 234 235
		memset(&head->rbnode, 0, sizeof(head->rbnode));
		barrier();
	}
	if (head == qp->q.fragments_tail)
		qp->q.fragments_tail = NULL;

	sub_frag_mem_limit(qp->q.net, head->truesize);

236 237 238
	head->dev = dev_get_by_index_rcu(net, qp->iif);
	if (!head->dev)
		goto out;
239

240

241 242 243
	/* skb has no dst, perform route lookup again */
	iph = ip_hdr(head);
	err = ip_route_input_noref(head, iph->daddr, iph->saddr,
244
					   iph->tos, head->dev);
245 246 247 248 249 250 251 252 253 254
	if (err)
		goto out;

	/* Only an end host needs to send an ICMP
	 * "Fragment Reassembly Timeout" message, per RFC792.
	 */
	if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
	    (skb_rtable(head)->rt_type != RTN_LOCAL))
		goto out;

255 256 257
	spin_unlock(&qp->q.lock);
	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
	goto out_rcu_unlock;
258

L
Linus Torvalds 已提交
259
out:
260
	spin_unlock(&qp->q.lock);
261 262
out_rcu_unlock:
	rcu_read_unlock();
263 264
	if (head)
		kfree_skb(head);
265
	ipq_put(qp);
L
Linus Torvalds 已提交
266 267
}

268 269 270
/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 */
271 272
static struct ipq *ip_find(struct net *net, struct iphdr *iph,
			   u32 user, int vif)
L
Linus Torvalds 已提交
273
{
274 275 276 277 278 279 280 281
	struct frag_v4_compare_key key = {
		.saddr = iph->saddr,
		.daddr = iph->daddr,
		.user = user,
		.vif = vif,
		.id = iph->id,
		.protocol = iph->protocol,
	};
282
	struct inet_frag_queue *q;
283

284
	q = inet_frag_find(&net->ipv4.frags, &key);
285
	if (!q)
286
		return NULL;
287

288
	return container_of(q, struct ipq, q);
L
Linus Torvalds 已提交
289 290
}

H
Herbert Xu 已提交
291
/* Is the fragment too far ahead to be part of ipq? */
292
static int ip_frag_too_far(struct ipq *qp)
H
Herbert Xu 已提交
293 294
{
	struct inet_peer *peer = qp->peer;
295
	unsigned int max = qp->q.net->max_dist;
H
Herbert Xu 已提交
296 297 298 299 300 301 302 303 304 305 306
	unsigned int start, end;

	int rc;

	if (!peer || !max)
		return 0;

	start = qp->rid;
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

307
	rc = qp->q.fragments_tail && (end - start) > max;
H
Herbert Xu 已提交
308 309

	if (rc) {
310 311 312
		struct net *net;

		net = container_of(qp->q.net, struct net, ipv4.frags);
E
Eric Dumazet 已提交
313
		__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
H
Herbert Xu 已提交
314 315 316 317 318 319 320
	}

	return rc;
}

static int ip_frag_reinit(struct ipq *qp)
{
321
	unsigned int sum_truesize = 0;
H
Herbert Xu 已提交
322

323
	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
324
		refcount_inc(&qp->q.refcnt);
H
Herbert Xu 已提交
325 326 327
		return -ETIMEDOUT;
	}

328
	sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
329
	sub_frag_mem_limit(qp->q.net, sum_truesize);
H
Herbert Xu 已提交
330

331
	qp->q.flags = 0;
332 333 334
	qp->q.len = 0;
	qp->q.meat = 0;
	qp->q.fragments = NULL;
335
	qp->q.rb_fragments = RB_ROOT;
336
	qp->q.fragments_tail = NULL;
337
	qp->q.last_run_head = NULL;
H
Herbert Xu 已提交
338
	qp->iif = 0;
339
	qp->ecn = 0;
H
Herbert Xu 已提交
340 341 342 343

	return 0;
}

L
Linus Torvalds 已提交
344
/* Add new segment to existing queue. */
345
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
L
Linus Torvalds 已提交
346
{
347
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
348
	struct rb_node **rbn, *parent;
349
	struct sk_buff *skb1, *prev_tail;
350
	struct net_device *dev;
351
	unsigned int fragsize;
L
Linus Torvalds 已提交
352 353
	int flags, offset;
	int ihl, end;
354
	int err = -ENOENT;
355
	u8 ecn;
L
Linus Torvalds 已提交
356

357
	if (qp->q.flags & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
358 359
		goto err;

H
Herbert Xu 已提交
360
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
361 362
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
H
Herbert Xu 已提交
363 364 365 366
		ipq_kill(qp);
		goto err;
	}

367
	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
368
	offset = ntohs(ip_hdr(skb)->frag_off);
L
Linus Torvalds 已提交
369 370 371
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
372
	ihl = ip_hdrlen(skb);
L
Linus Torvalds 已提交
373 374

	/* Determine the position of this fragment. */
375
	end = offset + skb->len - skb_network_offset(skb) - ihl;
376
	err = -EINVAL;
L
Linus Torvalds 已提交
377 378 379 380

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
381
		 * or have different end, the segment is corrupted.
L
Linus Torvalds 已提交
382
		 */
383
		if (end < qp->q.len ||
384
		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
L
Linus Torvalds 已提交
385
			goto err;
386
		qp->q.flags |= INET_FRAG_LAST_IN;
387
		qp->q.len = end;
L
Linus Torvalds 已提交
388 389 390 391 392 393
	} else {
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
394
		if (end > qp->q.len) {
L
Linus Torvalds 已提交
395
			/* Some bits beyond end -> corruption. */
396
			if (qp->q.flags & INET_FRAG_LAST_IN)
L
Linus Torvalds 已提交
397
				goto err;
398
			qp->q.len = end;
L
Linus Torvalds 已提交
399 400 401 402 403
		}
	}
	if (end == offset)
		goto err;

404
	err = -ENOMEM;
405
	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
L
Linus Torvalds 已提交
406
		goto err;
407 408 409

	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
L
Linus Torvalds 已提交
410 411
		goto err;

412 413 414 415
	/* Note : skb->rbnode and skb->dev share the same location. */
	dev = skb->dev;
	/* Makes sure compiler wont do silly aliasing games */
	barrier();
L
Linus Torvalds 已提交
416

417 418 419 420 421 422
	/* RFC5722, Section 4, amended by Errata ID : 3089
	 *                          When reassembling an IPv6 datagram, if
	 *   one or more its constituent fragments is determined to be an
	 *   overlapping fragment, the entire datagram (and any constituent
	 *   fragments) MUST be silently discarded.
	 *
423
	 * We do the same here for IPv4 (and increment an snmp counter).
L
Linus Torvalds 已提交
424 425
	 */

426
	/* Find out where to put this fragment.  */
427 428 429 430 431
	prev_tail = qp->q.fragments_tail;
	if (!prev_tail)
		ip4_frag_create_run(&qp->q, skb);  /* First fragment. */
	else if (prev_tail->ip_defrag_offset + prev_tail->len < end) {
		/* This is the common case: skb goes to the end. */
432
		/* Detect and discard overlaps. */
433
		if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
434
			goto discard_qp;
435 436 437 438
		if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
			ip4_frag_append_to_last_run(&qp->q, skb);
		else
			ip4_frag_create_run(&qp->q, skb);
439
	} else {
440 441 442
		/* Binary search. Note that skb can become the first fragment,
		 * but not the last (covered above).
		 */
443 444 445 446 447 448
		rbn = &qp->q.rb_fragments.rb_node;
		do {
			parent = *rbn;
			skb1 = rb_to_skb(parent);
			if (end <= skb1->ip_defrag_offset)
				rbn = &parent->rb_left;
449 450
			else if (offset >= skb1->ip_defrag_offset +
						FRAG_CB(skb1)->frag_run_len)
451 452 453 454 455
				rbn = &parent->rb_right;
			else /* Found an overlap with skb1. */
				goto discard_qp;
		} while (*rbn);
		/* Here we have parent properly set, and rbn pointing to
456 457 458
		 * one of its NULL left/right children. Insert skb.
		 */
		ip4_frag_init_run(skb);
459
		rb_link_node(&skb->rbnode, parent, rbn);
460
		rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
461
	}
L
Linus Torvalds 已提交
462

463 464 465
	if (dev)
		qp->iif = dev->ifindex;
	skb->ip_defrag_offset = offset;
L
Linus Torvalds 已提交
466

467 468
	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
469
	qp->ecn |= ecn;
470
	add_frag_mem_limit(qp->q.net, skb->truesize);
L
Linus Torvalds 已提交
471
	if (offset == 0)
472
		qp->q.flags |= INET_FRAG_FIRST_IN;
L
Linus Torvalds 已提交
473

474 475 476 477 478
	fragsize = skb->len + ihl;

	if (fragsize > qp->q.max_size)
		qp->q.max_size = fragsize;

479
	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
480 481
	    fragsize > qp->max_df_size)
		qp->max_df_size = fragsize;
482

483
	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
484 485
	    qp->q.meat == qp->q.len) {
		unsigned long orefdst = skb->_skb_refdst;
486

487
		skb->_skb_refdst = 0UL;
488
		err = ip_frag_reasm(qp, skb, prev_tail, dev);
489 490 491 492 493
		skb->_skb_refdst = orefdst;
		return err;
	}

	skb_dst_drop(skb);
494
	return -EINPROGRESS;
L
Linus Torvalds 已提交
495

496 497 498 499
discard_qp:
	inet_frag_kill(&qp->q);
	err = -EINVAL;
	__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
L
Linus Torvalds 已提交
500 501
err:
	kfree_skb(skb);
502
	return err;
L
Linus Torvalds 已提交
503 504 505
}

/* Build a new IP datagram from all its fragments. */
506
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
507
			 struct sk_buff *prev_tail, struct net_device *dev)
L
Linus Torvalds 已提交
508
{
509
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
L
Linus Torvalds 已提交
510
	struct iphdr *iph;
511 512 513
	struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
	struct sk_buff **nextp; /* To build frag_list. */
	struct rb_node *rbn;
L
Linus Torvalds 已提交
514 515
	int len;
	int ihlen;
516
	int err;
517
	u8 ecn;
L
Linus Torvalds 已提交
518 519 520

	ipq_kill(qp);

521
	ecn = ip_frag_ecn_table[qp->ecn];
522 523 524 525
	if (unlikely(ecn == 0xff)) {
		err = -EINVAL;
		goto out_fail;
	}
526
	/* Make the one we just received the head. */
527 528
	if (head != skb) {
		fp = skb_clone(skb, GFP_ATOMIC);
529 530
		if (!fp)
			goto out_nomem;
531 532 533 534 535 536
		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
		if (RB_EMPTY_NODE(&skb->rbnode))
			FRAG_CB(prev_tail)->next_frag = fp;
		else
			rb_replace_node(&skb->rbnode, &fp->rbnode,
					&qp->q.rb_fragments);
537
		if (qp->q.fragments_tail == skb)
538
			qp->q.fragments_tail = fp;
539
		skb_morph(skb, head);
540
		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
541 542 543 544
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&qp->q.rb_fragments);
		consume_skb(head);
		head = skb;
545 546
	}

547
	WARN_ON(head->ip_defrag_offset != 0);
L
Linus Torvalds 已提交
548 549

	/* Allocate a new buffer for the datagram. */
550
	ihlen = ip_hdrlen(head);
551
	len = ihlen + qp->q.len;
L
Linus Torvalds 已提交
552

553
	err = -E2BIG;
S
Stephen Hemminger 已提交
554
	if (len > 65535)
L
Linus Torvalds 已提交
555 556 557
		goto out_oversize;

	/* Head of list must not be cloned. */
558
	if (skb_unclone(head, GFP_ATOMIC))
L
Linus Torvalds 已提交
559 560 561 562 563
		goto out_nomem;

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments. */
564
	if (skb_has_frag_list(head)) {
L
Linus Torvalds 已提交
565 566 567
		struct sk_buff *clone;
		int i, plen = 0;

568 569
		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
L
Linus Torvalds 已提交
570 571
			goto out_nomem;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
572
		skb_frag_list_init(head);
E
Eric Dumazet 已提交
573 574
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
L
Linus Torvalds 已提交
575
		clone->len = clone->data_len = head->data_len - plen;
576
		head->truesize += clone->truesize;
L
Linus Torvalds 已提交
577 578
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
579
		add_frag_mem_limit(qp->q.net, clone->truesize);
580 581 582 583
		skb_shinfo(head)->frag_list = clone;
		nextp = &clone->next;
	} else {
		nextp = &skb_shinfo(head)->frag_list;
L
Linus Torvalds 已提交
584 585
	}

586
	skb_push(head, head->data - skb_network_header(head));
L
Linus Torvalds 已提交
587

588
	/* Traverse the tree in order, to build frag_list. */
589
	fp = FRAG_CB(head)->next_frag;
590 591
	rbn = rb_next(&head->rbnode);
	rb_erase(&head->rbnode, &qp->q.rb_fragments);
592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618
	while (rbn || fp) {
		/* fp points to the next sk_buff in the current run;
		 * rbn points to the next run.
		 */
		/* Go through the current run. */
		while (fp) {
			*nextp = fp;
			nextp = &fp->next;
			fp->prev = NULL;
			memset(&fp->rbnode, 0, sizeof(fp->rbnode));
			head->data_len += fp->len;
			head->len += fp->len;
			if (head->ip_summed != fp->ip_summed)
				head->ip_summed = CHECKSUM_NONE;
			else if (head->ip_summed == CHECKSUM_COMPLETE)
				head->csum = csum_add(head->csum, fp->csum);
			head->truesize += fp->truesize;
			fp = FRAG_CB(fp)->next_frag;
		}
		/* Move to the next run. */
		if (rbn) {
			struct rb_node *rbnext = rb_next(rbn);

			fp = rb_to_skb(rbn);
			rb_erase(rbn, &qp->q.rb_fragments);
			rbn = rbnext;
		}
L
Linus Torvalds 已提交
619
	}
620
	sub_frag_mem_limit(qp->q.net, head->truesize);
L
Linus Torvalds 已提交
621

622
	*nextp = NULL;
L
Linus Torvalds 已提交
623
	head->next = NULL;
624
	head->prev = NULL;
L
Linus Torvalds 已提交
625
	head->dev = dev;
626
	head->tstamp = qp->q.stamp;
627
	IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
L
Linus Torvalds 已提交
628

629
	iph = ip_hdr(head);
L
Linus Torvalds 已提交
630
	iph->tot_len = htons(len);
631
	iph->tos |= ecn;
632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647

	/* When we set IP_DF on a refragmented skb we must also force a
	 * call to ip_fragment to avoid forwarding a DF-skb of size s while
	 * original sender only sent fragments of size f (where f < s).
	 *
	 * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
	 * frag seen to avoid sending tiny DF-fragments in case skb was built
	 * from one very small df-fragment and one large non-df frag.
	 */
	if (qp->max_df_size == qp->q.max_size) {
		IPCB(head)->flags |= IPSKB_FRAG_PMTU;
		iph->frag_off = htons(IP_DF);
	} else {
		iph->frag_off = 0;
	}

648 649
	ip_send_check(iph);

E
Eric Dumazet 已提交
650
	__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
651
	qp->q.fragments = NULL;
652
	qp->q.rb_fragments = RB_ROOT;
653
	qp->q.fragments_tail = NULL;
654
	qp->q.last_run_head = NULL;
655
	return 0;
L
Linus Torvalds 已提交
656 657

out_nomem:
658
	net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
659
	err = -ENOMEM;
L
Linus Torvalds 已提交
660 661
	goto out_fail;
out_oversize:
662
	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
L
Linus Torvalds 已提交
663
out_fail:
E
Eric Dumazet 已提交
664
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
665
	return err;
L
Linus Torvalds 已提交
666 667 668
}

/* Process an incoming IP datagram fragment. */
669
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
L
Linus Torvalds 已提交
670
{
671
	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
672
	int vif = l3mdev_master_ifindex_rcu(dev);
L
Linus Torvalds 已提交
673
	struct ipq *qp;
674

E
Eric Dumazet 已提交
675
	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
676
	skb_orphan(skb);
L
Linus Torvalds 已提交
677 678

	/* Lookup (or create) queue header */
679
	qp = ip_find(net, ip_hdr(skb), user, vif);
680
	if (qp) {
681
		int ret;
L
Linus Torvalds 已提交
682

683
		spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
684

685
		ret = ip_frag_queue(qp, skb);
L
Linus Torvalds 已提交
686

687
		spin_unlock(&qp->q.lock);
688
		ipq_put(qp);
689
		return ret;
L
Linus Torvalds 已提交
690 691
	}

E
Eric Dumazet 已提交
692
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
693
	kfree_skb(skb);
694
	return -ENOMEM;
L
Linus Torvalds 已提交
695
}
E
Eric Dumazet 已提交
696
EXPORT_SYMBOL(ip_defrag);
L
Linus Torvalds 已提交
697

698
/* Defragment @skb if it is an IPv4 fragment, otherwise pass it through.
 *
 * Returns @skb unchanged when it is not IPv4, when its header cannot be
 * read or is malformed, or when it is not a fragment.  For a fragment the
 * skb is run through ip_defrag(): the (possibly reassembled) skb is
 * returned when ip_defrag() returns 0, NULL otherwise.
 *
 * A fragment that cannot be prepared for ip_defrag() (header pull or trim
 * failure) is dropped and NULL is returned.  Callers expect a defragmented
 * packet on success, so handing back a raw fragment would be wrong.
 */
struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
	struct iphdr iph;
	int netoff;
	u32 len;

	if (skb->protocol != htons(ETH_P_IP))
		return skb;

	netoff = skb_network_offset(skb);

	/* Work on a copy of the header; the skb may be non-linear. */
	if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
		return skb;

	if (iph.ihl < 5 || iph.version != 4)
		return skb;

	len = ntohs(iph.tot_len);
	if (skb->len < netoff + len || len < (iph.ihl * 4))
		return skb;

	if (ip_is_fragment(&iph)) {
		skb = skb_share_check(skb, GFP_ATOMIC);
		if (skb) {
			if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) {
				kfree_skb(skb);
				return NULL;
			}
			if (pskb_trim_rcsum(skb, netoff + len)) {
				kfree_skb(skb);
				return NULL;
			}
			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
			if (ip_defrag(net, skb, user))
				return NULL;
			skb_clear_hash(skb);
		}
	}
	return skb;
}
EXPORT_SYMBOL(ip_check_defrag);

736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757
/* Free every fragment reachable from @root: each tree node is the head of a
 * run, and the rest of the run hangs off it through next_frag.
 *
 * Returns the sum of the truesize of all freed skbs.
 */
unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
	struct rb_node *node, *following;
	unsigned int freed = 0;

	for (node = rb_first(root); node; node = following) {
		struct sk_buff *frag = rb_entry(node, struct sk_buff, rbnode);

		/* Fetch the successor before the node leaves the tree. */
		following = rb_next(node);
		rb_erase(&frag->rbnode, root);

		while (frag) {
			struct sk_buff *next_in_run = FRAG_CB(frag)->next_frag;

			freed += frag->truesize;
			kfree_skb(frag);
			frag = next_in_run;
		}
	}

	return freed;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);

758
#ifdef CONFIG_SYSCTL
759
static int dist_min;
760

761
static struct ctl_table ip4_frags_ns_ctl_table[] = {
762 763
	{
		.procname	= "ipfrag_high_thresh",
764
		.data		= &init_net.ipv4.frags.high_thresh,
765
		.maxlen		= sizeof(unsigned long),
766
		.mode		= 0644,
767
		.proc_handler	= proc_doulongvec_minmax,
768
		.extra1		= &init_net.ipv4.frags.low_thresh
769 770 771
	},
	{
		.procname	= "ipfrag_low_thresh",
772
		.data		= &init_net.ipv4.frags.low_thresh,
773
		.maxlen		= sizeof(unsigned long),
774
		.mode		= 0644,
775
		.proc_handler	= proc_doulongvec_minmax,
776
		.extra2		= &init_net.ipv4.frags.high_thresh
777 778 779
	},
	{
		.procname	= "ipfrag_time",
780
		.data		= &init_net.ipv4.frags.timeout,
781 782
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
783
		.proc_handler	= proc_dointvec_jiffies,
784
	},
785 786 787 788 789 790
	{
		.procname	= "ipfrag_max_dist",
		.data		= &init_net.ipv4.frags.max_dist,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
791
		.extra1		= &dist_min,
792
	},
793 794 795
	{ }
};

796 797
/* secret interval has been deprecated */
static int ip4_frags_secret_interval_unused;
798
static struct ctl_table ip4_frags_ctl_table[] = {
799 800
	{
		.procname	= "ipfrag_secret_interval",
801
		.data		= &ip4_frags_secret_interval_unused,
802 803
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
804
		.proc_handler	= proc_dointvec_jiffies,
805 806 807 808
	},
	{ }
};

809
/* Register the per-namespace fragment sysctls under net/ipv4 for @net.
 *
 * init_net uses the static table directly.  Every other namespace gets a
 * private copy whose data/limit pointers are redirected to that namespace's
 * own settings.
 *
 * Returns 0 on success, -ENOMEM when the copy or the registration fails.
 */
static int __net_init ip4_frags_ns_ctl_register(struct net *net)
{
	struct ctl_table *table;
	struct ctl_table_header *hdr;

	table = ip4_frags_ns_ctl_table;
	if (!net_eq(net, &init_net)) {
		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
		if (!table)
			goto err_alloc;

		/* Indices follow the order of ip4_frags_ns_ctl_table[]. */
		table[0].data = &net->ipv4.frags.high_thresh;
		table[0].extra1 = &net->ipv4.frags.low_thresh;
		/* The namespace's high_thresh is capped by init_net's. */
		table[0].extra2 = &init_net.ipv4.frags.high_thresh;
		table[1].data = &net->ipv4.frags.low_thresh;
		table[1].extra2 = &net->ipv4.frags.high_thresh;
		table[2].data = &net->ipv4.frags.timeout;
		table[3].data = &net->ipv4.frags.max_dist;
	}

	hdr = register_net_sysctl(net, "net/ipv4", table);
	if (!hdr)
		goto err_reg;

	net->ipv4.frags_hdr = hdr;
	return 0;

err_reg:
	/* Only the duplicated table is ours to free. */
	if (!net_eq(net, &init_net))
		kfree(table);
err_alloc:
	return -ENOMEM;
}

843
/* Unregister the per-namespace fragment sysctls of @net and free the table
 * that was registered for it.
 *
 * NOTE(review): for init_net the registered table is the static array, so
 * this presumably relies on the exit path never running for init_net -
 * confirm.
 */
static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
{
	struct ctl_table *table;

	/* Fetch the table pointer before the header goes away. */
	table = net->ipv4.frags_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.frags_hdr);
	kfree(table);
}
851

852
/* Register the global (init_net only) fragment sysctl table.  The result of
 * register_net_sysctl() is not checked here.
 */
static void __init ip4_frags_ctl_register(void)
{
	register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
}
856
#else
857
/* Stub used when CONFIG_SYSCTL is disabled: nothing to register. */
static int ip4_frags_ns_ctl_register(struct net *net)
{
	return 0;
}
861

862
/* Stub used when CONFIG_SYSCTL is disabled: nothing to unregister. */
static void ip4_frags_ns_ctl_unregister(struct net *net)
{
}
865

866
/* Stub used when CONFIG_SYSCTL is disabled: no global table to register. */
static void __init ip4_frags_ctl_register(void)
{
}
869 870
#endif

871
/* Per-namespace init: set the default fragment limits, initialise the
 * namespace's fragment state and register its sysctls.
 *
 * Returns 0 on success or the negative error of the failing step; the
 * fragment state is torn down again when sysctl registration fails.
 */
static int __net_init ipv4_frags_init_net(struct net *net)
{
	int res;

	/* Fragment cache limits.
	 *
	 * The fragment memory accounting code, (tries to) account for
	 * the real memory usage, by measuring both the size of frag
	 * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
	 * and the SKB's truesize.
	 *
	 * A 64K fragment consumes 129736 bytes (44*2944)+200
	 * (1500 truesize == 2944, sizeof(struct ipq) == 200)
	 *
	 * We will commit 4MB at one time. Should we cross that limit
	 * we will prune down to 3MB, making room for approx 8 big 64K
	 * fragments 8x128k.
	 */
	net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
	net->ipv4.frags.low_thresh  = 3 * 1024 * 1024;
	/*
	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
	 * RFC791 is wrong proposing to prolongate timer each fragment arrival
	 * by TTL.
	 */
	net->ipv4.frags.timeout = IP_FRAG_TIME;

	net->ipv4.frags.max_dist = 64;
	net->ipv4.frags.f = &ip4_frags;

	res = inet_frags_init_net(&net->ipv4.frags);
	if (res < 0)
		return res;
	res = ip4_frags_ns_ctl_register(net);
	if (res < 0)
		inet_frags_exit_net(&net->ipv4.frags);
	return res;
}

910
/* Per-namespace exit: undo ipv4_frags_init_net() in reverse order, first
 * the sysctls, then the fragment state.
 */
static void __net_exit ipv4_frags_exit_net(struct net *net)
{
	ip4_frags_ns_ctl_unregister(net);
	inet_frags_exit_net(&net->ipv4.frags);
}

/* Per-network-namespace init/exit hooks; registered by ipfrag_init() through
 * register_pernet_subsys().
 */
static struct pernet_operations ip4_frags_ops = {
	.init = ipv4_frags_init_net,
	.exit = ipv4_frags_exit_net,
};

921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953

/* Hash a lookup key (struct frag_v4_compare_key) for the rhashtable.
 * @len is ignored: the number of 32-bit words hashed is always derived from
 * the size of the key structure.
 */
static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
{
	const u32 key_words = sizeof(struct frag_v4_compare_key) / sizeof(u32);

	return jhash2(data, key_words, seed);
}

/* Hash a stored object (struct inet_frag_queue) for the rhashtable by
 * hashing the IPv4 key embedded in it, the same way ip4_key_hashfn() hashes
 * a bare key.  @len is ignored.
 */
static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
{
	const struct inet_frag_queue *queue = data;
	const u32 *key_start = (const u32 *)&queue->key.v4;
	const u32 key_words = sizeof(struct frag_v4_compare_key) / sizeof(u32);

	return jhash2(key_start, key_words, seed);
}

/* rhashtable compare callback: byte-compare the key stored in the queue
 * @ptr with the lookup key in @arg.  Returns 0 on a match and 1 otherwise.
 */
static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
	const struct inet_frag_queue *queue = ptr;
	const struct frag_v4_compare_key *wanted = arg->key;

	return memcmp(&queue->key, wanted, sizeof(*wanted)) != 0;
}

/* rhashtable parameters for IPv4 fragment queues, keyed by the
 * frag_v4_compare_key stored in each inet_frag_queue; copied into
 * ip4_frags.rhash_params by ipfrag_init().
 */
static const struct rhashtable_params ip4_rhash_params = {
	.head_offset		= offsetof(struct inet_frag_queue, node),
	.key_offset		= offsetof(struct inet_frag_queue, key),
	.key_len		= sizeof(struct frag_v4_compare_key),
	.hashfn			= ip4_key_hashfn,
	.obj_hashfn		= ip4_obj_hashfn,
	.obj_cmpfn		= ip4_obj_cmpfn,
	.automatic_shrinking	= true,
};

954
/* Boot-time initialisation of IPv4 defragmentation: fill in the ip4_frags
 * descriptor, initialise it (panicking on failure), then register the
 * global sysctl table and the per-namespace operations.
 */
void __init ipfrag_init(void)
{
	ip4_frags.constructor = ip4_frag_init;
	ip4_frags.destructor = ip4_frag_free;
	ip4_frags.qsize = sizeof(struct ipq);
	ip4_frags.frag_expire = ip_expire;
	ip4_frags.frags_cache_name = ip_frag_cache_name;
	ip4_frags.rhash_params = ip4_rhash_params;
	if (inet_frags_init(&ip4_frags))
		panic("IP: failed to allocate ip4_frags cache\n");
	ip4_frags_ctl_register();
	register_pernet_subsys(&ip4_frags_ops);
}