ip_fragment.c 20.6 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima          :       ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 *		Alexey Kuznetsov:	SMP races, threading, cleanup.
 *		Patrick McHardy :	LRU queue of frag heads for evictor.
 */

H
Herbert Xu 已提交
23
#include <linux/compiler.h>
L
Linus Torvalds 已提交
24 25 26 27 28 29 30 31 32 33 34
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
35
#include <linux/slab.h>
36 37
#include <net/route.h>
#include <net/dst.h>
L
Linus Torvalds 已提交
38 39 40 41
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
H
Herbert Xu 已提交
42
#include <net/inetpeer.h>
43
#include <net/inet_frag.h>
L
Linus Torvalds 已提交
44 45 46 47
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
48
#include <net/inet_ecn.h>
L
Linus Torvalds 已提交
49 50 51 52 53 54

/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
 * as well. Or notify me, at least. --ANK
 */

55
/* Largest advance of the per-peer fragment counter tolerated between two
 * fragments of the same queue (checked in ip_frag_too_far()).  A value of 0
 * disables the check and the inet_peer lookup done in ip4_frag_init().
 */
static int sysctl_ipfrag_max_dist __read_mostly = 64;
H
Herbert Xu 已提交
56

L
Linus Torvalds 已提交
57 58 59 60 61 62
/* Per-fragment private data laid over skb->cb; reached through FRAG_CB(). */
struct ipfrag_skb_cb
{
	/* Comes first; presumably so IPCB(skb) keeps working on queued
	 * fragments -- TODO confirm against the IPCB() definition.
	 */
	struct inet_skb_parm	h;
	/* Byte offset of this fragment's payload within the datagram. */
	int			offset;
};

63
/* View the control buffer of @skb as a struct ipfrag_skb_cb. */
#define FRAG_CB(skb)	((struct ipfrag_skb_cb *)((skb)->cb))
L
Linus Torvalds 已提交
64 65 66

/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
67 68
	struct inet_frag_queue q;

L
Linus Torvalds 已提交
69
	u32		user;
70 71 72
	__be32		saddr;
	__be32		daddr;
	__be16		id;
L
Linus Torvalds 已提交
73
	u8		protocol;
74
	u8		ecn; /* RFC3168 support */
H
Herbert Xu 已提交
75 76 77
	int             iif;
	unsigned int    rid;
	struct inet_peer *peer;
L
Linus Torvalds 已提交
78 79
};

80 81 82 83
/* RFC 3168 support :
 * We want to check ECN values of all fragments, to detect invalid combinations.
 * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
 */
84 85 86 87
/* One bit per possible value of the 2-bit ECN field, i.e. the flag that
 * ip4_frag_ecn() yields for that value.
 */
#define	IPFRAG_ECN_NOT_ECT	(1 << 0) /* one frag had ECN_NOT_ECT */
#define	IPFRAG_ECN_ECT_1	(1 << 1) /* one frag had ECN_ECT_1 */
#define	IPFRAG_ECN_ECT_0	(1 << 2) /* one frag had ECN_ECT_0 */
#define	IPFRAG_ECN_CE		(1 << 3) /* one frag had ECN_CE */
88 89 90

/* Turn the ECN field of an IPv4 TOS byte into a single-bit flag: the value
 * (tos & INET_ECN_MASK) selects which bit is set.  The result is meant to be
 * ORed into ipq->ecn.
 */
static inline u8 ip4_frag_ecn(u8 tos)
{
	return 1 << (tos & INET_ECN_MASK);
}

94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
/* RFC 3168 5.3 verdict, indexed by the OR of the IPFRAG_ECN_* flags seen on
 * all fragments of a datagram:
 *   0xff        - the frame must be dropped
 *   INET_ECN_CE - value to OR into the final iph->tos field
 *   0           - nothing to add (every slot not listed below)
 */
static const u8 ip4_frag_ecn_table[16] = {
	/* invalid combinations (NOT_ECT mixed with anything else) : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,

	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,
};

114
/* IPv4 fragment-queue descriptor: callbacks, hash seed (rnd) and lock shared
 * by all namespaces; its fields are filled in by ipfrag_init().
 */
static struct inet_frags ip4_frags;
L
Linus Torvalds 已提交
115

116
int ip_frag_nqueues(struct net *net)
117
{
118
	return net->ipv4.frags.nqueues;
119
}
L
Linus Torvalds 已提交
120

121
int ip_frag_mem(struct net *net)
122
{
123
	return atomic_read(&net->ipv4.frags.mem);
124
}
L
Linus Torvalds 已提交
125

126 127 128
/* Forward declaration: defined further down, called from ip_frag_queue()
 * once the first and last fragments are in and no hole is left.
 */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev);

129 130 131 132 133
/* Lookup/creation argument handed to inet_frag_find() by ip_find() and
 * consumed by ip4_frag_match() and ip4_frag_init().
 */
struct ip4_create_arg {
	struct iphdr *iph;	/* IP header of the fragment being looked up */
	u32 user;		/* defrag user id of the caller */
};

134
/* Hash the (id, protocol, saddr, daddr) tuple with the ip4_frags.rnd seed
 * and reduce it to a bucket index in [0, INETFRAGS_HASHSZ).
 * The mask assumes INETFRAGS_HASHSZ is a power of two -- TODO confirm.
 */
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
	return jhash_3words((__force u32)id << 16 | prot,
			    (__force u32)saddr, (__force u32)daddr,
			    ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
}

141
static unsigned int ip4_hashfn(struct inet_frag_queue *q)
L
Linus Torvalds 已提交
142
{
143
	struct ipq *ipq;
L
Linus Torvalds 已提交
144

145 146
	ipq = container_of(q, struct ipq, q);
	return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
L
Linus Torvalds 已提交
147 148
}

149 150 151 152 153 154
static int ip4_frag_match(struct inet_frag_queue *q, void *a)
{
	struct ipq *qp;
	struct ip4_create_arg *arg = a;

	qp = container_of(q, struct ipq, q);
E
Eric Dumazet 已提交
155
	return	qp->id == arg->iph->id &&
156 157 158
			qp->saddr == arg->iph->saddr &&
			qp->daddr == arg->iph->daddr &&
			qp->protocol == arg->iph->protocol &&
E
Eric Dumazet 已提交
159
			qp->user == arg->user;
160 161
}

L
Linus Torvalds 已提交
162
/* Memory Tracking Functions. */
E
Eric Dumazet 已提交
163
static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
L
Linus Torvalds 已提交
164
{
165
	atomic_sub(skb->truesize, &nf->mem);
L
Linus Torvalds 已提交
166 167 168
	kfree_skb(skb);
}

169 170 171 172 173 174 175
static void ip4_frag_init(struct inet_frag_queue *q, void *a)
{
	struct ipq *qp = container_of(q, struct ipq, q);
	struct ip4_create_arg *arg = a;

	qp->protocol = arg->iph->protocol;
	qp->id = arg->iph->id;
176
	qp->ecn = ip4_frag_ecn(arg->iph->tos);
177 178 179 180
	qp->saddr = arg->iph->saddr;
	qp->daddr = arg->iph->daddr;
	qp->user = arg->user;
	qp->peer = sysctl_ipfrag_max_dist ?
181
		inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
182 183
}

184
static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
L
Linus Torvalds 已提交
185
{
186 187 188 189 190
	struct ipq *qp;

	qp = container_of(q, struct ipq, q);
	if (qp->peer)
		inet_putpeer(qp->peer);
L
Linus Torvalds 已提交
191 192 193 194 195
}


/* Destruction primitives. */

196
static __inline__ void ipq_put(struct ipq *ipq)
L
Linus Torvalds 已提交
197
{
P
Pavel Emelyanov 已提交
198
	inet_frag_put(&ipq->q, &ip4_frags);
L
Linus Torvalds 已提交
199 200 201 202 203 204 205
}

/* Kill ipq entry. It is not destroyed immediately,
 * because caller (and someone more) holds reference count.
 */
static void ipq_kill(struct ipq *ipq)
{
206
	inet_frag_kill(&ipq->q, &ip4_frags);
L
Linus Torvalds 已提交
207 208
}

209
/* Memory limiting on fragments.  Evictor trashes the oldest
L
Linus Torvalds 已提交
210 211
 * fragment queue until we are back under the threshold.
 */
212
static void ip_evictor(struct net *net)
L
Linus Torvalds 已提交
213
{
214 215
	int evicted;

216
	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags);
217
	if (evicted)
218
		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
L
Linus Torvalds 已提交
219 220 221 222 223 224 225
}

/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 */
static void ip_expire(unsigned long arg)
{
226
	struct ipq *qp;
227
	struct net *net;
228 229

	qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
230
	net = container_of(qp->q.net, struct net, ipv4.frags);
L
Linus Torvalds 已提交
231

232
	spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
233

234
	if (qp->q.last_in & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
235 236 237 238
		goto out;

	ipq_kill(qp);

239 240
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
241

242
	if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
243
		struct sk_buff *head = qp->q.fragments;
244 245
		const struct iphdr *iph;
		int err;
246

E
Eric Dumazet 已提交
247 248
		rcu_read_lock();
		head->dev = dev_get_by_index_rcu(net, qp->iif);
249 250 251
		if (!head->dev)
			goto out_rcu_unlock;

252 253 254 255 256 257 258 259
		/* skb dst is stale, drop it, and perform route lookup again */
		skb_dst_drop(head);
		iph = ip_hdr(head);
		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
					   iph->tos, head->dev);
		if (err)
			goto out_rcu_unlock;

260
		/*
261 262
		 * Only an end host needs to send an ICMP
		 * "Fragment Reassembly Timeout" message, per RFC792.
263
		 */
264 265 266
		if (qp->user == IP_DEFRAG_AF_PACKET ||
		    (qp->user == IP_DEFRAG_CONNTRACK_IN &&
		     skb_rtable(head)->rt_type != RTN_LOCAL))
267 268
			goto out_rcu_unlock;

269 270 271 272

		/* Send an ICMP "Fragment Reassembly Timeout" message. */
		icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
out_rcu_unlock:
273 274
		rcu_read_unlock();
	}
L
Linus Torvalds 已提交
275
out:
276
	spin_unlock(&qp->q.lock);
277
	ipq_put(qp);
L
Linus Torvalds 已提交
278 279
}

280 281 282
/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 */
283
static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
L
Linus Torvalds 已提交
284
{
285 286
	struct inet_frag_queue *q;
	struct ip4_create_arg arg;
287
	unsigned int hash;
L
Linus Torvalds 已提交
288

289 290
	arg.iph = iph;
	arg.user = user;
291 292

	read_lock(&ip4_frags.lock);
293
	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
L
Linus Torvalds 已提交
294

295
	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
296 297
	if (q == NULL)
		goto out_nomem;
L
Linus Torvalds 已提交
298

299
	return container_of(q, struct ipq, q);
L
Linus Torvalds 已提交
300 301

out_nomem:
302
	LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
L
Linus Torvalds 已提交
303 304 305
	return NULL;
}

H
Herbert Xu 已提交
306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
/* Is the fragment too far ahead to be part of ipq? */
static inline int ip_frag_too_far(struct ipq *qp)
{
	struct inet_peer *peer = qp->peer;
	unsigned int max = sysctl_ipfrag_max_dist;
	unsigned int start, end;

	int rc;

	if (!peer || !max)
		return 0;

	start = qp->rid;
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

322
	rc = qp->q.fragments && (end - start) > max;
H
Herbert Xu 已提交
323 324

	if (rc) {
325 326 327 328
		struct net *net;

		net = container_of(qp->q.net, struct net, ipv4.frags);
		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
H
Herbert Xu 已提交
329 330 331 332 333 334 335 336 337
	}

	return rc;
}

static int ip_frag_reinit(struct ipq *qp)
{
	struct sk_buff *fp;

338
	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
339
		atomic_inc(&qp->q.refcnt);
H
Herbert Xu 已提交
340 341 342
		return -ETIMEDOUT;
	}

343
	fp = qp->q.fragments;
H
Herbert Xu 已提交
344 345
	do {
		struct sk_buff *xp = fp->next;
E
Eric Dumazet 已提交
346
		frag_kfree_skb(qp->q.net, fp);
H
Herbert Xu 已提交
347 348 349
		fp = xp;
	} while (fp);

350 351 352 353
	qp->q.last_in = 0;
	qp->q.len = 0;
	qp->q.meat = 0;
	qp->q.fragments = NULL;
354
	qp->q.fragments_tail = NULL;
H
Herbert Xu 已提交
355
	qp->iif = 0;
356
	qp->ecn = 0;
H
Herbert Xu 已提交
357 358 359 360

	return 0;
}

L
Linus Torvalds 已提交
361
/* Add new segment to existing queue. */
362
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
L
Linus Torvalds 已提交
363 364
{
	struct sk_buff *prev, *next;
365
	struct net_device *dev;
L
Linus Torvalds 已提交
366 367
	int flags, offset;
	int ihl, end;
368
	int err = -ENOENT;
369
	u8 ecn;
L
Linus Torvalds 已提交
370

371
	if (qp->q.last_in & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
372 373
		goto err;

H
Herbert Xu 已提交
374
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
375 376
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
H
Herbert Xu 已提交
377 378 379 380
		ipq_kill(qp);
		goto err;
	}

381
	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
382
	offset = ntohs(ip_hdr(skb)->frag_off);
L
Linus Torvalds 已提交
383 384 385
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
386
	ihl = ip_hdrlen(skb);
L
Linus Torvalds 已提交
387 388

	/* Determine the position of this fragment. */
389
	end = offset + skb->len - ihl;
390
	err = -EINVAL;
L
Linus Torvalds 已提交
391 392 393 394 395 396

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
		 * or have different end, the segment is corrrupted.
		 */
397
		if (end < qp->q.len ||
398
		    ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
L
Linus Torvalds 已提交
399
			goto err;
400
		qp->q.last_in |= INET_FRAG_LAST_IN;
401
		qp->q.len = end;
L
Linus Torvalds 已提交
402 403 404 405 406 407
	} else {
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
408
		if (end > qp->q.len) {
L
Linus Torvalds 已提交
409
			/* Some bits beyond end -> corruption. */
410
			if (qp->q.last_in & INET_FRAG_LAST_IN)
L
Linus Torvalds 已提交
411
				goto err;
412
			qp->q.len = end;
L
Linus Torvalds 已提交
413 414 415 416 417
		}
	}
	if (end == offset)
		goto err;

418
	err = -ENOMEM;
L
Linus Torvalds 已提交
419 420
	if (pskb_pull(skb, ihl) == NULL)
		goto err;
421 422 423

	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
L
Linus Torvalds 已提交
424 425 426 427 428 429
		goto err;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 */
430 431 432 433 434
	prev = qp->q.fragments_tail;
	if (!prev || FRAG_CB(prev)->offset < offset) {
		next = NULL;
		goto found;
	}
L
Linus Torvalds 已提交
435
	prev = NULL;
436
	for (next = qp->q.fragments; next != NULL; next = next->next) {
L
Linus Torvalds 已提交
437 438 439 440 441
		if (FRAG_CB(next)->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}

442
found:
L
Linus Torvalds 已提交
443 444 445 446 447 448 449 450 451
	/* We found where to put this one.  Check for overlap with
	 * preceding fragment, and, if needed, align things so that
	 * any overlaps are eliminated.
	 */
	if (prev) {
		int i = (FRAG_CB(prev)->offset + prev->len) - offset;

		if (i > 0) {
			offset += i;
452
			err = -EINVAL;
L
Linus Torvalds 已提交
453 454
			if (end <= offset)
				goto err;
455
			err = -ENOMEM;
L
Linus Torvalds 已提交
456 457 458 459 460 461 462
			if (!pskb_pull(skb, i))
				goto err;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
	}

463 464
	err = -ENOMEM;

L
Linus Torvalds 已提交
465 466 467 468 469 470 471 472 473 474
	while (next && FRAG_CB(next)->offset < end) {
		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */

		if (i < next->len) {
			/* Eat head of the next overlapped fragment
			 * and leave the loop. The next ones cannot overlap.
			 */
			if (!pskb_pull(next, i))
				goto err;
			FRAG_CB(next)->offset += i;
475
			qp->q.meat -= i;
L
Linus Torvalds 已提交
476 477 478 479 480 481
			if (next->ip_summed != CHECKSUM_UNNECESSARY)
				next->ip_summed = CHECKSUM_NONE;
			break;
		} else {
			struct sk_buff *free_it = next;

482
			/* Old fragment is completely overridden with
L
Linus Torvalds 已提交
483 484 485 486 487 488 489
			 * new one drop it.
			 */
			next = next->next;

			if (prev)
				prev->next = next;
			else
490
				qp->q.fragments = next;
L
Linus Torvalds 已提交
491

492
			qp->q.meat -= free_it->len;
E
Eric Dumazet 已提交
493
			frag_kfree_skb(qp->q.net, free_it);
L
Linus Torvalds 已提交
494 495 496 497 498 499 500
		}
	}

	FRAG_CB(skb)->offset = offset;

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
501 502
	if (!next)
		qp->q.fragments_tail = skb;
L
Linus Torvalds 已提交
503 504 505
	if (prev)
		prev->next = skb;
	else
506
		qp->q.fragments = skb;
L
Linus Torvalds 已提交
507

508 509 510 511 512
	dev = skb->dev;
	if (dev) {
		qp->iif = dev->ifindex;
		skb->dev = NULL;
	}
513 514
	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
515
	qp->ecn |= ecn;
516
	atomic_add(skb->truesize, &qp->q.net->mem);
L
Linus Torvalds 已提交
517
	if (offset == 0)
518
		qp->q.last_in |= INET_FRAG_FIRST_IN;
L
Linus Torvalds 已提交
519

520 521
	if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
	    qp->q.meat == qp->q.len)
522 523
		return ip_frag_reasm(qp, prev, dev);

524
	write_lock(&ip4_frags.lock);
525
	list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
526
	write_unlock(&ip4_frags.lock);
527
	return -EINPROGRESS;
L
Linus Torvalds 已提交
528 529 530

err:
	kfree_skb(skb);
531
	return err;
L
Linus Torvalds 已提交
532 533 534 535 536
}


/* Build a new IP datagram from all its fragments. */

537 538
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev)
L
Linus Torvalds 已提交
539
{
540
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
L
Linus Torvalds 已提交
541
	struct iphdr *iph;
542
	struct sk_buff *fp, *head = qp->q.fragments;
L
Linus Torvalds 已提交
543 544
	int len;
	int ihlen;
545
	int err;
546
	u8 ecn;
L
Linus Torvalds 已提交
547 548 549

	ipq_kill(qp);

550 551 552 553 554
	ecn = ip4_frag_ecn_table[qp->ecn];
	if (unlikely(ecn == 0xff)) {
		err = -EINVAL;
		goto out_fail;
	}
555 556 557 558 559 560 561 562
	/* Make the one we just received the head. */
	if (prev) {
		head = prev->next;
		fp = skb_clone(head, GFP_ATOMIC);
		if (!fp)
			goto out_nomem;

		fp->next = head->next;
563 564
		if (!fp->next)
			qp->q.fragments_tail = fp;
565 566
		prev->next = fp;

567 568
		skb_morph(head, qp->q.fragments);
		head->next = qp->q.fragments->next;
569

570 571
		kfree_skb(qp->q.fragments);
		qp->q.fragments = head;
572 573
	}

574 575
	WARN_ON(head == NULL);
	WARN_ON(FRAG_CB(head)->offset != 0);
L
Linus Torvalds 已提交
576 577

	/* Allocate a new buffer for the datagram. */
578
	ihlen = ip_hdrlen(head);
579
	len = ihlen + qp->q.len;
L
Linus Torvalds 已提交
580

581
	err = -E2BIG;
S
Stephen Hemminger 已提交
582
	if (len > 65535)
L
Linus Torvalds 已提交
583 584 585 586 587 588 589 590 591
		goto out_oversize;

	/* Head of list must not be cloned. */
	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
		goto out_nomem;

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments. */
592
	if (skb_has_frag_list(head)) {
L
Linus Torvalds 已提交
593 594 595 596 597 598 599 600
		struct sk_buff *clone;
		int i, plen = 0;

		if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
			goto out_nomem;
		clone->next = head->next;
		head->next = clone;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
601
		skb_frag_list_init(head);
L
Linus Torvalds 已提交
602 603 604 605 606 607 608
		for (i=0; i<skb_shinfo(head)->nr_frags; i++)
			plen += skb_shinfo(head)->frags[i].size;
		clone->len = clone->data_len = head->data_len - plen;
		head->data_len -= clone->len;
		head->len -= clone->len;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
609
		atomic_add(clone->truesize, &qp->q.net->mem);
L
Linus Torvalds 已提交
610 611 612
	}

	skb_shinfo(head)->frag_list = head->next;
613
	skb_push(head, head->data - skb_network_header(head));
L
Linus Torvalds 已提交
614 615 616 617 618 619

	for (fp=head->next; fp; fp = fp->next) {
		head->data_len += fp->len;
		head->len += fp->len;
		if (head->ip_summed != fp->ip_summed)
			head->ip_summed = CHECKSUM_NONE;
620
		else if (head->ip_summed == CHECKSUM_COMPLETE)
L
Linus Torvalds 已提交
621 622 623
			head->csum = csum_add(head->csum, fp->csum);
		head->truesize += fp->truesize;
	}
E
Eric Dumazet 已提交
624
	atomic_sub(head->truesize, &qp->q.net->mem);
L
Linus Torvalds 已提交
625 626 627

	head->next = NULL;
	head->dev = dev;
628
	head->tstamp = qp->q.stamp;
L
Linus Torvalds 已提交
629

630
	iph = ip_hdr(head);
L
Linus Torvalds 已提交
631 632
	iph->frag_off = 0;
	iph->tot_len = htons(len);
633
	iph->tos |= ecn;
634
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
635
	qp->q.fragments = NULL;
636
	qp->q.fragments_tail = NULL;
637
	return 0;
L
Linus Torvalds 已提交
638 639

out_nomem:
640
	LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
641
			      "queue %p\n", qp);
642
	err = -ENOMEM;
L
Linus Torvalds 已提交
643 644 645
	goto out_fail;
out_oversize:
	if (net_ratelimit())
646 647
		printk(KERN_INFO "Oversized IP packet from %pI4.\n",
			&qp->saddr);
L
Linus Torvalds 已提交
648
out_fail:
649
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
650
	return err;
L
Linus Torvalds 已提交
651 652 653
}

/* Process an incoming IP datagram fragment. */
654
int ip_defrag(struct sk_buff *skb, u32 user)
L
Linus Torvalds 已提交
655 656
{
	struct ipq *qp;
657
	struct net *net;
658

E
Eric Dumazet 已提交
659
	net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
660
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
L
Linus Torvalds 已提交
661 662

	/* Start by cleaning up the memory. */
663
	if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
664
		ip_evictor(net);
L
Linus Torvalds 已提交
665 666

	/* Lookup (or create) queue header */
667
	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
668
		int ret;
L
Linus Torvalds 已提交
669

670
		spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
671

672
		ret = ip_frag_queue(qp, skb);
L
Linus Torvalds 已提交
673

674
		spin_unlock(&qp->q.lock);
675
		ipq_put(qp);
676
		return ret;
L
Linus Torvalds 已提交
677 678
	}

679
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
680
	kfree_skb(skb);
681
	return -ENOMEM;
L
Linus Torvalds 已提交
682
}
E
Eric Dumazet 已提交
683
EXPORT_SYMBOL(ip_defrag);
L
Linus Torvalds 已提交
684

685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720
/* Defragment @skb if it is a well-formed IPv4 fragment.
 *
 * Anything that is not ETH_P_IP, is too short, has a bad version/ihl or a
 * tot_len inconsistent with the skb is handed back untouched, as is a
 * packet that is not a fragment.  For fragments:
 *   - NULL is returned when skb_share_check() fails or ip_defrag() returns
 *     non-zero (fragment queued or dropped);
 *   - the skb is returned when trimming to tot_len fails, or once
 *     ip_defrag() has returned 0 (rxhash is then reset).
 */
struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
{
	const struct iphdr *hdr;
	u32 hdr_len, tot_len;

	if (skb->protocol != htons(ETH_P_IP) ||
	    !pskb_may_pull(skb, sizeof(struct iphdr)))
		return skb;

	hdr = ip_hdr(skb);
	if (hdr->ihl < 5 || hdr->version != 4)
		return skb;
	if (!pskb_may_pull(skb, hdr->ihl * 4))
		return skb;

	/* Fetch the header pointer again after the second pull. */
	hdr = ip_hdr(skb);
	hdr_len = hdr->ihl * 4;
	tot_len = ntohs(hdr->tot_len);
	if (tot_len > skb->len || tot_len < hdr_len)
		return skb;

	if (!ip_is_fragment(hdr))
		return skb;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		return NULL;

	if (pskb_trim_rcsum(skb, tot_len))
		return skb;

	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
	if (ip_defrag(skb, user))
		return NULL;

	skb->rxhash = 0;
	return skb;
}
EXPORT_SYMBOL(ip_check_defrag);

721 722 723
#ifdef CONFIG_SYSCTL
static int zero;

724
static struct ctl_table ip4_frags_ns_ctl_table[] = {
725 726
	{
		.procname	= "ipfrag_high_thresh",
727
		.data		= &init_net.ipv4.frags.high_thresh,
728 729
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
730
		.proc_handler	= proc_dointvec
731 732 733
	},
	{
		.procname	= "ipfrag_low_thresh",
734
		.data		= &init_net.ipv4.frags.low_thresh,
735 736
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
737
		.proc_handler	= proc_dointvec
738 739 740
	},
	{
		.procname	= "ipfrag_time",
741
		.data		= &init_net.ipv4.frags.timeout,
742 743
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
744
		.proc_handler	= proc_dointvec_jiffies,
745
	},
746 747 748 749
	{ }
};

static struct ctl_table ip4_frags_ctl_table[] = {
750 751
	{
		.procname	= "ipfrag_secret_interval",
752
		.data		= &ip4_frags.secret_interval,
753 754
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
755
		.proc_handler	= proc_dointvec_jiffies,
756 757 758 759 760 761
	},
	{
		.procname	= "ipfrag_max_dist",
		.data		= &sysctl_ipfrag_max_dist,
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
762
		.proc_handler	= proc_dointvec_minmax,
763 764 765 766 767
		.extra1		= &zero
	},
	{ }
};

768
/* Register the per-namespace fragment sysctls for @net.
 *
 * init_net uses ip4_frags_ns_ctl_table directly; every other namespace
 * gets a kmemdup()ed copy whose .data pointers are redirected to that
 * namespace's own counters.  The header is stored in net->ipv4.frags_hdr.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int __net_init ip4_frags_ns_ctl_register(struct net *net)
{
	struct ctl_table *table;
	struct ctl_table_header *hdr;

	table = ip4_frags_ns_ctl_table;
	if (!net_eq(net, &init_net)) {
		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
		if (table == NULL)
			goto err_alloc;

		/* Indices follow the entry order of ip4_frags_ns_ctl_table. */
		table[0].data = &net->ipv4.frags.high_thresh;
		table[1].data = &net->ipv4.frags.low_thresh;
		table[2].data = &net->ipv4.frags.timeout;
	}

	hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table);
	if (hdr == NULL)
		goto err_reg;

	net->ipv4.frags_hdr = hdr;
	return 0;

err_reg:
	/* Only the duplicated table is heap memory. */
	if (!net_eq(net, &init_net))
		kfree(table);
err_alloc:
	return -ENOMEM;
}

798
/* Unregister @net's fragment sysctls and free the table they used.
 *
 * NOTE(review): the table is freed unconditionally, whereas for init_net
 * ip4_frags_ns_ctl_register() installs the static template.  This looks
 * safe only as long as this is never run for init_net -- confirm.
 */
static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
{
	struct ctl_table *table;

	/* Fetch the table pointer before the header goes away. */
	table = net->ipv4.frags_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.frags_hdr);
	kfree(table);
}
806 807 808 809 810

/* Register ip4_frags_ctl_table under net_ipv4_ctl_path through
 * register_net_sysctl_rotable(); the return value is not checked.
 */
static void ip4_frags_ctl_register(void)
{
	register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table);
}
811
#else
812
/* !CONFIG_SYSCTL stub: nothing to register, always succeeds. */
static inline int ip4_frags_ns_ctl_register(struct net *net)
{
	return 0;
}
816

817
/* !CONFIG_SYSCTL stub: nothing to unregister. */
static inline void ip4_frags_ns_ctl_unregister(struct net *net)
{
}
820 821 822 823

/* !CONFIG_SYSCTL stub: no sysctls to register. */
static inline void ip4_frags_ctl_register(void)
{
}
824 825
#endif

826
/* pernet init callback: set the default fragment limits and timeout for
 * @net, initialise its fragment state and register its sysctls.
 * Returns the result of ip4_frags_ns_ctl_register().
 */
static int __net_init ipv4_frags_init_net(struct net *net)
{
	/*
	 * Fragment cache limits. We will commit 256K at one time. Should we
	 * cross that limit we will prune down to 192K. This should cope with
	 * even the most extreme cases without allowing an attacker to
	 * measurably harm machine performance.
	 */
	net->ipv4.frags.high_thresh = 256 * 1024;
	net->ipv4.frags.low_thresh = 192 * 1024;
	/*
	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
	 * RFC791 is wrong proposing to prolongate timer each fragment arrival
	 * by TTL.
	 */
	net->ipv4.frags.timeout = IP_FRAG_TIME;

	inet_frags_init_net(&net->ipv4.frags);

	return ip4_frags_ns_ctl_register(net);
}

848
/* pernet exit callback: remove @net's sysctls first, then tear down its
 * fragment state.
 */
static void __net_exit ipv4_frags_exit_net(struct net *net)
{
	ip4_frags_ns_ctl_unregister(net);
	inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
}

/* Per-namespace init/exit hooks, registered from ipfrag_init(). */
static struct pernet_operations ip4_frags_ops = {
	.init = ipv4_frags_init_net,
	.exit = ipv4_frags_exit_net,
};

859
/* Boot-time setup of IPv4 defragmentation: register the global sysctls and
 * the pernet operations, fill in the ip4_frags callbacks and parameters,
 * then hand the descriptor to inet_frags_init().
 *
 * NOTE(review): the return value of register_pernet_subsys() is ignored,
 * and the pernet ops are registered before ip4_frags is populated and
 * initialised; the statement order is kept exactly as found.
 */
void __init ipfrag_init(void)
{
	ip4_frags_ctl_register();
	register_pernet_subsys(&ip4_frags_ops);
	ip4_frags.hashfn = ip4_hashfn;
	ip4_frags.constructor = ip4_frag_init;
	ip4_frags.destructor = ip4_frag_free;
	ip4_frags.skb_free = NULL;
	ip4_frags.qsize = sizeof(struct ipq);
	ip4_frags.match = ip4_frag_match;
	ip4_frags.frag_expire = ip_expire;
	/* 10 minutes, expressed in jiffies. */
	ip4_frags.secret_interval = 10 * 60 * HZ;
	inet_frags_init(&ip4_frags);
}