ip_fragment.c 21.2 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima          :       ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 *		Alexey Kuznetsov:	SMP races, threading, cleanup.
 *		Patrick McHardy :	LRU queue of frag heads for evictor.
 */

23 24
#define pr_fmt(fmt) "IPv4: " fmt

H
Herbert Xu 已提交
25
#include <linux/compiler.h>
L
Linus Torvalds 已提交
26 27 28 29 30 31 32 33 34 35 36
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
37
#include <linux/slab.h>
38 39
#include <net/route.h>
#include <net/dst.h>
L
Linus Torvalds 已提交
40 41 42 43
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
H
Herbert Xu 已提交
44
#include <net/inetpeer.h>
45
#include <net/inet_frag.h>
L
Linus Torvalds 已提交
46 47 48 49
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
50
#include <net/inet_ecn.h>
L
Linus Torvalds 已提交
51 52 53 54 55 56

/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
 * as well. Or notify me, at least. --ANK
 */

57
/* Threshold used by ip_frag_too_far(): a queue is considered stale when the
 * peer's fragment counter advanced by more than this since the queue last
 * saw a fragment.  0 disables the check (and the peer lookup at queue
 * creation).
 */
static int sysctl_ipfrag_max_dist __read_mostly = 64;
H
Herbert Xu 已提交
58

L
Linus Torvalds 已提交
59 60 61 62 63 64
/* Per-skb reassembly state stored in skb->cb: the regular IPv4 control
 * block followed by the fragment's data offset, as written by
 * ip_frag_queue().
 */
struct ipfrag_skb_cb {
	struct inet_skb_parm	h;
	int			offset;
};

/* View an skb's control buffer as a struct ipfrag_skb_cb. */
#define FRAG_CB(skb)	((struct ipfrag_skb_cb *)((skb)->cb))
L
Linus Torvalds 已提交
66 67 68

/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
69 70
	struct inet_frag_queue q;

L
Linus Torvalds 已提交
71
	u32		user;
72 73 74
	__be32		saddr;
	__be32		daddr;
	__be16		id;
L
Linus Torvalds 已提交
75
	u8		protocol;
76
	u8		ecn; /* RFC3168 support */
H
Herbert Xu 已提交
77 78 79
	int             iif;
	unsigned int    rid;
	struct inet_peer *peer;
L
Linus Torvalds 已提交
80 81
};

82 83 84 85
/* RFC 3168 support :
 * We want to check ECN values of all fragments, to detect invalid combinations.
 * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
 */
86 87 88 89
/* Bit flags OR-ed into ipq->ecn, one per ECN codepoint seen so far. */
#define	IPFRAG_ECN_NOT_ECT	0x01 /* one frag had ECN_NOT_ECT */
#define	IPFRAG_ECN_ECT_1	0x02 /* one frag had ECN_ECT_1 */
#define	IPFRAG_ECN_ECT_0	0x04 /* one frag had ECN_ECT_0 */
#define	IPFRAG_ECN_CE		0x08 /* one frag had ECN_CE */
90 91 92

/* Turn the ECN bits of @tos into a single-bit flag:
 * 1 << (tos & INET_ECN_MASK).  The result is what gets OR-ed into ipq->ecn.
 */
static inline u8 ip4_frag_ecn(u8 tos)
{
	return 1 << (tos & INET_ECN_MASK);
}

96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 *
 * The table is indexed by the accumulated ipq->ecn flag set; any
 * combination not listed below is zero-initialized, i.e. nothing is ORed
 * into the TOS field.
 */
static const u8 ip4_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};

116
/* IPv4 fragment-queue descriptor; its callbacks and parameters are filled
 * in by ipfrag_init().
 */
static struct inet_frags ip4_frags;
L
Linus Torvalds 已提交
117

118
int ip_frag_nqueues(struct net *net)
119
{
120
	return net->ipv4.frags.nqueues;
121
}
L
Linus Torvalds 已提交
122

123
int ip_frag_mem(struct net *net)
124
{
125
	return atomic_read(&net->ipv4.frags.mem);
126
}
L
Linus Torvalds 已提交
127

128 129 130
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev);

131 132 133 134 135
/* Lookup/creation argument built by ip_find() and consumed by
 * ip4_frag_match() and ip4_frag_init().
 */
struct ip4_create_arg {
	struct iphdr *iph;	/* header of the fragment being looked up */
	u32 user;		/* defrag user, part of the queue key */
};

136
/* Hash the (id, protocol, saddr, daddr) tuple with the ip4_frags.rnd seed
 * and mask the result to INETFRAGS_HASHSZ - 1.
 */
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
	return jhash_3words((__force u32)id << 16 | prot,
			    (__force u32)saddr, (__force u32)daddr,
			    ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
}

143
static unsigned int ip4_hashfn(struct inet_frag_queue *q)
L
Linus Torvalds 已提交
144
{
145
	struct ipq *ipq;
L
Linus Torvalds 已提交
146

147 148
	ipq = container_of(q, struct ipq, q);
	return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
L
Linus Torvalds 已提交
149 150
}

151
static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
152 153 154 155 156
{
	struct ipq *qp;
	struct ip4_create_arg *arg = a;

	qp = container_of(q, struct ipq, q);
E
Eric Dumazet 已提交
157
	return	qp->id == arg->iph->id &&
158 159 160 161
		qp->saddr == arg->iph->saddr &&
		qp->daddr == arg->iph->daddr &&
		qp->protocol == arg->iph->protocol &&
		qp->user == arg->user;
162 163
}

L
Linus Torvalds 已提交
164
/* Memory Tracking Functions. */
E
Eric Dumazet 已提交
165
static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
L
Linus Torvalds 已提交
166
{
167
	atomic_sub(skb->truesize, &nf->mem);
L
Linus Torvalds 已提交
168 169 170
	kfree_skb(skb);
}

171 172 173
static void ip4_frag_init(struct inet_frag_queue *q, void *a)
{
	struct ipq *qp = container_of(q, struct ipq, q);
174 175 176 177
	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
					       frags);
	struct net *net = container_of(ipv4, struct net, ipv4);

178 179 180 181
	struct ip4_create_arg *arg = a;

	qp->protocol = arg->iph->protocol;
	qp->id = arg->iph->id;
182
	qp->ecn = ip4_frag_ecn(arg->iph->tos);
183 184 185 186
	qp->saddr = arg->iph->saddr;
	qp->daddr = arg->iph->daddr;
	qp->user = arg->user;
	qp->peer = sysctl_ipfrag_max_dist ?
187
		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
188 189
}

190
static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
L
Linus Torvalds 已提交
191
{
192 193 194 195 196
	struct ipq *qp;

	qp = container_of(q, struct ipq, q);
	if (qp->peer)
		inet_putpeer(qp->peer);
L
Linus Torvalds 已提交
197 198 199 200 201
}


/* Destruction primitives. */

202
static __inline__ void ipq_put(struct ipq *ipq)
L
Linus Torvalds 已提交
203
{
P
Pavel Emelyanov 已提交
204
	inet_frag_put(&ipq->q, &ip4_frags);
L
Linus Torvalds 已提交
205 206 207 208 209 210 211
}

/* Kill ipq entry. It is not destroyed immediately,
 * because caller (and someone more) holds reference count.
 */
static void ipq_kill(struct ipq *ipq)
{
212
	inet_frag_kill(&ipq->q, &ip4_frags);
L
Linus Torvalds 已提交
213 214
}

215
/* Memory limiting on fragments.  Evictor trashes the oldest
L
Linus Torvalds 已提交
216 217
 * fragment queue until we are back under the threshold.
 */
218
static void ip_evictor(struct net *net)
L
Linus Torvalds 已提交
219
{
220 221
	int evicted;

222
	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
223
	if (evicted)
224
		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
L
Linus Torvalds 已提交
225 226 227 228 229 230 231
}

/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 */
static void ip_expire(unsigned long arg)
{
232
	struct ipq *qp;
233
	struct net *net;
234 235

	qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
236
	net = container_of(qp->q.net, struct net, ipv4.frags);
L
Linus Torvalds 已提交
237

238
	spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
239

240
	if (qp->q.last_in & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
241 242 243 244
		goto out;

	ipq_kill(qp);

245 246
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
247

248
	if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
249
		struct sk_buff *head = qp->q.fragments;
250 251
		const struct iphdr *iph;
		int err;
252

E
Eric Dumazet 已提交
253 254
		rcu_read_lock();
		head->dev = dev_get_by_index_rcu(net, qp->iif);
255 256 257
		if (!head->dev)
			goto out_rcu_unlock;

258 259 260
		/* skb dst is stale, drop it, and perform route lookup again */
		skb_dst_drop(head);
		iph = ip_hdr(head);
261 262
		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
					   iph->tos, head->dev);
263 264 265
		if (err)
			goto out_rcu_unlock;

266
		/*
267 268
		 * Only an end host needs to send an ICMP
		 * "Fragment Reassembly Timeout" message, per RFC792.
269
		 */
270 271 272
		if (qp->user == IP_DEFRAG_AF_PACKET ||
		    (qp->user == IP_DEFRAG_CONNTRACK_IN &&
		     skb_rtable(head)->rt_type != RTN_LOCAL))
273 274
			goto out_rcu_unlock;

275 276 277 278

		/* Send an ICMP "Fragment Reassembly Timeout" message. */
		icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
out_rcu_unlock:
279 280
		rcu_read_unlock();
	}
L
Linus Torvalds 已提交
281
out:
282
	spin_unlock(&qp->q.lock);
283
	ipq_put(qp);
L
Linus Torvalds 已提交
284 285
}

286 287 288
/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 */
289
static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
L
Linus Torvalds 已提交
290
{
291 292
	struct inet_frag_queue *q;
	struct ip4_create_arg arg;
293
	unsigned int hash;
L
Linus Torvalds 已提交
294

295 296
	arg.iph = iph;
	arg.user = user;
297 298

	read_lock(&ip4_frags.lock);
299
	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
L
Linus Torvalds 已提交
300

301
	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
302 303
	if (q == NULL)
		goto out_nomem;
L
Linus Torvalds 已提交
304

305
	return container_of(q, struct ipq, q);
L
Linus Torvalds 已提交
306 307

out_nomem:
308
	LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: no memory left !\n"));
L
Linus Torvalds 已提交
309 310 311
	return NULL;
}

H
Herbert Xu 已提交
312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327
/* Is the fragment too far ahead to be part of ipq? */
static inline int ip_frag_too_far(struct ipq *qp)
{
	struct inet_peer *peer = qp->peer;
	unsigned int max = sysctl_ipfrag_max_dist;
	unsigned int start, end;

	int rc;

	if (!peer || !max)
		return 0;

	start = qp->rid;
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

328
	rc = qp->q.fragments && (end - start) > max;
H
Herbert Xu 已提交
329 330

	if (rc) {
331 332 333 334
		struct net *net;

		net = container_of(qp->q.net, struct net, ipv4.frags);
		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
H
Herbert Xu 已提交
335 336 337 338 339 340 341 342 343
	}

	return rc;
}

static int ip_frag_reinit(struct ipq *qp)
{
	struct sk_buff *fp;

344
	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
345
		atomic_inc(&qp->q.refcnt);
H
Herbert Xu 已提交
346 347 348
		return -ETIMEDOUT;
	}

349
	fp = qp->q.fragments;
H
Herbert Xu 已提交
350 351
	do {
		struct sk_buff *xp = fp->next;
E
Eric Dumazet 已提交
352
		frag_kfree_skb(qp->q.net, fp);
H
Herbert Xu 已提交
353 354 355
		fp = xp;
	} while (fp);

356 357 358 359
	qp->q.last_in = 0;
	qp->q.len = 0;
	qp->q.meat = 0;
	qp->q.fragments = NULL;
360
	qp->q.fragments_tail = NULL;
H
Herbert Xu 已提交
361
	qp->iif = 0;
362
	qp->ecn = 0;
H
Herbert Xu 已提交
363 364 365 366

	return 0;
}

L
Linus Torvalds 已提交
367
/* Add new segment to existing queue. */
368
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
L
Linus Torvalds 已提交
369 370
{
	struct sk_buff *prev, *next;
371
	struct net_device *dev;
L
Linus Torvalds 已提交
372 373
	int flags, offset;
	int ihl, end;
374
	int err = -ENOENT;
375
	u8 ecn;
L
Linus Torvalds 已提交
376

377
	if (qp->q.last_in & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
378 379
		goto err;

H
Herbert Xu 已提交
380
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
381 382
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
H
Herbert Xu 已提交
383 384 385 386
		ipq_kill(qp);
		goto err;
	}

387
	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
388
	offset = ntohs(ip_hdr(skb)->frag_off);
L
Linus Torvalds 已提交
389 390 391
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
392
	ihl = ip_hdrlen(skb);
L
Linus Torvalds 已提交
393 394

	/* Determine the position of this fragment. */
395
	end = offset + skb->len - ihl;
396
	err = -EINVAL;
L
Linus Torvalds 已提交
397 398 399 400

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
401
		 * or have different end, the segment is corrupted.
L
Linus Torvalds 已提交
402
		 */
403
		if (end < qp->q.len ||
404
		    ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
L
Linus Torvalds 已提交
405
			goto err;
406
		qp->q.last_in |= INET_FRAG_LAST_IN;
407
		qp->q.len = end;
L
Linus Torvalds 已提交
408 409 410 411 412 413
	} else {
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
414
		if (end > qp->q.len) {
L
Linus Torvalds 已提交
415
			/* Some bits beyond end -> corruption. */
416
			if (qp->q.last_in & INET_FRAG_LAST_IN)
L
Linus Torvalds 已提交
417
				goto err;
418
			qp->q.len = end;
L
Linus Torvalds 已提交
419 420 421 422 423
		}
	}
	if (end == offset)
		goto err;

424
	err = -ENOMEM;
L
Linus Torvalds 已提交
425 426
	if (pskb_pull(skb, ihl) == NULL)
		goto err;
427 428 429

	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
L
Linus Torvalds 已提交
430 431 432 433 434 435
		goto err;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 */
436 437 438 439 440
	prev = qp->q.fragments_tail;
	if (!prev || FRAG_CB(prev)->offset < offset) {
		next = NULL;
		goto found;
	}
L
Linus Torvalds 已提交
441
	prev = NULL;
442
	for (next = qp->q.fragments; next != NULL; next = next->next) {
L
Linus Torvalds 已提交
443 444 445 446 447
		if (FRAG_CB(next)->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}

448
found:
L
Linus Torvalds 已提交
449 450 451 452 453 454 455 456 457
	/* We found where to put this one.  Check for overlap with
	 * preceding fragment, and, if needed, align things so that
	 * any overlaps are eliminated.
	 */
	if (prev) {
		int i = (FRAG_CB(prev)->offset + prev->len) - offset;

		if (i > 0) {
			offset += i;
458
			err = -EINVAL;
L
Linus Torvalds 已提交
459 460
			if (end <= offset)
				goto err;
461
			err = -ENOMEM;
L
Linus Torvalds 已提交
462 463 464 465 466 467 468
			if (!pskb_pull(skb, i))
				goto err;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
	}

469 470
	err = -ENOMEM;

L
Linus Torvalds 已提交
471 472 473 474 475 476 477 478 479 480
	while (next && FRAG_CB(next)->offset < end) {
		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */

		if (i < next->len) {
			/* Eat head of the next overlapped fragment
			 * and leave the loop. The next ones cannot overlap.
			 */
			if (!pskb_pull(next, i))
				goto err;
			FRAG_CB(next)->offset += i;
481
			qp->q.meat -= i;
L
Linus Torvalds 已提交
482 483 484 485 486 487
			if (next->ip_summed != CHECKSUM_UNNECESSARY)
				next->ip_summed = CHECKSUM_NONE;
			break;
		} else {
			struct sk_buff *free_it = next;

488
			/* Old fragment is completely overridden with
L
Linus Torvalds 已提交
489 490 491 492 493 494 495
			 * new one drop it.
			 */
			next = next->next;

			if (prev)
				prev->next = next;
			else
496
				qp->q.fragments = next;
L
Linus Torvalds 已提交
497

498
			qp->q.meat -= free_it->len;
E
Eric Dumazet 已提交
499
			frag_kfree_skb(qp->q.net, free_it);
L
Linus Torvalds 已提交
500 501 502 503 504 505 506
		}
	}

	FRAG_CB(skb)->offset = offset;

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
507 508
	if (!next)
		qp->q.fragments_tail = skb;
L
Linus Torvalds 已提交
509 510 511
	if (prev)
		prev->next = skb;
	else
512
		qp->q.fragments = skb;
L
Linus Torvalds 已提交
513

514 515 516 517 518
	dev = skb->dev;
	if (dev) {
		qp->iif = dev->ifindex;
		skb->dev = NULL;
	}
519 520
	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
521
	qp->ecn |= ecn;
522
	atomic_add(skb->truesize, &qp->q.net->mem);
L
Linus Torvalds 已提交
523
	if (offset == 0)
524
		qp->q.last_in |= INET_FRAG_FIRST_IN;
L
Linus Torvalds 已提交
525

526 527 528 529
	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
	    skb->len + ihl > qp->q.max_size)
		qp->q.max_size = skb->len + ihl;

530 531
	if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
	    qp->q.meat == qp->q.len)
532 533
		return ip_frag_reasm(qp, prev, dev);

534
	write_lock(&ip4_frags.lock);
535
	list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
536
	write_unlock(&ip4_frags.lock);
537
	return -EINPROGRESS;
L
Linus Torvalds 已提交
538 539 540

err:
	kfree_skb(skb);
541
	return err;
L
Linus Torvalds 已提交
542 543 544 545 546
}


/* Build a new IP datagram from all its fragments. */

547 548
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev)
L
Linus Torvalds 已提交
549
{
550
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
L
Linus Torvalds 已提交
551
	struct iphdr *iph;
552
	struct sk_buff *fp, *head = qp->q.fragments;
L
Linus Torvalds 已提交
553 554
	int len;
	int ihlen;
555
	int err;
556
	int sum_truesize;
557
	u8 ecn;
L
Linus Torvalds 已提交
558 559 560

	ipq_kill(qp);

561 562 563 564 565
	ecn = ip4_frag_ecn_table[qp->ecn];
	if (unlikely(ecn == 0xff)) {
		err = -EINVAL;
		goto out_fail;
	}
566 567 568 569 570 571 572 573
	/* Make the one we just received the head. */
	if (prev) {
		head = prev->next;
		fp = skb_clone(head, GFP_ATOMIC);
		if (!fp)
			goto out_nomem;

		fp->next = head->next;
574 575
		if (!fp->next)
			qp->q.fragments_tail = fp;
576 577
		prev->next = fp;

578 579
		skb_morph(head, qp->q.fragments);
		head->next = qp->q.fragments->next;
580

581
		consume_skb(qp->q.fragments);
582
		qp->q.fragments = head;
583 584
	}

585 586
	WARN_ON(head == NULL);
	WARN_ON(FRAG_CB(head)->offset != 0);
L
Linus Torvalds 已提交
587 588

	/* Allocate a new buffer for the datagram. */
589
	ihlen = ip_hdrlen(head);
590
	len = ihlen + qp->q.len;
L
Linus Torvalds 已提交
591

592
	err = -E2BIG;
S
Stephen Hemminger 已提交
593
	if (len > 65535)
L
Linus Torvalds 已提交
594 595 596 597 598 599 600 601 602
		goto out_oversize;

	/* Head of list must not be cloned. */
	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
		goto out_nomem;

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments. */
603
	if (skb_has_frag_list(head)) {
L
Linus Torvalds 已提交
604 605 606 607 608 609 610 611
		struct sk_buff *clone;
		int i, plen = 0;

		if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
			goto out_nomem;
		clone->next = head->next;
		head->next = clone;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
612
		skb_frag_list_init(head);
E
Eric Dumazet 已提交
613 614
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
L
Linus Torvalds 已提交
615 616 617 618 619
		clone->len = clone->data_len = head->data_len - plen;
		head->data_len -= clone->len;
		head->len -= clone->len;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
620
		atomic_add(clone->truesize, &qp->q.net->mem);
L
Linus Torvalds 已提交
621 622
	}

623
	skb_push(head, head->data - skb_network_header(head));
L
Linus Torvalds 已提交
624

625 626 627 628 629 630 631
	sum_truesize = head->truesize;
	for (fp = head->next; fp;) {
		bool headstolen;
		int delta;
		struct sk_buff *next = fp->next;

		sum_truesize += fp->truesize;
L
Linus Torvalds 已提交
632 633
		if (head->ip_summed != fp->ip_summed)
			head->ip_summed = CHECKSUM_NONE;
634
		else if (head->ip_summed == CHECKSUM_COMPLETE)
L
Linus Torvalds 已提交
635
			head->csum = csum_add(head->csum, fp->csum);
636 637 638 639 640 641 642 643 644 645 646

		if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
			kfree_skb_partial(fp, headstolen);
		} else {
			if (!skb_shinfo(head)->frag_list)
				skb_shinfo(head)->frag_list = fp;
			head->data_len += fp->len;
			head->len += fp->len;
			head->truesize += fp->truesize;
		}
		fp = next;
L
Linus Torvalds 已提交
647
	}
648
	atomic_sub(sum_truesize, &qp->q.net->mem);
L
Linus Torvalds 已提交
649 650 651

	head->next = NULL;
	head->dev = dev;
652
	head->tstamp = qp->q.stamp;
653
	IPCB(head)->frag_max_size = qp->q.max_size;
L
Linus Torvalds 已提交
654

655
	iph = ip_hdr(head);
656 657
	/* max_size != 0 implies at least one fragment had IP_DF set */
	iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
L
Linus Torvalds 已提交
658
	iph->tot_len = htons(len);
659
	iph->tos |= ecn;
660
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
661
	qp->q.fragments = NULL;
662
	qp->q.fragments_tail = NULL;
663
	return 0;
L
Linus Torvalds 已提交
664 665

out_nomem:
666 667
	LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"),
		       qp);
668
	err = -ENOMEM;
L
Linus Torvalds 已提交
669 670
	goto out_fail;
out_oversize:
671
	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
L
Linus Torvalds 已提交
672
out_fail:
673
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
674
	return err;
L
Linus Torvalds 已提交
675 676 677
}

/* Process an incoming IP datagram fragment. */
678
int ip_defrag(struct sk_buff *skb, u32 user)
L
Linus Torvalds 已提交
679 680
{
	struct ipq *qp;
681
	struct net *net;
682

E
Eric Dumazet 已提交
683
	net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
684
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
L
Linus Torvalds 已提交
685 686

	/* Start by cleaning up the memory. */
687
	ip_evictor(net);
L
Linus Torvalds 已提交
688 689

	/* Lookup (or create) queue header */
690
	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
691
		int ret;
L
Linus Torvalds 已提交
692

693
		spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
694

695
		ret = ip_frag_queue(qp, skb);
L
Linus Torvalds 已提交
696

697
		spin_unlock(&qp->q.lock);
698
		ipq_put(qp);
699
		return ret;
L
Linus Torvalds 已提交
700 701
	}

702
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
703
	kfree_skb(skb);
704
	return -ENOMEM;
L
Linus Torvalds 已提交
705
}
E
Eric Dumazet 已提交
706
EXPORT_SYMBOL(ip_defrag);
L
Linus Torvalds 已提交
707

708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743
/* Defragment @skb if it is an IPv4 fragment; otherwise hand it back as is.
 *
 * Returns:
 *   - @skb untouched when it is not ETH_P_IP, not a well-formed IPv4
 *     header, shorter than its tot_len, or not a fragment;
 *   - NULL when ip_defrag() returned non-zero (the fragment was consumed
 *     or dropped), or when skb_share_check() failed;
 *   - the skb after ip_defrag() returned 0, with rxhash cleared.
 *
 * The IP header is located through skb_network_offset() rather than by
 * assuming it starts at skb->data, so every length check, pull and trim
 * below is offset by that amount.
 */
struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
{
	struct iphdr iph;
	int netoff;
	u32 len;

	if (skb->protocol != htons(ETH_P_IP))
		return skb;

	netoff = skb_network_offset(skb);

	/* Work on a private copy of the header: nothing in the skb is
	 * modified until we know it is a fragment we are going to take.
	 */
	if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
		return skb;

	if (iph.ihl < 5 || iph.version != 4)
		return skb;

	len = ntohs(iph.tot_len);
	if (skb->len < netoff + len || len < (iph.ihl * 4))
		return skb;

	if (ip_is_fragment(&iph)) {
		skb = skb_share_check(skb, GFP_ATOMIC);
		if (skb) {
			/* Make the full header (with options) linear. */
			if (!pskb_may_pull(skb, netoff + iph.ihl * 4))
				return skb;
			/* Drop anything beyond the datagram's tot_len. */
			if (pskb_trim_rcsum(skb, netoff + len))
				return skb;
			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
			if (ip_defrag(skb, user))
				return NULL;
			skb->rxhash = 0;
		}
	}
	return skb;
}
EXPORT_SYMBOL(ip_check_defrag);

744 745 746
#ifdef CONFIG_SYSCTL
/* Lower bound (extra1) for the ipfrag_max_dist sysctl. */
static int zero;

747
static struct ctl_table ip4_frags_ns_ctl_table[] = {
748 749
	{
		.procname	= "ipfrag_high_thresh",
750
		.data		= &init_net.ipv4.frags.high_thresh,
751 752
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
753
		.proc_handler	= proc_dointvec
754 755 756
	},
	{
		.procname	= "ipfrag_low_thresh",
757
		.data		= &init_net.ipv4.frags.low_thresh,
758 759
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
760
		.proc_handler	= proc_dointvec
761 762 763
	},
	{
		.procname	= "ipfrag_time",
764
		.data		= &init_net.ipv4.frags.timeout,
765 766
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
767
		.proc_handler	= proc_dointvec_jiffies,
768
	},
769 770 771 772
	{ }
};

static struct ctl_table ip4_frags_ctl_table[] = {
773 774
	{
		.procname	= "ipfrag_secret_interval",
775
		.data		= &ip4_frags.secret_interval,
776 777
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
778
		.proc_handler	= proc_dointvec_jiffies,
779 780 781 782 783 784
	},
	{
		.procname	= "ipfrag_max_dist",
		.data		= &sysctl_ipfrag_max_dist,
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
785
		.proc_handler	= proc_dointvec_minmax,
786 787 788 789 790
		.extra1		= &zero
	},
	{ }
};

791
/* Register the per-namespace fragment sysctls under "net/ipv4".
 *
 * init_net uses ip4_frags_ns_ctl_table directly; every other namespace
 * gets a kmemdup()'d copy whose .data pointers are redirected to its own
 * netns_frags fields.  Returns 0 on success or -ENOMEM when either the
 * copy or the registration fails (the copy is freed in the latter case).
 */
static int __net_init ip4_frags_ns_ctl_register(struct net *net)
{
	struct ctl_table *table;
	struct ctl_table_header *hdr;

	table = ip4_frags_ns_ctl_table;
	if (!net_eq(net, &init_net)) {
		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
		if (table == NULL)
			goto err_alloc;

		/* Indices follow the entry order of ip4_frags_ns_ctl_table. */
		table[0].data = &net->ipv4.frags.high_thresh;
		table[1].data = &net->ipv4.frags.low_thresh;
		table[2].data = &net->ipv4.frags.timeout;
	}

	hdr = register_net_sysctl(net, "net/ipv4", table);
	if (hdr == NULL)
		goto err_reg;

	net->ipv4.frags_hdr = hdr;
	return 0;

err_reg:
	if (!net_eq(net, &init_net))
		kfree(table);
err_alloc:
	return -ENOMEM;
}

821
/* Unregister @net's fragment sysctls and release the table copy.
 *
 * Only namespaces other than init_net own a kmemdup()'d table; init_net
 * is registered with the static ip4_frags_ns_ctl_table, which must never
 * be passed to kfree().
 */
static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
{
	struct ctl_table *table;

	/* Fetch the table pointer before the header goes away. */
	table = net->ipv4.frags_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.frags_hdr);
	if (!net_eq(net, &init_net))
		kfree(table);
}
829 830 831

static void ip4_frags_ctl_register(void)
{
832
	register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
833
}
834
#else
835
/* !CONFIG_SYSCTL stub: nothing to register, always succeeds. */
static inline int ip4_frags_ns_ctl_register(struct net *net)
{
	return 0;
}
839

840
/* !CONFIG_SYSCTL stub: nothing to unregister. */
static inline void ip4_frags_ns_ctl_unregister(struct net *net)
{
}
843 844 845 846

/* !CONFIG_SYSCTL stub: no sysctl entries to register. */
static inline void ip4_frags_ctl_register(void)
{
}
847 848
#endif

849
/* pernet .init: set the default fragment limits and timeout for @net,
 * initialise its netns_frags state and register its sysctls.  Returns the
 * result of ip4_frags_ns_ctl_register().
 */
static int __net_init ipv4_frags_init_net(struct net *net)
{
	/*
	 * Fragment cache limits. We will commit 256K at one time. Should we
	 * cross that limit we will prune down to 192K. This should cope with
	 * even the most extreme cases without allowing an attacker to
	 * measurably harm machine performance.
	 */
	net->ipv4.frags.high_thresh = 256 * 1024;
	net->ipv4.frags.low_thresh = 192 * 1024;
	/*
	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
	 * RFC791 is wrong proposing to prolongate timer each fragment arrival
	 * by TTL.
	 */
	net->ipv4.frags.timeout = IP_FRAG_TIME;

	inet_frags_init_net(&net->ipv4.frags);

	return ip4_frags_ns_ctl_register(net);
}

871
/* pernet .exit: unregister @net's sysctls, then tear down its fragment
 * state via inet_frags_exit_net().
 */
static void __net_exit ipv4_frags_exit_net(struct net *net)
{
	ip4_frags_ns_ctl_unregister(net);
	inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
}

/* Per-namespace init/exit hooks, registered from ipfrag_init(). */
static struct pernet_operations ip4_frags_ops = {
	.init = ipv4_frags_init_net,
	.exit = ipv4_frags_exit_net,
};

882
/* Boot-time setup of IPv4 reassembly: register the global sysctls and the
 * per-namespace operations, then fill in the ip4_frags callbacks and
 * parameters and hand the descriptor to inet_frags_init().
 *
 * The return values of the registration calls are not checked.
 */
void __init ipfrag_init(void)
{
	ip4_frags_ctl_register();
	register_pernet_subsys(&ip4_frags_ops);
	ip4_frags.hashfn = ip4_hashfn;
	ip4_frags.constructor = ip4_frag_init;
	ip4_frags.destructor = ip4_frag_free;
	ip4_frags.skb_free = NULL;
	ip4_frags.qsize = sizeof(struct ipq);
	ip4_frags.match = ip4_frag_match;
	ip4_frags.frag_expire = ip_expire;
	ip4_frags.secret_interval = 10 * 60 * HZ;
	inet_frags_init(&ip4_frags);
}