ip_fragment.c 17.8 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Version:	$Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <Alan.Cox@linux.org>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima          :       ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 *		Alexey Kuznetsov:	SMP races, threading, cleanup.
 *		Patrick McHardy :	LRU queue of frag heads for evictor.
 */

H
Herbert Xu 已提交
25
#include <linux/compiler.h>
L
Linus Torvalds 已提交
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
H
Herbert Xu 已提交
41
#include <net/inetpeer.h>
42
#include <net/inet_frag.h>
L
Linus Torvalds 已提交
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>

/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
 * as well. Or notify me, at least. --ANK
 */

/* Fragment cache limits. We will commit 256K at one time. Should we
 * cross that limit we will prune down to 192K. This should cope with
 * even the most extreme cases without allowing an attacker to measurably
 * harm machine performance.
 *
 * ip_defrag() calls ip_evictor() once ip_frag_mem exceeds the high
 * threshold; ip_evictor() then frees queues until ip_frag_mem is back
 * at or below the low threshold.
 */
int sysctl_ipfrag_high_thresh __read_mostly = 256*1024;
int sysctl_ipfrag_low_thresh __read_mostly = 192*1024;

/* Largest advance of the per-peer counter (peer->rid) tolerated between
 * two fragments of one queue before ip_frag_too_far() reports the queue
 * as stale.  A value of 0 disables the check (no inet_peer is looked up).
 */
int sysctl_ipfrag_max_dist __read_mostly = 64;

/* Important NOTE! Fragment queue must be destroyed before MSL expires.
 * RFC791 is wrong in proposing to prolong the timer by the TTL on each
 * fragment arrival.
 *
 * Lifetime of a fragment queue, in jiffies, as armed by ip_frag_intern()
 * and ip_frag_reinit().
 */
int sysctl_ipfrag_time __read_mostly = IP_FRAG_TIME;
L
Linus Torvalds 已提交
67 68 69 70 71 72 73 74 75 76 77

/* Layout this file imposes on skb->cb while a fragment is queued. */
struct ipfrag_skb_cb {
	/* NOTE(review): presumably overlays what IPCB(skb) reads — confirm. */
	struct inet_skb_parm	h;
	/* Byte offset of this fragment's payload within the datagram. */
	int			offset;
};

/* View the control block of @skb as a struct ipfrag_skb_cb. */
#define FRAG_CB(skb)	((struct ipfrag_skb_cb *)((skb)->cb))

/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
78 79
	struct inet_frag_queue q;

L
Linus Torvalds 已提交
80
	u32		user;
81 82 83
	__be32		saddr;
	__be32		daddr;
	__be16		id;
L
Linus Torvalds 已提交
84
	u8		protocol;
H
Herbert Xu 已提交
85 86 87
	int             iif;
	unsigned int    rid;
	struct inet_peer *peer;
L
Linus Torvalds 已提交
88 89 90 91 92 93 94
};

/* Hash table. */

#define IPQ_HASHSZ	64

/* Per-bucket lock is easy to add now. */
95
static struct hlist_head ipq_hash[IPQ_HASHSZ];
L
Linus Torvalds 已提交
96 97 98 99 100
static DEFINE_RWLOCK(ipfrag_lock);
static u32 ipfrag_hash_rnd;
static LIST_HEAD(ipq_lru_list);
int ip_frag_nqueues = 0;

101 102 103
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev);

L
Linus Torvalds 已提交
104 105
static __inline__ void __ipq_unlink(struct ipq *qp)
{
106 107
	hlist_del(&qp->q.list);
	list_del(&qp->q.lru_list);
L
Linus Torvalds 已提交
108 109 110 111 112 113 114 115 116 117
	ip_frag_nqueues--;
}

/* Locked wrapper around __ipq_unlink(): detaches @ipq from the hash
 * table and LRU list while holding ipfrag_lock for writing.
 */
static inline void ipq_unlink(struct ipq *ipq)
{
	write_lock(&ipfrag_lock);

	__ipq_unlink(ipq);

	write_unlock(&ipfrag_lock);
}

118
/* Map a fragment key to a bucket index in ipq_hash.
 *
 * The id and protocol are packed into one word and hashed together with
 * both addresses, seeded with ipfrag_hash_rnd.  The result is masked to
 * the range [0, IPQ_HASHSZ).  Note that 'user' is not part of the hash;
 * it is only compared during the chain walk.
 */
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
	return jhash_3words((__force u32)id << 16 | prot,
			    (__force u32)saddr, (__force u32)daddr,
			    ipfrag_hash_rnd) & (IPQ_HASHSZ - 1);
}

static struct timer_list ipfrag_secret_timer;
126
int sysctl_ipfrag_secret_interval __read_mostly = 10 * 60 * HZ;
L
Linus Torvalds 已提交
127 128 129 130 131 132 133 134 135 136

static void ipfrag_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;
	int i;

	write_lock(&ipfrag_lock);
	get_random_bytes(&ipfrag_hash_rnd, sizeof(u32));
	for (i = 0; i < IPQ_HASHSZ; i++) {
		struct ipq *q;
137
		struct hlist_node *p, *n;
L
Linus Torvalds 已提交
138

139
		hlist_for_each_entry_safe(q, p, n, &ipq_hash[i], q.list) {
L
Linus Torvalds 已提交
140 141 142 143
			unsigned int hval = ipqhashfn(q->id, q->saddr,
						      q->daddr, q->protocol);

			if (hval != i) {
144
				hlist_del(&q->q.list);
L
Linus Torvalds 已提交
145 146

				/* Relink to new hash chain. */
147
				hlist_add_head(&q->q.list, &ipq_hash[hval]);
L
Linus Torvalds 已提交
148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
			}
		}
	}
	write_unlock(&ipfrag_lock);

	mod_timer(&ipfrag_secret_timer, now + sysctl_ipfrag_secret_interval);
}

/* Running total, in bytes, charged to the fragment cache: skb truesize
 * for every queued fragment plus sizeof(struct ipq) for every queue.
 */
atomic_t ip_frag_mem = ATOMIC_INIT(0);	/* Memory used for fragments */

/* Memory Tracking Functions. */
static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work)
{
	if (work)
		*work -= skb->truesize;
	atomic_sub(skb->truesize, &ip_frag_mem);
	kfree_skb(skb);
}

static __inline__ void frag_free_queue(struct ipq *qp, int *work)
{
	if (work)
		*work -= sizeof(struct ipq);
	atomic_sub(sizeof(struct ipq), &ip_frag_mem);
	kfree(qp);
}

static __inline__ struct ipq *frag_alloc_queue(void)
{
	struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC);

S
Stephen Hemminger 已提交
179
	if (!qp)
L
Linus Torvalds 已提交
180 181 182 183 184 185 186 187 188 189 190 191 192
		return NULL;
	atomic_add(sizeof(struct ipq), &ip_frag_mem);
	return qp;
}


/* Destruction primitives. */

/* Complete destruction of ipq. */
static void ip_frag_destroy(struct ipq *qp, int *work)
{
	struct sk_buff *fp;

193 194
	BUG_TRAP(qp->q.last_in&COMPLETE);
	BUG_TRAP(del_timer(&qp->q.timer) == 0);
L
Linus Torvalds 已提交
195

H
Herbert Xu 已提交
196 197 198
	if (qp->peer)
		inet_putpeer(qp->peer);

L
Linus Torvalds 已提交
199
	/* Release all fragment data. */
200
	fp = qp->q.fragments;
L
Linus Torvalds 已提交
201 202 203 204 205 206 207 208 209 210 211 212 213
	while (fp) {
		struct sk_buff *xp = fp->next;

		frag_kfree_skb(fp, work);
		fp = xp;
	}

	/* Finally, release the queue descriptor itself. */
	frag_free_queue(qp, work);
}

static __inline__ void ipq_put(struct ipq *ipq, int *work)
{
214
	if (atomic_dec_and_test(&ipq->q.refcnt))
L
Linus Torvalds 已提交
215 216 217 218 219 220 221 222
		ip_frag_destroy(ipq, work);
}

/* Kill ipq entry. It is not destroyed immediately,
 * because caller (and someone more) holds reference count.
 */
static void ipq_kill(struct ipq *ipq)
{
223 224
	if (del_timer(&ipq->q.timer))
		atomic_dec(&ipq->q.refcnt);
L
Linus Torvalds 已提交
225

226
	if (!(ipq->q.last_in & COMPLETE)) {
L
Linus Torvalds 已提交
227
		ipq_unlink(ipq);
228 229
		atomic_dec(&ipq->q.refcnt);
		ipq->q.last_in |= COMPLETE;
L
Linus Torvalds 已提交
230 231 232
	}
}

233
/* Memory limiting on fragments.  Evictor trashes the oldest
L
Linus Torvalds 已提交
234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
 * fragment queue until we are back under the threshold.
 */
static void ip_evictor(void)
{
	struct ipq *qp;
	struct list_head *tmp;
	int work;

	work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh;
	if (work <= 0)
		return;

	while (work > 0) {
		read_lock(&ipfrag_lock);
		if (list_empty(&ipq_lru_list)) {
			read_unlock(&ipfrag_lock);
			return;
		}
		tmp = ipq_lru_list.next;
253 254
		qp = list_entry(tmp, struct ipq, q.lru_list);
		atomic_inc(&qp->q.refcnt);
L
Linus Torvalds 已提交
255 256
		read_unlock(&ipfrag_lock);

257 258
		spin_lock(&qp->q.lock);
		if (!(qp->q.last_in&COMPLETE))
L
Linus Torvalds 已提交
259
			ipq_kill(qp);
260
		spin_unlock(&qp->q.lock);
L
Linus Torvalds 已提交
261 262 263 264 265 266 267 268 269 270 271 272 273

		ipq_put(qp, &work);
		IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
	}
}

/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 */
static void ip_expire(unsigned long arg)
{
	struct ipq *qp = (struct ipq *) arg;

274
	spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
275

276
	if (qp->q.last_in & COMPLETE)
L
Linus Torvalds 已提交
277 278 279 280 281 282 283
		goto out;

	ipq_kill(qp);

	IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT);
	IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);

284 285
	if ((qp->q.last_in&FIRST_IN) && qp->q.fragments != NULL) {
		struct sk_buff *head = qp->q.fragments;
L
Linus Torvalds 已提交
286
		/* Send an ICMP "Fragment Reassembly Timeout" message. */
287
		if ((head->dev = dev_get_by_index(&init_net, qp->iif)) != NULL) {
L
Linus Torvalds 已提交
288 289 290 291 292
			icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
			dev_put(head->dev);
		}
	}
out:
293
	spin_unlock(&qp->q.lock);
L
Linus Torvalds 已提交
294 295 296 297 298
	ipq_put(qp, NULL);
}

/* Creation primitives. */

299
static struct ipq *ip_frag_intern(struct ipq *qp_in)
L
Linus Torvalds 已提交
300 301
{
	struct ipq *qp;
302 303 304
#ifdef CONFIG_SMP
	struct hlist_node *n;
#endif
305 306
	unsigned int hash;

L
Linus Torvalds 已提交
307
	write_lock(&ipfrag_lock);
308 309
	hash = ipqhashfn(qp_in->id, qp_in->saddr, qp_in->daddr,
			 qp_in->protocol);
L
Linus Torvalds 已提交
310 311 312 313 314
#ifdef CONFIG_SMP
	/* With SMP race we have to recheck hash table, because
	 * such entry could be created on other cpu, while we
	 * promoted read lock to write lock.
	 */
315
	hlist_for_each_entry(qp, n, &ipq_hash[hash], q.list) {
S
Stephen Hemminger 已提交
316 317 318 319 320
		if (qp->id == qp_in->id		&&
		    qp->saddr == qp_in->saddr	&&
		    qp->daddr == qp_in->daddr	&&
		    qp->protocol == qp_in->protocol &&
		    qp->user == qp_in->user) {
321
			atomic_inc(&qp->q.refcnt);
L
Linus Torvalds 已提交
322
			write_unlock(&ipfrag_lock);
323
			qp_in->q.last_in |= COMPLETE;
L
Linus Torvalds 已提交
324 325 326 327 328 329 330
			ipq_put(qp_in, NULL);
			return qp;
		}
	}
#endif
	qp = qp_in;

331 332
	if (!mod_timer(&qp->q.timer, jiffies + sysctl_ipfrag_time))
		atomic_inc(&qp->q.refcnt);
L
Linus Torvalds 已提交
333

334 335 336 337
	atomic_inc(&qp->q.refcnt);
	hlist_add_head(&qp->q.list, &ipq_hash[hash]);
	INIT_LIST_HEAD(&qp->q.lru_list);
	list_add_tail(&qp->q.lru_list, &ipq_lru_list);
L
Linus Torvalds 已提交
338 339 340 341 342 343
	ip_frag_nqueues++;
	write_unlock(&ipfrag_lock);
	return qp;
}

/* Add an entry to the 'ipq' queue for a newly received IP datagram. */
344
static struct ipq *ip_frag_create(struct iphdr *iph, u32 user)
L
Linus Torvalds 已提交
345 346 347 348 349 350 351
{
	struct ipq *qp;

	if ((qp = frag_alloc_queue()) == NULL)
		goto out_nomem;

	qp->protocol = iph->protocol;
352
	qp->q.last_in = 0;
L
Linus Torvalds 已提交
353 354 355 356
	qp->id = iph->id;
	qp->saddr = iph->saddr;
	qp->daddr = iph->daddr;
	qp->user = user;
357 358 359
	qp->q.len = 0;
	qp->q.meat = 0;
	qp->q.fragments = NULL;
L
Linus Torvalds 已提交
360
	qp->iif = 0;
H
Herbert Xu 已提交
361
	qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;
L
Linus Torvalds 已提交
362 363

	/* Initialize a timer for this entry. */
364 365 366 367 368
	init_timer(&qp->q.timer);
	qp->q.timer.data = (unsigned long) qp;	/* pointer to queue	*/
	qp->q.timer.function = ip_expire;		/* expire function	*/
	spin_lock_init(&qp->q.lock);
	atomic_set(&qp->q.refcnt, 1);
L
Linus Torvalds 已提交
369

370
	return ip_frag_intern(qp);
L
Linus Torvalds 已提交
371 372

out_nomem:
373
	LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
L
Linus Torvalds 已提交
374 375 376 377 378 379 380 381
	return NULL;
}

/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 *
 * @iph:  IP header of the incoming fragment.
 * @user: caller-supplied id; must match for a queue to be reused.
 *
 * Returns a queue with a reference taken for the caller, or NULL if no
 * queue existed and a new one could not be allocated.
 */
static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
{
	__be16 id = iph->id;
	__be32 saddr = iph->saddr;
	__be32 daddr = iph->daddr;
	__u8 protocol = iph->protocol;
	unsigned int hash;
	struct ipq *qp;
	struct hlist_node *n;

	read_lock(&ipfrag_lock);
	hash = ipqhashfn(id, saddr, daddr, protocol);
	hlist_for_each_entry(qp, n, &ipq_hash[hash], q.list) {
		if (qp->id == id		&&
		    qp->saddr == saddr	&&
		    qp->daddr == daddr	&&
		    qp->protocol == protocol &&
		    qp->user == user) {
			atomic_inc(&qp->q.refcnt);
			read_unlock(&ipfrag_lock);
			return qp;
		}
	}
	read_unlock(&ipfrag_lock);

	/* The read lock is dropped here; ip_frag_intern() rechecks on SMP. */
	return ip_frag_create(iph, user);
}

H
Herbert Xu 已提交
408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423
/* Is the fragment too far ahead to be part of ipq? */
static inline int ip_frag_too_far(struct ipq *qp)
{
	struct inet_peer *peer = qp->peer;
	unsigned int max = sysctl_ipfrag_max_dist;
	unsigned int start, end;

	int rc;

	if (!peer || !max)
		return 0;

	start = qp->rid;
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

424
	rc = qp->q.fragments && (end - start) > max;
H
Herbert Xu 已提交
425 426 427 428 429 430 431 432 433 434 435 436

	if (rc) {
		IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
	}

	return rc;
}

static int ip_frag_reinit(struct ipq *qp)
{
	struct sk_buff *fp;

437 438
	if (!mod_timer(&qp->q.timer, jiffies + sysctl_ipfrag_time)) {
		atomic_inc(&qp->q.refcnt);
H
Herbert Xu 已提交
439 440 441
		return -ETIMEDOUT;
	}

442
	fp = qp->q.fragments;
H
Herbert Xu 已提交
443 444 445 446 447 448
	do {
		struct sk_buff *xp = fp->next;
		frag_kfree_skb(fp, NULL);
		fp = xp;
	} while (fp);

449 450 451 452
	qp->q.last_in = 0;
	qp->q.len = 0;
	qp->q.meat = 0;
	qp->q.fragments = NULL;
H
Herbert Xu 已提交
453 454 455 456 457
	qp->iif = 0;

	return 0;
}

L
Linus Torvalds 已提交
458
/* Add new segment to existing queue. */
459
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
L
Linus Torvalds 已提交
460 461
{
	struct sk_buff *prev, *next;
462
	struct net_device *dev;
L
Linus Torvalds 已提交
463 464
	int flags, offset;
	int ihl, end;
465
	int err = -ENOENT;
L
Linus Torvalds 已提交
466

467
	if (qp->q.last_in & COMPLETE)
L
Linus Torvalds 已提交
468 469
		goto err;

H
Herbert Xu 已提交
470
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
471 472
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
H
Herbert Xu 已提交
473 474 475 476
		ipq_kill(qp);
		goto err;
	}

477
	offset = ntohs(ip_hdr(skb)->frag_off);
L
Linus Torvalds 已提交
478 479 480
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
481
	ihl = ip_hdrlen(skb);
L
Linus Torvalds 已提交
482 483

	/* Determine the position of this fragment. */
484
	end = offset + skb->len - ihl;
485
	err = -EINVAL;
L
Linus Torvalds 已提交
486 487 488 489 490 491

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
		 * or have different end, the segment is corrrupted.
		 */
492 493
		if (end < qp->q.len ||
		    ((qp->q.last_in & LAST_IN) && end != qp->q.len))
L
Linus Torvalds 已提交
494
			goto err;
495 496
		qp->q.last_in |= LAST_IN;
		qp->q.len = end;
L
Linus Torvalds 已提交
497 498 499 500 501 502
	} else {
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
503
		if (end > qp->q.len) {
L
Linus Torvalds 已提交
504
			/* Some bits beyond end -> corruption. */
505
			if (qp->q.last_in & LAST_IN)
L
Linus Torvalds 已提交
506
				goto err;
507
			qp->q.len = end;
L
Linus Torvalds 已提交
508 509 510 511 512
		}
	}
	if (end == offset)
		goto err;

513
	err = -ENOMEM;
L
Linus Torvalds 已提交
514 515
	if (pskb_pull(skb, ihl) == NULL)
		goto err;
516 517 518

	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
L
Linus Torvalds 已提交
519 520 521 522 523 524 525
		goto err;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 */
	prev = NULL;
526
	for (next = qp->q.fragments; next != NULL; next = next->next) {
L
Linus Torvalds 已提交
527 528 529 530 531 532 533 534 535 536 537 538 539 540
		if (FRAG_CB(next)->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}

	/* We found where to put this one.  Check for overlap with
	 * preceding fragment, and, if needed, align things so that
	 * any overlaps are eliminated.
	 */
	if (prev) {
		int i = (FRAG_CB(prev)->offset + prev->len) - offset;

		if (i > 0) {
			offset += i;
541
			err = -EINVAL;
L
Linus Torvalds 已提交
542 543
			if (end <= offset)
				goto err;
544
			err = -ENOMEM;
L
Linus Torvalds 已提交
545 546 547 548 549 550 551
			if (!pskb_pull(skb, i))
				goto err;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
	}

552 553
	err = -ENOMEM;

L
Linus Torvalds 已提交
554 555 556 557 558 559 560 561 562 563
	while (next && FRAG_CB(next)->offset < end) {
		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */

		if (i < next->len) {
			/* Eat head of the next overlapped fragment
			 * and leave the loop. The next ones cannot overlap.
			 */
			if (!pskb_pull(next, i))
				goto err;
			FRAG_CB(next)->offset += i;
564
			qp->q.meat -= i;
L
Linus Torvalds 已提交
565 566 567 568 569 570
			if (next->ip_summed != CHECKSUM_UNNECESSARY)
				next->ip_summed = CHECKSUM_NONE;
			break;
		} else {
			struct sk_buff *free_it = next;

571
			/* Old fragment is completely overridden with
L
Linus Torvalds 已提交
572 573 574 575 576 577 578
			 * new one drop it.
			 */
			next = next->next;

			if (prev)
				prev->next = next;
			else
579
				qp->q.fragments = next;
L
Linus Torvalds 已提交
580

581
			qp->q.meat -= free_it->len;
L
Linus Torvalds 已提交
582 583 584 585 586 587 588 589 590 591 592
			frag_kfree_skb(free_it, NULL);
		}
	}

	FRAG_CB(skb)->offset = offset;

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
	if (prev)
		prev->next = skb;
	else
593
		qp->q.fragments = skb;
L
Linus Torvalds 已提交
594

595 596 597 598 599
	dev = skb->dev;
	if (dev) {
		qp->iif = dev->ifindex;
		skb->dev = NULL;
	}
600 601
	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
L
Linus Torvalds 已提交
602 603
	atomic_add(skb->truesize, &ip_frag_mem);
	if (offset == 0)
604
		qp->q.last_in |= FIRST_IN;
L
Linus Torvalds 已提交
605

606
	if (qp->q.last_in == (FIRST_IN | LAST_IN) && qp->q.meat == qp->q.len)
607 608
		return ip_frag_reasm(qp, prev, dev);

L
Linus Torvalds 已提交
609
	write_lock(&ipfrag_lock);
610
	list_move_tail(&qp->q.lru_list, &ipq_lru_list);
L
Linus Torvalds 已提交
611
	write_unlock(&ipfrag_lock);
612
	return -EINPROGRESS;
L
Linus Torvalds 已提交
613 614 615

err:
	kfree_skb(skb);
616
	return err;
L
Linus Torvalds 已提交
617 618 619 620 621
}


/* Build a new IP datagram from all its fragments. */

622 623
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev)
L
Linus Torvalds 已提交
624 625
{
	struct iphdr *iph;
626
	struct sk_buff *fp, *head = qp->q.fragments;
L
Linus Torvalds 已提交
627 628
	int len;
	int ihlen;
629
	int err;
L
Linus Torvalds 已提交
630 631 632

	ipq_kill(qp);

633 634 635 636 637 638 639 640 641 642 643
	/* Make the one we just received the head. */
	if (prev) {
		head = prev->next;
		fp = skb_clone(head, GFP_ATOMIC);

		if (!fp)
			goto out_nomem;

		fp->next = head->next;
		prev->next = fp;

644 645
		skb_morph(head, qp->q.fragments);
		head->next = qp->q.fragments->next;
646

647 648
		kfree_skb(qp->q.fragments);
		qp->q.fragments = head;
649 650
	}

L
Linus Torvalds 已提交
651 652 653 654
	BUG_TRAP(head != NULL);
	BUG_TRAP(FRAG_CB(head)->offset == 0);

	/* Allocate a new buffer for the datagram. */
655
	ihlen = ip_hdrlen(head);
656
	len = ihlen + qp->q.len;
L
Linus Torvalds 已提交
657

658
	err = -E2BIG;
S
Stephen Hemminger 已提交
659
	if (len > 65535)
L
Linus Torvalds 已提交
660 661 662
		goto out_oversize;

	/* Head of list must not be cloned. */
663
	err = -ENOMEM;
L
Linus Torvalds 已提交
664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690
	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
		goto out_nomem;

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments. */
	if (skb_shinfo(head)->frag_list) {
		struct sk_buff *clone;
		int i, plen = 0;

		if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
			goto out_nomem;
		clone->next = head->next;
		head->next = clone;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_shinfo(head)->frag_list = NULL;
		for (i=0; i<skb_shinfo(head)->nr_frags; i++)
			plen += skb_shinfo(head)->frags[i].size;
		clone->len = clone->data_len = head->data_len - plen;
		head->data_len -= clone->len;
		head->len -= clone->len;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
		atomic_add(clone->truesize, &ip_frag_mem);
	}

	skb_shinfo(head)->frag_list = head->next;
691
	skb_push(head, head->data - skb_network_header(head));
L
Linus Torvalds 已提交
692 693 694 695 696 697 698
	atomic_sub(head->truesize, &ip_frag_mem);

	for (fp=head->next; fp; fp = fp->next) {
		head->data_len += fp->len;
		head->len += fp->len;
		if (head->ip_summed != fp->ip_summed)
			head->ip_summed = CHECKSUM_NONE;
699
		else if (head->ip_summed == CHECKSUM_COMPLETE)
L
Linus Torvalds 已提交
700 701 702 703 704 705 706
			head->csum = csum_add(head->csum, fp->csum);
		head->truesize += fp->truesize;
		atomic_sub(fp->truesize, &ip_frag_mem);
	}

	head->next = NULL;
	head->dev = dev;
707
	head->tstamp = qp->q.stamp;
L
Linus Torvalds 已提交
708

709
	iph = ip_hdr(head);
L
Linus Torvalds 已提交
710 711 712
	iph->frag_off = 0;
	iph->tot_len = htons(len);
	IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
713
	qp->q.fragments = NULL;
714
	return 0;
L
Linus Torvalds 已提交
715 716

out_nomem:
717
	LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
718
			      "queue %p\n", qp);
L
Linus Torvalds 已提交
719 720 721 722 723 724 725 726
	goto out_fail;
out_oversize:
	if (net_ratelimit())
		printk(KERN_INFO
			"Oversized IP packet from %d.%d.%d.%d.\n",
			NIPQUAD(qp->saddr));
out_fail:
	IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
727
	return err;
L
Linus Torvalds 已提交
728 729 730
}

/* Process an incoming IP datagram fragment. */
731
int ip_defrag(struct sk_buff *skb, u32 user)
L
Linus Torvalds 已提交
732 733
{
	struct ipq *qp;
734

L
Linus Torvalds 已提交
735 736 737 738 739 740 741
	IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS);

	/* Start by cleaning up the memory. */
	if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
		ip_evictor();

	/* Lookup (or create) queue header */
742
	if ((qp = ip_find(ip_hdr(skb), user)) != NULL) {
743
		int ret;
L
Linus Torvalds 已提交
744

745
		spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
746

747
		ret = ip_frag_queue(qp, skb);
L
Linus Torvalds 已提交
748

749
		spin_unlock(&qp->q.lock);
L
Linus Torvalds 已提交
750
		ipq_put(qp, NULL);
751
		return ret;
L
Linus Torvalds 已提交
752 753 754 755
	}

	IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
	kfree_skb(skb);
756
	return -ENOMEM;
L
Linus Torvalds 已提交
757 758
}

759
/* One-time setup: seed the hash function from num_physpages and jiffies
 * and start the timer that periodically replaces the seed.
 */
void __init ipfrag_init(void)
{
	ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
				 (jiffies ^ (jiffies >> 6)));

	init_timer(&ipfrag_secret_timer);
	ipfrag_secret_timer.function = ipfrag_secret_rebuild;
	ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval;
	add_timer(&ipfrag_secret_timer);
}

EXPORT_SYMBOL(ip_defrag);