ip_fragment.c 20.8 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima          :       ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 *		Alexey Kuznetsov:	SMP races, threading, cleanup.
 *		Patrick McHardy :	LRU queue of frag heads for evictor.
 */

24 25
#define pr_fmt(fmt) "IPv4: " fmt

H
Herbert Xu 已提交
26
#include <linux/compiler.h>
L
Linus Torvalds 已提交
27 28 29 30 31 32 33 34 35 36 37
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
38
#include <linux/slab.h>
39 40
#include <net/route.h>
#include <net/dst.h>
L
Linus Torvalds 已提交
41 42 43 44
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
H
Herbert Xu 已提交
45
#include <net/inetpeer.h>
46
#include <net/inet_frag.h>
L
Linus Torvalds 已提交
47 48 49 50
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
51
#include <net/inet_ecn.h>
52
#include <net/l3mdev.h>
L
Linus Torvalds 已提交
53 54 55 56 57

/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
 * as well. Or notify me, at least. --ANK
 */
58
/* Cache name stored into ip4_frags.frags_cache_name by ipfrag_init(). */
static const char ip_frag_cache_name[] = "ip4-frags";
H
Herbert Xu 已提交
59

L
Linus Torvalds 已提交
60 61
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
62 63
	struct inet_frag_queue q;

64
	u8		ecn; /* RFC3168 support */
65
	u16		max_df_size; /* largest frag with DF set seen */
H
Herbert Xu 已提交
66 67 68
	int             iif;
	unsigned int    rid;
	struct inet_peer *peer;
L
Linus Torvalds 已提交
69 70
};

71
/*
 * Turn the ECN bits of @tos into a single-bit mask (1 << codepoint) so the
 * codepoints of several fragments can be OR-ed together in ipq.ecn.
 */
static u8 ip4_frag_ecn(u8 tos)
{
	u8 codepoint = tos & INET_ECN_MASK;

	return 1 << codepoint;
}

76
/* IPv4 fragment-queue descriptor; filled in by ipfrag_init(). */
static struct inet_frags ip4_frags;
L
Linus Torvalds 已提交
77

78 79 80
/* Forward declaration: defined below, called from ip_frag_queue(). */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev);

81

82
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
83 84
{
	struct ipq *qp = container_of(q, struct ipq, q);
85 86 87 88
	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
					       frags);
	struct net *net = container_of(ipv4, struct net, ipv4);

89
	const struct frag_v4_compare_key *key = a;
90

91 92
	q->key.v4 = *key;
	qp->ecn = 0;
93
	qp->peer = q->net->max_dist ?
94
		inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
95
		NULL;
96 97
}

98
static void ip4_frag_free(struct inet_frag_queue *q)
L
Linus Torvalds 已提交
99
{
100 101 102 103 104
	struct ipq *qp;

	qp = container_of(q, struct ipq, q);
	if (qp->peer)
		inet_putpeer(qp->peer);
L
Linus Torvalds 已提交
105 106 107 108 109
}


/* Destruction primitives. */

110
static void ipq_put(struct ipq *ipq)
L
Linus Torvalds 已提交
111
{
112
	inet_frag_put(&ipq->q);
L
Linus Torvalds 已提交
113 114 115 116 117 118 119
}

/* Kill ipq entry. It is not destroyed immediately,
 * because caller (and someone more) holds reference count.
 */
static void ipq_kill(struct ipq *ipq)
{
120
	inet_frag_kill(&ipq->q);
L
Linus Torvalds 已提交
121 122
}

123 124 125 126
/*
 * Tell whether @user is a defrag user for which ip_expire() may skip the
 * ICMP reply: AF_PACKET, or anything in the conntrack-in or
 * conntrack-bridge-in ranges.
 */
static bool frag_expire_skip_icmp(u32 user)
{
	if (user == IP_DEFRAG_AF_PACKET)
		return true;

	if (ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
				      __IP_DEFRAG_CONNTRACK_IN_END))
		return true;

	return ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN,
					 __IP_DEFRAG_CONNTRACK_BRIDGE_IN);
}

L
Linus Torvalds 已提交
132 133 134
/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 */
135
static void ip_expire(struct timer_list *t)
L
Linus Torvalds 已提交
136
{
137
	struct inet_frag_queue *frag = from_timer(frag, t, timer);
138
	const struct iphdr *iph;
139
	struct sk_buff *head;
140
	struct net *net;
141 142
	struct ipq *qp;
	int err;
143

144
	qp = container_of(frag, struct ipq, q);
145
	net = container_of(qp->q.net, struct net, ipv4.frags);
L
Linus Torvalds 已提交
146

147
	rcu_read_lock();
148
	spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
149

150
	if (qp->q.flags & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
151 152 153
		goto out;

	ipq_kill(qp);
E
Eric Dumazet 已提交
154
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
155

156
	head = qp->q.fragments;
157

158
	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
159

160 161
	if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
		goto out;
162

163 164 165
	head->dev = dev_get_by_index_rcu(net, qp->iif);
	if (!head->dev)
		goto out;
166

167

168 169 170
	/* skb has no dst, perform route lookup again */
	iph = ip_hdr(head);
	err = ip_route_input_noref(head, iph->daddr, iph->saddr,
171
					   iph->tos, head->dev);
172 173 174 175 176 177 178 179 180 181
	if (err)
		goto out;

	/* Only an end host needs to send an ICMP
	 * "Fragment Reassembly Timeout" message, per RFC792.
	 */
	if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
	    (skb_rtable(head)->rt_type != RTN_LOCAL))
		goto out;

182 183 184 185 186
	skb_get(head);
	spin_unlock(&qp->q.lock);
	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
	kfree_skb(head);
	goto out_rcu_unlock;
187

L
Linus Torvalds 已提交
188
out:
189
	spin_unlock(&qp->q.lock);
190 191
out_rcu_unlock:
	rcu_read_unlock();
192
	ipq_put(qp);
L
Linus Torvalds 已提交
193 194
}

195 196 197
/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 */
198 199
static struct ipq *ip_find(struct net *net, struct iphdr *iph,
			   u32 user, int vif)
L
Linus Torvalds 已提交
200
{
201 202 203 204 205 206 207 208
	struct frag_v4_compare_key key = {
		.saddr = iph->saddr,
		.daddr = iph->daddr,
		.user = user,
		.vif = vif,
		.id = iph->id,
		.protocol = iph->protocol,
	};
209
	struct inet_frag_queue *q;
210

211
	q = inet_frag_find(&net->ipv4.frags, &key);
212
	if (!q)
213
		return NULL;
214

215
	return container_of(q, struct ipq, q);
L
Linus Torvalds 已提交
216 217
}

H
Herbert Xu 已提交
218
/* Is the fragment too far ahead to be part of ipq? */
219
static int ip_frag_too_far(struct ipq *qp)
H
Herbert Xu 已提交
220 221
{
	struct inet_peer *peer = qp->peer;
222
	unsigned int max = qp->q.net->max_dist;
H
Herbert Xu 已提交
223 224 225 226 227 228 229 230 231 232 233
	unsigned int start, end;

	int rc;

	if (!peer || !max)
		return 0;

	start = qp->rid;
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

234
	rc = qp->q.fragments && (end - start) > max;
H
Herbert Xu 已提交
235 236

	if (rc) {
237 238 239
		struct net *net;

		net = container_of(qp->q.net, struct net, ipv4.frags);
E
Eric Dumazet 已提交
240
		__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
H
Herbert Xu 已提交
241 242 243 244 245 246 247 248
	}

	return rc;
}

static int ip_frag_reinit(struct ipq *qp)
{
	struct sk_buff *fp;
249
	unsigned int sum_truesize = 0;
H
Herbert Xu 已提交
250

251
	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
252
		refcount_inc(&qp->q.refcnt);
H
Herbert Xu 已提交
253 254 255
		return -ETIMEDOUT;
	}

256
	fp = qp->q.fragments;
H
Herbert Xu 已提交
257 258
	do {
		struct sk_buff *xp = fp->next;
259 260 261

		sum_truesize += fp->truesize;
		kfree_skb(fp);
H
Herbert Xu 已提交
262 263
		fp = xp;
	} while (fp);
264
	sub_frag_mem_limit(qp->q.net, sum_truesize);
H
Herbert Xu 已提交
265

266
	qp->q.flags = 0;
267 268 269
	qp->q.len = 0;
	qp->q.meat = 0;
	qp->q.fragments = NULL;
270
	qp->q.fragments_tail = NULL;
H
Herbert Xu 已提交
271
	qp->iif = 0;
272
	qp->ecn = 0;
H
Herbert Xu 已提交
273 274 275 276

	return 0;
}

L
Linus Torvalds 已提交
277
/* Add new segment to existing queue. */
278
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
L
Linus Torvalds 已提交
279 280
{
	struct sk_buff *prev, *next;
281
	struct net_device *dev;
282
	unsigned int fragsize;
L
Linus Torvalds 已提交
283 284
	int flags, offset;
	int ihl, end;
285
	int err = -ENOENT;
286
	u8 ecn;
L
Linus Torvalds 已提交
287

288
	if (qp->q.flags & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
289 290
		goto err;

H
Herbert Xu 已提交
291
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
292 293
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
H
Herbert Xu 已提交
294 295 296 297
		ipq_kill(qp);
		goto err;
	}

298
	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
299
	offset = ntohs(ip_hdr(skb)->frag_off);
L
Linus Torvalds 已提交
300 301 302
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
303
	ihl = ip_hdrlen(skb);
L
Linus Torvalds 已提交
304 305

	/* Determine the position of this fragment. */
306
	end = offset + skb->len - skb_network_offset(skb) - ihl;
307
	err = -EINVAL;
L
Linus Torvalds 已提交
308 309 310 311

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
312
		 * or have different end, the segment is corrupted.
L
Linus Torvalds 已提交
313
		 */
314
		if (end < qp->q.len ||
315
		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
L
Linus Torvalds 已提交
316
			goto err;
317
		qp->q.flags |= INET_FRAG_LAST_IN;
318
		qp->q.len = end;
L
Linus Torvalds 已提交
319 320 321 322 323 324
	} else {
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
325
		if (end > qp->q.len) {
L
Linus Torvalds 已提交
326
			/* Some bits beyond end -> corruption. */
327
			if (qp->q.flags & INET_FRAG_LAST_IN)
L
Linus Torvalds 已提交
328
				goto err;
329
			qp->q.len = end;
L
Linus Torvalds 已提交
330 331 332 333 334
		}
	}
	if (end == offset)
		goto err;

335
	err = -ENOMEM;
336
	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
L
Linus Torvalds 已提交
337
		goto err;
338 339 340

	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
L
Linus Torvalds 已提交
341 342 343 344 345 346
		goto err;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 */
347
	prev = qp->q.fragments_tail;
348
	if (!prev || prev->ip_defrag_offset < offset) {
349 350 351
		next = NULL;
		goto found;
	}
L
Linus Torvalds 已提交
352
	prev = NULL;
353
	for (next = qp->q.fragments; next != NULL; next = next->next) {
354
		if (next->ip_defrag_offset >= offset)
L
Linus Torvalds 已提交
355 356 357 358
			break;	/* bingo! */
		prev = next;
	}

359
found:
L
Linus Torvalds 已提交
360 361 362 363 364
	/* We found where to put this one.  Check for overlap with
	 * preceding fragment, and, if needed, align things so that
	 * any overlaps are eliminated.
	 */
	if (prev) {
365
		int i = (prev->ip_defrag_offset + prev->len) - offset;
L
Linus Torvalds 已提交
366 367 368

		if (i > 0) {
			offset += i;
369
			err = -EINVAL;
L
Linus Torvalds 已提交
370 371
			if (end <= offset)
				goto err;
372
			err = -ENOMEM;
L
Linus Torvalds 已提交
373 374 375 376 377 378 379
			if (!pskb_pull(skb, i))
				goto err;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
	}

380 381
	err = -ENOMEM;

382 383
	while (next && next->ip_defrag_offset < end) {
		int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */
L
Linus Torvalds 已提交
384 385

		if (i < next->len) {
386 387
			int delta = -next->truesize;

L
Linus Torvalds 已提交
388 389 390 391 392
			/* Eat head of the next overlapped fragment
			 * and leave the loop. The next ones cannot overlap.
			 */
			if (!pskb_pull(next, i))
				goto err;
393 394 395
			delta += next->truesize;
			if (delta)
				add_frag_mem_limit(qp->q.net, delta);
396
			next->ip_defrag_offset += i;
397
			qp->q.meat -= i;
L
Linus Torvalds 已提交
398 399 400 401 402 403
			if (next->ip_summed != CHECKSUM_UNNECESSARY)
				next->ip_summed = CHECKSUM_NONE;
			break;
		} else {
			struct sk_buff *free_it = next;

404
			/* Old fragment is completely overridden with
L
Linus Torvalds 已提交
405 406 407 408 409 410 411
			 * new one drop it.
			 */
			next = next->next;

			if (prev)
				prev->next = next;
			else
412
				qp->q.fragments = next;
L
Linus Torvalds 已提交
413

414
			qp->q.meat -= free_it->len;
415
			sub_frag_mem_limit(qp->q.net, free_it->truesize);
416
			kfree_skb(free_it);
L
Linus Torvalds 已提交
417 418 419
		}
	}

420 421 422 423 424 425 426
	/* Note : skb->ip_defrag_offset and skb->dev share the same location */
	dev = skb->dev;
	if (dev)
		qp->iif = dev->ifindex;
	/* Makes sure compiler wont do silly aliasing games */
	barrier();
	skb->ip_defrag_offset = offset;
L
Linus Torvalds 已提交
427 428 429

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
430 431
	if (!next)
		qp->q.fragments_tail = skb;
L
Linus Torvalds 已提交
432 433 434
	if (prev)
		prev->next = skb;
	else
435
		qp->q.fragments = skb;
L
Linus Torvalds 已提交
436

437 438
	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
439
	qp->ecn |= ecn;
440
	add_frag_mem_limit(qp->q.net, skb->truesize);
L
Linus Torvalds 已提交
441
	if (offset == 0)
442
		qp->q.flags |= INET_FRAG_FIRST_IN;
L
Linus Torvalds 已提交
443

444 445 446 447 448
	fragsize = skb->len + ihl;

	if (fragsize > qp->q.max_size)
		qp->q.max_size = fragsize;

449
	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
450 451
	    fragsize > qp->max_df_size)
		qp->max_df_size = fragsize;
452

453
	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
454 455
	    qp->q.meat == qp->q.len) {
		unsigned long orefdst = skb->_skb_refdst;
456

457 458 459 460 461 462 463
		skb->_skb_refdst = 0UL;
		err = ip_frag_reasm(qp, prev, dev);
		skb->_skb_refdst = orefdst;
		return err;
	}

	skb_dst_drop(skb);
464
	return -EINPROGRESS;
L
Linus Torvalds 已提交
465 466 467

err:
	kfree_skb(skb);
468
	return err;
L
Linus Torvalds 已提交
469 470 471 472 473
}


/* Build a new IP datagram from all its fragments. */

474 475
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev)
L
Linus Torvalds 已提交
476
{
477
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
L
Linus Torvalds 已提交
478
	struct iphdr *iph;
479
	struct sk_buff *fp, *head = qp->q.fragments;
L
Linus Torvalds 已提交
480 481
	int len;
	int ihlen;
482
	int err;
483
	u8 ecn;
L
Linus Torvalds 已提交
484 485 486

	ipq_kill(qp);

487
	ecn = ip_frag_ecn_table[qp->ecn];
488 489 490 491
	if (unlikely(ecn == 0xff)) {
		err = -EINVAL;
		goto out_fail;
	}
492 493 494 495 496 497 498 499
	/* Make the one we just received the head. */
	if (prev) {
		head = prev->next;
		fp = skb_clone(head, GFP_ATOMIC);
		if (!fp)
			goto out_nomem;

		fp->next = head->next;
500 501
		if (!fp->next)
			qp->q.fragments_tail = fp;
502 503
		prev->next = fp;

504 505
		skb_morph(head, qp->q.fragments);
		head->next = qp->q.fragments->next;
506

507
		consume_skb(qp->q.fragments);
508
		qp->q.fragments = head;
509 510
	}

511
	WARN_ON(!head);
512
	WARN_ON(head->ip_defrag_offset != 0);
L
Linus Torvalds 已提交
513 514

	/* Allocate a new buffer for the datagram. */
515
	ihlen = ip_hdrlen(head);
516
	len = ihlen + qp->q.len;
L
Linus Torvalds 已提交
517

518
	err = -E2BIG;
S
Stephen Hemminger 已提交
519
	if (len > 65535)
L
Linus Torvalds 已提交
520 521 522
		goto out_oversize;

	/* Head of list must not be cloned. */
523
	if (skb_unclone(head, GFP_ATOMIC))
L
Linus Torvalds 已提交
524 525 526 527 528
		goto out_nomem;

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments. */
529
	if (skb_has_frag_list(head)) {
L
Linus Torvalds 已提交
530 531 532
		struct sk_buff *clone;
		int i, plen = 0;

533 534
		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
L
Linus Torvalds 已提交
535 536 537 538
			goto out_nomem;
		clone->next = head->next;
		head->next = clone;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
539
		skb_frag_list_init(head);
E
Eric Dumazet 已提交
540 541
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
L
Linus Torvalds 已提交
542 543 544 545 546
		clone->len = clone->data_len = head->data_len - plen;
		head->data_len -= clone->len;
		head->len -= clone->len;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
547
		add_frag_mem_limit(qp->q.net, clone->truesize);
L
Linus Torvalds 已提交
548 549
	}

550
	skb_shinfo(head)->frag_list = head->next;
551
	skb_push(head, head->data - skb_network_header(head));
L
Linus Torvalds 已提交
552

553 554 555
	for (fp=head->next; fp; fp = fp->next) {
		head->data_len += fp->len;
		head->len += fp->len;
L
Linus Torvalds 已提交
556 557
		if (head->ip_summed != fp->ip_summed)
			head->ip_summed = CHECKSUM_NONE;
558
		else if (head->ip_summed == CHECKSUM_COMPLETE)
L
Linus Torvalds 已提交
559
			head->csum = csum_add(head->csum, fp->csum);
560
		head->truesize += fp->truesize;
L
Linus Torvalds 已提交
561
	}
562
	sub_frag_mem_limit(qp->q.net, head->truesize);
L
Linus Torvalds 已提交
563 564 565

	head->next = NULL;
	head->dev = dev;
566
	head->tstamp = qp->q.stamp;
567
	IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
L
Linus Torvalds 已提交
568

569
	iph = ip_hdr(head);
L
Linus Torvalds 已提交
570
	iph->tot_len = htons(len);
571
	iph->tos |= ecn;
572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587

	/* When we set IP_DF on a refragmented skb we must also force a
	 * call to ip_fragment to avoid forwarding a DF-skb of size s while
	 * original sender only sent fragments of size f (where f < s).
	 *
	 * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
	 * frag seen to avoid sending tiny DF-fragments in case skb was built
	 * from one very small df-fragment and one large non-df frag.
	 */
	if (qp->max_df_size == qp->q.max_size) {
		IPCB(head)->flags |= IPSKB_FRAG_PMTU;
		iph->frag_off = htons(IP_DF);
	} else {
		iph->frag_off = 0;
	}

588 589
	ip_send_check(iph);

E
Eric Dumazet 已提交
590
	__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
591
	qp->q.fragments = NULL;
592
	qp->q.fragments_tail = NULL;
593
	return 0;
L
Linus Torvalds 已提交
594 595

out_nomem:
596
	net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
597
	err = -ENOMEM;
L
Linus Torvalds 已提交
598 599
	goto out_fail;
out_oversize:
600
	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
L
Linus Torvalds 已提交
601
out_fail:
E
Eric Dumazet 已提交
602
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
603
	return err;
L
Linus Torvalds 已提交
604 605 606
}

/* Process an incoming IP datagram fragment. */
607
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
L
Linus Torvalds 已提交
608
{
609
	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
610
	int vif = l3mdev_master_ifindex_rcu(dev);
L
Linus Torvalds 已提交
611
	struct ipq *qp;
612

E
Eric Dumazet 已提交
613
	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
614
	skb_orphan(skb);
L
Linus Torvalds 已提交
615 616

	/* Lookup (or create) queue header */
617
	qp = ip_find(net, ip_hdr(skb), user, vif);
618
	if (qp) {
619
		int ret;
L
Linus Torvalds 已提交
620

621
		spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
622

623
		ret = ip_frag_queue(qp, skb);
L
Linus Torvalds 已提交
624

625
		spin_unlock(&qp->q.lock);
626
		ipq_put(qp);
627
		return ret;
L
Linus Torvalds 已提交
628 629
	}

E
Eric Dumazet 已提交
630
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
631
	kfree_skb(skb);
632
	return -ENOMEM;
L
Linus Torvalds 已提交
633
}
E
Eric Dumazet 已提交
634
EXPORT_SYMBOL(ip_defrag);
L
Linus Torvalds 已提交
635

636
/*
 * Defragment @skb if it carries an IPv4 fragment; otherwise pass it
 * through untouched.
 *
 * @net:  network namespace used for reassembly
 * @skb:  packet to inspect; the IP header is read at skb_network_offset()
 * @user: defrag user id handed on to ip_defrag()
 *
 * Returns:
 *  - @skb unchanged when it is not ETH_P_IP, its header cannot be copied,
 *    the header is not a plausible IPv4 header, the lengths are
 *    inconsistent, or the packet is not a fragment;
 *  - the skb once ip_defrag() reports success;
 *  - NULL when the skb was consumed: skb_share_check() failed, preparing
 *    the fragment failed, or ip_defrag() returned non-zero (fragment
 *    queued or dropped).
 *
 * Once the packet is known to be a fragment, a failure of pskb_may_pull()
 * or pskb_trim_rcsum() drops the skb instead of returning it: the caller
 * treats a non-NULL return as a usable, defragmented packet, which a
 * fragment that could not be prepared is not.
 */
struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
	struct iphdr iph;
	int netoff;
	u32 len;

	if (skb->protocol != htons(ETH_P_IP))
		return skb;

	netoff = skb_network_offset(skb);

	/* Work on a private copy of the header; the skb may be non-linear. */
	if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
		return skb;

	if (iph.ihl < 5 || iph.version != 4)
		return skb;

	/* tot_len must fit in the skb and cover at least the header. */
	len = ntohs(iph.tot_len);
	if (skb->len < netoff + len || len < (iph.ihl * 4))
		return skb;

	if (ip_is_fragment(&iph)) {
		skb = skb_share_check(skb, GFP_ATOMIC);
		if (skb) {
			if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) {
				kfree_skb(skb);
				return NULL;
			}
			if (pskb_trim_rcsum(skb, netoff + len)) {
				kfree_skb(skb);
				return NULL;
			}
			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
			if (ip_defrag(net, skb, user))
				return NULL;
			skb_clear_hash(skb);
		}
	}
	return skb;
}
EXPORT_SYMBOL(ip_check_defrag);

674
#ifdef CONFIG_SYSCTL
675
/* Lower bound (.extra1) of the ipfrag_max_dist sysctl; never written, so 0. */
static int dist_min;
676

677
static struct ctl_table ip4_frags_ns_ctl_table[] = {
678 679
	{
		.procname	= "ipfrag_high_thresh",
680
		.data		= &init_net.ipv4.frags.high_thresh,
681
		.maxlen		= sizeof(unsigned long),
682
		.mode		= 0644,
683
		.proc_handler	= proc_doulongvec_minmax,
684
		.extra1		= &init_net.ipv4.frags.low_thresh
685 686 687
	},
	{
		.procname	= "ipfrag_low_thresh",
688
		.data		= &init_net.ipv4.frags.low_thresh,
689
		.maxlen		= sizeof(unsigned long),
690
		.mode		= 0644,
691
		.proc_handler	= proc_doulongvec_minmax,
692
		.extra2		= &init_net.ipv4.frags.high_thresh
693 694 695
	},
	{
		.procname	= "ipfrag_time",
696
		.data		= &init_net.ipv4.frags.timeout,
697 698
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
699
		.proc_handler	= proc_dointvec_jiffies,
700
	},
701 702 703 704 705 706
	{
		.procname	= "ipfrag_max_dist",
		.data		= &init_net.ipv4.frags.max_dist,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
707
		.extra1		= &dist_min,
708
	},
709 710 711
	{ }
};

712 713
/* secret interval has been deprecated */
static int ip4_frags_secret_interval_unused;
714
static struct ctl_table ip4_frags_ctl_table[] = {
715 716
	{
		.procname	= "ipfrag_secret_interval",
717
		.data		= &ip4_frags_secret_interval_unused,
718 719
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
720
		.proc_handler	= proc_dointvec_jiffies,
721 722 723 724
	},
	{ }
};

725
/*
 * Register the per-namespace fragment sysctls for @net under "net/ipv4".
 *
 * init_net uses ip4_frags_ns_ctl_table directly.  Any other namespace gets
 * a kmemdup()'ed copy whose data/limit pointers are redirected to that
 * namespace's own values; its high_thresh is additionally capped by
 * init_net's high_thresh.
 *
 * Returns 0 on success (header stored in net->ipv4.frags_hdr), -ENOMEM if
 * the copy or the registration fails.
 */
static int __net_init ip4_frags_ns_ctl_register(struct net *net)
{
	const bool is_init_net = net_eq(net, &init_net);
	struct ctl_table *table = ip4_frags_ns_ctl_table;
	struct ctl_table_header *hdr;

	if (!is_init_net) {
		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
		if (!table)
			return -ENOMEM;

		/* Indices follow the entry order of the template table. */
		table[0].data = &net->ipv4.frags.high_thresh;
		table[0].extra1 = &net->ipv4.frags.low_thresh;
		table[0].extra2 = &init_net.ipv4.frags.high_thresh;
		table[1].data = &net->ipv4.frags.low_thresh;
		table[1].extra2 = &net->ipv4.frags.high_thresh;
		table[2].data = &net->ipv4.frags.timeout;
		table[3].data = &net->ipv4.frags.max_dist;
	}

	hdr = register_net_sysctl(net, "net/ipv4", table);
	if (!hdr) {
		/* Only the duplicated table is ours to free. */
		if (!is_init_net)
			kfree(table);
		return -ENOMEM;
	}

	net->ipv4.frags_hdr = hdr;
	return 0;
}

759
/*
 * Undo ip4_frags_ns_ctl_register(): unregister the namespace's sysctl
 * header and free the table it was registered with.  The table pointer is
 * fetched before the header is unregistered.
 */
static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
{
	struct ctl_table_header *hdr = net->ipv4.frags_hdr;
	struct ctl_table *table = hdr->ctl_table_arg;

	unregister_net_sysctl_table(hdr);
	kfree(table);
}
767

768
/*
 * Register the global fragment sysctls (ip4_frags_ctl_table) for init_net
 * under "net/ipv4".  The returned header is not kept.
 */
static void __init ip4_frags_ctl_register(void)
{
	register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
}
772
#else
773
static int ip4_frags_ns_ctl_register(struct net *net)
774 775 776
{
	return 0;
}
777

778
static void ip4_frags_ns_ctl_unregister(struct net *net)
779 780
{
}
781

782
static void __init ip4_frags_ctl_register(void)
783 784
{
}
785 786
#endif

787
/*
 * Per-namespace init: set the default fragment limits, initialise the
 * namespace's fragment state and register its sysctls.
 *
 * Returns 0 on success or a negative error; if sysctl registration fails
 * the fragment state is torn down again.
 */
static int __net_init ipv4_frags_init_net(struct net *net)
{
	int err;

	/* Fragment cache limits.
	 *
	 * The fragment memory accounting code, (tries to) account for
	 * the real memory usage, by measuring both the size of frag
	 * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
	 * and the SKB's truesize.
	 *
	 * A 64K fragment consumes 129736 bytes (44*2944)+200
	 * (1500 truesize == 2944, sizeof(struct ipq) == 200)
	 *
	 * We will commit 4MB at one time. Should we cross that limit
	 * we will prune down to 3MB, making room for approx 8 big 64K
	 * fragments 8x128k.
	 */
	net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
	net->ipv4.frags.low_thresh  = 3 * 1024 * 1024;

	/*
	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
	 * RFC791 is wrong proposing to prolongate timer each fragment arrival
	 * by TTL.
	 */
	net->ipv4.frags.timeout = IP_FRAG_TIME;

	net->ipv4.frags.max_dist = 64;
	net->ipv4.frags.f = &ip4_frags;

	err = inet_frags_init_net(&net->ipv4.frags);
	if (err < 0)
		return err;

	err = ip4_frags_ns_ctl_register(net);
	if (err < 0)
		inet_frags_exit_net(&net->ipv4.frags);

	return err;
}

826
/*
 * Per-namespace exit: the reverse of ipv4_frags_init_net() - unregister
 * the sysctls first, then tear down the fragment state.
 */
static void __net_exit ipv4_frags_exit_net(struct net *net)
{
	struct netns_frags *nf = &net->ipv4.frags;

	ip4_frags_ns_ctl_unregister(net);
	inet_frags_exit_net(nf);
}

/* Per-namespace init/exit hooks, registered by ipfrag_init(). */
static struct pernet_operations ip4_frags_ops = {
	.init = ipv4_frags_init_net,
	.exit = ipv4_frags_exit_net,
};

837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869

/*
 * rhashtable .hashfn: hash a lookup key.  @data is treated as a
 * struct frag_v4_compare_key and hashed as whole u32 words; @len is
 * ignored in favour of the key's fixed size.
 */
static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
{
	const u32 key_words = sizeof(struct frag_v4_compare_key) / sizeof(u32);

	return jhash2(data, key_words, seed);
}

/*
 * rhashtable .obj_hashfn: hash a stored queue.  @data is the
 * inet_frag_queue; its v4 key is hashed exactly as ip4_key_hashfn() hashes
 * a lookup key.  @len is ignored.
 */
static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
{
	const struct inet_frag_queue *fq = data;
	const u32 *key_as_words = (const u32 *)&fq->key.v4;
	const u32 key_words = sizeof(struct frag_v4_compare_key) / sizeof(u32);

	return jhash2(key_as_words, key_words, seed);
}

/*
 * rhashtable .obj_cmpfn: compare the lookup key in @arg with the key of
 * the queue @ptr, byte for byte.  Returns 0 when they match, 1 otherwise.
 */
static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
	const struct inet_frag_queue *fq = ptr;
	const struct frag_v4_compare_key *key = arg->key;

	return memcmp(&fq->key, key, sizeof(*key)) != 0;
}

/*
 * rhashtable parameters for IPv4 fragment queues, keyed by
 * struct frag_v4_compare_key; copied into ip4_frags.rhash_params by
 * ipfrag_init().
 */
static const struct rhashtable_params ip4_rhash_params = {
	.head_offset		= offsetof(struct inet_frag_queue, node),
	.key_offset		= offsetof(struct inet_frag_queue, key),
	.key_len		= sizeof(struct frag_v4_compare_key),
	.hashfn			= ip4_key_hashfn,
	.obj_hashfn		= ip4_obj_hashfn,
	.obj_cmpfn		= ip4_obj_cmpfn,
	.automatic_shrinking	= true,
};

870
/*
 * Boot-time initialisation of IPv4 fragment handling: fill in the
 * ip4_frags descriptor, initialise it (panicking on failure), then
 * register the global sysctls and the per-namespace operations.
 */
void __init ipfrag_init(void)
{
	/* Queue geometry and hashing. */
	ip4_frags.qsize = sizeof(struct ipq);
	ip4_frags.rhash_params = ip4_rhash_params;
	ip4_frags.frags_cache_name = ip_frag_cache_name;

	/* Callbacks. */
	ip4_frags.constructor = ip4_frag_init;
	ip4_frags.destructor = ip4_frag_free;
	ip4_frags.frag_expire = ip_expire;

	if (inet_frags_init(&ip4_frags))
		panic("IP: failed to allocate ip4_frags cache\n");

	ip4_frags_ctl_register();
	register_pernet_subsys(&ip4_frags_ops);
}