// SPDX-License-Identifier: GPL-2.0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima          :       ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 *		Alexey Kuznetsov:	SMP races, threading, cleanup.
 *		Patrick McHardy :	LRU queue of frag heads for evictor.
 */

#define pr_fmt(fmt) "IPv4: " fmt

H
Herbert Xu 已提交
26
#include <linux/compiler.h>
L
Linus Torvalds 已提交
27 28 29 30 31 32 33 34 35 36 37
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
38
#include <linux/slab.h>
39 40
#include <net/route.h>
#include <net/dst.h>
L
Linus Torvalds 已提交
41 42 43 44
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
H
Herbert Xu 已提交
45
#include <net/inetpeer.h>
46
#include <net/inet_frag.h>
L
Linus Torvalds 已提交
47 48 49 50
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
51
#include <net/inet_ecn.h>
52
#include <net/l3mdev.h>
L
Linus Torvalds 已提交
53 54 55 56 57

/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
 * as well. Or notify me, at least. --ANK
 */
58
/* Cache name for IPv4 fragment queues; ipfrag_init() stores this string in
 * ip4_frags.frags_cache_name.
 */
static const char ip_frag_cache_name[] = "ip4-frags";
H
Herbert Xu 已提交
59

L
Linus Torvalds 已提交
60 61
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
62 63
	struct inet_frag_queue q;

64
	u8		ecn; /* RFC3168 support */
65
	u16		max_df_size; /* largest frag with DF set seen */
H
Herbert Xu 已提交
66 67 68
	int             iif;
	unsigned int    rid;
	struct inet_peer *peer;
L
Linus Torvalds 已提交
69 70
};

71
/* Turn the ECN field of @tos (tos & INET_ECN_MASK) into a single-bit mask.
 *
 * ip_frag_queue() ORs the result of every fragment into ipq->ecn, and
 * ip_frag_reasm() later uses that accumulated value to index
 * ip_frag_ecn_table.
 */
static u8 ip4_frag_ecn(u8 tos)
{
	const u8 codepoint = tos & INET_ECN_MASK;

	return 1 << codepoint;
}

76
/* IPv4 fragment-queue descriptor; its callbacks, object size, cache name and
 * hash parameters are filled in by ipfrag_init().
 */
static struct inet_frags ip4_frags;
L
Linus Torvalds 已提交
77

78 79 80
/* Forward declaration: ip_frag_queue() calls ip_frag_reasm() before the
 * definition appears in this file.
 */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev);

81

82
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
83 84
{
	struct ipq *qp = container_of(q, struct ipq, q);
85 86 87 88
	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
					       frags);
	struct net *net = container_of(ipv4, struct net, ipv4);

89
	const struct frag_v4_compare_key *key = a;
90

91 92
	q->key.v4 = *key;
	qp->ecn = 0;
93
	qp->peer = q->net->max_dist ?
94
		inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
95
		NULL;
96 97
}

98
static void ip4_frag_free(struct inet_frag_queue *q)
L
Linus Torvalds 已提交
99
{
100 101 102 103 104
	struct ipq *qp;

	qp = container_of(q, struct ipq, q);
	if (qp->peer)
		inet_putpeer(qp->peer);
L
Linus Torvalds 已提交
105 106 107 108 109
}


/* Destruction primitives. */

110
static void ipq_put(struct ipq *ipq)
L
Linus Torvalds 已提交
111
{
112
	inet_frag_put(&ipq->q);
L
Linus Torvalds 已提交
113 114 115 116 117 118 119
}

/* Kill ipq entry. It is not destroyed immediately,
 * because caller (and someone more) holds reference count.
 */
static void ipq_kill(struct ipq *ipq)
{
120
	inet_frag_kill(&ipq->q);
L
Linus Torvalds 已提交
121 122
}

123 124 125 126
/* Tell whether @user is a defrag user for which ip_expire() only sends the
 * ICMP timeout when the packet is routed to the local host: AF_PACKET, or
 * any user inside the conntrack or conntrack-bridge input ranges.
 */
static bool frag_expire_skip_icmp(u32 user)
{
	if (user == IP_DEFRAG_AF_PACKET)
		return true;

	if (ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
				      __IP_DEFRAG_CONNTRACK_IN_END))
		return true;

	return ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN,
					 __IP_DEFRAG_CONNTRACK_BRIDGE_IN);
}

L
Linus Torvalds 已提交
132 133 134
/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 */
135
static void ip_expire(struct timer_list *t)
L
Linus Torvalds 已提交
136
{
137
	struct inet_frag_queue *frag = from_timer(frag, t, timer);
138
	const struct iphdr *iph;
139
	struct sk_buff *head;
140
	struct net *net;
141 142
	struct ipq *qp;
	int err;
143

144
	qp = container_of(frag, struct ipq, q);
145
	net = container_of(qp->q.net, struct net, ipv4.frags);
L
Linus Torvalds 已提交
146

147
	rcu_read_lock();
148
	spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
149

150
	if (qp->q.flags & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
151 152 153
		goto out;

	ipq_kill(qp);
E
Eric Dumazet 已提交
154
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
155

156
	head = qp->q.fragments;
157

158
	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
159

160 161
	if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
		goto out;
162

163 164 165
	head->dev = dev_get_by_index_rcu(net, qp->iif);
	if (!head->dev)
		goto out;
166

167

168 169 170
	/* skb has no dst, perform route lookup again */
	iph = ip_hdr(head);
	err = ip_route_input_noref(head, iph->daddr, iph->saddr,
171
					   iph->tos, head->dev);
172 173 174 175 176 177 178 179 180 181
	if (err)
		goto out;

	/* Only an end host needs to send an ICMP
	 * "Fragment Reassembly Timeout" message, per RFC792.
	 */
	if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
	    (skb_rtable(head)->rt_type != RTN_LOCAL))
		goto out;

182 183 184 185 186
	skb_get(head);
	spin_unlock(&qp->q.lock);
	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
	kfree_skb(head);
	goto out_rcu_unlock;
187

L
Linus Torvalds 已提交
188
out:
189
	spin_unlock(&qp->q.lock);
190 191
out_rcu_unlock:
	rcu_read_unlock();
192
	ipq_put(qp);
L
Linus Torvalds 已提交
193 194
}

195 196 197
/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 */
198 199
static struct ipq *ip_find(struct net *net, struct iphdr *iph,
			   u32 user, int vif)
L
Linus Torvalds 已提交
200
{
201 202 203 204 205 206 207 208
	struct frag_v4_compare_key key = {
		.saddr = iph->saddr,
		.daddr = iph->daddr,
		.user = user,
		.vif = vif,
		.id = iph->id,
		.protocol = iph->protocol,
	};
209
	struct inet_frag_queue *q;
210

211
	q = inet_frag_find(&net->ipv4.frags, &key);
212
	if (!q)
213
		return NULL;
214

215
	return container_of(q, struct ipq, q);
L
Linus Torvalds 已提交
216 217
}

H
Herbert Xu 已提交
218
/* Is the fragment too far ahead to be part of ipq? */
219
static int ip_frag_too_far(struct ipq *qp)
H
Herbert Xu 已提交
220 221
{
	struct inet_peer *peer = qp->peer;
222
	unsigned int max = qp->q.net->max_dist;
H
Herbert Xu 已提交
223 224 225 226 227 228 229 230 231 232 233
	unsigned int start, end;

	int rc;

	if (!peer || !max)
		return 0;

	start = qp->rid;
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

234
	rc = qp->q.fragments && (end - start) > max;
H
Herbert Xu 已提交
235 236

	if (rc) {
237 238 239
		struct net *net;

		net = container_of(qp->q.net, struct net, ipv4.frags);
E
Eric Dumazet 已提交
240
		__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
H
Herbert Xu 已提交
241 242 243 244 245 246 247 248
	}

	return rc;
}

static int ip_frag_reinit(struct ipq *qp)
{
	struct sk_buff *fp;
249
	unsigned int sum_truesize = 0;
H
Herbert Xu 已提交
250

251
	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
252
		refcount_inc(&qp->q.refcnt);
H
Herbert Xu 已提交
253 254 255
		return -ETIMEDOUT;
	}

256
	fp = qp->q.fragments;
H
Herbert Xu 已提交
257 258
	do {
		struct sk_buff *xp = fp->next;
259 260 261

		sum_truesize += fp->truesize;
		kfree_skb(fp);
H
Herbert Xu 已提交
262 263
		fp = xp;
	} while (fp);
264
	sub_frag_mem_limit(qp->q.net, sum_truesize);
H
Herbert Xu 已提交
265

266
	qp->q.flags = 0;
267 268 269
	qp->q.len = 0;
	qp->q.meat = 0;
	qp->q.fragments = NULL;
270
	qp->q.fragments_tail = NULL;
H
Herbert Xu 已提交
271
	qp->iif = 0;
272
	qp->ecn = 0;
H
Herbert Xu 已提交
273 274 275 276

	return 0;
}

L
Linus Torvalds 已提交
277
/* Add new segment to existing queue. */
278
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
L
Linus Torvalds 已提交
279 280
{
	struct sk_buff *prev, *next;
281
	struct net_device *dev;
282
	unsigned int fragsize;
L
Linus Torvalds 已提交
283 284
	int flags, offset;
	int ihl, end;
285
	int err = -ENOENT;
286
	u8 ecn;
L
Linus Torvalds 已提交
287

288
	if (qp->q.flags & INET_FRAG_COMPLETE)
L
Linus Torvalds 已提交
289 290
		goto err;

H
Herbert Xu 已提交
291
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
292 293
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
H
Herbert Xu 已提交
294 295 296 297
		ipq_kill(qp);
		goto err;
	}

298
	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
299
	offset = ntohs(ip_hdr(skb)->frag_off);
L
Linus Torvalds 已提交
300 301 302
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
303
	ihl = ip_hdrlen(skb);
L
Linus Torvalds 已提交
304 305

	/* Determine the position of this fragment. */
306
	end = offset + skb->len - skb_network_offset(skb) - ihl;
307
	err = -EINVAL;
L
Linus Torvalds 已提交
308 309 310 311

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
312
		 * or have different end, the segment is corrupted.
L
Linus Torvalds 已提交
313
		 */
314
		if (end < qp->q.len ||
315
		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
L
Linus Torvalds 已提交
316
			goto err;
317
		qp->q.flags |= INET_FRAG_LAST_IN;
318
		qp->q.len = end;
L
Linus Torvalds 已提交
319 320 321 322 323 324
	} else {
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
325
		if (end > qp->q.len) {
L
Linus Torvalds 已提交
326
			/* Some bits beyond end -> corruption. */
327
			if (qp->q.flags & INET_FRAG_LAST_IN)
L
Linus Torvalds 已提交
328
				goto err;
329
			qp->q.len = end;
L
Linus Torvalds 已提交
330 331 332 333 334
		}
	}
	if (end == offset)
		goto err;

335
	err = -ENOMEM;
336
	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
L
Linus Torvalds 已提交
337
		goto err;
338 339 340

	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
L
Linus Torvalds 已提交
341 342 343 344 345 346
		goto err;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 */
347
	prev = qp->q.fragments_tail;
348
	if (!prev || prev->ip_defrag_offset < offset) {
349 350 351
		next = NULL;
		goto found;
	}
L
Linus Torvalds 已提交
352
	prev = NULL;
353
	for (next = qp->q.fragments; next != NULL; next = next->next) {
354
		if (next->ip_defrag_offset >= offset)
L
Linus Torvalds 已提交
355 356 357 358
			break;	/* bingo! */
		prev = next;
	}

359
found:
L
Linus Torvalds 已提交
360 361 362 363 364
	/* We found where to put this one.  Check for overlap with
	 * preceding fragment, and, if needed, align things so that
	 * any overlaps are eliminated.
	 */
	if (prev) {
365
		int i = (prev->ip_defrag_offset + prev->len) - offset;
L
Linus Torvalds 已提交
366 367 368

		if (i > 0) {
			offset += i;
369
			err = -EINVAL;
L
Linus Torvalds 已提交
370 371
			if (end <= offset)
				goto err;
372
			err = -ENOMEM;
L
Linus Torvalds 已提交
373 374 375 376 377 378 379
			if (!pskb_pull(skb, i))
				goto err;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
	}

380 381
	err = -ENOMEM;

382 383
	while (next && next->ip_defrag_offset < end) {
		int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */
L
Linus Torvalds 已提交
384 385 386 387 388 389 390

		if (i < next->len) {
			/* Eat head of the next overlapped fragment
			 * and leave the loop. The next ones cannot overlap.
			 */
			if (!pskb_pull(next, i))
				goto err;
391
			next->ip_defrag_offset += i;
392
			qp->q.meat -= i;
L
Linus Torvalds 已提交
393 394 395 396 397 398
			if (next->ip_summed != CHECKSUM_UNNECESSARY)
				next->ip_summed = CHECKSUM_NONE;
			break;
		} else {
			struct sk_buff *free_it = next;

399
			/* Old fragment is completely overridden with
L
Linus Torvalds 已提交
400 401 402 403 404 405 406
			 * new one drop it.
			 */
			next = next->next;

			if (prev)
				prev->next = next;
			else
407
				qp->q.fragments = next;
L
Linus Torvalds 已提交
408

409
			qp->q.meat -= free_it->len;
410
			sub_frag_mem_limit(qp->q.net, free_it->truesize);
411
			kfree_skb(free_it);
L
Linus Torvalds 已提交
412 413 414
		}
	}

415 416 417 418 419 420 421
	/* Note : skb->ip_defrag_offset and skb->dev share the same location */
	dev = skb->dev;
	if (dev)
		qp->iif = dev->ifindex;
	/* Makes sure compiler wont do silly aliasing games */
	barrier();
	skb->ip_defrag_offset = offset;
L
Linus Torvalds 已提交
422 423 424

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
425 426
	if (!next)
		qp->q.fragments_tail = skb;
L
Linus Torvalds 已提交
427 428 429
	if (prev)
		prev->next = skb;
	else
430
		qp->q.fragments = skb;
L
Linus Torvalds 已提交
431

432 433
	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
434
	qp->ecn |= ecn;
435
	add_frag_mem_limit(qp->q.net, skb->truesize);
L
Linus Torvalds 已提交
436
	if (offset == 0)
437
		qp->q.flags |= INET_FRAG_FIRST_IN;
L
Linus Torvalds 已提交
438

439 440 441 442 443
	fragsize = skb->len + ihl;

	if (fragsize > qp->q.max_size)
		qp->q.max_size = fragsize;

444
	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
445 446
	    fragsize > qp->max_df_size)
		qp->max_df_size = fragsize;
447

448
	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
449 450
	    qp->q.meat == qp->q.len) {
		unsigned long orefdst = skb->_skb_refdst;
451

452 453 454 455 456 457 458
		skb->_skb_refdst = 0UL;
		err = ip_frag_reasm(qp, prev, dev);
		skb->_skb_refdst = orefdst;
		return err;
	}

	skb_dst_drop(skb);
459
	return -EINPROGRESS;
L
Linus Torvalds 已提交
460 461 462

err:
	kfree_skb(skb);
463
	return err;
L
Linus Torvalds 已提交
464 465 466 467 468
}


/* Build a new IP datagram from all its fragments. */

469 470
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev)
L
Linus Torvalds 已提交
471
{
472
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
L
Linus Torvalds 已提交
473
	struct iphdr *iph;
474
	struct sk_buff *fp, *head = qp->q.fragments;
L
Linus Torvalds 已提交
475 476
	int len;
	int ihlen;
477
	int err;
478
	u8 ecn;
L
Linus Torvalds 已提交
479 480 481

	ipq_kill(qp);

482
	ecn = ip_frag_ecn_table[qp->ecn];
483 484 485 486
	if (unlikely(ecn == 0xff)) {
		err = -EINVAL;
		goto out_fail;
	}
487 488 489 490 491 492 493 494
	/* Make the one we just received the head. */
	if (prev) {
		head = prev->next;
		fp = skb_clone(head, GFP_ATOMIC);
		if (!fp)
			goto out_nomem;

		fp->next = head->next;
495 496
		if (!fp->next)
			qp->q.fragments_tail = fp;
497 498
		prev->next = fp;

499 500
		skb_morph(head, qp->q.fragments);
		head->next = qp->q.fragments->next;
501

502
		consume_skb(qp->q.fragments);
503
		qp->q.fragments = head;
504 505
	}

506
	WARN_ON(!head);
507
	WARN_ON(head->ip_defrag_offset != 0);
L
Linus Torvalds 已提交
508 509

	/* Allocate a new buffer for the datagram. */
510
	ihlen = ip_hdrlen(head);
511
	len = ihlen + qp->q.len;
L
Linus Torvalds 已提交
512

513
	err = -E2BIG;
S
Stephen Hemminger 已提交
514
	if (len > 65535)
L
Linus Torvalds 已提交
515 516 517
		goto out_oversize;

	/* Head of list must not be cloned. */
518
	if (skb_unclone(head, GFP_ATOMIC))
L
Linus Torvalds 已提交
519 520 521 522 523
		goto out_nomem;

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments. */
524
	if (skb_has_frag_list(head)) {
L
Linus Torvalds 已提交
525 526 527
		struct sk_buff *clone;
		int i, plen = 0;

528 529
		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
L
Linus Torvalds 已提交
530 531 532 533
			goto out_nomem;
		clone->next = head->next;
		head->next = clone;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
534
		skb_frag_list_init(head);
E
Eric Dumazet 已提交
535 536
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
L
Linus Torvalds 已提交
537 538 539 540 541
		clone->len = clone->data_len = head->data_len - plen;
		head->data_len -= clone->len;
		head->len -= clone->len;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
542
		add_frag_mem_limit(qp->q.net, clone->truesize);
L
Linus Torvalds 已提交
543 544
	}

545
	skb_shinfo(head)->frag_list = head->next;
546
	skb_push(head, head->data - skb_network_header(head));
L
Linus Torvalds 已提交
547

548 549 550
	for (fp=head->next; fp; fp = fp->next) {
		head->data_len += fp->len;
		head->len += fp->len;
L
Linus Torvalds 已提交
551 552
		if (head->ip_summed != fp->ip_summed)
			head->ip_summed = CHECKSUM_NONE;
553
		else if (head->ip_summed == CHECKSUM_COMPLETE)
L
Linus Torvalds 已提交
554
			head->csum = csum_add(head->csum, fp->csum);
555
		head->truesize += fp->truesize;
L
Linus Torvalds 已提交
556
	}
557
	sub_frag_mem_limit(qp->q.net, head->truesize);
L
Linus Torvalds 已提交
558 559 560

	head->next = NULL;
	head->dev = dev;
561
	head->tstamp = qp->q.stamp;
562
	IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
L
Linus Torvalds 已提交
563

564
	iph = ip_hdr(head);
L
Linus Torvalds 已提交
565
	iph->tot_len = htons(len);
566
	iph->tos |= ecn;
567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582

	/* When we set IP_DF on a refragmented skb we must also force a
	 * call to ip_fragment to avoid forwarding a DF-skb of size s while
	 * original sender only sent fragments of size f (where f < s).
	 *
	 * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
	 * frag seen to avoid sending tiny DF-fragments in case skb was built
	 * from one very small df-fragment and one large non-df frag.
	 */
	if (qp->max_df_size == qp->q.max_size) {
		IPCB(head)->flags |= IPSKB_FRAG_PMTU;
		iph->frag_off = htons(IP_DF);
	} else {
		iph->frag_off = 0;
	}

583 584
	ip_send_check(iph);

E
Eric Dumazet 已提交
585
	__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
586
	qp->q.fragments = NULL;
587
	qp->q.fragments_tail = NULL;
588
	return 0;
L
Linus Torvalds 已提交
589 590

out_nomem:
591
	net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
592
	err = -ENOMEM;
L
Linus Torvalds 已提交
593 594
	goto out_fail;
out_oversize:
595
	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
L
Linus Torvalds 已提交
596
out_fail:
E
Eric Dumazet 已提交
597
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
598
	return err;
L
Linus Torvalds 已提交
599 600 601
}

/* Process an incoming IP datagram fragment. */
602
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
L
Linus Torvalds 已提交
603
{
604
	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
605
	int vif = l3mdev_master_ifindex_rcu(dev);
L
Linus Torvalds 已提交
606
	struct ipq *qp;
607

E
Eric Dumazet 已提交
608
	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
609
	skb_orphan(skb);
L
Linus Torvalds 已提交
610 611

	/* Lookup (or create) queue header */
612
	qp = ip_find(net, ip_hdr(skb), user, vif);
613
	if (qp) {
614
		int ret;
L
Linus Torvalds 已提交
615

616
		spin_lock(&qp->q.lock);
L
Linus Torvalds 已提交
617

618
		ret = ip_frag_queue(qp, skb);
L
Linus Torvalds 已提交
619

620
		spin_unlock(&qp->q.lock);
621
		ipq_put(qp);
622
		return ret;
L
Linus Torvalds 已提交
623 624
	}

E
Eric Dumazet 已提交
625
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
L
Linus Torvalds 已提交
626
	kfree_skb(skb);
627
	return -ENOMEM;
L
Linus Torvalds 已提交
628
}
E
Eric Dumazet 已提交
629
EXPORT_SYMBOL(ip_defrag);
L
Linus Torvalds 已提交
630

631
/* Defragment @skb if it is an IPv4 fragment; pass anything else through.
 *
 * @net:  network namespace
 * @skb:  packet whose network header may not have been validated yet
 * @user: defrag user id handed to ip_defrag()
 *
 * Returns @skb unchanged when it is not IPv4, has a malformed or truncated
 * header, or is not a fragment.  For a fragment, returns the reassembled
 * skb once ip_defrag() reports completion, and NULL otherwise; in the NULL
 * case the skb has been consumed and the caller must not touch it.
 *
 * Once the packet is known to be a fragment it is never handed back
 * un-reassembled: if it cannot be prepared for ip_defrag() (header pull or
 * trim fails) it is dropped and NULL is returned, the same contract callers
 * already see when ip_defrag() itself fails.
 */
struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
	struct iphdr iph;
	int netoff;
	u32 len;

	if (skb->protocol != htons(ETH_P_IP))
		return skb;

	netoff = skb_network_offset(skb);

	/* Work on a private copy of the header; the skb may be non-linear. */
	if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
		return skb;

	if (iph.ihl < 5 || iph.version != 4)
		return skb;

	len = ntohs(iph.tot_len);
	if (skb->len < netoff + len || len < (iph.ihl * 4))
		return skb;

	if (ip_is_fragment(&iph)) {
		skb = skb_share_check(skb, GFP_ATOMIC);
		if (skb) {
			if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) {
				kfree_skb(skb);
				return NULL;
			}
			if (pskb_trim_rcsum(skb, netoff + len)) {
				kfree_skb(skb);
				return NULL;
			}
			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
			if (ip_defrag(net, skb, user))
				return NULL;
			skb_clear_hash(skb);
		}
	}
	return skb;
}
EXPORT_SYMBOL(ip_check_defrag);

669
#ifdef CONFIG_SYSCTL
670
/* Constant 0 used as the .extra1 bound of ipfrag_low_thresh and
 * ipfrag_max_dist in ip4_frags_ns_ctl_table.
 */
static long zero;
671

672
static struct ctl_table ip4_frags_ns_ctl_table[] = {
673 674
	{
		.procname	= "ipfrag_high_thresh",
675
		.data		= &init_net.ipv4.frags.high_thresh,
676
		.maxlen		= sizeof(unsigned long),
677
		.mode		= 0644,
678
		.proc_handler	= proc_doulongvec_minmax,
679
		.extra1		= &init_net.ipv4.frags.low_thresh
680 681 682
	},
	{
		.procname	= "ipfrag_low_thresh",
683
		.data		= &init_net.ipv4.frags.low_thresh,
684
		.maxlen		= sizeof(unsigned long),
685
		.mode		= 0644,
686
		.proc_handler	= proc_doulongvec_minmax,
687 688
		.extra1		= &zero,
		.extra2		= &init_net.ipv4.frags.high_thresh
689 690 691
	},
	{
		.procname	= "ipfrag_time",
692
		.data		= &init_net.ipv4.frags.timeout,
693 694
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
695
		.proc_handler	= proc_dointvec_jiffies,
696
	},
697 698 699 700 701 702 703 704
	{
		.procname	= "ipfrag_max_dist",
		.data		= &init_net.ipv4.frags.max_dist,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero
	},
705 706 707
	{ }
};

708 709
/* secret interval has been deprecated */
static int ip4_frags_secret_interval_unused;
710
static struct ctl_table ip4_frags_ctl_table[] = {
711 712
	{
		.procname	= "ipfrag_secret_interval",
713
		.data		= &ip4_frags_secret_interval_unused,
714 715
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
716
		.proc_handler	= proc_dointvec_jiffies,
717 718 719 720
	},
	{ }
};

721
/* Register the per-namespace fragment sysctls for @net.
 *
 * init_net uses ip4_frags_ns_ctl_table directly.  Any other namespace gets
 * a kmemdup()'ed copy whose data/limit pointers are redirected to that
 * namespace's own values; its high_thresh is additionally capped by
 * init_net's high_thresh through extra2.
 *
 * Returns 0 on success or -ENOMEM if the copy or the registration fails.
 */
static int __net_init ip4_frags_ns_ctl_register(struct net *net)
{
	const bool is_init_net = net_eq(net, &init_net);
	struct ctl_table *table = ip4_frags_ns_ctl_table;
	struct ctl_table_header *hdr;

	if (!is_init_net) {
		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
		if (!table)
			return -ENOMEM;

		/* Indices follow the entry order of ip4_frags_ns_ctl_table. */
		table[0].data = &net->ipv4.frags.high_thresh;
		table[0].extra1 = &net->ipv4.frags.low_thresh;
		table[0].extra2 = &init_net.ipv4.frags.high_thresh;
		table[1].data = &net->ipv4.frags.low_thresh;
		table[1].extra2 = &net->ipv4.frags.high_thresh;
		table[2].data = &net->ipv4.frags.timeout;
		table[3].data = &net->ipv4.frags.max_dist;
	}

	hdr = register_net_sysctl(net, "net/ipv4", table);
	if (!hdr) {
		/* Only the duplicated table is ours to free. */
		if (!is_init_net)
			kfree(table);
		return -ENOMEM;
	}

	net->ipv4.frags_hdr = hdr;
	return 0;
}

755
/* Undo ip4_frags_ns_ctl_register(): unregister the namespace's sysctl
 * header and free the table it was registered with.
 *
 * NOTE(review): the table is freed unconditionally; this presumably relies
 * on never being called for init_net, whose table is static - confirm.
 */
static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
{
	struct ctl_table_header *hdr = net->ipv4.frags_hdr;
	/* Fetch the table pointer before the header goes away. */
	struct ctl_table *table = hdr->ctl_table_arg;

	unregister_net_sysctl_table(hdr);
	kfree(table);
}
763

764
/* Register the global fragment sysctls (ip4_frags_ctl_table) for init_net
 * under "net/ipv4".  The return value of register_net_sysctl() is not
 * checked.
 */
static void __init ip4_frags_ctl_register(void)
{
	register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
}
768
#else
769
static int ip4_frags_ns_ctl_register(struct net *net)
770 771 772
{
	return 0;
}
773

774
static void ip4_frags_ns_ctl_unregister(struct net *net)
775 776
{
}
777

778
static void __init ip4_frags_ctl_register(void)
779 780
{
}
781 782
#endif

783
/* Per-namespace init (pernet .init hook): set the default fragment limits,
 * initialise the namespace's fragment state and register its sysctls.
 *
 * Returns 0 on success or the negative error from inet_frags_init_net() /
 * ip4_frags_ns_ctl_register(); in the latter case the fragment state is
 * torn down again before returning.
 */
static int __net_init ipv4_frags_init_net(struct net *net)
{
	int res;

	/* Fragment cache limits.
	 *
	 * The accounting tries to reflect real memory usage by counting
	 * both the queue structure itself (inet_frag_queue, i.e.
	 * ipv4:ipq / ipv6:frag_queue) and the truesize of every SKB.
	 *
	 * A 64K fragment consumes 129736 bytes (44*2944)+200
	 * (1500 truesize == 2944, sizeof(struct ipq) == 200)
	 *
	 * Up to 4MB is committed at one time; once that limit is crossed
	 * the cache is pruned down to 3MB, which leaves room for roughly
	 * eight big 64K datagrams (8 x 128k).
	 */
	net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
	net->ipv4.frags.low_thresh  = 3 * 1024 * 1024;

	/*
	 * Important NOTE! A fragment queue must be destroyed before MSL
	 * expires.  RFC791 is wrong in proposing to prolong the timer by
	 * TTL on every fragment arrival.
	 */
	net->ipv4.frags.timeout = IP_FRAG_TIME;

	net->ipv4.frags.max_dist = 64;
	net->ipv4.frags.f = &ip4_frags;

	res = inet_frags_init_net(&net->ipv4.frags);
	if (res < 0)
		return res;

	res = ip4_frags_ns_ctl_register(net);
	if (res >= 0)
		return res;

	/* Sysctl registration failed: roll back the fragment state. */
	inet_frags_exit_net(&net->ipv4.frags);
	return res;
}

822
/* Per-namespace teardown (pernet .exit hook): the reverse of
 * ipv4_frags_init_net() - sysctls are unregistered first, then the
 * namespace's fragment state is released.
 */
static void __net_exit ipv4_frags_exit_net(struct net *net)
{
	ip4_frags_ns_ctl_unregister(net);
	inet_frags_exit_net(&net->ipv4.frags);
}

/* Per-network-namespace init/exit hooks for IPv4 fragment state; registered
 * by ipfrag_init() through register_pernet_subsys().
 */
static struct pernet_operations ip4_frags_ops = {
	.init = ipv4_frags_init_net,
	.exit = ipv4_frags_exit_net,
};

833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865

/* rhashtable .hashfn: hash a lookup key.
 *
 * @data points at a struct frag_v4_compare_key, which is hashed as an array
 * of u32 words with jhash2().  @len is ignored; the key size is fixed.
 */
static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
{
	const u32 nwords = sizeof(struct frag_v4_compare_key) / sizeof(u32);

	return jhash2(data, nwords, seed);
}

/* rhashtable .obj_hashfn: hash a stored object.
 *
 * @data points at a struct inet_frag_queue; its embedded v4 key is hashed
 * exactly like ip4_key_hashfn() hashes a bare key.  @len is ignored.
 */
static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
{
	const struct inet_frag_queue *fq = data;
	const u32 *words = (const u32 *)&fq->key.v4;
	const u32 nwords = sizeof(struct frag_v4_compare_key) / sizeof(u32);

	return jhash2(words, nwords, seed);
}

/* rhashtable .obj_cmpfn: compare the lookup key in @arg with the key stored
 * in the queue at @ptr.  Returns 0 when the keys are byte-for-byte equal
 * and 1 otherwise.
 */
static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
	const struct inet_frag_queue *fq = ptr;
	const struct frag_v4_compare_key *wanted = arg->key;

	return memcmp(&fq->key, wanted, sizeof(*wanted)) != 0;
}

/* Hash table parameters for IPv4 fragment queues; copied into
 * ip4_frags.rhash_params by ipfrag_init().  Objects are struct
 * inet_frag_queue, linked through their node member and keyed by a
 * struct frag_v4_compare_key stored in their key member.
 */
static const struct rhashtable_params ip4_rhash_params = {
	.head_offset		= offsetof(struct inet_frag_queue, node),
	.key_offset		= offsetof(struct inet_frag_queue, key),
	.key_len		= sizeof(struct frag_v4_compare_key),
	.hashfn			= ip4_key_hashfn,
	.obj_hashfn		= ip4_obj_hashfn,
	.obj_cmpfn		= ip4_obj_cmpfn,
	.automatic_shrinking	= true,
};

866
/* Boot-time setup of IPv4 defragmentation: describe struct ipq objects to
 * the generic fragment code, initialise it (panicking on failure), then
 * register the global sysctls and the per-namespace operations.
 */
void __init ipfrag_init(void)
{
	/* Object size and lifetime callbacks for struct ipq. */
	ip4_frags.qsize = sizeof(struct ipq);
	ip4_frags.constructor = ip4_frag_init;
	ip4_frags.destructor = ip4_frag_free;
	ip4_frags.frag_expire = ip_expire;

	/* Cache name and hash table layout. */
	ip4_frags.frags_cache_name = ip_frag_cache_name;
	ip4_frags.rhash_params = ip4_rhash_params;

	if (inet_frags_init(&ip4_frags) != 0)
		panic("IP: failed to allocate ip4_frags cache\n");

	ip4_frags_ctl_register();
	register_pernet_subsys(&ip4_frags_ops);
}