/*
 * inet fragments management
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * 		Authors:	Pavel Emelyanov <xemul@openvz.org>
 *				Started as consolidation of ipv4/ip_fragment.c,
 *				ipv6/reassembly.c, and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>

#define INETFRAGS_EVICT_BUCKETS   128
#define INETFRAGS_EVICT_MAX	  512

/* don't rebuild inetfrag table with new secret more often than this */
#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);

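/*
 * Map a queue to its hash bucket.  INETFRAGS_HASHSZ is a power of two, so
 * the mask below is a cheap modulo; the per-family hashfn() callbacks are
 * expected to fold in the secret f->rnd, which is why a secret rebuild has
 * to rehash every queue.
 */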
static unsigned int
inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
{
	return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
}

61 62 63 64 65 66 67
static bool inet_frag_may_rebuild(struct inet_frags *f)
{
	return time_after(jiffies,
	       f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
}

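/*
 * Pick a new hash secret and move every queue to the bucket it now hashes
 * to.  This runs from the eviction worker under the rnd_seqlock write side,
 * so concurrent lookups notice the rehash and retry.
 */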
static void inet_frag_secret_rebuild(struct inet_frags *f)
{
	int i;

	write_seqlock_bh(&f->rnd_seqlock);

	if (!inet_frag_may_rebuild(f))
		goto out;

	get_random_bytes(&f->rnd, sizeof(u32));

	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb;
		struct inet_frag_queue *q;
		struct hlist_node *n;

		hb = &f->hash[i];
		spin_lock(&hb->chain_lock);

		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
			unsigned int hval = inet_frag_hashfn(f, q);

			if (hval != i) {
				struct inet_frag_bucket *hb_dest;

				hlist_del(&q->list);

				/* Relink to new hash chain. */
				hb_dest = &f->hash[hval];

				/* This is the only place where we take
				 * another chain_lock while already holding
				 * one.  As this will not run concurrently,
				 * we cannot deadlock on the hb_dest lock
				 * below; if it is already locked, it will be
				 * released soon, since the other holder
				 * cannot be waiting for the hb lock that
				 * we've taken above.
				 */
				spin_lock_nested(&hb_dest->chain_lock,
						 SINGLE_DEPTH_NESTING);
				hlist_add_head(&q->list, &hb_dest->chain);
				spin_unlock(&hb_dest->chain_lock);
			}
		}
		spin_unlock(&hb->chain_lock);
	}

	f->rebuild = false;
	f->last_rebuild_jiffies = jiffies;
out:
	write_sequnlock_bh(&f->rnd_seqlock);
}

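/*
 * Eviction: a queue is evicted once its namespace is over low_thresh, or
 * unconditionally during namespace teardown (low_thresh == 0).  Evicted
 * queues are expired directly rather than waiting for their timers.
 */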
static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
{
	return q->net->low_thresh == 0 ||
	       frag_mem_limit(q->net) >= q->net->low_thresh;
}

static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
	struct inet_frag_queue *fq;
	struct hlist_node *n;
	unsigned int evicted = 0;
	HLIST_HEAD(expired);

evict_again:
	spin_lock(&hb->chain_lock);

	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
		if (!inet_fragq_should_evict(fq))
			continue;

		if (!del_timer(&fq->timer)) {
			/* q expiring right now thus increment its refcount so
			 * it won't be freed under us and wait until the timer
			 * has finished executing then destroy it
			 */
			atomic_inc(&fq->refcnt);
			spin_unlock(&hb->chain_lock);
			del_timer_sync(&fq->timer);
			inet_frag_put(fq, f);
			goto evict_again;
		}

		fq->flags |= INET_FRAG_EVICTED;
		hlist_del(&fq->list);
		hlist_add_head(&fq->list, &expired);
		++evicted;
	}

	spin_unlock(&hb->chain_lock);

	hlist_for_each_entry_safe(fq, n, &expired, list)
		f->frag_expire((unsigned long) fq);

	return evicted;
}

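/*
 * Deferred eviction: scan up to INETFRAGS_EVICT_BUCKETS buckets per run,
 * stop early once more than INETFRAGS_EVICT_MAX queues have been evicted,
 * remember where to resume, then rebuild the hash secret if one was
 * requested and the rate limit allows it.
 */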
static void inet_frag_worker(struct work_struct *work)
{
	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
	unsigned int i, evicted = 0;
	struct inet_frags *f;

	f = container_of(work, struct inet_frags, frags_work);

	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);

	local_bh_disable();

	for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
		evicted += inet_evict_bucket(f, &f->hash[i]);
		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
		if (evicted > INETFRAGS_EVICT_MAX)
			break;
	}

	f->next_bucket = i;

	local_bh_enable();

	if (f->rebuild && inet_frag_may_rebuild(f))
		inet_frag_secret_rebuild(f);
}

static void inet_frag_schedule_worker(struct inet_frags *f)
{
	if (unlikely(!work_pending(&f->frags_work)))
		schedule_work(&f->frags_work);
}

int inet_frags_init(struct inet_frags *f)
{
	int i;

	INIT_WORK(&f->frags_work, inet_frag_worker);

	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb = &f->hash[i];

		spin_lock_init(&hb->chain_lock);
		INIT_HLIST_HEAD(&hb->chain);
	}

	seqlock_init(&f->rnd_seqlock);
	f->last_rebuild_jiffies = 0;
	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
					    NULL);
	if (!f->frags_cachep)
		return -ENOMEM;

	return 0;
}
EXPORT_SYMBOL(inet_frags_init);

void inet_frags_init_net(struct netns_frags *nf)
{
	init_frag_mem_limit(nf);
}
EXPORT_SYMBOL(inet_frags_init_net);

void inet_frags_fini(struct inet_frags *f)
{
	cancel_work_sync(&f->frags_work);
	kmem_cache_destroy(f->frags_cachep);
}
EXPORT_SYMBOL(inet_frags_fini);

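/*
 * Namespace teardown: zero low_thresh so every queue becomes eligible for
 * eviction, then sweep all buckets.  If a secret rebuild moved queues
 * between buckets during the sweep, the seqlock retry catches it and the
 * sweep is restarted.
 */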
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
	unsigned int seq;
	int i;

	nf->low_thresh = 0;
	local_bh_disable();

evict_again:
	seq = read_seqbegin(&f->rnd_seqlock);

	for (i = 0; i < INETFRAGS_HASHSZ; i++)
		inet_evict_bucket(f, &f->hash[i]);

	if (read_seqretry(&f->rnd_seqlock, seq))
		goto evict_again;

	local_bh_enable();

	percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);

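/*
 * Return the bucket @fq currently hashes to, with its chain_lock held.
 * The bucket depends on f->rnd, so retry if a secret rebuild ran between
 * computing the hash and taking the lock.
 */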
static struct inet_frag_bucket *
get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
__acquires(hb->chain_lock)
{
	struct inet_frag_bucket *hb;
	unsigned int seq, hash;

 restart:
	seq = read_seqbegin(&f->rnd_seqlock);

	hash = inet_frag_hashfn(f, fq);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	if (read_seqretry(&f->rnd_seqlock, seq)) {
		spin_unlock(&hb->chain_lock);
		goto restart;
	}

	return hb;
}

static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
	struct inet_frag_bucket *hb;

	hb = get_frag_bucket_locked(fq, f);
	if (!(fq->flags & INET_FRAG_EVICTED))
		hlist_del(&fq->list);
	spin_unlock(&hb->chain_lock);
}

void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
{
	if (del_timer(&fq->timer))
		atomic_dec(&fq->refcnt);

	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		fq_unlink(fq, f);
		atomic_dec(&fq->refcnt);
		fq->flags |= INET_FRAG_COMPLETE;
	}
}
EXPORT_SYMBOL(inet_frag_kill);

static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
				  struct sk_buff *skb)
{
	if (f->skb_free)
		f->skb_free(skb);
	kfree_skb(skb);
}

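/*
 * Final teardown, called once the last reference is dropped: free all
 * queued fragments, return the accounted memory to the namespace and give
 * the queue back to the per-family kmem cache.
 */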
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
{
	struct sk_buff *fp;
	struct netns_frags *nf;
	unsigned int sum, sum_truesize = 0;

	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	fp = q->fragments;
	nf = q->net;
	while (fp) {
		struct sk_buff *xp = fp->next;

		sum_truesize += fp->truesize;
		frag_kfree_skb(nf, f, fp);
		fp = xp;
	}
	sum = sum_truesize + f->qsize;
	sub_frag_mem_limit(q, sum);

	if (f->destructor)
		f->destructor(q);
	kmem_cache_free(f->frags_cachep, q);
}
EXPORT_SYMBOL(inet_frag_destroy);

static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
						struct inet_frag_queue *qp_in,
						struct inet_frags *f,
						void *arg)
{
	struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
	struct inet_frag_queue *qp;

#ifdef CONFIG_SMP
	/* On SMP we have to recheck the hash table, because such an
	 * entry could have been created on another CPU before we
	 * acquired the hash bucket lock.
	 */
	hlist_for_each_entry(qp, &hb->chain, list) {
		if (qp->net == nf && f->match(qp, arg)) {
			atomic_inc(&qp->refcnt);
			spin_unlock(&hb->chain_lock);
			qp_in->flags |= INET_FRAG_COMPLETE;
			inet_frag_put(qp_in, f);
			return qp;
		}
	}
#endif
	qp = qp_in;
	/* Take one reference for the pending expiry timer ... */
	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
		atomic_inc(&qp->refcnt);

	/* ... and one for the hash chain. */
	atomic_inc(&qp->refcnt);
	hlist_add_head(&qp->list, &hb->chain);

	spin_unlock(&hb->chain_lock);

	return qp;
}

static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
					       struct inet_frags *f,
					       void *arg)
{
	struct inet_frag_queue *q;

	if (frag_mem_limit(nf) > nf->high_thresh) {
		inet_frag_schedule_worker(f);
		return NULL;
	}

	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
	if (q == NULL)
		return NULL;

	q->net = nf;
	f->constructor(q, arg);
	add_frag_mem_limit(q, f->qsize);

	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
	spin_lock_init(&q->lock);
	atomic_set(&q->refcnt, 1);

	return q;
}

static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
						struct inet_frags *f,
						void *arg)
{
	struct inet_frag_queue *q;

	q = inet_frag_alloc(nf, f, arg);
	if (q == NULL)
		return NULL;

	return inet_frag_intern(nf, q, f, arg);
}

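/*
 * Look up, and if necessary create, the reassembly queue matching @key.
 * A chain longer than INETFRAGS_MAXDEPTH is treated as a likely
 * hash-collision attack: no new queue is added, the lookup fails with
 * -ENOBUFS and a rebuild of the hash secret is requested instead.
 */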
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
				       struct inet_frags *f, void *key,
				       unsigned int hash)
{
	struct inet_frag_bucket *hb;
	struct inet_frag_queue *q;
	int depth = 0;

	if (frag_mem_limit(nf) > nf->low_thresh)
		inet_frag_schedule_worker(f);

	hash &= (INETFRAGS_HASHSZ - 1);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	hlist_for_each_entry(q, &hb->chain, list) {
		if (q->net == nf && f->match(q, key)) {
			atomic_inc(&q->refcnt);
			spin_unlock(&hb->chain_lock);
			return q;
		}
		depth++;
	}
	spin_unlock(&hb->chain_lock);

	if (depth <= INETFRAGS_MAXDEPTH)
		return inet_frag_create(nf, f, key);

	if (inet_frag_may_rebuild(f)) {
		if (!f->rebuild)
			f->rebuild = true;
		inet_frag_schedule_worker(f);
	}

	return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);

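/*
 * Companion helper for inet_frag_find(): callers pass the returned pointer
 * here to emit a rate-limited warning when the lookup failed with -ENOBUFS
 * because a hash chain grew past INETFRAGS_MAXDEPTH.
 */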
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
				   const char *prefix)
{
	static const char msg[] = "inet_frag_find: Fragment hash bucket"
		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
		". Dropping fragment.\n";

	if (PTR_ERR(q) == -ENOBUFS)
		LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg);
}
EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);