/*
 * inet fragments management
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * 		Authors:	Pavel Emelyanov <xemul@openvz.org>
 *				Started as consolidation of ipv4/ip_fragment.c,
 *				ipv6/reassembly, and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>

#define INETFRAGS_EVICT_BUCKETS   128
#define INETFRAGS_EVICT_MAX	  512

/* don't rebuild inetfrag table with new secret more often than this */
#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);
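
/* Usage sketch (not part of this file): a reassembler ORs the IPFRAG_ECN_*
 * bits of every fragment into a per-queue accumulator and consults this
 * table when building the final header.  The names below ('acc', 'iph',
 * 'drop') are illustrative only:
 *
 *	u8 ecn = ip_frag_ecn_table[acc];
 *	if (unlikely(ecn == 0xff))
 *		goto drop;		// invalid ECN combination
 *	iph->tos |= ecn;		// 0 or INET_ECN_CE
 */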

static unsigned int
inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
{
	return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
}

static bool inet_frag_may_rebuild(struct inet_frags *f)
{
	return time_after(jiffies,
	       f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
}

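/* Pick a new hash secret and move every queue to the bucket the new
 * secret maps it to.  Runs under the table write lock, so the per-bucket
 * chain locks are not needed.
 */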
static void inet_frag_secret_rebuild(struct inet_frags *f)
{
	int i;

	/* Per bucket lock NOT needed here, due to write lock protection */
	write_lock_bh(&f->lock);

	if (!inet_frag_may_rebuild(f))
		goto out;

	get_random_bytes(&f->rnd, sizeof(u32));

	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb;
		struct inet_frag_queue *q;
		struct hlist_node *n;

		hb = &f->hash[i];
		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
			unsigned int hval = inet_frag_hashfn(f, q);

			if (hval != i) {
				struct inet_frag_bucket *hb_dest;

				hlist_del(&q->list);

				/* Relink to new hash chain. */
				hb_dest = &f->hash[hval];
				hlist_add_head(&q->list, &hb_dest->chain);
			}
		}
	}

	f->rebuild = false;
	f->last_rebuild_jiffies = jiffies;
out:
	write_unlock_bh(&f->lock);
}

static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
{
	return q->net->low_thresh == 0 ||
	       frag_mem_limit(q->net) >= q->net->low_thresh;
}

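/* Evict the entries of one hash bucket that are still over the low
 * threshold: stop their timers, clear INET_FRAG_FIRST_IN (suppresses the
 * ICMP error normally sent on expiry), flag them INET_FRAG_EVICTED and
 * run the expire callback on each.  Returns the number of queues evicted.
 */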
static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
	struct inet_frag_queue *fq;
	struct hlist_node *n;
	unsigned int evicted = 0;
	HLIST_HEAD(expired);

evict_again:
	spin_lock(&hb->chain_lock);

	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
		if (!inet_fragq_should_evict(fq))
			continue;

		if (!del_timer(&fq->timer)) {
			/* fq is expiring right now: take a reference so it
			 * cannot be freed under us, wait for the timer handler
			 * to finish, then drop the reference (destroying it).
			 */
			atomic_inc(&fq->refcnt);
			spin_unlock(&hb->chain_lock);
			del_timer_sync(&fq->timer);
			WARN_ON(atomic_read(&fq->refcnt) != 1);
			inet_frag_put(fq, f);
			goto evict_again;
		}

		/* suppress xmit of (icmp) error packet */
		fq->last_in &= ~INET_FRAG_FIRST_IN;
		fq->last_in |= INET_FRAG_EVICTED;
		hlist_del(&fq->list);
		hlist_add_head(&fq->list, &expired);
		++evicted;
	}

	spin_unlock(&hb->chain_lock);

	hlist_for_each_entry_safe(fq, n, &expired, list)
		f->frag_expire((unsigned long) fq);

	return evicted;
}

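/* Deferred eviction: scan up to INETFRAGS_EVICT_BUCKETS buckets per run,
 * starting where the previous run stopped, and stop early once more than
 * INETFRAGS_EVICT_MAX queues have been evicted.  Finally rebuild the hash
 * secret if a rebuild was requested and the rate limit allows it.
 */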
static void inet_frag_worker(struct work_struct *work)
{
	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
	unsigned int i, evicted = 0;
	struct inet_frags *f;

	f = container_of(work, struct inet_frags, frags_work);

	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);

	read_lock_bh(&f->lock);

	for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
		evicted += inet_evict_bucket(f, &f->hash[i]);
		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
		if (evicted > INETFRAGS_EVICT_MAX)
			break;
	}

	f->next_bucket = i;

	read_unlock_bh(&f->lock);
	if (f->rebuild && inet_frag_may_rebuild(f))
		inet_frag_secret_rebuild(f);
}

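/* Kick the eviction worker unless a run is already pending. */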
static void inet_frag_schedule_worker(struct inet_frags *f)
{
	if (unlikely(!work_pending(&f->frags_work)))
		schedule_work(&f->frags_work);
}

void inet_frags_init(struct inet_frags *f)
{
	int i;

	INIT_WORK(&f->frags_work, inet_frag_worker);

	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb = &f->hash[i];

		spin_lock_init(&hb->chain_lock);
		INIT_HLIST_HEAD(&hb->chain);
	}
	rwlock_init(&f->lock);
	f->last_rebuild_jiffies = 0;
}
EXPORT_SYMBOL(inet_frags_init);

void inet_frags_init_net(struct netns_frags *nf)
{
	init_frag_mem_limit(nf);
}
EXPORT_SYMBOL(inet_frags_init_net);

void inet_frags_fini(struct inet_frags *f)
{
	cancel_work_sync(&f->frags_work);
}
EXPORT_SYMBOL(inet_frags_fini);

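/* Namespace teardown: drop the low threshold to zero so every queue is
 * evictable, flush all buckets, then destroy the per-netns memory counter.
 */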
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
	int i;

	nf->low_thresh = 0;

	read_lock_bh(&f->lock);

	for (i = 0; i < INETFRAGS_HASHSZ; i++)
		inet_evict_bucket(f, &f->hash[i]);

	read_unlock_bh(&f->lock);

	percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);

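/* Unlink a queue from its hash chain.  The table read lock keeps the
 * bucket layout stable; the per-bucket lock protects the chain itself.
 */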
static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
	struct inet_frag_bucket *hb;
	unsigned int hash;

	read_lock(&f->lock);
	hash = inet_frag_hashfn(f, fq);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	hlist_del(&fq->list);
	spin_unlock(&hb->chain_lock);

	read_unlock(&f->lock);
}

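/* Mark a queue as complete and drop the references held by its timer and
 * by the hash table; the queue is freed once the last user puts it.
 */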
void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
{
	if (del_timer(&fq->timer))
		atomic_dec(&fq->refcnt);

	if (!(fq->last_in & INET_FRAG_COMPLETE)) {
		fq_unlink(fq, f);
		atomic_dec(&fq->refcnt);
		fq->last_in |= INET_FRAG_COMPLETE;
	}
}
EXPORT_SYMBOL(inet_frag_kill);

static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
		struct sk_buff *skb)
{
	if (f->skb_free)
		f->skb_free(skb);
	kfree_skb(skb);
}

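/* Free a completed (killed) queue: release all queued fragments, return
 * the accounted memory to the namespace and call the protocol destructor.
 */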
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
{
	struct sk_buff *fp;
	struct netns_frags *nf;
	unsigned int sum, sum_truesize = 0;

	WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	fp = q->fragments;
	nf = q->net;
	while (fp) {
		struct sk_buff *xp = fp->next;

		sum_truesize += fp->truesize;
		frag_kfree_skb(nf, f, fp);
		fp = xp;
	}
	sum = sum_truesize + f->qsize;
	sub_frag_mem_limit(q, sum);

	if (f->destructor)
		f->destructor(q);
	kfree(q);
}
EXPORT_SYMBOL(inet_frag_destroy);

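/* Publish a freshly allocated queue in the hash table.  If another CPU
 * inserted a matching entry in the meantime, return that one and dispose
 * of the new queue.
 */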
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
		struct inet_frag_queue *qp_in, struct inet_frags *f,
		void *arg)
{
	struct inet_frag_bucket *hb;
	struct inet_frag_queue *qp;
	unsigned int hash;

	read_lock(&f->lock); /* Protects against hash rebuild */
	/*
	 * While we did not hold the lock, another CPU could have updated
	 * the rnd seed, so we need to re-calculate the hash chain.
	 * Fortunately, qp_in can be used to get one.
	 */
	hash = inet_frag_hashfn(f, qp_in);
	hb = &f->hash[hash];
	spin_lock(&hb->chain_lock);

#ifdef CONFIG_SMP
	/* With SMP we have to recheck the hash table, because such an
	 * entry could have been created on another CPU while we did not
	 * hold the hash bucket lock.
	 */
	hlist_for_each_entry(qp, &hb->chain, list) {
		if (qp->net == nf && f->match(qp, arg)) {
			atomic_inc(&qp->refcnt);
			spin_unlock(&hb->chain_lock);
			read_unlock(&f->lock);
			qp_in->last_in |= INET_FRAG_COMPLETE;
			inet_frag_put(qp_in, f);
			return qp;
		}
	}
#endif
	qp = qp_in;
	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
		atomic_inc(&qp->refcnt);

	atomic_inc(&qp->refcnt);
	hlist_add_head(&qp->list, &hb->chain);

	spin_unlock(&hb->chain_lock);
	read_unlock(&f->lock);

	return qp;
}

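/* Allocate and initialise a new queue, or bail out (and poke the eviction
 * worker) if the namespace is already above its high memory threshold.
 */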
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	struct inet_frag_queue *q;

	if (frag_mem_limit(nf) > nf->high_thresh) {
		inet_frag_schedule_worker(f);
		return NULL;
	}

	q = kzalloc(f->qsize, GFP_ATOMIC);
	if (q == NULL)
		return NULL;

	q->net = nf;
	f->constructor(q, arg);
	add_frag_mem_limit(q, f->qsize);

	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
	spin_lock_init(&q->lock);
	atomic_set(&q->refcnt, 1);

	return q;
}

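/* Allocate a queue for @arg and publish it in the hash table. */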
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	struct inet_frag_queue *q;

	q = inet_frag_alloc(nf, f, arg);
	if (q == NULL)
		return NULL;

	return inet_frag_intern(nf, q, f, arg);
}

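/* Look up the queue matching @key in bucket @hash and create one if the
 * chain is no deeper than INETFRAGS_MAXDEPTH; otherwise return
 * ERR_PTR(-ENOBUFS) and request a secret rebuild.  Callers enter with
 * f->lock read-locked and it is released here (see __releases).  A rough,
 * illustrative caller sketch ('frags', 'nf', 'key' and 'hash' stand in
 * for the protocol's own objects):
 *
 *	read_lock(&frags.lock);
 *	q = inet_frag_find(nf, &frags, &key, hash);
 *	if (IS_ERR_OR_NULL(q)) {
 *		inet_frag_maybe_warn_overflow(q, pr_fmt());
 *		return NULL;
 *	}
 */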
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
		struct inet_frags *f, void *key, unsigned int hash)
	__releases(&f->lock)
{
	struct inet_frag_bucket *hb;
	struct inet_frag_queue *q;
	int depth = 0;

	if (frag_mem_limit(nf) > nf->low_thresh)
		inet_frag_schedule_worker(f);

	hash &= (INETFRAGS_HASHSZ - 1);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	hlist_for_each_entry(q, &hb->chain, list) {
		if (q->net == nf && f->match(q, key)) {
			atomic_inc(&q->refcnt);
			spin_unlock(&hb->chain_lock);
			read_unlock(&f->lock);
			return q;
		}
		depth++;
	}
	spin_unlock(&hb->chain_lock);
	read_unlock(&f->lock);

	if (depth <= INETFRAGS_MAXDEPTH)
		return inet_frag_create(nf, f, key);

	if (inet_frag_may_rebuild(f)) {
		f->rebuild = true;
		inet_frag_schedule_worker(f);
	}

	return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);

void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
				   const char *prefix)
{
	static const char msg[] = "inet_frag_find: Fragment hash bucket"
		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
		". Dropping fragment.\n";

	if (PTR_ERR(q) == -ENOBUFS)
		LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg);
}
EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);