/*
 * inet fragments management
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * 		Authors:	Pavel Emelyanov <xemul@openvz.org>
 *				Started as consolidation of ipv4/ip_fragment.c,
 *				ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>

#define INETFRAGS_EVICT_BUCKETS   128
#define INETFRAGS_EVICT_MAX	  512

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements.
 * Value: 0xff if the frame should be dropped.
 *        0 or INET_ECN_CE value, to be ORed into the final iph->tos field.
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);
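
/* Example (illustrative sketch, not part of this file): a reassembler
 * accumulates one IPFRAG_ECN_* bit per received fragment (e.g. via
 * ip4_frag_ecn() in ipv4/ip_fragment.c) and consults the table once the
 * queue is complete; 0xff means the reassembled frame must be dropped:
 *
 *	q->ecn |= ip4_frag_ecn(iph->tos);
 *	...
 *	ecn = ip_frag_ecn_table[q->ecn];
 *	if (unlikely(ecn == 0xff))
 *		goto drop;
 *	iph->tos |= ecn;
 */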
static unsigned int
inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
{
	return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
}

static void inet_frag_secret_rebuild(unsigned long dummy)
{
	struct inet_frags *f = (struct inet_frags *)dummy;
	unsigned long now = jiffies;
	int i;

	/* Per bucket lock NOT needed here, due to write lock protection */
	write_lock(&f->lock);

	get_random_bytes(&f->rnd, sizeof(u32));
	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb;
		struct inet_frag_queue *q;
		struct hlist_node *n;

		hb = &f->hash[i];
		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
			unsigned int hval = inet_frag_hashfn(f, q);

			if (hval != i) {
				struct inet_frag_bucket *hb_dest;

				hlist_del(&q->list);

				/* Relink to new hash chain. */
				hb_dest = &f->hash[hval];
				hlist_add_head(&q->list, &hb_dest->chain);
			}
		}
	}
	write_unlock(&f->lock);

	mod_timer(&f->secret_timer, now + f->secret_interval);
}

static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
{
	return q->net->low_thresh == 0 ||
	       frag_mem_limit(q->net) >= q->net->low_thresh;
}

static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
	struct inet_frag_queue *fq;
	struct hlist_node *n;
	unsigned int evicted = 0;
	HLIST_HEAD(expired);

evict_again:
	spin_lock(&hb->chain_lock);

	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
		if (!inet_fragq_should_evict(fq))
			continue;

		if (!del_timer(&fq->timer)) {
			/* q is expiring right now: bump its refcount so it
			 * won't be freed under us, wait until the timer has
			 * finished executing, then destroy it.
			 */
			atomic_inc(&fq->refcnt);
			spin_unlock(&hb->chain_lock);
			del_timer_sync(&fq->timer);
			WARN_ON(atomic_read(&fq->refcnt) != 1);
			inet_frag_put(fq, f);
			goto evict_again;
		}

		/* suppress xmit of (icmp) error packet */
		fq->last_in &= ~INET_FRAG_FIRST_IN;
		fq->last_in |= INET_FRAG_EVICTED;
		hlist_del(&fq->list);
		hlist_add_head(&fq->list, &expired);
		++evicted;
	}

	spin_unlock(&hb->chain_lock);

	hlist_for_each_entry_safe(fq, n, &expired, list)
		f->frag_expire((unsigned long) fq);

	return evicted;
}

static void inet_frag_worker(struct work_struct *work)
{
	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
	unsigned int i, evicted = 0;
	struct inet_frags *f;

	f = container_of(work, struct inet_frags, frags_work);

	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);

	read_lock_bh(&f->lock);

	for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
		evicted += inet_evict_bucket(f, &f->hash[i]);
		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
		if (evicted > INETFRAGS_EVICT_MAX)
			break;
	}

	f->next_bucket = i;

	read_unlock_bh(&f->lock);
}

static void inet_frag_schedule_worker(struct inet_frags *f)
{
	if (unlikely(!work_pending(&f->frags_work)))
		schedule_work(&f->frags_work);
}

void inet_frags_init(struct inet_frags *f)
{
	int i;

	INIT_WORK(&f->frags_work, inet_frag_worker);

	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb = &f->hash[i];

		spin_lock_init(&hb->chain_lock);
		INIT_HLIST_HEAD(&hb->chain);
	}
	rwlock_init(&f->lock);

	setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
			(unsigned long)f);
	f->secret_timer.expires = jiffies + f->secret_interval;
	add_timer(&f->secret_timer);
}
EXPORT_SYMBOL(inet_frags_init);
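
/* Illustrative sketch (an assumption, modelled on ipfrag_init() in
 * ipv4/ip_fragment.c; all my_* names are hypothetical): a protocol
 * registers its reassembly flavour by filling in a struct inet_frags
 * (hash/match callbacks, per-queue constructor/destructor, queue size,
 * expiry handler, secret rebuild interval) and then calling
 * inet_frags_init() once at boot or module-init time:
 *
 *	my_frags.hashfn		 = my_hashfn;
 *	my_frags.constructor	 = my_constructor;
 *	my_frags.destructor	 = my_destructor;
 *	my_frags.match		 = my_match;
 *	my_frags.qsize		 = sizeof(struct my_frag_queue);
 *	my_frags.frag_expire	 = my_expire;
 *	my_frags.secret_interval = 10 * 60 * HZ;
 *	inet_frags_init(&my_frags);
 */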

void inet_frags_init_net(struct netns_frags *nf)
{
	init_frag_mem_limit(nf);
}
EXPORT_SYMBOL(inet_frags_init_net);

void inet_frags_fini(struct inet_frags *f)
{
	del_timer(&f->secret_timer);
	cancel_work_sync(&f->frags_work);
}
EXPORT_SYMBOL(inet_frags_fini);
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
	int i;

	nf->low_thresh = 0;

	read_lock_bh(&f->lock);

	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
		inet_evict_bucket(f, &f->hash[i]);

	read_unlock_bh(&f->lock);

	percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);

static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
	struct inet_frag_bucket *hb;
	unsigned int hash;

	read_lock(&f->lock);
	hash = inet_frag_hashfn(f, fq);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	hlist_del(&fq->list);
	spin_unlock(&hb->chain_lock);

	read_unlock(&f->lock);
}

void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
{
	if (del_timer(&fq->timer))
		atomic_dec(&fq->refcnt);

	if (!(fq->last_in & INET_FRAG_COMPLETE)) {
		fq_unlink(fq, f);
		atomic_dec(&fq->refcnt);
		fq->last_in |= INET_FRAG_COMPLETE;
	}
}
EXPORT_SYMBOL(inet_frag_kill);
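
/* Illustrative sketch (an assumption, modelled on ip_expire() in
 * ipv4/ip_fragment.c; the my_* and send_* names are hypothetical): a
 * protocol's frag_expire timer handler typically grabs the queue lock,
 * bails out if the queue is already complete, kills it, and only then
 * reports the timeout; the INET_FRAG_EVICTED flag set by the eviction
 * worker above suppresses the ICMP-style error for queues that were
 * dropped under memory pressure:
 *
 *	spin_lock(&q->lock);
 *	if (q->last_in & INET_FRAG_COMPLETE)
 *		goto out;
 *	inet_frag_kill(q, &my_frags);
 *	if (!(q->last_in & INET_FRAG_EVICTED))
 *		send_reassembly_timeout_error(q);
 * out:
 *	spin_unlock(&q->lock);
 *	inet_frag_put(q, &my_frags);
 */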

static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
		struct sk_buff *skb)
{
	if (f->skb_free)
		f->skb_free(skb);
	kfree_skb(skb);
}

void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
{
	struct sk_buff *fp;
	struct netns_frags *nf;
	unsigned int sum, sum_truesize = 0;

	WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	fp = q->fragments;
	nf = q->net;
	while (fp) {
		struct sk_buff *xp = fp->next;

		sum_truesize += fp->truesize;
		frag_kfree_skb(nf, f, fp);
		fp = xp;
	}
	sum = sum_truesize + f->qsize;
	sub_frag_mem_limit(q, sum);

	if (f->destructor)
		f->destructor(q);
	kfree(q);
}
EXPORT_SYMBOL(inet_frag_destroy);
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
		struct inet_frag_queue *qp_in, struct inet_frags *f,
		void *arg)
{
	struct inet_frag_bucket *hb;
	struct inet_frag_queue *qp;
	unsigned int hash;

	read_lock(&f->lock); /* Protects against hash rebuild */
	/*
	 * While we were without the lock another CPU could have updated
	 * the rnd seed, so we need to re-calculate the hash chain.
	 * Fortunately, qp_in can be used to get one.
	 */
	hash = inet_frag_hashfn(f, qp_in);
	hb = &f->hash[hash];
	spin_lock(&hb->chain_lock);

#ifdef CONFIG_SMP
	/* On SMP we have to recheck the hash table, because such an
	 * entry could have been created on another CPU while we had
	 * released the hash bucket lock.
	 */
	hlist_for_each_entry(qp, &hb->chain, list) {
		if (qp->net == nf && f->match(qp, arg)) {
			atomic_inc(&qp->refcnt);
			spin_unlock(&hb->chain_lock);
			read_unlock(&f->lock);
			qp_in->last_in |= INET_FRAG_COMPLETE;
			inet_frag_put(qp_in, f);
			return qp;
		}
	}
#endif
	qp = qp_in;
	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
		atomic_inc(&qp->refcnt);

	atomic_inc(&qp->refcnt);
	hlist_add_head(&qp->list, &hb->chain);

	spin_unlock(&hb->chain_lock);
	read_unlock(&f->lock);

	return qp;
}
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	struct inet_frag_queue *q;

	if (frag_mem_limit(nf) > nf->high_thresh) {
		inet_frag_schedule_worker(f);
		return NULL;
	}

	q = kzalloc(f->qsize, GFP_ATOMIC);
	if (q == NULL)
		return NULL;

	q->net = nf;
	f->constructor(q, arg);
	add_frag_mem_limit(q, f->qsize);

	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
	spin_lock_init(&q->lock);
	atomic_set(&q->refcnt, 1);

	return q;
}
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	struct inet_frag_queue *q;

	q = inet_frag_alloc(nf, f, arg);
	if (q == NULL)
		return NULL;

	return inet_frag_intern(nf, q, f, arg);
}
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
		struct inet_frags *f, void *key, unsigned int hash)
	__releases(&f->lock)
{
	struct inet_frag_bucket *hb;
	struct inet_frag_queue *q;
	int depth = 0;

	if (frag_mem_limit(nf) > nf->low_thresh)
		inet_frag_schedule_worker(f);

	hash &= (INETFRAGS_HASHSZ - 1);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	hlist_for_each_entry(q, &hb->chain, list) {
		if (q->net == nf && f->match(q, key)) {
			atomic_inc(&q->refcnt);
			spin_unlock(&hb->chain_lock);
			read_unlock(&f->lock);
			return q;
		}
		depth++;
	}
	spin_unlock(&hb->chain_lock);
	read_unlock(&f->lock);

	if (depth <= INETFRAGS_MAXDEPTH)
		return inet_frag_create(nf, f, key);
	else
		return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);

void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
				   const char *prefix)
{
	static const char msg[] = "inet_frag_find: Fragment hash bucket"
		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
		". Dropping fragment.\n";

	if (PTR_ERR(q) == -ENOBUFS)
		LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg);
}
EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
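
/* Illustrative sketch (an assumption, modelled on ip_find() in
 * ipv4/ip_fragment.c; the my_* names are hypothetical): a lookup caller
 * hashes its key while holding f->lock for reading, which inet_frag_find()
 * releases before returning (note the __releases() annotation above), and
 * must handle both a NULL allocation failure and the ERR_PTR(-ENOBUFS)
 * overflow case:
 *
 *	read_lock(&my_frags.lock);
 *	hash = my_hash_key(&key);
 *	q = inet_frag_find(&net->my_netns_frags, &my_frags, &key, hash);
 *	if (IS_ERR_OR_NULL(q)) {
 *		inet_frag_maybe_warn_overflow(q, pr_fmt());
 *		return NULL;
 *	}
 *	return container_of(q, struct my_frag_queue, q);
 */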