/*
 * inet fragments management
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * 		Authors:	Pavel Emelyanov <xemul@openvz.org>
 *				Started as consolidation of ipv4/ip_fragment.c,
 *				ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>

#define INETFRAGS_EVICT_BUCKETS   128
#define INETFRAGS_EVICT_MAX	  512

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);

static unsigned int
inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
{
	return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
}

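/* Timer callback: pick a fresh hash secret and move every queue to the
 * bucket the new secret selects, then re-arm the timer.  Runs under the
 * global write lock, so no per-bucket locking is needed here.
 */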
static void inet_frag_secret_rebuild(unsigned long dummy)
{
	struct inet_frags *f = (struct inet_frags *)dummy;
	unsigned long now = jiffies;
	int i;

	/* Per bucket lock NOT needed here, due to write lock protection */
	write_lock(&f->lock);

	get_random_bytes(&f->rnd, sizeof(u32));
	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb;
		struct inet_frag_queue *q;
		struct hlist_node *n;

		hb = &f->hash[i];
		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
			unsigned int hval = inet_frag_hashfn(f, q);

			if (hval != i) {
				struct inet_frag_bucket *hb_dest;

				hlist_del(&q->list);

				/* Relink to new hash chain. */
				hb_dest = &f->hash[hval];
				hlist_add_head(&q->list, &hb_dest->chain);
			}
		}
	}
	write_unlock(&f->lock);

	mod_timer(&f->secret_timer, now + f->secret_interval);
}

static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
{
	return q->net->low_thresh == 0 ||
	       frag_mem_limit(q->net) >= q->net->low_thresh;
}

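/* Evict from one hash bucket: every queue still at or above the low
 * memory threshold is unlinked onto a private list with its timer
 * stopped, flagged INET_FRAG_EVICTED, and then expired via
 * f->frag_expire() outside the bucket lock.  Returns the number of
 * queues evicted.
 */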
static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
	struct inet_frag_queue *fq;
	struct hlist_node *n;
	unsigned int evicted = 0;
	HLIST_HEAD(expired);

evict_again:
	spin_lock(&hb->chain_lock);

	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
		if (!inet_fragq_should_evict(fq))
			continue;

		if (!del_timer(&fq->timer)) {
			/* fq's timer is already running: take a reference so
			 * the queue cannot be freed under us, wait for the
			 * timer to finish, then drop the reference and rescan
			 * the bucket.
			 */
			atomic_inc(&fq->refcnt);
			spin_unlock(&hb->chain_lock);
			del_timer_sync(&fq->timer);
			WARN_ON(atomic_read(&fq->refcnt) != 1);
			inet_frag_put(fq, f);
			goto evict_again;
		}

		/* suppress xmit of (icmp) error packet */
		fq->last_in &= ~INET_FRAG_FIRST_IN;
		fq->last_in |= INET_FRAG_EVICTED;
		hlist_del(&fq->list);
		hlist_add_head(&fq->list, &expired);
		++evicted;
	}

	spin_unlock(&hb->chain_lock);

	hlist_for_each_entry_safe(fq, n, &expired, list)
		f->frag_expire((unsigned long) fq);

	return evicted;
}

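/* Deferred eviction work: scan up to INETFRAGS_EVICT_BUCKETS buckets per
 * run, resuming at f->next_bucket where the previous run stopped, and
 * bail out early once more than INETFRAGS_EVICT_MAX queues have been
 * evicted.
 */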
static void inet_frag_worker(struct work_struct *work)
{
	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
	unsigned int i, evicted = 0;
	struct inet_frags *f;

	f = container_of(work, struct inet_frags, frags_work);

	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);

	read_lock_bh(&f->lock);

	for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
		evicted += inet_evict_bucket(f, &f->hash[i]);
		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
		if (evicted > INETFRAGS_EVICT_MAX)
			break;
	}

	f->next_bucket = i;

	read_unlock_bh(&f->lock);
}

static void inet_frag_schedule_worker(struct inet_frags *f)
{
	if (unlikely(!work_pending(&f->frags_work)))
		schedule_work(&f->frags_work);
}

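/* Set up the hash table, eviction work and secret-rebuild timer shared
 * by all users of one fragment family (ipv4, ipv6, nf conntrack
 * reassembly).  The caller is expected to have filled in the callbacks
 * and parameters this file relies on; a rough sketch (the my_* names are
 * placeholders, not symbols defined here):
 *
 *	frags.hashfn	  = my_hashfn;		// bucket selection
 *	frags.match	  = my_match;		// key comparison
 *	frags.constructor = my_constructor;	// per-queue init
 *	frags.frag_expire = my_expire;		// timeout/eviction handler
 *	frags.qsize	  = sizeof(struct my_frag_queue);
 *	frags.secret_interval = 10 * 60 * HZ;
 *	inet_frags_init(&frags);
 */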
void inet_frags_init(struct inet_frags *f)
{
	int i;

	INIT_WORK(&f->frags_work, inet_frag_worker);

	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb = &f->hash[i];

		spin_lock_init(&hb->chain_lock);
		INIT_HLIST_HEAD(&hb->chain);
	}
	rwlock_init(&f->lock);

	setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
			(unsigned long)f);
	f->secret_timer.expires = jiffies + f->secret_interval;
	add_timer(&f->secret_timer);
}
EXPORT_SYMBOL(inet_frags_init);

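/* Per-namespace counterpart: reset the queue count, LRU list and memory
 * accounting.  The memory thresholds (nf->high_thresh, nf->low_thresh)
 * and the reassembly timeout are left for the caller to configure.
 */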
void inet_frags_init_net(struct netns_frags *nf)
{
	nf->nqueues = 0;
	init_frag_mem_limit(nf);
	INIT_LIST_HEAD(&nf->lru_list);
	spin_lock_init(&nf->lru_lock);
}
EXPORT_SYMBOL(inet_frags_init_net);

void inet_frags_fini(struct inet_frags *f)
{
	del_timer(&f->secret_timer);
	cancel_work_sync(&f->frags_work);
}
EXPORT_SYMBOL(inet_frags_fini);
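
/* Namespace teardown: drop the low threshold to zero so every queue
 * becomes evictable, force-evict all buckets, then release the
 * per-namespace memory counter.
 */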
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
	int i;

	nf->low_thresh = 0;

	read_lock_bh(&f->lock);

	for (i = 0; i < INETFRAGS_HASHSZ; i++)
		inet_evict_bucket(f, &f->hash[i]);

	read_unlock_bh(&f->lock);

	percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);

static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
	struct inet_frag_bucket *hb;
	unsigned int hash;

	read_lock(&f->lock);
	hash = inet_frag_hashfn(f, fq);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	hlist_del(&fq->list);
	spin_unlock(&hb->chain_lock);

	read_unlock(&f->lock);
	inet_frag_lru_del(fq);
}

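/* Take a queue out of circulation: stop its timer, unlink it from the
 * hash and LRU (dropping the references they held), and mark it
 * INET_FRAG_COMPLETE so this is done only once.
 */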
void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
{
	if (del_timer(&fq->timer))
		atomic_dec(&fq->refcnt);

	if (!(fq->last_in & INET_FRAG_COMPLETE)) {
		fq_unlink(fq, f);
		atomic_dec(&fq->refcnt);
		fq->last_in |= INET_FRAG_COMPLETE;
	}
}
EXPORT_SYMBOL(inet_frag_kill);

static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
		struct sk_buff *skb)
{
	if (f->skb_free)
		f->skb_free(skb);
	kfree_skb(skb);
}

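/* Final teardown once the queue is complete: free every queued skb,
 * return the accounted memory (truesize of all fragments plus the queue
 * structure itself), and free the queue.  An optional *work budget is
 * decremented by the same amount.
 */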
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
					int *work)
{
	struct sk_buff *fp;
	struct netns_frags *nf;
	unsigned int sum, sum_truesize = 0;

	WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	fp = q->fragments;
	nf = q->net;
	while (fp) {
		struct sk_buff *xp = fp->next;

		sum_truesize += fp->truesize;
		frag_kfree_skb(nf, f, fp);
		fp = xp;
	}
	sum = sum_truesize + f->qsize;
	if (work)
		*work -= sum;
	sub_frag_mem_limit(q, sum);

	if (f->destructor)
		f->destructor(q);
	kfree(q);
}
EXPORT_SYMBOL(inet_frag_destroy);

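/* Insert a freshly allocated queue into the hash.  The bucket is
 * recomputed under f->lock because a secret rebuild may have changed
 * f->rnd since the caller hashed the key; on SMP the chain is also
 * rechecked, and if another CPU already inserted an equivalent queue,
 * qp_in is dropped and the existing queue returned instead.
 */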
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
		struct inet_frag_queue *qp_in, struct inet_frags *f,
		void *arg)
{
	struct inet_frag_bucket *hb;
	struct inet_frag_queue *qp;
	unsigned int hash;

	read_lock(&f->lock); /* Protects against hash rebuild */
	/*
	 * While we stayed w/o the lock other CPU could update
	 * the rnd seed, so we need to re-calculate the hash
	 * chain. Fortunately the qp_in can be used to get one.
	 */
	hash = inet_frag_hashfn(f, qp_in);
	hb = &f->hash[hash];
	spin_lock(&hb->chain_lock);

#ifdef CONFIG_SMP
	/* With SMP race we have to recheck hash table, because
	 * such entry could have been created on another cpu while we
	 * released the hash bucket lock.
	 */
	hlist_for_each_entry(qp, &hb->chain, list) {
		if (qp->net == nf && f->match(qp, arg)) {
			atomic_inc(&qp->refcnt);
			spin_unlock(&hb->chain_lock);
			read_unlock(&f->lock);
			qp_in->last_in |= INET_FRAG_COMPLETE;
			inet_frag_put(qp_in, f);
			return qp;
		}
	}
#endif
	qp = qp_in;
	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
		atomic_inc(&qp->refcnt);

	atomic_inc(&qp->refcnt);
	hlist_add_head(&qp->list, &hb->chain);
	inet_frag_lru_add(nf, qp);
	spin_unlock(&hb->chain_lock);
	read_unlock(&f->lock);

	return qp;
}

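/* Allocate and initialise a new queue, charging f->qsize to the
 * namespace.  Returns NULL (after kicking the eviction worker) when the
 * namespace is already above its high memory threshold.
 */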
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	struct inet_frag_queue *q;

	if (frag_mem_limit(nf) > nf->high_thresh) {
		inet_frag_schedule_worker(f);
		return NULL;
	}

	q = kzalloc(f->qsize, GFP_ATOMIC);
	if (q == NULL)
		return NULL;

	q->net = nf;
	f->constructor(q, arg);
	add_frag_mem_limit(q, f->qsize);

	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
	spin_lock_init(&q->lock);
	atomic_set(&q->refcnt, 1);
	INIT_LIST_HEAD(&q->lru_list);

	return q;
}

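/* Allocate a queue for @arg and insert it into the hash; the result is
 * either the new queue or an equivalent one that another CPU inserted
 * first.
 */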
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	struct inet_frag_queue *q;

	q = inet_frag_alloc(nf, f, arg);
	if (q == NULL)
		return NULL;

	return inet_frag_intern(nf, q, f, arg);
}

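/* Look up the queue matching @key in the bucket selected by @hash and
 * take a reference on it.  If nothing matches, a new queue is created,
 * unless the chain has grown beyond INETFRAGS_MAXDEPTH, in which case
 * ERR_PTR(-ENOBUFS) is returned.  Called with f->lock read-held; the
 * lock is released before returning.
 */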
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
		struct inet_frags *f, void *key, unsigned int hash)
	__releases(&f->lock)
{
	struct inet_frag_bucket *hb;
	struct inet_frag_queue *q;
	int depth = 0;

	if (frag_mem_limit(nf) > nf->low_thresh)
		inet_frag_schedule_worker(f);

	hash &= (INETFRAGS_HASHSZ - 1);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	hlist_for_each_entry(q, &hb->chain, list) {
		if (q->net == nf && f->match(q, key)) {
			atomic_inc(&q->refcnt);
			spin_unlock(&hb->chain_lock);
			read_unlock(&f->lock);
			return q;
		}
		depth++;
	}
	spin_unlock(&hb->chain_lock);
	read_unlock(&f->lock);

	if (depth <= INETFRAGS_MAXDEPTH)
		return inet_frag_create(nf, f, key);
	else
		return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);

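/* Rate-limited warning emitted when inet_frag_find() gave up because a
 * hash chain exceeded INETFRAGS_MAXDEPTH.
 */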
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
				   const char *prefix)
{
	static const char msg[] = "inet_frag_find: Fragment hash bucket"
		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
		". Dropping fragment.\n";

	if (PTR_ERR(q) == -ENOBUFS)
		LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg);
}
EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);