/*
 * inet fragments management
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * 		Authors:	Pavel Emelyanov <xemul@openvz.org>
 *				Started as consolidation of ipv4/ip_fragment.c,
 *				ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>

#define INETFRAGS_EVICT_BUCKETS   128
#define INETFRAGS_EVICT_MAX	  512

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);

static unsigned int
inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
{
	return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
}

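/* Timer callback: pick a new random hash secret and relink every queue
 * into the bucket that the new secret maps it to, then re-arm the timer
 * for the next secret_interval.
 */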
static void inet_frag_secret_rebuild(unsigned long dummy)
{
	struct inet_frags *f = (struct inet_frags *)dummy;
	unsigned long now = jiffies;
	int i;

	/* Per bucket lock NOT needed here, due to write lock protection */
	write_lock(&f->lock);

	get_random_bytes(&f->rnd, sizeof(u32));
	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb;
		struct inet_frag_queue *q;
		struct hlist_node *n;

		hb = &f->hash[i];
		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
			unsigned int hval = inet_frag_hashfn(f, q);

			if (hval != i) {
				struct inet_frag_bucket *hb_dest;

				hlist_del(&q->list);

				/* Relink to new hash chain. */
				hb_dest = &f->hash[hval];
				hlist_add_head(&q->list, &hb_dest->chain);
			}
		}
	}
	write_unlock(&f->lock);

	mod_timer(&f->secret_timer, now + f->secret_interval);
}

static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
{
	return q->net->low_thresh == 0 ||
	       frag_mem_limit(q->net) >= q->net->low_thresh;
}

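/* Evict over-limit queues from one hash bucket: stop their timers, mark
 * them INET_FRAG_EVICTED (and clear FIRST_IN so no ICMP error is sent),
 * collect them on a private list and expire them with the chain lock
 * dropped. Returns the number of queues evicted.
 */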
static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
	struct inet_frag_queue *fq;
	struct hlist_node *n;
	unsigned int evicted = 0;
	HLIST_HEAD(expired);

evict_again:
	spin_lock(&hb->chain_lock);

	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
		if (!inet_fragq_should_evict(fq))
			continue;

		if (!del_timer(&fq->timer)) {
			/* q expiring right now thus increment its refcount so
			 * it won't be freed under us and wait until the timer
			 * has finished executing then destroy it
			 */
			atomic_inc(&fq->refcnt);
			spin_unlock(&hb->chain_lock);
			del_timer_sync(&fq->timer);
			WARN_ON(atomic_read(&fq->refcnt) != 1);
			inet_frag_put(fq, f);
			goto evict_again;
		}

		/* suppress xmit of (icmp) error packet */
		fq->last_in &= ~INET_FRAG_FIRST_IN;
		fq->last_in |= INET_FRAG_EVICTED;
		hlist_del(&fq->list);
		hlist_add_head(&fq->list, &expired);
		++evicted;
	}

	spin_unlock(&hb->chain_lock);

	hlist_for_each_entry_safe(fq, n, &expired, list)
		f->frag_expire((unsigned long) fq);

	return evicted;
}

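/* Deferred eviction: scan up to INETFRAGS_EVICT_BUCKETS buckets, resuming
 * where the previous run stopped, and bail out once more than
 * INETFRAGS_EVICT_MAX queues have been evicted.
 */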
static void inet_frag_worker(struct work_struct *work)
{
	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
	unsigned int i, evicted = 0;
	struct inet_frags *f;

	f = container_of(work, struct inet_frags, frags_work);

	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);

	read_lock_bh(&f->lock);

	for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
		evicted += inet_evict_bucket(f, &f->hash[i]);
		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
		if (evicted > INETFRAGS_EVICT_MAX)
			break;
	}

	f->next_bucket = i;

	read_unlock_bh(&f->lock);
}

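/* Schedule the eviction worker unless it is already pending. */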
static void inet_frag_schedule_worker(struct inet_frags *f)
{
	if (unlikely(!work_pending(&f->frags_work)))
		schedule_work(&f->frags_work);
}

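/* One-time setup of an inet_frags instance: the eviction work, the
 * per-bucket chains and locks, the rwlock guarding hash rebuilds, and the
 * periodic secret-rebuild timer.
 */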
void inet_frags_init(struct inet_frags *f)
{
	int i;

	INIT_WORK(&f->frags_work, inet_frag_worker);

	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb = &f->hash[i];

		spin_lock_init(&hb->chain_lock);
		INIT_HLIST_HEAD(&hb->chain);
	}
	rwlock_init(&f->lock);

	setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
			(unsigned long)f);
	f->secret_timer.expires = jiffies + f->secret_interval;
	add_timer(&f->secret_timer);
}
EXPORT_SYMBOL(inet_frags_init);

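/* Per-namespace setup: fragment memory accounting plus the LRU list and
 * its lock.
 */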
void inet_frags_init_net(struct netns_frags *nf)
{
	init_frag_mem_limit(nf);
	INIT_LIST_HEAD(&nf->lru_list);
	spin_lock_init(&nf->lru_lock);
}
EXPORT_SYMBOL(inet_frags_init_net);

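/* Undo inet_frags_init(): stop the secret-rebuild timer and wait for any
 * pending eviction work to finish.
 */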
void inet_frags_fini(struct inet_frags *f)
{
	del_timer(&f->secret_timer);
	cancel_work_sync(&f->frags_work);
}
EXPORT_SYMBOL(inet_frags_fini);

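/* Per-namespace teardown: with low_thresh forced to 0 every queue becomes
 * evictable, so evicting all buckets empties the namespace before its
 * memory accounting is destroyed.
 */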
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
	int i;

	nf->low_thresh = 0;

	read_lock_bh(&f->lock);

	for (i = 0; i < INETFRAGS_HASHSZ; i++)
		inet_evict_bucket(f, &f->hash[i]);

	read_unlock_bh(&f->lock);

	percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);

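/* Unlink a queue from its hash chain and from the per-namespace LRU. */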
static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
	struct inet_frag_bucket *hb;
	unsigned int hash;

	read_lock(&f->lock);
	hash = inet_frag_hashfn(f, fq);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	hlist_del(&fq->list);
	spin_unlock(&hb->chain_lock);

	read_unlock(&f->lock);
	inet_frag_lru_del(fq);
}

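/* Take a queue out of service: stop its timer, unlink it and mark it
 * INET_FRAG_COMPLETE, dropping the references held by the timer and the
 * hash table. Safe to call more than once.
 */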
void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
{
	if (del_timer(&fq->timer))
		atomic_dec(&fq->refcnt);

	if (!(fq->last_in & INET_FRAG_COMPLETE)) {
		fq_unlink(fq, f);
		atomic_dec(&fq->refcnt);
		fq->last_in |= INET_FRAG_COMPLETE;
	}
}
EXPORT_SYMBOL(inet_frag_kill);

static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
		struct sk_buff *skb)
{
	if (f->skb_free)
		f->skb_free(skb);
	kfree_skb(skb);
}

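/* Final destruction once the queue is complete and unreferenced: free all
 * queued skbs, return their truesize plus f->qsize to the memory
 * accounting, invoke the protocol destructor and free the queue itself.
 */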
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
					int *work)
{
	struct sk_buff *fp;
	struct netns_frags *nf;
	unsigned int sum, sum_truesize = 0;

	WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	fp = q->fragments;
	nf = q->net;
	while (fp) {
		struct sk_buff *xp = fp->next;

		sum_truesize += fp->truesize;
		frag_kfree_skb(nf, f, fp);
		fp = xp;
	}
	sum = sum_truesize + f->qsize;
	if (work)
		*work -= sum;
	sub_frag_mem_limit(q, sum);

	if (f->destructor)
		f->destructor(q);
	kfree(q);
}
EXPORT_SYMBOL(inet_frag_destroy);

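/* Publish a newly allocated queue in the hash table. If another CPU
 * raced us and already inserted a matching queue, drop ours and return
 * the existing one with an extra reference taken.
 */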
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
		struct inet_frag_queue *qp_in, struct inet_frags *f,
		void *arg)
{
	struct inet_frag_bucket *hb;
	struct inet_frag_queue *qp;
	unsigned int hash;

	read_lock(&f->lock); /* Protects against hash rebuild */
	/*
	 * While we stayed without the lock another CPU could have updated
	 * the rnd seed, so we need to re-calculate the hash chain.
	 * Fortunately the qp_in can be used to get one.
	 */
	hash = inet_frag_hashfn(f, qp_in);
	hb = &f->hash[hash];
	spin_lock(&hb->chain_lock);

#ifdef CONFIG_SMP
	/* On SMP we have to recheck the hash table, because such an entry
	 * could have been created on another cpu while we released the
	 * hash bucket lock.
	 */
	hlist_for_each_entry(qp, &hb->chain, list) {
		if (qp->net == nf && f->match(qp, arg)) {
			atomic_inc(&qp->refcnt);
			spin_unlock(&hb->chain_lock);
			read_unlock(&f->lock);
			qp_in->last_in |= INET_FRAG_COMPLETE;
			inet_frag_put(qp_in, f);
			return qp;
		}
	}
#endif
	qp = qp_in;
	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
		atomic_inc(&qp->refcnt);

	atomic_inc(&qp->refcnt);
	hlist_add_head(&qp->list, &hb->chain);
	inet_frag_lru_add(nf, qp);
	spin_unlock(&hb->chain_lock);
	read_unlock(&f->lock);

	return qp;
}

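/* Allocate and initialise a queue, charging f->qsize to the memory
 * accounting; returns NULL (and kicks the eviction worker) if the high
 * threshold has already been crossed.
 */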
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	struct inet_frag_queue *q;

	if (frag_mem_limit(nf) > nf->high_thresh) {
		inet_frag_schedule_worker(f);
		return NULL;
	}

	q = kzalloc(f->qsize, GFP_ATOMIC);
	if (q == NULL)
		return NULL;

	q->net = nf;
	f->constructor(q, arg);
	add_frag_mem_limit(q, f->qsize);

	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
	spin_lock_init(&q->lock);
	atomic_set(&q->refcnt, 1);
	INIT_LIST_HEAD(&q->lru_list);

	return q;
}

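/* Allocate a queue for 'arg' and insert it into the hash table. */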
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	struct inet_frag_queue *q;

	q = inet_frag_alloc(nf, f, arg);
	if (q == NULL)
		return NULL;

	return inet_frag_intern(nf, q, f, arg);
}

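/* Look up the queue matching 'key' in the selected bucket and return it
 * with an extra reference, creating it if the chain is no longer than
 * INETFRAGS_MAXDEPTH; otherwise return ERR_PTR(-ENOBUFS). The caller
 * holds f->lock for reading and it is released here on every path.
 *
 * A rough sketch of a call site (hypothetical names, for illustration
 * only):
 *
 *	read_lock(&my_frags.lock);
 *	q = inet_frag_find(&net->my_frags, &my_frags, &key, hash);
 *	if (IS_ERR_OR_NULL(q)) {
 *		inet_frag_maybe_warn_overflow(q, pr_fmt());
 *		return NULL;
 *	}
 */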
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
		struct inet_frags *f, void *key, unsigned int hash)
	__releases(&f->lock)
{
	struct inet_frag_bucket *hb;
	struct inet_frag_queue *q;
	int depth = 0;

	if (frag_mem_limit(nf) > nf->low_thresh)
		inet_frag_schedule_worker(f);

	hash &= (INETFRAGS_HASHSZ - 1);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	hlist_for_each_entry(q, &hb->chain, list) {
		if (q->net == nf && f->match(q, key)) {
			atomic_inc(&q->refcnt);
			spin_unlock(&hb->chain_lock);
			read_unlock(&f->lock);
			return q;
		}
		depth++;
	}
	spin_unlock(&hb->chain_lock);
	read_unlock(&f->lock);

	if (depth <= INETFRAGS_MAXDEPTH)
		return inet_frag_create(nf, f, key);
	else
		return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);

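/* Rate-limited warning for the ERR_PTR(-ENOBUFS) result of
 * inet_frag_find(), i.e. a hash chain that grew past INETFRAGS_MAXDEPTH.
 */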
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
				   const char *prefix)
{
	static const char msg[] = "inet_frag_find: Fragment hash bucket"
		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
		". Dropping fragment.\n";

	if (PTR_ERR(q) == -ENOBUFS)
		LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg);
}
EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);