cls_u32.c 23.3 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
/*
 * net/sched/cls_u32.c	Ugly (or Universal) 32bit key Packet Classifier.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *	The filters are packed to hash tables of key nodes
 *	with a set of 32bit key/mask pairs at every node.
 *	Nodes reference next level hash tables etc.
 *
 *	This scheme is the best universal classifier I managed to
 *	invent; it is not super-fast, but it is not slow (provided you
 *	program it correctly), and general enough.  And its relative
 *	speed grows as the number of rules becomes larger.
 *
 *	It seems that it represents the best middle point between
 *	speed and manageability both by human and by machine.
 *
 *	It is especially useful for link sharing combined with QoS;
 *	pure RSVP doesn't need such a general approach and can use
 *	much simpler (and faster) schemes, sort of cls_rsvp.c.
 *
 *	JHS: We should remove the CONFIG_NET_CLS_IND from here
 *	eventually when the meta match extension is made available
 *
 *	nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
 */

#include <linux/module.h>
34
#include <linux/slab.h>
L
Linus Torvalds 已提交
35 36 37 38
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
39
#include <linux/percpu.h>
L
Linus Torvalds 已提交
40 41
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
42
#include <linux/bitmap.h>
43
#include <net/netlink.h>
L
Linus Torvalds 已提交
44 45 46
#include <net/act_api.h>
#include <net/pkt_cls.h>

E
Eric Dumazet 已提交
47
struct tc_u_knode {
48
	struct tc_u_knode __rcu	*next;
L
Linus Torvalds 已提交
49
	u32			handle;
50
	struct tc_u_hnode __rcu	*ht_up;
L
Linus Torvalds 已提交
51 52
	struct tcf_exts		exts;
#ifdef CONFIG_NET_CLS_IND
53
	int			ifindex;
L
Linus Torvalds 已提交
54 55 56
#endif
	u8			fshift;
	struct tcf_result	res;
57
	struct tc_u_hnode __rcu	*ht_down;
L
Linus Torvalds 已提交
58
#ifdef CONFIG_CLS_U32_PERF
59
	struct tc_u32_pcnt __percpu *pf;
L
Linus Torvalds 已提交
60 61
#endif
#ifdef CONFIG_CLS_U32_MARK
62 63 64
	u32			val;
	u32			mask;
	u32 __percpu		*pcpu_success;
L
Linus Torvalds 已提交
65
#endif
66 67
	struct tcf_proto	*tp;
	struct rcu_head		rcu;
68 69 70 71
	/* The 'sel' field MUST be the last field in structure to allow for
	 * tc_u32_keys allocated at end of structure.
	 */
	struct tc_u32_sel	sel;
L
Linus Torvalds 已提交
72 73
};

E
Eric Dumazet 已提交
74
struct tc_u_hnode {
75
	struct tc_u_hnode __rcu	*next;
L
Linus Torvalds 已提交
76 77 78 79
	u32			handle;
	u32			prio;
	struct tc_u_common	*tp_c;
	int			refcnt;
E
Eric Dumazet 已提交
80
	unsigned int		divisor;
81 82
	struct tc_u_knode __rcu	*ht[1];
	struct rcu_head		rcu;
L
Linus Torvalds 已提交
83 84
};

E
Eric Dumazet 已提交
85
struct tc_u_common {
86
	struct tc_u_hnode __rcu	*hlist;
L
Linus Torvalds 已提交
87 88 89
	struct Qdisc		*q;
	int			refcnt;
	u32			hgenerator;
90
	struct rcu_head		rcu;
L
Linus Torvalds 已提交
91 92
};

E
Eric Dumazet 已提交
93 94 95
/* Turn a 32-bit key read from the packet into a hash value: the key is
 * masked with sel->hmask, converted to host byte order and shifted
 * right by @fshift.
 */
static inline unsigned int u32_hash_fold(__be32 key,
					 const struct tc_u32_sel *sel,
					 u8 fshift)
{
	return ntohl(key & sel->hmask) >> fshift;
}

102
static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res)
L
Linus Torvalds 已提交
103 104 105
{
	struct {
		struct tc_u_knode *knode;
106
		unsigned int	  off;
L
Linus Torvalds 已提交
107 108
	} stack[TC_U32_MAXDEPTH];

109
	struct tc_u_hnode *ht = rcu_dereference_bh(tp->root);
110
	unsigned int off = skb_network_offset(skb);
L
Linus Torvalds 已提交
111 112 113 114 115 116 117 118 119 120
	struct tc_u_knode *n;
	int sdepth = 0;
	int off2 = 0;
	int sel = 0;
#ifdef CONFIG_CLS_U32_PERF
	int j;
#endif
	int i, r;

next_ht:
121
	n = rcu_dereference_bh(ht->ht[sel]);
L
Linus Torvalds 已提交
122 123 124 125 126 127

next_knode:
	if (n) {
		struct tc_u32_key *key = n->sel.keys;

#ifdef CONFIG_CLS_U32_PERF
128
		__this_cpu_inc(n->pf->rcnt);
L
Linus Torvalds 已提交
129 130 131 132
		j = 0;
#endif

#ifdef CONFIG_CLS_U32_MARK
133
		if ((skb->mark & n->mask) != n->val) {
134
			n = rcu_dereference_bh(n->next);
L
Linus Torvalds 已提交
135 136
			goto next_knode;
		} else {
137
			__this_cpu_inc(*n->pcpu_success);
L
Linus Torvalds 已提交
138 139 140
		}
#endif

E
Eric Dumazet 已提交
141
		for (i = n->sel.nkeys; i > 0; i--, key++) {
S
stephen hemminger 已提交
142
			int toff = off + key->off + (off2 & key->offmask);
S
stephen hemminger 已提交
143
			__be32 *data, hdata;
144

D
Dan Carpenter 已提交
145
			if (skb_headroom(skb) + toff > INT_MAX)
S
stephen hemminger 已提交
146 147
				goto out;

S
stephen hemminger 已提交
148
			data = skb_header_pointer(skb, toff, 4, &hdata);
149 150 151
			if (!data)
				goto out;
			if ((*data ^ key->val) & key->mask) {
152
				n = rcu_dereference_bh(n->next);
L
Linus Torvalds 已提交
153 154 155
				goto next_knode;
			}
#ifdef CONFIG_CLS_U32_PERF
156
			__this_cpu_inc(n->pf->kcnts[j]);
L
Linus Torvalds 已提交
157 158 159
			j++;
#endif
		}
160 161 162

		ht = rcu_dereference_bh(n->ht_down);
		if (!ht) {
L
Linus Torvalds 已提交
163
check_terminal:
E
Eric Dumazet 已提交
164
			if (n->sel.flags & TC_U32_TERMINAL) {
L
Linus Torvalds 已提交
165 166 167

				*res = n->res;
#ifdef CONFIG_NET_CLS_IND
168
				if (!tcf_match_indev(skb, n->ifindex)) {
169
					n = rcu_dereference_bh(n->next);
L
Linus Torvalds 已提交
170 171 172 173
					goto next_knode;
				}
#endif
#ifdef CONFIG_CLS_U32_PERF
174
				__this_cpu_inc(n->pf->rhit);
L
Linus Torvalds 已提交
175 176 177
#endif
				r = tcf_exts_exec(skb, &n->exts, res);
				if (r < 0) {
178
					n = rcu_dereference_bh(n->next);
L
Linus Torvalds 已提交
179 180 181 182 183
					goto next_knode;
				}

				return r;
			}
184
			n = rcu_dereference_bh(n->next);
L
Linus Torvalds 已提交
185 186 187 188 189 190 191
			goto next_knode;
		}

		/* PUSH */
		if (sdepth >= TC_U32_MAXDEPTH)
			goto deadloop;
		stack[sdepth].knode = n;
192
		stack[sdepth].off = off;
L
Linus Torvalds 已提交
193 194
		sdepth++;

195
		ht = rcu_dereference_bh(n->ht_down);
L
Linus Torvalds 已提交
196
		sel = 0;
197
		if (ht->divisor) {
S
stephen hemminger 已提交
198
			__be32 *data, hdata;
199 200

			data = skb_header_pointer(skb, off + n->sel.hoff, 4,
S
stephen hemminger 已提交
201
						  &hdata);
202 203 204 205 206
			if (!data)
				goto out;
			sel = ht->divisor & u32_hash_fold(*data, &n->sel,
							  n->fshift);
		}
E
Eric Dumazet 已提交
207
		if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT)))
L
Linus Torvalds 已提交
208 209
			goto next_ht;

E
Eric Dumazet 已提交
210
		if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {
L
Linus Torvalds 已提交
211
			off2 = n->sel.off + 3;
212
			if (n->sel.flags & TC_U32_VAROFFSET) {
S
stephen hemminger 已提交
213
				__be16 *data, hdata;
214 215 216

				data = skb_header_pointer(skb,
							  off + n->sel.offoff,
S
stephen hemminger 已提交
217
							  2, &hdata);
218 219 220 221 222
				if (!data)
					goto out;
				off2 += ntohs(n->sel.offmask & *data) >>
					n->sel.offshift;
			}
L
Linus Torvalds 已提交
223 224
			off2 &= ~3;
		}
E
Eric Dumazet 已提交
225
		if (n->sel.flags & TC_U32_EAT) {
226
			off += off2;
L
Linus Torvalds 已提交
227 228 229
			off2 = 0;
		}

230
		if (off < skb->len)
L
Linus Torvalds 已提交
231 232 233 234 235 236
			goto next_ht;
	}

	/* POP */
	if (sdepth--) {
		n = stack[sdepth].knode;
237
		ht = rcu_dereference_bh(n->ht_up);
238
		off = stack[sdepth].off;
L
Linus Torvalds 已提交
239 240
		goto check_terminal;
	}
241
out:
L
Linus Torvalds 已提交
242 243 244
	return -1;

deadloop:
245
	net_warn_ratelimited("cls_u32: dead loop\n");
L
Linus Torvalds 已提交
246 247 248
	return -1;
}

E
Eric Dumazet 已提交
249
static struct tc_u_hnode *
L
Linus Torvalds 已提交
250 251 252 253
u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
{
	struct tc_u_hnode *ht;

254 255 256
	for (ht = rtnl_dereference(tp_c->hlist);
	     ht;
	     ht = rtnl_dereference(ht->next))
L
Linus Torvalds 已提交
257 258 259 260 261 262
		if (ht->handle == handle)
			break;

	return ht;
}

E
Eric Dumazet 已提交
263
static struct tc_u_knode *
L
Linus Torvalds 已提交
264 265
u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
{
E
Eric Dumazet 已提交
266
	unsigned int sel;
L
Linus Torvalds 已提交
267 268 269 270 271 272
	struct tc_u_knode *n = NULL;

	sel = TC_U32_HASH(handle);
	if (sel > ht->divisor)
		goto out;

273 274 275
	for (n = rtnl_dereference(ht->ht[sel]);
	     n;
	     n = rtnl_dereference(n->next))
L
Linus Torvalds 已提交
276 277 278 279 280 281 282 283 284 285 286 287 288
		if (n->handle == handle)
			break;
out:
	return n;
}


/* Resolve @handle to an object pointer cast to unsigned long.
 *
 * The table part of the handle selects either the root table of @tp or
 * a table found through u32_lookup_ht(). If the handle has no key part
 * the table itself is returned, otherwise the key node found by
 * u32_lookup_key(). Returns 0 when the table (or key) does not exist.
 */
static unsigned long u32_get(struct tcf_proto *tp, u32 handle)
{
	struct tc_u_common *tp_c = tp->data;
	struct tc_u_hnode *ht;

	if (TC_U32_HTID(handle) != TC_U32_ROOT)
		ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle));
	else
		ht = rtnl_dereference(tp->root);

	if (!ht)
		return 0;

	if (TC_U32_KEY(handle) != 0)
		return (unsigned long)u32_lookup_key(ht, handle);

	return (unsigned long)ht;
}

static u32 gen_new_htid(struct tc_u_common *tp_c)
{
	int i = 0x800;

306 307 308
	/* hgenerator only used inside rtnl lock it is safe to increment
	 * without read _copy_ update semantics
	 */
L
Linus Torvalds 已提交
309 310 311
	do {
		if (++tp_c->hgenerator == 0x7FF)
			tp_c->hgenerator = 1;
E
Eric Dumazet 已提交
312
	} while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
L
Linus Torvalds 已提交
313 314 315 316 317 318 319 320 321

	return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
}

static int u32_init(struct tcf_proto *tp)
{
	struct tc_u_hnode *root_ht;
	struct tc_u_common *tp_c;

322
	tp_c = tp->q->u32_node;
L
Linus Torvalds 已提交
323

324
	root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
L
Linus Torvalds 已提交
325 326 327 328 329 330 331 332 333
	if (root_ht == NULL)
		return -ENOBUFS;

	root_ht->divisor = 0;
	root_ht->refcnt++;
	root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
	root_ht->prio = tp->prio;

	if (tp_c == NULL) {
334
		tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL);
L
Linus Torvalds 已提交
335 336 337 338 339
		if (tp_c == NULL) {
			kfree(root_ht);
			return -ENOBUFS;
		}
		tp_c->q = tp->q;
340
		tp->q->u32_node = tp_c;
L
Linus Torvalds 已提交
341 342 343
	}

	tp_c->refcnt++;
344 345
	RCU_INIT_POINTER(root_ht->next, tp_c->hlist);
	rcu_assign_pointer(tp_c->hlist, root_ht);
L
Linus Torvalds 已提交
346 347
	root_ht->tp_c = tp_c;

348
	rcu_assign_pointer(tp->root, root_ht);
L
Linus Torvalds 已提交
349 350 351 352
	tp->data = tp_c;
	return 0;
}

353 354 355
/* Release key node @n: destroy its extensions, drop the reference it
 * holds on a linked table (if any) and free the node.
 *
 * @free_pf selects whether the per-cpu counters (pf and pcpu_success,
 * where configured) are freed as well; pass false when another copy of
 * the node still uses them. Always returns 0.
 */
static int u32_destroy_key(struct tcf_proto *tp,
			   struct tc_u_knode *n,
			   bool free_pf)
{
	tcf_exts_destroy(&n->exts);

	if (n->ht_down)
		n->ht_down->refcnt--;

#ifdef CONFIG_CLS_U32_PERF
	if (free_pf)
		free_percpu(n->pf);
#endif

#ifdef CONFIG_CLS_U32_MARK
	if (free_pf)
		free_percpu(n->pcpu_success);
#endif

	kfree(n);
	return 0;
}

372 373 374 375 376 377 378 379
/* u32_delete_key_rcu should be called when free'ing a copied
 * version of a tc_u_knode obtained from u32_init_knode(). When
 * copies are obtained from u32_init_knode() the statistics are
 * shared between the old and new copies to allow readers to
 * continue to update the statistics during the copy. To support
 * this the u32_delete_key_rcu variant does not free the percpu
 * statistics.
 */
380 381 382 383
static void u32_delete_key_rcu(struct rcu_head *rcu)
{
	struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);

384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
	u32_destroy_key(key->tp, key, false);
}

/* RCU callback that frees a key node completely, including its per-cpu
 * statistics. Use it only for nodes whose statistics are not shared
 * with a copy returned by u32_init_knode(); for the shared case see
 * u32_delete_key_rcu().
 */
static void u32_delete_key_freepf_rcu(struct rcu_head *rcu)
{
	struct tc_u_knode *dead = container_of(rcu, struct tc_u_knode, rcu);

	u32_destroy_key(dead->tp, dead, true);
}

401
static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
L
Linus Torvalds 已提交
402
{
403 404
	struct tc_u_knode __rcu **kp;
	struct tc_u_knode *pkp;
405
	struct tc_u_hnode *ht = rtnl_dereference(key->ht_up);
L
Linus Torvalds 已提交
406 407

	if (ht) {
408 409 410 411 412
		kp = &ht->ht[TC_U32_HASH(key->handle)];
		for (pkp = rtnl_dereference(*kp); pkp;
		     kp = &pkp->next, pkp = rtnl_dereference(*kp)) {
			if (pkp == key) {
				RCU_INIT_POINTER(*kp, key->next);
L
Linus Torvalds 已提交
413

414
				tcf_unbind_filter(tp, &key->res);
415
				call_rcu(&key->rcu, u32_delete_key_freepf_rcu);
L
Linus Torvalds 已提交
416 417 418 419
				return 0;
			}
		}
	}
420
	WARN_ON(1);
L
Linus Torvalds 已提交
421 422 423
	return 0;
}

424
static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
L
Linus Torvalds 已提交
425 426
{
	struct tc_u_knode *n;
E
Eric Dumazet 已提交
427
	unsigned int h;
L
Linus Torvalds 已提交
428

E
Eric Dumazet 已提交
429
	for (h = 0; h <= ht->divisor; h++) {
430 431 432
		while ((n = rtnl_dereference(ht->ht[h])) != NULL) {
			RCU_INIT_POINTER(ht->ht[h],
					 rtnl_dereference(n->next));
433
			tcf_unbind_filter(tp, &n->res);
434
			call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
L
Linus Torvalds 已提交
435 436 437 438 439 440 441
		}
	}
}

static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
	struct tc_u_common *tp_c = tp->data;
442 443
	struct tc_u_hnode __rcu **hn;
	struct tc_u_hnode *phn;
L
Linus Torvalds 已提交
444

445
	WARN_ON(ht->refcnt);
L
Linus Torvalds 已提交
446

447
	u32_clear_hnode(tp, ht);
L
Linus Torvalds 已提交
448

449 450 451 452 453 454 455
	hn = &tp_c->hlist;
	for (phn = rtnl_dereference(*hn);
	     phn;
	     hn = &phn->next, phn = rtnl_dereference(*hn)) {
		if (phn == ht) {
			RCU_INIT_POINTER(*hn, ht->next);
			kfree_rcu(ht, rcu);
L
Linus Torvalds 已提交
456 457 458 459 460 461 462 463 464 465
			return 0;
		}
	}

	return -ENOENT;
}

static void u32_destroy(struct tcf_proto *tp)
{
	struct tc_u_common *tp_c = tp->data;
466
	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
L
Linus Torvalds 已提交
467

468
	WARN_ON(root_ht == NULL);
L
Linus Torvalds 已提交
469 470 471 472 473 474 475

	if (root_ht && --root_ht->refcnt == 0)
		u32_destroy_hnode(tp, root_ht);

	if (--tp_c->refcnt == 0) {
		struct tc_u_hnode *ht;

476
		tp->q->u32_node = NULL;
L
Linus Torvalds 已提交
477

478 479 480
		for (ht = rtnl_dereference(tp_c->hlist);
		     ht;
		     ht = rtnl_dereference(ht->next)) {
481
			ht->refcnt--;
482
			u32_clear_hnode(tp, ht);
483
		}
L
Linus Torvalds 已提交
484

485 486 487
		while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) {
			RCU_INIT_POINTER(tp_c->hlist, ht->next);
			kfree_rcu(ht, rcu);
488
		}
L
Linus Torvalds 已提交
489 490 491 492 493 494 495 496 497

		kfree(tp_c);
	}

	tp->data = NULL;
}

static int u32_delete(struct tcf_proto *tp, unsigned long arg)
{
E
Eric Dumazet 已提交
498
	struct tc_u_hnode *ht = (struct tc_u_hnode *)arg;
499
	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
L
Linus Torvalds 已提交
500 501 502 503 504

	if (ht == NULL)
		return 0;

	if (TC_U32_KEY(ht->handle))
E
Eric Dumazet 已提交
505
		return u32_delete_key(tp, (struct tc_u_knode *)ht);
L
Linus Torvalds 已提交
506

507
	if (root_ht == ht)
L
Linus Torvalds 已提交
508 509
		return -EINVAL;

510 511
	if (ht->refcnt == 1) {
		ht->refcnt--;
L
Linus Torvalds 已提交
512
		u32_destroy_hnode(tp, ht);
513 514 515
	} else {
		return -EBUSY;
	}
L
Linus Torvalds 已提交
516 517 518 519

	return 0;
}

520
#define NR_U32_NODE (1<<12)
L
Linus Torvalds 已提交
521 522 523
static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
{
	struct tc_u_knode *n;
524 525 526 527 528
	unsigned long i;
	unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long),
					GFP_KERNEL);
	if (!bitmap)
		return handle | 0xFFF;
L
Linus Torvalds 已提交
529

530 531 532
	for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]);
	     n;
	     n = rtnl_dereference(n->next))
533
		set_bit(TC_U32_NODE(n->handle), bitmap);
L
Linus Torvalds 已提交
534

535 536 537 538 539 540
	i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800);
	if (i >= NR_U32_NODE)
		i = find_next_zero_bit(bitmap, NR_U32_NODE, 1);

	kfree(bitmap);
	return handle | (i >= NR_U32_NODE ? 0xFFF : i);
L
Linus Torvalds 已提交
541 542
}

543 544 545 546 547 548 549 550 551 552
/* Netlink attribute policy handed to nla_parse_nested() in u32_change().
 * NOTE(review): for entries without an explicit type, .len is presumably
 * only a minimum payload length - confirm against the nla_policy docs.
 */
static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
	/* plain 32-bit attributes */
	[TCA_U32_CLASSID]	= { .type = NLA_U32 },
	[TCA_U32_DIVISOR]	= { .type = NLA_U32 },
	[TCA_U32_HASH]		= { .type = NLA_U32 },
	[TCA_U32_LINK]		= { .type = NLA_U32 },
	/* interface name */
	[TCA_U32_INDEV]		= { .type = NLA_STRING, .len = IFNAMSIZ },
	/* structures, sized by their fixed part */
	[TCA_U32_SEL]		= { .len = sizeof(struct tc_u32_sel) },
	[TCA_U32_MARK]		= { .len = sizeof(struct tc_u32_mark) },
};

553 554
/* Apply the optional attributes in @tb (link, classid, input device and
 * extensions) to key node @n.
 *
 * @ht is the table whose tc_u_common is searched for the link target.
 * @est and @ovr are passed through to tcf_exts_validate().
 *
 * All attributes are validated before @n is modified, so that a failure
 * leaves the node, and the reference count of any link target, exactly
 * as they were on entry.
 *
 * Returns 0 on success, the error of tcf_exts_validate(), or -EINVAL
 * for a bad link handle, an unknown link target or a failing input
 * device lookup.
 */
static int u32_set_parms(struct net *net, struct tcf_proto *tp,
			 unsigned long base, struct tc_u_hnode *ht,
			 struct tc_u_knode *n, struct nlattr **tb,
			 struct nlattr *est, bool ovr)
{
	struct tc_u_hnode *ht_down = NULL;
	struct tcf_exts e;
	int err;
#ifdef CONFIG_NET_CLS_IND
	int ifindex = 0;
#endif

	tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE);
	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
	if (err < 0)
		return err;

	err = -EINVAL;

	/* Validation phase: nothing below may touch @n. */
#ifdef CONFIG_NET_CLS_IND
	if (tb[TCA_U32_INDEV]) {
		ifindex = tcf_change_indev(net, tb[TCA_U32_INDEV]);
		if (ifindex < 0)
			goto errout;
	}
#endif
	if (tb[TCA_U32_LINK]) {
		u32 handle = nla_get_u32(tb[TCA_U32_LINK]);

		/* a link must name a table, not a key */
		if (TC_U32_KEY(handle))
			goto errout;

		/* handle 0 means "remove the link" and needs no lookup */
		if (handle) {
			ht_down = u32_lookup_ht(ht->tp_c, handle);
			if (ht_down == NULL)
				goto errout;
		}
	}

	/* Commit phase: no failure is possible from here on. */
	if (tb[TCA_U32_LINK]) {
		struct tc_u_hnode *ht_old = rtnl_dereference(n->ht_down);

		if (ht_down)
			ht_down->refcnt++;
		rcu_assign_pointer(n->ht_down, ht_down);

		if (ht_old)
			ht_old->refcnt--;
	}
	if (tb[TCA_U32_CLASSID]) {
		n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]);
		tcf_bind_filter(tp, &n->res, base);
	}
#ifdef CONFIG_NET_CLS_IND
	if (tb[TCA_U32_INDEV])
		n->ifindex = ifindex;
#endif
	tcf_exts_change(tp, &n->exts, &e);

	return 0;
errout:
	tcf_exts_destroy(&e);
	return err;
}

610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685
static void u32_replace_knode(struct tcf_proto *tp,
			      struct tc_u_common *tp_c,
			      struct tc_u_knode *n)
{
	struct tc_u_knode __rcu **ins;
	struct tc_u_knode *pins;
	struct tc_u_hnode *ht;

	if (TC_U32_HTID(n->handle) == TC_U32_ROOT)
		ht = rtnl_dereference(tp->root);
	else
		ht = u32_lookup_ht(tp_c, TC_U32_HTID(n->handle));

	ins = &ht->ht[TC_U32_HASH(n->handle)];

	/* The node must always exist for it to be replaced if this is not the
	 * case then something went very wrong elsewhere.
	 */
	for (pins = rtnl_dereference(*ins); ;
	     ins = &pins->next, pins = rtnl_dereference(*ins))
		if (pins->handle == n->handle)
			break;

	RCU_INIT_POINTER(n->next, pins->next);
	rcu_assign_pointer(*ins, n);
}

static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
					 struct tc_u_knode *n)
{
	struct tc_u_knode *new;
	struct tc_u32_sel *s = &n->sel;

	new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key),
		      GFP_KERNEL);

	if (!new)
		return NULL;

	RCU_INIT_POINTER(new->next, n->next);
	new->handle = n->handle;
	RCU_INIT_POINTER(new->ht_up, n->ht_up);

#ifdef CONFIG_NET_CLS_IND
	new->ifindex = n->ifindex;
#endif
	new->fshift = n->fshift;
	new->res = n->res;
	RCU_INIT_POINTER(new->ht_down, n->ht_down);

	/* bump reference count as long as we hold pointer to structure */
	if (new->ht_down)
		new->ht_down->refcnt++;

#ifdef CONFIG_CLS_U32_PERF
	/* Statistics may be incremented by readers during update
	 * so we must keep them in tact. When the node is later destroyed
	 * a special destroy call must be made to not free the pf memory.
	 */
	new->pf = n->pf;
#endif

#ifdef CONFIG_CLS_U32_MARK
	new->val = n->val;
	new->mask = n->mask;
	/* Similarly success statistics must be moved as pointers */
	new->pcpu_success = n->pcpu_success;
#endif
	new->tp = tp;
	memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));

	tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE);

	return new;
}

686
/* Create or modify a u32 object from the netlink attributes in @tca.
 *
 * Three cases are handled, in this order:
 *  - *arg names an existing key node: a copy is made with
 *    u32_init_knode(), updated through u32_set_parms() and swapped in
 *    for the old node, which is freed after an RCU grace period;
 *  - TCA_U32_DIVISOR is present: a new hash table with that many
 *    buckets is allocated and linked, and returned through *arg;
 *  - otherwise a new key node is built from TCA_U32_SEL, inserted into
 *    its bucket ordered by node id, and returned through *arg.
 *
 * Returns 0 on success or a negative errno.
 */
static int u32_change(struct net *net, struct sk_buff *in_skb,
		      struct tcf_proto *tp, unsigned long base, u32 handle,
		      struct nlattr **tca,
		      unsigned long *arg, bool ovr)
{
	struct tc_u_common *tp_c = tp->data;
	struct tc_u_hnode *ht;
	struct tc_u_knode *n;
	struct tc_u32_sel *s;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_U32_MAX + 1];
	size_t sel_size;
	u32 htid;
	int err;
#ifdef CONFIG_CLS_U32_PERF
	size_t size;
#endif

	if (opt == NULL)
		return handle ? -EINVAL : 0;

	err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy);
	if (err < 0)
		return err;

	n = (struct tc_u_knode *)*arg;
	if (n) {
		struct tc_u_knode *new;

		/* only key nodes can be changed, not hash tables */
		if (TC_U32_KEY(n->handle) == 0)
			return -EINVAL;

		new = u32_init_knode(tp, n);
		if (!new)
			return -ENOMEM;

		err = u32_set_parms(net, tp, base,
				    rtnl_dereference(n->ht_up), new, tb,
				    tca[TCA_RATE], ovr);

		if (err) {
			/* statistics are still owned by the old node */
			u32_destroy_key(tp, new, false);
			return err;
		}

		u32_replace_knode(tp, tp_c, new);
		tcf_unbind_filter(tp, &n->res);
		call_rcu(&n->rcu, u32_delete_key_rcu);
		return 0;
	}

	if (tb[TCA_U32_DIVISOR]) {
		unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);

		/* stored as "buckets - 1"; a request of 0 wraps and is rejected */
		if (--divisor > 0x100)
			return -EINVAL;
		if (TC_U32_KEY(handle))
			return -EINVAL;
		if (handle == 0) {
			handle = gen_new_htid(tp->data);
			if (handle == 0)
				return -ENOMEM;
		}
		ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
		if (ht == NULL)
			return -ENOBUFS;
		ht->tp_c = tp_c;
		ht->refcnt = 1;
		ht->divisor = divisor;
		ht->handle = handle;
		ht->prio = tp->prio;
		RCU_INIT_POINTER(ht->next, tp_c->hlist);
		rcu_assign_pointer(tp_c->hlist, ht);
		*arg = (unsigned long)ht;
		return 0;
	}

	if (tb[TCA_U32_HASH]) {
		htid = nla_get_u32(tb[TCA_U32_HASH]);
		if (TC_U32_HTID(htid) == TC_U32_ROOT) {
			ht = rtnl_dereference(tp->root);
			htid = ht->handle;
		} else {
			ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
			if (ht == NULL)
				return -EINVAL;
		}
	} else {
		ht = rtnl_dereference(tp->root);
		htid = ht->handle;
	}

	if (ht->divisor < TC_U32_HASH(htid))
		return -EINVAL;

	if (handle) {
		if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
			return -EINVAL;
		handle = htid | TC_U32_NODE(handle);
	} else
		handle = gen_new_kid(ht, htid);

	if (tb[TCA_U32_SEL] == NULL)
		return -EINVAL;

	s = nla_data(tb[TCA_U32_SEL]);

	/* The policy only guarantees the fixed part of the selector.
	 * nkeys is supplied by the sender, so make sure the attribute
	 * really contains that many keys before copying them below.
	 */
	sel_size = sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key);
	if (nla_len(tb[TCA_U32_SEL]) < sel_size)
		return -EINVAL;

	n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
	if (n == NULL)
		return -ENOBUFS;

#ifdef CONFIG_CLS_U32_PERF
	size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64);
	n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt));
	if (!n->pf) {
		kfree(n);
		return -ENOBUFS;
	}
#endif

	memcpy(&n->sel, s, sel_size);
	RCU_INIT_POINTER(n->ht_up, ht);
	n->handle = handle;
	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
	tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
	n->tp = tp;

#ifdef CONFIG_CLS_U32_MARK
	n->pcpu_success = alloc_percpu(u32);
	if (!n->pcpu_success) {
		err = -ENOMEM;
		goto errout;
	}

	if (tb[TCA_U32_MARK]) {
		struct tc_u32_mark *mark;

		mark = nla_data(tb[TCA_U32_MARK]);
		n->val = mark->val;
		n->mask = mark->mask;
	}
#endif

	err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr);
	if (err == 0) {
		struct tc_u_knode __rcu **ins;
		struct tc_u_knode *pins;

		/* keep the chain sorted by node id */
		ins = &ht->ht[TC_U32_HASH(handle)];
		for (pins = rtnl_dereference(*ins); pins;
		     ins = &pins->next, pins = rtnl_dereference(*ins))
			if (TC_U32_NODE(handle) < TC_U32_NODE(pins->handle))
				break;

		RCU_INIT_POINTER(n->next, pins);
		rcu_assign_pointer(*ins, n);

		*arg = (unsigned long)n;
		return 0;
	}

#ifdef CONFIG_CLS_U32_MARK
	free_percpu(n->pcpu_success);
errout:
#endif

#ifdef CONFIG_CLS_U32_PERF
	free_percpu(n->pf);
#endif
	kfree(n);
	return err;
}

static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
	struct tc_u_common *tp_c = tp->data;
	struct tc_u_hnode *ht;
	struct tc_u_knode *n;
E
Eric Dumazet 已提交
863
	unsigned int h;
L
Linus Torvalds 已提交
864 865 866 867

	if (arg->stop)
		return;

868 869 870
	for (ht = rtnl_dereference(tp_c->hlist);
	     ht;
	     ht = rtnl_dereference(ht->next)) {
L
Linus Torvalds 已提交
871 872 873 874 875 876 877 878 879 880
		if (ht->prio != tp->prio)
			continue;
		if (arg->count >= arg->skip) {
			if (arg->fn(tp, (unsigned long)ht, arg) < 0) {
				arg->stop = 1;
				return;
			}
		}
		arg->count++;
		for (h = 0; h <= ht->divisor; h++) {
881 882 883
			for (n = rtnl_dereference(ht->ht[h]);
			     n;
			     n = rtnl_dereference(n->next)) {
L
Linus Torvalds 已提交
884 885 886 887 888 889 890 891 892 893 894 895 896 897
				if (arg->count < arg->skip) {
					arg->count++;
					continue;
				}
				if (arg->fn(tp, (unsigned long)n, arg) < 0) {
					arg->stop = 1;
					return;
				}
				arg->count++;
			}
		}
	}
}

898
static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
L
Linus Torvalds 已提交
899 900
		     struct sk_buff *skb, struct tcmsg *t)
{
E
Eric Dumazet 已提交
901
	struct tc_u_knode *n = (struct tc_u_knode *)fh;
902
	struct tc_u_hnode *ht_up, *ht_down;
903
	struct nlattr *nest;
L
Linus Torvalds 已提交
904 905 906 907 908 909

	if (n == NULL)
		return skb->len;

	t->tcm_handle = n->handle;

910 911 912
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
L
Linus Torvalds 已提交
913 914

	if (TC_U32_KEY(n->handle) == 0) {
E
Eric Dumazet 已提交
915 916 917
		struct tc_u_hnode *ht = (struct tc_u_hnode *)fh;
		u32 divisor = ht->divisor + 1;

918 919
		if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor))
			goto nla_put_failure;
L
Linus Torvalds 已提交
920
	} else {
921 922 923
#ifdef CONFIG_CLS_U32_PERF
		struct tc_u32_pcnt *gpf;
		int cpu;
924
#endif
925

926 927 928 929
		if (nla_put(skb, TCA_U32_SEL,
			    sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
			    &n->sel))
			goto nla_put_failure;
930 931 932

		ht_up = rtnl_dereference(n->ht_up);
		if (ht_up) {
L
Linus Torvalds 已提交
933
			u32 htid = n->handle & 0xFFFFF000;
934 935
			if (nla_put_u32(skb, TCA_U32_HASH, htid))
				goto nla_put_failure;
L
Linus Torvalds 已提交
936
		}
937 938 939
		if (n->res.classid &&
		    nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid))
			goto nla_put_failure;
940 941 942 943

		ht_down = rtnl_dereference(n->ht_down);
		if (ht_down &&
		    nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
944
			goto nla_put_failure;
L
Linus Torvalds 已提交
945 946

#ifdef CONFIG_CLS_U32_MARK
947 948 949 950
		if ((n->val || n->mask)) {
			struct tc_u32_mark mark = {.val = n->val,
						   .mask = n->mask,
						   .success = 0};
951
			int cpum;
952

953 954
			for_each_possible_cpu(cpum) {
				__u32 cnt = *per_cpu_ptr(n->pcpu_success, cpum);
955 956 957 958 959 960 961

				mark.success += cnt;
			}

			if (nla_put(skb, TCA_U32_MARK, sizeof(mark), &mark))
				goto nla_put_failure;
		}
L
Linus Torvalds 已提交
962 963
#endif

964
		if (tcf_exts_dump(skb, &n->exts) < 0)
965
			goto nla_put_failure;
L
Linus Torvalds 已提交
966 967

#ifdef CONFIG_NET_CLS_IND
968 969 970 971 972 973
		if (n->ifindex) {
			struct net_device *dev;
			dev = __dev_get_by_index(net, n->ifindex);
			if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
				goto nla_put_failure;
		}
L
Linus Torvalds 已提交
974 975
#endif
#ifdef CONFIG_CLS_U32_PERF
976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991
		gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
			      n->sel.nkeys * sizeof(u64),
			      GFP_KERNEL);
		if (!gpf)
			goto nla_put_failure;

		for_each_possible_cpu(cpu) {
			int i;
			struct tc_u32_pcnt *pf = per_cpu_ptr(n->pf, cpu);

			gpf->rcnt += pf->rcnt;
			gpf->rhit += pf->rhit;
			for (i = 0; i < n->sel.nkeys; i++)
				gpf->kcnts[i] += pf->kcnts[i];
		}

992 993
		if (nla_put(skb, TCA_U32_PCNT,
			    sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
994 995
			    gpf)) {
			kfree(gpf);
996
			goto nla_put_failure;
997 998
		}
		kfree(gpf);
L
Linus Torvalds 已提交
999 1000 1001
#endif
	}

1002 1003
	nla_nest_end(skb, nest);

L
Linus Torvalds 已提交
1004
	if (TC_U32_KEY(n->handle))
1005
		if (tcf_exts_dump_stats(skb, &n->exts) < 0)
1006
			goto nla_put_failure;
L
Linus Torvalds 已提交
1007 1008
	return skb->len;

1009
nla_put_failure:
1010
	nla_nest_cancel(skb, nest);
L
Linus Torvalds 已提交
1011 1012 1013
	return -1;
}

1014
/* Operations table registered under the classifier kind "u32". */
static struct tcf_proto_ops cls_u32_ops __read_mostly = {
	.kind		=	"u32",
	.owner		=	THIS_MODULE,
	/* lifetime */
	.init		=	u32_init,
	.destroy	=	u32_destroy,
	/* packet path */
	.classify	=	u32_classify,
	/* configuration */
	.get		=	u32_get,
	.change		=	u32_change,
	.delete		=	u32_delete,
	.walk		=	u32_walk,
	.dump		=	u32_dump,
};

/* Module entry point: log which optional features were compiled in and
 * register the classifier operations. Returns the result of
 * register_tcf_proto_ops().
 */
static int __init init_u32(void)
{
	int ret;

	pr_info("u32 classifier\n");
#ifdef CONFIG_CLS_U32_PERF
	pr_info("    Performance counters on\n");
#endif
#ifdef CONFIG_NET_CLS_IND
	pr_info("    input device check on\n");
#endif
#ifdef CONFIG_NET_CLS_ACT
	pr_info("    Actions configured\n");
#endif

	ret = register_tcf_proto_ops(&cls_u32_ops);
	return ret;
}

1042
/* Module exit point: unregister the classifier operations. */
static void __exit exit_u32(void)
{
	unregister_tcf_proto_ops(&cls_u32_ops);
}

module_init(init_u32)
module_exit(exit_u32)
MODULE_LICENSE("GPL");