cls_u32.c 23.3 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
/*
 * net/sched/cls_u32.c	Ugly (or Universal) 32bit key Packet Classifier.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *	The filters are packed to hash tables of key nodes
 *	with a set of 32bit key/mask pairs at every node.
 *	Nodes reference next level hash tables etc.
 *
 *	This scheme is the best universal classifier I managed to
 *	invent; it is not super-fast, but it is not slow (provided you
 *	program it correctly), and general enough.  And its relative
 *	speed grows as the number of rules becomes larger.
 *
 *	It seems that it represents the best middle point between
 *	speed and manageability both by human and by machine.
 *
 *	It is especially useful for link sharing combined with QoS;
 *	pure RSVP doesn't need such a general approach and can use
 *	much simpler (and faster) schemes, sort of cls_rsvp.c.
 *
 *	JHS: We should remove the CONFIG_NET_CLS_IND from here
 *	eventually when the meta match extension is made available
 *
 *	nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
 */

#include <linux/module.h>
34
#include <linux/slab.h>
L
Linus Torvalds 已提交
35 36 37 38
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
39
#include <linux/percpu.h>
L
Linus Torvalds 已提交
40 41
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
42
#include <linux/bitmap.h>
43
#include <net/netlink.h>
L
Linus Torvalds 已提交
44 45 46
#include <net/act_api.h>
#include <net/pkt_cls.h>

E
Eric Dumazet 已提交
47
struct tc_u_knode {
48
	struct tc_u_knode __rcu	*next;
L
Linus Torvalds 已提交
49
	u32			handle;
50
	struct tc_u_hnode __rcu	*ht_up;
L
Linus Torvalds 已提交
51 52
	struct tcf_exts		exts;
#ifdef CONFIG_NET_CLS_IND
53
	int			ifindex;
L
Linus Torvalds 已提交
54 55 56
#endif
	u8			fshift;
	struct tcf_result	res;
57
	struct tc_u_hnode __rcu	*ht_down;
L
Linus Torvalds 已提交
58
#ifdef CONFIG_CLS_U32_PERF
59
	struct tc_u32_pcnt __percpu *pf;
L
Linus Torvalds 已提交
60 61
#endif
#ifdef CONFIG_CLS_U32_MARK
62 63 64
	u32			val;
	u32			mask;
	u32 __percpu		*pcpu_success;
L
Linus Torvalds 已提交
65
#endif
66 67
	struct tcf_proto	*tp;
	struct rcu_head		rcu;
68 69 70 71
	/* The 'sel' field MUST be the last field in structure to allow for
	 * tc_u32_keys allocated at end of structure.
	 */
	struct tc_u32_sel	sel;
L
Linus Torvalds 已提交
72 73
};

E
Eric Dumazet 已提交
74
struct tc_u_hnode {
75
	struct tc_u_hnode __rcu	*next;
L
Linus Torvalds 已提交
76 77 78 79
	u32			handle;
	u32			prio;
	struct tc_u_common	*tp_c;
	int			refcnt;
E
Eric Dumazet 已提交
80
	unsigned int		divisor;
81 82
	struct tc_u_knode __rcu	*ht[1];
	struct rcu_head		rcu;
L
Linus Torvalds 已提交
83 84
};

E
Eric Dumazet 已提交
85
struct tc_u_common {
86
	struct tc_u_hnode __rcu	*hlist;
L
Linus Torvalds 已提交
87 88 89
	struct Qdisc		*q;
	int			refcnt;
	u32			hgenerator;
90
	struct rcu_head		rcu;
L
Linus Torvalds 已提交
91 92
};

E
Eric Dumazet 已提交
93 94 95
/* Mask the network-order @key with the selector's hmask, convert the
 * result to host order and shift it right by @fshift.  The caller is
 * responsible for reducing the value to the table's divisor.
 */
static inline unsigned int u32_hash_fold(__be32 key,
					 const struct tc_u32_sel *sel,
					 u8 fshift)
{
	unsigned int h = ntohl(key & sel->hmask) >> fshift;

	return h;
}

102
static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res)
L
Linus Torvalds 已提交
103 104 105
{
	struct {
		struct tc_u_knode *knode;
106
		unsigned int	  off;
L
Linus Torvalds 已提交
107 108
	} stack[TC_U32_MAXDEPTH];

109
	struct tc_u_hnode *ht = rcu_dereference_bh(tp->root);
110
	unsigned int off = skb_network_offset(skb);
L
Linus Torvalds 已提交
111 112 113 114 115 116 117 118 119 120
	struct tc_u_knode *n;
	int sdepth = 0;
	int off2 = 0;
	int sel = 0;
#ifdef CONFIG_CLS_U32_PERF
	int j;
#endif
	int i, r;

next_ht:
121
	n = rcu_dereference_bh(ht->ht[sel]);
L
Linus Torvalds 已提交
122 123 124 125 126 127

next_knode:
	if (n) {
		struct tc_u32_key *key = n->sel.keys;

#ifdef CONFIG_CLS_U32_PERF
128
		__this_cpu_inc(n->pf->rcnt);
L
Linus Torvalds 已提交
129 130 131 132
		j = 0;
#endif

#ifdef CONFIG_CLS_U32_MARK
133
		if ((skb->mark & n->mask) != n->val) {
134
			n = rcu_dereference_bh(n->next);
L
Linus Torvalds 已提交
135 136
			goto next_knode;
		} else {
137
			__this_cpu_inc(*n->pcpu_success);
L
Linus Torvalds 已提交
138 139 140
		}
#endif

E
Eric Dumazet 已提交
141
		for (i = n->sel.nkeys; i > 0; i--, key++) {
S
stephen hemminger 已提交
142
			int toff = off + key->off + (off2 & key->offmask);
S
stephen hemminger 已提交
143
			__be32 *data, hdata;
144

D
Dan Carpenter 已提交
145
			if (skb_headroom(skb) + toff > INT_MAX)
S
stephen hemminger 已提交
146 147
				goto out;

S
stephen hemminger 已提交
148
			data = skb_header_pointer(skb, toff, 4, &hdata);
149 150 151
			if (!data)
				goto out;
			if ((*data ^ key->val) & key->mask) {
152
				n = rcu_dereference_bh(n->next);
L
Linus Torvalds 已提交
153 154 155
				goto next_knode;
			}
#ifdef CONFIG_CLS_U32_PERF
156
			__this_cpu_inc(n->pf->kcnts[j]);
L
Linus Torvalds 已提交
157 158 159
			j++;
#endif
		}
160 161 162

		ht = rcu_dereference_bh(n->ht_down);
		if (!ht) {
L
Linus Torvalds 已提交
163
check_terminal:
E
Eric Dumazet 已提交
164
			if (n->sel.flags & TC_U32_TERMINAL) {
L
Linus Torvalds 已提交
165 166 167

				*res = n->res;
#ifdef CONFIG_NET_CLS_IND
168
				if (!tcf_match_indev(skb, n->ifindex)) {
169
					n = rcu_dereference_bh(n->next);
L
Linus Torvalds 已提交
170 171 172 173
					goto next_knode;
				}
#endif
#ifdef CONFIG_CLS_U32_PERF
174
				__this_cpu_inc(n->pf->rhit);
L
Linus Torvalds 已提交
175 176 177
#endif
				r = tcf_exts_exec(skb, &n->exts, res);
				if (r < 0) {
178
					n = rcu_dereference_bh(n->next);
L
Linus Torvalds 已提交
179 180 181 182 183
					goto next_knode;
				}

				return r;
			}
184
			n = rcu_dereference_bh(n->next);
L
Linus Torvalds 已提交
185 186 187 188 189 190 191
			goto next_knode;
		}

		/* PUSH */
		if (sdepth >= TC_U32_MAXDEPTH)
			goto deadloop;
		stack[sdepth].knode = n;
192
		stack[sdepth].off = off;
L
Linus Torvalds 已提交
193 194
		sdepth++;

195
		ht = rcu_dereference_bh(n->ht_down);
L
Linus Torvalds 已提交
196
		sel = 0;
197
		if (ht->divisor) {
S
stephen hemminger 已提交
198
			__be32 *data, hdata;
199 200

			data = skb_header_pointer(skb, off + n->sel.hoff, 4,
S
stephen hemminger 已提交
201
						  &hdata);
202 203 204 205 206
			if (!data)
				goto out;
			sel = ht->divisor & u32_hash_fold(*data, &n->sel,
							  n->fshift);
		}
E
Eric Dumazet 已提交
207
		if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT)))
L
Linus Torvalds 已提交
208 209
			goto next_ht;

E
Eric Dumazet 已提交
210
		if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {
L
Linus Torvalds 已提交
211
			off2 = n->sel.off + 3;
212
			if (n->sel.flags & TC_U32_VAROFFSET) {
S
stephen hemminger 已提交
213
				__be16 *data, hdata;
214 215 216

				data = skb_header_pointer(skb,
							  off + n->sel.offoff,
S
stephen hemminger 已提交
217
							  2, &hdata);
218 219 220 221 222
				if (!data)
					goto out;
				off2 += ntohs(n->sel.offmask & *data) >>
					n->sel.offshift;
			}
L
Linus Torvalds 已提交
223 224
			off2 &= ~3;
		}
E
Eric Dumazet 已提交
225
		if (n->sel.flags & TC_U32_EAT) {
226
			off += off2;
L
Linus Torvalds 已提交
227 228 229
			off2 = 0;
		}

230
		if (off < skb->len)
L
Linus Torvalds 已提交
231 232 233 234 235 236
			goto next_ht;
	}

	/* POP */
	if (sdepth--) {
		n = stack[sdepth].knode;
237
		ht = rcu_dereference_bh(n->ht_up);
238
		off = stack[sdepth].off;
L
Linus Torvalds 已提交
239 240
		goto check_terminal;
	}
241
out:
L
Linus Torvalds 已提交
242 243 244
	return -1;

deadloop:
245
	net_warn_ratelimited("cls_u32: dead loop\n");
L
Linus Torvalds 已提交
246 247 248
	return -1;
}

E
Eric Dumazet 已提交
249
static struct tc_u_hnode *
L
Linus Torvalds 已提交
250 251 252 253
u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
{
	struct tc_u_hnode *ht;

254 255 256
	for (ht = rtnl_dereference(tp_c->hlist);
	     ht;
	     ht = rtnl_dereference(ht->next))
L
Linus Torvalds 已提交
257 258 259 260 261 262
		if (ht->handle == handle)
			break;

	return ht;
}

E
Eric Dumazet 已提交
263
static struct tc_u_knode *
L
Linus Torvalds 已提交
264 265
u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
{
E
Eric Dumazet 已提交
266
	unsigned int sel;
L
Linus Torvalds 已提交
267 268 269 270 271 272
	struct tc_u_knode *n = NULL;

	sel = TC_U32_HASH(handle);
	if (sel > ht->divisor)
		goto out;

273 274 275
	for (n = rtnl_dereference(ht->ht[sel]);
	     n;
	     n = rtnl_dereference(n->next))
L
Linus Torvalds 已提交
276 277 278 279 280 281 282 283 284 285 286 287 288
		if (n->handle == handle)
			break;
out:
	return n;
}


/* Resolve @handle to the object it names and return it as an opaque
 * cookie: the hash table itself when the key part of the handle is zero,
 * otherwise the key node inside that table.  Returns 0 if nothing is
 * found.
 */
static unsigned long u32_get(struct tcf_proto *tp, u32 handle)
{
	struct tc_u_hnode *ht;
	struct tc_u_common *tp_c = tp->data;

	if (TC_U32_HTID(handle) == TC_U32_ROOT)
		ht = rtnl_dereference(tp->root);
	else
		ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle));

	if (!ht)
		return 0;

	if (TC_U32_KEY(handle) == 0)
		return (unsigned long)ht;

	return (unsigned long)u32_lookup_key(ht, handle);
}

/* Counterpart of u32_get(); intentionally empty because u32_get() takes
 * no reference on the object it returns.
 */
static void u32_put(struct tcf_proto *tp, unsigned long f)
{
}

static u32 gen_new_htid(struct tc_u_common *tp_c)
{
	int i = 0x800;

310 311 312
	/* hgenerator only used inside rtnl lock it is safe to increment
	 * without read _copy_ update semantics
	 */
L
Linus Torvalds 已提交
313 314 315
	do {
		if (++tp_c->hgenerator == 0x7FF)
			tp_c->hgenerator = 1;
E
Eric Dumazet 已提交
316
	} while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
L
Linus Torvalds 已提交
317 318 319 320 321 322 323 324 325

	return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
}

static int u32_init(struct tcf_proto *tp)
{
	struct tc_u_hnode *root_ht;
	struct tc_u_common *tp_c;

326
	tp_c = tp->q->u32_node;
L
Linus Torvalds 已提交
327

328
	root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
L
Linus Torvalds 已提交
329 330 331 332 333 334 335 336 337
	if (root_ht == NULL)
		return -ENOBUFS;

	root_ht->divisor = 0;
	root_ht->refcnt++;
	root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
	root_ht->prio = tp->prio;

	if (tp_c == NULL) {
338
		tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL);
L
Linus Torvalds 已提交
339 340 341 342 343
		if (tp_c == NULL) {
			kfree(root_ht);
			return -ENOBUFS;
		}
		tp_c->q = tp->q;
344
		tp->q->u32_node = tp_c;
L
Linus Torvalds 已提交
345 346 347
	}

	tp_c->refcnt++;
348 349
	RCU_INIT_POINTER(root_ht->next, tp_c->hlist);
	rcu_assign_pointer(tp_c->hlist, root_ht);
L
Linus Torvalds 已提交
350 351
	root_ht->tp_c = tp_c;

352
	rcu_assign_pointer(tp->root, root_ht);
L
Linus Torvalds 已提交
353 354 355 356
	tp->data = tp_c;
	return 0;
}

357 358 359
/* Release everything owned by key node @n and free it: unbind the class,
 * destroy the extensions and drop the reference on the linked table.
 * The per-cpu statistics are freed only when @free_pf is true, because
 * copies made by u32_init_knode() share them with the original node.
 * Always returns 0.
 */
static int u32_destroy_key(struct tcf_proto *tp,
			   struct tc_u_knode *n,
			   bool free_pf)
{
	tcf_unbind_filter(tp, &n->res);
	tcf_exts_destroy(&n->exts);
	if (n->ht_down)
		n->ht_down->refcnt--;
#ifdef CONFIG_CLS_U32_PERF
	if (free_pf)
		free_percpu(n->pf);
#endif
#ifdef CONFIG_CLS_U32_MARK
	if (free_pf)
		free_percpu(n->pcpu_success);
#endif
	kfree(n);
	return 0;
}

377 378 379 380 381 382 383 384
/* u32_delete_key_rcu should be called when free'ing a copied
 * version of a tc_u_knode obtained from u32_init_knode(). When
 * copies are obtained from u32_init_knode() the statistics are
 * shared between the old and new copies to allow readers to
 * continue to update the statistics during the copy. To support
 * this the u32_delete_key_rcu variant does not free the percpu
 * statistics.
 */
385 386 387 388
static void u32_delete_key_rcu(struct rcu_head *rcu)
{
	struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);

389 390 391 392 393 394 395 396 397 398 399 400 401 402 403
	u32_destroy_key(key->tp, key, false);
}

/* u32_delete_key_freepf_rcu is the rcu callback variant
 * that free's the entire structure including the statistics
 * percpu variables. Only use this if the key is not a copy
 * returned by u32_init_knode(). See u32_delete_key_rcu()
 * for the variant that should be used with keys return from
 * u32_init_knode()
 */
/* RCU callback: destroy the key node embedding @rcu including its
 * per-cpu statistics.
 */
static void u32_delete_key_freepf_rcu(struct rcu_head *rcu)
{
	struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);

	u32_destroy_key(key->tp, key, true);
}

406
static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
L
Linus Torvalds 已提交
407
{
408 409
	struct tc_u_knode __rcu **kp;
	struct tc_u_knode *pkp;
410
	struct tc_u_hnode *ht = rtnl_dereference(key->ht_up);
L
Linus Torvalds 已提交
411 412

	if (ht) {
413 414 415 416 417
		kp = &ht->ht[TC_U32_HASH(key->handle)];
		for (pkp = rtnl_dereference(*kp); pkp;
		     kp = &pkp->next, pkp = rtnl_dereference(*kp)) {
			if (pkp == key) {
				RCU_INIT_POINTER(*kp, key->next);
L
Linus Torvalds 已提交
418

419
				call_rcu(&key->rcu, u32_delete_key_freepf_rcu);
L
Linus Torvalds 已提交
420 421 422 423
				return 0;
			}
		}
	}
424
	WARN_ON(1);
L
Linus Torvalds 已提交
425 426 427
	return 0;
}

428
static void u32_clear_hnode(struct tc_u_hnode *ht)
L
Linus Torvalds 已提交
429 430
{
	struct tc_u_knode *n;
E
Eric Dumazet 已提交
431
	unsigned int h;
L
Linus Torvalds 已提交
432

E
Eric Dumazet 已提交
433
	for (h = 0; h <= ht->divisor; h++) {
434 435 436
		while ((n = rtnl_dereference(ht->ht[h])) != NULL) {
			RCU_INIT_POINTER(ht->ht[h],
					 rtnl_dereference(n->next));
437
			call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
L
Linus Torvalds 已提交
438 439 440 441 442 443 444
		}
	}
}

static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
	struct tc_u_common *tp_c = tp->data;
445 446
	struct tc_u_hnode __rcu **hn;
	struct tc_u_hnode *phn;
L
Linus Torvalds 已提交
447

448
	WARN_ON(ht->refcnt);
L
Linus Torvalds 已提交
449

450
	u32_clear_hnode(ht);
L
Linus Torvalds 已提交
451

452 453 454 455 456 457 458
	hn = &tp_c->hlist;
	for (phn = rtnl_dereference(*hn);
	     phn;
	     hn = &phn->next, phn = rtnl_dereference(*hn)) {
		if (phn == ht) {
			RCU_INIT_POINTER(*hn, ht->next);
			kfree_rcu(ht, rcu);
L
Linus Torvalds 已提交
459 460 461 462 463 464 465 466 467 468
			return 0;
		}
	}

	return -ENOENT;
}

static void u32_destroy(struct tcf_proto *tp)
{
	struct tc_u_common *tp_c = tp->data;
469
	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
L
Linus Torvalds 已提交
470

471
	WARN_ON(root_ht == NULL);
L
Linus Torvalds 已提交
472 473 474 475 476 477 478

	if (root_ht && --root_ht->refcnt == 0)
		u32_destroy_hnode(tp, root_ht);

	if (--tp_c->refcnt == 0) {
		struct tc_u_hnode *ht;

479
		tp->q->u32_node = NULL;
L
Linus Torvalds 已提交
480

481 482 483
		for (ht = rtnl_dereference(tp_c->hlist);
		     ht;
		     ht = rtnl_dereference(ht->next)) {
484
			ht->refcnt--;
485
			u32_clear_hnode(ht);
486
		}
L
Linus Torvalds 已提交
487

488 489 490
		while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) {
			RCU_INIT_POINTER(tp_c->hlist, ht->next);
			kfree_rcu(ht, rcu);
491
		}
L
Linus Torvalds 已提交
492 493 494 495 496 497 498 499 500

		kfree(tp_c);
	}

	tp->data = NULL;
}

static int u32_delete(struct tcf_proto *tp, unsigned long arg)
{
E
Eric Dumazet 已提交
501
	struct tc_u_hnode *ht = (struct tc_u_hnode *)arg;
502
	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
L
Linus Torvalds 已提交
503 504 505 506 507

	if (ht == NULL)
		return 0;

	if (TC_U32_KEY(ht->handle))
E
Eric Dumazet 已提交
508
		return u32_delete_key(tp, (struct tc_u_knode *)ht);
L
Linus Torvalds 已提交
509

510
	if (root_ht == ht)
L
Linus Torvalds 已提交
511 512
		return -EINVAL;

513 514
	if (ht->refcnt == 1) {
		ht->refcnt--;
L
Linus Torvalds 已提交
515
		u32_destroy_hnode(tp, ht);
516 517 518
	} else {
		return -EBUSY;
	}
L
Linus Torvalds 已提交
519 520 521 522

	return 0;
}

523
#define NR_U32_NODE (1<<12)
L
Linus Torvalds 已提交
524 525 526
static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
{
	struct tc_u_knode *n;
527 528 529 530 531
	unsigned long i;
	unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long),
					GFP_KERNEL);
	if (!bitmap)
		return handle | 0xFFF;
L
Linus Torvalds 已提交
532

533 534 535
	for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]);
	     n;
	     n = rtnl_dereference(n->next))
536
		set_bit(TC_U32_NODE(n->handle), bitmap);
L
Linus Torvalds 已提交
537

538 539 540 541 542 543
	i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800);
	if (i >= NR_U32_NODE)
		i = find_next_zero_bit(bitmap, NR_U32_NODE, 1);

	kfree(bitmap);
	return handle | (i >= NR_U32_NODE ? 0xFFF : i);
L
Linus Torvalds 已提交
544 545
}

546 547 548 549 550 551 552 553 554 555
/* Netlink attribute policy handed to nla_parse_nested() in u32_change().
 * NOTE(review): for the entries without an explicit type, .len is
 * presumably a minimum payload length only; TCA_U32_SEL carries a
 * variable number of keys behind the fixed header - confirm against the
 * nla_policy documentation.
 */
static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
	[TCA_U32_CLASSID]	= { .type = NLA_U32 },
	[TCA_U32_HASH]		= { .type = NLA_U32 },
	[TCA_U32_LINK]		= { .type = NLA_U32 },
	[TCA_U32_DIVISOR]	= { .type = NLA_U32 },
	[TCA_U32_SEL]		= { .len = sizeof(struct tc_u32_sel) },
	[TCA_U32_INDEV]		= { .type = NLA_STRING, .len = IFNAMSIZ },
	[TCA_U32_MARK]		= { .len = sizeof(struct tc_u32_mark) },
};

556 557
/* Apply the optional attributes in @tb (link, classid, input device,
 * extensions) to key node @n, which lives in table @ht.
 *
 * Everything that can fail is validated before @n or any table is
 * modified, so on error the caller may simply free @n without having to
 * undo a refcount change or a class binding.
 *
 * Returns 0 on success or a negative errno.
 */
static int u32_set_parms(struct net *net, struct tcf_proto *tp,
			 unsigned long base, struct tc_u_hnode *ht,
			 struct tc_u_knode *n, struct nlattr **tb,
			 struct nlattr *est, bool ovr)
{
	int err;
	struct tcf_exts e;
#ifdef CONFIG_NET_CLS_IND
	int ifindex = 0;
#endif

	tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE);
	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
	if (err < 0)
		return err;

	err = -EINVAL;
#ifdef CONFIG_NET_CLS_IND
	/* Resolve the input device first: doing it after the link has been
	 * switched would leave ht_down->refcnt elevated on failure.
	 */
	if (tb[TCA_U32_INDEV]) {
		ifindex = tcf_change_indev(net, tb[TCA_U32_INDEV]);
		if (ifindex < 0)
			goto errout;
	}
#endif
	if (tb[TCA_U32_LINK]) {
		u32 handle = nla_get_u32(tb[TCA_U32_LINK]);
		struct tc_u_hnode *ht_down = NULL, *ht_old;

		/* A link must name a table, not a key node. */
		if (TC_U32_KEY(handle))
			goto errout;

		if (handle) {
			ht_down = u32_lookup_ht(ht->tp_c, handle);

			if (ht_down == NULL)
				goto errout;
			ht_down->refcnt++;
		}

		ht_old = rtnl_dereference(n->ht_down);
		rcu_assign_pointer(n->ht_down, ht_down);

		if (ht_old)
			ht_old->refcnt--;
	}
	if (tb[TCA_U32_CLASSID]) {
		n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]);
		tcf_bind_filter(tp, &n->res, base);
	}

#ifdef CONFIG_NET_CLS_IND
	if (tb[TCA_U32_INDEV])
		n->ifindex = ifindex;
#endif
	tcf_exts_change(tp, &n->exts, &e);

	return 0;
errout:
	tcf_exts_destroy(&e);
	return err;
}

613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688
static void u32_replace_knode(struct tcf_proto *tp,
			      struct tc_u_common *tp_c,
			      struct tc_u_knode *n)
{
	struct tc_u_knode __rcu **ins;
	struct tc_u_knode *pins;
	struct tc_u_hnode *ht;

	if (TC_U32_HTID(n->handle) == TC_U32_ROOT)
		ht = rtnl_dereference(tp->root);
	else
		ht = u32_lookup_ht(tp_c, TC_U32_HTID(n->handle));

	ins = &ht->ht[TC_U32_HASH(n->handle)];

	/* The node must always exist for it to be replaced if this is not the
	 * case then something went very wrong elsewhere.
	 */
	for (pins = rtnl_dereference(*ins); ;
	     ins = &pins->next, pins = rtnl_dereference(*ins))
		if (pins->handle == n->handle)
			break;

	RCU_INIT_POINTER(n->next, pins->next);
	rcu_assign_pointer(*ins, n);
}

/* Allocate a copy of key node @n for an in-place update.  The copy gets
 * the same handle, links, result and selector, shares the per-cpu
 * statistics with @n, and starts with freshly initialised extensions.
 * Returns NULL if the allocation fails.
 */
static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
					 struct tc_u_knode *n)
{
	struct tc_u32_sel *sel = &n->sel;
	struct tc_u_knode *copy;

	copy = kzalloc(sizeof(*n) + sel->nkeys * sizeof(struct tc_u32_key),
		       GFP_KERNEL);
	if (copy == NULL)
		return NULL;

	copy->tp = tp;
	copy->handle = n->handle;
	copy->fshift = n->fshift;
	copy->res = n->res;
#ifdef CONFIG_NET_CLS_IND
	copy->ifindex = n->ifindex;
#endif

	RCU_INIT_POINTER(copy->next, n->next);
	RCU_INIT_POINTER(copy->ht_up, n->ht_up);
	RCU_INIT_POINTER(copy->ht_down, n->ht_down);

	/* The copy holds its own reference on the linked table. */
	if (copy->ht_down)
		copy->ht_down->refcnt++;

#ifdef CONFIG_CLS_U32_PERF
	/* Readers may still bump the counters while the update is in
	 * progress, so the counter memory is shared rather than copied.
	 * The node that is destroyed later must not free it.
	 */
	copy->pf = n->pf;
#endif

#ifdef CONFIG_CLS_U32_MARK
	copy->val = n->val;
	copy->mask = n->mask;
	/* The success counter is shared by pointer for the same reason. */
	copy->pcpu_success = n->pcpu_success;
#endif

	memcpy(&copy->sel, sel,
	       sizeof(*sel) + sel->nkeys * sizeof(struct tc_u32_key));

	tcf_exts_init(&copy->exts, TCA_U32_ACT, TCA_U32_POLICE);

	return copy;
}

689
/* Create or update a u32 object from the netlink attributes in @tca.
 *
 * - If *@arg names an existing key node, a modified copy replaces it.
 * - If TCA_U32_DIVISOR is present, a new hash table is created.
 * - Otherwise a new key node is created in the table selected by
 *   TCA_U32_HASH (or the root table) and inserted in node-id order.
 *
 * On success *@arg is set to the new table or key node (it is left
 * untouched when an existing node is replaced) and 0 is returned;
 * otherwise a negative errno is returned.
 */
static int u32_change(struct net *net, struct sk_buff *in_skb,
		      struct tcf_proto *tp, unsigned long base, u32 handle,
		      struct nlattr **tca,
		      unsigned long *arg, bool ovr)
{
	struct tc_u_common *tp_c = tp->data;
	struct tc_u_hnode *ht;
	struct tc_u_knode *n;
	struct tc_u32_sel *s;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_U32_MAX + 1];
	size_t sel_size;
	u32 htid;
	int err;
#ifdef CONFIG_CLS_U32_PERF
	size_t size;
#endif

	if (opt == NULL)
		return handle ? -EINVAL : 0;

	err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy);
	if (err < 0)
		return err;

	n = (struct tc_u_knode *)*arg;
	if (n) {
		struct tc_u_knode *new;

		if (TC_U32_KEY(n->handle) == 0)
			return -EINVAL;

		new = u32_init_knode(tp, n);
		if (!new)
			return -ENOMEM;

		err = u32_set_parms(net, tp, base,
				    rtnl_dereference(n->ht_up), new, tb,
				    tca[TCA_RATE], ovr);

		if (err) {
			u32_destroy_key(tp, new, false);
			return err;
		}

		u32_replace_knode(tp, tp_c, new);
		/* The old node shares its statistics with the copy. */
		call_rcu(&n->rcu, u32_delete_key_rcu);
		return 0;
	}

	if (tb[TCA_U32_DIVISOR]) {
		unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);

		/* Stored as "buckets - 1"; a divisor of 0 wraps and is
		 * rejected here as well.
		 */
		if (--divisor > 0x100)
			return -EINVAL;
		if (TC_U32_KEY(handle))
			return -EINVAL;
		if (handle == 0) {
			handle = gen_new_htid(tp->data);
			if (handle == 0)
				return -ENOMEM;
		}
		ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
		if (ht == NULL)
			return -ENOBUFS;
		ht->tp_c = tp_c;
		ht->refcnt = 1;
		ht->divisor = divisor;
		ht->handle = handle;
		ht->prio = tp->prio;
		RCU_INIT_POINTER(ht->next, tp_c->hlist);
		rcu_assign_pointer(tp_c->hlist, ht);
		*arg = (unsigned long)ht;
		return 0;
	}

	if (tb[TCA_U32_HASH]) {
		htid = nla_get_u32(tb[TCA_U32_HASH]);
		if (TC_U32_HTID(htid) == TC_U32_ROOT) {
			ht = rtnl_dereference(tp->root);
			htid = ht->handle;
		} else {
			ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
			if (ht == NULL)
				return -EINVAL;
		}
	} else {
		ht = rtnl_dereference(tp->root);
		htid = ht->handle;
	}

	if (ht->divisor < TC_U32_HASH(htid))
		return -EINVAL;

	if (handle) {
		if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
			return -EINVAL;
		handle = htid | TC_U32_NODE(handle);
	} else
		handle = gen_new_kid(ht, htid);

	if (tb[TCA_U32_SEL] == NULL)
		return -EINVAL;

	s = nla_data(tb[TCA_U32_SEL]);

	/* nkeys is supplied by userspace; the policy only guarantees the
	 * fixed header, so make sure the attribute really carries that
	 * many keys before copying them.
	 */
	sel_size = sizeof(*s) + s->nkeys * sizeof(struct tc_u32_key);
	if (nla_len(tb[TCA_U32_SEL]) < sel_size)
		return -EINVAL;

	n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
	if (n == NULL)
		return -ENOBUFS;

#ifdef CONFIG_CLS_U32_PERF
	size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64);
	n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt));
	if (!n->pf) {
		kfree(n);
		return -ENOBUFS;
	}
#endif

	memcpy(&n->sel, s, sel_size);
	RCU_INIT_POINTER(n->ht_up, ht);
	n->handle = handle;
	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
	tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
	n->tp = tp;

#ifdef CONFIG_CLS_U32_MARK
	n->pcpu_success = alloc_percpu(u32);
	if (!n->pcpu_success) {
		err = -ENOMEM;
		goto errout;
	}

	if (tb[TCA_U32_MARK]) {
		struct tc_u32_mark *mark;

		mark = nla_data(tb[TCA_U32_MARK]);
		n->val = mark->val;
		n->mask = mark->mask;
	}
#endif

	err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr);
	if (err == 0) {
		struct tc_u_knode __rcu **ins;
		struct tc_u_knode *pins;

		/* Keep the bucket sorted by node id. */
		ins = &ht->ht[TC_U32_HASH(handle)];
		for (pins = rtnl_dereference(*ins); pins;
		     ins = &pins->next, pins = rtnl_dereference(*ins))
			if (TC_U32_NODE(handle) < TC_U32_NODE(pins->handle))
				break;

		RCU_INIT_POINTER(n->next, pins);
		rcu_assign_pointer(*ins, n);

		*arg = (unsigned long)n;
		return 0;
	}

#ifdef CONFIG_CLS_U32_MARK
	free_percpu(n->pcpu_success);
errout:
#endif

#ifdef CONFIG_CLS_U32_PERF
	free_percpu(n->pf);
#endif
	kfree(n);
	return err;
}

static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
	struct tc_u_common *tp_c = tp->data;
	struct tc_u_hnode *ht;
	struct tc_u_knode *n;
E
Eric Dumazet 已提交
865
	unsigned int h;
L
Linus Torvalds 已提交
866 867 868 869

	if (arg->stop)
		return;

870 871 872
	for (ht = rtnl_dereference(tp_c->hlist);
	     ht;
	     ht = rtnl_dereference(ht->next)) {
L
Linus Torvalds 已提交
873 874 875 876 877 878 879 880 881 882
		if (ht->prio != tp->prio)
			continue;
		if (arg->count >= arg->skip) {
			if (arg->fn(tp, (unsigned long)ht, arg) < 0) {
				arg->stop = 1;
				return;
			}
		}
		arg->count++;
		for (h = 0; h <= ht->divisor; h++) {
883 884 885
			for (n = rtnl_dereference(ht->ht[h]);
			     n;
			     n = rtnl_dereference(n->next)) {
L
Linus Torvalds 已提交
886 887 888 889 890 891 892 893 894 895 896 897 898 899
				if (arg->count < arg->skip) {
					arg->count++;
					continue;
				}
				if (arg->fn(tp, (unsigned long)n, arg) < 0) {
					arg->stop = 1;
					return;
				}
				arg->count++;
			}
		}
	}
}

900
static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
L
Linus Torvalds 已提交
901 902
		     struct sk_buff *skb, struct tcmsg *t)
{
E
Eric Dumazet 已提交
903
	struct tc_u_knode *n = (struct tc_u_knode *)fh;
904
	struct tc_u_hnode *ht_up, *ht_down;
905
	struct nlattr *nest;
L
Linus Torvalds 已提交
906 907 908 909 910 911

	if (n == NULL)
		return skb->len;

	t->tcm_handle = n->handle;

912 913 914
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
L
Linus Torvalds 已提交
915 916

	if (TC_U32_KEY(n->handle) == 0) {
E
Eric Dumazet 已提交
917 918 919
		struct tc_u_hnode *ht = (struct tc_u_hnode *)fh;
		u32 divisor = ht->divisor + 1;

920 921
		if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor))
			goto nla_put_failure;
L
Linus Torvalds 已提交
922
	} else {
923 924 925
#ifdef CONFIG_CLS_U32_PERF
		struct tc_u32_pcnt *gpf;
		int cpu;
926
#endif
927

928 929 930 931
		if (nla_put(skb, TCA_U32_SEL,
			    sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
			    &n->sel))
			goto nla_put_failure;
932 933 934

		ht_up = rtnl_dereference(n->ht_up);
		if (ht_up) {
L
Linus Torvalds 已提交
935
			u32 htid = n->handle & 0xFFFFF000;
936 937
			if (nla_put_u32(skb, TCA_U32_HASH, htid))
				goto nla_put_failure;
L
Linus Torvalds 已提交
938
		}
939 940 941
		if (n->res.classid &&
		    nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid))
			goto nla_put_failure;
942 943 944 945

		ht_down = rtnl_dereference(n->ht_down);
		if (ht_down &&
		    nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
946
			goto nla_put_failure;
L
Linus Torvalds 已提交
947 948

#ifdef CONFIG_CLS_U32_MARK
949 950 951 952
		if ((n->val || n->mask)) {
			struct tc_u32_mark mark = {.val = n->val,
						   .mask = n->mask,
						   .success = 0};
953
			int cpum;
954

955 956
			for_each_possible_cpu(cpum) {
				__u32 cnt = *per_cpu_ptr(n->pcpu_success, cpum);
957 958 959 960 961 962 963

				mark.success += cnt;
			}

			if (nla_put(skb, TCA_U32_MARK, sizeof(mark), &mark))
				goto nla_put_failure;
		}
L
Linus Torvalds 已提交
964 965
#endif

966
		if (tcf_exts_dump(skb, &n->exts) < 0)
967
			goto nla_put_failure;
L
Linus Torvalds 已提交
968 969

#ifdef CONFIG_NET_CLS_IND
970 971 972 973 974 975
		if (n->ifindex) {
			struct net_device *dev;
			dev = __dev_get_by_index(net, n->ifindex);
			if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
				goto nla_put_failure;
		}
L
Linus Torvalds 已提交
976 977
#endif
#ifdef CONFIG_CLS_U32_PERF
978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993
		gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
			      n->sel.nkeys * sizeof(u64),
			      GFP_KERNEL);
		if (!gpf)
			goto nla_put_failure;

		for_each_possible_cpu(cpu) {
			int i;
			struct tc_u32_pcnt *pf = per_cpu_ptr(n->pf, cpu);

			gpf->rcnt += pf->rcnt;
			gpf->rhit += pf->rhit;
			for (i = 0; i < n->sel.nkeys; i++)
				gpf->kcnts[i] += pf->kcnts[i];
		}

994 995
		if (nla_put(skb, TCA_U32_PCNT,
			    sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
996 997
			    gpf)) {
			kfree(gpf);
998
			goto nla_put_failure;
999 1000
		}
		kfree(gpf);
L
Linus Torvalds 已提交
1001 1002 1003
#endif
	}

1004 1005
	nla_nest_end(skb, nest);

L
Linus Torvalds 已提交
1006
	if (TC_U32_KEY(n->handle))
1007
		if (tcf_exts_dump_stats(skb, &n->exts) < 0)
1008
			goto nla_put_failure;
L
Linus Torvalds 已提交
1009 1010
	return skb->len;

1011
nla_put_failure:
1012
	nla_nest_cancel(skb, nest);
L
Linus Torvalds 已提交
1013 1014 1015
	return -1;
}

1016
/* Classifier operations registered under the kind "u32". */
static struct tcf_proto_ops cls_u32_ops __read_mostly = {
	.kind		=	"u32",
	.classify	=	u32_classify,
	.init		=	u32_init,
	.destroy	=	u32_destroy,
	.get		=	u32_get,
	.put		=	u32_put,
	.change		=	u32_change,
	.delete		=	u32_delete,
	.walk		=	u32_walk,
	.dump		=	u32_dump,
	.owner		=	THIS_MODULE,
};

/* Module init: log which optional features were compiled in, then
 * register the classifier.  Returns the result of the registration.
 */
static int __init init_u32(void)
{
	pr_info("u32 classifier\n");
#ifdef CONFIG_CLS_U32_PERF
	pr_info("    Performance counters on\n");
#endif
#ifdef CONFIG_NET_CLS_IND
	pr_info("    input device check on\n");
#endif
#ifdef CONFIG_NET_CLS_ACT
	pr_info("    Actions configured\n");
#endif
	return register_tcf_proto_ops(&cls_u32_ops);
}

1045
/* Module exit: unregister the classifier. */
static void __exit exit_u32(void)
{
	unregister_tcf_proto_ops(&cls_u32_ops);
}

module_init(init_u32)
module_exit(exit_u32)
MODULE_LICENSE("GPL");