ematch.c 14.5 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
/*
 * net/sched/ematch.c		Extended Match API
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Thomas Graf <tgraf@suug.ch>
 *
 * ==========================================================================
 *
 * An extended match (ematch) is a small classification tool not worth
 * writing a full classifier for. Ematches can be interconnected to form
 * a logic expression and get attached to classifiers to extend their
 * functionatlity.
 *
 * The userspace part transforms the logic expressions into an array
 * consisting of multiple sequences of interconnected ematches separated
 * by markers. Precedence is implemented by a special ematch kind
 * referencing a sequence beyond the marker of the current sequence
 * causing the current position in the sequence to be pushed onto a stack
 * to allow the current position to be overwritten by the position referenced
 * in the special ematch. Matching continues in the new sequence until a
 * marker is reached causing the position to be restored from the stack.
 *
 * Example:
 *          A AND (B1 OR B2) AND C AND D
 *
 *              ------->-PUSH-------
 *    -->--    /         -->--      \   -->--
 *   /     \  /         /     \      \ /     \
 * +-------+-------+-------+-------+-------+--------+
 * | A AND | B AND | C AND | D END | B1 OR | B2 END |
 * +-------+-------+-------+-------+-------+--------+
 *                    \                      /
 *                     --------<-POP---------
 *
 * where B is a virtual ematch referencing to sequence starting with B1.
40
 *
L
Linus Torvalds 已提交
41 42 43 44
 * ==========================================================================
 *
 * How to write an ematch in 60 seconds
 * ------------------------------------
45
 *
L
Linus Torvalds 已提交
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
 *   1) Provide a matcher function:
 *      static int my_match(struct sk_buff *skb, struct tcf_ematch *m,
 *                          struct tcf_pkt_info *info)
 *      {
 *      	struct mydata *d = (struct mydata *) m->data;
 *
 *      	if (...matching goes here...)
 *      		return 1;
 *      	else
 *      		return 0;
 *      }
 *
 *   2) Fill out a struct tcf_ematch_ops:
 *      static struct tcf_ematch_ops my_ops = {
 *      	.kind = unique id,
 *      	.datalen = sizeof(struct mydata),
 *      	.match = my_match,
 *      	.owner = THIS_MODULE,
 *      };
 *
 *   3) Register/Unregister your ematch:
 *      static int __init init_my_ematch(void)
 *      {
 *      	return tcf_em_register(&my_ops);
 *      }
 *
 *      static void __exit exit_my_ematch(void)
 *      {
 *      	return tcf_em_unregister(&my_ops);
 *      }
 *
 *      module_init(init_my_ematch);
 *      module_exit(exit_my_ematch);
 *
 *   4) By now you should have two more seconds left, barely enough to
 *      open up a beer to watch the compilation going.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <net/pkt_cls.h>

static LIST_HEAD(ematch_ops);
static DEFINE_RWLOCK(ematch_mod_lock);

static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind)
{
	struct tcf_ematch_ops *e = NULL;

	read_lock(&ematch_mod_lock);
	list_for_each_entry(e, &ematch_ops, link) {
		if (kind == e->kind) {
			if (!try_module_get(e->owner))
				e = NULL;
			read_unlock(&ematch_mod_lock);
			return e;
		}
	}
	read_unlock(&ematch_mod_lock);

	return NULL;
}

/**
 * tcf_em_register - register an extended match
115
 *
L
Linus Torvalds 已提交
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 * @ops: ematch operations lookup table
 *
 * This function must be called by ematches to announce their presence.
 * The given @ops must have kind set to a unique identifier and the
 * callback match() must be implemented. All other callbacks are optional
 * and a fallback implementation is used instead.
 *
 * Returns -EEXISTS if an ematch of the same kind has already registered.
 */
int tcf_em_register(struct tcf_ematch_ops *ops)
{
	int err = -EEXIST;
	struct tcf_ematch_ops *e;

	if (ops->match == NULL)
		return -EINVAL;

	write_lock(&ematch_mod_lock);
	list_for_each_entry(e, &ematch_ops, link)
		if (ops->kind == e->kind)
			goto errout;

	list_add_tail(&ops->link, &ematch_ops);
	err = 0;
errout:
	write_unlock(&ematch_mod_lock);
	return err;
}
144
EXPORT_SYMBOL(tcf_em_register);
L
Linus Torvalds 已提交
145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174

/**
 * tcf_em_unregister - unregster and extended match
 *
 * @ops: ematch operations lookup table
 *
 * This function must be called by ematches to announce their disappearance
 * for examples when the module gets unloaded. The @ops parameter must be
 * the same as the one used for registration.
 *
 * Returns -ENOENT if no matching ematch was found.
 */
int tcf_em_unregister(struct tcf_ematch_ops *ops)
{
	int err = 0;
	struct tcf_ematch_ops *e;

	write_lock(&ematch_mod_lock);
	list_for_each_entry(e, &ematch_ops, link) {
		if (e == ops) {
			list_del(&e->link);
			goto out;
		}
	}

	err = -ENOENT;
out:
	write_unlock(&ematch_mod_lock);
	return err;
}
175
EXPORT_SYMBOL(tcf_em_unregister);
L
Linus Torvalds 已提交
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212

static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree,
						   int index)
{
	return &tree->matches[index];
}


static int tcf_em_validate(struct tcf_proto *tp,
			   struct tcf_ematch_tree_hdr *tree_hdr,
			   struct tcf_ematch *em, struct rtattr *rta, int idx)
{
	int err = -EINVAL;
	struct tcf_ematch_hdr *em_hdr = RTA_DATA(rta);
	int data_len = RTA_PAYLOAD(rta) - sizeof(*em_hdr);
	void *data = (void *) em_hdr + sizeof(*em_hdr);

	if (!TCF_EM_REL_VALID(em_hdr->flags))
		goto errout;

	if (em_hdr->kind == TCF_EM_CONTAINER) {
		/* Special ematch called "container", carries an index
		 * referencing an external ematch sequence. */
		u32 ref;

		if (data_len < sizeof(ref))
			goto errout;
		ref = *(u32 *) data;

		if (ref >= tree_hdr->nmatches)
			goto errout;

		/* We do not allow backward jumps to avoid loops and jumps
		 * to our own position are of course illegal. */
		if (ref <= idx)
			goto errout;

213

L
Linus Torvalds 已提交
214 215 216 217 218 219 220 221 222 223 224 225 226
		em->data = ref;
	} else {
		/* Note: This lookup will increase the module refcnt
		 * of the ematch module referenced. In case of a failure,
		 * a destroy function is called by the underlying layer
		 * which automatically releases the reference again, therefore
		 * the module MUST not be given back under any circumstances
		 * here. Be aware, the destroy function assumes that the
		 * module is held if the ops field is non zero. */
		em->ops = tcf_em_lookup(em_hdr->kind);

		if (em->ops == NULL) {
			err = -ENOENT;
227 228 229 230 231 232 233 234 235 236 237 238 239
#ifdef CONFIG_KMOD
			__rtnl_unlock();
			request_module("ematch-kind-%u", em_hdr->kind);
			rtnl_lock();
			em->ops = tcf_em_lookup(em_hdr->kind);
			if (em->ops) {
				/* We dropped the RTNL mutex in order to
				 * perform the module load. Tell the caller
				 * to replay the request. */
				module_put(em->ops->owner);
				err = -EAGAIN;
			}
#endif
L
Linus Torvalds 已提交
240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265
			goto errout;
		}

		/* ematch module provides expected length of data, so we
		 * can do a basic sanity check. */
		if (em->ops->datalen && data_len < em->ops->datalen)
			goto errout;

		if (em->ops->change) {
			err = em->ops->change(tp, data, data_len, em);
			if (err < 0)
				goto errout;
		} else if (data_len > 0) {
			/* ematch module doesn't provide an own change
			 * procedure and expects us to allocate and copy
			 * the ematch data.
			 *
			 * TCF_EM_SIMPLE may be specified stating that the
			 * data only consists of a u32 integer and the module
			 * does not expected a memory reference but rather
			 * the value carried. */
			if (em_hdr->flags & TCF_EM_SIMPLE) {
				if (data_len < sizeof(u32))
					goto errout;
				em->data = *(u32 *) data;
			} else {
266
				void *v = kmemdup(data, data_len, GFP_KERNEL);
L
Linus Torvalds 已提交
267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309
				if (v == NULL) {
					err = -ENOBUFS;
					goto errout;
				}
				em->data = (unsigned long) v;
			}
		}
	}

	em->matchid = em_hdr->matchid;
	em->flags = em_hdr->flags;
	em->datalen = data_len;

	err = 0;
errout:
	return err;
}

/**
 * tcf_em_tree_validate - validate ematch config TLV and build ematch tree
 *
 * @tp: classifier kind handle
 * @rta: ematch tree configuration TLV
 * @tree: destination ematch tree variable to store the resulting
 *        ematch tree.
 *
 * This function validates the given configuration TLV @rta and builds an
 * ematch tree in @tree. The resulting tree must later be copied into
 * the private classifier data using tcf_em_tree_change(). You MUST NOT
 * provide the ematch tree variable of the private classifier data directly,
 * the changes would not be locked properly.
 *
 * Returns a negative error code if the configuration TLV contains errors.
 */
int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta,
			 struct tcf_ematch_tree *tree)
{
	int idx, list_len, matches_len, err = -EINVAL;
	struct rtattr *tb[TCA_EMATCH_TREE_MAX];
	struct rtattr *rt_match, *rt_hdr, *rt_list;
	struct tcf_ematch_tree_hdr *tree_hdr;
	struct tcf_ematch *em;

310 311 312 313 314
	if (!rta) {
		memset(tree, 0, sizeof(*tree));
		return 0;
	}

L
Linus Torvalds 已提交
315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334
	if (rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0)
		goto errout;

	rt_hdr = tb[TCA_EMATCH_TREE_HDR-1];
	rt_list = tb[TCA_EMATCH_TREE_LIST-1];

	if (rt_hdr == NULL || rt_list == NULL)
		goto errout;

	if (RTA_PAYLOAD(rt_hdr) < sizeof(*tree_hdr) ||
	    RTA_PAYLOAD(rt_list) < sizeof(*rt_match))
		goto errout;

	tree_hdr = RTA_DATA(rt_hdr);
	memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr));

	rt_match = RTA_DATA(rt_list);
	list_len = RTA_PAYLOAD(rt_list);
	matches_len = tree_hdr->nmatches * sizeof(*em);

335
	tree->matches = kzalloc(matches_len, GFP_KERNEL);
L
Linus Torvalds 已提交
336 337 338 339 340 341
	if (tree->matches == NULL)
		goto errout;

	/* We do not use rtattr_parse_nested here because the maximum
	 * number of attributes is unknown. This saves us the allocation
	 * for a tb buffer which would serve no purpose at all.
342
	 *
L
Linus Torvalds 已提交
343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384
	 * The array of rt attributes is parsed in the order as they are
	 * provided, their type must be incremental from 1 to n. Even
	 * if it does not serve any real purpose, a failure of sticking
	 * to this policy will result in parsing failure. */
	for (idx = 0; RTA_OK(rt_match, list_len); idx++) {
		err = -EINVAL;

		if (rt_match->rta_type != (idx + 1))
			goto errout_abort;

		if (idx >= tree_hdr->nmatches)
			goto errout_abort;

		if (RTA_PAYLOAD(rt_match) < sizeof(struct tcf_ematch_hdr))
			goto errout_abort;

		em = tcf_em_get_match(tree, idx);

		err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx);
		if (err < 0)
			goto errout_abort;

		rt_match = RTA_NEXT(rt_match, list_len);
	}

	/* Check if the number of matches provided by userspace actually
	 * complies with the array of matches. The number was used for
	 * the validation of references and a mismatch could lead to
	 * undefined references during the matching process. */
	if (idx != tree_hdr->nmatches) {
		err = -EINVAL;
		goto errout_abort;
	}

	err = 0;
errout:
	return err;

errout_abort:
	tcf_em_tree_destroy(tp, tree);
	return err;
}
385
EXPORT_SYMBOL(tcf_em_tree_validate);
L
Linus Torvalds 已提交
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414

/**
 * tcf_em_tree_destroy - destroy an ematch tree
 *
 * @tp: classifier kind handle
 * @tree: ematch tree to be deleted
 *
 * This functions destroys an ematch tree previously created by
 * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that
 * the ematch tree is not in use before calling this function.
 */
void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree)
{
	int i;

	if (tree->matches == NULL)
		return;

	for (i = 0; i < tree->hdr.nmatches; i++) {
		struct tcf_ematch *em = tcf_em_get_match(tree, i);

		if (em->ops) {
			if (em->ops->destroy)
				em->ops->destroy(tp, em);
			else if (!tcf_em_is_simple(em) && em->data)
				kfree((void *) em->data);
			module_put(em->ops->owner);
		}
	}
415

L
Linus Torvalds 已提交
416 417 418
	tree->hdr.nmatches = 0;
	kfree(tree->matches);
}
419
EXPORT_SYMBOL(tcf_em_tree_destroy);
L
Linus Torvalds 已提交
420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435

/**
 * tcf_em_tree_dump - dump ematch tree into a rtnl message
 *
 * @skb: skb holding the rtnl message
 * @t: ematch tree to be dumped
 * @tlv: TLV type to be used to encapsulate the tree
 *
 * This function dumps a ematch tree into a rtnl message. It is valid to
 * call this function while the ematch tree is in use.
 *
 * Returns -1 if the skb tailroom is insufficient.
 */
int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
{
	int i;
436 437 438
	u8 *tail;
	struct rtattr *top_start = (struct rtattr *)skb_tail_pointer(skb);
	struct rtattr *list_start;
L
Linus Torvalds 已提交
439 440 441 442

	RTA_PUT(skb, tlv, 0, NULL);
	RTA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr);

443
	list_start = (struct rtattr *)skb_tail_pointer(skb);
L
Linus Torvalds 已提交
444 445
	RTA_PUT(skb, TCA_EMATCH_TREE_LIST, 0, NULL);

446
	tail = skb_tail_pointer(skb);
L
Linus Torvalds 已提交
447
	for (i = 0; i < tree->hdr.nmatches; i++) {
448
		struct rtattr *match_start = (struct rtattr *)tail;
L
Linus Torvalds 已提交
449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466
		struct tcf_ematch *em = tcf_em_get_match(tree, i);
		struct tcf_ematch_hdr em_hdr = {
			.kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER,
			.matchid = em->matchid,
			.flags = em->flags
		};

		RTA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr);

		if (em->ops && em->ops->dump) {
			if (em->ops->dump(skb, em) < 0)
				goto rtattr_failure;
		} else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) {
			u32 u = em->data;
			RTA_PUT_NOHDR(skb, sizeof(u), &u);
		} else if (em->datalen > 0)
			RTA_PUT_NOHDR(skb, em->datalen, (void *) em->data);

467 468
		tail = skb_tail_pointer(skb);
		match_start->rta_len = tail - (u8 *)match_start;
L
Linus Torvalds 已提交
469 470
	}

471 472
	list_start->rta_len = tail - (u8 *)list_start;
	top_start->rta_len = tail - (u8 *)top_start;
L
Linus Torvalds 已提交
473 474 475 476 477 478

	return 0;

rtattr_failure:
	return -1;
}
479
EXPORT_SYMBOL(tcf_em_tree_dump);
L
Linus Torvalds 已提交
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537

static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,
			       struct tcf_pkt_info *info)
{
	int r = em->ops->match(skb, em, info);
	return tcf_em_is_inverted(em) ? !r : r;
}

/* Do not use this function directly, use tcf_em_tree_match instead */
int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree,
			struct tcf_pkt_info *info)
{
	int stackp = 0, match_idx = 0, res = 0;
	struct tcf_ematch *cur_match;
	int stack[CONFIG_NET_EMATCH_STACK];

proceed:
	while (match_idx < tree->hdr.nmatches) {
		cur_match = tcf_em_get_match(tree, match_idx);

		if (tcf_em_is_container(cur_match)) {
			if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK))
				goto stack_overflow;

			stack[stackp++] = match_idx;
			match_idx = cur_match->data;
			goto proceed;
		}

		res = tcf_em_match(skb, cur_match, info);

		if (tcf_em_early_end(cur_match, res))
			break;

		match_idx++;
	}

pop_stack:
	if (stackp > 0) {
		match_idx = stack[--stackp];
		cur_match = tcf_em_get_match(tree, match_idx);

		if (tcf_em_early_end(cur_match, res))
			goto pop_stack;
		else {
			match_idx++;
			goto proceed;
		}
	}

	return res;

stack_overflow:
	if (net_ratelimit())
		printk("Local stack overflow, increase NET_EMATCH_STACK\n");
	return -1;
}
EXPORT_SYMBOL(__tcf_em_tree_match);