/*
 * Berkeley Packet Filter based traffic classifier
 *
 * Might be used to classify traffic through flexible, user-defined and
 * possibly JIT-ed BPF filters for traffic control as an alternative to
 * ematches.
 *
 * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/filter.h>
19 20
#include <linux/bpf.h>

21 22 23 24 25 26 27 28
#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
#include <net/sock.h>

/* Module metadata: license, author and one-line description. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_DESCRIPTION("TC BPF based classifier");

29
/* Length bound applied to the TCA_BPF_NAME string attribute in bpf_policy. */
#define CLS_BPF_NAME_LEN	256

/*
 * Generic classifier flags accepted through TCA_BPF_FLAGS_GEN; any other
 * bit is rejected in cls_bpf_modify_existing().
 */
#define CLS_BPF_SUPPORTED_GEN_FLAGS		\
	(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)
32

33 34 35
struct cls_bpf_head {
	struct list_head plist;
	u32 hgen;
36
	struct rcu_head rcu;
37 38 39
};

struct cls_bpf_prog {
40
	struct bpf_prog *filter;
41
	struct list_head link;
42
	struct tcf_result res;
43
	bool exts_integrated;
44
	bool offloaded;
45
	u32 gen_flags;
46
	struct tcf_exts exts;
47
	u32 handle;
48
	u16 bpf_num_ops;
49 50
	struct sock_filter *bpf_ops;
	const char *bpf_name;
51 52
	struct tcf_proto *tp;
	struct rcu_head rcu;
53 54 55 56
};

static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
	[TCA_BPF_CLASSID]	= { .type = NLA_U32 },
57
	[TCA_BPF_FLAGS]		= { .type = NLA_U32 },
58
	[TCA_BPF_FLAGS_GEN]	= { .type = NLA_U32 },
59
	[TCA_BPF_FD]		= { .type = NLA_U32 },
60 61
	[TCA_BPF_NAME]		= { .type = NLA_NUL_STRING,
				    .len = CLS_BPF_NAME_LEN },
62 63 64 65 66
	[TCA_BPF_OPS_LEN]	= { .type = NLA_U16 },
	[TCA_BPF_OPS]		= { .type = NLA_BINARY,
				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
};

67 68 69 70 71 72
static int cls_bpf_exec_opcode(int code)
{
	switch (code) {
	case TC_ACT_OK:
	case TC_ACT_SHOT:
	case TC_ACT_STOLEN:
73
	case TC_ACT_TRAP:
74
	case TC_ACT_REDIRECT:
75 76 77 78 79 80 81
	case TC_ACT_UNSPEC:
		return code;
	default:
		return TC_ACT_UNSPEC;
	}
}

82 83 84
static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
			    struct tcf_result *res)
{
85
	struct cls_bpf_head *head = rcu_dereference_bh(tp->root);
86
	bool at_ingress = skb_at_tc_ingress(skb);
87
	struct cls_bpf_prog *prog;
88
	int ret = -1;
89

90 91
	/* Needed here for accessing maps. */
	rcu_read_lock();
92
	list_for_each_entry_rcu(prog, &head->plist, link) {
93 94
		int filter_res;

95 96
		qdisc_skb_cb(skb)->tc_classid = prog->res.classid;

97 98 99
		if (tc_skip_sw(prog->gen_flags)) {
			filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0;
		} else if (at_ingress) {
100 101
			/* It is safe to push/pull even if skb_shared() */
			__skb_push(skb, skb->mac_len);
102
			bpf_compute_data_end(skb);
103 104 105
			filter_res = BPF_PROG_RUN(prog->filter, skb);
			__skb_pull(skb, skb->mac_len);
		} else {
106
			bpf_compute_data_end(skb);
107 108
			filter_res = BPF_PROG_RUN(prog->filter, skb);
		}
109

110
		if (prog->exts_integrated) {
111 112 113
			res->class   = 0;
			res->classid = TC_H_MAJ(prog->res.classid) |
				       qdisc_skb_cb(skb)->tc_classid;
114 115 116 117 118 119 120

			ret = cls_bpf_exec_opcode(filter_res);
			if (ret == TC_ACT_UNSPEC)
				continue;
			break;
		}

121 122
		if (filter_res == 0)
			continue;
123 124
		if (filter_res != -1) {
			res->class   = 0;
125
			res->classid = filter_res;
126 127 128
		} else {
			*res = prog->res;
		}
129 130 131 132 133

		ret = tcf_exts_exec(skb, &prog->exts, res);
		if (ret < 0)
			continue;

134
		break;
135
	}
136
	rcu_read_unlock();
137

138
	return ret;
139 140
}

141 142 143 144 145
static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
{
	return !prog->bpf_ops;
}

146 147 148 149 150 151
static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
			       enum tc_clsbpf_command cmd)
{
	struct net_device *dev = tp->q->dev_queue->dev;
	struct tc_cls_bpf_offload bpf_offload = {};
	struct tc_to_netdev offload;
152
	int err;
153 154 155 156 157 158 159 160 161

	offload.type = TC_SETUP_CLSBPF;
	offload.cls_bpf = &bpf_offload;

	bpf_offload.command = cmd;
	bpf_offload.exts = &prog->exts;
	bpf_offload.prog = prog->filter;
	bpf_offload.name = prog->bpf_name;
	bpf_offload.exts_integrated = prog->exts_integrated;
162
	bpf_offload.gen_flags = prog->gen_flags;
163

164 165 166 167 168 169 170
	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
					    tp->protocol, &offload);

	if (!err && (cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE))
		prog->gen_flags |= TCA_CLS_FLAGS_IN_HW;

	return err;
171 172
}

173 174
static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
			   struct cls_bpf_prog *oldprog)
175 176 177 178
{
	struct net_device *dev = tp->q->dev_queue->dev;
	struct cls_bpf_prog *obj = prog;
	enum tc_clsbpf_command cmd;
179 180 181 182 183
	bool skip_sw;
	int ret;

	skip_sw = tc_skip_sw(prog->gen_flags) ||
		(oldprog && tc_skip_sw(oldprog->gen_flags));
184 185

	if (oldprog && oldprog->offloaded) {
186
		if (tc_should_offload(dev, tp, prog->gen_flags)) {
187
			cmd = TC_CLSBPF_REPLACE;
188
		} else if (!tc_skip_sw(prog->gen_flags)) {
189 190
			obj = oldprog;
			cmd = TC_CLSBPF_DESTROY;
191 192
		} else {
			return -EINVAL;
193 194
		}
	} else {
195
		if (!tc_should_offload(dev, tp, prog->gen_flags))
196
			return skip_sw ? -EINVAL : 0;
197 198 199
		cmd = TC_CLSBPF_ADD;
	}

200 201 202
	ret = cls_bpf_offload_cmd(tp, obj, cmd);
	if (ret)
		return skip_sw ? ret : 0;
203 204 205 206

	obj->offloaded = true;
	if (oldprog)
		oldprog->offloaded = false;
207 208

	return 0;
209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227
}

/*
 * Ask the driver to destroy the offloaded copy of @prog, if there is one.
 * On failure the error is logged and the prog stays marked as offloaded.
 */
static void cls_bpf_stop_offload(struct tcf_proto *tp,
				 struct cls_bpf_prog *prog)
{
	int err;

	if (!prog->offloaded)
		return;

	err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY);
	if (!err) {
		prog->offloaded = false;
		return;
	}

	pr_err("Stopping hardware offload failed: %d\n", err);
}

228 229 230 231 232 233 234 235 236
/* Issue a STATS command for @prog when it is offloaded; result is ignored. */
static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
					 struct cls_bpf_prog *prog)
{
	if (prog->offloaded)
		cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS);
}

237 238 239 240 241 242 243 244
static int cls_bpf_init(struct tcf_proto *tp)
{
	struct cls_bpf_head *head;

	head = kzalloc(sizeof(*head), GFP_KERNEL);
	if (head == NULL)
		return -ENOBUFS;

245 246
	INIT_LIST_HEAD_RCU(&head->plist);
	rcu_assign_pointer(tp->root, head);
247 248 249 250

	return 0;
}

251
static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
252
{
253
	tcf_exts_destroy(&prog->exts);
254

255 256 257 258
	if (cls_bpf_is_ebpf(prog))
		bpf_prog_put(prog->filter);
	else
		bpf_prog_destroy(prog->filter);
259

260
	kfree(prog->bpf_name);
261 262 263 264
	kfree(prog->bpf_ops);
	kfree(prog);
}

265
static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
266
{
267
	__cls_bpf_delete_prog(container_of(rcu, struct cls_bpf_prog, rcu));
268 269
}

270
static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
271
{
272
	cls_bpf_stop_offload(tp, prog);
273 274
	list_del_rcu(&prog->link);
	tcf_unbind_filter(tp, &prog->res);
275 276
	call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu);
}
277

278
/*
 * ->delete() hook: remove the prog referenced by @arg and tell the caller
 * through @last whether the program list is now empty.  Always returns 0.
 */
static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg, bool *last)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *victim = (struct cls_bpf_prog *) arg;

	__cls_bpf_delete(tp, victim);
	*last = list_empty(&head->plist);

	return 0;
}

287
static void cls_bpf_destroy(struct tcf_proto *tp)
288
{
289
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
290 291
	struct cls_bpf_prog *prog, *tmp;

292 293
	list_for_each_entry_safe(prog, tmp, &head->plist, link)
		__cls_bpf_delete(tp, prog);
294

295
	kfree_rcu(head, rcu);
296 297 298 299
}

/*
 * ->get() hook: look up the prog with the given @handle.  Returns the prog
 * pointer cast to unsigned long, or 0 when no prog matches.
 */
static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *prog;

	list_for_each_entry(prog, &head->plist, link) {
		if (prog->handle == handle)
			return (unsigned long) prog;
	}

	return 0UL;
}

314
static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog)
315
{
316
	struct sock_filter *bpf_ops;
317
	struct sock_fprog_kern fprog_tmp;
318
	struct bpf_prog *fp;
319
	u16 bpf_size, bpf_num_ops;
320 321
	int ret;

322
	bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
323 324
	if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0)
		return -EINVAL;
325

326
	bpf_size = bpf_num_ops * sizeof(*bpf_ops);
327 328
	if (bpf_size != nla_len(tb[TCA_BPF_OPS]))
		return -EINVAL;
329

330
	bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
331 332
	if (bpf_ops == NULL)
		return -ENOMEM;
333 334 335

	memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);

336 337
	fprog_tmp.len = bpf_num_ops;
	fprog_tmp.filter = bpf_ops;
338

339 340 341 342 343
	ret = bpf_prog_create(&fp, &fprog_tmp);
	if (ret < 0) {
		kfree(bpf_ops);
		return ret;
	}
344 345

	prog->bpf_ops = bpf_ops;
346 347
	prog->bpf_num_ops = bpf_num_ops;
	prog->bpf_name = NULL;
348 349
	prog->filter = fp;

350 351 352
	return 0;
}

353 354
static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
				 const struct tcf_proto *tp)
355 356 357 358 359 360 361
{
	struct bpf_prog *fp;
	char *name = NULL;
	u32 bpf_fd;

	bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);

362
	fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS);
363 364 365 366
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	if (tb[TCA_BPF_NAME]) {
367
		name = nla_memdup(tb[TCA_BPF_NAME], GFP_KERNEL);
368 369 370 371 372 373 374 375 376 377
		if (!name) {
			bpf_prog_put(fp);
			return -ENOMEM;
		}
	}

	prog->bpf_ops = NULL;
	prog->bpf_name = name;
	prog->filter = fp;

378
	if (fp->dst_needed && !(tp->q->flags & TCQ_F_INGRESS))
379 380
		netif_keep_dst(qdisc_dev(tp->q));

381 382 383 384 385 386 387 388
	return 0;
}

/*
 * Fill @prog from the parsed attributes in @tb: validate the actions and
 * flags, load the program (classic ops or eBPF fd, exactly one of them),
 * bind the class id and install the validated actions.
 *
 * Returns 0 on success or a negative errno; on failure the temporary
 * action set is destroyed.
 */
static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
				   struct cls_bpf_prog *prog,
				   unsigned long base, struct nlattr **tb,
				   struct nlattr *est, bool ovr)
{
	bool is_bpf, is_ebpf, have_exts = false;
	struct tcf_exts exts;
	u32 gen_flags = 0;
	int ret;

	is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
	is_ebpf = tb[TCA_BPF_FD];
	/* Exactly one program source must be supplied. */
	if (is_bpf == is_ebpf)
		return -EINVAL;

	ret = tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE);
	if (ret < 0)
		return ret;

	ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr);
	if (ret < 0)
		goto errout;

	if (tb[TCA_BPF_FLAGS]) {
		u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]);

		/* Only the direct-action flag is understood. */
		if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) {
			ret = -EINVAL;
			goto errout;
		}

		have_exts = !!(bpf_flags & TCA_BPF_FLAG_ACT_DIRECT);
	}

	if (tb[TCA_BPF_FLAGS_GEN]) {
		gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]);
		if ((gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS) ||
		    !tc_flags_valid(gen_flags)) {
			ret = -EINVAL;
			goto errout;
		}
	}

	prog->gen_flags = gen_flags;
	prog->exts_integrated = have_exts;

	if (is_bpf)
		ret = cls_bpf_prog_from_ops(tb, prog);
	else
		ret = cls_bpf_prog_from_efd(tb, prog, tp);
	if (ret < 0)
		goto errout;

	if (tb[TCA_BPF_CLASSID]) {
		prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
		tcf_bind_filter(tp, &prog->res, base);
	}

	tcf_exts_change(tp, &prog->exts, &exts);

	return 0;

errout:
	tcf_exts_destroy(&exts);
	return ret;
}

static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
				   struct cls_bpf_head *head)
{
	unsigned int i = 0x80000000;
450
	u32 handle;
451 452 453 454 455

	do {
		if (++head->hgen == 0x7FFFFFFF)
			head->hgen = 1;
	} while (--i > 0 && cls_bpf_get(tp, head->hgen));
456 457

	if (unlikely(i == 0)) {
458
		pr_err("Insufficient number of handles\n");
459 460 461 462
		handle = 0;
	} else {
		handle = head->hgen;
	}
463

464
	return handle;
465 466 467 468 469
}

/*
 * ->change() hook: create a new prog from the netlink request and either
 * add it to the list or swap it in for the existing prog passed in *@arg.
 *
 * @handle of 0 requests an automatically generated handle; a non-zero
 * @handle must match the existing prog's handle when one is replaced.
 * On success *@arg is updated to the new prog and 0 is returned;
 * otherwise a negative errno is returned and the list is left untouched.
 */
static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
			  struct tcf_proto *tp, unsigned long base,
			  u32 handle, struct nlattr **tca,
			  unsigned long *arg, bool ovr)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *oldprog = (struct cls_bpf_prog *) *arg;
	struct nlattr *tb[TCA_BPF_MAX + 1];
	struct cls_bpf_prog *prog;
	int ret;

	if (tca[TCA_OPTIONS] == NULL)
		return -EINVAL;

	ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy,
			       NULL);
	if (ret < 0)
		return ret;

	prog = kzalloc(sizeof(*prog), GFP_KERNEL);
	if (!prog)
		return -ENOBUFS;

	ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
	if (ret < 0)
		goto errout;

	if (oldprog) {
		if (handle && oldprog->handle != handle) {
			ret = -EINVAL;
			goto errout;
		}
	}

	if (handle == 0)
		prog->handle = cls_bpf_grab_new_handle(tp, head);
	else
		prog->handle = handle;
	if (prog->handle == 0) {
		ret = -EINVAL;
		goto errout;
	}

	ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE],
				      ovr);
	if (ret < 0)
		goto errout;

	ret = cls_bpf_offload(tp, prog, oldprog);
	if (ret) {
		/*
		 * cls_bpf_modify_existing() may have bound prog->res to a
		 * class; undo that before the prog is freed so the binding
		 * is not leaked.  This mirrors __cls_bpf_delete(), which
		 * also unbinds unconditionally.
		 */
		tcf_unbind_filter(tp, &prog->res);
		__cls_bpf_delete_prog(prog);
		return ret;
	}

	if (!tc_in_hw(prog->gen_flags))
		prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;

	if (oldprog) {
		list_replace_rcu(&oldprog->link, &prog->link);
		tcf_unbind_filter(tp, &oldprog->res);
		call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
	} else {
		list_add_rcu(&prog->link, &head->plist);
	}

	*arg = (unsigned long) prog;
	return 0;

errout:
	/* Only reached before a program was loaded into @prog. */
	tcf_exts_destroy(&prog->exts);
	kfree(prog);
	return ret;
}

541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561
/*
 * Emit TCA_BPF_OPS_LEN and TCA_BPF_OPS for a classic BPF prog.
 * Returns 0, or -EMSGSIZE when @skb has no room left.
 */
static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog,
				 struct sk_buff *skb)
{
	struct nlattr *ops_attr;

	if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops))
		return -EMSGSIZE;

	ops_attr = nla_reserve(skb, TCA_BPF_OPS,
			       prog->bpf_num_ops * sizeof(struct sock_filter));
	if (!ops_attr)
		return -EMSGSIZE;

	memcpy(nla_data(ops_attr), prog->bpf_ops, nla_len(ops_attr));

	return 0;
}

static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
				  struct sk_buff *skb)
{
562 563
	struct nlattr *nla;

564 565 566 567
	if (prog->bpf_name &&
	    nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name))
		return -EMSGSIZE;

568
	nla = nla_reserve(skb, TCA_BPF_TAG, sizeof(prog->filter->tag));
569 570 571
	if (nla == NULL)
		return -EMSGSIZE;

572
	memcpy(nla_data(nla), prog->filter->tag, nla_len(nla));
573

574 575 576
	return 0;
}

577
static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
578 579 580
			struct sk_buff *skb, struct tcmsg *tm)
{
	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh;
581
	struct nlattr *nest;
582
	u32 bpf_flags = 0;
583
	int ret;
584 585 586 587 588 589

	if (prog == NULL)
		return skb->len;

	tm->tcm_handle = prog->handle;

590 591
	cls_bpf_offload_update_stats(tp, prog);

592 593 594 595
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

596 597
	if (prog->res.classid &&
	    nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
598 599
		goto nla_put_failure;

600 601 602 603 604
	if (cls_bpf_is_ebpf(prog))
		ret = cls_bpf_dump_ebpf_info(prog, skb);
	else
		ret = cls_bpf_dump_bpf_info(prog, skb);
	if (ret)
605 606
		goto nla_put_failure;

607
	if (tcf_exts_dump(skb, &prog->exts) < 0)
608 609
		goto nla_put_failure;

610 611 612 613
	if (prog->exts_integrated)
		bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT;
	if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags))
		goto nla_put_failure;
614 615 616
	if (prog->gen_flags &&
	    nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags))
		goto nla_put_failure;
617

618 619
	nla_nest_end(skb, nest);

620
	if (tcf_exts_dump_stats(skb, &prog->exts) < 0)
621 622 623 624 625 626 627 628 629 630 631
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
632
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
633 634
	struct cls_bpf_prog *prog;

635
	list_for_each_entry(prog, &head->plist, link) {
636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671
		if (arg->count < arg->skip)
			goto skip;
		if (arg->fn(tp, (unsigned long) prog, arg) < 0) {
			arg->stop = 1;
			break;
		}
skip:
		arg->count++;
	}
}

/* Classifier operations registered under the kind name "bpf". */
static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
	.kind		= "bpf",
	.owner		= THIS_MODULE,

	/* lifetime */
	.init		= cls_bpf_init,
	.destroy	= cls_bpf_destroy,

	/* datapath */
	.classify	= cls_bpf_classify,

	/* control path */
	.get		= cls_bpf_get,
	.change		= cls_bpf_change,
	.delete		= cls_bpf_delete,
	.walk		= cls_bpf_walk,
	.dump		= cls_bpf_dump,
};

/* Module entry point: register the classifier operations. */
static int __init cls_bpf_init_mod(void)
{
	int err = register_tcf_proto_ops(&cls_bpf_ops);

	return err;
}

/* Module exit point: unregister the classifier operations. */
static void __exit cls_bpf_exit_mod(void)
{
	/* Return value intentionally unused, as the exit hook is void. */
	unregister_tcf_proto_ops(&cls_bpf_ops);
}

/* Hook the registration and unregistration functions into module load/unload. */
module_init(cls_bpf_init_mod);
module_exit(cls_bpf_exit_mod);