cls_bpf.c 14.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Berkeley Packet Filter based traffic classifier
 *
 * Might be used to classify traffic through flexible, user-defined and
 * possibly JIT-ed BPF filters for traffic control as an alternative to
 * ematches.
 *
 * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/filter.h>
19 20
#include <linux/bpf.h>

21 22 23 24 25 26 27 28
#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
#include <net/sock.h>

/* Module metadata: license, author and one-line description. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_DESCRIPTION("TC BPF based classifier");

29
/* Upper bound used as .len for the TCA_BPF_NAME netlink attribute. */
#define CLS_BPF_NAME_LEN	256
/* Generic classifier flags accepted in TCA_BPF_FLAGS_GEN. */
#define CLS_BPF_SUPPORTED_GEN_FLAGS		\
	(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)
32

33 34 35
struct cls_bpf_head {
	struct list_head plist;
	u32 hgen;
J
John Fastabend 已提交
36
	struct rcu_head rcu;
37 38 39
};

struct cls_bpf_prog {
40
	struct bpf_prog *filter;
41
	struct list_head link;
42
	struct tcf_result res;
43
	bool exts_integrated;
44
	bool offloaded;
45
	u32 gen_flags;
46
	struct tcf_exts exts;
47
	u32 handle;
48
	u16 bpf_num_ops;
49 50
	struct sock_filter *bpf_ops;
	const char *bpf_name;
J
John Fastabend 已提交
51 52
	struct tcf_proto *tp;
	struct rcu_head rcu;
53 54 55 56
};

static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
	[TCA_BPF_CLASSID]	= { .type = NLA_U32 },
57
	[TCA_BPF_FLAGS]		= { .type = NLA_U32 },
58
	[TCA_BPF_FLAGS_GEN]	= { .type = NLA_U32 },
59
	[TCA_BPF_FD]		= { .type = NLA_U32 },
J
Jamal Hadi Salim 已提交
60 61
	[TCA_BPF_NAME]		= { .type = NLA_NUL_STRING,
				    .len = CLS_BPF_NAME_LEN },
62 63 64 65 66
	[TCA_BPF_OPS_LEN]	= { .type = NLA_U16 },
	[TCA_BPF_OPS]		= { .type = NLA_BINARY,
				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
};

67 68 69 70 71 72
static int cls_bpf_exec_opcode(int code)
{
	switch (code) {
	case TC_ACT_OK:
	case TC_ACT_SHOT:
	case TC_ACT_STOLEN:
73
	case TC_ACT_TRAP:
74
	case TC_ACT_REDIRECT:
75 76 77 78 79 80 81
	case TC_ACT_UNSPEC:
		return code;
	default:
		return TC_ACT_UNSPEC;
	}
}

82 83 84
static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
			    struct tcf_result *res)
{
85
	struct cls_bpf_head *head = rcu_dereference_bh(tp->root);
86
	bool at_ingress = skb_at_tc_ingress(skb);
87
	struct cls_bpf_prog *prog;
88
	int ret = -1;
89

90 91
	/* Needed here for accessing maps. */
	rcu_read_lock();
J
John Fastabend 已提交
92
	list_for_each_entry_rcu(prog, &head->plist, link) {
93 94
		int filter_res;

95 96
		qdisc_skb_cb(skb)->tc_classid = prog->res.classid;

97 98 99
		if (tc_skip_sw(prog->gen_flags)) {
			filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0;
		} else if (at_ingress) {
100 101
			/* It is safe to push/pull even if skb_shared() */
			__skb_push(skb, skb->mac_len);
102
			bpf_compute_data_end(skb);
103 104 105
			filter_res = BPF_PROG_RUN(prog->filter, skb);
			__skb_pull(skb, skb->mac_len);
		} else {
106
			bpf_compute_data_end(skb);
107 108
			filter_res = BPF_PROG_RUN(prog->filter, skb);
		}
109

110
		if (prog->exts_integrated) {
111 112 113
			res->class   = 0;
			res->classid = TC_H_MAJ(prog->res.classid) |
				       qdisc_skb_cb(skb)->tc_classid;
114 115 116 117 118 119 120

			ret = cls_bpf_exec_opcode(filter_res);
			if (ret == TC_ACT_UNSPEC)
				continue;
			break;
		}

121 122
		if (filter_res == 0)
			continue;
123 124
		if (filter_res != -1) {
			res->class   = 0;
125
			res->classid = filter_res;
126 127 128
		} else {
			*res = prog->res;
		}
129 130 131 132 133

		ret = tcf_exts_exec(skb, &prog->exts, res);
		if (ret < 0)
			continue;

134
		break;
135
	}
136
	rcu_read_unlock();
137

138
	return ret;
139 140
}

141 142 143 144 145
static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
{
	return !prog->bpf_ops;
}

146 147 148 149
static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
			       enum tc_clsbpf_command cmd)
{
	struct net_device *dev = tp->q->dev_queue->dev;
150
	struct tc_cls_bpf_offload cls_bpf = {};
151
	int err;
152

153 154 155 156 157 158 159
	tc_cls_common_offload_init(&cls_bpf.common, tp);
	cls_bpf.command = cmd;
	cls_bpf.exts = &prog->exts;
	cls_bpf.prog = prog->filter;
	cls_bpf.name = prog->bpf_name;
	cls_bpf.exts_integrated = prog->exts_integrated;
	cls_bpf.gen_flags = prog->gen_flags;
160

161
	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSBPF, &cls_bpf);
162 163 164 165
	if (!err && (cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE))
		prog->gen_flags |= TCA_CLS_FLAGS_IN_HW;

	return err;
166 167
}

168 169
static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
			   struct cls_bpf_prog *oldprog)
170 171 172 173
{
	struct net_device *dev = tp->q->dev_queue->dev;
	struct cls_bpf_prog *obj = prog;
	enum tc_clsbpf_command cmd;
174 175 176 177 178
	bool skip_sw;
	int ret;

	skip_sw = tc_skip_sw(prog->gen_flags) ||
		(oldprog && tc_skip_sw(oldprog->gen_flags));
179 180

	if (oldprog && oldprog->offloaded) {
181
		if (tc_should_offload(dev, tp, prog->gen_flags)) {
182
			cmd = TC_CLSBPF_REPLACE;
183
		} else if (!tc_skip_sw(prog->gen_flags)) {
184 185
			obj = oldprog;
			cmd = TC_CLSBPF_DESTROY;
186 187
		} else {
			return -EINVAL;
188 189
		}
	} else {
190
		if (!tc_should_offload(dev, tp, prog->gen_flags))
191
			return skip_sw ? -EINVAL : 0;
192 193 194
		cmd = TC_CLSBPF_ADD;
	}

195 196 197
	ret = cls_bpf_offload_cmd(tp, obj, cmd);
	if (ret)
		return skip_sw ? ret : 0;
198 199 200 201

	obj->offloaded = true;
	if (oldprog)
		oldprog->offloaded = false;
202 203

	return 0;
204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
}

/*
 * Issue TC_CLSBPF_DESTROY for @prog if it is marked offloaded.  A failure
 * is logged and leaves the offloaded flag set.
 */
static void cls_bpf_stop_offload(struct tcf_proto *tp,
				 struct cls_bpf_prog *prog)
{
	int rc;

	if (prog->offloaded) {
		rc = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY);
		if (rc)
			pr_err("Stopping hardware offload failed: %d\n", rc);
		else
			prog->offloaded = false;
	}
}

223 224 225 226 227 228 229 230 231
/* Issue TC_CLSBPF_STATS for @prog, but only while it is marked offloaded. */
static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
					 struct cls_bpf_prog *prog)
{
	if (prog->offloaded)
		cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS);
}

232 233 234 235 236 237 238 239
static int cls_bpf_init(struct tcf_proto *tp)
{
	struct cls_bpf_head *head;

	head = kzalloc(sizeof(*head), GFP_KERNEL);
	if (head == NULL)
		return -ENOBUFS;

J
John Fastabend 已提交
240 241
	INIT_LIST_HEAD_RCU(&head->plist);
	rcu_assign_pointer(tp->root, head);
242 243 244 245

	return 0;
}

246
static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
247
{
248
	tcf_exts_destroy(&prog->exts);
249

250 251 252 253
	if (cls_bpf_is_ebpf(prog))
		bpf_prog_put(prog->filter);
	else
		bpf_prog_destroy(prog->filter);
254

255
	kfree(prog->bpf_name);
256 257 258 259
	kfree(prog->bpf_ops);
	kfree(prog);
}

260
static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
J
John Fastabend 已提交
261
{
262
	__cls_bpf_delete_prog(container_of(rcu, struct cls_bpf_prog, rcu));
J
John Fastabend 已提交
263 264
}

265
static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
266
{
267
	cls_bpf_stop_offload(tp, prog);
268 269
	list_del_rcu(&prog->link);
	tcf_unbind_filter(tp, &prog->res);
270 271
	call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu);
}
272

273
/*
 * Delete the filter passed as @arg (a struct cls_bpf_prog pointer) and
 * set *@last to whether the program list is now empty.  Always returns 0.
 */
static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);

	__cls_bpf_delete(tp, arg);
	*last = list_empty(&head->plist);
	return 0;
}

282
static void cls_bpf_destroy(struct tcf_proto *tp)
283
{
J
John Fastabend 已提交
284
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
285 286
	struct cls_bpf_prog *prog, *tmp;

287 288
	list_for_each_entry_safe(prog, tmp, &head->plist, link)
		__cls_bpf_delete(tp, prog);
289

J
John Fastabend 已提交
290
	kfree_rcu(head, rcu);
291 292
}

293
/*
 * Look up the program whose handle equals @handle.
 * Returns the struct cls_bpf_prog pointer, or NULL if none matches.
 */
static void *cls_bpf_get(struct tcf_proto *tp, u32 handle)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *prog;

	list_for_each_entry(prog, &head->plist, link) {
		if (prog->handle == handle)
			return prog;
	}

	return NULL;
}

306
static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog)
307
{
J
John Fastabend 已提交
308
	struct sock_filter *bpf_ops;
309
	struct sock_fprog_kern fprog_tmp;
J
John Fastabend 已提交
310
	struct bpf_prog *fp;
311
	u16 bpf_size, bpf_num_ops;
312 313
	int ret;

314
	bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
315 316
	if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0)
		return -EINVAL;
317

318
	bpf_size = bpf_num_ops * sizeof(*bpf_ops);
319 320
	if (bpf_size != nla_len(tb[TCA_BPF_OPS]))
		return -EINVAL;
321

322
	bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
323 324
	if (bpf_ops == NULL)
		return -ENOMEM;
325 326 327

	memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);

328 329
	fprog_tmp.len = bpf_num_ops;
	fprog_tmp.filter = bpf_ops;
330

331 332 333 334 335
	ret = bpf_prog_create(&fp, &fprog_tmp);
	if (ret < 0) {
		kfree(bpf_ops);
		return ret;
	}
336 337

	prog->bpf_ops = bpf_ops;
338 339
	prog->bpf_num_ops = bpf_num_ops;
	prog->bpf_name = NULL;
340 341
	prog->filter = fp;

342 343 344
	return 0;
}

345 346
static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
				 const struct tcf_proto *tp)
347 348 349 350 351 352 353
{
	struct bpf_prog *fp;
	char *name = NULL;
	u32 bpf_fd;

	bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);

354
	fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS);
355 356 357 358
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	if (tb[TCA_BPF_NAME]) {
359
		name = nla_memdup(tb[TCA_BPF_NAME], GFP_KERNEL);
360 361 362 363 364 365 366 367 368 369
		if (!name) {
			bpf_prog_put(fp);
			return -ENOMEM;
		}
	}

	prog->bpf_ops = NULL;
	prog->bpf_name = name;
	prog->filter = fp;

D
Daniel Borkmann 已提交
370
	if (fp->dst_needed && !(tp->q->flags & TCQ_F_INGRESS))
371 372
		netif_keep_dst(qdisc_dev(tp->q));

373 374 375
	return 0;
}

376 377 378
/*
 * Fill @prog from the parsed netlink attributes in @tb.
 *
 * Exactly one program source must be given: classic BPF
 * (TCA_BPF_OPS_LEN + TCA_BPF_OPS) or eBPF (TCA_BPF_FD).  Returns -EINVAL
 * for a missing/ambiguous source or unsupported flag bits, otherwise the
 * first error from extension validation or program setup, or 0.
 */
static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
			     struct cls_bpf_prog *prog, unsigned long base,
			     struct nlattr **tb, struct nlattr *est, bool ovr)
{
	bool is_bpf, is_ebpf, have_exts = false;
	u32 gen_flags = 0;
	int ret;

	is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
	is_ebpf = tb[TCA_BPF_FD];
	if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf))
		return -EINVAL;

	ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr);
	if (ret < 0)
		return ret;

	if (tb[TCA_BPF_FLAGS]) {
		u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]);

		/* TCA_BPF_FLAG_ACT_DIRECT is the only flag accepted here. */
		if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT)
			return -EINVAL;

		have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
	}
	if (tb[TCA_BPF_FLAGS_GEN]) {
		gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]);
		if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS ||
		    !tc_flags_valid(gen_flags))
			return -EINVAL;
	}

	prog->exts_integrated = have_exts;
	prog->gen_flags = gen_flags;

	ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
		       cls_bpf_prog_from_efd(tb, prog, tp);
	if (ret < 0)
		return ret;

	if (tb[TCA_BPF_CLASSID]) {
		prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
		tcf_bind_filter(tp, &prog->res, base);
	}

	return 0;
}

static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
				   struct cls_bpf_head *head)
{
	unsigned int i = 0x80000000;
428
	u32 handle;
429 430 431 432 433

	do {
		if (++head->hgen == 0x7FFFFFFF)
			head->hgen = 1;
	} while (--i > 0 && cls_bpf_get(tp, head->hgen));
434 435

	if (unlikely(i == 0)) {
436
		pr_err("Insufficient number of handles\n");
437 438 439 440
		handle = 0;
	} else {
		handle = head->hgen;
	}
441

442
	return handle;
443 444 445 446 447
}

/*
 * Create a new filter, or replace the one passed in through *@arg.
 *
 * A fresh cls_bpf_prog is always allocated.  When *@arg is non-NULL the
 * new program takes the old one's place on the list and the old one is
 * freed through call_rcu(); otherwise the new program is added to the
 * list.  On success *@arg points to the new program and 0 is returned.
 */
static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
			  struct tcf_proto *tp, unsigned long base,
			  u32 handle, struct nlattr **tca,
			  void **arg, bool ovr)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *oldprog = *arg;
	struct nlattr *tb[TCA_BPF_MAX + 1];
	struct cls_bpf_prog *prog;
	int ret;

	if (tca[TCA_OPTIONS] == NULL)
		return -EINVAL;

	ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy,
			       NULL);
	if (ret < 0)
		return ret;

	prog = kzalloc(sizeof(*prog), GFP_KERNEL);
	if (!prog)
		return -ENOBUFS;

	ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
	if (ret < 0)
		goto errout;

	/* A replace request must not name a different handle. */
	if (oldprog) {
		if (handle && oldprog->handle != handle) {
			ret = -EINVAL;
			goto errout;
		}
	}

	if (handle == 0)
		prog->handle = cls_bpf_grab_new_handle(tp, head);
	else
		prog->handle = handle;
	if (prog->handle == 0) {
		ret = -EINVAL;
		goto errout;
	}

	ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
	if (ret < 0)
		goto errout;

	ret = cls_bpf_offload(tp, prog, oldprog);
	if (ret) {
		/*
		 * The program is fully set up by now, so the full
		 * teardown is needed rather than the errout path.
		 */
		__cls_bpf_delete_prog(prog);
		return ret;
	}

	if (!tc_in_hw(prog->gen_flags))
		prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;

	if (oldprog) {
		list_replace_rcu(&oldprog->link, &prog->link);
		tcf_unbind_filter(tp, &oldprog->res);
		call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
	} else {
		list_add_rcu(&prog->link, &head->plist);
	}

	*arg = prog;
	return 0;

errout:
	tcf_exts_destroy(&prog->exts);
	kfree(prog);
	return ret;
}

518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538
/*
 * Emit the classic BPF description of @prog into @skb: the instruction
 * count (TCA_BPF_OPS_LEN) followed by the instructions (TCA_BPF_OPS).
 * Returns 0, or -EMSGSIZE if either attribute could not be added.
 */
static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog,
				 struct sk_buff *skb)
{
	struct nlattr *ops_attr;

	if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops) != 0)
		return -EMSGSIZE;

	ops_attr = nla_reserve(skb, TCA_BPF_OPS,
			       prog->bpf_num_ops * sizeof(struct sock_filter));
	if (!ops_attr)
		return -EMSGSIZE;

	/* Copy the instructions into the space reserved above. */
	memcpy(nla_data(ops_attr), prog->bpf_ops, nla_len(ops_attr));
	return 0;
}

static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
				  struct sk_buff *skb)
{
539 540
	struct nlattr *nla;

541 542 543 544
	if (prog->bpf_name &&
	    nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name))
		return -EMSGSIZE;

545 546 547
	if (nla_put_u32(skb, TCA_BPF_ID, prog->filter->aux->id))
		return -EMSGSIZE;

548
	nla = nla_reserve(skb, TCA_BPF_TAG, sizeof(prog->filter->tag));
549 550 551
	if (nla == NULL)
		return -EMSGSIZE;

552
	memcpy(nla_data(nla), prog->filter->tag, nla_len(nla));
553

554 555 556
	return 0;
}

557
static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, void *fh,
558 559
			struct sk_buff *skb, struct tcmsg *tm)
{
560
	struct cls_bpf_prog *prog = fh;
561
	struct nlattr *nest;
562
	u32 bpf_flags = 0;
563
	int ret;
564 565 566 567 568 569

	if (prog == NULL)
		return skb->len;

	tm->tcm_handle = prog->handle;

570 571
	cls_bpf_offload_update_stats(tp, prog);

572 573 574 575
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

576 577
	if (prog->res.classid &&
	    nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
578 579
		goto nla_put_failure;

580 581 582 583 584
	if (cls_bpf_is_ebpf(prog))
		ret = cls_bpf_dump_ebpf_info(prog, skb);
	else
		ret = cls_bpf_dump_bpf_info(prog, skb);
	if (ret)
585 586
		goto nla_put_failure;

587
	if (tcf_exts_dump(skb, &prog->exts) < 0)
588 589
		goto nla_put_failure;

590 591 592 593
	if (prog->exts_integrated)
		bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT;
	if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags))
		goto nla_put_failure;
594 595 596
	if (prog->gen_flags &&
	    nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags))
		goto nla_put_failure;
597

598 599
	nla_nest_end(skb, nest);

600
	if (tcf_exts_dump_stats(skb, &prog->exts) < 0)
601 602 603 604 605 606 607 608 609 610 611
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
J
John Fastabend 已提交
612
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
613 614
	struct cls_bpf_prog *prog;

615
	list_for_each_entry(prog, &head->plist, link) {
616 617
		if (arg->count < arg->skip)
			goto skip;
618
		if (arg->fn(tp, prog, arg) < 0) {
619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651
			arg->stop = 1;
			break;
		}
skip:
		arg->count++;
	}
}

/*
 * Classifier operations table for kind "bpf"; registered in
 * cls_bpf_init_mod() and unregistered in cls_bpf_exit_mod().
 */
static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
	.kind		=	"bpf",
	.owner		=	THIS_MODULE,
	.classify	=	cls_bpf_classify,
	.init		=	cls_bpf_init,
	.destroy	=	cls_bpf_destroy,
	.get		=	cls_bpf_get,
	.change		=	cls_bpf_change,
	.delete		=	cls_bpf_delete,
	.walk		=	cls_bpf_walk,
	.dump		=	cls_bpf_dump,
};

/* Module entry point: register the "bpf" classifier operations. */
static int __init cls_bpf_init_mod(void)
{
	int err = register_tcf_proto_ops(&cls_bpf_ops);

	return err;
}

/* Module exit point: unregister the "bpf" classifier operations. */
static void __exit cls_bpf_exit_mod(void)
{
	unregister_tcf_proto_ops(&cls_bpf_ops);
}

/* Hook the functions above up as the module's init and exit handlers. */
module_init(cls_bpf_init_mod);
module_exit(cls_bpf_exit_mod);