/*
 * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/filter.h>
#include <linux/bpf.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>

#include <linux/tc_act/tc_bpf.h>
#include <net/tc_act/tc_bpf.h>

24 25 26 27 28 29
/* Table mask handed to tc_action_net_init() in bpf_init_net(). */
#define BPF_TAB_MASK		15
/* Upper bound on the TCA_ACT_BPF_NAME string (see act_bpf_policy). */
#define ACT_BPF_NAME_LEN	256

struct tcf_bpf_cfg {
	struct bpf_prog *filter;
	struct sock_filter *bpf_ops;
30
	const char *bpf_name;
31 32
	u32 bpf_fd;
	u16 bpf_num_ops;
33
	bool is_ebpf;
34
};
J
Jiri Pirko 已提交
35

36 37
/* Per-netns id: registered via bpf_net_ops.id, used with net_generic(). */
static int bpf_net_id;

38
static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
J
Jiri Pirko 已提交
39 40
		   struct tcf_result *res)
{
41
	struct tcf_bpf *prog = act->priv;
42
	struct bpf_prog *filter;
43
	int action, filter_res;
44
	bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS;
J
Jiri Pirko 已提交
45

46 47 48
	if (unlikely(!skb_mac_header_was_set(skb)))
		return TC_ACT_UNSPEC;

49 50
	tcf_lastuse_update(&prog->tcf_tm);
	bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb);
J
Jiri Pirko 已提交
51

52
	rcu_read_lock();
53
	filter = rcu_dereference(prog->filter);
54 55
	if (at_ingress) {
		__skb_push(skb, skb->mac_len);
56
		bpf_compute_data_end(skb);
57
		filter_res = BPF_PROG_RUN(filter, skb);
58 59
		__skb_pull(skb, skb->mac_len);
	} else {
60
		bpf_compute_data_end(skb);
61
		filter_res = BPF_PROG_RUN(filter, skb);
62
	}
63
	rcu_read_unlock();
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78

	/* A BPF program may overwrite the default action opcode.
	 * Similarly as in cls_bpf, if filter_res == -1 we use the
	 * default action specified from tc.
	 *
	 * In case a different well-known TC_ACT opcode has been
	 * returned, it will overwrite the default one.
	 *
	 * For everything else that is unkown, TC_ACT_UNSPEC is
	 * returned.
	 */
	switch (filter_res) {
	case TC_ACT_PIPE:
	case TC_ACT_RECLASSIFY:
	case TC_ACT_OK:
79
	case TC_ACT_REDIRECT:
80 81 82 83
		action = filter_res;
		break;
	case TC_ACT_SHOT:
		action = filter_res;
84
		qstats_drop_inc(this_cpu_ptr(prog->common.cpu_qstats));
85 86
		break;
	case TC_ACT_UNSPEC:
87
		action = prog->tcf_action;
88 89 90 91
		break;
	default:
		action = TC_ACT_UNSPEC;
		break;
J
Jiri Pirko 已提交
92 93 94 95 96
	}

	return action;
}

97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
static bool tcf_bpf_is_ebpf(const struct tcf_bpf *prog)
{
	return !prog->bpf_ops;
}

/* Emit the classic BPF description of @prog into @skb: the instruction
 * count (TCA_ACT_BPF_OPS_LEN) followed by the raw instructions
 * (TCA_ACT_BPF_OPS).
 *
 * Returns 0 on success or -EMSGSIZE when an attribute cannot be added.
 */
static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog,
				 struct sk_buff *skb)
{
	struct nlattr *ops_attr;

	if (nla_put_u16(skb, TCA_ACT_BPF_OPS_LEN, prog->bpf_num_ops) != 0)
		return -EMSGSIZE;

	ops_attr = nla_reserve(skb, TCA_ACT_BPF_OPS,
			       prog->bpf_num_ops * sizeof(struct sock_filter));
	if (!ops_attr)
		return -EMSGSIZE;

	/* fill the reserved payload with the stored instructions */
	memcpy(nla_data(ops_attr), prog->bpf_ops, nla_len(ops_attr));

	return 0;
}

/* Emit the eBPF description of @prog into @skb: the stored fd
 * (TCA_ACT_BPF_FD) and, when one was given, the program name
 * (TCA_ACT_BPF_NAME).
 *
 * Returns 0 on success or -EMSGSIZE when an attribute cannot be added.
 */
static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog,
				  struct sk_buff *skb)
{
	if (nla_put_u32(skb, TCA_ACT_BPF_FD, prog->bpf_fd))
		return -EMSGSIZE;

	/* the name is optional */
	if (!prog->bpf_name)
		return 0;

	if (nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name))
		return -EMSGSIZE;

	return 0;
}

static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
J
Jiri Pirko 已提交
134 135 136
			int bind, int ref)
{
	unsigned char *tp = skb_tail_pointer(skb);
137
	struct tcf_bpf *prog = act->priv;
J
Jiri Pirko 已提交
138
	struct tc_act_bpf opt = {
139 140 141 142
		.index   = prog->tcf_index,
		.refcnt  = prog->tcf_refcnt - ref,
		.bindcnt = prog->tcf_bindcnt - bind,
		.action  = prog->tcf_action,
J
Jiri Pirko 已提交
143
	};
144 145
	struct tcf_t tm;
	int ret;
J
Jiri Pirko 已提交
146 147 148 149

	if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;

150 151 152 153 154
	if (tcf_bpf_is_ebpf(prog))
		ret = tcf_bpf_dump_ebpf_info(prog, skb);
	else
		ret = tcf_bpf_dump_bpf_info(prog, skb);
	if (ret)
J
Jiri Pirko 已提交
155 156
		goto nla_put_failure;

157
	tcf_tm_dump(&tm, &prog->tcf_tm);
158 159
	if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm,
			  TCA_ACT_BPF_PAD))
J
Jiri Pirko 已提交
160
		goto nla_put_failure;
161

J
Jiri Pirko 已提交
162 163 164 165 166 167 168 169 170
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, tp);
	return -1;
}

static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = {
	[TCA_ACT_BPF_PARMS]	= { .len = sizeof(struct tc_act_bpf) },
171
	[TCA_ACT_BPF_FD]	= { .type = NLA_U32 },
172 173
	[TCA_ACT_BPF_NAME]	= { .type = NLA_NUL_STRING,
				    .len = ACT_BPF_NAME_LEN },
J
Jiri Pirko 已提交
174 175 176 177 178
	[TCA_ACT_BPF_OPS_LEN]	= { .type = NLA_U16 },
	[TCA_ACT_BPF_OPS]	= { .type = NLA_BINARY,
				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
};

179
static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
J
Jiri Pirko 已提交
180 181
{
	struct sock_filter *bpf_ops;
182
	struct sock_fprog_kern fprog_tmp;
J
Jiri Pirko 已提交
183
	struct bpf_prog *fp;
184
	u16 bpf_size, bpf_num_ops;
J
Jiri Pirko 已提交
185 186 187 188 189 190 191
	int ret;

	bpf_num_ops = nla_get_u16(tb[TCA_ACT_BPF_OPS_LEN]);
	if (bpf_num_ops	> BPF_MAXINSNS || bpf_num_ops == 0)
		return -EINVAL;

	bpf_size = bpf_num_ops * sizeof(*bpf_ops);
192 193 194
	if (bpf_size != nla_len(tb[TCA_ACT_BPF_OPS]))
		return -EINVAL;

J
Jiri Pirko 已提交
195
	bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
196
	if (bpf_ops == NULL)
J
Jiri Pirko 已提交
197 198 199 200
		return -ENOMEM;

	memcpy(bpf_ops, nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size);

201 202
	fprog_tmp.len = bpf_num_ops;
	fprog_tmp.filter = bpf_ops;
J
Jiri Pirko 已提交
203

204 205 206 207 208
	ret = bpf_prog_create(&fp, &fprog_tmp);
	if (ret < 0) {
		kfree(bpf_ops);
		return ret;
	}
J
Jiri Pirko 已提交
209

210 211 212
	cfg->bpf_ops = bpf_ops;
	cfg->bpf_num_ops = bpf_num_ops;
	cfg->filter = fp;
213
	cfg->is_ebpf = false;
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247

	return 0;
}

static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
{
	struct bpf_prog *fp;
	char *name = NULL;
	u32 bpf_fd;

	bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]);

	fp = bpf_prog_get(bpf_fd);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	if (fp->type != BPF_PROG_TYPE_SCHED_ACT) {
		bpf_prog_put(fp);
		return -EINVAL;
	}

	if (tb[TCA_ACT_BPF_NAME]) {
		name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]),
			       nla_len(tb[TCA_ACT_BPF_NAME]),
			       GFP_KERNEL);
		if (!name) {
			bpf_prog_put(fp);
			return -ENOMEM;
		}
	}

	cfg->bpf_fd = bpf_fd;
	cfg->bpf_name = name;
	cfg->filter = fp;
248
	cfg->is_ebpf = true;
249 250 251 252

	return 0;
}

253 254 255 256 257 258 259 260 261 262 263 264 265 266 267
/* Release everything referenced by @cfg: the program (via bpf_prog_put()
 * for eBPF, bpf_prog_destroy() for classic BPF), the classic instruction
 * copy and the name.
 *
 * A @cfg without a program attached is tolerated, matching the
 * NULL-tolerant kfree() calls on the other two members.
 */
static void tcf_bpf_cfg_cleanup(const struct tcf_bpf_cfg *cfg)
{
	struct bpf_prog *filter = cfg->filter;

	if (filter) {
		if (cfg->is_ebpf)
			bpf_prog_put(filter);
		else
			bpf_prog_destroy(filter);
	}

	kfree(cfg->bpf_ops);
	kfree(cfg->bpf_name);
}

/* Snapshot the resources currently held by @prog into @cfg so they can be
 * handed to tcf_bpf_cfg_cleanup().
 *
 * Only filter, bpf_ops, bpf_name and is_ebpf are written; bpf_fd and
 * bpf_num_ops in @cfg are left untouched.
 */
static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
				  struct tcf_bpf_cfg *cfg)
{
	cfg->bpf_name = prog->bpf_name;
	cfg->bpf_ops = prog->bpf_ops;
	cfg->is_ebpf = tcf_bpf_is_ebpf(prog);

	/* updates to prog->filter are prevented, since it's called either
	 * with rtnl lock or during final cleanup in rcu callback
	 */
	cfg->filter = rcu_dereference_protected(prog->filter, 1);
}

277 278 279 280
static int tcf_bpf_init(struct net *net, struct nlattr *nla,
			struct nlattr *est, struct tc_action *act,
			int replace, int bind)
{
281
	struct tc_action_net *tn = net_generic(net, bpf_net_id);
282
	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
283
	struct tcf_bpf_cfg cfg, old;
284 285 286
	struct tc_act_bpf *parm;
	struct tcf_bpf *prog;
	bool is_bpf, is_ebpf;
287
	int ret, res = 0;
288 289 290 291 292 293 294 295

	if (!nla)
		return -EINVAL;

	ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy);
	if (ret < 0)
		return ret;

296
	if (!tb[TCA_ACT_BPF_PARMS])
297 298 299 300
		return -EINVAL;

	parm = nla_data(tb[TCA_ACT_BPF_PARMS]);

301 302
	if (!tcf_hash_check(tn, parm->index, act, bind)) {
		ret = tcf_hash_create(tn, parm->index, est, act,
303
				      sizeof(*prog), bind, true);
304
		if (ret < 0)
305
			return ret;
J
Jiri Pirko 已提交
306

307
		res = ACT_P_CREATED;
J
Jiri Pirko 已提交
308
	} else {
309
		/* Don't override defaults. */
J
Jiri Pirko 已提交
310
		if (bind)
311
			return 0;
312 313

		tcf_hash_release(act, bind);
314 315
		if (!replace)
			return -EEXIST;
J
Jiri Pirko 已提交
316 317
	}

318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
	is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
	is_ebpf = tb[TCA_ACT_BPF_FD];

	if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) {
		ret = -EINVAL;
		goto out;
	}

	memset(&cfg, 0, sizeof(cfg));

	ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) :
		       tcf_bpf_init_from_efd(tb, &cfg);
	if (ret < 0)
		goto out;

333
	prog = to_bpf(act);
334
	ASSERT_RTNL();
335

336
	if (res != ACT_P_CREATED)
337 338
		tcf_bpf_prog_fill_cfg(prog, &old);

339 340 341 342 343 344 345 346 347
	prog->bpf_ops = cfg.bpf_ops;
	prog->bpf_name = cfg.bpf_name;

	if (cfg.bpf_num_ops)
		prog->bpf_num_ops = cfg.bpf_num_ops;
	if (cfg.bpf_fd)
		prog->bpf_fd = cfg.bpf_fd;

	prog->tcf_action = parm->action;
348
	rcu_assign_pointer(prog->filter, cfg.filter);
J
Jiri Pirko 已提交
349

350
	if (res == ACT_P_CREATED) {
351
		tcf_hash_insert(tn, act);
352 353 354
	} else {
		/* make sure the program being replaced is no longer executing */
		synchronize_rcu();
355
		tcf_bpf_cfg_cleanup(&old);
356
	}
357

358 359 360 361
	return res;
out:
	if (res == ACT_P_CREATED)
		tcf_hash_cleanup(act, est);
J
Jiri Pirko 已提交
362 363 364 365

	return ret;
}

366
static void tcf_bpf_cleanup(struct tc_action *act, int bind)
J
Jiri Pirko 已提交
367
{
368
	struct tcf_bpf_cfg tmp;
369

370 371
	tcf_bpf_prog_fill_cfg(act->priv, &tmp);
	tcf_bpf_cfg_cleanup(&tmp);
J
Jiri Pirko 已提交
372 373
}

374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389
/* .walk hook: delegate to tcf_generic_walker() on this netns' BPF action
 * table.
 */
static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
			  struct netlink_callback *cb, int type,
			  struct tc_action *a)
{
	return tcf_generic_walker(net_generic(net, bpf_net_id),
				  skb, cb, type, a);
}

/* .lookup hook: delegate to tcf_hash_search() on this netns' BPF action
 * table.
 */
static int tcf_bpf_search(struct net *net, struct tc_action *a, u32 index)
{
	return tcf_hash_search(net_generic(net, bpf_net_id), a, index);
}

390 391 392 393 394 395 396 397
/* Operations table registered for the "bpf" TC action. */
static struct tc_action_ops act_bpf_ops __read_mostly = {
	.kind    = "bpf",
	.type    = TCA_ACT_BPF,
	.owner   = THIS_MODULE,
	.act     = tcf_bpf,
	.dump    = tcf_bpf_dump,
	.cleanup = tcf_bpf_cleanup,
	.init    = tcf_bpf_init,
	.walk    = tcf_bpf_walker,
	.lookup  = tcf_bpf_search,
};

/* Per-netns init: set up this namespace's BPF action table. */
static __net_init int bpf_init_net(struct net *net)
{
	return tc_action_net_init(net_generic(net, bpf_net_id),
				  &act_bpf_ops, BPF_TAB_MASK);
}

/* Per-netns exit: tear down this namespace's BPF action table. */
static void __net_exit bpf_exit_net(struct net *net)
{
	tc_action_net_exit(net_generic(net, bpf_net_id));
}

/* Per-netns hooks; each namespace gets a struct tc_action_net that is
 * reachable through bpf_net_id.
 */
static struct pernet_operations bpf_net_ops = {
	.init	= bpf_init_net,
	.exit	= bpf_exit_net,
	.id	= &bpf_net_id,
	.size	= sizeof(struct tc_action_net),
};

/* Module load: register the action ops together with the per-netns ops.
 * Returns whatever tcf_register_action() reports.
 */
static int __init bpf_init_module(void)
{
	int err = tcf_register_action(&act_bpf_ops, &bpf_net_ops);

	return err;
}

/* Module unload: undo the registration done in bpf_init_module(). */
static void __exit bpf_cleanup_module(void)
{
	tcf_unregister_action(&act_bpf_ops, &bpf_net_ops);
}

/* Module entry/exit points and metadata. */
module_init(bpf_init_module);
module_exit(bpf_cleanup_module);

MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>");
MODULE_DESCRIPTION("TC BPF based action");
MODULE_LICENSE("GPL v2");