diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h new file mode 100644 index 0000000000000000000000000000000000000000..ec80d0c0953e63182ef79c72baa6901162cce009 --- /dev/null +++ b/include/linux/bpf-cgroup.h @@ -0,0 +1,79 @@ +#ifndef _BPF_CGROUP_H +#define _BPF_CGROUP_H + +#include +#include +#include + +struct sock; +struct cgroup; +struct sk_buff; + +#ifdef CONFIG_CGROUP_BPF + +extern struct static_key_false cgroup_bpf_enabled_key; +#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) + +struct cgroup_bpf { + /* + * Store two sets of bpf_prog pointers, one for programs that are + * pinned directly to this cgroup, and one for those that are effective + * when this cgroup is accessed. + */ + struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE]; + struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE]; +}; + +void cgroup_bpf_put(struct cgroup *cgrp); +void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent); + +void __cgroup_bpf_update(struct cgroup *cgrp, + struct cgroup *parent, + struct bpf_prog *prog, + enum bpf_attach_type type); + +/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */ +void cgroup_bpf_update(struct cgroup *cgrp, + struct bpf_prog *prog, + enum bpf_attach_type type); + +int __cgroup_bpf_run_filter(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type); + +/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */ +#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter(sk, skb, \ + BPF_CGROUP_INET_INGRESS); \ + \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ + typeof(sk) __sk = sk_to_full_sk(sk); \ + if (sk_fullsock(__sk)) \ + __ret = __cgroup_bpf_run_filter(__sk, skb, \ + BPF_CGROUP_INET_EGRESS); \ + } \ + __ret; \ +}) + +#else + +struct cgroup_bpf {}; +static inline void cgroup_bpf_put(struct cgroup *cgrp) {} +static inline void cgroup_bpf_inherit(struct cgroup *cgrp, + struct cgroup *parent) {} + +#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) + +#endif /* CONFIG_CGROUP_BPF */ + +#endif /* _BPF_CGROUP_H */ diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5b17de62c962cd73d625427c2230d66e08cbcb4b..861b4677fc5b41134f96da33710a79735b827fe0 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -300,6 +301,9 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work; + /* used to store eBPF programs */ + struct cgroup_bpf bpf; + /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7d9b2832c280c82172cb428a8e6d793359891db0..1370a9d1456fb80e0ee6930a32e07b456406fc28 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -73,6 +73,8 @@ enum bpf_cmd { BPF_PROG_LOAD, BPF_OBJ_PIN, BPF_OBJ_GET, + BPF_PROG_ATTACH, + BPF_PROG_DETACH, }; enum bpf_map_type { @@ -98,8 +100,17 @@ enum bpf_prog_type { BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_PERF_EVENT, + BPF_PROG_TYPE_CGROUP_SKB, }; +enum bpf_attach_type { + BPF_CGROUP_INET_INGRESS, + BPF_CGROUP_INET_EGRESS, + __MAX_BPF_ATTACH_TYPE +}; + +#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -150,6 +161,12 @@ union bpf_attr { __aligned_u64 pathname; __u32 bpf_fd; }; + + struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ + __u32 target_fd; /* container object to attach to */ + __u32 attach_bpf_fd; /* eBPF program to attach */ + __u32 attach_type; + }; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/init/Kconfig b/init/Kconfig index 34407f15e6d34da57be238f69441f1dad9e60764..405120b5f13e843caeb70cf87ee70458bc2746dd 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1154,6 +1154,18 @@ config CGROUP_PERF Say N if unsure. +config CGROUP_BPF + bool "Support for eBPF programs attached to cgroups" + depends on BPF_SYSCALL && SOCK_CGROUP_DATA + help + Allow attaching eBPF programs to a cgroup using the bpf(2) + syscall command BPF_PROG_ATTACH. + + In which context these programs are accessed depends on the type + of attachment. For instance, programs that are attached using + BPF_CGROUP_INET_INGRESS will be executed on the ingress path of + inet sockets. + config CGROUP_DEBUG bool "Example controller" default n diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index c4d89d6e2058481d149733d4767cf98bd99eeaa9..1276474ac3cd9ddf2a87f37312edc661d2ec335c 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +obj-$(CONFIG_CGROUP_BPF) += cgroup.o diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c new file mode 100644 index 0000000000000000000000000000000000000000..a0ab43f264b0040ab28db16dec4ff6eaaae6a637 --- /dev/null +++ b/kernel/bpf/cgroup.c @@ -0,0 +1,167 @@ +/* + * Functions to manage eBPF programs attached to cgroups + * + * Copyright (c) 2016 Daniel Mack + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +EXPORT_SYMBOL(cgroup_bpf_enabled_key); + +/** + * cgroup_bpf_put() - put references of all bpf programs + * @cgrp: the cgroup to modify + */ +void cgroup_bpf_put(struct cgroup *cgrp) +{ + unsigned int type; + + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { + struct bpf_prog *prog = cgrp->bpf.prog[type]; + + if (prog) { + bpf_prog_put(prog); + static_branch_dec(&cgroup_bpf_enabled_key); + } + } +} + +/** + * cgroup_bpf_inherit() - inherit effective programs from parent + * @cgrp: the cgroup to modify + * @parent: the parent to inherit from + */ +void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +{ + unsigned int type; + + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { + struct bpf_prog *e; + + e = rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + rcu_assign_pointer(cgrp->bpf.effective[type], e); + } +} + +/** + * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + * propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @parent: The parent of @cgrp, or %NULL if @cgrp is the root + * @prog: A new program to pin + * @type: Type of pinning operation (ingress/egress) + * + * Each cgroup has a set of two pointers for bpf programs; one for eBPF + * programs it owns, and which is effective for execution. + * + * If @prog is %NULL, this function attaches a new program to the cgroup and + * releases the one that is currently attached, if any. @prog is then made + * the effective program of type @type in that cgroup. + * + * If @prog is %NULL, the currently attached program of type @type is released, + * and the effective program of the parent cgroup (if any) is inherited to + * @cgrp. + * + * Then, the descendants of @cgrp are walked and the effective program for + * each of them is set to the effective program of @cgrp unless the + * descendant has its own program attached, in which case the subbranch is + * skipped. This ensures that delegated subcgroups with own programs are left + * untouched. + * + * Must be called with cgroup_mutex held. + */ +void __cgroup_bpf_update(struct cgroup *cgrp, + struct cgroup *parent, + struct bpf_prog *prog, + enum bpf_attach_type type) +{ + struct bpf_prog *old_prog, *effective; + struct cgroup_subsys_state *pos; + + old_prog = xchg(cgrp->bpf.prog + type, prog); + + effective = (!prog && parent) ? + rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)) : + prog; + + css_for_each_descendant_pre(pos, &cgrp->self) { + struct cgroup *desc = container_of(pos, struct cgroup, self); + + /* skip the subtree if the descendant has its own program */ + if (desc->bpf.prog[type] && desc != cgrp) + pos = css_rightmost_descendant(pos); + else + rcu_assign_pointer(desc->bpf.effective[type], + effective); + } + + if (prog) + static_branch_inc(&cgroup_bpf_enabled_key); + + if (old_prog) { + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + } +} + +/** + * __cgroup_bpf_run_filter() - Run a program for packet filtering + * @sk: The socken sending or receiving traffic + * @skb: The skb that is being sent or received + * @type: The type of program to be exectuted + * + * If no socket is passed, or the socket is not of type INET or INET6, + * this function does nothing and returns 0. + * + * The program type passed in via @type must be suitable for network + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + int ret = 0; + + if (!sk || !sk_fullsock(sk)) + return 0; + + if (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) { + unsigned int offset = skb->data - skb_network_header(skb); + + __skb_push(skb, offset); + ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; + __skb_pull(skb, offset); + } + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index eb15498b8d55c5966397ff10dcdd16adec8dd56d..1090d16a31c154c5557ee99db3ccdbf5fe8ab8b0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -835,6 +835,77 @@ static int bpf_obj_get(const union bpf_attr *attr) return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); } +#ifdef CONFIG_CGROUP_BPF + +#define BPF_PROG_ATTACH_LAST_FIELD attach_type + +static int bpf_prog_attach(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_ATTACH)) + return -EINVAL; + + switch (attr->attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_CGROUP_SKB); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) { + bpf_prog_put(prog); + return PTR_ERR(cgrp); + } + + cgroup_bpf_update(cgrp, prog, attr->attach_type); + cgroup_put(cgrp); + break; + + default: + return -EINVAL; + } + + return 0; +} + +#define BPF_PROG_DETACH_LAST_FIELD attach_type + +static int bpf_prog_detach(const union bpf_attr *attr) +{ + struct cgroup *cgrp; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_DETACH)) + return -EINVAL; + + switch (attr->attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + cgroup_bpf_update(cgrp, NULL, attr->attach_type); + cgroup_put(cgrp); + break; + + default: + return -EINVAL; + } + + return 0; +} +#endif /* CONFIG_CGROUP_BPF */ + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -901,6 +972,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; + +#ifdef CONFIG_CGROUP_BPF + case BPF_PROG_ATTACH: + err = bpf_prog_attach(&attr); + break; + case BPF_PROG_DETACH: + err = bpf_prog_detach(&attr); + break; +#endif + default: err = -EINVAL; break; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 85bc9beb046d9a6deda2e3564f4d5bd01d6fc27b..2ee9ec3051b20774b118a57e4609f30e87bf82be 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5074,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work) if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + + cgroup_bpf_put(cgrp); } mutex_unlock(&cgroup_mutex); @@ -5281,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); + if (parent) + cgroup_bpf_inherit(cgrp, parent); + cgroup_propagate_control(cgrp); /* @cgrp doesn't have dir yet so the following will only create csses */ @@ -6495,6 +6500,19 @@ static __init int cgroup_namespaces_init(void) } subsys_initcall(cgroup_namespaces_init); +#ifdef CONFIG_CGROUP_BPF +void cgroup_bpf_update(struct cgroup *cgrp, + struct bpf_prog *prog, + enum bpf_attach_type type) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + mutex_lock(&cgroup_mutex); + __cgroup_bpf_update(cgrp, parent, prog, type); + mutex_unlock(&cgroup_mutex); +} +#endif /* CONFIG_CGROUP_BPF */ + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/net/core/filter.c b/net/core/filter.c index dece94fef0050731e81fc141d4da16b82449ecf3..ea315af56511aa64d26aaa3c5f2a79d13df07a68 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -78,6 +78,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) return -ENOMEM; + err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); + if (err) + return err; + err = security_sock_rcv_skb(sk, skb); if (err) return err; @@ -2630,6 +2634,17 @@ xdp_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + default: + return sk_filter_func_proto(func_id); + } +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(struct __sk_buff)) @@ -2992,6 +3007,12 @@ static const struct bpf_verifier_ops xdp_ops = { .convert_ctx_access = xdp_convert_ctx_access, }; +static const struct bpf_verifier_ops cg_skb_ops = { + .get_func_proto = cg_skb_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -3012,12 +3033,18 @@ static struct bpf_prog_type_list xdp_type __read_mostly = { .type = BPF_PROG_TYPE_XDP, }; +static struct bpf_prog_type_list cg_skb_type __read_mostly = { + .ops = &cg_skb_ops, + .type = BPF_PROG_TYPE_CGROUP_SKB, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); + bpf_register_prog_type(&cg_skb_type); return 0; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 358f2c82b03067955064e69fe19b921d9d643c39..9af2b7853be4b82e989b421e8194c406e42f947c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include #include @@ -285,6 +286,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ @@ -303,6 +311,20 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk return ip_finish_output2(net, sk, skb); } +static int ip_mc_finish_output(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } + + return dev_loopback_xmit(net, sk, skb); +} + int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); @@ -340,7 +362,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, - dev_loopback_xmit); + ip_mc_finish_output); } /* Multicasts with ttl 0 must not go beyond the host */ @@ -356,7 +378,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, - dev_loopback_xmit); + ip_mc_finish_output); } return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 312cbd0e5038de3031c87d600d13006472b2432d..70d0de4041972ceaeb4656fa4bdef884a5403b10 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -39,6 +39,7 @@ #include #include +#include #include #include @@ -131,6 +132,14 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } + if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index f394ac616ed8ea37b1929236ce862ac2260039d7..fb17206ddb57a4c11fd7f526792f9a619fcb450e 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -22,6 +22,7 @@ hostprogs-y += spintest hostprogs-y += map_perf_test hostprogs-y += test_overhead hostprogs-y += test_cgrp2_array_pin +hostprogs-y += test_cgrp2_attach hostprogs-y += xdp1 hostprogs-y += xdp2 hostprogs-y += test_current_task_under_cgroup @@ -49,6 +50,7 @@ spintest-objs := bpf_load.o libbpf.o spintest_user.o map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o +test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o xdp1-objs := bpf_load.o libbpf.o xdp1_user.o # reuse xdp1 source intentionally xdp2-objs := bpf_load.o libbpf.o xdp1_user.o diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c index 9969e35550c3dd3c9702e7e99dab35b3df9fb7f6..9ce707bf02a7a7aee8e37383af34429d97c569cf 100644 --- a/samples/bpf/libbpf.c +++ b/samples/bpf/libbpf.c @@ -104,6 +104,27 @@ int bpf_prog_load(enum bpf_prog_type prog_type, return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); } +int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type) +{ + union bpf_attr attr = { + .target_fd = target_fd, + .attach_bpf_fd = prog_fd, + .attach_type = type, + }; + + return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr)); +} + +int bpf_prog_detach(int target_fd, enum bpf_attach_type type) +{ + union bpf_attr attr = { + .target_fd = target_fd, + .attach_type = type, + }; + + return syscall(__NR_bpf, BPF_PROG_DETACH, &attr, sizeof(attr)); +} + int bpf_obj_pin(int fd, const char *pathname) { union bpf_attr attr = { diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h index de96a935068d13857c6779ddef5d0422455149e3..94a901d86fc2ea4867a3993f87380e9894d3b1f5 100644 --- a/samples/bpf/libbpf.h +++ b/samples/bpf/libbpf.h @@ -15,6 +15,9 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int insn_len, const char *license, int kern_version); +int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type); +int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); + int bpf_obj_pin(int fd, const char *pathname); int bpf_obj_get(const char *pathname); diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c new file mode 100644 index 0000000000000000000000000000000000000000..63ef2083f766b9e319ae4a43bdf4fb7095e5d855 --- /dev/null +++ b/samples/bpf/test_cgrp2_attach.c @@ -0,0 +1,147 @@ +/* eBPF example program: + * + * - Creates arraymap in kernel with 4 bytes keys and 8 byte values + * + * - Loads eBPF program + * + * The eBPF program accesses the map passed in to store two pieces of + * information. The number of invocations of the program, which maps + * to the number of packets received, is stored to key 0. Key 1 is + * incremented on each iteration by the number of bytes stored in + * the skb. + * + * - Detaches any eBPF program previously attached to the cgroup + * + * - Attaches the new program to a cgroup using BPF_PROG_ATTACH + * + * - Every second, reads map[0] and map[1] to see how many bytes and + * packets were seen on any socket of tasks in the given cgroup. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "libbpf.h" + +enum { + MAP_KEY_PACKETS, + MAP_KEY_BYTES, +}; + +static int prog_load(int map_fd, int verdict) +{ + struct bpf_insn prog[] = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* save r6 so it's not clobbered by BPF_CALL */ + + /* Count packets */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* load map fd to r1 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + /* Count bytes */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + + return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SKB, + prog, sizeof(prog), "GPL", 0); +} + +static int usage(const char *argv0) +{ + printf("Usage: %s [drop]\n", argv0); + return EXIT_FAILURE; +} + +int main(int argc, char **argv) +{ + int cg_fd, map_fd, prog_fd, key, ret; + long long pkt_cnt, byte_cnt; + enum bpf_attach_type type; + int verdict = 1; + + if (argc < 3) + return usage(argv[0]); + + if (strcmp(argv[2], "ingress") == 0) + type = BPF_CGROUP_INET_INGRESS; + else if (strcmp(argv[2], "egress") == 0) + type = BPF_CGROUP_INET_EGRESS; + else + return usage(argv[0]); + + if (argc > 3 && strcmp(argv[3], "drop") == 0) + verdict = 0; + + cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY); + if (cg_fd < 0) { + printf("Failed to open cgroup path: '%s'\n", strerror(errno)); + return EXIT_FAILURE; + } + + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, + sizeof(key), sizeof(byte_cnt), + 256, 0); + if (map_fd < 0) { + printf("Failed to create map: '%s'\n", strerror(errno)); + return EXIT_FAILURE; + } + + prog_fd = prog_load(map_fd, verdict); + printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf); + + if (prog_fd < 0) { + printf("Failed to load prog: '%s'\n", strerror(errno)); + return EXIT_FAILURE; + } + + ret = bpf_prog_detach(cg_fd, type); + printf("bpf_prog_detach() returned '%s' (%d)\n", strerror(errno), errno); + + ret = bpf_prog_attach(prog_fd, cg_fd, type); + if (ret < 0) { + printf("Failed to attach prog to cgroup: '%s'\n", + strerror(errno)); + return EXIT_FAILURE; + } + + while (1) { + key = MAP_KEY_PACKETS; + assert(bpf_lookup_elem(map_fd, &key, &pkt_cnt) == 0); + + key = MAP_KEY_BYTES; + assert(bpf_lookup_elem(map_fd, &key, &byte_cnt) == 0); + + printf("cgroup received %lld packets, %lld bytes\n", + pkt_cnt, byte_cnt); + sleep(1); + } + + return EXIT_SUCCESS; +}