diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cd8790d2c6ed6c4f66859a59525b2b16db39276f..db11662faea64b87c9f61844dd320377a577781e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -524,6 +524,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 }
 
 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
+int array_map_alloc_check(union bpf_attr *attr);
 
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
@@ -769,6 +770,33 @@ static inline void __xsk_map_flush(struct bpf_map *map)
 }
 #endif
 
+#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
+void bpf_sk_reuseport_detach(struct sock *sk);
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+				       void *value);
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags);
+#else
+static inline void bpf_sk_reuseport_detach(struct sock *sk)
+{
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
+						     void *key, void *value)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map,
+						     void *key, void *value,
+						     u64 map_flags)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BPF_SYSCALL */
+#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */
+
 /* verifier prototypes for helper functions called from eBPF programs */
 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index add08be53b6f8620e722b4a1bac67f68a72be333..14fd6c02d2588b6fd2f5949477dcfdb4962b8fbd 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -60,4 +60,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #if defined(CONFIG_XDP_SOCKETS)
 BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
 #endif
+#ifdef CONFIG_INET
+BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
+#endif
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index dd5758dc35d305e6b5a3faaa98e7390ac8984560..40f584bc7da0045e092d2c8b479d6584eeb2bc8e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -126,6 +126,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_XSKMAP,
 	BPF_MAP_TYPE_SOCKHASH,
 	BPF_MAP_TYPE_CGROUP_STORAGE,
+	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e8906cbad81f09848a5524f5c699e7f3a8f5b12b..0488b8258321257aac395f9ece7bef5e04004918 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -23,3 +23,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
 obj-$(CONFIG_CGROUP_BPF) += cgroup.o
+ifeq ($(CONFIG_INET),y)
+obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
+endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2aa55d030c774b972b5a1d1eaac6c053f4959115..f6ca3e712831d8b68a69d45a09845b36ff44170f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
 }
 
 /* Called from syscall */
-static int array_map_alloc_check(union bpf_attr *attr)
+int array_map_alloc_check(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
new file mode 100644
index 0000000000000000000000000000000000000000..18e225de80ff0edf1b12845b32a0f7098f2457be
--- /dev/null
+++ b/kernel/bpf/reuseport_array.c
@@ -0,0 +1,363 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Facebook
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/sock_diag.h>
+#include <net/sock_reuseport.h>
+
+struct reuseport_array {
+	struct bpf_map map;
+	struct sock __rcu *ptrs[];
+};
+
+static struct reuseport_array *reuseport_array(struct bpf_map *map)
+{
+	return (struct reuseport_array *)map;
+}
+
+/* The caller must hold the reuseport_lock */
+void bpf_sk_reuseport_detach(struct sock *sk)
+{
+	struct sock __rcu **socks;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	socks = sk->sk_user_data;
+	if (socks) {
+		WRITE_ONCE(sk->sk_user_data, NULL);
+		/*
+		 * Do not move this NULL assignment outside of
+		 * sk->sk_callback_lock because there is
+		 * a race with reuseport_array_free()
+		 * which does not hold the reuseport_lock.
+		 */
+		RCU_INIT_POINTER(*socks, NULL);
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int reuseport_array_alloc_check(union bpf_attr *attr)
+{
+	if (attr->value_size != sizeof(u32) &&
+	    attr->value_size != sizeof(u64))
+		return -EINVAL;
+
+	return array_map_alloc_check(attr);
+}
+
+static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	u32 index = *(u32 *)key;
+
+	if (unlikely(index >= array->map.max_entries))
+		return NULL;
+
+	return rcu_dereference(array->ptrs[index]);
+}
+
+/* Called from syscall only */
+static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	u32 index = *(u32 *)key;
+	struct sock *sk;
+	int err;
+
+	if (index >= map->max_entries)
+		return -E2BIG;
+
+	if (!rcu_access_pointer(array->ptrs[index]))
+		return -ENOENT;
+
+	spin_lock_bh(&reuseport_lock);
+
+	sk = rcu_dereference_protected(array->ptrs[index],
+				       lockdep_is_held(&reuseport_lock));
+	if (sk) {
+		write_lock_bh(&sk->sk_callback_lock);
+		WRITE_ONCE(sk->sk_user_data, NULL);
+		RCU_INIT_POINTER(array->ptrs[index], NULL);
+		write_unlock_bh(&sk->sk_callback_lock);
+		err = 0;
+	} else {
+		err = -ENOENT;
+	}
+
+	spin_unlock_bh(&reuseport_lock);
+
+	return err;
+}
+
+static void reuseport_array_free(struct bpf_map *map)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	struct sock *sk;
+	u32 i;
+
+	synchronize_rcu();
+
+	/*
+	 * ops->map_*_elem() will not be able to access this
+	 * array now. Hence, this function only races with
+	 * bpf_sk_reuseport_detach() which was triggered by
+	 * close() or disconnect().
+	 *
+	 * This function and bpf_sk_reuseport_detach() are
+	 * both removing sk from "array". Whoever removes it
+	 * first does not matter.
+	 *
+	 * The only concern here is that bpf_sk_reuseport_detach()
+	 * may access "array" which is being freed here.
+	 * bpf_sk_reuseport_detach() accesses this "array"
+	 * through sk->sk_user_data _and_ with sk->sk_callback_lock
+	 * held, which is enough because this "array" is not freed
+	 * until all sk->sk_user_data has stopped referencing this "array".
+	 *
+	 * Hence, due to the above, taking "reuseport_lock" is not
+	 * needed here.
+	 */
+
+	/*
+	 * Since reuseport_lock is not taken, sk is accessed under
+	 * rcu_read_lock().
+	 */
+	rcu_read_lock();
+	for (i = 0; i < map->max_entries; i++) {
+		sk = rcu_dereference(array->ptrs[i]);
+		if (sk) {
+			write_lock_bh(&sk->sk_callback_lock);
+			/*
+			 * No need for WRITE_ONCE(). At this point,
+			 * no one is reading it without taking the
+			 * sk->sk_callback_lock.
+			 */
+			sk->sk_user_data = NULL;
+			write_unlock_bh(&sk->sk_callback_lock);
+			RCU_INIT_POINTER(array->ptrs[i], NULL);
+		}
+	}
+	rcu_read_unlock();
+
+	/*
+	 * Once we reach here, no sk->sk_user_data is referencing
+	 * this "array" any more. "array" can be freed now.
+	 */
+	bpf_map_area_free(array);
+}
+
+static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
+{
+	int err, numa_node = bpf_map_attr_numa_node(attr);
+	struct reuseport_array *array;
+	u64 cost, array_size;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	array_size = sizeof(*array);
+	array_size += (u64)attr->max_entries * sizeof(struct sock *);
+
+	/* make sure there is no u32 overflow later in round_up() */
+	cost = array_size;
+	if (cost >= U32_MAX - PAGE_SIZE)
+		return ERR_PTR(-ENOMEM);
+	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	err = bpf_map_precharge_memlock(cost);
+	if (err)
+		return ERR_PTR(err);
+
+	/* allocate all map elements and zero-initialize them */
+	array = bpf_map_area_alloc(array_size, numa_node);
+	if (!array)
+		return ERR_PTR(-ENOMEM);
+
+	/* copy mandatory map attributes */
+	bpf_map_init_from_attr(&array->map, attr);
+	array->map.pages = cost;
+
+	return &array->map;
+}
+
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+				       void *value)
+{
+	struct sock *sk;
+	int err;
+
+	if (map->value_size != sizeof(u64))
+		return -ENOSPC;
+
+	rcu_read_lock();
+	sk = reuseport_array_lookup_elem(map, key);
+	if (sk) {
+		*(u64 *)value = sock_gen_cookie(sk);
+		err = 0;
+	} else {
+		err = -ENOENT;
+	}
+	rcu_read_unlock();
+
+	return err;
+}
+
+static int
+reuseport_array_update_check(const struct reuseport_array *array,
+			     const struct sock *nsk,
+			     const struct sock *osk,
+			     const struct sock_reuseport *nsk_reuse,
+			     u32 map_flags)
+{
+	if (osk && map_flags == BPF_NOEXIST)
+		return -EEXIST;
+
+	if (!osk && map_flags == BPF_EXIST)
+		return -ENOENT;
+
+	if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP)
+		return -ENOTSUPP;
+
+	if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6)
+		return -ENOTSUPP;
+
+	if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM)
+		return -ENOTSUPP;
+
+	/*
+	 * sk must be hashed (i.e. listening in the TCP case or bound
+	 * in the UDP case) and
+	 * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL).
+	 *
+	 * Also, sk will be used in a bpf helper that is protected by
+	 * rcu_read_lock().
+	 */
+	if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse)
+		return -EINVAL;
+
+	/* READ_ONCE because the sk->sk_callback_lock may not be held here */
+	if (READ_ONCE(nsk->sk_user_data))
+		return -EBUSY;
+
+	return 0;
+}
+
+/*
+ * Called from syscall only.
+ * The "nsk" is held by the fd refcnt.
+ * The "osk" and "reuse" are protected by reuseport_lock.
+ */
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	struct sock *free_osk = NULL, *osk, *nsk;
+	struct sock_reuseport *reuse;
+	u32 index = *(u32 *)key;
+	struct socket *socket;
+	int err, fd;
+
+	if (map_flags > BPF_EXIST)
+		return -EINVAL;
+
+	if (index >= map->max_entries)
+		return -E2BIG;
+
+	if (map->value_size == sizeof(u64)) {
+		u64 fd64 = *(u64 *)value;
+
+		if (fd64 > S32_MAX)
+			return -EINVAL;
+		fd = fd64;
+	} else {
+		fd = *(int *)value;
+	}
+
+	socket = sockfd_lookup(fd, &err);
+	if (!socket)
+		return err;
+
+	nsk = socket->sk;
+	if (!nsk) {
+		err = -EINVAL;
+		goto put_file;
+	}
+
+	/* Quick checks before taking reuseport_lock */
+	err = reuseport_array_update_check(array, nsk,
+					   rcu_access_pointer(array->ptrs[index]),
+					   rcu_access_pointer(nsk->sk_reuseport_cb),
+					   map_flags);
+	if (err)
+		goto put_file;
+
+	spin_lock_bh(&reuseport_lock);
+	/*
+	 * Some of the checks only need the reuseport_lock,
+	 * but they are also done under sk_callback_lock
+	 * for simplicity.
+	 */
+	write_lock_bh(&nsk->sk_callback_lock);
+
+	osk = rcu_dereference_protected(array->ptrs[index],
+					lockdep_is_held(&reuseport_lock));
+	reuse = rcu_dereference_protected(nsk->sk_reuseport_cb,
+					  lockdep_is_held(&reuseport_lock));
+	err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags);
+	if (err)
+		goto put_file_unlock;
+
+	/* Ensure reuse->reuseport_id is set */
+	err = reuseport_get_id(reuse);
+	if (err < 0)
+		goto put_file_unlock;
+
+	WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]);
+	rcu_assign_pointer(array->ptrs[index], nsk);
+	free_osk = osk;
+	err = 0;
+
+put_file_unlock:
+	write_unlock_bh(&nsk->sk_callback_lock);
+
+	if (free_osk) {
+		write_lock_bh(&free_osk->sk_callback_lock);
+		WRITE_ONCE(free_osk->sk_user_data, NULL);
+		write_unlock_bh(&free_osk->sk_callback_lock);
+	}
+
+	spin_unlock_bh(&reuseport_lock);
+put_file:
+	fput(socket->file);
+	return err;
+}
+
+/* Called from syscall */
+static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
+					void *next_key)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = (u32 *)next_key;
+
+	if (index >= array->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == array->map.max_entries - 1)
+		return -ENOENT;
+
+	*next = index + 1;
+	return 0;
+}
+
+const struct bpf_map_ops reuseport_array_ops = {
+	.map_alloc_check = reuseport_array_alloc_check,
+	.map_alloc = reuseport_array_alloc,
+	.map_free = reuseport_array_free,
+	.map_lookup_elem = reuseport_array_lookup_elem,
+	.map_get_next_key = reuseport_array_get_next_key,
+	.map_delete_elem = reuseport_array_delete_elem,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5af4e9e2722dbf8b0e745f0b321d8d05ac6bbf17..57f4d076141bbcfc115a9cfb1ccc51eb379d7976 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -684,6 +684,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_fd_array_map_lookup_elem(map, key, value);
 	} else if (IS_FD_HASH(map)) {
 		err = bpf_fd_htab_map_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
 	} else {
 		rcu_read_lock();
 		ptr = map->ops->map_lookup_elem(map, key);
@@ -790,6 +792,10 @@ static int map_update_elem(union bpf_attr *attr)
 		err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
 						  attr->flags);
 		rcu_read_unlock();
+	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+		/* rcu_read_lock() is not needed */
+		err = bpf_fd_reuseport_array_update_elem(map, key, value,
+							 attr->flags);
 	} else {
 		rcu_read_lock();
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index cf2e4d305af9a0ae1a5c31c067c72d9ba6987b6a..8235f243981671de07de6cea4453057f52d5919e 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -186,6 +186,14 @@ void reuseport_detach_sock(struct sock *sk)
 	spin_lock_bh(&reuseport_lock);
 	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
 					  lockdep_is_held(&reuseport_lock));
+
+	/* At least one of the sk in this reuseport group is added to
+	 * a bpf map. Notify the bpf side. The bpf map logic will
+	 * remove the sk if it is indeed added to a bpf map.
+	 */
+	if (reuse->reuseport_id)
+		bpf_sk_reuseport_detach(sk);
+
 	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
 
 	for (i = 0; i < reuse->num_socks; i++) {
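
For reference, a minimal userspace sketch (not part of the patch) of how the new map type can be exercised through the bpf(2) syscall: create a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY with value_size == sizeof(__u64) and install the fd of a bound SO_REUSEPORT socket into one slot. The sys_bpf() and install_reuseport_sk() helpers are illustrative names, not kernel or libbpf APIs, and the constraints noted in the comments follow the checks in reuseport_array_update_check() above.

/* Hypothetical example only; sys_bpf() and install_reuseport_sk() are
 * illustrative helpers, not part of this patch.
 */
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

/* Put the SO_REUSEPORT socket behind "sock_fd" into slot "index" of a new
 * reuseport sockarray.  Map creation needs CAP_SYS_ADMIN, and the socket
 * must already be bound/listening with SO_REUSEPORT set.
 * Returns the map fd on success, a negative value on error.
 */
static int install_reuseport_sk(int sock_fd, __u32 index)
{
	union bpf_attr attr;
	__u64 value = sock_fd;	/* with value_size == sizeof(u64), updates take the fd */
	int map_fd, err;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_REUSEPORT_SOCKARRAY;
	attr.key_size = sizeof(__u32);
	attr.value_size = sizeof(__u64);	/* sizeof(__u32) is also accepted */
	attr.max_entries = 16;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr);
	if (map_fd < 0)
		return map_fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&index;
	attr.value = (__u64)(unsigned long)&value;
	attr.flags = BPF_NOEXIST;	/* fail if the slot is already taken */
	err = sys_bpf(BPF_MAP_UPDATE_ELEM, &attr);
	if (err) {
		close(map_fd);
		return err;
	}
	return map_fd;
}

Note that a syscall-side BPF_MAP_LOOKUP_ELEM on this map returns the socket cookie from sock_gen_cookie() rather than a kernel pointer, which is why bpf_fd_reuseport_array_lookup_elem() insists on value_size == sizeof(u64).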