提交 035ff358 编写于 作者: J Jakub Sitnicki 提交者: Daniel Borkmann

net: Generate reuseport group ID on group creation

Commit 736b4602 ("net: Add ID (if needed) to sock_reuseport and expose
reuseport_lock") has introduced lazy generation of reuseport group IDs that
survive group resize.

By comparing the identifiers we check that a BPF reuseport program is not
trying to select a socket from a BPF map that belongs to a different
reuseport group than the one the packet is for.

Because SOCKARRAY used to be the only BPF map type that can be used with
reuseport BPF, it was possible to delay the generation of reuseport group
ID until a socket from the group was inserted into BPF map for the first
time.

Now that SOCK{MAP,HASH} can be used with reuseport BPF we have two options,
either generate the reuseport ID on map update, like SOCKARRAY does, or
allocate an ID from the start when reuseport group gets created.

This patch takes the latter approach to keep sockmap free of calls into
reuseport code. This streamlines the reuseport_id access as its lifetime
now matches the longevity of reuseport object.

The cost of this simplification, however, is that we allocate reuseport IDs
for all SO_REUSEPORT users, even those that don't use SOCKARRAY in their
setups. With the way identifiers are currently generated, we can have at
most S32_MAX reuseport groups, which hopefully is sufficient. If we ever
get close to the limit, we can switch to a u64 counter like sk_cookie.

Another change is that we now always call into SOCKARRAY logic to unlink
the socket from the map when unhashing or closing the socket. Previously we
did it only when at least one socket from the group was in a BPF map.

It is worth noting that this doesn't conflict with sockmap tear-down in
case a socket is in a SOCK{MAP,HASH} and belongs to a reuseport
group. sockmap tear-down happens first:

  prot->unhash
  `- tcp_bpf_unhash
     |- tcp_bpf_remove
     |  `- while (sk_psock_link_pop(psock))
     |     `- sk_psock_unlink
     |        `- sock_map_delete_from_link
     |           `- __sock_map_delete
     |              `- sock_map_unref
     |                 `- sk_psock_put
     |                    `- sk_psock_drop
     |                       `- rcu_assign_sk_user_data(sk, NULL)
     `- inet_unhash
        `- reuseport_detach_sock
           `- bpf_sk_reuseport_detach
              `- WRITE_ONCE(sk->sk_user_data, NULL)
Suggested-by: Martin Lau <kafai@fb.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20200218171023.844439-10-jakub@cloudflare.com
上级 9fed9000
...@@ -55,6 +55,4 @@ static inline bool reuseport_has_conns(struct sock *sk, bool set) ...@@ -55,6 +55,4 @@ static inline bool reuseport_has_conns(struct sock *sk, bool set)
return ret; return ret;
} }
int reuseport_get_id(struct sock_reuseport *reuse);
#endif /* _SOCK_REUSEPORT_H */ #endif /* _SOCK_REUSEPORT_H */
...@@ -305,11 +305,6 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, ...@@ -305,11 +305,6 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
if (err) if (err)
goto put_file_unlock; goto put_file_unlock;
/* Ensure reuse->reuseport_id is set */
err = reuseport_get_id(reuse);
if (err < 0)
goto put_file_unlock;
WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]);
rcu_assign_pointer(array->ptrs[index], nsk); rcu_assign_pointer(array->ptrs[index], nsk);
free_osk = osk; free_osk = osk;
......
...@@ -8641,18 +8641,8 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, ...@@ -8641,18 +8641,8 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
} }
if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
struct sock *sk; struct sock *sk = reuse_kern->sk;
if (unlikely(!reuse_kern->reuseport_id))
/* There is a small race between adding the
* sk to the map and setting the
* reuse_kern->reuseport_id.
* Treat it as the sk has not been added to
* the bpf map yet.
*/
return -ENOENT;
sk = reuse_kern->sk;
if (sk->sk_protocol != selected_sk->sk_protocol) if (sk->sk_protocol != selected_sk->sk_protocol)
return -EPROTOTYPE; return -EPROTOTYPE;
else if (sk->sk_family != selected_sk->sk_family) else if (sk->sk_family != selected_sk->sk_family)
......
...@@ -16,27 +16,8 @@ ...@@ -16,27 +16,8 @@
DEFINE_SPINLOCK(reuseport_lock); DEFINE_SPINLOCK(reuseport_lock);
#define REUSEPORT_MIN_ID 1
static DEFINE_IDA(reuseport_ida); static DEFINE_IDA(reuseport_ida);
int reuseport_get_id(struct sock_reuseport *reuse)
{
int id;
if (reuse->reuseport_id)
return reuse->reuseport_id;
id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0,
/* Called under reuseport_lock */
GFP_ATOMIC);
if (id < 0)
return id;
reuse->reuseport_id = id;
return reuse->reuseport_id;
}
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{ {
unsigned int size = sizeof(struct sock_reuseport) + unsigned int size = sizeof(struct sock_reuseport) +
...@@ -55,6 +36,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) ...@@ -55,6 +36,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
int reuseport_alloc(struct sock *sk, bool bind_inany) int reuseport_alloc(struct sock *sk, bool bind_inany)
{ {
struct sock_reuseport *reuse; struct sock_reuseport *reuse;
int id, ret = 0;
/* bh lock used since this function call may precede hlist lock in /* bh lock used since this function call may precede hlist lock in
* soft irq of receive path or setsockopt from process context * soft irq of receive path or setsockopt from process context
...@@ -78,10 +60,18 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) ...@@ -78,10 +60,18 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
reuse = __reuseport_alloc(INIT_SOCKS); reuse = __reuseport_alloc(INIT_SOCKS);
if (!reuse) { if (!reuse) {
spin_unlock_bh(&reuseport_lock); ret = -ENOMEM;
return -ENOMEM; goto out;
} }
id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
if (id < 0) {
kfree(reuse);
ret = id;
goto out;
}
reuse->reuseport_id = id;
reuse->socks[0] = sk; reuse->socks[0] = sk;
reuse->num_socks = 1; reuse->num_socks = 1;
reuse->bind_inany = bind_inany; reuse->bind_inany = bind_inany;
...@@ -90,7 +80,7 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) ...@@ -90,7 +80,7 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
out: out:
spin_unlock_bh(&reuseport_lock); spin_unlock_bh(&reuseport_lock);
return 0; return ret;
} }
EXPORT_SYMBOL(reuseport_alloc); EXPORT_SYMBOL(reuseport_alloc);
...@@ -134,8 +124,7 @@ static void reuseport_free_rcu(struct rcu_head *head) ...@@ -134,8 +124,7 @@ static void reuseport_free_rcu(struct rcu_head *head)
reuse = container_of(head, struct sock_reuseport, rcu); reuse = container_of(head, struct sock_reuseport, rcu);
sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
if (reuse->reuseport_id) ida_free(&reuseport_ida, reuse->reuseport_id);
ida_simple_remove(&reuseport_ida, reuse->reuseport_id);
kfree(reuse); kfree(reuse);
} }
...@@ -199,12 +188,15 @@ void reuseport_detach_sock(struct sock *sk) ...@@ -199,12 +188,15 @@ void reuseport_detach_sock(struct sock *sk)
reuse = rcu_dereference_protected(sk->sk_reuseport_cb, reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock)); lockdep_is_held(&reuseport_lock));
/* At least one of the sk in this reuseport group is added to /* Notify the bpf side. The sk may be added to a sockarray
* a bpf map. Notify the bpf side. The bpf map logic will * map. If so, sockarray logic will remove it from the map.
* remove the sk if it is indeed added to a bpf map. *
* Other bpf map types that work with reuseport, like sockmap,
* don't need an explicit callback from here. They override sk
* unhash/close ops to remove the sk from the map before we
* get to this point.
*/ */
if (reuse->reuseport_id) bpf_sk_reuseport_detach(sk);
bpf_sk_reuseport_detach(sk);
rcu_assign_pointer(sk->sk_reuseport_cb, NULL); rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册