提交 824bd0ce 编写于 作者: A Alexei Starovoitov 提交者: David S. Miller

bpf: introduce BPF_MAP_TYPE_PERCPU_HASH map

Introduce BPF_MAP_TYPE_PERCPU_HASH map type which is used to do
accurate counters without need to use BPF_XADD instruction which turned
out to be too costly for high-performance network monitoring.
In the typical use case the 'key' is the flow tuple or other long
living object that sees a lot of events per second.

bpf_map_lookup_elem() returns per-cpu area.
Example:
struct {
  u32 packets;
  u32 bytes;
} * ptr = bpf_map_lookup_elem(&map, &key);
/* ptr points to this_cpu area of the value, so the following
 * increments will not collide with other cpus
 */
ptr->packets ++;
ptr->bytes += skb->len;

bpf_update_elem() atomically creates a new element where all per-cpu
values are zero initialized and this_cpu value is populated with
given 'value'.
Note that non-per-cpu hash map always allocates new element
and then deletes old after rcu grace period to maintain atomicity
of update. Per-cpu hash map updates element values in-place.
Signed-off-by: NAlexei Starovoitov <ast@kernel.org>
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
上级 ba905f5e
master alk-4.19.24 alk-4.19.30 alk-4.19.34 alk-4.19.36 alk-4.19.43 alk-4.19.48 alk-4.19.57 ck-4.19.67 ck-4.19.81 ck-4.19.91 github/fork/deepanshu1422/fix-typo-in-comment github/fork/haosdent/fix-typo linux-next v4.19.91 v4.19.90 v4.19.89 v4.19.88 v4.19.87 v4.19.86 v4.19.85 v4.19.84 v4.19.83 v4.19.82 v4.19.81 v4.19.80 v4.19.79 v4.19.78 v4.19.77 v4.19.76 v4.19.75 v4.19.74 v4.19.73 v4.19.72 v4.19.71 v4.19.70 v4.19.69 v4.19.68 v4.19.67 v4.19.66 v4.19.65 v4.19.64 v4.19.63 v4.19.62 v4.19.61 v4.19.60 v4.19.59 v4.19.58 v4.19.57 v4.19.56 v4.19.55 v4.19.54 v4.19.53 v4.19.52 v4.19.51 v4.19.50 v4.19.49 v4.19.48 v4.19.47 v4.19.46 v4.19.45 v4.19.44 v4.19.43 v4.19.42 v4.19.41 v4.19.40 v4.19.39 v4.19.38 v4.19.37 v4.19.36 v4.19.35 v4.19.34 v4.19.33 v4.19.32 v4.19.31 v4.19.30 v4.19.29 v4.19.28 v4.19.27 v4.19.26 v4.19.25 v4.19.24 v4.19.23 v4.19.22 v4.19.21 v4.19.20 v4.19.19 v4.19.18 v4.19.17 v4.19.16 v4.19.15 v4.19.14 v4.19.13 v4.19.12 v4.19.11 v4.19.10 v4.19.9 v4.19.8 v4.19.7 v4.19.6 v4.19.5 v4.19.4 v4.19.3 v4.19.2 v4.19.1 v4.19 v4.19-rc8 v4.19-rc7 v4.19-rc6 v4.19-rc5 v4.19-rc4 v4.19-rc3 v4.19-rc2 v4.19-rc1 ck-release-21 ck-release-20 ck-release-19.2 ck-release-19.1 ck-release-19 ck-release-18 ck-release-17.2 ck-release-17.1 ck-release-17 ck-release-16 ck-release-15.1 ck-release-15 ck-release-14 ck-release-13.2 ck-release-13 ck-release-12 ck-release-11 ck-release-10 ck-release-9 ck-release-7 alk-release-15 alk-release-14 alk-release-13.2 alk-release-13 alk-release-12 alk-release-11 alk-release-10 alk-release-9 alk-release-7
无相关合并请求
......@@ -81,6 +81,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
BPF_MAP_TYPE_PERF_EVENT_ARRAY,
BPF_MAP_TYPE_PERCPU_HASH,
};
enum bpf_prog_type {
......
......@@ -31,21 +31,27 @@ struct bpf_htab {
struct htab_elem {
struct hlist_node hash_node;
struct rcu_head rcu;
u32 hash;
union {
u32 hash;
u32 key_size;
};
char key[0] __aligned(8);
};
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
struct bpf_htab *htab;
int err, i;
u64 cost;
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
/* mandatory map attributes */
htab->map.map_type = attr->map_type;
htab->map.key_size = attr->key_size;
htab->map.value_size = attr->value_size;
htab->map.max_entries = attr->max_entries;
......@@ -77,24 +83,34 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
goto free_htab;
if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
/* make sure the size for pcpu_alloc() is reasonable */
goto free_htab;
htab->elem_size = sizeof(struct htab_elem) +
round_up(htab->map.key_size, 8) +
htab->map.value_size;
round_up(htab->map.key_size, 8);
if (percpu)
htab->elem_size += sizeof(void *);
else
htab->elem_size += htab->map.value_size;
/* prevent zero size kmalloc and check for u32 overflow */
if (htab->n_buckets == 0 ||
htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
if ((u64) htab->n_buckets * sizeof(struct bucket) +
(u64) htab->elem_size * htab->map.max_entries >=
U32_MAX - PAGE_SIZE)
cost = (u64) htab->n_buckets * sizeof(struct bucket) +
(u64) htab->elem_size * htab->map.max_entries;
if (percpu)
cost += (u64) round_up(htab->map.value_size, 8) *
num_possible_cpus() * htab->map.max_entries;
if (cost >= U32_MAX - PAGE_SIZE)
/* make sure page count doesn't overflow */
goto free_htab;
htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
htab->elem_size * htab->map.max_entries,
PAGE_SIZE) >> PAGE_SHIFT;
htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
err = -ENOMEM;
htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
......@@ -148,7 +164,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
}
/* Called from syscall or from eBPF program */
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_head *head;
......@@ -166,6 +182,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
l = lookup_elem_raw(head, hash, key, key_size);
return l;
}
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
struct htab_elem *l = __htab_map_lookup_elem(map, key);
if (l)
return l->key + round_up(map->key_size, 8);
......@@ -230,65 +253,139 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return -ENOENT;
}
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
*(void __percpu **)(l->key + key_size) = pptr;
}
static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
return *(void __percpu **)(l->key + key_size);
}
static void htab_percpu_elem_free(struct htab_elem *l)
{
free_percpu(htab_elem_get_ptr(l, l->key_size));
kfree(l);
}
static void htab_percpu_elem_free_rcu(struct rcu_head *head)
{
struct htab_elem *l = container_of(head, struct htab_elem, rcu);
htab_percpu_elem_free(l);
}
static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
{
if (percpu) {
l->key_size = key_size;
call_rcu(&l->rcu, htab_percpu_elem_free_rcu);
} else {
kfree_rcu(l, rcu);
}
}
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu)
{
u32 size = htab->map.value_size;
struct htab_elem *l_new;
void __percpu *pptr;
l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
if (!l_new)
return NULL;
memcpy(l_new->key, key, key_size);
if (percpu) {
/* round up value_size to 8 bytes */
size = round_up(size, 8);
/* alloc_percpu zero-fills */
pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN);
if (!pptr) {
kfree(l_new);
return NULL;
}
/* copy true value_size bytes */
memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
htab_elem_set_ptr(l_new, key_size, pptr);
} else {
memcpy(l_new->key + round_up(key_size, 8), value, size);
}
l_new->hash = hash;
return l_new;
}
static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
u64 map_flags)
{
if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries))
/* if elem with this 'key' doesn't exist and we've reached
* max_entries limit, fail insertion of new elem
*/
return -E2BIG;
if (l_old && map_flags == BPF_NOEXIST)
/* elem already exists */
return -EEXIST;
if (!l_old && map_flags == BPF_EXIST)
/* elem doesn't exist, cannot update it */
return -ENOENT;
return 0;
}
/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old;
struct htab_elem *l_new = NULL, *l_old;
struct hlist_head *head;
struct bucket *b;
unsigned long flags;
u32 key_size;
struct bucket *b;
u32 key_size, hash;
int ret;
if (map_flags > BPF_EXIST)
if (unlikely(map_flags > BPF_EXIST))
/* unknown flags */
return -EINVAL;
WARN_ON_ONCE(!rcu_read_lock_held());
/* allocate new element outside of lock */
l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
if (!l_new)
return -ENOMEM;
key_size = map->key_size;
memcpy(l_new->key, key, key_size);
memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
hash = htab_map_hash(key, key_size);
/* allocate new element outside of the lock, since
* we're most likley going to insert it
*/
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false);
if (!l_new)
return -ENOMEM;
l_new->hash = htab_map_hash(l_new->key, key_size);
b = __select_bucket(htab, l_new->hash);
b = __select_bucket(htab, hash);
head = &b->head;
/* bpf_map_update_elem() can be called in_irq() */
raw_spin_lock_irqsave(&b->lock, flags);
l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
l_old = lookup_elem_raw(head, hash, key, key_size);
if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
/* if elem with this 'key' doesn't exist and we've reached
* max_entries limit, fail insertion of new elem
*/
ret = -E2BIG;
ret = check_flags(htab, l_old, map_flags);
if (ret)
goto err;
}
if (l_old && map_flags == BPF_NOEXIST) {
/* elem already exists */
ret = -EEXIST;
goto err;
}
if (!l_old && map_flags == BPF_EXIST) {
/* elem doesn't exist, cannot update it */
ret = -ENOENT;
goto err;
}
/* add new element to the head of the list, so that concurrent
* search will find it before old elem
/* add new element to the head of the list, so that
* concurrent search will find it before old elem
*/
hlist_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
......@@ -298,7 +395,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
atomic_inc(&htab->count);
}
raw_spin_unlock_irqrestore(&b->lock, flags);
return 0;
err:
raw_spin_unlock_irqrestore(&b->lock, flags);
......@@ -306,10 +402,64 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
return ret;
}
static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
struct hlist_head *head;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
int ret;
if (unlikely(map_flags > BPF_EXIST))
/* unknown flags */
return -EINVAL;
WARN_ON_ONCE(!rcu_read_lock_held());
key_size = map->key_size;
hash = htab_map_hash(key, key_size);
b = __select_bucket(htab, hash);
head = &b->head;
/* bpf_map_update_elem() can be called in_irq() */
raw_spin_lock_irqsave(&b->lock, flags);
l_old = lookup_elem_raw(head, hash, key, key_size);
ret = check_flags(htab, l_old, map_flags);
if (ret)
goto err;
if (l_old) {
/* per-cpu hash map can update value in-place */
memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)),
value, htab->map.value_size);
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
hash, true);
if (!l_new) {
ret = -ENOMEM;
goto err;
}
hlist_add_head_rcu(&l_new->hash_node, head);
atomic_inc(&htab->count);
}
ret = 0;
err:
raw_spin_unlock_irqrestore(&b->lock, flags);
return ret;
}
/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH;
struct hlist_head *head;
struct bucket *b;
struct htab_elem *l;
......@@ -332,7 +482,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
if (l) {
hlist_del_rcu(&l->hash_node);
atomic_dec(&htab->count);
kfree_rcu(l, rcu);
free_htab_elem(l, percpu, key_size);
ret = 0;
}
......@@ -352,7 +502,12 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_for_each_entry_safe(l, n, head, hash_node) {
hlist_del_rcu(&l->hash_node);
atomic_dec(&htab->count);
kfree(l);
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) {
l->key_size = htab->map.key_size;
htab_percpu_elem_free(l);
} else {
kfree(l);
}
}
}
}
......@@ -391,9 +546,35 @@ static struct bpf_map_type_list htab_type __read_mostly = {
.type = BPF_MAP_TYPE_HASH,
};
/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
struct htab_elem *l = __htab_map_lookup_elem(map, key);
if (l)
return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
else
return NULL;
}
static const struct bpf_map_ops htab_percpu_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_percpu_map_lookup_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
};
static struct bpf_map_type_list htab_percpu_type __read_mostly = {
.ops = &htab_percpu_ops,
.type = BPF_MAP_TYPE_PERCPU_HASH,
};
static int __init register_htab_map(void)
{
bpf_register_map_type(&htab_type);
bpf_register_map_type(&htab_percpu_type);
return 0;
}
late_initcall(register_htab_map);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册
反馈
建议
客服 返回
顶部