diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83d1926c61e4567b881bfbc26b75b802c428cbc3..90ee6ab24bc53badebf5ab0f6362de1d4ad2062d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -151,6 +151,7 @@ struct bpf_array { union { char value[0] __aligned(8); void *ptrs[0] __aligned(8); + void __percpu *pptrs[0] __aligned(8); }; }; #define MAX_TAIL_CALL_CNT 32 @@ -182,6 +183,29 @@ int bpf_prog_new_fd(struct bpf_prog *prog); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname); +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, + u64 flags); +int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, + u64 flags); + +/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and + * forced to use 'long' read/writes to try to atomically copy long counters. + * Best-effort only. No barriers here, since it _will_ race with concurrent + * updates from BPF programs. Called from bpf syscall and mostly used with + * size 8 or 16 bytes, so ask compiler to inline it. + */ +static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) +{ + const long *lsrc = src; + long *ldst = dst; + + size /= sizeof(long); + while (size--) + *ldst++ = *lsrc++; +} + /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); #else diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa6f8571de136b74fba93996883bd69b3e28d412..2ee0fde1bf9649739e8663dc480742fdf0ede72f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -81,6 +81,8 @@ enum bpf_map_type { BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_PERF_EVENT_ARRAY, + BPF_MAP_TYPE_PERCPU_HASH, + BPF_MAP_TYPE_PERCPU_ARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 89ebbc4d1164fea26bdcb62561bd15838be6a2db..bd3bdf2486a7b1aa4744f2bd5045ff21e47812cd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -17,11 +17,39 @@ #include #include +static void bpf_array_free_percpu(struct bpf_array *array) +{ + int i; + + for (i = 0; i < array->map.max_entries; i++) + free_percpu(array->pptrs[i]); +} + +static int bpf_array_alloc_percpu(struct bpf_array *array) +{ + void __percpu *ptr; + int i; + + for (i = 0; i < array->map.max_entries; i++) { + ptr = __alloc_percpu_gfp(array->elem_size, 8, + GFP_USER | __GFP_NOWARN); + if (!ptr) { + bpf_array_free_percpu(array); + return -ENOMEM; + } + array->pptrs[i] = ptr; + } + + return 0; +} + /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; struct bpf_array *array; - u32 elem_size, array_size; + u64 array_size; + u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -36,12 +64,16 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); - /* check round_up into zero and u32 overflow */ - if (elem_size == 0 || - attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size) + array_size = sizeof(*array); + if (percpu) + array_size += (u64) attr->max_entries * sizeof(void *); + else + array_size += (u64) attr->max_entries * elem_size; + + /* make sure there is no u32 overflow later in round_up() */ + if (array_size >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); - array_size = sizeof(*array) + attr->max_entries * elem_size; /* allocate all map elements and zero-initialize them */ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); @@ -52,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } /* copy mandatory map attributes */ + array->map.map_type = attr->map_type; array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; - array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; array->elem_size = elem_size; + if (!percpu) + goto out; + + array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); + + if (array_size >= U32_MAX - PAGE_SIZE || + elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { + kvfree(array); + return ERR_PTR(-ENOMEM); + } +out: + array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; + return &array->map; } @@ -67,12 +112,50 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) return NULL; return array->value + array->elem_size * index; } +/* Called from eBPF program */ +static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return this_cpu_ptr(array->pptrs[index]); +} + +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(index >= array->map.max_entries)) + return -ENOENT; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); + off += size; + } + rcu_read_unlock(); + return 0; +} + /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -99,19 +182,62 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (map_flags == BPF_NOEXIST) + if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - memcpy(array->value + array->elem_size * index, value, map->value_size); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + memcpy(this_cpu_ptr(array->pptrs[index]), + value, map->value_size); + else + memcpy(array->value + array->elem_size * index, + value, map->value_size); + return 0; +} + +int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + if (unlikely(index >= array->map.max_entries)) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (unlikely(map_flags == BPF_NOEXIST)) + /* all elements already exist */ + return -EEXIST; + + /* the user space will provide round_up(value_size, 8) bytes that + * will be copied into per-cpu area. bpf programs can only access + * value_size of it. During lookup the same extra bytes will be + * returned or zeros which were zero-filled by percpu_alloc, + * so no kernel data leaks possible + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); + off += size; + } + rcu_read_unlock(); return 0; } @@ -133,6 +259,9 @@ static void array_map_free(struct bpf_map *map) */ synchronize_rcu(); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + bpf_array_free_percpu(array); + kvfree(array); } @@ -150,9 +279,24 @@ static struct bpf_map_type_list array_type __read_mostly = { .type = BPF_MAP_TYPE_ARRAY, }; +static const struct bpf_map_ops percpu_array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = percpu_array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list percpu_array_type __read_mostly = { + .ops = &percpu_array_ops, + .type = BPF_MAP_TYPE_PERCPU_ARRAY, +}; + static int __init register_array_map(void) { bpf_register_map_type(&array_type); + bpf_register_map_type(&percpu_array_type); return 0; } late_initcall(register_array_map); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c5b30fd8a3151f99b8cf175ad2a7ce563a0e13c8..fd5db8fe9360db2134339d3e3f4ec47032d42772 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -31,21 +31,27 @@ struct bpf_htab { struct htab_elem { struct hlist_node hash_node; struct rcu_head rcu; - u32 hash; + union { + u32 hash; + u32 key_size; + }; char key[0] __aligned(8); }; /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct bpf_htab *htab; int err, i; + u64 cost; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); /* mandatory map attributes */ + htab->map.map_type = attr->map_type; htab->map.key_size = attr->key_size; htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; @@ -77,24 +83,34 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; + if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE) + /* make sure the size for pcpu_alloc() is reasonable */ + goto free_htab; + htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8) + - htab->map.value_size; + round_up(htab->map.key_size, 8); + if (percpu) + htab->elem_size += sizeof(void *); + else + htab->elem_size += htab->map.value_size; /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - if ((u64) htab->n_buckets * sizeof(struct bucket) + - (u64) htab->elem_size * htab->map.max_entries >= - U32_MAX - PAGE_SIZE) + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (percpu) + cost += (u64) round_up(htab->map.value_size, 8) * + num_possible_cpus() * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) + - htab->elem_size * htab->map.max_entries, - PAGE_SIZE) >> PAGE_SHIFT; + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; err = -ENOMEM; htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), @@ -148,7 +164,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, } /* Called from syscall or from eBPF program */ -static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; @@ -166,6 +182,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); + return l; +} + +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + if (l) return l->key + round_up(map->key_size, 8); @@ -230,65 +253,149 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return -ENOENT; } + +static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, + void __percpu *pptr) +{ + *(void __percpu **)(l->key + key_size) = pptr; +} + +static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) +{ + return *(void __percpu **)(l->key + key_size); +} + +static void htab_percpu_elem_free(struct htab_elem *l) +{ + free_percpu(htab_elem_get_ptr(l, l->key_size)); + kfree(l); +} + +static void htab_percpu_elem_free_rcu(struct rcu_head *head) +{ + struct htab_elem *l = container_of(head, struct htab_elem, rcu); + + htab_percpu_elem_free(l); +} + +static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size) +{ + if (percpu) { + l->key_size = key_size; + call_rcu(&l->rcu, htab_percpu_elem_free_rcu); + } else { + kfree_rcu(l, rcu); + } +} + +static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, + void *value, u32 key_size, u32 hash, + bool percpu, bool onallcpus) +{ + u32 size = htab->map.value_size; + struct htab_elem *l_new; + void __percpu *pptr; + + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return NULL; + + memcpy(l_new->key, key, key_size); + if (percpu) { + /* round up value_size to 8 bytes */ + size = round_up(size, 8); + + /* alloc_percpu zero-fills */ + pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN); + if (!pptr) { + kfree(l_new); + return NULL; + } + + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } + htab_elem_set_ptr(l_new, key_size, pptr); + } else { + memcpy(l_new->key + round_up(key_size, 8), value, size); + } + + l_new->hash = hash; + return l_new; +} + +static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, + u64 map_flags) +{ + if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries)) + /* if elem with this 'key' doesn't exist and we've reached + * max_entries limit, fail insertion of new elem + */ + return -E2BIG; + + if (l_old && map_flags == BPF_NOEXIST) + /* elem already exists */ + return -EEXIST; + + if (!l_old && map_flags == BPF_EXIST) + /* elem doesn't exist, cannot update it */ + return -ENOENT; + + return 0; +} + /* Called from syscall or from eBPF program */ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new, *l_old; + struct htab_elem *l_new = NULL, *l_old; struct hlist_head *head; - struct bucket *b; unsigned long flags; - u32 key_size; + struct bucket *b; + u32 key_size, hash; int ret; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held()); - /* allocate new element outside of lock */ - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return -ENOMEM; - key_size = map->key_size; - memcpy(l_new->key, key, key_size); - memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + hash = htab_map_hash(key, key_size); + + /* allocate new element outside of the lock, since + * we're most likley going to insert it + */ + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); + if (!l_new) + return -ENOMEM; - l_new->hash = htab_map_hash(l_new->key, key_size); - b = __select_bucket(htab, l_new->hash); + b = __select_bucket(htab, hash); head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ raw_spin_lock_irqsave(&b->lock, flags); - l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + l_old = lookup_elem_raw(head, hash, key, key_size); - if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) { - /* if elem with this 'key' doesn't exist and we've reached - * max_entries limit, fail insertion of new elem - */ - ret = -E2BIG; + ret = check_flags(htab, l_old, map_flags); + if (ret) goto err; - } - if (l_old && map_flags == BPF_NOEXIST) { - /* elem already exists */ - ret = -EEXIST; - goto err; - } - - if (!l_old && map_flags == BPF_EXIST) { - /* elem doesn't exist, cannot update it */ - ret = -ENOENT; - goto err; - } - - /* add new element to the head of the list, so that concurrent - * search will find it before old elem + /* add new element to the head of the list, so that + * concurrent search will find it before old elem */ hlist_add_head_rcu(&l_new->hash_node, head); if (l_old) { @@ -298,7 +405,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, atomic_inc(&htab->count); } raw_spin_unlock_irqrestore(&b->lock, flags); - return 0; err: raw_spin_unlock_irqrestore(&b->lock, flags); @@ -306,10 +412,84 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, return ret; } +static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); + u32 size = htab->map.value_size; + + /* per-cpu hash map can update value in-place */ + if (!onallcpus) { + memcpy(this_cpu_ptr(pptr), value, size); + } else { + int off = 0, cpu; + + size = round_up(size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } + } else { + l_new = alloc_htab_elem(htab, key, value, key_size, + hash, true, onallcpus); + if (!l_new) { + ret = -ENOMEM; + goto err; + } + hlist_add_head_rcu(&l_new->hash_node, head); + atomic_inc(&htab->count); + } + ret = 0; +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + return ret; +} + +static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_percpu_map_update_elem(map, key, value, map_flags, false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct hlist_head *head; struct bucket *b; struct htab_elem *l; @@ -332,7 +512,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) if (l) { hlist_del_rcu(&l->hash_node); atomic_dec(&htab->count); - kfree_rcu(l, rcu); + free_htab_elem(l, percpu, key_size); ret = 0; } @@ -352,7 +532,12 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); atomic_dec(&htab->count); - kfree(l); + if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) { + l->key_size = htab->map.key_size; + htab_percpu_elem_free(l); + } else { + kfree(l); + } } } } @@ -391,9 +576,70 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +/* Called from eBPF program */ +static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + else + return NULL; +} + +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +{ + struct htab_elem *l; + void __percpu *pptr; + int ret = -ENOENT; + int cpu, off = 0; + u32 size; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + l = __htab_map_lookup_elem(map, key); + if (!l) + goto out; + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } + ret = 0; +out: + rcu_read_unlock(); + return ret; +} + +int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __htab_percpu_map_update_elem(map, key, value, map_flags, true); +} + +static const struct bpf_map_ops htab_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_update_elem = htab_percpu_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list htab_percpu_type __read_mostly = { + .ops = &htab_percpu_ops, + .type = BPF_MAP_TYPE_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); + bpf_register_map_type(&htab_percpu_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 637397059f763564b535cfda2e4eaa9bf1d34fad..c95a753c2007966a752c2c2a5eda00e6a0a39072 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -239,6 +239,7 @@ static int map_lookup_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; + u32 value_size; struct fd f; int err; @@ -259,23 +260,35 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; - rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); - if (ptr) - memcpy(value, ptr, map->value_size); - rcu_read_unlock(); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_copy(map, key, value); + } else { + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, value_size); + rcu_read_unlock(); + err = ptr ? 0 : -ENOENT; + } - err = -ENOENT; - if (!ptr) + if (err) goto free_value; err = -EFAULT; - if (copy_to_user(uvalue, value, map->value_size) != 0) + if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; err = 0; @@ -298,6 +311,7 @@ static int map_update_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; + u32 value_size; struct fd f; int err; @@ -318,21 +332,30 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; err = -EFAULT; - if (copy_from_user(value, uvalue, map->value_size) != 0) + if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; - /* eBPF program that use maps are running under rcu_read_lock(), - * therefore all map accessors rely on this fact, so do the same here - */ - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, attr->flags); - rcu_read_unlock(); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_update(map, key, value, attr->flags); + } else { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, attr->flags); + rcu_read_unlock(); + } free_value: kfree(value); diff --git a/samples/bpf/test_maps.c b/samples/bpf/test_maps.c index 6299ee95cd11b63112ae5b7875872cb72ca91208..ad466ed3309307c79a7d393359f74a6c558eef1b 100644 --- a/samples/bpf/test_maps.c +++ b/samples/bpf/test_maps.c @@ -89,6 +89,100 @@ static void test_hashmap_sanity(int i, void *data) close(map_fd); } +/* sanity tests for percpu map API */ +static void test_percpu_hashmap_sanity(int task, void *data) +{ + long long key, next_key; + int expected_key_mask = 0; + unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + long long value[nr_cpus]; + int map_fd, i; + + map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_HASH, sizeof(key), + sizeof(value[0]), 2); + if (map_fd < 0) { + printf("failed to create hashmap '%s'\n", strerror(errno)); + exit(1); + } + + for (i = 0; i < nr_cpus; i++) + value[i] = i + 100; + key = 1; + /* insert key=1 element */ + assert(!(expected_key_mask & key)); + assert(bpf_update_elem(map_fd, &key, value, BPF_ANY) == 0); + expected_key_mask |= key; + + /* BPF_NOEXIST means: add new element if it doesn't exist */ + assert(bpf_update_elem(map_fd, &key, value, BPF_NOEXIST) == -1 && + /* key=1 already exists */ + errno == EEXIST); + + /* -1 is an invalid flag */ + assert(bpf_update_elem(map_fd, &key, value, -1) == -1 && + errno == EINVAL); + + /* check that key=1 can be found. value could be 0 if the lookup + * was run from a different cpu. + */ + value[0] = 1; + assert(bpf_lookup_elem(map_fd, &key, value) == 0 && value[0] == 100); + + key = 2; + /* check that key=2 is not found */ + assert(bpf_lookup_elem(map_fd, &key, value) == -1 && errno == ENOENT); + + /* BPF_EXIST means: update existing element */ + assert(bpf_update_elem(map_fd, &key, value, BPF_EXIST) == -1 && + /* key=2 is not there */ + errno == ENOENT); + + /* insert key=2 element */ + assert(!(expected_key_mask & key)); + assert(bpf_update_elem(map_fd, &key, value, BPF_NOEXIST) == 0); + expected_key_mask |= key; + + /* key=1 and key=2 were inserted, check that key=0 cannot be inserted + * due to max_entries limit + */ + key = 0; + assert(bpf_update_elem(map_fd, &key, value, BPF_NOEXIST) == -1 && + errno == E2BIG); + + /* check that key = 0 doesn't exist */ + assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT); + + /* iterate over two elements */ + while (!bpf_get_next_key(map_fd, &key, &next_key)) { + assert((expected_key_mask & next_key) == next_key); + expected_key_mask &= ~next_key; + + assert(bpf_lookup_elem(map_fd, &next_key, value) == 0); + for (i = 0; i < nr_cpus; i++) + assert(value[i] == i + 100); + + key = next_key; + } + assert(errno == ENOENT); + + /* Update with BPF_EXIST */ + key = 1; + assert(bpf_update_elem(map_fd, &key, value, BPF_EXIST) == 0); + + /* delete both elements */ + key = 1; + assert(bpf_delete_elem(map_fd, &key) == 0); + key = 2; + assert(bpf_delete_elem(map_fd, &key) == 0); + assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT); + + key = 0; + /* check that map is empty */ + assert(bpf_get_next_key(map_fd, &key, &next_key) == -1 && + errno == ENOENT); + close(map_fd); +} + static void test_arraymap_sanity(int i, void *data) { int key, next_key, map_fd; @@ -142,6 +236,94 @@ static void test_arraymap_sanity(int i, void *data) close(map_fd); } +static void test_percpu_arraymap_many_keys(void) +{ + unsigned nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + unsigned nr_keys = 20000; + long values[nr_cpus]; + int key, map_fd, i; + + map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key), + sizeof(values[0]), nr_keys); + if (map_fd < 0) { + printf("failed to create per-cpu arraymap '%s'\n", + strerror(errno)); + exit(1); + } + + for (i = 0; i < nr_cpus; i++) + values[i] = i + 10; + + for (key = 0; key < nr_keys; key++) + assert(bpf_update_elem(map_fd, &key, values, BPF_ANY) == 0); + + for (key = 0; key < nr_keys; key++) { + for (i = 0; i < nr_cpus; i++) + values[i] = 0; + assert(bpf_lookup_elem(map_fd, &key, values) == 0); + for (i = 0; i < nr_cpus; i++) + assert(values[i] == i + 10); + } + + close(map_fd); +} + +static void test_percpu_arraymap_sanity(int i, void *data) +{ + unsigned nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + long values[nr_cpus]; + int key, next_key, map_fd; + + map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key), + sizeof(values[0]), 2); + if (map_fd < 0) { + printf("failed to create arraymap '%s'\n", strerror(errno)); + exit(1); + } + + for (i = 0; i < nr_cpus; i++) + values[i] = i + 100; + + key = 1; + /* insert key=1 element */ + assert(bpf_update_elem(map_fd, &key, values, BPF_ANY) == 0); + + values[0] = 0; + assert(bpf_update_elem(map_fd, &key, values, BPF_NOEXIST) == -1 && + errno == EEXIST); + + /* check that key=1 can be found */ + assert(bpf_lookup_elem(map_fd, &key, values) == 0 && values[0] == 100); + + key = 0; + /* check that key=0 is also found and zero initialized */ + assert(bpf_lookup_elem(map_fd, &key, values) == 0 && + values[0] == 0 && values[nr_cpus - 1] == 0); + + + /* check that key=2 cannot be inserted due to max_entries limit */ + key = 2; + assert(bpf_update_elem(map_fd, &key, values, BPF_EXIST) == -1 && + errno == E2BIG); + + /* check that key = 2 doesn't exist */ + assert(bpf_lookup_elem(map_fd, &key, values) == -1 && errno == ENOENT); + + /* iterate over two elements */ + assert(bpf_get_next_key(map_fd, &key, &next_key) == 0 && + next_key == 0); + assert(bpf_get_next_key(map_fd, &next_key, &next_key) == 0 && + next_key == 1); + assert(bpf_get_next_key(map_fd, &next_key, &next_key) == -1 && + errno == ENOENT); + + /* delete shouldn't succeed */ + key = 1; + assert(bpf_delete_elem(map_fd, &key) == -1 && errno == EINVAL); + + close(map_fd); +} + #define MAP_SIZE (32 * 1024) static void test_map_large(void) { @@ -209,7 +391,9 @@ static void run_parallel(int tasks, void (*fn)(int i, void *data), void *data) static void test_map_stress(void) { run_parallel(100, test_hashmap_sanity, NULL); + run_parallel(100, test_percpu_hashmap_sanity, NULL); run_parallel(100, test_arraymap_sanity, NULL); + run_parallel(100, test_percpu_arraymap_sanity, NULL); } #define TASKS 1024 @@ -282,7 +466,11 @@ static void test_map_parallel(void) int main(void) { test_hashmap_sanity(0, NULL); + test_percpu_hashmap_sanity(0, NULL); test_arraymap_sanity(0, NULL); + test_percpu_arraymap_sanity(0, NULL); + test_percpu_arraymap_many_keys(); + test_map_large(); test_map_parallel(); test_map_stress(); diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c index b32367cfbff4aff3020bb9c36c8faf6c981dd0f8..09c1adc27d426ed4adec7408a6fbae9193c39bb0 100644 --- a/samples/bpf/tracex2_kern.c +++ b/samples/bpf/tracex2_kern.c @@ -70,7 +70,7 @@ struct hist_key { }; struct bpf_map_def SEC("maps") my_hist_map = { - .type = BPF_MAP_TYPE_HASH, + .type = BPF_MAP_TYPE_PERCPU_HASH, .key_size = sizeof(struct hist_key), .value_size = sizeof(long), .max_entries = 1024, diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index cd0241c1447a5fb37d0e9e1633f4a47a457f344f..ab5b19e68acf0c3d53916ce6324f57777cf3acbd 100644 --- a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -37,6 +37,8 @@ struct hist_key { static void print_hist_for_pid(int fd, void *task) { struct hist_key key = {}, next_key; + unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + long values[nr_cpus]; char starstr[MAX_STARS]; long value; long data[MAX_INDEX] = {}; @@ -49,7 +51,10 @@ static void print_hist_for_pid(int fd, void *task) key = next_key; continue; } - bpf_lookup_elem(fd, &next_key, &value); + bpf_lookup_elem(fd, &next_key, values); + value = 0; + for (i = 0; i < nr_cpus; i++) + value += values[i]; ind = next_key.index; data[ind] = value; if (value && ind > max_ind) diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c index bf337fbb09472cbe32bfbaff2d4313b7cafb58c6..9974c3d7c18b90f5850e87822a158c90e2645b8d 100644 --- a/samples/bpf/tracex3_kern.c +++ b/samples/bpf/tracex3_kern.c @@ -20,7 +20,7 @@ struct bpf_map_def SEC("maps") my_map = { /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe * example will no longer be meaningful */ -SEC("kprobe/blk_mq_start_request") +SEC("kprobe/blk_start_request") int bpf_prog1(struct pt_regs *ctx) { long rq = PT_REGS_PARM1(ctx); @@ -42,13 +42,13 @@ static unsigned int log2l(unsigned long long n) #define SLOTS 100 struct bpf_map_def SEC("maps") lat_map = { - .type = BPF_MAP_TYPE_ARRAY, + .type = BPF_MAP_TYPE_PERCPU_ARRAY, .key_size = sizeof(u32), .value_size = sizeof(u64), .max_entries = SLOTS, }; -SEC("kprobe/blk_update_request") +SEC("kprobe/blk_account_io_completion") int bpf_prog2(struct pt_regs *ctx) { long rq = PT_REGS_PARM1(ctx); @@ -81,7 +81,7 @@ int bpf_prog2(struct pt_regs *ctx) value = bpf_map_lookup_elem(&lat_map, &index); if (value) - __sync_fetch_and_add((long *)value, 1); + *value += 1; return 0; } diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index 0aaa933ab93818df1634ba170e125e65dc6031d5..48716f7f0d8b9eae647a76ba93b07fe76a79f7bd 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -20,11 +20,13 @@ static void clear_stats(int fd) { + unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + __u64 values[nr_cpus]; __u32 key; - __u64 value = 0; + memset(values, 0, sizeof(values)); for (key = 0; key < SLOTS; key++) - bpf_update_elem(fd, &key, &value, BPF_ANY); + bpf_update_elem(fd, &key, values, BPF_ANY); } const char *color[] = { @@ -75,15 +77,20 @@ static void print_banner(void) static void print_hist(int fd) { - __u32 key; - __u64 value; - __u64 cnt[SLOTS]; - __u64 max_cnt = 0; + unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); __u64 total_events = 0; + long values[nr_cpus]; + __u64 max_cnt = 0; + __u64 cnt[SLOTS]; + __u64 value; + __u32 key; + int i; for (key = 0; key < SLOTS; key++) { + bpf_lookup_elem(fd, &key, values); value = 0; - bpf_lookup_elem(fd, &key, &value); + for (i = 0; i < nr_cpus; i++) + value += values[i]; cnt[key] = value; total_events += value; if (value > max_cnt)