diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e2f38e0091b6fd0045cb37930fdfcfdeae40d626..7d9b2832c280c82172cb428a8e6d793359891db0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -85,6 +85,8 @@ enum bpf_map_type { BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_STACK_TRACE, BPF_MAP_TYPE_CGROUP_ARRAY, + BPF_MAP_TYPE_LRU_HASH, + BPF_MAP_TYPE_LRU_PERCPU_HASH, }; enum bpf_prog_type { @@ -106,6 +108,13 @@ enum bpf_prog_type { #define BPF_EXIST 2 /* update existing element */ #define BPF_F_NO_PREALLOC (1U << 0) +/* Instead of having one common LRU list in the + * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list + * which can scale and perform better. + * Note, the LRU nodes (including free nodes) cannot be moved + * across different LRU lists. + */ +#define BPF_F_NO_COMMON_LRU (1U << 1) union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index eed911d091dacebc429393e80120721fc6a8f110..c4d89d6e2058481d149733d4767cf98bd99eeaa9 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c new file mode 100644 index 0000000000000000000000000000000000000000..bfebff010ba90c5e43e1e2fbbbc67292226ef90d --- /dev/null +++ b/kernel/bpf/bpf_lru_list.c @@ -0,0 +1,695 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
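For orientation, a minimal userspace sketch (not part of this patch; the helper name create_lru_map is made up here) of how the new map type and flag are selected at BPF_MAP_CREATE time through the raw bpf(2) syscall:

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Hypothetical helper: create an LRU hash map via the raw bpf(2) syscall. */
static int create_lru_map(unsigned int max_entries, unsigned int map_flags)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));	/* kernel rejects non-zero pad bytes */
	attr.map_type = BPF_MAP_TYPE_LRU_HASH;
	attr.key_size = sizeof(unsigned long long);
	attr.value_size = sizeof(unsigned long long);
	attr.max_entries = max_entries;
	attr.map_flags = map_flags;	/* 0 or BPF_F_NO_COMMON_LRU */

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

int main(void)
{
	int fd = create_lru_map(1024, BPF_F_NO_COMMON_LRU);

	if (fd < 0)
		perror("BPF_MAP_CREATE");
	else
		close(fd);
	return 0;
}

With map_flags = 0 the map uses the single common LRU list; passing BPF_F_NO_COMMON_LRU trades exact global recency ordering for per-CPU lists that avoid cross-CPU contention.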
+ */
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+#include "bpf_lru_list.h"
+
+#define LOCAL_FREE_TARGET	(128)
+#define LOCAL_NR_SCANS		LOCAL_FREE_TARGET
+
+#define PERCPU_FREE_TARGET	(16)
+#define PERCPU_NR_SCANS		PERCPU_FREE_TARGET
+
+/* Helpers to get the local list index */
+#define LOCAL_LIST_IDX(t)	((t) - BPF_LOCAL_LIST_T_OFFSET)
+#define LOCAL_FREE_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
+#define LOCAL_PENDING_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
+#define IS_LOCAL_LIST_TYPE(t)	((t) >= BPF_LOCAL_LIST_T_OFFSET)
+
+static int get_next_cpu(int cpu)
+{
+	cpu = cpumask_next(cpu, cpu_possible_mask);
+	if (cpu >= nr_cpu_ids)
+		cpu = cpumask_first(cpu_possible_mask);
+	return cpu;
+}
+
+/* Local list helpers */
+static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
+{
+	return &loc_l->lists[LOCAL_FREE_LIST_IDX];
+}
+
+static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
+{
+	return &loc_l->lists[LOCAL_PENDING_LIST_IDX];
+}
+
+/* bpf_lru_node helpers */
+static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
+{
+	return node->ref;
+}
+
+static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
+				   enum bpf_lru_list_type type)
+{
+	if (type < NR_BPF_LRU_LIST_COUNT)
+		l->counts[type]++;
+}
+
+static void bpf_lru_list_count_dec(struct bpf_lru_list *l,
+				   enum bpf_lru_list_type type)
+{
+	if (type < NR_BPF_LRU_LIST_COUNT)
+		l->counts[type]--;
+}
+
+static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
+					struct bpf_lru_node *node,
+					struct list_head *free_list,
+					enum bpf_lru_list_type tgt_free_type)
+{
+	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
+		return;
+
+	/* If the node being removed is the next_inactive_rotation candidate,
+	 * move the next_inactive_rotation pointer also.
+	 */
+	if (&node->list == l->next_inactive_rotation)
+		l->next_inactive_rotation = l->next_inactive_rotation->prev;
+
+	bpf_lru_list_count_dec(l, node->type);
+
+	node->type = tgt_free_type;
+	list_move(&node->list, free_list);
+}
+
+/* Move nodes from local list to the LRU list */
+static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
+				   struct bpf_lru_node *node,
+				   enum bpf_lru_list_type tgt_type)
+{
+	if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) ||
+	    WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
+		return;
+
+	bpf_lru_list_count_inc(l, tgt_type);
+	node->type = tgt_type;
+	node->ref = 0;
+	list_move(&node->list, &l->lists[tgt_type]);
+}
+
+/* Move nodes between or within the active and inactive lists (like
+ * active to inactive, inactive to active, or tail of active back to
+ * the head of active).
+ */
+static void __bpf_lru_node_move(struct bpf_lru_list *l,
+				struct bpf_lru_node *node,
+				enum bpf_lru_list_type tgt_type)
+{
+	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) ||
+	    WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
+		return;
+
+	if (node->type != tgt_type) {
+		bpf_lru_list_count_dec(l, node->type);
+		bpf_lru_list_count_inc(l, tgt_type);
+		node->type = tgt_type;
+	}
+	node->ref = 0;
+
+	/* If the node being moved is the next_inactive_rotation candidate,
+	 * move the next_inactive_rotation pointer also.
+	 */
+	if (&node->list == l->next_inactive_rotation)
+		l->next_inactive_rotation = l->next_inactive_rotation->prev;
+
+	list_move(&node->list, &l->lists[tgt_type]);
+}
+
+static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l)
+{
+	return l->counts[BPF_LRU_LIST_T_INACTIVE] <
+		l->counts[BPF_LRU_LIST_T_ACTIVE];
+}
+
+/* Rotate the active list:
+ * 1. Start from the tail
+ * 2.
If the node has the ref bit set, it will be rotated + * back to the head of active list with the ref bit cleared. + * Give this node one more chance to survive in the active list. + * 3. If the ref bit is not set, move it to the head of the + * inactive list. + * 4. It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_active(struct bpf_lru *lru, + struct bpf_lru_list *l) +{ + struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE]; + struct bpf_lru_node *node, *tmp_node, *first_node; + unsigned int i = 0; + + first_node = list_first_entry(active, struct bpf_lru_node, list); + list_for_each_entry_safe_reverse(node, tmp_node, active, list) { + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + else + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); + + if (++i == lru->nr_scans || node == first_node) + break; + } +} + +/* Rotate the inactive list. It starts from the next_inactive_rotation + * 1. If the node has ref bit set, it will be moved to the head + * of active list with the ref bit cleared. + * 2. If the node does not have ref bit set, it will leave it + * at its current location (i.e. do nothing) so that it can + * be considered during the next inactive_shrink. + * 3. It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, + struct bpf_lru_list *l) +{ + struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + struct list_head *cur, *next, *last; + struct bpf_lru_node *node; + unsigned int i = 0; + + if (list_empty(inactive)) + return; + + last = l->next_inactive_rotation->next; + if (last == inactive) + last = last->next; + + cur = l->next_inactive_rotation; + while (i < lru->nr_scans) { + if (cur == inactive) { + cur = cur->prev; + continue; + } + + node = list_entry(cur, struct bpf_lru_node, list); + next = cur->prev; + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + if (cur == last) + break; + cur = next; + i++; + } + + l->next_inactive_rotation = next; +} + +/* Shrink the inactive list. It starts from the tail of the + * inactive list and only move the nodes without the ref bit + * set to the designated free list. + */ +static unsigned int +__bpf_lru_list_shrink_inactive(struct bpf_lru *lru, + struct bpf_lru_list *l, + unsigned int tgt_nshrink, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) +{ + struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + struct bpf_lru_node *node, *tmp_node, *first_node; + unsigned int nshrinked = 0; + unsigned int i = 0; + + first_node = list_first_entry(inactive, struct bpf_lru_node, list); + list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { + if (bpf_lru_node_is_ref(node)) { + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + } else if (lru->del_from_htab(lru->del_arg, node)) { + __bpf_lru_node_move_to_free(l, node, free_list, + tgt_free_type); + if (++nshrinked == tgt_nshrink) + break; + } + + if (++i == lru->nr_scans) + break; + } + + return nshrinked; +} + +/* 1. Rotate the active list (if needed) + * 2. Always rotate the inactive list + */ +static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l) +{ + if (bpf_lru_list_inactive_low(l)) + __bpf_lru_list_rotate_active(lru, l); + + __bpf_lru_list_rotate_inactive(lru, l); +} + +/* Calls __bpf_lru_list_shrink_inactive() to shrink some + * ref-bit-cleared nodes and move them to the designated + * free list. 
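The rotation above is a second-chance scheme: a set ref bit buys the node another round in the active list, a clear bit demotes it to inactive, and the ref bit is cleared either way. A standalone toy model of that decision, assuming plain arrays instead of list_head and no locking:

#include <stdio.h>

struct node { int id; int ref; };

/* Simplified model of __bpf_lru_list_rotate_active(): scan from the
 * cold end; referenced nodes get a second chance in "active",
 * unreferenced ones are demoted to "inactive". Ref bits are cleared
 * either way, bounded by nr_scans.
 */
static void rotate_active(struct node *n, int cnt, int nr_scans)
{
	int i;

	for (i = 0; i < cnt && i < nr_scans; i++) {
		if (n[i].ref)
			printf("node %d: stays active (ref cleared)\n", n[i].id);
		else
			printf("node %d: demoted to inactive\n", n[i].id);
		n[i].ref = 0;
	}
}

int main(void)
{
	struct node n[] = { {1, 1}, {2, 0}, {3, 1}, {4, 0} };

	rotate_active(n, 4, 128 /* cf. LOCAL_NR_SCANS */);
	return 0;
}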
+ * + * If it cannot get a free node after calling + * __bpf_lru_list_shrink_inactive(). It will just remove + * one node from either inactive or active list without + * honoring the ref-bit. It prefers inactive list to active + * list in this situation. + */ +static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, + struct bpf_lru_list *l, + unsigned int tgt_nshrink, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) + +{ + struct bpf_lru_node *node, *tmp_node; + struct list_head *force_shrink_list; + unsigned int nshrinked; + + nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink, + free_list, tgt_free_type); + if (nshrinked) + return nshrinked; + + /* Do a force shrink by ignoring the reference bit */ + if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE])) + force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + else + force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE]; + + list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, + list) { + if (lru->del_from_htab(lru->del_arg, node)) { + __bpf_lru_node_move_to_free(l, node, free_list, + tgt_free_type); + return 1; + } + } + + return 0; +} + +/* Flush the nodes from the local pending list to the LRU list */ +static void __local_list_flush(struct bpf_lru_list *l, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node, *tmp_node; + + list_for_each_entry_safe_reverse(node, tmp_node, + local_pending_list(loc_l), list) { + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); + else + __bpf_lru_node_move_in(l, node, + BPF_LRU_LIST_T_INACTIVE); + } +} + +static void bpf_lru_list_push_free(struct bpf_lru_list *l, + struct bpf_lru_node *node) +{ + unsigned long flags; + + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) + return; + + raw_spin_lock_irqsave(&l->lock, flags); + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + raw_spin_unlock_irqrestore(&l->lock, flags); +} + +static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_list *l = &lru->common_lru.lru_list; + struct bpf_lru_node *node, *tmp_node; + unsigned int nfree = 0; + + raw_spin_lock(&l->lock); + + __local_list_flush(l, loc_l); + + __bpf_lru_list_rotate(lru, l); + + list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], + list) { + __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), + BPF_LRU_LOCAL_LIST_T_FREE); + if (++nfree == LOCAL_FREE_TARGET) + break; + } + + if (nfree < LOCAL_FREE_TARGET) + __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + local_free_list(loc_l), + BPF_LRU_LOCAL_LIST_T_FREE); + + raw_spin_unlock(&l->lock); +} + +static void __local_list_add_pending(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l, + int cpu, + struct bpf_lru_node *node, + u32 hash) +{ + *(u32 *)((void *)node + lru->hash_offset) = hash; + node->cpu = cpu; + node->type = BPF_LRU_LOCAL_LIST_T_PENDING; + node->ref = 0; + list_add(&node->list, local_pending_list(loc_l)); +} + +struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node; + + node = list_first_entry_or_null(local_free_list(loc_l), + struct bpf_lru_node, + list); + if (node) + list_del(&node->list); + + return node; +} + +struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node; + bool force = false; + +ignore_ref: + /* Get from the tail (i.e. older element) of the pending list. 
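The point of the local lists above is lock amortization: one acquisition of the global LRU lock in bpf_lru_list_pop_free_to_local() moves up to LOCAL_FREE_TARGET (128) free nodes into the calling CPU's local free list, so most subsequent allocations never touch the global lock. A toy model of that batch refill (global_pool/local_cache are illustrative stand-ins, not kernel API):

#include <stdio.h>

#define LOCAL_FREE_TARGET 128

/* One trip to the shared pool refills up to LOCAL_FREE_TARGET node
 * indexes into a per-CPU cache, amortizing the global lock over many
 * allocations.
 */
static int refill_local(int *global_pool, int *global_cnt,
			int *local_cache, int *local_cnt)
{
	int moved = 0;

	while (*global_cnt > 0 && moved < LOCAL_FREE_TARGET) {
		local_cache[(*local_cnt)++] = global_pool[--(*global_cnt)];
		moved++;
	}
	return moved;
}

int main(void)
{
	int pool[256], cache[LOCAL_FREE_TARGET];
	int pool_cnt = 256, cache_cnt = 0;
	int i;

	for (i = 0; i < 256; i++)
		pool[i] = i;
	printf("refilled %d nodes\n",
	       refill_local(pool, &pool_cnt, cache, &cache_cnt));
	return 0;
}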
*/ + list_for_each_entry_reverse(node, local_pending_list(loc_l), + list) { + if ((!bpf_lru_node_is_ref(node) || force) && + lru->del_from_htab(lru->del_arg, node)) { + list_del(&node->list); + return node; + } + } + + if (!force) { + force = true; + goto ignore_ref; + } + + return NULL; +} + +static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, + u32 hash) +{ + struct list_head *free_list; + struct bpf_lru_node *node = NULL; + struct bpf_lru_list *l; + unsigned long flags; + int cpu = raw_smp_processor_id(); + + l = per_cpu_ptr(lru->percpu_lru, cpu); + + raw_spin_lock_irqsave(&l->lock, flags); + + __bpf_lru_list_rotate(lru, l); + + free_list = &l->lists[BPF_LRU_LIST_T_FREE]; + if (list_empty(free_list)) + __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list, + BPF_LRU_LIST_T_FREE); + + if (!list_empty(free_list)) { + node = list_first_entry(free_list, struct bpf_lru_node, list); + *(u32 *)((void *)node + lru->hash_offset) = hash; + node->ref = 0; + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); + } + + raw_spin_unlock_irqrestore(&l->lock, flags); + + return node; +} + +static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, + u32 hash) +{ + struct bpf_lru_locallist *loc_l, *steal_loc_l; + struct bpf_common_lru *clru = &lru->common_lru; + struct bpf_lru_node *node; + int steal, first_steal; + unsigned long flags; + int cpu = raw_smp_processor_id(); + + loc_l = per_cpu_ptr(clru->local_list, cpu); + + raw_spin_lock_irqsave(&loc_l->lock, flags); + + node = __local_list_pop_free(loc_l); + if (!node) { + bpf_lru_list_pop_free_to_local(lru, loc_l); + node = __local_list_pop_free(loc_l); + } + + if (node) + __local_list_add_pending(lru, loc_l, cpu, node, hash); + + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + + if (node) + return node; + + /* No free nodes found from the local free list and + * the global LRU list. + * + * Steal from the local free/pending list of the + * current CPU and remote CPU in RR. It starts + * with the loc_l->next_steal CPU. 
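A userspace model of that round-robin steal walk, with a simple modular increment standing in for cpumask_next()/cpumask_first() and a stubbed try_steal() in place of popping the remote free/pending lists:

#include <stdio.h>

/* Stand-in for raiding another CPU's local lists; pretend CPU 3 has
 * a spare node.
 */
static int try_steal(int cpu) { return cpu == 3; }

int main(void)
{
	int nr_cpus = 4, next_steal = 1;
	int first = next_steal, cpu = first, found = -1;

	do {
		if (try_steal(cpu)) {
			found = cpu;
			break;
		}
		cpu = (cpu + 1) % nr_cpus;	/* get_next_cpu() analogue */
	} while (cpu != first);		/* stop after one full cycle */

	printf("stole from cpu %d\n", found);
	return 0;
}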
+ */ + + first_steal = loc_l->next_steal; + steal = first_steal; + do { + steal_loc_l = per_cpu_ptr(clru->local_list, steal); + + raw_spin_lock_irqsave(&steal_loc_l->lock, flags); + + node = __local_list_pop_free(steal_loc_l); + if (!node) + node = __local_list_pop_pending(lru, steal_loc_l); + + raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + + steal = get_next_cpu(steal); + } while (!node && steal != first_steal); + + loc_l->next_steal = steal; + + if (node) { + raw_spin_lock_irqsave(&loc_l->lock, flags); + __local_list_add_pending(lru, loc_l, cpu, node, hash); + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + } + + return node; +} + +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) +{ + if (lru->percpu) + return bpf_percpu_lru_pop_free(lru, hash); + else + return bpf_common_lru_pop_free(lru, hash); +} + +static void bpf_common_lru_push_free(struct bpf_lru *lru, + struct bpf_lru_node *node) +{ + unsigned long flags; + + if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) || + WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE)) + return; + + if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); + + raw_spin_lock_irqsave(&loc_l->lock, flags); + + if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + goto check_lru_list; + } + + node->type = BPF_LRU_LOCAL_LIST_T_FREE; + node->ref = 0; + list_move(&node->list, local_free_list(loc_l)); + + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + return; + } + +check_lru_list: + bpf_lru_list_push_free(&lru->common_lru.lru_list, node); +} + +static void bpf_percpu_lru_push_free(struct bpf_lru *lru, + struct bpf_lru_node *node) +{ + struct bpf_lru_list *l; + unsigned long flags; + + l = per_cpu_ptr(lru->percpu_lru, node->cpu); + + raw_spin_lock_irqsave(&l->lock, flags); + + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + + raw_spin_unlock_irqrestore(&l->lock, flags); +} + +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) +{ + if (lru->percpu) + bpf_percpu_lru_push_free(lru, node); + else + bpf_common_lru_push_free(lru, node); +} + +void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + struct bpf_lru_list *l = &lru->common_lru.lru_list; + u32 i; + + for (i = 0; i < nr_elems; i++) { + struct bpf_lru_node *node; + + node = (struct bpf_lru_node *)(buf + node_offset); + node->type = BPF_LRU_LIST_T_FREE; + node->ref = 0; + list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); + buf += elem_size; + } +} + +void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + u32 i, pcpu_entries; + int cpu; + struct bpf_lru_list *l; + + pcpu_entries = nr_elems / num_possible_cpus(); + + i = 0; + + for_each_possible_cpu(cpu) { + struct bpf_lru_node *node; + + l = per_cpu_ptr(lru->percpu_lru, cpu); +again: + node = (struct bpf_lru_node *)(buf + node_offset); + node->cpu = cpu; + node->type = BPF_LRU_LIST_T_FREE; + node->ref = 0; + list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } +} + +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + if (lru->percpu) + bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, + nr_elems); + else + bpf_common_lru_populate(lru, buf, node_offset, elem_size, 
+ nr_elems); +} + +static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) +{ + int i; + + for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) + INIT_LIST_HEAD(&loc_l->lists[i]); + + loc_l->next_steal = cpu; + + raw_spin_lock_init(&loc_l->lock); +} + +static void bpf_lru_list_init(struct bpf_lru_list *l) +{ + int i; + + for (i = 0; i < NR_BPF_LRU_LIST_T; i++) + INIT_LIST_HEAD(&l->lists[i]); + + for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++) + l->counts[i] = 0; + + l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + + raw_spin_lock_init(&l->lock); +} + +int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, + del_from_htab_func del_from_htab, void *del_arg) +{ + int cpu; + + if (percpu) { + lru->percpu_lru = alloc_percpu(struct bpf_lru_list); + if (!lru->percpu_lru) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct bpf_lru_list *l; + + l = per_cpu_ptr(lru->percpu_lru, cpu); + bpf_lru_list_init(l); + } + lru->nr_scans = PERCPU_NR_SCANS; + } else { + struct bpf_common_lru *clru = &lru->common_lru; + + clru->local_list = alloc_percpu(struct bpf_lru_locallist); + if (!clru->local_list) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(clru->local_list, cpu); + bpf_lru_locallist_init(loc_l, cpu); + } + + bpf_lru_list_init(&clru->lru_list); + lru->nr_scans = LOCAL_NR_SCANS; + } + + lru->percpu = percpu; + lru->del_from_htab = del_from_htab; + lru->del_arg = del_arg; + lru->hash_offset = hash_offset; + + return 0; +} + +void bpf_lru_destroy(struct bpf_lru *lru) +{ + if (lru->percpu) + free_percpu(lru->percpu_lru); + else + free_percpu(lru->common_lru.local_list); +} diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h new file mode 100644 index 0000000000000000000000000000000000000000..5c35a98d02bf281d0d9527248c37aca0b9876fb1 --- /dev/null +++ b/kernel/bpf/bpf_lru_list.h @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
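Since bpf_percpu_lru_populate() above hands each possible CPU nr_elems / num_possible_cpus() nodes, and htab_map_alloc() (further below) rounds max_entries to a multiple of the CPU count, the split comes out even. A quick standalone check of that arithmetic, with illustrative numbers:

#include <stdio.h>

int main(void)
{
	unsigned int nr_cpus = 4, max_entries = 10001;
	/* roundup(max_entries, nr_cpus), as done at map_alloc time */
	unsigned int rounded = (max_entries + nr_cpus - 1) / nr_cpus * nr_cpus;
	unsigned int pcpu_entries = rounded / nr_cpus;

	printf("rounded:%u per-cpu:%u\n", rounded, pcpu_entries);
	return 0;
}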
+ */
+#ifndef __BPF_LRU_LIST_H_
+#define __BPF_LRU_LIST_H_
+
+#include <linux/list.h>
+#include <linux/spinlock_types.h>
+
+#define NR_BPF_LRU_LIST_T	(3)
+#define NR_BPF_LRU_LIST_COUNT	(2)
+#define NR_BPF_LRU_LOCAL_LIST_T (2)
+#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T
+
+enum bpf_lru_list_type {
+	BPF_LRU_LIST_T_ACTIVE,
+	BPF_LRU_LIST_T_INACTIVE,
+	BPF_LRU_LIST_T_FREE,
+	BPF_LRU_LOCAL_LIST_T_FREE,
+	BPF_LRU_LOCAL_LIST_T_PENDING,
+};
+
+struct bpf_lru_node {
+	struct list_head list;
+	u16 cpu;
+	u8 type;
+	u8 ref;
+};
+
+struct bpf_lru_list {
+	struct list_head lists[NR_BPF_LRU_LIST_T];
+	unsigned int counts[NR_BPF_LRU_LIST_COUNT];
+	/* The next inactive list rotation starts from here */
+	struct list_head *next_inactive_rotation;
+
+	raw_spinlock_t lock ____cacheline_aligned_in_smp;
+};
+
+struct bpf_lru_locallist {
+	struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
+	u16 next_steal;
+	raw_spinlock_t lock;
+};
+
+struct bpf_common_lru {
+	struct bpf_lru_list lru_list;
+	struct bpf_lru_locallist __percpu *local_list;
+};
+
+typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node);
+
+struct bpf_lru {
+	union {
+		struct bpf_common_lru common_lru;
+		struct bpf_lru_list __percpu *percpu_lru;
+	};
+	del_from_htab_func del_from_htab;
+	void *del_arg;
+	unsigned int hash_offset;
+	unsigned int nr_scans;
+	bool percpu;
+};
+
+static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
+{
+	/* ref is an approximation of access frequency. It does not
+	 * have to be very accurate. Hence, no protection is used.
+	 */
+	node->ref = 1;
+}
+
+int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+		 del_from_htab_func del_from_htab, void *delete_arg);
+void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
+		      u32 elem_size, u32 nr_elems);
+void bpf_lru_destroy(struct bpf_lru *lru);
+struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash);
+void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
+void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node);
+
+#endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index ad1bc67aff1b514fe9801e695ef9f428091227fc..34debc1a9641875382b6603820f3e85c4a18dffc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -15,6 +15,7 @@
 #include <linux/filter.h>
 #include <linux/vmalloc.h>
 #include "percpu_freelist.h"
+#include "bpf_lru_list.h"
 
 struct bucket {
 	struct hlist_head head;
@@ -25,7 +26,10 @@ struct bpf_htab {
 	struct bpf_map map;
 	struct bucket *buckets;
 	void *elems;
-	struct pcpu_freelist freelist;
+	union {
+		struct pcpu_freelist freelist;
+		struct bpf_lru lru;
+	};
 	void __percpu *extra_elems;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
@@ -48,11 +52,26 @@ struct htab_elem {
 	union {
 		struct rcu_head rcu;
 		enum extra_elem_state state;
+		struct bpf_lru_node lru_node;
 	};
 	u32 hash;
 	char key[0] __aligned(8);
 };
 
+static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
+
+static bool htab_is_lru(const struct bpf_htab *htab)
+{
+	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
+		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
+}
+
+static bool htab_is_percpu(const struct bpf_htab *htab)
+{
+	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
+}
+
 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
 				     void __percpu *pptr)
 {
@@ -73,7 +92,7 @@ static void htab_free_elems(struct bpf_htab *htab)
 {
 	int i;
 
-	if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+	if
(!htab_is_percpu(htab)) goto free_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -87,7 +106,22 @@ static void htab_free_elems(struct bpf_htab *htab) vfree(htab->elems); } -static int prealloc_elems_and_freelist(struct bpf_htab *htab) +static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, + u32 hash) +{ + struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); + struct htab_elem *l; + + if (node) { + l = container_of(node, struct htab_elem, lru_node); + memcpy(l->key, key, htab->map.key_size); + return l; + } + + return NULL; +} + +static int prealloc_init(struct bpf_htab *htab) { int err = -ENOMEM, i; @@ -95,7 +129,7 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) if (!htab->elems) return -ENOMEM; - if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + if (!htab_is_percpu(htab)) goto skip_percpu_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -110,12 +144,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) } skip_percpu_elems: - err = pcpu_freelist_init(&htab->freelist); + if (htab_is_lru(htab)) + err = bpf_lru_init(&htab->lru, + htab->map.map_flags & BPF_F_NO_COMMON_LRU, + offsetof(struct htab_elem, hash) - + offsetof(struct htab_elem, lru_node), + htab_lru_map_delete_node, + htab); + else + err = pcpu_freelist_init(&htab->freelist); + if (err) goto free_elems; - pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, - htab->map.max_entries); + if (htab_is_lru(htab)) + bpf_lru_populate(&htab->lru, htab->elems, + offsetof(struct htab_elem, lru_node), + htab->elem_size, htab->map.max_entries); + else + pcpu_freelist_populate(&htab->freelist, htab->elems, + htab->elem_size, htab->map.max_entries); + return 0; free_elems: @@ -123,6 +172,16 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) return err; } +static void prealloc_destroy(struct bpf_htab *htab) +{ + htab_free_elems(htab); + + if (htab_is_lru(htab)) + bpf_lru_destroy(&htab->lru); + else + pcpu_freelist_destroy(&htab->freelist); +} + static int alloc_extra_elems(struct bpf_htab *htab) { void __percpu *pptr; @@ -143,15 +202,37 @@ static int alloc_extra_elems(struct bpf_htab *htab) /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { - bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; + bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + /* percpu_lru means each cpu has its own LRU list. + * it is different from BPF_MAP_TYPE_PERCPU_HASH where + * the map's value itself is percpu. percpu_lru has + * nothing to do with the map's value. + */ + bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); + bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; int err, i; u64 cost; - if (attr->map_flags & ~BPF_F_NO_PREALLOC) + if (lru && !capable(CAP_SYS_ADMIN)) + /* LRU implementation is much complicated than other + * maps. Hence, limit to CAP_SYS_ADMIN for now. 
+ */ + return ERR_PTR(-EPERM); + + if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU)) /* reserved bits should not be used */ return ERR_PTR(-EINVAL); + if (!lru && percpu_lru) + return ERR_PTR(-EINVAL); + + if (lru && !prealloc) + return ERR_PTR(-ENOTSUPP); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -171,6 +252,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.value_size == 0) goto free_htab; + if (percpu_lru) { + /* ensure each CPU's lru list has >=1 elements. + * since we are at it, make each lru list has the same + * number of elements. + */ + htab->map.max_entries = roundup(attr->max_entries, + num_possible_cpus()); + if (htab->map.max_entries < attr->max_entries) + htab->map.max_entries = rounddown(attr->max_entries, + num_possible_cpus()); + } + /* hash table size must be power of 2 */ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); @@ -241,14 +334,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - if (!percpu) { + if (!percpu && !lru) { + /* lru itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ err = alloc_extra_elems(htab); if (err) goto free_buckets; } - if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { - err = prealloc_elems_and_freelist(htab); + if (prealloc) { + err = prealloc_init(htab); if (err) goto free_extra_elems; } @@ -323,6 +419,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return l->key + round_up(map->key_size, 8); + } + + return NULL; +} + +/* It is called from the bpf_lru_list when the LRU needs to delete + * older elements from the htab. 
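The hash_offset wired up in prealloc_init() above (offsetof(struct htab_elem, hash) - offsetof(struct htab_elem, lru_node)) and the container_of() used in the deletion callback below are two views of the same layout trick: the LRU core sees only struct bpf_lru_node yet can stamp the element's hash, while the htab side recovers the enclosing element from the node pointer. A minimal standalone illustration with simplified stand-in structs:

#include <stdio.h>
#include <stddef.h>

struct lru_node { int type; };
struct elem { long key; struct lru_node lru_node; unsigned int hash; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct elem e;
	/* the LRU core is handed only this delta at init time */
	size_t hash_offset = offsetof(struct elem, hash) -
			     offsetof(struct elem, lru_node);
	struct lru_node *node = &e.lru_node;

	/* LRU side: write the hash without knowing struct elem */
	*(unsigned int *)((char *)node + hash_offset) = 0xabcd;
	/* htab side: recover the element and read it back */
	printf("hash:%x\n", container_of(node, struct elem, lru_node)->hash);
	return 0;
}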
+ */ +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) +{ + struct bpf_htab *htab = (struct bpf_htab *)arg; + struct htab_elem *l, *tgt_l; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + + tgt_l = container_of(node, struct htab_elem, lru_node); + b = __select_bucket(htab, tgt_l->hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + hlist_for_each_entry_rcu(l, head, hash_node) + if (l == tgt_l) { + hlist_del_rcu(&l->hash_node); + break; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + + return l == tgt_l; +} + /* Called from syscall */ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -420,6 +556,24 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } } +static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, + void *value, bool onallcpus) +{ + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + u32 size = round_up(htab->map.value_size, 8); + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } +} + static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, @@ -479,18 +633,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - if (!onallcpus) { - /* copy true value_size bytes */ - memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); - } else { - int off = 0, cpu; + pcpu_copy_value(htab, pptr, value, onallcpus); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); - off += size; - } - } if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); } else { @@ -571,6 +715,70 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, return ret; } +static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new, *l_old = NULL; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because getting free nodes from LRU may need + * to remove older elements from htab and this removal + * operation will need a bucket lock. 
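pcpu_copy_value() above fixes the cross-CPU value layout: when onallcpus is set, the source buffer carries one value per possible CPU, each slot padded to a multiple of 8 bytes, which is also the layout the syscall path exposes to userspace. A small sketch of the offsets involved (nr_cpus and value_size are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int value_size = 12, nr_cpus = 4;
	unsigned int stride = (value_size + 7) / 8 * 8;	/* round_up(,8) */
	unsigned int cpu;

	for (cpu = 0; cpu < nr_cpus; cpu++)
		printf("cpu %u value at buffer offset %u\n", cpu, cpu * stride);
	printf("total buffer: %u bytes\n", nr_cpus * stride);
	return 0;
}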
+ */ + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + /* add new element to the head of the list, so that + * concurrent search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + bpf_lru_node_set_ref(&l_new->lru_node); + hlist_del_rcu(&l_old->hash_node); + } + ret = 0; + +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + + if (ret) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + else if (l_old) + bpf_lru_push_free(&htab->lru, &l_old->lru_node); + + return ret; +} + static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags, bool onallcpus) @@ -606,22 +814,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { - void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); - u32 size = htab->map.value_size; - /* per-cpu hash map can update value in-place */ - if (!onallcpus) { - memcpy(this_cpu_ptr(pptr), value, size); - } else { - int off = 0, cpu; - - size = round_up(size, 8); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); - off += size; - } - } + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, true, onallcpus, false); @@ -637,12 +832,84 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, return ret; } +static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because LRU's elem alloc may need + * to remove older elem from htab and this removal + * operation will need a bucket lock. 
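The same take-from-LRU-before-locking-the-bucket rule appears in both LRU update paths. A toy illustration of the ordering, where lock()/unlock()/lru_pop()/lru_push() are stand-ins rather than kernel API:

#include <stdio.h>
#include <stdlib.h>

static void lock(void)    { printf("bucket lock\n"); }
static void unlock(void)  { printf("bucket unlock\n"); }
static int *lru_pop(void) { return malloc(sizeof(int)); /* may reclaim */ }
static void lru_push(int *n) { free(n); }

int main(void)
{
	int *l_new = lru_pop();	/* step 1: get the node outside the lock,
				 * since reclaim may need a bucket lock */

	if (!l_new)
		return -1;
	lock();			/* step 2: now take the bucket lock */
	/* ... lookup l_old, link l_new in front of it ... */
	unlock();
	lru_push(l_new);	/* the real code returns a node here only on
				 * failure (l_new) or replacement (l_old);
				 * this toy always gives it back */
	return 0;
}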
+ */ + if (map_flags != BPF_EXIST) { + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + } + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + bpf_lru_node_set_ref(&l_old->lru_node); + + /* per-cpu hash map can update value in-place */ + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + value, onallcpus); + hlist_add_head_rcu(&l_new->hash_node, head); + l_new = NULL; + } + ret = 0; +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l_new) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + return ret; +} + static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __htab_percpu_map_update_elem(map, key, value, map_flags, false); } +static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, + false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { @@ -676,6 +943,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) return ret; } +static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct bucket *b; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret = -ENOENT; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) { + hlist_del_rcu(&l->hash_node); + ret = 0; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l) + bpf_lru_push_free(&htab->lru, &l->lru_node); + return ret; +} + static void delete_all_elements(struct bpf_htab *htab) { int i; @@ -708,12 +1008,11 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. 
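From the syscall side (see the map_lookup_elem()/map_update_elem() changes further below), an update to a BPF_MAP_TYPE_LRU_PERCPU_HASH map supplies round_up(value_size, 8) bytes per possible CPU, which the kernel fans out via pcpu_copy_value(..., onallcpus = true). A hedged sketch using the raw bpf(2) syscall; the map fd is assumed to come from a creation helper like the one sketched earlier:

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static int bpf_update(int fd, const void *key, const void *value, __u64 flags)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = fd;
	attr.key = (__u64)(unsigned long)key;
	attr.value = (__u64)(unsigned long)value;
	attr.flags = flags;

	return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

int main(void)
{
	unsigned long long key = 1;
	unsigned long long values[128];	/* one 8-byte slot per possible CPU;
					 * 128 is an arbitrary bound here */
	int map_fd = -1;		/* placeholder: create the map first */

	memset(values, 0, sizeof(values));
	values[0] = 1234;		/* value seen by CPU 0 */
	if (bpf_update(map_fd, &key, values, BPF_ANY))
		perror("BPF_MAP_UPDATE_ELEM");
	return 0;
}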
*/ rcu_barrier(); - if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + if (htab->map.map_flags & BPF_F_NO_PREALLOC) delete_all_elements(htab); - } else { - htab_free_elems(htab); - pcpu_freelist_destroy(&htab->freelist); - } + else + prealloc_destroy(htab); + free_percpu(htab->extra_elems); kvfree(htab->buckets); kfree(htab); @@ -733,6 +1032,20 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +static const struct bpf_map_ops htab_lru_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_map_lookup_elem, + .map_update_elem = htab_lru_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_type __read_mostly = { + .ops = &htab_lru_ops, + .type = BPF_MAP_TYPE_LRU_HASH, +}; + /* Called from eBPF program */ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) { @@ -744,8 +1057,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + } + + return NULL; +} + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; @@ -761,6 +1087,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; + if (htab_is_lru(htab)) + bpf_lru_node_set_ref(&l->lru_node); pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, @@ -776,10 +1104,16 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); int ret; rcu_read_lock(); - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + if (htab_is_lru(htab)) + ret = __htab_lru_percpu_map_update_elem(map, key, value, + map_flags, true); + else + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, + true); rcu_read_unlock(); return ret; @@ -799,10 +1133,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = { .type = BPF_MAP_TYPE_PERCPU_HASH, }; +static const struct bpf_map_ops htab_lru_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_percpu_map_lookup_elem, + .map_update_elem = htab_lru_percpu_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { + .ops = &htab_lru_percpu_ops, + .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); + bpf_register_map_type(&htab_lru_type); + bpf_register_map_type(&htab_lru_percpu_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 233e3ac836a6689e7aefee822cbd38a3f4d00216..ce1b7de7d72c6926ac5d4a50f812df764d25a07c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -292,6 +292,7 @@ static int map_lookup_elem(union 
bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -302,7 +303,8 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -366,6 +368,7 @@ static int map_update_elem(union bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -385,7 +388,8 @@ static int map_update_elem(union bpf_attr *attr) */ preempt_disable(); __this_cpu_inc(bpf_prog_active); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index ac87f9c068ae72a96ec63fedc7ddc51f665fa081..f394ac616ed8ea37b1929236ce862ac2260039d7 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -2,6 +2,7 @@ obj- := dummy.o # List of programs to build +hostprogs-y := test_lru_dist hostprogs-y += sock_example hostprogs-y += fds_example hostprogs-y += sockex1 @@ -28,6 +29,7 @@ hostprogs-y += trace_event hostprogs-y += sampleip hostprogs-y += tc_l2_redirect +test_lru_dist-objs := test_lru_dist.o libbpf.o sock_example-objs := sock_example.o libbpf.o fds_example-objs := bpf_load.o libbpf.o fds_example.o sockex1-objs := bpf_load.o libbpf.o sockex1_user.o diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index 311538e5a7016e7bfdd250779d60491051ef8b4b..7ee1574c8ccff49ad7a2a301c4462f66e46d88da 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -19,6 +19,21 @@ struct bpf_map_def SEC("maps") hash_map = { .max_entries = MAX_ENTRIES, }; +struct bpf_map_def SEC("maps") lru_hash_map = { + .type = BPF_MAP_TYPE_LRU_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = 10000, +}; + +struct bpf_map_def SEC("maps") percpu_lru_hash_map = { + .type = BPF_MAP_TYPE_LRU_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = 10000, + .map_flags = BPF_F_NO_COMMON_LRU, +}; + struct bpf_map_def SEC("maps") percpu_hash_map = { .type = BPF_MAP_TYPE_PERCPU_HASH, .key_size = sizeof(u32), @@ -53,6 +68,7 @@ int stress_hmap(struct pt_regs *ctx) value = bpf_map_lookup_elem(&hash_map, &key); if (value) bpf_map_delete_elem(&hash_map, &key); + return 0; } @@ -96,5 +112,28 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx) bpf_map_delete_elem(&percpu_hash_map_alloc, &key); return 0; } + +SEC("kprobe/sys_getpid") +int stress_lru_hmap_alloc(struct pt_regs *ctx) +{ + u32 key = bpf_get_prandom_u32(); + long val = 1; + + bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY); + + return 0; +} + +SEC("kprobe/sys_getppid") +int stress_percpu_lru_hmap_alloc(struct pt_regs *ctx) +{ + u32 key = bpf_get_prandom_u32(); + long val = 1; + + 
bpf_map_update_elem(&percpu_lru_hash_map, &key, &val, BPF_ANY);
+
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
 u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index 3147377e8fd3c98fe455c6d96f5c774cb5392f11..9505b4d112f426790645ecd00c50263620ae68e6 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -35,6 +35,8 @@ static __u64 time_get_ns(void)
 #define PERCPU_HASH_PREALLOC	(1 << 1)
 #define HASH_KMALLOC		(1 << 2)
 #define PERCPU_HASH_KMALLOC	(1 << 3)
+#define LRU_HASH_PREALLOC	(1 << 4)
+#define PERCPU_LRU_HASH_PREALLOC	(1 << 5)
 
 static int test_flags = ~0;
 
@@ -50,6 +52,30 @@ static void test_hash_prealloc(int cpu)
 	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
 }
 
+static void test_lru_hash_prealloc(int cpu)
+{
+	__u64 start_time;
+	int i;
+
+	start_time = time_get_ns();
+	for (i = 0; i < MAX_CNT; i++)
+		syscall(__NR_getpid);
+	printf("%d:lru_hash_map_perf pre-alloc %lld events per sec\n",
+	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+}
+
+static void test_percpu_lru_hash_prealloc(int cpu)
+{
+	__u64 start_time;
+	int i;
+
+	start_time = time_get_ns();
+	for (i = 0; i < MAX_CNT; i++)
+		syscall(__NR_getppid);
+	printf("%d:percpu_lru_hash_map_perf pre-alloc %lld events per sec\n",
+	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+}
+
 static void test_percpu_hash_prealloc(int cpu)
 {
 	__u64 start_time;
@@ -105,6 +131,12 @@ static void loop(int cpu)
 
 	if (test_flags & PERCPU_HASH_KMALLOC)
 		test_percpu_hash_kmalloc(cpu);
+
+	if (test_flags & LRU_HASH_PREALLOC)
+		test_lru_hash_prealloc(cpu);
+
+	if (test_flags & PERCPU_LRU_HASH_PREALLOC)
+		test_percpu_lru_hash_prealloc(cpu);
 }
 
 static void run_perf_test(int tasks)
diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c
new file mode 100644
index 0000000000000000000000000000000000000000..2859977b7f37dfe48f47ab76a553e1fbea4e2d4b
--- /dev/null
+++ b/samples/bpf/test_lru_dist.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define _GNU_SOURCE
+#include <linux/types.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <sched.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <time.h>
+#include "libbpf.h"
+
+#define min(a, b) ((a) < (b) ?
(a) : (b)) +#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +static int nr_cpus; +static unsigned long long *dist_keys; +static unsigned int dist_key_counts; + +struct list_head { + struct list_head *next, *prev; +}; + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +static inline void __list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +static inline void __list_del_entry(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del_entry(list); + list_add(list, head); +} + +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +struct pfect_lru_node { + struct list_head list; + unsigned long long key; +}; + +struct pfect_lru { + struct list_head list; + struct pfect_lru_node *free_nodes; + unsigned int cur_size; + unsigned int lru_size; + unsigned int nr_unique; + unsigned int nr_misses; + unsigned int total; + int map_fd; +}; + +static void pfect_lru_init(struct pfect_lru *lru, unsigned int lru_size, + unsigned int nr_possible_elems) +{ + lru->map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, + sizeof(unsigned long long), + sizeof(struct pfect_lru_node *), + nr_possible_elems, 0); + assert(lru->map_fd != -1); + + lru->free_nodes = malloc(lru_size * sizeof(struct pfect_lru_node)); + assert(lru->free_nodes); + + INIT_LIST_HEAD(&lru->list); + lru->cur_size = 0; + lru->lru_size = lru_size; + lru->nr_unique = lru->nr_misses = lru->total = 0; +} + +static void pfect_lru_destroy(struct pfect_lru *lru) +{ + close(lru->map_fd); + free(lru->free_nodes); +} + +static int pfect_lru_lookup_or_insert(struct pfect_lru *lru, + unsigned long long key) +{ + struct pfect_lru_node *node = NULL; + int seen = 0; + + lru->total++; + if (!bpf_lookup_elem(lru->map_fd, &key, &node)) { + if (node) { + list_move(&node->list, &lru->list); + return 1; + } + seen = 1; + } + + if (lru->cur_size < lru->lru_size) { + node = &lru->free_nodes[lru->cur_size++]; + INIT_LIST_HEAD(&node->list); + } else { + struct pfect_lru_node *null_node = NULL; + + node = list_last_entry(&lru->list, + struct pfect_lru_node, + list); + bpf_update_elem(lru->map_fd, &node->key, &null_node, BPF_EXIST); + } + + node->key = key; + list_move(&node->list, &lru->list); + + lru->nr_misses++; + if (seen) { + assert(!bpf_update_elem(lru->map_fd, &key, &node, BPF_EXIST)); + } else { + lru->nr_unique++; + assert(!bpf_update_elem(lru->map_fd, &key, &node, BPF_NOEXIST)); + } + + return seen; +} + +static unsigned int read_keys(const char *dist_file, + unsigned long long **keys) +{ + struct stat fst; + unsigned long long *retkeys; + unsigned int counts = 0; + int dist_fd; + char *b, *l; + int i; + + dist_fd = open(dist_file, 0); + assert(dist_fd != -1); + + 
assert(fstat(dist_fd, &fst) == 0); + b = malloc(fst.st_size); + assert(b); + + assert(read(dist_fd, b, fst.st_size) == fst.st_size); + close(dist_fd); + for (i = 0; i < fst.st_size; i++) { + if (b[i] == '\n') + counts++; + } + counts++; /* in case the last line has no \n */ + + retkeys = malloc(counts * sizeof(unsigned long long)); + assert(retkeys); + + counts = 0; + for (l = strtok(b, "\n"); l; l = strtok(NULL, "\n")) + retkeys[counts++] = strtoull(l, NULL, 10); + free(b); + + *keys = retkeys; + + return counts; +} + +static int create_map(int map_type, int map_flags, unsigned int size) +{ + int map_fd; + + map_fd = bpf_create_map(map_type, sizeof(unsigned long long), + sizeof(unsigned long long), size, map_flags); + + if (map_fd == -1) + perror("bpf_create_map"); + + return map_fd; +} + +static int sched_next_online(int pid, int next_to_try) +{ + cpu_set_t cpuset; + + if (next_to_try == nr_cpus) + return -1; + + while (next_to_try < nr_cpus) { + CPU_ZERO(&cpuset); + CPU_SET(next_to_try++, &cpuset); + if (!sched_setaffinity(pid, sizeof(cpuset), &cpuset)) + break; + } + + return next_to_try; +} + +static void run_parallel(unsigned int tasks, void (*fn)(int i, void *data), + void *data) +{ + int next_sched_cpu = 0; + pid_t pid[tasks]; + int i; + + for (i = 0; i < tasks; i++) { + pid[i] = fork(); + if (pid[i] == 0) { + next_sched_cpu = sched_next_online(0, next_sched_cpu); + fn(i, data); + exit(0); + } else if (pid[i] == -1) { + printf("couldn't spawn #%d process\n", i); + exit(1); + } + /* It is mostly redundant and just allow the parent + * process to update next_shced_cpu for the next child + * process + */ + next_sched_cpu = sched_next_online(pid[i], next_sched_cpu); + } + for (i = 0; i < tasks; i++) { + int status; + + assert(waitpid(pid[i], &status, 0) == pid[i]); + assert(status == 0); + } +} + +static void do_test_lru_dist(int task, void *data) +{ + unsigned int nr_misses = 0; + struct pfect_lru pfect_lru; + unsigned long long key, value = 1234; + unsigned int i; + + unsigned int lru_map_fd = ((unsigned int *)data)[0]; + unsigned int lru_size = ((unsigned int *)data)[1]; + unsigned long long key_offset = task * dist_key_counts; + + pfect_lru_init(&pfect_lru, lru_size, dist_key_counts); + + for (i = 0; i < dist_key_counts; i++) { + key = dist_keys[i] + key_offset; + + pfect_lru_lookup_or_insert(&pfect_lru, key); + + if (!bpf_lookup_elem(lru_map_fd, &key, &value)) + continue; + + if (bpf_update_elem(lru_map_fd, &key, &value, BPF_NOEXIST)) { + printf("bpf_update_elem(lru_map_fd, %llu): errno:%d\n", + key, errno); + assert(0); + } + + nr_misses++; + } + + printf(" task:%d BPF LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n", + task, pfect_lru.nr_unique, dist_key_counts, nr_misses, + dist_key_counts); + printf(" task:%d Perfect LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n", + task, pfect_lru.nr_unique, pfect_lru.total, + pfect_lru.nr_misses, pfect_lru.total); + + pfect_lru_destroy(&pfect_lru); + close(lru_map_fd); +} + +static void test_parallel_lru_dist(int map_type, int map_flags, + int nr_tasks, unsigned int lru_size) +{ + int child_data[2]; + int lru_map_fd; + + printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type, + map_flags); + + if (map_flags & BPF_F_NO_COMMON_LRU) + lru_map_fd = create_map(map_type, map_flags, + nr_cpus * lru_size); + else + lru_map_fd = create_map(map_type, map_flags, + nr_tasks * lru_size); + assert(lru_map_fd != -1); + + child_data[0] = lru_map_fd; + child_data[1] = lru_size; + + run_parallel(nr_tasks, do_test_lru_dist, child_data); + + close(lru_map_fd); 
+} + +static void test_lru_loss0(int map_type, int map_flags) +{ + unsigned long long key, value[nr_cpus]; + unsigned int old_unused_losses = 0; + unsigned int new_unused_losses = 0; + unsigned int used_losses = 0; + int map_fd; + + printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type, + map_flags); + + assert(sched_next_online(0, 0) != -1); + + if (map_flags & BPF_F_NO_COMMON_LRU) + map_fd = create_map(map_type, map_flags, 900 * nr_cpus); + else + map_fd = create_map(map_type, map_flags, 900); + + assert(map_fd != -1); + + value[0] = 1234; + + for (key = 1; key <= 1000; key++) { + int start_key, end_key; + + assert(bpf_update_elem(map_fd, &key, value, BPF_NOEXIST) == 0); + + start_key = 101; + end_key = min(key, 900); + + while (start_key <= end_key) { + bpf_lookup_elem(map_fd, &start_key, value); + start_key++; + } + } + + for (key = 1; key <= 1000; key++) { + if (bpf_lookup_elem(map_fd, &key, value)) { + if (key <= 100) + old_unused_losses++; + else if (key <= 900) + used_losses++; + else + new_unused_losses++; + } + } + + close(map_fd); + + printf("older-elem-losses:%d(/100) active-elem-losses:%d(/800) " + "newer-elem-losses:%d(/100)\n", + old_unused_losses, used_losses, new_unused_losses); +} + +static void test_lru_loss1(int map_type, int map_flags) +{ + unsigned long long key, value[nr_cpus]; + int map_fd; + unsigned int nr_losses = 0; + + printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type, + map_flags); + + assert(sched_next_online(0, 0) != -1); + + if (map_flags & BPF_F_NO_COMMON_LRU) + map_fd = create_map(map_type, map_flags, 1000 * nr_cpus); + else + map_fd = create_map(map_type, map_flags, 1000); + + assert(map_fd != -1); + + value[0] = 1234; + + for (key = 1; key <= 1000; key++) + assert(!bpf_update_elem(map_fd, &key, value, BPF_NOEXIST)); + + for (key = 1; key <= 1000; key++) { + if (bpf_lookup_elem(map_fd, &key, value)) + nr_losses++; + } + + close(map_fd); + + printf("nr_losses:%d(/1000)\n", nr_losses); +} + +static void do_test_parallel_lru_loss(int task, void *data) +{ + const unsigned int nr_stable_elems = 1000; + const unsigned int nr_repeats = 100000; + + int map_fd = *(int *)data; + unsigned long long stable_base; + unsigned long long key, value[nr_cpus]; + unsigned long long next_ins_key; + unsigned int nr_losses = 0; + unsigned int i; + + stable_base = task * nr_repeats * 2 + 1; + next_ins_key = stable_base; + value[0] = 1234; + for (i = 0; i < nr_stable_elems; i++) { + assert(bpf_update_elem(map_fd, &next_ins_key, value, + BPF_NOEXIST) == 0); + next_ins_key++; + } + + for (i = 0; i < nr_repeats; i++) { + int rn; + + rn = rand(); + + if (rn % 10) { + key = rn % nr_stable_elems + stable_base; + bpf_lookup_elem(map_fd, &key, value); + } else { + bpf_update_elem(map_fd, &next_ins_key, value, + BPF_NOEXIST); + next_ins_key++; + } + } + + key = stable_base; + for (i = 0; i < nr_stable_elems; i++) { + if (bpf_lookup_elem(map_fd, &key, value)) + nr_losses++; + key++; + } + + printf(" task:%d nr_losses:%u\n", task, nr_losses); +} + +static void test_parallel_lru_loss(int map_type, int map_flags, int nr_tasks) +{ + int map_fd; + + printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type, + map_flags); + + /* Give 20% more than the active working set */ + if (map_flags & BPF_F_NO_COMMON_LRU) + map_fd = create_map(map_type, map_flags, + nr_cpus * (1000 + 200)); + else + map_fd = create_map(map_type, map_flags, + nr_tasks * (1000 + 200)); + + assert(map_fd != -1); + + run_parallel(nr_tasks, do_test_parallel_lru_loss, &map_fd); + + close(map_fd); +} 
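read_keys() above expects a plain text file with one decimal key per line. A possible companion generator for experimenting with test_lru_dist (the 90/10 hot/cold split is an arbitrary illustrative choice, not part of the patch):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned int i, nr_keys = 100000;

	srand(1);	/* fixed seed: reproducible key stream */
	for (i = 0; i < nr_keys; i++) {
		if (rand() % 10)	/* 90%: hot set of 1000 keys */
			printf("%d\n", rand() % 1000);
		else			/* 10%: long cold tail */
			printf("%d\n", 1000 + rand() % 1000000);
	}
	return 0;
}

Redirecting its stdout to a file yields input the LRU can exploit, so the BPF-vs-perfect-LRU miss counts printed by do_test_lru_dist() become meaningful.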
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	int map_flags[] = {0, BPF_F_NO_COMMON_LRU};
+	const char *dist_file;
+	int nr_tasks = 1;
+	int lru_size;
+	int f;
+
+	if (argc < 4) {
+		printf("Usage: %s <dist_file> <lru_size> <nr_tasks>\n",
+		       argv[0]);
+		return -1;
+	}
+
+	dist_file = argv[1];
+	lru_size = atoi(argv[2]);
+	nr_tasks = atoi(argv[3]);
+
+	setbuf(stdout, NULL);
+
+	assert(!setrlimit(RLIMIT_MEMLOCK, &r));
+
+	srand(time(NULL));
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+	assert(nr_cpus != -1);
+	printf("nr_cpus:%d\n\n", nr_cpus);
+
+	nr_tasks = min(nr_tasks, nr_cpus);
+
+	dist_key_counts = read_keys(dist_file, &dist_keys);
+	if (!dist_key_counts) {
+		printf("%s has no key\n", dist_file);
+		return -1;
+	}
+
+	for (f = 0; f < sizeof(map_flags) / sizeof(*map_flags); f++) {
+		test_lru_loss0(BPF_MAP_TYPE_LRU_HASH, map_flags[f]);
+		test_lru_loss1(BPF_MAP_TYPE_LRU_HASH, map_flags[f]);
+		test_parallel_lru_loss(BPF_MAP_TYPE_LRU_HASH, map_flags[f],
+				       nr_tasks);
+		test_parallel_lru_dist(BPF_MAP_TYPE_LRU_HASH, map_flags[f],
+				       nr_tasks, lru_size);
+		printf("\n");
+	}
+
+	free(dist_keys);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 4761e2d65ab80a9e0c9f325edcda72c2d397f812..7a5f24543a5f06fc92e2c0c139f6af7a05f9ea37 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -1,8 +1,8 @@
-CFLAGS += -Wall -O2
+CFLAGS += -Wall -O2 -I../../../../usr/include
 
-test_objs = test_verifier test_maps
+test_objs = test_verifier test_maps test_lru_map
 
-TEST_PROGS := test_verifier test_maps test_kmod.sh
+TEST_PROGS := test_verifier test_maps test_lru_map test_kmod.sh
 TEST_FILES := $(test_objs)
 
 all: $(test_objs)
diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c
new file mode 100644
index 0000000000000000000000000000000000000000..627757ed7836c80d513baf9792ec9f4dba7348ca
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "bpf_sys.h"
+
+#define LOCAL_FREE_TARGET	(128)
+#define PERCPU_FREE_TARGET	(16)
+
+static int nr_cpus;
+
+static int create_map(int map_type, int map_flags, unsigned int size)
+{
+	int map_fd;
+
+	map_fd = bpf_map_create(map_type, sizeof(unsigned long long),
+				sizeof(unsigned long long), size, map_flags);
+
+	if (map_fd == -1)
+		perror("bpf_map_create");
+
+	return map_fd;
+}
+
+static int map_subset(int map0, int map1)
+{
+	unsigned long long next_key = 0;
+	unsigned long long value0[nr_cpus], value1[nr_cpus];
+	int ret;
+
+	while (!bpf_map_next_key(map1, &next_key, &next_key)) {
+		assert(!bpf_map_lookup(map1, &next_key, value1));
+		ret = bpf_map_lookup(map0, &next_key, value0);
+		if (ret) {
+			printf("key:%llu not found from map. %s(%d)\n",
+			       next_key, strerror(errno), errno);
+			return 0;
+		}
+		if (value0[0] != value1[0]) {
+			printf("key:%llu value0:%llu != value1:%llu\n",
+			       next_key, value0[0], value1[0]);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static int map_equal(int lru_map, int expected)
+{
+	return map_subset(lru_map, expected) && map_subset(expected, lru_map);
+}
+
+static int sched_next_online(int pid, int next_to_try)
+{
+	cpu_set_t cpuset;
+
+	if (next_to_try == nr_cpus)
+		return -1;
+
+	while (next_to_try < nr_cpus) {
+		CPU_ZERO(&cpuset);
+		CPU_SET(next_to_try++, &cpuset);
+		if (!sched_setaffinity(pid, sizeof(cpuset), &cpuset))
+			break;
+	}
+
+	return next_to_try;
+}
+
+/* Size of the LRU map is 2
+ * Add key=1 (+1 key)
+ * Add key=2 (+1 key)
+ * Lookup key=1
+ * Add key=3
+ *   => key=2 will be removed by LRU
+ * Iterate the map.  Only key=1 and key=3 are found.
+ */
+static void test_lru_sanity0(int map_type, int map_flags)
+{
+	unsigned long long key, value[nr_cpus];
+	int lru_map_fd, expected_map_fd;
+
+	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+	       map_flags);
+
+	assert(sched_next_online(0, 0) != -1);
+
+	if (map_flags & BPF_F_NO_COMMON_LRU)
+		lru_map_fd = create_map(map_type, map_flags, 2 * nr_cpus);
+	else
+		lru_map_fd = create_map(map_type, map_flags, 2);
+	assert(lru_map_fd != -1);
+
+	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, 2);
+	assert(expected_map_fd != -1);
+
+	value[0] = 1234;
+
+	/* insert key=1 element */
+
+	key = 1;
+	assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+	assert(!bpf_map_update(expected_map_fd, &key, value, BPF_NOEXIST));
+
+	/* BPF_NOEXIST means: add new element if it doesn't exist */
+	assert(bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST) == -1 &&
+	       /* key=1 already exists */
+	       errno == EEXIST);
+
+	/* an invalid flags value is rejected */
+	assert(bpf_map_update(lru_map_fd, &key, value, -1) == -1 &&
+	       errno == EINVAL);
+
+	/* insert key=2 element */
+
+	/* check that key=2 is not found */
+	key = 2;
+	assert(bpf_map_lookup(lru_map_fd, &key, value) == -1 &&
+	       errno == ENOENT);
+
+	/* BPF_EXIST means: update existing element */
+	assert(bpf_map_update(lru_map_fd, &key, value, BPF_EXIST) == -1 &&
+	       /* key=2 is not there */
+	       errno == ENOENT);
+
+	assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+
+	/* insert key=3 element */
+
+	/* check that key=3 is not found */
+	key = 3;
+	assert(bpf_map_lookup(lru_map_fd, &key, value) == -1 &&
+	       errno == ENOENT);
+
+	/* check that key=1 can be found and mark the ref bit to
+	 * stop LRU from removing key=1
+	 */
+	key = 1;
+	assert(!bpf_map_lookup(lru_map_fd, &key, value));
+	assert(value[0] == 1234);
+
+	key = 3;
+	assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+	assert(!bpf_map_update(expected_map_fd, &key, value, BPF_NOEXIST));
+
+	/* key=2 has been removed from the LRU */
+	key = 2;
+	assert(bpf_map_lookup(lru_map_fd, &key, value) == -1);
+
+	assert(map_equal(lru_map_fd, expected_map_fd));
+
+	close(expected_map_fd);
+	close(lru_map_fd);
+
+	printf("Pass\n");
+}
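+
+/* A minimal usage sketch of the ref-bit behaviour exercised by
+ * test_lru_sanity0 above (illustrative only; this helper is
+ * hypothetical and not called by the tests).  A successful lookup
+ * marks a node's ref bit, so when a full map needs a free node it is
+ * the unreferenced element that gets evicted.
+ */
+static inline void lru_ref_bit_sketch(int lru_map_fd)
+{
+	unsigned long long key, value[nr_cpus];
+
+	value[0] = 1234;
+
+	key = 1;
+	bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST);
+	key = 2;
+	bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST);
+
+	/* Set the ref bit of key=1; key=2 stays unreferenced */
+	key = 1;
+	bpf_map_lookup(lru_map_fd, &key, value);
+
+	/* On a map of size 2, this insert evicts key=2, not key=1 */
+	key = 3;
+	bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST);
+}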
+
+/* Size of the LRU map is 1.5*tgt_free
+ * Insert 1 to tgt_free (+tgt_free keys)
+ * Lookup 1 to tgt_free/2
+ * Insert 1+tgt_free to 2*tgt_free (+tgt_free keys)
+ *   => 1+tgt_free/2 to tgt_free will be removed by LRU
+ */
+static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
+{
+	unsigned long long key, end_key, value[nr_cpus];
+	int lru_map_fd, expected_map_fd;
+	unsigned int batch_size;
+	unsigned int map_size;
+
+	if (map_flags & BPF_F_NO_COMMON_LRU)
+		/* The per-cpu LRU list (i.e. each CPU has its own LRU
+		 * list) has no local free list to draw from: it can
+		 * only get free nodes by shrinking its own LRU list.
+		 * Hence, this test does not apply to
+		 * BPF_F_NO_COMMON_LRU.
+		 */
+		return;
+
+	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+	       map_flags);
+
+	assert(sched_next_online(0, 0) != -1);
+
+	batch_size = tgt_free / 2;
+	assert(batch_size * 2 == tgt_free);
+
+	map_size = tgt_free + batch_size;
+	lru_map_fd = create_map(map_type, map_flags, map_size);
+	assert(lru_map_fd != -1);
+
+	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
+	assert(expected_map_fd != -1);
+
+	value[0] = 1234;
+
+	/* Insert 1 to tgt_free (+tgt_free keys) */
+	end_key = 1 + tgt_free;
+	for (key = 1; key < end_key; key++)
+		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+
+	/* Lookup 1 to tgt_free/2 */
+	end_key = 1 + batch_size;
+	for (key = 1; key < end_key; key++) {
+		assert(!bpf_map_lookup(lru_map_fd, &key, value));
+		assert(!bpf_map_update(expected_map_fd, &key, value,
+				       BPF_NOEXIST));
+	}
+
+	/* Insert 1+tgt_free to 2*tgt_free
+	 *   => 1+tgt_free/2 to tgt_free will be
+	 *      removed by LRU
+	 */
+	key = 1 + tgt_free;
+	end_key = key + tgt_free;
+	for (; key < end_key; key++) {
+		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+		assert(!bpf_map_update(expected_map_fd, &key, value,
+				       BPF_NOEXIST));
+	}
+
+	assert(map_equal(lru_map_fd, expected_map_fd));
+
+	close(expected_map_fd);
+	close(lru_map_fd);
+
+	printf("Pass\n");
+}
+
+/* Size of the LRU map is 1.5*tgt_free
+ * Insert 1 to tgt_free (+tgt_free keys)
+ * Update 1 to tgt_free/2
+ *   => The original 1 to tgt_free/2 will be removed due to
+ *      the LRU shrink process
+ * Re-insert 1 to tgt_free/2 again and do a lookup immediately
+ * Insert 1+tgt_free to tgt_free*3/2
+ * Insert 1+tgt_free*3/2 to tgt_free*5/2
+ *   => Keys 1+tgt_free to tgt_free*3/2 will be removed from
+ *      the LRU because they have never been looked up and
+ *      their ref bits are not set
+ */
+static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
+{
+	unsigned long long key, value[nr_cpus];
+	unsigned long long end_key;
+	int lru_map_fd, expected_map_fd;
+	unsigned int batch_size;
+	unsigned int map_size;
+
+	if (map_flags & BPF_F_NO_COMMON_LRU)
+		/* The per-cpu LRU list (i.e. each CPU has its own LRU
+		 * list) has no local free list to draw from: it can
+		 * only get free nodes by shrinking its own LRU list.
+		 * Hence, this test does not apply to
+		 * BPF_F_NO_COMMON_LRU.
+		 */
+		return;
+
+	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+	       map_flags);
+
+	assert(sched_next_online(0, 0) != -1);
+
+	batch_size = tgt_free / 2;
+	assert(batch_size * 2 == tgt_free);
+
+	map_size = tgt_free + batch_size;
+	if (map_flags & BPF_F_NO_COMMON_LRU)
+		lru_map_fd = create_map(map_type, map_flags,
+					map_size * nr_cpus);
+	else
+		lru_map_fd = create_map(map_type, map_flags, map_size);
+	assert(lru_map_fd != -1);
+
+	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
+	assert(expected_map_fd != -1);
+
+	value[0] = 1234;
+
+	/* Insert 1 to tgt_free (+tgt_free keys) */
+	end_key = 1 + tgt_free;
+	for (key = 1; key < end_key; key++)
+		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+
+	/* Any bpf_map_update first has to acquire a new node
+	 * from the LRU.
+	 *
+	 * The local list is running out of free nodes.
+	 * It gets nodes from the global LRU list, which
+	 * shrinks the inactive list to get tgt_free
+	 * number of free nodes.
+	 *
+	 * Hence, the oldest keys 1 to tgt_free/2
+	 * are removed from the LRU list.
+	 */
+	key = 1;
+	if (map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+		assert(!bpf_map_delete(lru_map_fd, &key));
+	} else {
+		assert(bpf_map_update(lru_map_fd, &key, value, BPF_EXIST));
+	}
+
+	/* Re-insert 1 to tgt_free/2 again and do a lookup
+	 * immediately.
+	 */
+	end_key = 1 + batch_size;
+	value[0] = 4321;
+	for (key = 1; key < end_key; key++) {
+		assert(bpf_map_lookup(lru_map_fd, &key, value));
+		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+		assert(!bpf_map_lookup(lru_map_fd, &key, value));
+		assert(value[0] == 4321);
+		assert(!bpf_map_update(expected_map_fd, &key, value,
+				       BPF_NOEXIST));
+	}
+
+	value[0] = 1234;
+
+	/* Insert 1+tgt_free to tgt_free*3/2 */
+	end_key = 1 + tgt_free + batch_size;
+	for (key = 1 + tgt_free; key < end_key; key++)
+		/* These newly added but not referenced keys will be
+		 * gone during the next LRU shrink.
+		 */
+		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+
+	/* Insert 1+tgt_free*3/2 to tgt_free*5/2 */
+	end_key = key + tgt_free;
+	for (; key < end_key; key++) {
+		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+		assert(!bpf_map_update(expected_map_fd, &key, value,
+				       BPF_NOEXIST));
+	}
+
+	assert(map_equal(lru_map_fd, expected_map_fd));
+
+	close(expected_map_fd);
+	close(lru_map_fd);
+
+	printf("Pass\n");
+}
+
+/* Size of the LRU map is 2*tgt_free
+ * It tests the active/inactive list rotation
+ * Insert 1 to 2*tgt_free (+2*tgt_free keys)
+ * Lookup key 1 to tgt_free*3/2
+ * Add 1+2*tgt_free to tgt_free*5/2 (+tgt_free/2 keys)
+ *   => key 1+tgt_free*3/2 to 2*tgt_free are removed from LRU
+ */
+static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
+{
+	unsigned long long key, end_key, value[nr_cpus];
+	int lru_map_fd, expected_map_fd;
+	unsigned int batch_size;
+	unsigned int map_size;
+
+	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+	       map_flags);
+
+	assert(sched_next_online(0, 0) != -1);
+
+	batch_size = tgt_free / 2;
+	assert(batch_size * 2 == tgt_free);
+
+	map_size = tgt_free * 2;
+	if (map_flags & BPF_F_NO_COMMON_LRU)
+		lru_map_fd = create_map(map_type, map_flags,
+					map_size * nr_cpus);
+	else
+		lru_map_fd = create_map(map_type, map_flags, map_size);
+	assert(lru_map_fd != -1);
+
+	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
+	assert(expected_map_fd != -1);
+
+	value[0] = 1234;
+
+	/* Insert 1 to 2*tgt_free (+2*tgt_free keys) */
+	end_key = 1 + (2 * tgt_free);
+	for (key = 1; key < end_key; key++)
+		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+
+	/* Lookup key 1 to tgt_free*3/2 */
+	end_key = tgt_free + batch_size;
+	for (key = 1; key < end_key; key++) {
+		assert(!bpf_map_lookup(lru_map_fd, &key, value));
+		assert(!bpf_map_update(expected_map_fd, &key, value,
+				       BPF_NOEXIST));
+	}
+
+	/* Add 1+2*tgt_free to tgt_free*5/2
+	 * (+tgt_free/2 keys)
+	 */
+	key = 2 * tgt_free + 1;
+	end_key = key + batch_size;
+	for (; key < end_key; key++) {
+		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+		assert(!bpf_map_update(expected_map_fd, &key, value,
+				       BPF_NOEXIST));
+	}
+
+	assert(map_equal(lru_map_fd, expected_map_fd));
+
+	close(expected_map_fd);
+	close(lru_map_fd);
+
+	printf("Pass\n");
+}
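+
+/* A sketch of the survivor rule implied by test_lru_sanity3 above
+ * (illustrative only; the helper is hypothetical and not called by
+ * the tests): after looking up keys 1 to tgt_free*3/2 and inserting
+ * tgt_free/2 new keys, only the never-referenced tail
+ * 1+tgt_free*3/2 to 2*tgt_free is evicted.
+ */
+static inline int lru_sanity3_key_survives(unsigned long long key,
+					   unsigned int tgt_free)
+{
+	return key <= tgt_free + tgt_free / 2 || key > 2 * tgt_free;
+}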
value[nr_cpus]; + unsigned long long end_key; + + printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type, + map_flags); + + assert(sched_next_online(0, 0) != -1); + + if (map_flags & BPF_F_NO_COMMON_LRU) + lru_map_fd = create_map(map_type, map_flags, + 3 * tgt_free * nr_cpus); + else + lru_map_fd = create_map(map_type, map_flags, 3 * tgt_free); + assert(lru_map_fd != -1); + + expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, + 3 * tgt_free); + assert(expected_map_fd != -1); + + value[0] = 1234; + + for (key = 1; key <= 2 * tgt_free; key++) + assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST)); + + key = 1; + assert(bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST)); + + for (key = 1; key <= tgt_free; key++) { + assert(!bpf_map_lookup(lru_map_fd, &key, value)); + assert(!bpf_map_update(expected_map_fd, &key, value, + BPF_NOEXIST)); + } + + for (; key <= 2 * tgt_free; key++) { + assert(!bpf_map_delete(lru_map_fd, &key)); + assert(bpf_map_delete(lru_map_fd, &key)); + } + + end_key = key + 2 * tgt_free; + for (; key < end_key; key++) { + assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST)); + assert(!bpf_map_update(expected_map_fd, &key, value, + BPF_NOEXIST)); + } + + assert(map_equal(lru_map_fd, expected_map_fd)); + + close(expected_map_fd); + close(lru_map_fd); + + printf("Pass\n"); +} + +static void do_test_lru_sanity5(unsigned long long last_key, int map_fd) +{ + unsigned long long key, value[nr_cpus]; + + /* Ensure the last key inserted by previous CPU can be found */ + assert(!bpf_map_lookup(map_fd, &last_key, value)); + + value[0] = 1234; + + key = last_key + 1; + assert(!bpf_map_update(map_fd, &key, value, BPF_NOEXIST)); + assert(!bpf_map_lookup(map_fd, &key, value)); + + /* Cannot find the last key because it was removed by LRU */ + assert(bpf_map_lookup(map_fd, &last_key, value)); +} + +/* Test map with only one element */ +static void test_lru_sanity5(int map_type, int map_flags) +{ + unsigned long long key, value[nr_cpus]; + int next_sched_cpu = 0; + int map_fd; + int i; + + if (map_flags & BPF_F_NO_COMMON_LRU) + return; + + printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type, + map_flags); + + map_fd = create_map(map_type, map_flags, 1); + assert(map_fd != -1); + + value[0] = 1234; + key = 0; + assert(!bpf_map_update(map_fd, &key, value, BPF_NOEXIST)); + + for (i = 0; i < nr_cpus; i++) { + pid_t pid; + + pid = fork(); + if (pid == 0) { + next_sched_cpu = sched_next_online(0, next_sched_cpu); + if (next_sched_cpu != -1) + do_test_lru_sanity5(key, map_fd); + exit(0); + } else if (pid == -1) { + printf("couldn't spawn #%d process\n", i); + exit(1); + } else { + int status; + + /* It is mostly redundant and just allow the parent + * process to update next_shced_cpu for the next child + * process + */ + next_sched_cpu = sched_next_online(pid, next_sched_cpu); + + assert(waitpid(pid, &status, 0) == pid); + assert(status == 0); + key++; + } + } + + close(map_fd); + + printf("Pass\n"); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int map_types[] = {BPF_MAP_TYPE_LRU_HASH, + BPF_MAP_TYPE_LRU_PERCPU_HASH}; + int map_flags[] = {0, BPF_F_NO_COMMON_LRU}; + int t, f; + + setbuf(stdout, NULL); + + assert(!setrlimit(RLIMIT_MEMLOCK, &r)); + + nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + assert(nr_cpus != -1); + printf("nr_cpus:%d\n\n", nr_cpus); + + for (f = 0; f < sizeof(map_flags) / sizeof(*map_flags); f++) { + unsigned int tgt_free = (map_flags[f] & BPF_F_NO_COMMON_LRU) ? 
+ PERCPU_FREE_TARGET : LOCAL_FREE_TARGET; + + for (t = 0; t < sizeof(map_types) / sizeof(*map_types); t++) { + test_lru_sanity0(map_types[t], map_flags[f]); + test_lru_sanity1(map_types[t], map_flags[f], tgt_free); + test_lru_sanity2(map_types[t], map_flags[f], tgt_free); + test_lru_sanity3(map_types[t], map_flags[f], tgt_free); + test_lru_sanity4(map_types[t], map_flags[f], tgt_free); + test_lru_sanity5(map_types[t], map_flags[f]); + + printf("\n"); + } + } + + return 0; +}
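+
+/* Usage sketch (an assumed workflow, not part of this patch's text):
+ * build the selftests and run the binary directly, e.g.
+ *
+ *	$ make -C tools/testing/selftests/bpf
+ *	$ ./tools/testing/selftests/bpf/test_lru_map
+ *
+ * Each sanity test above prints "Pass" once per map_type/map_flags
+ * combination, so a clean run ends with no assertion failures.
+ */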