perf stat: Introduce 'bperf' to share hardware PMCs with BPF

The perf tool uses performance monitoring counters (PMCs) to monitor system performance. The PMCs are limited hardware resources. For example, Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu. Modern data center systems use these PMCs in many different ways: system level monitoring, (maybe nested) container level monitoring, per process monitoring, profiling (in sample mode), etc. In some cases, there are more active perf_events than available hardware PMCs. To allow all perf_events to have a chance to run, it is necessary to do expensive time multiplexing of events. On the other hand, many monitoring tools count the common metrics (cycles, instructions). It is a waste to have multiple tools create multiple perf_events of "cycles" and occupy multiple PMCs. bperf tries to reduce such wastes by allowing multiple perf_events of "cycles" or "instructions" (at different scopes) to share PMUs. Instead of having each perf-stat session to read its own perf_events, bperf uses BPF programs to read the perf_events and aggregate readings to BPF maps. Then, the perf-stat session(s) reads the values from these BPF maps. Please refer to the comment before the definition of bperf_ops for the description of bperf architecture. bperf is off by default. To enable it, pass --bpf-counters option to perf-stat. bperf uses a BPF hashmap to share information about BPF programs and maps used by bperf. This map is pinned to bpffs. The default path is /sys/fs/bpf/perf_attr_map. The user could change the path with option --bpf-attr-map. Committer testing: # dmesg|grep "Performance Events" -A5 [ 0.225277] Performance Events: Fam17h+ core perfctr, AMD PMU driver. [ 0.225280] ... version: 0 [ 0.225280] ... bit width: 48 [ 0.225281] ... generic registers: 6 [ 0.225281] ... value mask: 0000ffffffffffff [ 0.225281] ... max period: 00007fffffffffff # # for a in $(seq 6) ; do perf stat -a -e cycles,instructions sleep 100000 & done [1] 2436231 [2] 2436232 [3] 2436233 [4] 2436234 [5] 2436235 [6] 2436236 # perf stat -a -e cycles,instructions sleep 0.1 Performance counter stats for 'system wide': 310,326,987 cycles (41.87%) 236,143,290 instructions # 0.76 insn per cycle (41.87%) 0.100800885 seconds time elapsed # We can see that the counters were enabled for this workload 41.87% of the time. Now with --bpf-counters: # for a in $(seq 32) ; do perf stat --bpf-counters -a -e cycles,instructions sleep 100000 & done [1] 2436514 [2] 2436515 [3] 2436516 [4] 2436517 [5] 2436518 [6] 2436519 [7] 2436520 [8] 2436521 [9] 2436522 [10] 2436523 [11] 2436524 [12] 2436525 [13] 2436526 [14] 2436527 [15] 2436528 [16] 2436529 [17] 2436530 [18] 2436531 [19] 2436532 [20] 2436533 [21] 2436534 [22] 2436535 [23] 2436536 [24] 2436537 [25] 2436538 [26] 2436539 [27] 2436540 [28] 2436541 [29] 2436542 [30] 2436543 [31] 2436544 [32] 2436545 # # ls -la /sys/fs/bpf/perf_attr_map -rw-------. 1 root root 0 Mar 23 14:53 /sys/fs/bpf/perf_attr_map # bpftool map | grep bperf | wc -l 64 # # bpftool map | tail 1265: percpu_array name accum_readings flags 0x0 key 4B value 24B max_entries 1 memlock 4096B 1266: hash name filter flags 0x0 key 4B value 4B max_entries 1 memlock 4096B 1267: array name bperf_fo.bss flags 0x400 key 4B value 8B max_entries 1 memlock 4096B btf_id 996 pids perf(2436545) 1268: percpu_array name accum_readings flags 0x0 key 4B value 24B max_entries 1 memlock 4096B 1269: hash name filter flags 0x0 key 4B value 4B max_entries 1 memlock 4096B 1270: array name bperf_fo.bss flags 0x400 key 4B value 8B max_entries 1 memlock 4096B btf_id 997 pids perf(2436541) 1285: array name pid_iter.rodata flags 0x480 key 4B value 4B max_entries 1 memlock 4096B btf_id 1017 frozen pids bpftool(2437504) 1286: array flags 0x0 key 4B value 32B max_entries 1 memlock 4096B # # bpftool map dump id 1268 | tail value (CPU 21): 8f f3 bc ca 00 00 00 00 80 fd 2a d1 4d 00 00 00 80 fd 2a d1 4d 00 00 00 value (CPU 22): 7e d5 64 4d 00 00 00 00 a4 8a 2e ee 4d 00 00 00 a4 8a 2e ee 4d 00 00 00 value (CPU 23): a7 78 3e 06 01 00 00 00 b2 34 94 f6 4d 00 00 00 b2 34 94 f6 4d 00 00 00 Found 1 element # bpftool map dump id 1268 | tail value (CPU 21): c6 8b d9 ca 00 00 00 00 20 c6 fc 83 4e 00 00 00 20 c6 fc 83 4e 00 00 00 value (CPU 22): 9c b4 d2 4d 00 00 00 00 3e 0c df 89 4e 00 00 00 3e 0c df 89 4e 00 00 00 value (CPU 23): 18 43 66 06 01 00 00 00 5b 69 ed 83 4e 00 00 00 5b 69 ed 83 4e 00 00 00 Found 1 element # bpftool map dump id 1268 | tail value (CPU 21): f2 6e db ca 00 00 00 00 92 67 4c ba 4e 00 00 00 92 67 4c ba 4e 00 00 00 value (CPU 22): dc 8e e1 4d 00 00 00 00 d9 32 7a c5 4e 00 00 00 d9 32 7a c5 4e 00 00 00 value (CPU 23): bd 2b 73 06 01 00 00 00 7c 73 87 bf 4e 00 00 00 7c 73 87 bf 4e 00 00 00 Found 1 element # # perf stat --bpf-counters -a -e cycles,instructions sleep 0.1 Performance counter stats for 'system wide': 119,410,122 cycles 152,105,479 instructions # 1.27 insn per cycle 0.101395093 seconds time elapsed # See? We had the counters enabled all the time. Signed-off-by: N Song Liu <songliubraving@fb.com> Reviewed-by: N Jiri Olsa <jolsa@kernel.org> Acked-by: N Namhyung Kim <namhyung@kernel.org> Tested-by: N Arnaldo Carvalho de Melo <acme@redhat.com> Cc: kernel-team@fb.com Link: http://lore.kernel.org/lkml/20210316211837.910506-2-songliubraving@fb.comSigned-off-by: N Arnaldo Carvalho de Melo <acme@redhat.com>

perf stat: Introduce 'bperf' to share hardware PMCs with BPF
The perf tool uses performance monitoring counters (PMCs) to monitor system performance. The PMCs are limited hardware resources. For example, Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu. Modern data center systems use these PMCs in many different ways: system level monitoring, (maybe nested) container level monitoring, per process monitoring, profiling (in sample mode), etc. In some cases, there are more active perf_events than available hardware PMCs. To allow all perf_events to have a chance to run, it is necessary to do expensive time multiplexing of events. On the other hand, many monitoring tools count the common metrics (cycles, instructions). It is a waste to have multiple tools create multiple perf_events of "cycles" and occupy multiple PMCs. bperf tries to reduce such wastes by allowing multiple perf_events of "cycles" or "instructions" (at different scopes) to share PMUs. Instead of having each perf-stat session to read its own perf_events, bperf uses BPF programs to read the perf_events and aggregate readings to BPF maps. Then, the perf-stat session(s) reads the values from these BPF maps. Please refer to the comment before the definition of bperf_ops for the description of bperf architecture. bperf is off by default. To enable it, pass --bpf-counters option to perf-stat. bperf uses a BPF hashmap to share information about BPF programs and maps used by bperf. This map is pinned to bpffs. The default path is /sys/fs/bpf/perf_attr_map. The user could change the path with option --bpf-attr-map. Committer testing: # dmesg|grep "Performance Events" -A5 [ 0.225277] Performance Events: Fam17h+ core perfctr, AMD PMU driver. [ 0.225280] ... version: 0 [ 0.225280] ... bit width: 48 [ 0.225281] ... generic registers: 6 [ 0.225281] ... value mask: 0000ffffffffffff [ 0.225281] ... max period: 00007fffffffffff # # for a in $(seq 6) ; do perf stat -a -e cycles,instructions sleep 100000 & done [1] 2436231 [2] 2436232 [3] 2436233 [4] 2436234 [5] 2436235 [6] 2436236 # perf stat -a -e cycles,instructions sleep 0.1 Performance counter stats for 'system wide': 310,326,987 cycles (41.87%) 236,143,290 instructions # 0.76 insn per cycle (41.87%) 0.100800885 seconds time elapsed # We can see that the counters were enabled for this workload 41.87% of the time. Now with --bpf-counters: # for a in $(seq 32) ; do perf stat --bpf-counters -a -e cycles,instructions sleep 100000 & done [1] 2436514 [2] 2436515 [3] 2436516 [4] 2436517 [5] 2436518 [6] 2436519 [7] 2436520 [8] 2436521 [9] 2436522 [10] 2436523 [11] 2436524 [12] 2436525 [13] 2436526 [14] 2436527 [15] 2436528 [16] 2436529 [17] 2436530 [18] 2436531 [19] 2436532 [20] 2436533 [21] 2436534 [22] 2436535 [23] 2436536 [24] 2436537 [25] 2436538 [26] 2436539 [27] 2436540 [28] 2436541 [29] 2436542 [30] 2436543 [31] 2436544 [32] 2436545 # # ls -la /sys/fs/bpf/perf_attr_map -rw-------. 1 root root 0 Mar 23 14:53 /sys/fs/bpf/perf_attr_map # bpftool map | grep bperf | wc -l 64 # # bpftool map | tail 1265: percpu_array name accum_readings flags 0x0 key 4B value 24B max_entries 1 memlock 4096B 1266: hash name filter flags 0x0 key 4B value 4B max_entries 1 memlock 4096B 1267: array name bperf_fo.bss flags 0x400 key 4B value 8B max_entries 1 memlock 4096B btf_id 996 pids perf(2436545) 1268: percpu_array name accum_readings flags 0x0 key 4B value 24B max_entries 1 memlock 4096B 1269: hash name filter flags 0x0 key 4B value 4B max_entries 1 memlock 4096B 1270: array name bperf_fo.bss flags 0x400 key 4B value 8B max_entries 1 memlock 4096B btf_id 997 pids perf(2436541) 1285: array name pid_iter.rodata flags 0x480 key 4B value 4B max_entries 1 memlock 4096B btf_id 1017 frozen pids bpftool(2437504) 1286: array flags 0x0 key 4B value 32B max_entries 1 memlock 4096B # # bpftool map dump id 1268 | tail value (CPU 21): 8f f3 bc ca 00 00 00 00 80 fd 2a d1 4d 00 00 00 80 fd 2a d1 4d 00 00 00 value (CPU 22): 7e d5 64 4d 00 00 00 00 a4 8a 2e ee 4d 00 00 00 a4 8a 2e ee 4d 00 00 00 value (CPU 23): a7 78 3e 06 01 00 00 00 b2 34 94 f6 4d 00 00 00 b2 34 94 f6 4d 00 00 00 Found 1 element # bpftool map dump id 1268 | tail value (CPU 21): c6 8b d9 ca 00 00 00 00 20 c6 fc 83 4e 00 00 00 20 c6 fc 83 4e 00 00 00 value (CPU 22): 9c b4 d2 4d 00 00 00 00 3e 0c df 89 4e 00 00 00 3e 0c df 89 4e 00 00 00 value (CPU 23): 18 43 66 06 01 00 00 00 5b 69 ed 83 4e 00 00 00 5b 69 ed 83 4e 00 00 00 Found 1 element # bpftool map dump id 1268 | tail value (CPU 21): f2 6e db ca 00 00 00 00 92 67 4c ba 4e 00 00 00 92 67 4c ba 4e 00 00 00 value (CPU 22): dc 8e e1 4d 00 00 00 00 d9 32 7a c5 4e 00 00 00 d9 32 7a c5 4e 00 00 00 value (CPU 23): bd 2b 73 06 01 00 00 00 7c 73 87 bf 4e 00 00 00 7c 73 87 bf 4e 00 00 00 Found 1 element # # perf stat --bpf-counters -a -e cycles,instructions sleep 0.1 Performance counter stats for 'system wide': 119,410,122 cycles 152,105,479 instructions # 1.27 insn per cycle 0.101395093 seconds time elapsed # See? We had the counters enabled all the time. Signed-off-by: N Song Liu <songliubraving@fb.com> Reviewed-by: N Jiri Olsa <jolsa@kernel.org> Acked-by: N Namhyung Kim <namhyung@kernel.org> Tested-by: N Arnaldo Carvalho de Melo <acme@redhat.com> Cc: kernel-team@fb.com Link: http://lore.kernel.org/lkml/20210316211837.910506-2-songliubraving@fb.comSigned-off-by: N Arnaldo Carvalho de Melo <acme@redhat.com>
7fac83aa · Song Liu · Arnaldo Carvalho de Melo · 4d39c89f · 7fac83aa · 7fac83aa
10 changed file
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -93,6 +93,17 @@ report::

        1.102235068 seconds time elapsed

+--bpf-counters::
+	Use BPF programs to aggregate readings from perf_events.  This
+	allows multiple perf-stat sessions that are counting the same metric (cycles,
+	instructions, etc.) to share hardware counters.
+
+--bpf-attr-map::
+	With option "--bpf-counters", different perf-stat sessions share
+	information about shared BPF programs and maps via a pinned hashmap.
+	Use "--bpf-attr-map" to specify the path of this pinned hashmap.
+	The default path is /sys/fs/bpf/perf_attr_map.
+
 ifdef::HAVE_LIBPFM[]
 --pfm-events events::
 Select a PMU event using libpfm4 syntax (see http://perfmon2.sf.net)

--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -1007,6 +1007,7 @@ python-clean:
 SKEL_OUT := $(abspath $(OUTPUT)util/bpf_skel)
 SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp)
 SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h
+SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h

 ifdef BUILD_BPF_SKEL
 BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool

--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -792,6 +792,12 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	}

 	evlist__for_each_cpu (evsel_list, i, cpu) {
+		/*
+		 * bperf calls evsel__open_per_cpu() in bperf__load(), so
+		 * no need to call it again here.
+		 */
+		if (target.use_bpf)
+			break;
 		affinity__set(&affinity, cpu);

 		evlist__for_each_entry(evsel_list, counter) {
@@ -1146,6 +1152,10 @@ static struct option stat_options[] = {
 #ifdef HAVE_BPF_SKEL
 	OPT_STRING('b', "bpf-prog", &target.bpf_str, "bpf-prog-id",
 		   "stat events on existing bpf program id"),
+	OPT_BOOLEAN(0, "bpf-counters", &target.use_bpf,
+		    "use bpf program to count events"),
+	OPT_STRING(0, "bpf-attr-map", &target.attr_map, "attr-map-path",
+		   "path to perf_event_attr map"),
 #endif
 	OPT_BOOLEAN('a', "all-cpus", &target.system_wide,
 		    "system-wide collection from all CPUs"),

--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -5,6 +5,7 @@
 #include <assert.h>
 #include <limits.h>
 #include <unistd.h>
+#include <sys/file.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <linux/err.h>
@@ -12,14 +13,45 @@
 #include <bpf/bpf.h>
 #include <bpf/btf.h>
 #include <bpf/libbpf.h>
+#include <api/fs/fs.h>

 #include "bpf_counter.h"
 #include "counts.h"
 #include "debug.h"
 #include "evsel.h"
+#include "evlist.h"
 #include "target.h"
+#include "cpumap.h"
+#include "thread_map.h"

 #include "bpf_skel/bpf_prog_profiler.skel.h"
+#include "bpf_skel/bperf_u.h"
+#include "bpf_skel/bperf_leader.skel.h"
+#include "bpf_skel/bperf_follower.skel.h"
+
+/*
+ * bperf uses a hashmap, the attr_map, to track all the leader programs.
+ * The hashmap is pinned in bpffs. flock() on this file is used to ensure
+ * no concurrent access to the attr_map.  The key of attr_map is struct
+ * perf_event_attr, and the value is struct perf_event_attr_map_entry.
+ *
+ * struct perf_event_attr_map_entry contains two __u32 IDs, bpf_link of the
+ * leader prog, and the diff_map. Each perf-stat session holds a reference
+ * to the bpf_link to make sure the leader prog is attached to sched_switch
+ * tracepoint.
+ *
+ * Since the hashmap only contains IDs of the bpf_link and diff_map, it
+ * does not hold any references to the leader program. Once all perf-stat
+ * sessions of these events exit, the leader prog, its maps, and the
+ * perf_events will be freed.
+ */
+struct perf_event_attr_map_entry {
+	__u32 link_id;
+	__u32 diff_map_id;
+};
+
+#define DEFAULT_ATTR_MAP_PATH "fs/bpf/perf_attr_map"
+#define ATTR_MAP_SIZE 16

 static inline void *u64_to_ptr(__u64 ptr)
 {
@@ -274,17 +306,494 @@ struct bpf_counter_ops bpf_program_profiler_ops = {
 	.install_pe = bpf_program_profiler__install_pe,
 };

+static __u32 bpf_link_get_id(int fd)
+{
+	struct bpf_link_info link_info = {0};
+	__u32 link_info_len = sizeof(link_info);
+
+	bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len);
+	return link_info.id;
+}
+
+static __u32 bpf_link_get_prog_id(int fd)
+{
+	struct bpf_link_info link_info = {0};
+	__u32 link_info_len = sizeof(link_info);
+
+	bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len);
+	return link_info.prog_id;
+}
+
+static __u32 bpf_map_get_id(int fd)
+{
+	struct bpf_map_info map_info = {0};
+	__u32 map_info_len = sizeof(map_info);
+
+	bpf_obj_get_info_by_fd(fd, &map_info, &map_info_len);
+	return map_info.id;
+}
+
+static int bperf_lock_attr_map(struct target *target)
+{
+	char path[PATH_MAX];
+	int map_fd, err;
+
+	if (target->attr_map) {
+		scnprintf(path, PATH_MAX, "%s", target->attr_map);
+	} else {
+		scnprintf(path, PATH_MAX, "%s/%s", sysfs__mountpoint(),
+			  DEFAULT_ATTR_MAP_PATH);
+	}
+
+	if (access(path, F_OK)) {
+		map_fd = bpf_create_map(BPF_MAP_TYPE_HASH,
+					sizeof(struct perf_event_attr),
+					sizeof(struct perf_event_attr_map_entry),
+					ATTR_MAP_SIZE, 0);
+		if (map_fd < 0)
+			return -1;
+
+		err = bpf_obj_pin(map_fd, path);
+		if (err) {
+			/* someone pinned the map in parallel? */
+			close(map_fd);
+			map_fd = bpf_obj_get(path);
+			if (map_fd < 0)
+				return -1;
+		}
+	} else {
+		map_fd = bpf_obj_get(path);
+		if (map_fd < 0)
+			return -1;
+	}
+
+	err = flock(map_fd, LOCK_EX);
+	if (err) {
+		close(map_fd);
+		return -1;
+	}
+	return map_fd;
+}
+
+/* trigger the leader program on a cpu */
+static int bperf_trigger_reading(int prog_fd, int cpu)
+{
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
+			    .ctx_in = NULL,
+			    .ctx_size_in = 0,
+			    .flags = BPF_F_TEST_RUN_ON_CPU,
+			    .cpu = cpu,
+			    .retval = 0,
+		);
+
+	return bpf_prog_test_run_opts(prog_fd, &opts);
+}
+
+static int bperf_check_target(struct evsel *evsel,
+			      struct target *target,
+			      enum bperf_filter_type *filter_type,
+			      __u32 *filter_entry_cnt)
+{
+	if (evsel->leader->core.nr_members > 1) {
+		pr_err("bpf managed perf events do not yet support groups.\n");
+		return -1;
+	}
+
+	/* determine filter type based on target */
+	if (target->system_wide) {
+		*filter_type = BPERF_FILTER_GLOBAL;
+		*filter_entry_cnt = 1;
+	} else if (target->cpu_list) {
+		*filter_type = BPERF_FILTER_CPU;
+		*filter_entry_cnt = perf_cpu_map__nr(evsel__cpus(evsel));
+	} else if (target->tid) {
+		*filter_type = BPERF_FILTER_PID;
+		*filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
+	} else if (target->pid || evsel->evlist->workload.pid != -1) {
+		*filter_type = BPERF_FILTER_TGID;
+		*filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
+	} else {
+		pr_err("bpf managed perf events do not yet support these targets.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static	struct perf_cpu_map *all_cpu_map;
+
+static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
+				       struct perf_event_attr_map_entry *entry)
+{
+	struct bperf_leader_bpf *skel = bperf_leader_bpf__open();
+	int link_fd, diff_map_fd, err;
+	struct bpf_link *link = NULL;
+
+	if (!skel) {
+		pr_err("Failed to open leader skeleton\n");
+		return -1;
+	}
+
+	bpf_map__resize(skel->maps.events, libbpf_num_possible_cpus());
+	err = bperf_leader_bpf__load(skel);
+	if (err) {
+		pr_err("Failed to load leader skeleton\n");
+		goto out;
+	}
+
+	err = -1;
+	link = bpf_program__attach(skel->progs.on_switch);
+	if (!link) {
+		pr_err("Failed to attach leader program\n");
+		goto out;
+	}
+
+	link_fd = bpf_link__fd(link);
+	diff_map_fd = bpf_map__fd(skel->maps.diff_readings);
+	entry->link_id = bpf_link_get_id(link_fd);
+	entry->diff_map_id = bpf_map_get_id(diff_map_fd);
+	err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, entry, BPF_ANY);
+	assert(err == 0);
+
+	evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id);
+	assert(evsel->bperf_leader_link_fd >= 0);
+
+	/*
+	 * save leader_skel for install_pe, which is called within
+	 * following evsel__open_per_cpu call
+	 */
+	evsel->leader_skel = skel;
+	evsel__open_per_cpu(evsel, all_cpu_map, -1);
+
+out:
+	bperf_leader_bpf__destroy(skel);
+	bpf_link__destroy(link);
+	return err;
+}
+
+static int bperf__load(struct evsel *evsel, struct target *target)
+{
+	struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff};
+	int attr_map_fd, diff_map_fd = -1, err;
+	enum bperf_filter_type filter_type;
+	__u32 filter_entry_cnt, i;
+
+	if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt))
+		return -1;
+
+	if (!all_cpu_map) {
+		all_cpu_map = perf_cpu_map__new(NULL);
+		if (!all_cpu_map)
+			return -1;
+	}
+
+	evsel->bperf_leader_prog_fd = -1;
+	evsel->bperf_leader_link_fd = -1;
+
+	/*
+	 * Step 1: hold a fd on the leader program and the bpf_link, if
+	 * the program is not already gone, reload the program.
+	 * Use flock() to ensure exclusive access to the perf_event_attr
+	 * map.
+	 */
+	attr_map_fd = bperf_lock_attr_map(target);
+	if (attr_map_fd < 0) {
+		pr_err("Failed to lock perf_event_attr map\n");
+		return -1;
+	}
+
+	err = bpf_map_lookup_elem(attr_map_fd, &evsel->core.attr, &entry);
+	if (err) {
+		err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, &entry, BPF_ANY);
+		if (err)
+			goto out;
+	}
+
+	evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id);
+	if (evsel->bperf_leader_link_fd < 0 &&
+	    bperf_reload_leader_program(evsel, attr_map_fd, &entry))
+		goto out;
+
+	/*
+	 * The bpf_link holds reference to the leader program, and the
+	 * leader program holds reference to the maps. Therefore, if
+	 * link_id is valid, diff_map_id should also be valid.
+	 */
+	evsel->bperf_leader_prog_fd = bpf_prog_get_fd_by_id(
+		bpf_link_get_prog_id(evsel->bperf_leader_link_fd));
+	assert(evsel->bperf_leader_prog_fd >= 0);
+
+	diff_map_fd = bpf_map_get_fd_by_id(entry.diff_map_id);
+	assert(diff_map_fd >= 0);
+
+	/*
+	 * bperf uses BPF_PROG_TEST_RUN to get accurate reading. Check
+	 * whether the kernel support it
+	 */
+	err = bperf_trigger_reading(evsel->bperf_leader_prog_fd, 0);
+	if (err) {
+		pr_err("The kernel does not support test_run for raw_tp BPF programs.\n"
+		       "Therefore, --use-bpf might show inaccurate readings\n");
+		goto out;
+	}
+
+	/* Step 2: load the follower skeleton */
+	evsel->follower_skel = bperf_follower_bpf__open();
+	if (!evsel->follower_skel) {
+		pr_err("Failed to open follower skeleton\n");
+		goto out;
+	}
+
+	/* attach fexit program to the leader program */
+	bpf_program__set_attach_target(evsel->follower_skel->progs.fexit_XXX,
+				       evsel->bperf_leader_prog_fd, "on_switch");
+
+	/* connect to leader diff_reading map */
+	bpf_map__reuse_fd(evsel->follower_skel->maps.diff_readings, diff_map_fd);
+
+	/* set up reading map */
+	bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
+				 filter_entry_cnt);
+	/* set up follower filter based on target */
+	bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
+				 filter_entry_cnt);
+	err = bperf_follower_bpf__load(evsel->follower_skel);
+	if (err) {
+		pr_err("Failed to load follower skeleton\n");
+		bperf_follower_bpf__destroy(evsel->follower_skel);
+		evsel->follower_skel = NULL;
+		goto out;
+	}
+
+	for (i = 0; i < filter_entry_cnt; i++) {
+		int filter_map_fd;
+		__u32 key;
+
+		if (filter_type == BPERF_FILTER_PID ||
+		    filter_type == BPERF_FILTER_TGID)
+			key = evsel->core.threads->map[i].pid;
+		else if (filter_type == BPERF_FILTER_CPU)
+			key = evsel->core.cpus->map[i];
+		else
+			break;
+
+		filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
+		bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
+	}
+
+	evsel->follower_skel->bss->type = filter_type;
+
+	err = bperf_follower_bpf__attach(evsel->follower_skel);
+
+out:
+	if (err && evsel->bperf_leader_link_fd >= 0)
+		close(evsel->bperf_leader_link_fd);
+	if (err && evsel->bperf_leader_prog_fd >= 0)
+		close(evsel->bperf_leader_prog_fd);
+	if (diff_map_fd >= 0)
+		close(diff_map_fd);
+
+	flock(attr_map_fd, LOCK_UN);
+	close(attr_map_fd);
+
+	return err;
+}
+
+static int bperf__install_pe(struct evsel *evsel, int cpu, int fd)
+{
+	struct bperf_leader_bpf *skel = evsel->leader_skel;
+
+	return bpf_map_update_elem(bpf_map__fd(skel->maps.events),
+				   &cpu, &fd, BPF_ANY);
+}
+
+/*
+ * trigger the leader prog on each cpu, so the accum_reading map could get
+ * the latest readings.
+ */
+static int bperf_sync_counters(struct evsel *evsel)
+{
+	int num_cpu, i, cpu;
+
+	num_cpu = all_cpu_map->nr;
+	for (i = 0; i < num_cpu; i++) {
+		cpu = all_cpu_map->map[i];
+		bperf_trigger_reading(evsel->bperf_leader_prog_fd, cpu);
+	}
+	return 0;
+}
+
+static int bperf__enable(struct evsel *evsel)
+{
+	evsel->follower_skel->bss->enabled = 1;
+	return 0;
+}
+
+static int bperf__read(struct evsel *evsel)
+{
+	struct bperf_follower_bpf *skel = evsel->follower_skel;
+	__u32 num_cpu_bpf = cpu__max_cpu();
+	struct bpf_perf_event_value values[num_cpu_bpf];
+	int reading_map_fd, err = 0;
+	__u32 i, j, num_cpu;
+
+	bperf_sync_counters(evsel);
+	reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
+
+	for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
+		__u32 cpu;
+
+		err = bpf_map_lookup_elem(reading_map_fd, &i, values);
+		if (err)
+			goto out;
+		switch (evsel->follower_skel->bss->type) {
+		case BPERF_FILTER_GLOBAL:
+			assert(i == 0);
+
+			num_cpu = all_cpu_map->nr;
+			for (j = 0; j < num_cpu; j++) {
+				cpu = all_cpu_map->map[j];
+				perf_counts(evsel->counts, cpu, 0)->val = values[cpu].counter;
+				perf_counts(evsel->counts, cpu, 0)->ena = values[cpu].enabled;
+				perf_counts(evsel->counts, cpu, 0)->run = values[cpu].running;
+			}
+			break;
+		case BPERF_FILTER_CPU:
+			cpu = evsel->core.cpus->map[i];
+			perf_counts(evsel->counts, i, 0)->val = values[cpu].counter;
+			perf_counts(evsel->counts, i, 0)->ena = values[cpu].enabled;
+			perf_counts(evsel->counts, i, 0)->run = values[cpu].running;
+			break;
+		case BPERF_FILTER_PID:
+		case BPERF_FILTER_TGID:
+			perf_counts(evsel->counts, 0, i)->val = 0;
+			perf_counts(evsel->counts, 0, i)->ena = 0;
+			perf_counts(evsel->counts, 0, i)->run = 0;
+
+			for (cpu = 0; cpu < num_cpu_bpf; cpu++) {
+				perf_counts(evsel->counts, 0, i)->val += values[cpu].counter;
+				perf_counts(evsel->counts, 0, i)->ena += values[cpu].enabled;
+				perf_counts(evsel->counts, 0, i)->run += values[cpu].running;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+out:
+	return err;
+}
+
+static int bperf__destroy(struct evsel *evsel)
+{
+	bperf_follower_bpf__destroy(evsel->follower_skel);
+	close(evsel->bperf_leader_prog_fd);
+	close(evsel->bperf_leader_link_fd);
+	return 0;
+}
+
+/*
+ * bperf: share hardware PMCs with BPF
+ *
+ * perf uses performance monitoring counters (PMC) to monitor system
+ * performance. The PMCs are limited hardware resources. For example,
+ * Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu.
+ *
+ * Modern data center systems use these PMCs in many different ways:
+ * system level monitoring, (maybe nested) container level monitoring, per
+ * process monitoring, profiling (in sample mode), etc. In some cases,
+ * there are more active perf_events than available hardware PMCs. To allow
+ * all perf_events to have a chance to run, it is necessary to do expensive
+ * time multiplexing of events.
+ *
+ * On the other hand, many monitoring tools count the common metrics
+ * (cycles, instructions). It is a waste to have multiple tools create
+ * multiple perf_events of "cycles" and occupy multiple PMCs.
+ *
+ * bperf tries to reduce such wastes by allowing multiple perf_events of
+ * "cycles" or "instructions" (at different scopes) to share PMUs. Instead
+ * of having each perf-stat session to read its own perf_events, bperf uses
+ * BPF programs to read the perf_events and aggregate readings to BPF maps.
+ * Then, the perf-stat session(s) reads the values from these BPF maps.
+ *
+ *                                ||
+ *       shared progs and maps <- || -> per session progs and maps
+ *                                ||
+ *   ---------------              ||
+ *   | perf_events |              ||
+ *   ---------------       fexit  ||      -----------------
+ *          |             --------||----> | follower prog |
+ *       --------------- /        || ---  -----------------
+ * cs -> | leader prog |/         ||/        |         |
+ *   --> ---------------         /||  --------------  ------------------
+ *  /       |         |         / ||  | filter map |  | accum_readings |
+ * /  ------------  ------------  ||  --------------  ------------------
+ * |  | prev map |  | diff map |  ||                        |
+ * |  ------------  ------------  ||                        |
+ *  \                             ||                        |
+ * = \ ==================================================== | ============
+ *    \                                                    /   user space
+ *     \                                                  /
+ *      \                                                /
+ *    BPF_PROG_TEST_RUN                    BPF_MAP_LOOKUP_ELEM
+ *        \                                            /
+ *         \                                          /
+ *          \------  perf-stat ----------------------/
+ *
+ * The figure above shows the architecture of bperf. Note that the figure
+ * is divided into 3 regions: shared progs and maps (top left), per session
+ * progs and maps (top right), and user space (bottom).
+ *
+ * The leader prog is triggered on each context switch (cs). The leader
+ * prog reads perf_events and stores the difference (current_reading -
+ * previous_reading) to the diff map. For the same metric, e.g. "cycles",
+ * multiple perf-stat sessions share the same leader prog.
+ *
+ * Each perf-stat session creates a follower prog as fexit program to the
+ * leader prog. It is possible to attach up to BPF_MAX_TRAMP_PROGS (38)
+ * follower progs to the same leader prog. The follower prog checks current
+ * task and processor ID to decide whether to add the value from the diff
+ * map to its accumulated reading map (accum_readings).
+ *
+ * Finally, perf-stat user space reads the value from accum_reading map.
+ *
+ * Besides context switch, it is also necessary to trigger the leader prog
+ * before perf-stat reads the value. Otherwise, the accum_reading map may
+ * not have the latest reading from the perf_events. This is achieved by
+ * triggering the event via sys_bpf(BPF_PROG_TEST_RUN) to each CPU.
+ *
+ * Comment before the definition of struct perf_event_attr_map_entry
+ * describes how different sessions of perf-stat share information about
+ * the leader prog.
+ */
+
+struct bpf_counter_ops bperf_ops = {
+	.load       = bperf__load,
+	.enable     = bperf__enable,
+	.read       = bperf__read,
+	.install_pe = bperf__install_pe,
+	.destroy    = bperf__destroy,
+};
+
+static inline bool bpf_counter_skip(struct evsel *evsel)
+{
+	return list_empty(&evsel->bpf_counter_list) &&
+		evsel->follower_skel == NULL;
+}
+
 int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd)
 {
-	if (list_empty(&evsel->bpf_counter_list))
+	if (bpf_counter_skip(evsel))
 		return 0;
 	return evsel->bpf_counter_ops->install_pe(evsel, cpu, fd);
 }

 int bpf_counter__load(struct evsel *evsel, struct target *target)
 {
-	if (target__has_bpf(target))
+	if (target->bpf_str)
 		evsel->bpf_counter_ops = &bpf_program_profiler_ops;
+	else if (target->use_bpf)
+		evsel->bpf_counter_ops = &bperf_ops;

 	if (evsel->bpf_counter_ops)
 		return evsel->bpf_counter_ops->load(evsel, target);
@@ -293,21 +802,21 @@ int bpf_counter__load(struct evsel *evsel, struct target *target)

 int bpf_counter__enable(struct evsel *evsel)
 {
-	if (list_empty(&evsel->bpf_counter_list))
+	if (bpf_counter_skip(evsel))
 		return 0;
 	return evsel->bpf_counter_ops->enable(evsel);
 }

 int bpf_counter__read(struct evsel *evsel)
 {
-	if (list_empty(&evsel->bpf_counter_list))
+	if (bpf_counter_skip(evsel))
 		return -EAGAIN;
 	return evsel->bpf_counter_ops->read(evsel);
 }

 void bpf_counter__destroy(struct evsel *evsel)
 {
-	if (list_empty(&evsel->bpf_counter_list))
+	if (bpf_counter_skip(evsel))
 		return;
 	evsel->bpf_counter_ops->destroy(evsel);
 	evsel->bpf_counter_ops = NULL;

--- a/tools/perf/util/bpf_skel/bperf.h
+++ b/tools/perf/util/bpf_skel/bperf.h
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+
+#ifndef __BPERF_STAT_H
+#define __BPERF_STAT_H
+
+typedef struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct bpf_perf_event_value));
+	__uint(max_entries, 1);
+} reading_map;
+
+#endif /* __BPERF_STAT_H */
--- a/tools/perf/util/bpf_skel/bperf_follower.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bperf.h"
+#include "bperf_u.h"
+
+reading_map diff_readings SEC(".maps");
+reading_map accum_readings SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+} filter SEC(".maps");
+
+enum bperf_filter_type type = 0;
+int enabled = 0;
+
+SEC("fexit/XXX")
+int BPF_PROG(fexit_XXX)
+{
+	struct bpf_perf_event_value *diff_val, *accum_val;
+	__u32 filter_key, zero = 0;
+	__u32 *accum_key;
+
+	if (!enabled)
+		return 0;
+
+	switch (type) {
+	case BPERF_FILTER_GLOBAL:
+		accum_key = &zero;
+		goto do_add;
+	case BPERF_FILTER_CPU:
+		filter_key = bpf_get_smp_processor_id();
+		break;
+	case BPERF_FILTER_PID:
+		filter_key = bpf_get_current_pid_tgid() & 0xffffffff;
+		break;
+	case BPERF_FILTER_TGID:
+		filter_key = bpf_get_current_pid_tgid() >> 32;
+		break;
+	default:
+		return 0;
+	}
+
+	accum_key = bpf_map_lookup_elem(&filter, &filter_key);
+	if (!accum_key)
+		return 0;
+
+do_add:
+	diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
+	if (!diff_val)
+		return 0;
+
+	accum_val = bpf_map_lookup_elem(&accum_readings, accum_key);
+	if (!accum_val)
+		return 0;
+
+	accum_val->counter += diff_val->counter;
+	accum_val->enabled += diff_val->enabled;
+	accum_val->running += diff_val->running;
+
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
--- a/tools/perf/util/bpf_skel/bperf_leader.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_leader.bpf.c
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bperf.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(int));
+	__uint(map_flags, BPF_F_PRESERVE_ELEMS);
+} events SEC(".maps");
+
+reading_map prev_readings SEC(".maps");
+reading_map diff_readings SEC(".maps");
+
+SEC("raw_tp/sched_switch")
+int BPF_PROG(on_switch)
+{
+	struct bpf_perf_event_value val, *prev_val, *diff_val;
+	__u32 key = bpf_get_smp_processor_id();
+	__u32 zero = 0;
+	long err;
+
+	prev_val = bpf_map_lookup_elem(&prev_readings, &zero);
+	if (!prev_val)
+		return 0;
+
+	diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
+	if (!diff_val)
+		return 0;
+
+	err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
+	if (err)
+		return 0;
+
+	diff_val->counter = val.counter - prev_val->counter;
+	diff_val->enabled = val.enabled - prev_val->enabled;
+	diff_val->running = val.running - prev_val->running;
+	*prev_val = val;
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
--- a/tools/perf/util/bpf_skel/bperf_u.h
+++ b/tools/perf/util/bpf_skel/bperf_u.h
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+
+#ifndef __BPERF_STAT_U_H
+#define __BPERF_STAT_U_H
+
+enum bperf_filter_type {
+	BPERF_FILTER_GLOBAL = 1,
+	BPERF_FILTER_CPU,
+	BPERF_FILTER_PID,
+	BPERF_FILTER_TGID,
+};
+
+#endif /* __BPERF_STAT_U_H */
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -20,6 +20,8 @@ union perf_event;
 struct bpf_counter_ops;
 struct target;
 struct hashmap;
+struct bperf_leader_bpf;
+struct bperf_follower_bpf;

 typedef int (evsel__sb_cb_t)(union perf_event *event, void *data);

@@ -130,8 +132,24 @@ struct evsel {
 	 * See also evsel__has_callchain().
 	 */
 	__u64			synth_sample_type;
-	struct list_head	bpf_counter_list;
+
+	/*
+	 * bpf_counter_ops serves two use cases:
+	 *   1. perf-stat -b          counting events used byBPF programs
+	 *   2. perf-stat --use-bpf   use BPF programs to aggregate counts
+	 */
 	struct bpf_counter_ops	*bpf_counter_ops;
+
+	/* for perf-stat -b */
+	struct list_head	bpf_counter_list;
+
+	/* for perf-stat --use-bpf */
+	int			bperf_leader_prog_fd;
+	int			bperf_leader_link_fd;
+	union {
+		struct bperf_leader_bpf *leader_skel;
+		struct bperf_follower_bpf *follower_skel;
+	};
 };

 struct perf_missing_features {

--- a/tools/perf/util/target.h
+++ b/tools/perf/util/target.h
@@ -16,6 +16,8 @@ struct target {
 	bool	     uses_mmap;
 	bool	     default_per_cpu;
 	bool	     per_thread;
+	bool	     use_bpf;
+	const char   *attr_map;
 };

 enum target_errno {
@@ -66,7 +68,7 @@ static inline bool target__has_cpu(struct target *target)

 static inline bool target__has_bpf(struct target *target)
 {
-	return target->bpf_str;
+	return target->bpf_str || target->use_bpf;
 }

 static inline bool target__none(struct target *target)