selftests/bpf: Measure bpf_loop verifier performance

This patch tests bpf_loop in pyperf and strobemeta, and measures the verifier performance of replacing the traditional for loop with bpf_loop. The results are as follows: ~strobemeta~ Baseline verification time 6808200 usec stack depth 496 processed 554252 insns (limit 1000000) max_states_per_insn 16 total_states 15878 peak_states 13489 mark_read 3110 #192 verif_scale_strobemeta:OK (unrolled loop) Using bpf_loop verification time 31589 usec stack depth 96+400 processed 1513 insns (limit 1000000) max_states_per_insn 2 total_states 106 peak_states 106 mark_read 60 #193 verif_scale_strobemeta_bpf_loop:OK ~pyperf600~ Baseline verification time 29702486 usec stack depth 368 processed 626838 insns (limit 1000000) max_states_per_insn 7 total_states 30368 peak_states 30279 mark_read 748 #182 verif_scale_pyperf600:OK (unrolled loop) Using bpf_loop verification time 148488 usec stack depth 320+40 processed 10518 insns (limit 1000000) max_states_per_insn 10 total_states 705 peak_states 517 mark_read 38 #183 verif_scale_pyperf600_bpf_loop:OK Using the bpf_loop helper led to approximately a 99% decrease in the verification time and in the number of instructions. Signed-off-by: N Joanne Koong <joannekoong@fb.com> Signed-off-by: N Alexei Starovoitov <ast@kernel.org> Acked-by: N Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/bpf/20211130030622.4131246-4-joannekoong@fb.com

selftests/bpf: Measure bpf_loop verifier performance
This patch tests bpf_loop in pyperf and strobemeta, and measures the verifier performance of replacing the traditional for loop with bpf_loop. The results are as follows: ~strobemeta~ Baseline verification time 6808200 usec stack depth 496 processed 554252 insns (limit 1000000) max_states_per_insn 16 total_states 15878 peak_states 13489 mark_read 3110 #192 verif_scale_strobemeta:OK (unrolled loop) Using bpf_loop verification time 31589 usec stack depth 96+400 processed 1513 insns (limit 1000000) max_states_per_insn 2 total_states 106 peak_states 106 mark_read 60 #193 verif_scale_strobemeta_bpf_loop:OK ~pyperf600~ Baseline verification time 29702486 usec stack depth 368 processed 626838 insns (limit 1000000) max_states_per_insn 7 total_states 30368 peak_states 30279 mark_read 748 #182 verif_scale_pyperf600:OK (unrolled loop) Using bpf_loop verification time 148488 usec stack depth 320+40 processed 10518 insns (limit 1000000) max_states_per_insn 10 total_states 705 peak_states 517 mark_read 38 #183 verif_scale_pyperf600_bpf_loop:OK Using the bpf_loop helper led to approximately a 99% decrease in the verification time and in the number of instructions. Signed-off-by: N Joanne Koong <joannekoong@fb.com> Signed-off-by: N Alexei Starovoitov <ast@kernel.org> Acked-by: N Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/bpf/20211130030622.4131246-4-joannekoong@fb.com
f6e659b7 · Joanne Koong · Alexei Starovoitov · 4e5070b6 · f6e659b7 · f6e659b7
5 changed file
--- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
@@ -115,6 +115,12 @@ void test_verif_scale_pyperf600()
 	scale_test("pyperf600.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
 }

+void test_verif_scale_pyperf600_bpf_loop(void)
+{
+	/* use the bpf_loop helper*/
+	scale_test("pyperf600_bpf_loop.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+}
+
 void test_verif_scale_pyperf600_nounroll()
 {
 	/* no unroll at all.
@@ -165,6 +171,12 @@ void test_verif_scale_strobemeta()
 	scale_test("strobemeta.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
 }

+void test_verif_scale_strobemeta_bpf_loop(void)
+{
+	/* use the bpf_loop helper*/
+	scale_test("strobemeta_bpf_loop.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+}
+
 void test_verif_scale_strobemeta_nounroll1()
 {
 	/* no unroll, tiny loops */

--- a/tools/testing/selftests/bpf/progs/pyperf.h
+++ b/tools/testing/selftests/bpf/progs/pyperf.h
@@ -159,6 +159,59 @@ struct {
 	__uint(value_size, sizeof(long long) * 127);
 } stackmap SEC(".maps");

+#ifdef USE_BPF_LOOP
+struct process_frame_ctx {
+	int cur_cpu;
+	int32_t *symbol_counter;
+	void *frame_ptr;
+	FrameData *frame;
+	PidData *pidData;
+	Symbol *sym;
+	Event *event;
+	bool done;
+};
+
+#define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var))
+
+static int process_frame_callback(__u32 i, struct process_frame_ctx *ctx)
+{
+	int zero = 0;
+	void *frame_ptr = ctx->frame_ptr;
+	PidData *pidData = ctx->pidData;
+	FrameData *frame = ctx->frame;
+	int32_t *symbol_counter = ctx->symbol_counter;
+	int cur_cpu = ctx->cur_cpu;
+	Event *event = ctx->event;
+	Symbol *sym = ctx->sym;
+
+	if (frame_ptr && get_frame_data(frame_ptr, pidData, frame, sym)) {
+		int32_t new_symbol_id = *symbol_counter * 64 + cur_cpu;
+		int32_t *symbol_id = bpf_map_lookup_elem(&symbolmap, sym);
+
+		if (!symbol_id) {
+			bpf_map_update_elem(&symbolmap, sym, &zero, 0);
+			symbol_id = bpf_map_lookup_elem(&symbolmap, sym);
+			if (!symbol_id) {
+				ctx->done = true;
+				return 1;
+			}
+		}
+		if (*symbol_id == new_symbol_id)
+			(*symbol_counter)++;
+
+		barrier_var(i);
+		if (i >= STACK_MAX_LEN)
+			return 1;
+
+		event->stack[i] = *symbol_id;
+
+		event->stack_len = i + 1;
+		frame_ptr = frame->f_back;
+	}
+	return 0;
+}
+#endif /* USE_BPF_LOOP */
+
 #ifdef GLOBAL_FUNC
 __noinline
 #elif defined(SUBPROGS)
@@ -228,11 +281,26 @@ int __on_event(struct bpf_raw_tracepoint_args *ctx)
 		int32_t* symbol_counter = bpf_map_lookup_elem(&symbolmap, &sym);
 		if (symbol_counter == NULL)
 			return 0;
+#ifdef USE_BPF_LOOP
+	struct process_frame_ctx ctx = {
+		.cur_cpu = cur_cpu,
+		.symbol_counter = symbol_counter,
+		.frame_ptr = frame_ptr,
+		.frame = &frame,
+		.pidData = pidData,
+		.sym = &sym,
+		.event = event,
+	};
+
+	bpf_loop(STACK_MAX_LEN, process_frame_callback, &ctx, 0);
+	if (ctx.done)
+		return 0;
+#else
 #ifdef NO_UNROLL
 #pragma clang loop unroll(disable)
 #else
 #pragma clang loop unroll(full)
-#endif
+#endif /* NO_UNROLL */
 		/* Unwind python stack */
 		for (int i = 0; i < STACK_MAX_LEN; ++i) {
 			if (frame_ptr && get_frame_data(frame_ptr, pidData, &frame, &sym)) {
@@ -251,6 +319,7 @@ int __on_event(struct bpf_raw_tracepoint_args *ctx)
 				frame_ptr = frame.f_back;
 			}
 		}
+#endif /* USE_BPF_LOOP */
 		event->stack_complete = frame_ptr == NULL;
 	} else {
 		event->stack_complete = 1;

--- a/tools/testing/selftests/bpf/progs/pyperf600_bpf_loop.c
+++ b/tools/testing/selftests/bpf/progs/pyperf600_bpf_loop.c
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#define STACK_MAX_LEN 600
+#define USE_BPF_LOOP
+#include "pyperf.h"
--- a/tools/testing/selftests/bpf/progs/strobemeta.h
+++ b/tools/testing/selftests/bpf/progs/strobemeta.h
@@ -445,6 +445,48 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg,
 	return payload;
 }

+#ifdef USE_BPF_LOOP
+enum read_type {
+	READ_INT_VAR,
+	READ_MAP_VAR,
+	READ_STR_VAR,
+};
+
+struct read_var_ctx {
+	struct strobemeta_payload *data;
+	void *tls_base;
+	struct strobemeta_cfg *cfg;
+	void *payload;
+	/* value gets mutated */
+	struct strobe_value_generic *value;
+	enum read_type type;
+};
+
+static int read_var_callback(__u32 index, struct read_var_ctx *ctx)
+{
+	switch (ctx->type) {
+	case READ_INT_VAR:
+		if (index >= STROBE_MAX_INTS)
+			return 1;
+		read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data);
+		break;
+	case READ_MAP_VAR:
+		if (index >= STROBE_MAX_MAPS)
+			return 1;
+		ctx->payload = read_map_var(ctx->cfg, index, ctx->tls_base,
+					    ctx->value, ctx->data, ctx->payload);
+		break;
+	case READ_STR_VAR:
+		if (index >= STROBE_MAX_STRS)
+			return 1;
+		ctx->payload += read_str_var(ctx->cfg, index, ctx->tls_base,
+					     ctx->value, ctx->data, ctx->payload);
+		break;
+	}
+	return 0;
+}
+#endif /* USE_BPF_LOOP */
+
 /*
 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
 * pointer to *right after* payload ends
@@ -475,11 +517,36 @@ static void *read_strobe_meta(struct task_struct *task,
 	 */
 	tls_base = (void *)task;

+#ifdef USE_BPF_LOOP
+	struct read_var_ctx ctx = {
+		.cfg = cfg,
+		.tls_base = tls_base,
+		.value = &value,
+		.data = data,
+		.payload = payload,
+	};
+	int err;
+
+	ctx.type = READ_INT_VAR;
+	err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0);
+	if (err != STROBE_MAX_INTS)
+		return NULL;
+
+	ctx.type = READ_STR_VAR;
+	err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0);
+	if (err != STROBE_MAX_STRS)
+		return NULL;
+
+	ctx.type = READ_MAP_VAR;
+	err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0);
+	if (err != STROBE_MAX_MAPS)
+		return NULL;
+#else
 #ifdef NO_UNROLL
 #pragma clang loop unroll(disable)
 #else
 #pragma unroll
-#endif
+#endif /* NO_UNROLL */
 	for (int i = 0; i < STROBE_MAX_INTS; ++i) {
 		read_int_var(cfg, i, tls_base, &value, data);
 	}
@@ -487,7 +554,7 @@ static void *read_strobe_meta(struct task_struct *task,
 #pragma clang loop unroll(disable)
 #else
 #pragma unroll
-#endif
+#endif /* NO_UNROLL */
 	for (int i = 0; i < STROBE_MAX_STRS; ++i) {
 		payload += read_str_var(cfg, i, tls_base, &value, data, payload);
 	}
@@ -495,10 +562,12 @@ static void *read_strobe_meta(struct task_struct *task,
 #pragma clang loop unroll(disable)
 #else
 #pragma unroll
-#endif
+#endif /* NO_UNROLL */
 	for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
 		payload = read_map_var(cfg, i, tls_base, &value, data, payload);
 	}
+#endif /* USE_BPF_LOOP */
+
 	/*
 	 * return pointer right after end of payload, so it's possible to
 	 * calculate exact amount of useful data that needs to be sent

--- a/tools/testing/selftests/bpf/progs/strobemeta_bpf_loop.c
+++ b/tools/testing/selftests/bpf/progs/strobemeta_bpf_loop.c
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+/* Copyright (c) 2021 Facebook */
+
+#define STROBE_MAX_INTS 2
+#define STROBE_MAX_STRS 25
+#define STROBE_MAX_MAPS 100
+#define STROBE_MAX_MAP_ENTRIES 20
+#define USE_BPF_LOOP
+#include "strobemeta.h"