提交 1772be37 编写于 作者: D Daniel Borkmann

Merge branch 'bpf-stackmap-nmi'

Song Liu says:
====================
Changes v2 -> v3:
  Improve syntax based on suggestion by Tobin C. Harding.

Changes v1 -> v2:
  1. Rename some variables to (hopefully) reduce confusion;
  2. Check irq_work status with IRQ_WORK_BUSY (instead of work->sem);
  3. In Kconfig, let BPF_SYSCALL select IRQ_WORK;
  4. Add static to DEFINE_PER_CPU();
   5. Remove pr_info() in stack_map_init().
====================
Signed-off-by: NDaniel Borkmann <daniel@iogearbox.net>
上级 a84880ef 13790d1c
master alk-4.19.24 alk-4.19.30 alk-4.19.34 alk-4.19.36 alk-4.19.43 alk-4.19.48 alk-4.19.57 ck-4.19.67 ck-4.19.81 ck-4.19.91 github/fork/deepanshu1422/fix-typo-in-comment github/fork/haosdent/fix-typo linux-next v4.19.91 v4.19.90 v4.19.89 v4.19.88 v4.19.87 v4.19.86 v4.19.85 v4.19.84 v4.19.83 v4.19.82 v4.19.81 v4.19.80 v4.19.79 v4.19.78 v4.19.77 v4.19.76 v4.19.75 v4.19.74 v4.19.73 v4.19.72 v4.19.71 v4.19.70 v4.19.69 v4.19.68 v4.19.67 v4.19.66 v4.19.65 v4.19.64 v4.19.63 v4.19.62 v4.19.61 v4.19.60 v4.19.59 v4.19.58 v4.19.57 v4.19.56 v4.19.55 v4.19.54 v4.19.53 v4.19.52 v4.19.51 v4.19.50 v4.19.49 v4.19.48 v4.19.47 v4.19.46 v4.19.45 v4.19.44 v4.19.43 v4.19.42 v4.19.41 v4.19.40 v4.19.39 v4.19.38 v4.19.37 v4.19.36 v4.19.35 v4.19.34 v4.19.33 v4.19.32 v4.19.31 v4.19.30 v4.19.29 v4.19.28 v4.19.27 v4.19.26 v4.19.25 v4.19.24 v4.19.23 v4.19.22 v4.19.21 v4.19.20 v4.19.19 v4.19.18 v4.19.17 v4.19.16 v4.19.15 v4.19.14 v4.19.13 v4.19.12 v4.19.11 v4.19.10 v4.19.9 v4.19.8 v4.19.7 v4.19.6 v4.19.5 v4.19.4 v4.19.3 v4.19.2 v4.19.1 v4.19 v4.19-rc8 v4.19-rc7 v4.19-rc6 v4.19-rc5 v4.19-rc4 v4.19-rc3 v4.19-rc2 v4.19-rc1 ck-release-21 ck-release-20 ck-release-19.2 ck-release-19.1 ck-release-19 ck-release-18 ck-release-17.2 ck-release-17.1 ck-release-17 ck-release-16 ck-release-15.1 ck-release-15 ck-release-14 ck-release-13.2 ck-release-13 ck-release-12 ck-release-11 ck-release-10 ck-release-9 ck-release-7 alk-release-15 alk-release-14 alk-release-13.2 alk-release-13 alk-release-12 alk-release-11 alk-release-10 alk-release-9 alk-release-7
无相关合并请求
......@@ -1391,6 +1391,7 @@ config BPF_SYSCALL
bool "Enable bpf() system call"
select ANON_INODES
select BPF
select IRQ_WORK
default n
help
Enable the bpf() system call that allows to manipulate eBPF
......
......@@ -11,6 +11,7 @@
#include <linux/perf_event.h>
#include <linux/elf.h>
#include <linux/pagemap.h>
#include <linux/irq_work.h>
#include "percpu_freelist.h"
#define STACK_CREATE_FLAG_MASK \
......@@ -32,6 +33,23 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[];
};
/* irq_work to run up_read() for build_id lookup in nmi context */
struct stack_map_irq_work {
struct irq_work irq_work;
struct rw_semaphore *sem;
};
static void do_up_read(struct irq_work *entry)
{
struct stack_map_irq_work *work;
work = container_of(entry, struct stack_map_irq_work, irq_work);
up_read(work->sem);
work->sem = NULL;
}
static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
static inline bool stack_map_use_build_id(struct bpf_map *map)
{
return (map->map_flags & BPF_F_STACK_BUILD_ID);
......@@ -267,17 +285,27 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
{
int i;
struct vm_area_struct *vma;
bool in_nmi_ctx = in_nmi();
bool irq_work_busy = false;
struct stack_map_irq_work *work;
if (in_nmi_ctx) {
work = this_cpu_ptr(&up_read_work);
if (work->irq_work.flags & IRQ_WORK_BUSY)
/* cannot queue more up_read, fallback */
irq_work_busy = true;
}
/*
* We cannot do up_read() in nmi context, so build_id lookup is
* only supported for non-nmi events. If at some point, it is
* possible to run find_vma() without taking the semaphore, we
* would like to allow build_id lookup in nmi context.
* We cannot do up_read() in nmi context. To do build_id lookup
* in nmi context, we need to run up_read() in irq_work. We use
* a percpu variable to do the irq_work. If the irq_work is
* already used by another lookup, we fall back to report ips.
*
* Same fallback is used for kernel stack (!user) on a stackmap
* with build_id.
*/
if (!user || !current || !current->mm || in_nmi() ||
if (!user || !current || !current->mm || irq_work_busy ||
down_read_trylock(&current->mm->mmap_sem) == 0) {
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
......@@ -299,7 +327,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
- vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
}
up_read(&current->mm->mmap_sem);
if (!in_nmi_ctx) {
up_read(&current->mm->mmap_sem);
} else {
work->sem = &current->mm->mmap_sem;
irq_work_queue(&work->irq_work);
}
}
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
......@@ -575,3 +609,16 @@ const struct bpf_map_ops stack_map_ops = {
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
};
static int __init stack_map_init(void)
{
int cpu;
struct stack_map_irq_work *work;
for_each_possible_cpu(cpu) {
work = per_cpu_ptr(&up_read_work, cpu);
init_irq_work(&work->irq_work, do_up_read);
}
return 0;
}
subsys_initcall(stack_map_init);
......@@ -1272,6 +1272,139 @@ static void test_stacktrace_build_id(void)
return;
}
static void test_stacktrace_build_id_nmi(void)
{
int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
const char *file = "./test_stacktrace_build_id.o";
int err, pmu_fd, prog_fd;
struct perf_event_attr attr = {
.sample_freq = 5000,
.freq = 1,
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
};
__u32 key, previous_key, val, duration = 0;
struct bpf_object *obj;
char buf[256];
int i, j;
struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH];
int build_id_matches = 0;
err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd);
if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
return;
pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
0 /* cpu 0 */, -1 /* group id */,
0 /* flags */);
if (CHECK(pmu_fd < 0, "perf_event_open",
"err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n",
pmu_fd, errno))
goto close_prog;
err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
err, errno))
goto close_pmu;
err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
err, errno))
goto disable_pmu;
/* find map fds */
control_map_fd = bpf_find_map(__func__, obj, "control_map");
if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
"err %d errno %d\n", err, errno))
goto disable_pmu;
stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
"err %d errno %d\n", err, errno))
goto disable_pmu;
stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
err, errno))
goto disable_pmu;
stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
"err %d errno %d\n", err, errno))
goto disable_pmu;
assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
== 0);
assert(system("taskset 0x1 ./urandom_read 100000") == 0);
/* disable stack trace collection */
key = 0;
val = 1;
bpf_map_update_elem(control_map_fd, &key, &val, 0);
/* for every element in stackid_hmap, we can find a corresponding one
* in stackmap, and vise versa.
*/
err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
"err %d errno %d\n", err, errno))
goto disable_pmu;
err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
"err %d errno %d\n", err, errno))
goto disable_pmu;
err = extract_build_id(buf, 256);
if (CHECK(err, "get build_id with readelf",
"err %d errno %d\n", err, errno))
goto disable_pmu;
err = bpf_map_get_next_key(stackmap_fd, NULL, &key);
if (CHECK(err, "get_next_key from stackmap",
"err %d, errno %d\n", err, errno))
goto disable_pmu;
do {
char build_id[64];
err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs);
if (CHECK(err, "lookup_elem from stackmap",
"err %d, errno %d\n", err, errno))
goto disable_pmu;
for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i)
if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID &&
id_offs[i].offset != 0) {
for (j = 0; j < 20; ++j)
sprintf(build_id + 2 * j, "%02x",
id_offs[i].build_id[j] & 0xff);
if (strstr(buf, build_id) != NULL)
build_id_matches = 1;
}
previous_key = key;
} while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
if (CHECK(build_id_matches < 1, "build id match",
"Didn't find expected build ID from the map\n"))
goto disable_pmu;
/*
* We intentionally skip compare_stack_ips(). This is because we
* only support one in_nmi() ips-to-build_id translation per cpu
* at any time, thus stack_amap here will always fallback to
* BPF_STACK_BUILD_ID_IP;
*/
disable_pmu:
ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
close_pmu:
close(pmu_fd);
close_prog:
bpf_object__close(obj);
}
#define MAX_CNT_RAWTP 10ull
#define MAX_STACK_RAWTP 100
struct get_stack_trace_t {
......@@ -1425,6 +1558,7 @@ int main(void)
test_tp_attach_query();
test_stacktrace_map();
test_stacktrace_build_id();
test_stacktrace_build_id_nmi();
test_stacktrace_map_raw_tp();
test_get_stack_raw_tp();
......
......@@ -6,15 +6,21 @@
#include <stdlib.h>
#define BUF_SIZE 256
int main(void)
int main(int argc, char *argv[])
{
int fd = open("/dev/urandom", O_RDONLY);
int i;
char buf[BUF_SIZE];
int count = 4;
if (fd < 0)
return 1;
for (i = 0; i < 4; ++i)
if (argc == 2)
count = atoi(argv[1]);
for (i = 0; i < count; ++i)
read(fd, buf, BUF_SIZE);
close(fd);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册
反馈
建议
客服 返回
顶部