Commit a91c0f7f authored by Namhyung Kim, committed by Zheng Zengkai

bpf: Adjust BPF stack helper functions to accommodate skip > 0

stable inclusion
from stable-v5.10.110
commit 90805175a206f784b6a77f16f07b07f6803e286b
bugzilla: https://gitee.com/openeuler/kernel/issues/I574AL

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=90805175a206f784b6a77f16f07b07f6803e286b

--------------------------------

commit ee2a0988 upstream.

Let's say the caller has storage for num_elem stack frames.  The BPF
stack helper functions then walk the stack for only num_elem frames, so
if skip > 0 the caller ends up with only 'num_elem - skip' frames.

This happens because the helpers set init_nr in the perf_callchain_entry
to the end of the buffer so that only num_elem entries are saved.  I
believe this was done because the perf callchain code used to unwind
stack frames until it reached the global max size
(sysctl_perf_event_max_stack).
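
For illustration, a rough sketch of the old arithmetic (the concrete
numbers below are hypothetical, not taken from this patch):

	/* Suppose sysctl_perf_event_max_stack = 127, the caller's buffer
	 * holds num_elem = 16 entries, and skip = 4.
	 */
	u32 init_nr = sysctl_perf_event_max_stack - num_elem;	/* 127 - 16 = 111 */

	/* The unwinder only fills entries init_nr..126, i.e. at most
	 * num_elem = 16 frames, and skipping then happens inside that window:
	 */
	trace_nr = trace->nr - init_nr;	/* at most 16 */
	trace_nr -= skip;		/* at most 16 - 4 = 12 frames are left */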

However, perf now has perf_callchain_entry_ctx.max_stack to limit the
iteration locally.  This simplifies the init_nr handling in the BPF
callstack code and removes the confusion with perf_event's
__PERF_SAMPLE_CALLCHAIN_EARLY, which sets init_nr to 0.
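
With the same hypothetical numbers, the reworked helpers instead ask the
unwinder for skip extra frames up front and clamp against the sysctl, so
the caller can still receive the full num_elem entries (sketch of the new
arithmetic from the diff below):

	max_depth = num_elem + skip;			/* 16 + 4 = 20 */
	if (max_depth > sysctl_perf_event_max_stack)	/* clamp to 127 */
		max_depth = sysctl_perf_event_max_stack;

	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
				   false, false);
	trace_nr = trace->nr - skip;	/* up to num_elem = 16 frames for the caller */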

Also change the comment on bpf_get_stack() in the header file to be more
explicit about what the return value means.

Fixes: c195651e ("bpf: add bpf_get_stack helper")
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/30a7b5d5-6726-1cc2-eaee-8da2828a9a9c@oracle.com
Link: https://lore.kernel.org/bpf/20220314182042.71025-1-namhyung@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Based-on-patch-by: Eugene Loh <eugene.loh@oracle.com>
Signed-off-by: Yu Liao <liaoyu15@huawei.com>
Reviewed-by: Wei Li <liwei391@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Parent 89c1f625
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2163,8 +2163,8 @@ union bpf_attr {
  *
  *		# sysctl kernel.perf_event_max_stack=<new value>
  *	Return
- *		A non-negative value equal to or less than *size* on success,
- *		or a negative error in case of failure.
+ *		The non-negative copied *buf* length equal to or less than
+ *		*size* on success, or a negative error in case of failure.
  *
  * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header)
  *	Description
@@ -3448,8 +3448,8 @@ union bpf_attr {
  *
  *		# sysctl kernel.perf_event_max_stack=<new value>
  *	Return
- *		A non-negative value equal to or less than *size* on success,
- *		or a negative error in case of failure.
+ *		The non-negative copied *buf* length equal to or less than
+ *		*size* on success, or a negative error in case of failure.
  *
  * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags)
  *	Description
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -364,7 +364,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 }
 
 static struct perf_callchain_entry *
-get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
 {
 #ifdef CONFIG_STACKTRACE
 	struct perf_callchain_entry *entry;
@@ -375,9 +375,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 	if (!entry)
 		return NULL;
 
-	entry->nr = init_nr +
-		stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
-				     sysctl_perf_event_max_stack - init_nr, 0);
+	entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
+					 max_depth, 0);
 
 	/* stack_trace_save_tsk() works on unsigned long array, while
 	 * perf_callchain_entry uses u64 array. For 32-bit systems, it is
@@ -389,7 +388,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 		int i;
 
 		/* copy data from the end to avoid using extra buffer */
-		for (i = entry->nr - 1; i >= (int)init_nr; i--)
+		for (i = entry->nr - 1; i >= 0; i--)
 			to[i] = (u64)(from[i]);
 	}
 
@@ -406,27 +405,19 @@ static long __bpf_get_stackid(struct bpf_map *map,
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
-	u32 max_depth = map->value_size / stack_map_data_size(map);
-	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
-	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	u32 hash, id, trace_nr, trace_len;
 	bool user = flags & BPF_F_USER_STACK;
 	u64 *ips;
 	bool hash_matches;
 
-	/* get_perf_callchain() guarantees that trace->nr >= init_nr
-	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
-	 */
-	trace_nr = trace->nr - init_nr;
-
-	if (trace_nr <= skip)
+	if (trace->nr <= skip)
 		/* skipping more than usable stack trace */
 		return -EFAULT;
 
-	trace_nr -= skip;
+	trace_nr = trace->nr - skip;
 	trace_len = trace_nr * sizeof(u64);
-	ips = trace->ip + skip + init_nr;
+	ips = trace->ip + skip;
 	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
 	id = hash & (smap->n_buckets - 1);
 	bucket = READ_ONCE(smap->buckets[id]);
@@ -483,8 +474,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags)
 {
 	u32 max_depth = map->value_size / stack_map_data_size(map);
-	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
-	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	bool user = flags & BPF_F_USER_STACK;
 	struct perf_callchain_entry *trace;
 	bool kernel = !user;
@@ -493,8 +483,12 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
 		return -EINVAL;
 
-	trace = get_perf_callchain(regs, init_nr, kernel, user,
-				   sysctl_perf_event_max_stack, false, false);
+	max_depth += skip;
+	if (max_depth > sysctl_perf_event_max_stack)
+		max_depth = sysctl_perf_event_max_stack;
+
+	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+				   false, false);
 
 	if (unlikely(!trace))
 		/* couldn't fetch the stack trace */
@@ -585,7 +579,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 			    struct perf_callchain_entry *trace_in,
 			    void *buf, u32 size, u64 flags)
 {
-	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+	u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
 	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	bool user = flags & BPF_F_USER_STACK;
@@ -610,30 +604,28 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 		goto err_fault;
 
 	num_elem = size / elem_size;
-	if (sysctl_perf_event_max_stack < num_elem)
-		init_nr = 0;
-	else
-		init_nr = sysctl_perf_event_max_stack - num_elem;
+	max_depth = num_elem + skip;
+	if (sysctl_perf_event_max_stack < max_depth)
+		max_depth = sysctl_perf_event_max_stack;
 	if (trace_in)
 		trace = trace_in;
 	else if (kernel && task)
-		trace = get_callchain_entry_for_task(task, init_nr);
+		trace = get_callchain_entry_for_task(task, max_depth);
 	else
-		trace = get_perf_callchain(regs, init_nr, kernel, user,
-					   sysctl_perf_event_max_stack,
+		trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
 					   false, false);
 	if (unlikely(!trace))
 		goto err_fault;
 
-	trace_nr = trace->nr - init_nr;
-	if (trace_nr < skip)
+	if (trace->nr < skip)
 		goto err_fault;
 
-	trace_nr -= skip;
+	trace_nr = trace->nr - skip;
 	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
 	copy_len = trace_nr * elem_size;
-	ips = trace->ip + skip + init_nr;
 
+	ips = trace->ip + skip;
 	if (user && user_build_id)
 		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
 	else
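
As a usage illustration only (not part of this commit), a minimal
libbpf-style program sketch; the kprobe attach point, MAX_FRAMES and the
skip value of 3 are assumptions made for the example:

#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <bpf/bpf_helpers.h>

#define MAX_FRAMES	32
#define SKIP_FRAMES	3	/* hypothetical: skip the innermost frames */

SEC("kprobe/do_sys_openat2")
int dump_stack(struct pt_regs *ctx)
{
	__u64 ips[MAX_FRAMES];
	/* The low 8 bits of flags (BPF_F_SKIP_FIELD_MASK) carry the skip count. */
	long len = bpf_get_stack(ctx, ips, sizeof(ips), SKIP_FRAMES);

	/* With this fix, len can be up to sizeof(ips) even though SKIP_FRAMES > 0;
	 * previously the helper could return at most
	 * (MAX_FRAMES - SKIP_FRAMES) * sizeof(__u64) bytes.
	 */
	if (len >= 0)
		bpf_printk("captured %ld bytes of stack", len);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";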