Commit a7038524 — author: Linus Torvalds

Merge tag 'perf_urgent_for_v6.1_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Borislav Petkov:

 - Fix raw data handling when perf events are used in bpf

 - Rework how SIGTRAPs get delivered to events to address a bunch of
   problems with that delivery. Add a selftest for it, too

* tag 'perf_urgent_for_v6.1_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  bpf: Fix sample_flags for bpf_perf_event_output
  selftests/perf_events: Add a SIGTRAP stress test with disables
  perf: Fix missing SIGTRAPs
...@@ -756,11 +756,14 @@ struct perf_event { ...@@ -756,11 +756,14 @@ struct perf_event {
struct fasync_struct *fasync; struct fasync_struct *fasync;
/* delayed work for NMIs and such */ /* delayed work for NMIs and such */
int pending_wakeup; unsigned int pending_wakeup;
int pending_kill; unsigned int pending_kill;
int pending_disable; unsigned int pending_disable;
unsigned int pending_sigtrap;
unsigned long pending_addr; /* SIGTRAP */ unsigned long pending_addr; /* SIGTRAP */
struct irq_work pending; struct irq_work pending_irq;
struct callback_head pending_task;
unsigned int pending_work;
atomic_t event_limit; atomic_t event_limit;
...@@ -877,6 +880,14 @@ struct perf_event_context { ...@@ -877,6 +880,14 @@ struct perf_event_context {
#endif #endif
void *task_ctx_data; /* pmu specific data */ void *task_ctx_data; /* pmu specific data */
struct rcu_head rcu_head; struct rcu_head rcu_head;
/*
* Sum (event->pending_sigtrap + event->pending_work)
*
* The SIGTRAP is targeted at ctx->task, as such it won't do changing
* that until the signal is delivered.
*/
local_t nr_pending;
}; };
/* /*
......
...@@ -54,6 +54,7 @@ ...@@ -54,6 +54,7 @@
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/pgtable.h> #include <linux/pgtable.h>
#include <linux/buildid.h> #include <linux/buildid.h>
#include <linux/task_work.h>
#include "internal.h" #include "internal.h"
...@@ -2276,11 +2277,26 @@ event_sched_out(struct perf_event *event, ...@@ -2276,11 +2277,26 @@ event_sched_out(struct perf_event *event,
event->pmu->del(event, 0); event->pmu->del(event, 0);
event->oncpu = -1; event->oncpu = -1;
if (READ_ONCE(event->pending_disable) >= 0) { if (event->pending_disable) {
WRITE_ONCE(event->pending_disable, -1); event->pending_disable = 0;
perf_cgroup_event_disable(event, ctx); perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF; state = PERF_EVENT_STATE_OFF;
} }
if (event->pending_sigtrap) {
bool dec = true;
event->pending_sigtrap = 0;
if (state != PERF_EVENT_STATE_OFF &&
!event->pending_work) {
event->pending_work = 1;
dec = false;
task_work_add(current, &event->pending_task, TWA_RESUME);
}
if (dec)
local_dec(&event->ctx->nr_pending);
}
perf_event_set_state(event, state); perf_event_set_state(event, state);
if (!is_software_event(event)) if (!is_software_event(event))
...@@ -2432,7 +2448,7 @@ static void __perf_event_disable(struct perf_event *event, ...@@ -2432,7 +2448,7 @@ static void __perf_event_disable(struct perf_event *event,
* hold the top-level event's child_mutex, so any descendant that * hold the top-level event's child_mutex, so any descendant that
* goes to exit will block in perf_event_exit_event(). * goes to exit will block in perf_event_exit_event().
* *
* When called from perf_pending_event it's OK because event->ctx * When called from perf_pending_irq it's OK because event->ctx
* is the current context on this CPU and preemption is disabled, * is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context. * hence we can't get into perf_event_task_sched_out for this context.
*/ */
...@@ -2471,9 +2487,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable); ...@@ -2471,9 +2487,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
void perf_event_disable_inatomic(struct perf_event *event) void perf_event_disable_inatomic(struct perf_event *event)
{ {
WRITE_ONCE(event->pending_disable, smp_processor_id()); event->pending_disable = 1;
/* can fail, see perf_pending_event_disable() */ irq_work_queue(&event->pending_irq);
irq_work_queue(&event->pending);
} }
#define MAX_INTERRUPTS (~0ULL) #define MAX_INTERRUPTS (~0ULL)
...@@ -3428,11 +3443,23 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, ...@@ -3428,11 +3443,23 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) { if (context_equiv(ctx, next_ctx)) {
perf_pmu_disable(pmu);
/* PMIs are disabled; ctx->nr_pending is stable. */
if (local_read(&ctx->nr_pending) ||
local_read(&next_ctx->nr_pending)) {
/*
* Must not swap out ctx when there's pending
* events that rely on the ctx->task relation.
*/
raw_spin_unlock(&next_ctx->lock);
rcu_read_unlock();
goto inside_switch;
}
WRITE_ONCE(ctx->task, next); WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task); WRITE_ONCE(next_ctx->task, task);
perf_pmu_disable(pmu);
if (cpuctx->sched_cb_usage && pmu->sched_task) if (cpuctx->sched_cb_usage && pmu->sched_task)
pmu->sched_task(ctx, false); pmu->sched_task(ctx, false);
...@@ -3473,6 +3500,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, ...@@ -3473,6 +3500,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock); raw_spin_lock(&ctx->lock);
perf_pmu_disable(pmu); perf_pmu_disable(pmu);
inside_switch:
if (cpuctx->sched_cb_usage && pmu->sched_task) if (cpuctx->sched_cb_usage && pmu->sched_task)
pmu->sched_task(ctx, false); pmu->sched_task(ctx, false);
task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
...@@ -4939,7 +4967,7 @@ static void perf_addr_filters_splice(struct perf_event *event, ...@@ -4939,7 +4967,7 @@ static void perf_addr_filters_splice(struct perf_event *event,
static void _free_event(struct perf_event *event) static void _free_event(struct perf_event *event)
{ {
irq_work_sync(&event->pending); irq_work_sync(&event->pending_irq);
unaccount_event(event); unaccount_event(event);
...@@ -6439,7 +6467,8 @@ static void perf_sigtrap(struct perf_event *event) ...@@ -6439,7 +6467,8 @@ static void perf_sigtrap(struct perf_event *event)
return; return;
/* /*
* perf_pending_event() can race with the task exiting. * Both perf_pending_task() and perf_pending_irq() can race with the
* task exiting.
*/ */
if (current->flags & PF_EXITING) if (current->flags & PF_EXITING)
return; return;
...@@ -6448,23 +6477,33 @@ static void perf_sigtrap(struct perf_event *event) ...@@ -6448,23 +6477,33 @@ static void perf_sigtrap(struct perf_event *event)
event->attr.type, event->attr.sig_data); event->attr.type, event->attr.sig_data);
} }
static void perf_pending_event_disable(struct perf_event *event) /*
* Deliver the pending work in-event-context or follow the context.
*/
static void __perf_pending_irq(struct perf_event *event)
{ {
int cpu = READ_ONCE(event->pending_disable); int cpu = READ_ONCE(event->oncpu);
/*
* If the event isn't running; we done. event_sched_out() will have
* taken care of things.
*/
if (cpu < 0) if (cpu < 0)
return; return;
/*
* Yay, we hit home and are in the context of the event.
*/
if (cpu == smp_processor_id()) { if (cpu == smp_processor_id()) {
WRITE_ONCE(event->pending_disable, -1); if (event->pending_sigtrap) {
event->pending_sigtrap = 0;
if (event->attr.sigtrap) {
perf_sigtrap(event); perf_sigtrap(event);
atomic_set_release(&event->event_limit, 1); /* rearm event */ local_dec(&event->ctx->nr_pending);
return; }
if (event->pending_disable) {
event->pending_disable = 0;
perf_event_disable_local(event);
} }
perf_event_disable_local(event);
return; return;
} }
...@@ -6484,35 +6523,62 @@ static void perf_pending_event_disable(struct perf_event *event) ...@@ -6484,35 +6523,62 @@ static void perf_pending_event_disable(struct perf_event *event)
* irq_work_queue(); // FAILS * irq_work_queue(); // FAILS
* *
* irq_work_run() * irq_work_run()
* perf_pending_event() * perf_pending_irq()
* *
* But the event runs on CPU-B and wants disabling there. * But the event runs on CPU-B and wants disabling there.
*/ */
irq_work_queue_on(&event->pending, cpu); irq_work_queue_on(&event->pending_irq, cpu);
} }
static void perf_pending_event(struct irq_work *entry) static void perf_pending_irq(struct irq_work *entry)
{ {
struct perf_event *event = container_of(entry, struct perf_event, pending); struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
int rctx; int rctx;
rctx = perf_swevent_get_recursion_context();
/* /*
* If we 'fail' here, that's OK, it means recursion is already disabled * If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'. * and we won't recurse 'further'.
*/ */
rctx = perf_swevent_get_recursion_context();
perf_pending_event_disable(event); /*
* The wakeup isn't bound to the context of the event -- it can happen
* irrespective of where the event is.
*/
if (event->pending_wakeup) { if (event->pending_wakeup) {
event->pending_wakeup = 0; event->pending_wakeup = 0;
perf_event_wakeup(event); perf_event_wakeup(event);
} }
__perf_pending_irq(event);
if (rctx >= 0) if (rctx >= 0)
perf_swevent_put_recursion_context(rctx); perf_swevent_put_recursion_context(rctx);
} }
/*
 * Task-work callback registered for event->pending_task.
 *
 * Delivers a SIGTRAP whose delivery was deferred: event_sched_out() sets
 * event->pending_work and queues this callback with task_work_add() when
 * an event is scheduled out while pending_sigtrap is still set.
 *
 * @head: the callback_head embedded in a struct perf_event as pending_task.
 */
static void perf_pending_task(struct callback_head *head)
{
/* Recover the owning event from its embedded callback_head. */
struct perf_event *event = container_of(head, struct perf_event, pending_task);
int rctx;
/*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
preempt_disable_notrace();
rctx = perf_swevent_get_recursion_context();
/* Clear the flag before acting so the deferred request is consumed once. */
if (event->pending_work) {
event->pending_work = 0;
perf_sigtrap(event);
/* Drop this event's contribution to the context's pending count. */
local_dec(&event->ctx->nr_pending);
}
/* Only release the recursion context if one was obtained (rctx >= 0). */
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
preempt_enable_notrace();
}
#ifdef CONFIG_GUEST_PERF_EVENTS #ifdef CONFIG_GUEST_PERF_EVENTS
struct perf_guest_info_callbacks __rcu *perf_guest_cbs; struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
...@@ -9212,8 +9278,8 @@ int perf_event_account_interrupt(struct perf_event *event) ...@@ -9212,8 +9278,8 @@ int perf_event_account_interrupt(struct perf_event *event)
*/ */
static int __perf_event_overflow(struct perf_event *event, static int __perf_event_overflow(struct perf_event *event,
int throttle, struct perf_sample_data *data, int throttle, struct perf_sample_data *data,
struct pt_regs *regs) struct pt_regs *regs)
{ {
int events = atomic_read(&event->event_limit); int events = atomic_read(&event->event_limit);
int ret = 0; int ret = 0;
...@@ -9236,24 +9302,36 @@ static int __perf_event_overflow(struct perf_event *event, ...@@ -9236,24 +9302,36 @@ static int __perf_event_overflow(struct perf_event *event,
if (events && atomic_dec_and_test(&event->event_limit)) { if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1; ret = 1;
event->pending_kill = POLL_HUP; event->pending_kill = POLL_HUP;
event->pending_addr = data->addr;
perf_event_disable_inatomic(event); perf_event_disable_inatomic(event);
} }
if (event->attr.sigtrap) {
/*
* Should not be able to return to user space without processing
* pending_sigtrap (kernel events can overflow multiple times).
*/
WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
if (!event->pending_sigtrap) {
event->pending_sigtrap = 1;
local_inc(&event->ctx->nr_pending);
}
event->pending_addr = data->addr;
irq_work_queue(&event->pending_irq);
}
READ_ONCE(event->overflow_handler)(event, data, regs); READ_ONCE(event->overflow_handler)(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) { if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1; event->pending_wakeup = 1;
irq_work_queue(&event->pending); irq_work_queue(&event->pending_irq);
} }
return ret; return ret;
} }
int perf_event_overflow(struct perf_event *event, int perf_event_overflow(struct perf_event *event,
struct perf_sample_data *data, struct perf_sample_data *data,
struct pt_regs *regs) struct pt_regs *regs)
{ {
return __perf_event_overflow(event, 1, data, regs); return __perf_event_overflow(event, 1, data, regs);
} }
...@@ -11570,8 +11648,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, ...@@ -11570,8 +11648,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_waitqueue_head(&event->waitq); init_waitqueue_head(&event->waitq);
event->pending_disable = -1; init_irq_work(&event->pending_irq, perf_pending_irq);
init_irq_work(&event->pending, perf_pending_event); init_task_work(&event->pending_task, perf_pending_task);
mutex_init(&event->mmap_mutex); mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock); raw_spin_lock_init(&event->addr_filters.lock);
...@@ -11593,9 +11671,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, ...@@ -11593,9 +11671,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (parent_event) if (parent_event)
event->event_caps = parent_event->event_caps; event->event_caps = parent_event->event_caps;
if (event->attr.sigtrap)
atomic_set(&event->event_limit, 1);
if (task) { if (task) {
event->attach_state = PERF_ATTACH_TASK; event->attach_state = PERF_ATTACH_TASK;
/* /*
......
...@@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) ...@@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
atomic_set(&handle->rb->poll, EPOLLIN); atomic_set(&handle->rb->poll, EPOLLIN);
handle->event->pending_wakeup = 1; handle->event->pending_wakeup = 1;
irq_work_queue(&handle->event->pending); irq_work_queue(&handle->event->pending_irq);
} }
/* /*
......
...@@ -687,6 +687,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, ...@@ -687,6 +687,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
perf_sample_data_init(sd, 0, 0); perf_sample_data_init(sd, 0, 0);
sd->raw = &raw; sd->raw = &raw;
sd->sample_flags |= PERF_SAMPLE_RAW;
err = __bpf_perf_event_output(regs, map, flags, sd); err = __bpf_perf_event_output(regs, map, flags, sd);
...@@ -745,6 +746,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, ...@@ -745,6 +746,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
perf_fetch_caller_regs(regs); perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0); perf_sample_data_init(sd, 0, 0);
sd->raw = &raw; sd->raw = &raw;
sd->sample_flags |= PERF_SAMPLE_RAW;
ret = __bpf_perf_event_output(regs, map, flags, sd); ret = __bpf_perf_event_output(regs, map, flags, sd);
out: out:
......
...@@ -62,6 +62,8 @@ static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr, ...@@ -62,6 +62,8 @@ static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr,
.remove_on_exec = 1, /* Required by sigtrap. */ .remove_on_exec = 1, /* Required by sigtrap. */
.sigtrap = 1, /* Request synchronous SIGTRAP on event. */ .sigtrap = 1, /* Request synchronous SIGTRAP on event. */
.sig_data = TEST_SIG_DATA(addr, id), .sig_data = TEST_SIG_DATA(addr, id),
.exclude_kernel = 1, /* To allow */
.exclude_hv = 1, /* running as !root */
}; };
return attr; return attr;
} }
...@@ -93,9 +95,13 @@ static void *test_thread(void *arg) ...@@ -93,9 +95,13 @@ static void *test_thread(void *arg)
__atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED); __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED);
iter = ctx.iterate_on; /* read */ iter = ctx.iterate_on; /* read */
for (i = 0; i < iter - 1; i++) { if (iter >= 0) {
__atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED); for (i = 0; i < iter - 1; i++) {
ctx.iterate_on = iter; /* idempotent write */ __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED);
ctx.iterate_on = iter; /* idempotent write */
}
} else {
while (ctx.iterate_on);
} }
return NULL; return NULL;
...@@ -208,4 +214,27 @@ TEST_F(sigtrap_threads, signal_stress) ...@@ -208,4 +214,27 @@ TEST_F(sigtrap_threads, signal_stress)
EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 0)); EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 0));
} }
/*
 * Stress test: toggle the event off and on from the main thread while the
 * worker threads keep running, until ctx.signal_count reaches target_count,
 * then check the first recorded siginfo.
 */
TEST_F(sigtrap_threads, signal_stress_with_disable)
{
const int target_count = NUM_THREADS * 3000;
int i;
/*
 * A negative iterate_on makes test_thread() spin in
 * `while (ctx.iterate_on);` instead of running its bounded loop.
 */
ctx.iterate_on = -1;
EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0);
/* NOTE(review): presumably releases the worker threads -- confirm against the fixture setup. */
pthread_barrier_wait(&self->barrier);
/* Repeatedly disable and re-enable the event until enough signals have been counted. */
while (__atomic_load_n(&ctx.signal_count, __ATOMIC_RELAXED) < target_count) {
EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_DISABLE, 0), 0);
EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0);
}
/* Writing 0 ends the workers' spin loop so they can be joined. */
ctx.iterate_on = 0;
for (i = 0; i < NUM_THREADS; i++)
ASSERT_EQ(pthread_join(self->threads[i], NULL), 0);
EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_DISABLE, 0), 0);
/* The first recorded signal must describe the breakpoint on ctx.iterate_on. */
EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on);
EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT);
EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 0));
}
TEST_HARNESS_MAIN TEST_HARNESS_MAIN
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册