diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a65ddc580514d9260959298d05595970701ebfa1..717bf3b59ba4058839ad14b8f7c58e2a3a8e786c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -541,8 +541,9 @@ struct perf_counter_context { * been cloned (inherited) from a common ancestor. */ struct perf_counter_context *parent_ctx; - u32 parent_gen; - u32 generation; + u64 parent_gen; + u64 generation; + struct rcu_head rcu_head; }; /** diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 367299f91aaff21fe9948035a43f27bd5ee7a4b5..52e5a15321d80b1d19fb324c90d8a5a01be3927d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -103,12 +103,22 @@ static void get_ctx(struct perf_counter_context *ctx) atomic_inc(&ctx->refcount); } +static void free_ctx(struct rcu_head *head) +{ + struct perf_counter_context *ctx; + + ctx = container_of(head, struct perf_counter_context, rcu_head); + kfree(ctx); +} + static void put_ctx(struct perf_counter_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); - kfree(ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); } } @@ -211,22 +221,6 @@ group_sched_out(struct perf_counter *group_counter, cpuctx->exclusive = 0; } -/* - * Mark this context as not being a clone of another. - * Called when counters are added to or removed from this context. - * We also increment our generation number so that anything that - * was cloned from this context before this will not match anything - * cloned from this context after this. - */ -static void unclone_ctx(struct perf_counter_context *ctx) -{ - ++ctx->generation; - if (!ctx->parent_ctx) - return; - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; -} - /* * Cross CPU call to remove a performance counter * @@ -281,13 +275,19 @@ static void __perf_counter_remove_from_context(void *info) * * CPU counters are removed with a smp call. 
For task counters we only * call when the task is on a CPU. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_counter_exit_task, it's OK because the + * context has been detached from its task. */ static void perf_counter_remove_from_context(struct perf_counter *counter) { struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; - unclone_ctx(ctx); if (!task) { /* * Per cpu counters are removed via an smp call and @@ -410,6 +410,16 @@ static void __perf_counter_disable(void *info) /* * Disable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_counter_for_each_child or perf_counter_for_each because they + * hold the top-level counter's child_mutex, so any descendant that + * goes to exit will block in sync_child_counter. + * When called from perf_pending_counter it's OK because counter->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_counter_task_sched_out for this context. */ static void perf_counter_disable(struct perf_counter *counter) { @@ -794,6 +804,12 @@ static void __perf_counter_enable(void *info) /* * Enable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_counter_for_each_child or perf_counter_for_each as described + * for perf_counter_disable. 
*/ static void perf_counter_enable(struct perf_counter *counter) { @@ -923,7 +939,9 @@ void perf_counter_task_sched_out(struct task_struct *task, struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = task->perf_counter_ctxp; struct perf_counter_context *next_ctx; + struct perf_counter_context *parent; struct pt_regs *regs; + int do_switch = 1; regs = task_pt_regs(task); perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); @@ -932,18 +950,39 @@ void perf_counter_task_sched_out(struct task_struct *task, return; update_context_time(ctx); + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_counter_ctxp; - if (next_ctx && context_equiv(ctx, next_ctx)) { - task->perf_counter_ctxp = next_ctx; - next->perf_counter_ctxp = ctx; - ctx->task = next; - next_ctx->task = task; - return; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. 
+ */ + spin_lock(&ctx->lock); + spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + task->perf_counter_ctxp = next_ctx; + next->perf_counter_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + } + spin_unlock(&next_ctx->lock); + spin_unlock(&ctx->lock); } + rcu_read_unlock(); - __perf_counter_sched_out(ctx, cpuctx); - - cpuctx->task_ctx = NULL; + if (do_switch) { + __perf_counter_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; + } } static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) @@ -1215,18 +1254,13 @@ __perf_counter_init_context(struct perf_counter_context *ctx, ctx->task = task; } -static void put_context(struct perf_counter_context *ctx) -{ - if (ctx->task) - put_task_struct(ctx->task); -} - static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; - struct perf_counter_context *tctx; + struct perf_counter_context *parent_ctx; struct task_struct *task; + int err; /* * If cpu is not a wildcard then this is a percpu counter: @@ -1249,6 +1283,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &cpuctx->ctx; + get_ctx(ctx); return ctx; } @@ -1265,37 +1300,79 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!task) return ERR_PTR(-ESRCH); + /* + * Can't attach counters to a dying task. + */ + err = -ESRCH; + if (task->flags & PF_EXITING) + goto errout; + /* Reuse ptrace permission checks for now. 
*/ - if (!ptrace_may_access(task, PTRACE_MODE_READ)) { - put_task_struct(task); - return ERR_PTR(-EACCES); + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + retry_lock: + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_counter_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_counter_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more and we can + * unclone it if necessary. + * Once it's not a clone things will be stable. + */ + spin_lock_irq(&ctx->lock); + if (ctx != rcu_dereference(task->perf_counter_ctxp)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + parent_ctx = ctx->parent_ctx; + if (parent_ctx) { + put_ctx(parent_ctx); + ctx->parent_ctx = NULL; /* no longer a clone */ + } + ++ctx->generation; + /* + * Get an extra reference before dropping the lock so that + * this context won't get freed if the task exits. + */ + get_ctx(ctx); + spin_unlock_irq(&ctx->lock); } + rcu_read_unlock(); - ctx = task->perf_counter_ctxp; if (!ctx) { ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - if (!ctx) { - put_task_struct(task); - return ERR_PTR(-ENOMEM); - } + err = -ENOMEM; + if (!ctx) + goto errout; __perf_counter_init_context(ctx, task); - /* - * Make sure other cpus see correct values for *ctx - * once task->perf_counter_ctxp is visible to them. - */ - smp_wmb(); - tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx); - if (tctx) { + get_ctx(ctx); + if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { /* * We raced with some other task; use * the context they set. 
*/ kfree(ctx); - ctx = tctx; + goto retry_lock; } + get_task_struct(task); } + put_task_struct(task); return ctx; + + errout: + put_task_struct(task); + return ERR_PTR(err); } static void free_counter_rcu(struct rcu_head *head) @@ -1303,7 +1380,6 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); - put_ctx(counter->ctx); kfree(counter); } @@ -1324,6 +1400,7 @@ static void free_counter(struct perf_counter *counter) if (counter->destroy) counter->destroy(counter); + put_ctx(counter->ctx); call_rcu(&counter->rcu_head, free_counter_rcu); } @@ -1347,7 +1424,6 @@ static int perf_release(struct inode *inode, struct file *file) put_task_struct(counter->owner); free_counter(counter); - put_context(ctx); return 0; } @@ -1437,6 +1513,12 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter, mutex_unlock(&ctx->mutex); } +/* + * Holding the top-level counter's child_mutex means that any + * descendant process that has inherited this counter will block + * in sync_child_counter if it goes to exit, thus satisfying the + * task existence requirements of perf_counter_enable/disable. 
+ */ static void perf_counter_for_each_child(struct perf_counter *counter, void (*func)(struct perf_counter *)) { @@ -3124,8 +3206,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->ctx = ctx; counter->oncpu = -1; - get_ctx(ctx); - counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) counter->state = PERF_COUNTER_STATE_OFF; @@ -3290,7 +3370,7 @@ SYSCALL_DEFINE5(perf_counter_open, kfree(counter); err_put_context: - put_context(ctx); + put_ctx(ctx); goto out_fput; } @@ -3322,6 +3402,7 @@ inherit_counter(struct perf_counter *parent_counter, group_leader, GFP_KERNEL); if (IS_ERR(child_counter)) return child_counter; + get_ctx(child_ctx); /* * Make the child state follow the state of the parent counter, @@ -3439,11 +3520,6 @@ __perf_counter_exit_task(struct task_struct *child, /* * When a child task exits, feed back counter values to parent counters. - * - * Note: we may be running in child context, but the PID is not hashed - * anymore so new counters will not be added. - * (XXX not sure that is true when we get called from flush_old_exec. - * -- paulus) */ void perf_counter_exit_task(struct task_struct *child) { @@ -3458,7 +3534,15 @@ void perf_counter_exit_task(struct task_struct *child) local_irq_save(flags); __perf_counter_task_sched_out(child_ctx); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_counter_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + spin_lock(&child_ctx->lock); child->perf_counter_ctxp = NULL; + spin_unlock(&child_ctx->lock); local_irq_restore(flags); mutex_lock(&child_ctx->mutex); @@ -3513,6 +3597,7 @@ int perf_counter_init_task(struct task_struct *child) __perf_counter_init_context(child_ctx, child); child->perf_counter_ctxp = child_ctx; + get_task_struct(child); /* * Lock the parent list. No need to lock the child - not PID