/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright    2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/vmstat.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>

#include <asm/irq_regs.h>

static atomic_t nr_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;

46
/*
47
 * perf event paranoia level:
48 49
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
50
 *   1 - disallow cpu events for unpriv
51
 *   2 - disallow kernel profiling for unpriv
52
 */
53
int sysctl_perf_event_paranoid __read_mostly = 1;
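
/*
 * Example (illustrative, not part of the original file): the level above
 * is adjusted at run time through the sysctl interface, e.g.
 *
 *	# echo 2 > /proc/sys/kernel/perf_event_paranoid
 */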
54

55
int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
56 57

/*
58
 * max perf event sample rate
59
 */
60
int sysctl_perf_event_sample_rate __read_mostly = 100000;
61

62
static atomic64_t perf_event_id;
63

64
void __weak perf_event_print_debug(void)	{ }
65

66 67 68 69 70
extern __weak const char *perf_pmu_name(void)
{
	return "pmu";
}

void perf_pmu_disable(struct pmu *pmu)
72
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!(*count)++)
		pmu->pmu_disable(pmu);
76 77
}

void perf_pmu_enable(struct pmu *pmu)
79
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!--(*count))
		pmu->pmu_enable(pmu);
83 84
}
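
/*
 * Illustrative sketch (not original text): the per-cpu disable count above
 * lets disable/enable sections nest, so only the outermost pair actually
 * touches the hardware:
 *
 *	perf_pmu_disable(pmu);		count 0 -> 1, pmu->pmu_disable() runs
 *	perf_pmu_disable(pmu);		count 1 -> 2, no hardware access
 *	perf_pmu_enable(pmu);		count 2 -> 1, no hardware access
 *	perf_pmu_enable(pmu);		count 1 -> 0, pmu->pmu_enable() runs
 */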

85 86 87 88 89 90 91
static DEFINE_PER_CPU(struct list_head, rotation_list);

/*
 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
 * because they're strictly cpu affine and rotate_start is called with IRQs
 * disabled, while rotate_context is called from IRQ context.
 */
static void perf_pmu_rotate_start(struct pmu *pmu)
93
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
95
	struct list_head *head = &__get_cpu_var(rotation_list);
96

97
	WARN_ON(!irqs_disabled());
98

99 100
	if (list_empty(&cpuctx->rotation_list))
		list_add(&cpuctx->rotation_list, head);
101 102
}

103
static void get_ctx(struct perf_event_context *ctx)
104
{
105
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
106 107
}

108 109
static void free_ctx(struct rcu_head *head)
{
110
	struct perf_event_context *ctx;
111

112
	ctx = container_of(head, struct perf_event_context, rcu_head);
113 114 115
	kfree(ctx);
}

116
static void put_ctx(struct perf_event_context *ctx)
117
{
118 119 120
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
121 122 123
		if (ctx->task)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
124
	}
125 126
}

127
static void unclone_ctx(struct perf_event_context *ctx)
128 129 130 131 132 133 134
{
	if (ctx->parent_ctx) {
		put_ctx(ctx->parent_ctx);
		ctx->parent_ctx = NULL;
	}
}

135
/*
136
 * If we inherit events we want to return the parent event id
137 138
 * to userspace.
 */
139
static u64 primary_event_id(struct perf_event *event)
140
{
141
	u64 id = event->id;
142

143 144
	if (event->parent)
		id = event->parent->id;
145 146 147 148

	return id;
}

149
/*
150
 * Get the perf_event_context for a task and lock it.
151 152 153
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
154
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
156
{
157
	struct perf_event_context *ctx;
158 159

	rcu_read_lock();
retry:
	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
162 163 164 165
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
166
		 * perf_event_task_sched_out, though the
167 168 169 170 171 172
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
173
		raw_spin_lock_irqsave(&ctx->lock, *flags);
		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
175
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
176 177
			goto retry;
		}
178 179

		if (!atomic_inc_not_zero(&ctx->refcount)) {
180
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
181 182
			ctx = NULL;
		}
183 184 185 186 187 188 189 190 191 192
	}
	rcu_read_unlock();
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
195
{
196
	struct perf_event_context *ctx;
197 198
	unsigned long flags;

	ctx = perf_lock_task_context(task, ctxn, &flags);
200 201
	if (ctx) {
		++ctx->pin_count;
202
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
203 204 205 206
	}
	return ctx;
}

207
static void perf_unpin_context(struct perf_event_context *ctx)
208 209 210
{
	unsigned long flags;

211
	raw_spin_lock_irqsave(&ctx->lock, flags);
212
	--ctx->pin_count;
213
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
214 215 216
	put_ctx(ctx);
}

217 218
static inline u64 perf_clock(void)
{
219
	return local_clock();
220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

/*
 * Update the total_time_enabled and total_time_running fields for an event.
 */
static void update_event_times(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	u64 run_end;

	if (event->state < PERF_EVENT_STATE_INACTIVE ||
	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
		return;

245 246 247 248 249 250
	if (ctx->is_active)
		run_end = ctx->time;
	else
		run_end = event->tstamp_stopped;

	event->total_time_enabled = run_end - event->tstamp_enabled;
251 252 253 254 255 256 257 258 259

	if (event->state == PERF_EVENT_STATE_INACTIVE)
		run_end = event->tstamp_stopped;
	else
		run_end = ctx->time;

	event->total_time_running = run_end - event->tstamp_running;
}

260 261 262 263 264 265 266 267 268 269 270 271
/*
 * Update total_time_enabled and total_time_running for all events in a group.
 */
static void update_group_times(struct perf_event *leader)
{
	struct perf_event *event;

	update_event_times(leader);
	list_for_each_entry(event, &leader->sibling_list, group_entry)
		update_event_times(event);
}

272 273 274 275 276 277 278 279 280
static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
	if (event->attr.pinned)
		return &ctx->pinned_groups;
	else
		return &ctx->flexible_groups;
}

281
/*
282
 * Add an event to the lists for its context.
283 284
 * Must be called with ctx->mutex and ctx->lock held.
 */
285
static void
286
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
287
{
288 289
	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
	event->attach_state |= PERF_ATTACH_CONTEXT;
290 291

	/*
292 293 294
	 * If we're a stand alone event or group leader, we go to the context
	 * list, group events are kept attached to the group so that
	 * perf_group_detach can, at all times, locate all siblings.
295
	 */
296
	if (event->group_leader == event) {
297 298
		struct list_head *list;

299 300 301
		if (is_software_event(event))
			event->group_flags |= PERF_GROUP_SOFTWARE;

302 303
		list = ctx_group_list(event, ctx);
		list_add_tail(&event->group_entry, list);
	}

306
	list_add_rcu(&event->event_entry, &ctx->event_list);
307
	if (!ctx->nr_events)
		perf_pmu_rotate_start(ctx->pmu);
309 310
	ctx->nr_events++;
	if (event->attr.inherit_stat)
311
		ctx->nr_stat++;
312 313
}

314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331
static void perf_group_attach(struct perf_event *event)
{
	struct perf_event *group_leader = event->group_leader;

	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
	event->attach_state |= PERF_ATTACH_GROUP;

	if (group_leader == event)
		return;

	if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
			!is_software_event(event))
		group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;

	list_add_tail(&event->group_entry, &group_leader->sibling_list);
	group_leader->nr_siblings++;
}
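
/*
 * Note (summary of the code above, not original text): group_flags keeps
 * PERF_GROUP_SOFTWARE only while every member of the group is a software
 * event; attaching a hardware sibling clears it, and group_can_go_on()
 * later relies on the flag to let pure software groups always be scheduled.
 */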

332
/*
333
 * Remove an event from the lists for its context.
334
 * Must be called with ctx->mutex and ctx->lock held.
335
 */
336
static void
337
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
338
{
339 340 341 342
	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
343
		return;
344 345 346

	event->attach_state &= ~PERF_ATTACH_CONTEXT;

347 348
	ctx->nr_events--;
	if (event->attr.inherit_stat)
349
		ctx->nr_stat--;
350

351
	list_del_rcu(&event->event_entry);
352

353 354
	if (event->group_leader == event)
		list_del_init(&event->group_entry);

356
	update_group_times(event);
357 358 359 360 361 362 363 364 365 366

	/*
	 * If event was in error state, then keep it
	 * that way, otherwise bogus counts will be
	 * returned on read(). The only way to get out
	 * of error state is by explicit re-enabling
	 * of the event
	 */
	if (event->state > PERF_EVENT_STATE_OFF)
		event->state = PERF_EVENT_STATE_OFF;
367 368
}

369
static void perf_group_detach(struct perf_event *event)
370 371
{
	struct perf_event *sibling, *tmp;
372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392
	struct list_head *list = NULL;

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_GROUP))
		return;

	event->attach_state &= ~PERF_ATTACH_GROUP;

	/*
	 * If this is a sibling, remove it from its group.
	 */
	if (event->group_leader != event) {
		list_del_init(&event->group_entry);
		event->group_leader->nr_siblings--;
		return;
	}

	if (!list_empty(&event->group_entry))
		list = &event->group_entry;
393

394
	/*
395 396
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
397
	 * to whatever list we are on.
398
	 */
399
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
400 401
		if (list)
			list_move_tail(&sibling->group_entry, list);
402
		sibling->group_leader = sibling;
403 404 405

		/* Inherit group flags from the previous leader */
		sibling->group_flags = event->group_flags;
406 407 408
	}
}

409 410 411 412 413 414
static inline int
event_filter_match(struct perf_event *event)
{
	return event->cpu == -1 || event->cpu == smp_processor_id();
}

415 416
static int
__event_sched_out(struct perf_event *event,
417
		  struct perf_cpu_context *cpuctx,
418
		  struct perf_event_context *ctx)
419
{
420 421 422 423 424 425 426 427 428 429 430 431 432 433
	u64 delta;
	/*
	 * An event which could not be activated because of
	 * filter mismatch still needs to have its timings
	 * maintained, otherwise bogus information is returned
	 * via read() for time_enabled, time_running:
	 */
	if (event->state == PERF_EVENT_STATE_INACTIVE
	    && !event_filter_match(event)) {
		delta = ctx->time - event->tstamp_stopped;
		event->tstamp_running += delta;
		event->tstamp_stopped = ctx->time;
	}

434
	if (event->state != PERF_EVENT_STATE_ACTIVE)
435
		return 0;
436

437 438 439 440
	event->state = PERF_EVENT_STATE_INACTIVE;
	if (event->pending_disable) {
		event->pending_disable = 0;
		event->state = PERF_EVENT_STATE_OFF;
441
	}
	event->pmu->del(event, 0);
443
	event->oncpu = -1;
444

445
	if (!is_software_event(event))
446 447
		cpuctx->active_oncpu--;
	ctx->nr_active--;
448
	if (event->attr.exclusive || !cpuctx->active_oncpu)
449
		cpuctx->exclusive = 0;
450 451 452 453 454 455 456 457 458 459 460 461 462
	return 1;
}

static void
event_sched_out(struct perf_event *event,
		  struct perf_cpu_context *cpuctx,
		  struct perf_event_context *ctx)
{
	int ret;

	ret = __event_sched_out(event, cpuctx, ctx);
	if (ret)
		event->tstamp_stopped = ctx->time;
463 464
}

465
static void
466
group_sched_out(struct perf_event *group_event,
467
		struct perf_cpu_context *cpuctx,
468
		struct perf_event_context *ctx)
469
{
470
	struct perf_event *event;
471
	int state = group_event->state;
472

473
	event_sched_out(group_event, cpuctx, ctx);
474 475 476 477

	/*
	 * Schedule out siblings (if any):
	 */
478 479
	list_for_each_entry(event, &group_event->sibling_list, group_entry)
		event_sched_out(event, cpuctx, ctx);
480

481
	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
482 483 484
		cpuctx->exclusive = 0;
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

/*
492
 * Cross CPU call to remove a performance event
 *
494
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
497
static void __perf_event_remove_from_context(void *info)
{
499 500
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
508
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

511
	raw_spin_lock(&ctx->lock);

513
	event_sched_out(event, cpuctx, ctx);
514

515
	list_del_event(event, ctx);

517
	raw_spin_unlock(&ctx->lock);
}


/*
522
 * Remove the event from a task's (or a CPU's) list of events.
 *
524
 * Must be called with ctx->mutex held.
 *
526
 * CPU events are removed with a smp call. For task events we only
 * call when the task is on a CPU.
528
 *
529 530
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
531 532
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
533
 * When called from perf_event_exit_task, it's OK because the
534
 * context has been detached from its task.
 */
536
static void perf_event_remove_from_context(struct perf_event *event)
{
538
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
543
		 * Per cpu events are removed via an smp call and
544
		 * the removal is always successful.
		 */
546 547 548
		smp_call_function_single(event->cpu,
					 __perf_event_remove_from_context,
					 event, 1);
		return;
	}

retry:
553 554
	task_oncpu_function_call(task, __perf_event_remove_from_context,
				 event);

556
	raw_spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 */
560
	if (ctx->nr_active && !list_empty(&event->group_entry)) {
561
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can remove the event safely, if the call above did not
	 * succeed.
	 */
	if (!list_empty(&event->group_entry))
571
		list_del_event(event, ctx);
572
	raw_spin_unlock_irq(&ctx->lock);
}

575
/*
576
 * Cross CPU call to disable a performance event
577
 */
578
static void __perf_event_disable(void *info)
579
{
580 581
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
583 584

	/*
585 586
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
587
	 */
588
	if (ctx->task && cpuctx->task_ctx != ctx)
589 590
		return;

591
	raw_spin_lock(&ctx->lock);
592 593

	/*
594
	 * If the event is on, turn it off.
595 596
	 * If it is in error state, leave it in error state.
	 */
597
	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
598
		update_context_time(ctx);
599 600 601
		update_group_times(event);
		if (event == event->group_leader)
			group_sched_out(event, cpuctx, ctx);
602
		else
603 604
			event_sched_out(event, cpuctx, ctx);
		event->state = PERF_EVENT_STATE_OFF;
605 606
	}

607
	raw_spin_unlock(&ctx->lock);
608 609 610
}

/*
611
 * Disable an event.
612
 *
613 614
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
615
 * remains valid.  This condition is satisfied when called through
616 617 618 619
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in sync_child_event.
 * When called from perf_pending_event it's OK because event->ctx
620
 * is the current context on this CPU and preemption is disabled,
621
 * hence we can't get into perf_event_task_sched_out for this context.
622
 */
623
void perf_event_disable(struct perf_event *event)
624
{
625
	struct perf_event_context *ctx = event->ctx;
626 627 628 629
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
630
		 * Disable the event on the cpu that it's on
631
		 */
632 633
		smp_call_function_single(event->cpu, __perf_event_disable,
					 event, 1);
634 635 636
		return;
	}

retry:
638
	task_oncpu_function_call(task, __perf_event_disable, event);
639

640
	raw_spin_lock_irq(&ctx->lock);
641
	/*
642
	 * If the event is still active, we need to retry the cross-call.
643
	 */
644
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
645
		raw_spin_unlock_irq(&ctx->lock);
646 647 648 649 650 651 652
		goto retry;
	}

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
653 654 655
	if (event->state == PERF_EVENT_STATE_INACTIVE) {
		update_group_times(event);
		event->state = PERF_EVENT_STATE_OFF;
656
	}
657

658
	raw_spin_unlock_irq(&ctx->lock);
659 660
}

661
static int
662
__event_sched_in(struct perf_event *event,
663
		 struct perf_cpu_context *cpuctx,
664
		 struct perf_event_context *ctx)
665
{
666
	if (event->state <= PERF_EVENT_STATE_OFF)
667 668
		return 0;

669
	event->state = PERF_EVENT_STATE_ACTIVE;
670
	event->oncpu = smp_processor_id();
671 672 673 674 675
	/*
	 * The new state must be visible before we turn it on in the hardware:
	 */
	smp_wmb();

	if (event->pmu->add(event, PERF_EF_START)) {
677 678
		event->state = PERF_EVENT_STATE_INACTIVE;
		event->oncpu = -1;
679 680 681
		return -EAGAIN;
	}

682
	if (!is_software_event(event))
683
		cpuctx->active_oncpu++;
684 685
	ctx->nr_active++;

686
	if (event->attr.exclusive)
687 688
		cpuctx->exclusive = 1;

689 690 691
	return 0;
}

692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720
static inline int
event_sched_in(struct perf_event *event,
		 struct perf_cpu_context *cpuctx,
		 struct perf_event_context *ctx)
{
	int ret = __event_sched_in(event, cpuctx, ctx);
	if (ret)
		return ret;
	event->tstamp_running += ctx->time - event->tstamp_stopped;
	return 0;
}

static void
group_commit_event_sched_in(struct perf_event *group_event,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx)
{
	struct perf_event *event;
	u64 now = ctx->time;

	group_event->tstamp_running += now - group_event->tstamp_stopped;
	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		event->tstamp_running += now - event->tstamp_stopped;
	}
}

721
static int
722
group_sched_in(struct perf_event *group_event,
723
	       struct perf_cpu_context *cpuctx,
724
	       struct perf_event_context *ctx)
725
{
726
	struct perf_event *event, *partial_group = NULL;
	struct pmu *pmu = group_event->pmu;
728

729
	if (group_event->state == PERF_EVENT_STATE_OFF)
730 731
		return 0;

	pmu->start_txn(pmu);
733

734 735 736 737 738 739 740
	/*
	 * use __event_sched_in() to delay updating tstamp_running
	 * until the transaction is committed. In case of failure
	 * we will keep an unmodified tstamp_running which is a
	 * requirement to get correct timing information
	 */
	if (__event_sched_in(group_event, cpuctx, ctx)) {
		pmu->cancel_txn(pmu);
742
		return -EAGAIN;
743
	}
744 745 746 747

	/*
	 * Schedule in siblings as one group (if any):
	 */
748
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
749
		if (__event_sched_in(event, cpuctx, ctx)) {
750
			partial_group = event;
751 752 753 754
			goto group_error;
		}
	}

755 756 757
	if (!pmu->commit_txn(pmu)) {
		/* commit tstamp_running */
		group_commit_event_sched_in(group_event, cpuctx, ctx);
758
		return 0;
759
	}
760 761 762 763
group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
764 765 766
	 *
	 * use __event_sched_out() to avoid updating tstamp_stopped
	 * because the event never actually ran
767
	 */
768 769
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event == partial_group)
770
			break;
771
		__event_sched_out(event, cpuctx, ctx);
772
	}
773
	__event_sched_out(group_event, cpuctx, ctx);
774

	pmu->cancel_txn(pmu);
776

777 778 779
	return -EAGAIN;
}
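
/*
 * Summary sketch (not original text): group scheduling is transactional.
 * pmu->start_txn() opens the transaction, the leader and each sibling are
 * added with __event_sched_in(), and pmu->commit_txn() either accepts the
 * whole group or fails; on failure every member that was already added is
 * backed out with __event_sched_out() and pmu->cancel_txn() is called.
 */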

780
/*
781
 * Work out whether we can put this event group on the CPU now.
782
 */
783
static int group_can_go_on(struct perf_event *event,
784 785 786 787
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
788
	 * Groups consisting entirely of software events can always go on.
789
	 */
790
	if (event->group_flags & PERF_GROUP_SOFTWARE)
791 792 793
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
794
	 * events can go on.
795 796 797 798 799
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
800
	 * events on the CPU, it can't go on.
801
	 */
802
	if (event->attr.exclusive && cpuctx->active_oncpu)
803 804 805 806 807 808 809 810
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}

811 812
static void add_event_to_ctx(struct perf_event *event,
			       struct perf_event_context *ctx)
813
{
814
	list_add_event(event, ctx);
815
	perf_group_attach(event);
816 817 818
	event->tstamp_enabled = ctx->time;
	event->tstamp_running = ctx->time;
	event->tstamp_stopped = ctx->time;
819 820
}

/*
822
 * Cross CPU call to install and enable a performance event
823 824
 *
 * Must be called with ctx->mutex held
 */
static void __perf_install_in_context(void *info)
{
828 829 830
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
832
	int err;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
838
	 * Or possibly this is the right context but it isn't
839
	 * on this cpu because it had no events.
	 */
841
	if (ctx->task && cpuctx->task_ctx != ctx) {
842
		if (cpuctx->task_ctx || ctx->task != current)
843 844 845
			return;
		cpuctx->task_ctx = ctx;
	}

847
	raw_spin_lock(&ctx->lock);
848
	ctx->is_active = 1;
849
	update_context_time(ctx);

851
	add_event_to_ctx(event, ctx);

853 854 855
	if (event->cpu != -1 && event->cpu != smp_processor_id())
		goto unlock;

856
	/*
857
	 * Don't put the event on if it is disabled or if
858 859
	 * it is in a group and the group isn't on.
	 */
860 861
	if (event->state != PERF_EVENT_STATE_INACTIVE ||
	    (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
862 863
		goto unlock;

864
	/*
865 866 867
	 * An exclusive event can't go on if there are already active
	 * hardware events, and no hardware event can go on if there
	 * is already an exclusive event on.
868
	 */
869
	if (!group_can_go_on(event, cpuctx, 1))
870 871
		err = -EEXIST;
	else
872
		err = event_sched_in(event, cpuctx, ctx);
873

874 875
	if (err) {
		/*
876
		 * This event couldn't go on.  If it is in a group
877
		 * then we have to pull the whole group off.
878
		 * If the event group is pinned then put it in error state.
879
		 */
880
		if (leader != event)
881
			group_sched_out(leader, cpuctx, ctx);
882
		if (leader->attr.pinned) {
883
			update_group_times(leader);
884
			leader->state = PERF_EVENT_STATE_ERROR;
885
		}
886
	}

unlock:
889
	raw_spin_unlock(&ctx->lock);
}

/*
893
 * Attach a performance event to a context
 *
895 896
 * First we add the event to the list with the hardware enable bit
 * in event->hw_config cleared.
 *
898
 * If the event is attached to a task which is on a CPU we use a smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
901 902
 *
 * Must be called with ctx->mutex held.
 */
static void
905 906
perf_install_in_context(struct perf_event_context *ctx,
			struct perf_event *event,
			int cpu)
{
	struct task_struct *task = ctx->task;

911 912
	event->ctx = ctx;

	if (!task) {
		/*
915
		 * Per cpu events are installed via an smp call and
916
		 * the install is always successful.
		 */
		smp_call_function_single(cpu, __perf_install_in_context,
919
					 event, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_install_in_context,
925
				 event);

927
	raw_spin_lock_irq(&ctx->lock);
	/*
	 * we need to retry the smp call.
	 */
931
	if (ctx->is_active && list_empty(&event->group_entry)) {
932
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can add the event safely, if the call above did not
	 * succeed.
	 */
941 942
	if (list_empty(&event->group_entry))
		add_event_to_ctx(event, ctx);
943
	raw_spin_unlock_irq(&ctx->lock);
}

946
/*
947
 * Put an event into inactive state and update time fields.
948 949 950 951 952 953
 * Enabling the leader of a group effectively enables all
 * the group members that aren't explicitly disabled, so we
 * have to update their ->tstamp_enabled also.
 * Note: this works for group members as well as group leaders
 * since the non-leader members' sibling_lists will be empty.
 */
954 955
static void __perf_event_mark_enabled(struct perf_event *event,
					struct perf_event_context *ctx)
956
{
957
	struct perf_event *sub;
958

959 960
	event->state = PERF_EVENT_STATE_INACTIVE;
	event->tstamp_enabled = ctx->time - event->total_time_enabled;
	list_for_each_entry(sub, &event->sibling_list, group_entry) {
		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
963 964
			sub->tstamp_enabled =
				ctx->time - sub->total_time_enabled;
		}
	}
967 968
}

969
/*
970
 * Cross CPU call to enable a performance event
971
 */
972
static void __perf_event_enable(void *info)
973
{
974 975 976
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
978
	int err;
979

980
	/*
981 982
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
983
	 */
984
	if (ctx->task && cpuctx->task_ctx != ctx) {
985
		if (cpuctx->task_ctx || ctx->task != current)
986 987 988
			return;
		cpuctx->task_ctx = ctx;
	}
989

990
	raw_spin_lock(&ctx->lock);
991
	ctx->is_active = 1;
992
	update_context_time(ctx);
993

994
	if (event->state >= PERF_EVENT_STATE_INACTIVE)
995
		goto unlock;
996
	__perf_event_mark_enabled(event, ctx);
997

998 999 1000
	if (event->cpu != -1 && event->cpu != smp_processor_id())
		goto unlock;

1001
	/*
1002
	 * If the event is in a group and isn't the group leader,
1003
	 * then don't put it on unless the group is on.
1004
	 */
1005
	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
1006
		goto unlock;
1007

1008
	if (!group_can_go_on(event, cpuctx, 1)) {
1009
		err = -EEXIST;
1010
	} else {
1011
		if (event == leader)
1012
			err = group_sched_in(event, cpuctx, ctx);
1013
		else
1014
			err = event_sched_in(event, cpuctx, ctx);
1015
	}
1016 1017 1018

	if (err) {
		/*
1019
		 * If this event can't go on and it's part of a
1020 1021
		 * group, then the whole group has to come off.
		 */
1022
		if (leader != event)
1023
			group_sched_out(leader, cpuctx, ctx);
1024
		if (leader->attr.pinned) {
1025
			update_group_times(leader);
1026
			leader->state = PERF_EVENT_STATE_ERROR;
1027
		}
1028 1029
	}

unlock:
1031
	raw_spin_unlock(&ctx->lock);
1032 1033 1034
}

/*
1035
 * Enable an event.
1036
 *
1037 1038
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
1039
 * remains valid.  This condition is satisfied when called through
1040 1041
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
1042
 */
1043
void perf_event_enable(struct perf_event *event)
1044
{
1045
	struct perf_event_context *ctx = event->ctx;
1046 1047 1048 1049
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
1050
		 * Enable the event on the cpu that it's on
1051
		 */
1052 1053
		smp_call_function_single(event->cpu, __perf_event_enable,
					 event, 1);
1054 1055 1056
		return;
	}

1057
	raw_spin_lock_irq(&ctx->lock);
1058
	if (event->state >= PERF_EVENT_STATE_INACTIVE)
1059 1060 1061
		goto out;

	/*
1062 1063
	 * If the event is in error state, clear that first.
	 * That way, if we see the event in error state below, we
1064 1065 1066 1067
	 * know that it has gone back into error state, as distinct
	 * from the task having been scheduled away before the
	 * cross-call arrived.
	 */
1068 1069
	if (event->state == PERF_EVENT_STATE_ERROR)
		event->state = PERF_EVENT_STATE_OFF;
1070

retry:
1072
	raw_spin_unlock_irq(&ctx->lock);
1073
	task_oncpu_function_call(task, __perf_event_enable, event);
1074

1075
	raw_spin_lock_irq(&ctx->lock);
1076 1077

	/*
1078
	 * If the context is active and the event is still off,
1079 1080
	 * we need to retry the cross-call.
	 */
1081
	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
1082 1083 1084 1085 1086 1087
		goto retry;

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
1088 1089
	if (event->state == PERF_EVENT_STATE_OFF)
		__perf_event_mark_enabled(event, ctx);
1090

out:
1092
	raw_spin_unlock_irq(&ctx->lock);
1093 1094
}

1095
static int perf_event_refresh(struct perf_event *event, int refresh)
1096
{
1097
	/*
1098
	 * not supported on inherited events
1099
	 */
1100
	if (event->attr.inherit)
1101 1102
		return -EINVAL;

1103 1104
	atomic_add(refresh, &event->event_limit);
	perf_event_enable(event);
1105 1106

	return 0;
1107 1108
}

1109 1110 1111 1112 1113 1114 1115 1116 1117
enum event_type_t {
	EVENT_FLEXIBLE = 0x1,
	EVENT_PINNED = 0x2,
	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

static void ctx_sched_out(struct perf_event_context *ctx,
			  struct perf_cpu_context *cpuctx,
			  enum event_type_t event_type)
1118
{
1119
	struct perf_event *event;
1120

1121
	raw_spin_lock(&ctx->lock);
	perf_pmu_disable(ctx->pmu);
1123
	ctx->is_active = 0;
1124
	if (likely(!ctx->nr_events))
1125
		goto out;
1126
	update_context_time(ctx);
1127

1128
	if (!ctx->nr_active)
1129
		goto out;
1130

	if (event_type & EVENT_PINNED) {
1132 1133
		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
			group_sched_out(event, cpuctx, ctx);
	}
1135

	if (event_type & EVENT_FLEXIBLE) {
1137
		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1138
			group_sched_out(event, cpuctx, ctx);
	}
out:
	perf_pmu_enable(ctx->pmu);
1142
	raw_spin_unlock(&ctx->lock);
1143 1144
}

1145 1146 1147
/*
 * Test whether two contexts are equivalent, i.e. whether they
 * have both been cloned from the same version of the same context
1148 1149 1150 1151
 * and they both have the same number of enabled events.
 * If the number of enabled events is the same, then the set
 * of enabled events should be the same, because these are both
 * inherited contexts, therefore we can't access individual events
1152
 * in them directly with an fd; we can only enable/disable all
1153
 * events via prctl, or enable/disable all events in a family
1154 1155
 * via ioctl, which will have the same effect on both contexts.
 */
1156 1157
static int context_equiv(struct perf_event_context *ctx1,
			 struct perf_event_context *ctx2)
1158 1159
{
	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1160
		&& ctx1->parent_gen == ctx2->parent_gen
1161
		&& !ctx1->pin_count && !ctx2->pin_count;
1162 1163
}

1164 1165
static void __perf_event_sync_stat(struct perf_event *event,
				     struct perf_event *next_event)
1166 1167 1168
{
	u64 value;

1169
	if (!event->attr.inherit_stat)
1170 1171 1172
		return;

	/*
1173
	 * Update the event value, we cannot use perf_event_read()
1174 1175
	 * because we're in the middle of a context switch and have IRQs
	 * disabled, which upsets smp_call_function_single(), however
1176
	 * we know the event must be on the current CPU, therefore we
1177 1178
	 * don't need to use it.
	 */
1179 1180
	switch (event->state) {
	case PERF_EVENT_STATE_ACTIVE:
1181 1182
		event->pmu->read(event);
		/* fall-through */
1183

1184 1185
	case PERF_EVENT_STATE_INACTIVE:
		update_event_times(event);
1186 1187 1188 1189 1190 1191 1192
		break;

	default:
		break;
	}

	/*
1193
	 * In order to keep per-task stats reliable we need to flip the event
1194 1195
	 * values when we flip the contexts.
	 */
1196 1197 1198
	value = local64_read(&next_event->count);
	value = local64_xchg(&event->count, value);
	local64_set(&next_event->count, value);
1199

1200 1201
	swap(event->total_time_enabled, next_event->total_time_enabled);
	swap(event->total_time_running, next_event->total_time_running);
1202

1203
	/*
1204
	 * Since we swizzled the values, update the user visible data too.
1205
	 */
1206 1207
	perf_event_update_userpage(event);
	perf_event_update_userpage(next_event);
1208 1209 1210 1211 1212
}

#define list_next_entry(pos, member) \
	list_entry(pos->member.next, typeof(*pos), member)

1213 1214
static void perf_event_sync_stat(struct perf_event_context *ctx,
				   struct perf_event_context *next_ctx)
1215
{
1216
	struct perf_event *event, *next_event;
1217 1218 1219 1220

	if (!ctx->nr_stat)
		return;

1221 1222
	update_context_time(ctx);

1223 1224
	event = list_first_entry(&ctx->event_list,
				   struct perf_event, event_entry);
1225

1226 1227
	next_event = list_first_entry(&next_ctx->event_list,
					struct perf_event, event_entry);
1228

1229 1230
	while (&event->event_entry != &ctx->event_list &&
	       &next_event->event_entry != &next_ctx->event_list) {
1231

1232
		__perf_event_sync_stat(event, next_event);
1233

1234 1235
		event = list_next_entry(event, event_entry);
		next_event = list_next_entry(next_event, event_entry);
1236 1237 1238
	}
}

void perf_event_context_sched_out(struct task_struct *task, int ctxn,
				  struct task_struct *next)
{
	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1243 1244
	struct perf_event_context *next_ctx;
	struct perf_event_context *parent;
	struct perf_cpu_context *cpuctx;
1246
	int do_switch = 1;

	if (likely(!ctx))
		return;
1250

	cpuctx = __get_cpu_context(ctx);
	if (!cpuctx->task_ctx)
		return;

1255 1256
	rcu_read_lock();
	parent = rcu_dereference(ctx->parent_ctx);
	next_ctx = next->perf_event_ctxp[ctxn];
1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268
	if (parent && next_ctx &&
	    rcu_dereference(next_ctx->parent_ctx) == parent) {
		/*
		 * Looks like the two contexts are clones, so we might be
		 * able to optimize the context switch.  We lock both
		 * contexts and check that they are clones under the
		 * lock (including re-checking that neither has been
		 * uncloned in the meantime).  It doesn't matter which
		 * order we take the locks because no other cpu could
		 * be trying to lock both of these tasks.
		 */
1269 1270
		raw_spin_lock(&ctx->lock);
		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1271
		if (context_equiv(ctx, next_ctx)) {
1272 1273
			/*
			 * XXX do we need a memory barrier of sorts
1274
			 * wrt to rcu_dereference() of perf_event_ctxp
1275
			 */
			task->perf_event_ctxp[ctxn] = next_ctx;
			next->perf_event_ctxp[ctxn] = ctx;
1278 1279 1280
			ctx->task = next;
			next_ctx->task = task;
			do_switch = 0;
1281

1282
			perf_event_sync_stat(ctx, next_ctx);
1283
		}
1284 1285
		raw_spin_unlock(&next_ctx->lock);
		raw_spin_unlock(&ctx->lock);
1286
	}
1287
	rcu_read_unlock();
1288

1289
	if (do_switch) {
1290
		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1291 1292
		cpuctx->task_ctx = NULL;
	}
}

#define for_each_task_context_nr(ctxn)					\
	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)

/*
 * Called from scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If an NMI hits, then it will
 * not restart the event.
 */
void perf_event_task_sched_out(struct task_struct *task,
			       struct task_struct *next)
{
	int ctxn;

	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);

	for_each_task_context_nr(ctxn)
		perf_event_context_sched_out(task, ctxn, next);
}

1320 1321
static void task_ctx_sched_out(struct perf_event_context *ctx,
			       enum event_type_t event_type)
1322
{
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1324

1325 1326
	if (!cpuctx->task_ctx)
		return;
1327 1328 1329 1330

	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
		return;

1331
	ctx_sched_out(ctx, cpuctx, event_type);
1332 1333 1334
	cpuctx->task_ctx = NULL;
}

1335 1336 1337
/*
 * Called with IRQs disabled
 */
1338
static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1339
{
1340 1341 1342 1343 1344 1345 1346 1347 1348 1349
	task_ctx_sched_out(ctx, EVENT_ALL);
}

/*
 * Called with IRQs disabled
 */
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type)
{
	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1350 1351
}

1352
static void
1353
ctx_pinned_sched_in(struct perf_event_context *ctx,
1354
		    struct perf_cpu_context *cpuctx)
T
1356
	struct perf_event *event;
T
1358 1359
	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
		if (event->state <= PERF_EVENT_STATE_OFF)
1360
			continue;
1361
		if (event->cpu != -1 && event->cpu != smp_processor_id())
1362 1363
			continue;

1364
		if (group_can_go_on(event, cpuctx, 1))
1365
			group_sched_in(event, cpuctx, ctx);
1366 1367 1368 1369 1370

		/*
		 * If this pinned group hasn't been scheduled,
		 * put it in error state.
		 */
1371 1372 1373
		if (event->state == PERF_EVENT_STATE_INACTIVE) {
			update_group_times(event);
			event->state = PERF_EVENT_STATE_ERROR;
1374
		}
1375
	}
1376 1377 1378 1379
}

static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
1380
		      struct perf_cpu_context *cpuctx)
1381 1382 1383
{
	struct perf_event *event;
	int can_add_hw = 1;
1384

1385 1386 1387
	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
		/* Ignore events in OFF or ERROR state */
		if (event->state <= PERF_EVENT_STATE_OFF)
1388
			continue;
1389 1390
		/*
		 * Listen to the 'cpu' scheduling filter constraint
1391
		 * of events:
1392
		 */
1393
		if (event->cpu != -1 && event->cpu != smp_processor_id())
			continue;

		if (group_can_go_on(event, cpuctx, can_add_hw)) {
1397
			if (group_sched_in(event, cpuctx, ctx))
1398
				can_add_hw = 0;
		}
	}
1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419
}

static void
ctx_sched_in(struct perf_event_context *ctx,
	     struct perf_cpu_context *cpuctx,
	     enum event_type_t event_type)
{
	raw_spin_lock(&ctx->lock);
	ctx->is_active = 1;
	if (likely(!ctx->nr_events))
		goto out;

	ctx->timestamp = perf_clock();

	/*
	 * First go through the list and put on any pinned groups
	 * in order to give them the best chance of going on.
	 */
	if (event_type & EVENT_PINNED)
1420
		ctx_pinned_sched_in(ctx, cpuctx);
1421 1422 1423

	/* Then walk through the lower prio flexible groups */
	if (event_type & EVENT_FLEXIBLE)
1424
		ctx_flexible_sched_in(ctx, cpuctx);
1425

out:
1427
	raw_spin_unlock(&ctx->lock);
1428 1429
}

1430 1431 1432 1433 1434 1435 1436 1437
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
			     enum event_type_t event_type)
{
	struct perf_event_context *ctx = &cpuctx->ctx;

	ctx_sched_in(ctx, cpuctx, event_type);
}

static void task_ctx_sched_in(struct perf_event_context *ctx,
1439 1440
			      enum event_type_t event_type)
{
	struct perf_cpu_context *cpuctx;
1442

	cpuctx = __get_cpu_context(ctx);
1444 1445
	if (cpuctx->task_ctx == ctx)
		return;

1447 1448 1449
	ctx_sched_in(ctx, cpuctx, event_type);
	cpuctx->task_ctx = ctx;
}

void perf_event_context_sched_in(struct perf_event_context *ctx)
1452
{
	struct perf_cpu_context *cpuctx;
1454

	cpuctx = __get_cpu_context(ctx);
1456 1457 1458
	if (cpuctx->task_ctx == ctx)
		return;

	perf_pmu_disable(ctx->pmu);
1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471
	/*
	 * We want to keep the following priority order:
	 * cpu pinned (that don't need to move), task pinned,
	 * cpu flexible, task flexible.
	 */
	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);

	ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);

	cpuctx->task_ctx = ctx;
1472

1473 1474 1475 1476
	/*
	 * Since these rotations are per-cpu, we need to ensure the
	 * cpu-context we got scheduled on is actually rotating.
	 */
	perf_pmu_rotate_start(ctx->pmu);
	perf_pmu_enable(ctx->pmu);
1479 1480
}

/*
 * Called from scheduler to add the events of the current task
 * with interrupts disabled.
 *
 * We restore the event value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of event _before_
 * accessing the event control register. If an NMI hits, then it will
 * keep the event running.
 */
void perf_event_task_sched_in(struct task_struct *task)
{
	struct perf_event_context *ctx;
	int ctxn;

	for_each_task_context_nr(ctxn) {
		ctx = task->perf_event_ctxp[ctxn];
		if (likely(!ctx))
			continue;

		perf_event_context_sched_in(ctx);
	}
1504 1505
}

1506 1507
#define MAX_INTERRUPTS (~0ULL)

1508
static void perf_log_throttle(struct perf_event *event, int enable);
1509

1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
	u64 frequency = event->attr.sample_freq;
	u64 sec = NSEC_PER_SEC;
	u64 divisor, dividend;

	int count_fls, nsec_fls, frequency_fls, sec_fls;

	count_fls = fls64(count);
	nsec_fls = fls64(nsec);
	frequency_fls = fls64(frequency);
	sec_fls = 30;

	/*
	 * We got @count in @nsec, with a target of sample_freq HZ
	 * the target period becomes:
	 *
	 *             @count * 10^9
	 * period = -------------------
	 *          @nsec * sample_freq
	 *
	 */

	/*
	 * Reduce accuracy by one bit such that @a and @b converge
	 * to a similar magnitude.
	 */
#define REDUCE_FLS(a, b) 		\
do {					\
	if (a##_fls > b##_fls) {	\
		a >>= 1;		\
		a##_fls--;		\
	} else {			\
		b >>= 1;		\
		b##_fls--;		\
	}				\
} while (0)

	/*
	 * Reduce accuracy until either term fits in a u64, then proceed with
	 * the other, so that finally we can do a u64/u64 division.
	 */
	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
		REDUCE_FLS(nsec, frequency);
		REDUCE_FLS(sec, count);
	}

	if (count_fls + sec_fls > 64) {
		divisor = nsec * frequency;

		while (count_fls + sec_fls > 64) {
			REDUCE_FLS(count, sec);
			divisor >>= 1;
		}

		dividend = count * sec;
	} else {
		dividend = count * sec;

		while (nsec_fls + frequency_fls > 64) {
			REDUCE_FLS(nsec, frequency);
			dividend >>= 1;
		}

		divisor = nsec * frequency;
	}

1577 1578 1579
	if (!divisor)
		return dividend;

1580 1581 1582 1583
	return div64_u64(dividend, divisor);
}
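
/*
 * Worked example (illustrative only): with attr.sample_freq = 1000 and a
 * measurement of count = 2,000,000 events over nsec = 10,000,000 ns, the
 * formula above yields
 *
 *	period = 2e6 * 1e9 / (1e7 * 1000) = 200,000
 *
 * i.e. roughly one sample is requested every 200,000 events.
 */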

static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1584
{
1585
	struct hw_perf_event *hwc = &event->hw;
1586
	s64 period, sample_period;
1587 1588
	s64 delta;

1589
	period = perf_calculate_period(event, nsec, count);
1590 1591 1592 1593 1594 1595 1596 1597 1598 1599

	delta = (s64)(period - hwc->sample_period);
	delta = (delta + 7) / 8; /* low pass filter */

	sample_period = hwc->sample_period + delta;

	if (!sample_period)
		sample_period = 1;

	hwc->sample_period = sample_period;
1600

1601
	if (local64_read(&hwc->period_left) > 8*sample_period) {
		event->pmu->stop(event, PERF_EF_UPDATE);
1603
		local64_set(&hwc->period_left, 0);
		event->pmu->start(event, PERF_EF_RELOAD);
1605
	}
1606 1607
}
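
/*
 * Worked example (illustrative only): if the current sample_period is
 * 100,000 and perf_calculate_period() asks for 180,000, the low-pass
 * filter above only moves an eighth of the way there on this tick:
 *
 *	delta = (180000 - 100000 + 7) / 8 = 10000
 *	new sample_period = 110,000
 */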

1608
static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1609
{
1610 1611
	struct perf_event *event;
	struct hw_perf_event *hwc;
1612 1613
	u64 interrupts, now;
	s64 delta;
1614

1615
	raw_spin_lock(&ctx->lock);
1616
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1617
		if (event->state != PERF_EVENT_STATE_ACTIVE)
1618 1619
			continue;

1620 1621 1622
		if (event->cpu != -1 && event->cpu != smp_processor_id())
			continue;

1623
		hwc = &event->hw;
1624 1625 1626

		interrupts = hwc->interrupts;
		hwc->interrupts = 0;
1627

1628
		/*
1629
		 * unthrottle events on the tick
1630
		 */
1631
		if (interrupts == MAX_INTERRUPTS) {
1632
			perf_log_throttle(event, 1);
			event->pmu->start(event, 0);
1634 1635
		}

1636
		if (!event->attr.freq || !event->attr.sample_freq)
1637 1638
			continue;

1639
		event->pmu->read(event);
1640
		now = local64_read(&event->count);
1641 1642
		delta = now - hwc->freq_count_stamp;
		hwc->freq_count_stamp = now;
1643

1644
		if (delta > 0)
1645
			perf_adjust_period(event, period, delta);
1646
	}
1647
	raw_spin_unlock(&ctx->lock);
1648 1649
}

1650
/*
1651
 * Round-robin a context's events:
1652
 */
1653
static void rotate_ctx(struct perf_event_context *ctx)
{
1655
	raw_spin_lock(&ctx->lock);
1656 1657 1658 1659

	/* Rotate the first entry last of non-pinned groups */
	list_rotate_left(&ctx->flexible_groups);

1660
	raw_spin_unlock(&ctx->lock);
1661 1662
}

1663
/*
1664 1665 1666
 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
 * because they're strictly cpu affine and rotate_start is called with IRQs
 * disabled, while rotate_context is called from IRQ context.
1667
 */
1668
static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1669
{
1670
	u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
	struct perf_event_context *ctx = NULL;
1672
	int rotate = 0, remove = 1;
1673

1674
	if (cpuctx->ctx.nr_events) {
1675
		remove = 0;
1676 1677 1678
		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
			rotate = 1;
	}
1679

	ctx = cpuctx->task_ctx;
1681
	if (ctx && ctx->nr_events) {
1682
		remove = 0;
1683 1684 1685
		if (ctx->nr_events != ctx->nr_active)
			rotate = 1;
	}
1686

	perf_pmu_disable(cpuctx->ctx.pmu);
1688
	perf_ctx_adjust_freq(&cpuctx->ctx, interval);
1689
	if (ctx)
1690
		perf_ctx_adjust_freq(ctx, interval);
1691

1692
	if (!rotate)
1693
		goto done;
1694

1695
	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1696
	if (ctx)
1697
		task_ctx_sched_out(ctx, EVENT_FLEXIBLE);

1699
	rotate_ctx(&cpuctx->ctx);
1700 1701
	if (ctx)
		rotate_ctx(ctx);
1702

1703
	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1704
	if (ctx)
		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1706 1707

done:
1708 1709 1710
	if (remove)
		list_del_init(&cpuctx->rotation_list);

	perf_pmu_enable(cpuctx->ctx.pmu);
1712 1713 1714 1715 1716 1717
}

void perf_event_task_tick(void)
{
	struct list_head *head = &__get_cpu_var(rotation_list);
	struct perf_cpu_context *cpuctx, *tmp;
1718

1719 1720 1721 1722 1723 1724 1725
	WARN_ON(!irqs_disabled());

	list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
		if (cpuctx->jiffies_interval == 1 ||
				!(jiffies % cpuctx->jiffies_interval))
			perf_rotate_context(cpuctx);
	}
}

1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742
static int event_enable_on_exec(struct perf_event *event,
				struct perf_event_context *ctx)
{
	if (!event->attr.enable_on_exec)
		return 0;

	event->attr.enable_on_exec = 0;
	if (event->state >= PERF_EVENT_STATE_INACTIVE)
		return 0;

	__perf_event_mark_enabled(event, ctx);

	return 1;
}

1743
/*
1744
 * Enable all of a task's events that have been marked enable-on-exec.
1745 1746
 * This expects task == current.
 */
static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1748
{
1749
	struct perf_event *event;
1750 1751
	unsigned long flags;
	int enabled = 0;
1752
	int ret;
1753 1754

	local_irq_save(flags);
1755
	if (!ctx || !ctx->nr_events)
1756 1757
		goto out;

	task_ctx_sched_out(ctx, EVENT_ALL);
1759

1760
	raw_spin_lock(&ctx->lock);
1761

1762 1763 1764 1765 1766 1767 1768 1769 1770 1771
	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
		ret = event_enable_on_exec(event, ctx);
		if (ret)
			enabled = 1;
	}

	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
		ret = event_enable_on_exec(event, ctx);
		if (ret)
			enabled = 1;
1772 1773 1774
	}

	/*
1775
	 * Unclone this context if we enabled any event.
1776
	 */
1777 1778
	if (enabled)
		unclone_ctx(ctx);
1779

1780
	raw_spin_unlock(&ctx->lock);
1781

	perf_event_context_sched_in(ctx);
out:
1784 1785 1786
	local_irq_restore(flags);
}

/*
1788
 * Cross CPU call to read the hardware event
 */
1790
static void __perf_event_read(void *info)
{
1792 1793
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

1796 1797 1798 1799
	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu.  If not it has been
	 * scheduled out before the smp call arrived.  In that case
1800 1801
	 * event->count would have been updated to a recent sample
	 * when the event was scheduled out.
1802 1803 1804 1805
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

1806
	raw_spin_lock(&ctx->lock);
	update_context_time(ctx);
1808
	update_event_times(event);
1809
	raw_spin_unlock(&ctx->lock);

	event->pmu->read(event);
}

static inline u64 perf_event_count(struct perf_event *event)
{
1816
	return local64_read(&event->count) + atomic64_read(&event->child_count);
}

1819
static u64 perf_event_read(struct perf_event *event)
{
	/*
1822 1823
	 * If event is enabled and currently active on a CPU, update the
	 * value in the event structure:
	 */
1825 1826 1827 1828
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
		smp_call_function_single(event->oncpu,
					 __perf_event_read, event, 1);
	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
		struct perf_event_context *ctx = event->ctx;
		unsigned long flags;

1832
		raw_spin_lock_irqsave(&ctx->lock, flags);
1833 1834 1835 1836 1837 1838 1839
		/*
		 * may read while context is not active
		 * (e.g., thread is blocked), in that case
		 * we cannot update context time
		 */
		if (ctx->is_active)
			update_context_time(ctx);
1840
		update_event_times(event);
1841
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
T
Thomas Gleixner 已提交
1842 1843
	}

P
Peter Zijlstra 已提交
1844
	return perf_event_count(event);
T
Thomas Gleixner 已提交
1845 1846
}

/*
 * Callchain support
 */

struct callchain_cpus_entries {
	struct rcu_head			rcu_head;
	struct perf_callchain_entry	*cpu_entries[0];
};

static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
struct callchain_cpus_entries *callchain_cpus_entries;


__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
				  struct pt_regs *regs)
{
}

__weak void perf_callchain_user(struct perf_callchain_entry *entry,
				struct pt_regs *regs)
{
}

static void release_callchain_buffers_rcu(struct rcu_head *head)
{
	struct callchain_cpus_entries *entries;
	int cpu;

	entries = container_of(head, struct callchain_cpus_entries, rcu_head);

	for_each_possible_cpu(cpu)
		kfree(entries->cpu_entries[cpu]);

	kfree(entries);
}

static void release_callchain_buffers(void)
{
	struct callchain_cpus_entries *entries;

	entries = callchain_cpus_entries;
	rcu_assign_pointer(callchain_cpus_entries, NULL);
	call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
}

static int alloc_callchain_buffers(void)
{
	int cpu;
	int size;
	struct callchain_cpus_entries *entries;

	/*
	 * We can't use the percpu allocation API for data that can be
	 * accessed from NMI. Use a temporary manual per cpu allocation
	 * until that gets sorted out.
	 */
	size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
		num_possible_cpus();

	entries = kzalloc(size, GFP_KERNEL);
	if (!entries)
		return -ENOMEM;

	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;

	for_each_possible_cpu(cpu) {
		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
							 cpu_to_node(cpu));
		if (!entries->cpu_entries[cpu])
			goto fail;
	}

	rcu_assign_pointer(callchain_cpus_entries, entries);

	return 0;

fail:
	for_each_possible_cpu(cpu)
		kfree(entries->cpu_entries[cpu]);
	kfree(entries);

	return -ENOMEM;
}

static int get_callchain_buffers(void)
{
	int err = 0;
	int count;

	mutex_lock(&callchain_mutex);

	count = atomic_inc_return(&nr_callchain_events);
	if (WARN_ON_ONCE(count < 1)) {
		err = -EINVAL;
		goto exit;
	}

	if (count > 1) {
		/* If the allocation failed, give up */
		if (!callchain_cpus_entries)
			err = -ENOMEM;
		goto exit;
	}

	err = alloc_callchain_buffers();
	if (err)
		release_callchain_buffers();
exit:
	mutex_unlock(&callchain_mutex);

	return err;
}

static void put_callchain_buffers(void)
{
	if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
		release_callchain_buffers();
		mutex_unlock(&callchain_mutex);
	}
}
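
/*
 * Each context level gets its own recursion slot (0 = process,
 * 1 = softirq, 2 = hardirq, 3 = NMI), so a callchain capture that
 * interrupts one running at a lower level still finds a free entry,
 * while true recursion at the same level is rejected in
 * get_recursion_context() below.
 */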

static int get_recursion_context(int *recursion)
{
	int rctx;

	if (in_nmi())
		rctx = 3;
	else if (in_irq())
		rctx = 2;
	else if (in_softirq())
		rctx = 1;
	else
		rctx = 0;

	if (recursion[rctx])
		return -1;

	recursion[rctx]++;
	barrier();

	return rctx;
}

static inline void put_recursion_context(int *recursion, int rctx)
{
	barrier();
	recursion[rctx]--;
}

static struct perf_callchain_entry *get_callchain_entry(int *rctx)
{
	int cpu;
	struct callchain_cpus_entries *entries;

	*rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
	if (*rctx == -1)
		return NULL;

	entries = rcu_dereference(callchain_cpus_entries);
	if (!entries)
		return NULL;

	cpu = smp_processor_id();

	return &entries->cpu_entries[cpu][*rctx];
}

static void
put_callchain_entry(int rctx)
{
	put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
}

static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
	int rctx;
	struct perf_callchain_entry *entry;


	entry = get_callchain_entry(&rctx);
	if (rctx == -1)
		return NULL;

	if (!entry)
		goto exit_put;

	entry->nr = 0;

	if (!user_mode(regs)) {
		perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
		perf_callchain_kernel(entry, regs);
		if (current->mm)
			regs = task_pt_regs(current);
		else
			regs = NULL;
	}

	if (regs) {
		perf_callchain_store(entry, PERF_CONTEXT_USER);
		perf_callchain_user(entry, regs);
	}

exit_put:
	put_callchain_entry(rctx);

	return entry;
}
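
/*
 * Note: the entry filled in above is a flat array of u64s -- a
 * PERF_CONTEXT_KERNEL marker followed by kernel return addresses,
 * then (when user registers are available) a PERF_CONTEXT_USER
 * marker followed by user-space addresses. Consumers can use these
 * markers to split the chain back into its two halves.
 */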

/*
 * Initialize the perf_event context in a task_struct:
 */
static void __perf_event_init_context(struct perf_event_context *ctx)
{
	raw_spin_lock_init(&ctx->lock);
	mutex_init(&ctx->mutex);
	INIT_LIST_HEAD(&ctx->pinned_groups);
	INIT_LIST_HEAD(&ctx->flexible_groups);
	INIT_LIST_HEAD(&ctx->event_list);
	atomic_set(&ctx->refcount, 1);
}

static struct perf_event_context *
alloc_perf_context(struct pmu *pmu, struct task_struct *task)
{
	struct perf_event_context *ctx;

	ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
	if (!ctx)
		return NULL;

	__perf_event_init_context(ctx);
	if (task) {
		ctx->task = task;
		get_task_struct(task);
	}
	ctx->pmu = pmu;

	return ctx;
}

static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
	struct task_struct *task;
	int err;

	rcu_read_lock();
	if (!vpid)
		task = current;
	else
		task = find_task_by_vpid(vpid);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	if (!task)
		return ERR_PTR(-ESRCH);

	/*
	 * Can't attach events to a dying task.
	 */
	err = -ESRCH;
	if (task->flags & PF_EXITING)
		goto errout;

	/* Reuse ptrace permission checks for now. */
	err = -EACCES;
	if (!ptrace_may_access(task, PTRACE_MODE_READ))
		goto errout;

	return task;
errout:
	put_task_struct(task);
	return ERR_PTR(err);

}

static struct perf_event_context *
find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
{
	struct perf_event_context *ctx;
	struct perf_cpu_context *cpuctx;
	unsigned long flags;
	int ctxn, err;

	if (!task && cpu != -1) {
		/* Must be root to operate on a CPU event: */
		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
			return ERR_PTR(-EACCES);

		if (cpu < 0 || cpu >= nr_cpumask_bits)
			return ERR_PTR(-EINVAL);

		/*
		 * We could be clever and allow attaching an event to an
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
		 */
		if (!cpu_online(cpu))
			return ERR_PTR(-ENODEV);

		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
		ctx = &cpuctx->ctx;
		get_ctx(ctx);

		return ctx;
	}

	err = -EINVAL;
	ctxn = pmu->task_ctx_nr;
	if (ctxn < 0)
		goto errout;

retry:
	ctx = perf_lock_task_context(task, ctxn, &flags);
	if (ctx) {
		unclone_ctx(ctx);
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}

	if (!ctx) {
		ctx = alloc_perf_context(pmu, task);
		err = -ENOMEM;
		if (!ctx)
			goto errout;

		get_ctx(ctx);

		if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
			/*
			 * We raced with some other task; use
			 * the context they set.
			 */
			put_task_struct(task);
			kfree(ctx);
			goto retry;
		}
	}

	put_task_struct(task);
	return ctx;

errout:
	put_task_struct(task);
	return ERR_PTR(err);
}

static void perf_event_free_filter(struct perf_event *event);

2198
static void free_event_rcu(struct rcu_head *head)
P
Peter Zijlstra 已提交
2199
{
2200
	struct perf_event *event;
P
Peter Zijlstra 已提交
2201

2202 2203 2204
	event = container_of(head, struct perf_event, rcu_head);
	if (event->ns)
		put_pid_ns(event->ns);
L
Li Zefan 已提交
2205
	perf_event_free_filter(event);
2206
	kfree(event);
P
Peter Zijlstra 已提交
2207 2208
}

2209
static void perf_pending_sync(struct perf_event *event);
2210
static void perf_buffer_put(struct perf_buffer *buffer);
2211

2212
static void free_event(struct perf_event *event)
2213
{
2214
	perf_pending_sync(event);
2215

2216 2217
	if (!event->parent) {
		atomic_dec(&nr_events);
2218
		if (event->attr.mmap || event->attr.mmap_data)
2219 2220 2221 2222 2223
			atomic_dec(&nr_mmap_events);
		if (event->attr.comm)
			atomic_dec(&nr_comm_events);
		if (event->attr.task)
			atomic_dec(&nr_task_events);
2224 2225
		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
			put_callchain_buffers();
2226
	}
2227

2228 2229 2230
	if (event->buffer) {
		perf_buffer_put(event->buffer);
		event->buffer = NULL;
2231 2232
	}

2233 2234
	if (event->destroy)
		event->destroy(event);
2235

P
Peter Zijlstra 已提交
2236 2237 2238
	if (event->ctx)
		put_ctx(event->ctx);

2239
	call_rcu(&event->rcu_head, free_event_rcu);
2240 2241
}

2242
int perf_event_release_kernel(struct perf_event *event)
T
Thomas Gleixner 已提交
2243
{
2244
	struct perf_event_context *ctx = event->ctx;
T
Thomas Gleixner 已提交
2245

2246 2247 2248 2249 2250 2251
	/*
	 * Remove from the PMU, can't get re-enabled since we got
	 * here because the last ref went.
	 */
	perf_event_disable(event);

2252
	WARN_ON_ONCE(ctx->parent_ctx);
2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265
	/*
	 * There are two ways this annotation is useful:
	 *
	 *  1) there is a lock recursion from perf_event_exit_task
	 *     see the comment there.
	 *
	 *  2) there is a lock-inversion with mmap_sem through
	 *     perf_event_read_group(), which takes faults while
	 *     holding ctx->mutex, however this is called after
	 *     the last filedesc died, so there is no possibility
	 *     to trigger the AB-BA case.
	 */
	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2266
	raw_spin_lock_irq(&ctx->lock);
2267
	perf_group_detach(event);
2268 2269
	list_del_event(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
2270
	mutex_unlock(&ctx->mutex);
T
Thomas Gleixner 已提交
2271

2272 2273 2274 2275
	mutex_lock(&event->owner->perf_event_mutex);
	list_del_init(&event->owner_entry);
	mutex_unlock(&event->owner->perf_event_mutex);
	put_task_struct(event->owner);
2276

2277
	free_event(event);
T
Thomas Gleixner 已提交
2278 2279 2280

	return 0;
}
2281
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
T
Thomas Gleixner 已提交
2282

2283 2284 2285 2286
/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
2287
{
2288
	struct perf_event *event = file->private_data;
2289

2290
	file->private_data = NULL;
2291

2292
	return perf_event_release_kernel(event);
2293 2294
}

2295
static int perf_event_read_size(struct perf_event *event)
2296 2297 2298 2299 2300
{
	int entry = sizeof(u64); /* value */
	int size = 0;
	int nr = 1;

2301
	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2302 2303
		size += sizeof(u64);

2304
	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2305 2306
		size += sizeof(u64);

2307
	if (event->attr.read_format & PERF_FORMAT_ID)
2308 2309
		entry += sizeof(u64);

2310 2311
	if (event->attr.read_format & PERF_FORMAT_GROUP) {
		nr += event->group_leader->nr_siblings;
2312 2313 2314 2315 2316 2317 2318 2319
		size += sizeof(u64);
	}

	size += entry * nr;

	return size;
}
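
/*
 * Worked example: a group leader with one sibling and
 * read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID gives nr = 2 and
 * entry = 16 bytes (value + id), so the total read size is
 * 8 (the nr field) + 2 * 16 = 40 bytes.
 */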

u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
2321
{
2322
	struct perf_event *child;
2323 2324
	u64 total = 0;

2325 2326 2327
	*enabled = 0;
	*running = 0;

2328
	mutex_lock(&event->child_mutex);
2329
	total += perf_event_read(event);
2330 2331 2332 2333 2334 2335
	*enabled += event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
	*running += event->total_time_running +
			atomic64_read(&event->child_total_time_running);

	list_for_each_entry(child, &event->child_list, child_list) {
2336
		total += perf_event_read(child);
2337 2338 2339
		*enabled += child->total_time_enabled;
		*running += child->total_time_running;
	}
2340
	mutex_unlock(&event->child_mutex);
2341 2342 2343

	return total;
}
2344
EXPORT_SYMBOL_GPL(perf_event_read_value);
2345

2346
static int perf_event_read_group(struct perf_event *event,
2347 2348
				   u64 read_format, char __user *buf)
{
2349
	struct perf_event *leader = event->group_leader, *sub;
2350 2351
	int n = 0, size = 0, ret = -EFAULT;
	struct perf_event_context *ctx = leader->ctx;
2352
	u64 values[5];
2353
	u64 count, enabled, running;
2354

2355
	mutex_lock(&ctx->mutex);
2356
	count = perf_event_read_value(leader, &enabled, &running);
2357 2358

	values[n++] = 1 + leader->nr_siblings;
2359 2360 2361 2362
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		values[n++] = enabled;
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		values[n++] = running;
2363 2364 2365
	values[n++] = count;
	if (read_format & PERF_FORMAT_ID)
		values[n++] = primary_event_id(leader);
2366 2367 2368 2369

	size = n * sizeof(u64);

	if (copy_to_user(buf, values, size))
2370
		goto unlock;
2371

2372
	ret = size;
2373

2374
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2375
		n = 0;
2376

2377
		values[n++] = perf_event_read_value(sub, &enabled, &running);
2378 2379 2380 2381 2382
		if (read_format & PERF_FORMAT_ID)
			values[n++] = primary_event_id(sub);

		size = n * sizeof(u64);

2383
		if (copy_to_user(buf + ret, values, size)) {
2384 2385 2386
			ret = -EFAULT;
			goto unlock;
		}
2387 2388

		ret += size;
2389
	}
2390 2391
unlock:
	mutex_unlock(&ctx->mutex);
2392

2393
	return ret;
2394 2395
}

2396
static int perf_event_read_one(struct perf_event *event,
2397 2398
				 u64 read_format, char __user *buf)
{
2399
	u64 enabled, running;
2400 2401 2402
	u64 values[4];
	int n = 0;

2403 2404 2405 2406 2407
	values[n++] = perf_event_read_value(event, &enabled, &running);
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		values[n++] = enabled;
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		values[n++] = running;
2408
	if (read_format & PERF_FORMAT_ID)
2409
		values[n++] = primary_event_id(event);
2410 2411 2412 2413 2414 2415 2416

	if (copy_to_user(buf, values, n * sizeof(u64)))
		return -EFAULT;

	return n * sizeof(u64);
}

T
Thomas Gleixner 已提交
2417
/*
2418
 * Read the performance event - simple non blocking version for now
T
Thomas Gleixner 已提交
2419 2420
 */
static ssize_t
2421
perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
T
Thomas Gleixner 已提交
2422
{
2423
	u64 read_format = event->attr.read_format;
2424
	int ret;
T
Thomas Gleixner 已提交
2425

	/*
	 * Return end-of-file for a read on an event that is in
	 * error state (i.e. because it was pinned but it couldn't be
	 * scheduled on to the CPU at some point).
	 */
2431
	if (event->state == PERF_EVENT_STATE_ERROR)
2432 2433
		return 0;

2434
	if (count < perf_event_read_size(event))
2435 2436
		return -ENOSPC;

2437
	WARN_ON_ONCE(event->ctx->parent_ctx);
2438
	if (read_format & PERF_FORMAT_GROUP)
2439
		ret = perf_event_read_group(event, read_format, buf);
2440
	else
2441
		ret = perf_event_read_one(event, read_format, buf);
T
Thomas Gleixner 已提交
2442

2443
	return ret;
T
Thomas Gleixner 已提交
2444 2445 2446 2447 2448
}

static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
2449
	struct perf_event *event = file->private_data;
T
Thomas Gleixner 已提交
2450

2451
	return perf_read_hw(event, buf, count);
T
Thomas Gleixner 已提交
2452 2453 2454 2455
}

static unsigned int perf_poll(struct file *file, poll_table *wait)
{
2456
	struct perf_event *event = file->private_data;
2457
	struct perf_buffer *buffer;
2458
	unsigned int events = POLL_HUP;
P
Peter Zijlstra 已提交
2459 2460

	rcu_read_lock();
2461 2462 2463
	buffer = rcu_dereference(event->buffer);
	if (buffer)
		events = atomic_xchg(&buffer->poll, 0);
P
Peter Zijlstra 已提交
2464
	rcu_read_unlock();
T
Thomas Gleixner 已提交
2465

2466
	poll_wait(file, &event->waitq, wait);
T
Thomas Gleixner 已提交
2467 2468 2469 2470

	return events;
}

2471
static void perf_event_reset(struct perf_event *event)
2472
{
2473
	(void)perf_event_read(event);
2474
	local64_set(&event->count, 0);
2475
	perf_event_update_userpage(event);
P
Peter Zijlstra 已提交
2476 2477
}

2478
/*
2479 2480 2481 2482
 * Holding the top-level event's child_mutex means that any
 * descendant process that has inherited this event will block
 * in sync_child_event if it goes to exit, thus satisfying the
 * task existence requirements of perf_event_enable/disable.
2483
 */
2484 2485
static void perf_event_for_each_child(struct perf_event *event,
					void (*func)(struct perf_event *))
P
Peter Zijlstra 已提交
2486
{
2487
	struct perf_event *child;
P
Peter Zijlstra 已提交
2488

2489 2490 2491 2492
	WARN_ON_ONCE(event->ctx->parent_ctx);
	mutex_lock(&event->child_mutex);
	func(event);
	list_for_each_entry(child, &event->child_list, child_list)
P
Peter Zijlstra 已提交
2493
		func(child);
2494
	mutex_unlock(&event->child_mutex);
P
Peter Zijlstra 已提交
2495 2496
}

2497 2498
static void perf_event_for_each(struct perf_event *event,
				  void (*func)(struct perf_event *))
P
Peter Zijlstra 已提交
2499
{
2500 2501
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *sibling;
P
Peter Zijlstra 已提交
2502

2503 2504
	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
2505
	event = event->group_leader;
2506

2507 2508 2509 2510
	perf_event_for_each_child(event, func);
	func(event);
	list_for_each_entry(sibling, &event->sibling_list, group_entry)
		perf_event_for_each_child(event, func);
2511
	mutex_unlock(&ctx->mutex);
2512 2513
}

2514
static int perf_event_period(struct perf_event *event, u64 __user *arg)
2515
{
2516
	struct perf_event_context *ctx = event->ctx;
2517 2518 2519 2520
	unsigned long size;
	int ret = 0;
	u64 value;

2521
	if (!event->attr.sample_period)
2522 2523 2524 2525 2526 2527 2528 2529 2530
		return -EINVAL;

	size = copy_from_user(&value, arg, sizeof(value));
	if (size != sizeof(value))
		return -EFAULT;

	if (!value)
		return -EINVAL;

2531
	raw_spin_lock_irq(&ctx->lock);
2532 2533
	if (event->attr.freq) {
		if (value > sysctl_perf_event_sample_rate) {
2534 2535 2536 2537
			ret = -EINVAL;
			goto unlock;
		}

2538
		event->attr.sample_freq = value;
2539
	} else {
2540 2541
		event->attr.sample_period = value;
		event->hw.sample_period = value;
2542 2543
	}
unlock:
2544
	raw_spin_unlock_irq(&ctx->lock);
2545 2546 2547 2548

	return ret;
}

static const struct file_operations perf_fops;

static struct perf_event *perf_fget_light(int fd, int *fput_needed)
{
	struct file *file;

	file = fget_light(fd, fput_needed);
	if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_op != &perf_fops) {
		fput_light(file, *fput_needed);
		*fput_needed = 0;
		return ERR_PTR(-EBADF);
	}

	return file->private_data;
}

static int perf_event_set_output(struct perf_event *event,
				 struct perf_event *output_event);
L
Li Zefan 已提交
2570
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2571

2572 2573
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
2574 2575
	struct perf_event *event = file->private_data;
	void (*func)(struct perf_event *);
P
Peter Zijlstra 已提交
2576
	u32 flags = arg;
2577 2578

	switch (cmd) {
2579 2580
	case PERF_EVENT_IOC_ENABLE:
		func = perf_event_enable;
2581
		break;
2582 2583
	case PERF_EVENT_IOC_DISABLE:
		func = perf_event_disable;
2584
		break;
2585 2586
	case PERF_EVENT_IOC_RESET:
		func = perf_event_reset;
2587
		break;
P
Peter Zijlstra 已提交
2588

2589 2590
	case PERF_EVENT_IOC_REFRESH:
		return perf_event_refresh(event, arg);
2591

2592 2593
	case PERF_EVENT_IOC_PERIOD:
		return perf_event_period(event, (u64 __user *)arg);
2594

2595
	case PERF_EVENT_IOC_SET_OUTPUT:
	{
		struct perf_event *output_event = NULL;
		int fput_needed = 0;
		int ret;

		if (arg != -1) {
			output_event = perf_fget_light(arg, &fput_needed);
			if (IS_ERR(output_event))
				return PTR_ERR(output_event);
		}

		ret = perf_event_set_output(event, output_event);
		if (output_event)
			fput_light(output_event->filp, fput_needed);

		return ret;
	}
2613

L
Li Zefan 已提交
2614 2615 2616
	case PERF_EVENT_IOC_SET_FILTER:
		return perf_event_set_filter(event, (void __user *)arg);

2617
	default:
P
Peter Zijlstra 已提交
2618
		return -ENOTTY;
2619
	}
P
Peter Zijlstra 已提交
2620 2621

	if (flags & PERF_IOC_FLAG_GROUP)
2622
		perf_event_for_each(event, func);
P
Peter Zijlstra 已提交
2623
	else
2624
		perf_event_for_each_child(event, func);
P
Peter Zijlstra 已提交
2625 2626

	return 0;
2627 2628
}

2629
int perf_event_task_enable(void)
2630
{
2631
	struct perf_event *event;
2632

2633 2634 2635 2636
	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry)
		perf_event_for_each_child(event, perf_event_enable);
	mutex_unlock(&current->perf_event_mutex);
2637 2638 2639 2640

	return 0;
}

2641
int perf_event_task_disable(void)
2642
{
2643
	struct perf_event *event;
2644

2645 2646 2647 2648
	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry)
		perf_event_for_each_child(event, perf_event_disable);
	mutex_unlock(&current->perf_event_mutex);
2649 2650 2651 2652

	return 0;
}

2653 2654
#ifndef PERF_EVENT_INDEX_OFFSET
# define PERF_EVENT_INDEX_OFFSET 0
I
Ingo Molnar 已提交
2655 2656
#endif

2657
static int perf_event_index(struct perf_event *event)
2658
{
P
Peter Zijlstra 已提交
2659 2660 2661
	if (event->hw.state & PERF_HES_STOPPED)
		return 0;

2662
	if (event->state != PERF_EVENT_STATE_ACTIVE)
2663 2664
		return 0;

2665
	return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2666 2667
}

2668 2669 2670 2671 2672
/*
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
2673
void perf_event_update_userpage(struct perf_event *event)
2674
{
2675
	struct perf_event_mmap_page *userpg;
2676
	struct perf_buffer *buffer;
2677 2678

	rcu_read_lock();
2679 2680
	buffer = rcu_dereference(event->buffer);
	if (!buffer)
2681 2682
		goto unlock;

2683
	userpg = buffer->user_page;
2684

2685 2686 2687 2688 2689
	/*
	 * Disable preemption so as to not let the corresponding user-space
	 * spin too long if we get preempted.
	 */
	preempt_disable();
2690
	++userpg->lock;
2691
	barrier();
2692
	userpg->index = perf_event_index(event);
P
Peter Zijlstra 已提交
2693
	userpg->offset = perf_event_count(event);
2694
	if (event->state == PERF_EVENT_STATE_ACTIVE)
2695
		userpg->offset -= local64_read(&event->hw.prev_count);
2696

2697 2698
	userpg->time_enabled = event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
2699

2700 2701
	userpg->time_running = event->total_time_running +
			atomic64_read(&event->child_total_time_running);
2702

2703
	barrier();
2704
	++userpg->lock;
2705
	preempt_enable();
2706
unlock:
2707
	rcu_read_unlock();
}
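
/*
 * For reference, a minimal user-space counterpart of the lock/unlock
 * sequence above is a seqlock-style retry loop over the mmap()ed
 * perf_event_mmap_page (a sketch only; "pc" is assumed to point at
 * that page):
 *
 *	u32 seq;
 *	u64 index, offset;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		index  = pc->index;
 *		offset = pc->offset;
 *		barrier();
 *	} while (pc->lock != seq);
 */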

static unsigned long perf_data_size(struct perf_buffer *buffer);

static void
perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
{
	long max_size = perf_data_size(buffer);

	if (watermark)
		buffer->watermark = min(max_size, watermark);

	if (!buffer->watermark)
		buffer->watermark = max_size / 2;

	if (flags & PERF_BUFFER_WRITABLE)
		buffer->writable = 1;

	atomic_set(&buffer->refcount, 1);
}

2729
#ifndef CONFIG_PERF_USE_VMALLOC
2730

2731 2732 2733
/*
 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
 */
2734

2735
static struct page *
2736
perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2737
{
2738
	if (pgoff > buffer->nr_pages)
2739
		return NULL;
2740

2741
	if (pgoff == 0)
2742
		return virt_to_page(buffer->user_page);
2743

2744
	return virt_to_page(buffer->data_pages[pgoff - 1]);
2745 2746
}

static void *perf_mmap_alloc_page(int cpu)
{
	struct page *page;
	int node;

	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
	if (!page)
		return NULL;

	return page_address(page);
}

2760
static struct perf_buffer *
2761
perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2762
{
2763
	struct perf_buffer *buffer;
2764 2765 2766
	unsigned long size;
	int i;

2767
	size = sizeof(struct perf_buffer);
2768 2769
	size += nr_pages * sizeof(void *);

2770 2771
	buffer = kzalloc(size, GFP_KERNEL);
	if (!buffer)
2772 2773
		goto fail;

2774
	buffer->user_page = perf_mmap_alloc_page(cpu);
2775
	if (!buffer->user_page)
2776 2777 2778
		goto fail_user_page;

	for (i = 0; i < nr_pages; i++) {
2779
		buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
2780
		if (!buffer->data_pages[i])
2781 2782 2783
			goto fail_data_pages;
	}

2784
	buffer->nr_pages = nr_pages;
2785

2786 2787
	perf_buffer_init(buffer, watermark, flags);

2788
	return buffer;
2789 2790 2791

fail_data_pages:
	for (i--; i >= 0; i--)
2792
		free_page((unsigned long)buffer->data_pages[i]);
2793

2794
	free_page((unsigned long)buffer->user_page);
2795 2796

fail_user_page:
2797
	kfree(buffer);
2798 2799

fail:
2800
	return NULL;
2801 2802
}

2803 2804
static void perf_mmap_free_page(unsigned long addr)
{
K
Kevin Cernekee 已提交
2805
	struct page *page = virt_to_page((void *)addr);
2806 2807 2808 2809 2810

	page->mapping = NULL;
	__free_page(page);
}

2811
static void perf_buffer_free(struct perf_buffer *buffer)
2812 2813 2814
{
	int i;

2815 2816 2817 2818
	perf_mmap_free_page((unsigned long)buffer->user_page);
	for (i = 0; i < buffer->nr_pages; i++)
		perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
	kfree(buffer);
2819 2820
}

2821
static inline int page_order(struct perf_buffer *buffer)
2822 2823 2824 2825
{
	return 0;
}

2826 2827 2828 2829 2830 2831 2832 2833
#else

/*
 * Back perf_mmap() with vmalloc memory.
 *
 * Required for architectures that have d-cache aliasing issues.
 */

2834
static inline int page_order(struct perf_buffer *buffer)
2835
{
2836
	return buffer->page_order;
2837 2838
}

2839
static struct page *
2840
perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2841
{
2842
	if (pgoff > (1UL << page_order(buffer)))
2843 2844
		return NULL;

2845
	return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
2846 2847 2848 2849 2850 2851 2852 2853 2854
}

static void perf_mmap_unmark_page(void *addr)
{
	struct page *page = vmalloc_to_page(addr);

	page->mapping = NULL;
}

2855
static void perf_buffer_free_work(struct work_struct *work)
2856
{
2857
	struct perf_buffer *buffer;
2858 2859 2860
	void *base;
	int i, nr;

2861 2862
	buffer = container_of(work, struct perf_buffer, work);
	nr = 1 << page_order(buffer);
2863

2864
	base = buffer->user_page;
2865 2866 2867 2868
	for (i = 0; i < nr + 1; i++)
		perf_mmap_unmark_page(base + (i * PAGE_SIZE));

	vfree(base);
2869
	kfree(buffer);
2870 2871
}

2872
static void perf_buffer_free(struct perf_buffer *buffer)
2873
{
2874
	schedule_work(&buffer->work);
2875 2876
}

2877
static struct perf_buffer *
2878
perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2879
{
2880
	struct perf_buffer *buffer;
2881 2882 2883
	unsigned long size;
	void *all_buf;

2884
	size = sizeof(struct perf_buffer);
2885 2886
	size += sizeof(void *);

2887 2888
	buffer = kzalloc(size, GFP_KERNEL);
	if (!buffer)
2889 2890
		goto fail;

2891
	INIT_WORK(&buffer->work, perf_buffer_free_work);
2892 2893 2894 2895 2896

	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
	if (!all_buf)
		goto fail_all_buf;

2897 2898 2899 2900
	buffer->user_page = all_buf;
	buffer->data_pages[0] = all_buf + PAGE_SIZE;
	buffer->page_order = ilog2(nr_pages);
	buffer->nr_pages = 1;
2901

2902 2903
	perf_buffer_init(buffer, watermark, flags);

2904
	return buffer;
2905 2906

fail_all_buf:
2907
	kfree(buffer);
2908 2909 2910 2911 2912 2913 2914

fail:
	return NULL;
}

#endif

2915
static unsigned long perf_data_size(struct perf_buffer *buffer)
2916
{
2917
	return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
2918 2919
}

2920 2921 2922
static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct perf_event *event = vma->vm_file->private_data;
2923
	struct perf_buffer *buffer;
2924 2925 2926 2927 2928 2929 2930 2931 2932
	int ret = VM_FAULT_SIGBUS;

	if (vmf->flags & FAULT_FLAG_MKWRITE) {
		if (vmf->pgoff == 0)
			ret = 0;
		return ret;
	}

	rcu_read_lock();
2933 2934
	buffer = rcu_dereference(event->buffer);
	if (!buffer)
2935 2936 2937 2938 2939
		goto unlock;

	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
		goto unlock;

2940
	vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954
	if (!vmf->page)
		goto unlock;

	get_page(vmf->page);
	vmf->page->mapping = vma->vm_file->f_mapping;
	vmf->page->index   = vmf->pgoff;

	ret = 0;
unlock:
	rcu_read_unlock();

	return ret;
}

2955
static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
2956
{
2957
	struct perf_buffer *buffer;
2958

2959 2960
	buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
	perf_buffer_free(buffer);
2961 2962
}

2963
static struct perf_buffer *perf_buffer_get(struct perf_event *event)
2964
{
2965
	struct perf_buffer *buffer;
2966

2967
	rcu_read_lock();
2968 2969 2970 2971
	buffer = rcu_dereference(event->buffer);
	if (buffer) {
		if (!atomic_inc_not_zero(&buffer->refcount))
			buffer = NULL;
2972 2973 2974
	}
	rcu_read_unlock();

2975
	return buffer;
2976 2977
}

2978
static void perf_buffer_put(struct perf_buffer *buffer)
2979
{
2980
	if (!atomic_dec_and_test(&buffer->refcount))
2981
		return;
2982

2983
	call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
2984 2985 2986 2987
}

static void perf_mmap_open(struct vm_area_struct *vma)
{
2988
	struct perf_event *event = vma->vm_file->private_data;
2989

2990
	atomic_inc(&event->mmap_count);
2991 2992 2993 2994
}

static void perf_mmap_close(struct vm_area_struct *vma)
{
2995
	struct perf_event *event = vma->vm_file->private_data;
2996

2997
	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2998
		unsigned long size = perf_data_size(event->buffer);
2999
		struct user_struct *user = event->mmap_user;
3000
		struct perf_buffer *buffer = event->buffer;
3001

3002
		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3003
		vma->vm_mm->locked_vm -= event->mmap_locked;
3004
		rcu_assign_pointer(event->buffer, NULL);
3005
		mutex_unlock(&event->mmap_mutex);
3006

3007
		perf_buffer_put(buffer);
3008
		free_uid(user);
3009
	}
3010 3011
}

3012
static const struct vm_operations_struct perf_mmap_vmops = {
3013 3014 3015 3016
	.open		= perf_mmap_open,
	.close		= perf_mmap_close,
	.fault		= perf_mmap_fault,
	.page_mkwrite	= perf_mmap_fault,
3017 3018 3019 3020
};

static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
3021
	struct perf_event *event = file->private_data;
3022
	unsigned long user_locked, user_lock_limit;
3023
	struct user_struct *user = current_user();
3024
	unsigned long locked, lock_limit;
3025
	struct perf_buffer *buffer;
3026 3027
	unsigned long vma_size;
	unsigned long nr_pages;
3028
	long user_extra, extra;
3029
	int ret = 0, flags = 0;
3030

	/*
	 * Don't allow mmap() of inherited per-task counters. This would
	 * create a performance issue due to all children writing to the
	 * same buffer.
	 */
	if (event->cpu == -1 && event->attr.inherit)
		return -EINVAL;

3039
	if (!(vma->vm_flags & VM_SHARED))
3040
		return -EINVAL;
3041 3042 3043 3044

	vma_size = vma->vm_end - vma->vm_start;
	nr_pages = (vma_size / PAGE_SIZE) - 1;

3045
	/*
3046
	 * If we have buffer pages ensure they're a power-of-two number, so we
3047 3048 3049
	 * can do bitmasks instead of modulo.
	 */
	if (nr_pages != 0 && !is_power_of_2(nr_pages))
3050 3051
		return -EINVAL;

3052
	if (vma_size != PAGE_SIZE * (1 + nr_pages))
3053 3054
		return -EINVAL;

3055 3056
	if (vma->vm_pgoff != 0)
		return -EINVAL;
3057

3058 3059
	WARN_ON_ONCE(event->ctx->parent_ctx);
	mutex_lock(&event->mmap_mutex);
3060 3061 3062
	if (event->buffer) {
		if (event->buffer->nr_pages == nr_pages)
			atomic_inc(&event->buffer->refcount);
3063
		else
3064 3065 3066 3067
			ret = -EINVAL;
		goto unlock;
	}

3068
	user_extra = nr_pages + 1;
3069
	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
I
Ingo Molnar 已提交
3070 3071 3072 3073 3074 3075

	/*
	 * Increase the limit linearly with more CPUs:
	 */
	user_lock_limit *= num_online_cpus();

3076
	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
3077

3078 3079 3080
	extra = 0;
	if (user_locked > user_lock_limit)
		extra = user_locked - user_lock_limit;
3081

3082
	lock_limit = rlimit(RLIMIT_MEMLOCK);
3083
	lock_limit >>= PAGE_SHIFT;
3084
	locked = vma->vm_mm->locked_vm + extra;
3085

3086 3087
	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
		!capable(CAP_IPC_LOCK)) {
3088 3089 3090
		ret = -EPERM;
		goto unlock;
	}
3091

3092
	WARN_ON(event->buffer);
3093

3094 3095 3096 3097 3098
	if (vma->vm_flags & VM_WRITE)
		flags |= PERF_BUFFER_WRITABLE;

	buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
				   event->cpu, flags);
3099
	if (!buffer) {
3100
		ret = -ENOMEM;
3101
		goto unlock;
3102
	}
3103
	rcu_assign_pointer(event->buffer, buffer);
3104

3105 3106 3107 3108 3109
	atomic_long_add(user_extra, &user->locked_vm);
	event->mmap_locked = extra;
	event->mmap_user = get_current_user();
	vma->vm_mm->locked_vm += event->mmap_locked;

3110
unlock:
3111 3112
	if (!ret)
		atomic_inc(&event->mmap_count);
3113
	mutex_unlock(&event->mmap_mutex);
3114 3115 3116

	vma->vm_flags |= VM_RESERVED;
	vma->vm_ops = &perf_mmap_vmops;
3117 3118

	return ret;
3119 3120
}

P
Peter Zijlstra 已提交
3121 3122 3123
static int perf_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
3124
	struct perf_event *event = filp->private_data;
P
Peter Zijlstra 已提交
3125 3126 3127
	int retval;

	mutex_lock(&inode->i_mutex);
3128
	retval = fasync_helper(fd, filp, on, &event->fasync);
P
Peter Zijlstra 已提交
3129 3130 3131 3132 3133 3134 3135 3136
	mutex_unlock(&inode->i_mutex);

	if (retval < 0)
		return retval;

	return 0;
}

static const struct file_operations perf_fops = {
	.llseek			= no_llseek,
	.release		= perf_release,
	.read			= perf_read,
	.poll			= perf_poll,
	.unlocked_ioctl		= perf_ioctl,
	.compat_ioctl		= perf_ioctl,
	.mmap			= perf_mmap,
	.fasync			= perf_fasync,
};

3148
/*
3149
 * Perf event wakeup
3150 3151 3152 3153 3154
 *
 * If there's data, ensure we set the poll() state and publish everything
 * to user-space before waking everybody up.
 */

3155
void perf_event_wakeup(struct perf_event *event)
3156
{
3157
	wake_up_all(&event->waitq);
3158

3159 3160 3161
	if (event->pending_kill) {
		kill_fasync(&event->fasync, SIGIO, event->pending_kill);
		event->pending_kill = 0;
3162
	}
3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173
}

/*
 * Pending wakeups
 *
 * Handle the case where we need to wake up from NMI (or rq->lock) context.
 *
 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
 * single linked list and use cmpxchg() to add entries lockless.
 */

3174
static void perf_pending_event(struct perf_pending_entry *entry)
3175
{
3176 3177
	struct perf_event *event = container_of(entry,
			struct perf_event, pending);
3178

3179 3180 3181
	if (event->pending_disable) {
		event->pending_disable = 0;
		__perf_event_disable(event);
3182 3183
	}

3184 3185 3186
	if (event->pending_wakeup) {
		event->pending_wakeup = 0;
		perf_event_wakeup(event);
3187 3188 3189
	}
}

3190
#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
3191

3192
static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
3193 3194 3195
	PENDING_TAIL,
};

3196 3197
static void perf_pending_queue(struct perf_pending_entry *entry,
			       void (*func)(struct perf_pending_entry *))
3198
{
3199
	struct perf_pending_entry **head;
3200

3201
	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
3202 3203
		return;

3204 3205 3206
	entry->func = func;

	head = &get_cpu_var(perf_pending_head);
3207 3208

	do {
3209 3210
		entry->next = *head;
	} while (cmpxchg(head, entry->next, entry) != entry->next);
3211

3212
	set_perf_event_pending();
3213

3214
	put_cpu_var(perf_pending_head);
3215 3216 3217 3218
}

static int __perf_pending_run(void)
{
3219
	struct perf_pending_entry *list;
3220 3221
	int nr = 0;

3222
	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
3223
	while (list != PENDING_TAIL) {
3224 3225
		void (*func)(struct perf_pending_entry *);
		struct perf_pending_entry *entry = list;
3226 3227 3228

		list = list->next;

3229 3230
		func = entry->func;
		entry->next = NULL;
3231 3232 3233 3234 3235 3236 3237
		/*
		 * Ensure we observe the unqueue before we issue the wakeup,
		 * so that we won't be waiting forever.
		 * -- see perf_not_pending().
		 */
		smp_wmb();

3238
		func(entry);
3239 3240 3241 3242 3243 3244
		nr++;
	}

	return nr;
}

3245
static inline int perf_not_pending(struct perf_event *event)
3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259
{
	/*
	 * If we flush on whatever cpu we run, there is a chance we don't
	 * need to wait.
	 */
	get_cpu();
	__perf_pending_run();
	put_cpu();

	/*
	 * Ensure we see the proper queue state before going to sleep
	 * so that we do not miss the wakeup. -- see perf_pending_handle()
	 */
	smp_rmb();
3260
	return event->pending.next == NULL;
3261 3262
}

3263
static void perf_pending_sync(struct perf_event *event)
3264
{
3265
	wait_event(event->waitq, perf_not_pending(event));
3266 3267
}

3268
void perf_event_do_pending(void)
3269 3270 3271 3272
{
	__perf_pending_run();
}

/*
 * We assume there is only KVM supporting the callbacks.
 * Later on, we might change it to a list if there is
 * another virtualization implementation supporting the callbacks.
 */
struct perf_guest_info_callbacks *perf_guest_cbs;

int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
	perf_guest_cbs = cbs;
	return 0;
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);

int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
	perf_guest_cbs = NULL;
	return 0;
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);

3294 3295 3296
/*
 * Output
 */
3297
static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
3298
			      unsigned long offset, unsigned long head)
3299 3300 3301
{
	unsigned long mask;

3302
	if (!buffer->writable)
3303 3304
		return true;

3305
	mask = perf_data_size(buffer) - 1;
3306 3307 3308 3309 3310 3311 3312 3313 3314 3315

	offset = (offset - tail) & mask;
	head   = (head   - tail) & mask;

	if ((int)(head - offset) < 0)
		return false;

	return true;
}
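
/*
 * Illustration for a writable buffer with a single 4096-byte data page
 * (mask = 4095): with tail = 100, offset = 4000 and head = 4200 the
 * normalized values are offset = 3900 and head = 4, so head - offset is
 * negative and the write is refused -- it would overwrite data that
 * user-space has not consumed yet.
 */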

static void perf_output_wakeup(struct perf_output_handle *handle)
3317
{
3318
	atomic_set(&handle->buffer->poll, POLL_IN);
3319

3320
	if (handle->nmi) {
3321 3322 3323
		handle->event->pending_wakeup = 1;
		perf_pending_queue(&handle->event->pending,
				   perf_pending_event);
3324
	} else
3325
		perf_event_wakeup(handle->event);
3326 3327
}

3328
/*
3329
 * We need to ensure a later event_id doesn't publish a head when a former
3330
 * event isn't done writing. However since we need to deal with NMIs we
3331 3332 3333
 * cannot fully serialize things.
 *
 * We only publish the head (and generate a wakeup) when the outer-most
3334
 * event completes.
3335
 */
3336
static void perf_output_get_handle(struct perf_output_handle *handle)
3337
{
3338
	struct perf_buffer *buffer = handle->buffer;
3339

3340
	preempt_disable();
3341 3342
	local_inc(&buffer->nest);
	handle->wakeup = local_read(&buffer->wakeup);
3343 3344
}

3345
static void perf_output_put_handle(struct perf_output_handle *handle)
3346
{
3347
	struct perf_buffer *buffer = handle->buffer;
3348
	unsigned long head;
3349 3350

again:
3351
	head = local_read(&buffer->head);
3352 3353

	/*
3354
	 * IRQ/NMI can happen here, which means we can miss a head update.
3355 3356
	 */

3357
	if (!local_dec_and_test(&buffer->nest))
3358
		goto out;
3359 3360

	/*
3361
	 * Publish the known good head. Rely on the full barrier implied
	 * by atomic_dec_and_test() to order the buffer->head read and this
	 * write.
	 */
3365
	buffer->user_page->data_head = head;
3366

3367 3368
	/*
	 * Now check if we missed an update, rely on the (compiler)
3369
	 * barrier in atomic_dec_and_test() to re-read buffer->head.
3370
	 */
3371 3372
	if (unlikely(head != local_read(&buffer->head))) {
		local_inc(&buffer->nest);
3373 3374 3375
		goto again;
	}

3376
	if (handle->wakeup != local_read(&buffer->wakeup))
3377
		perf_output_wakeup(handle);
3378

P
Peter Zijlstra 已提交
3379
out:
3380
	preempt_enable();
3381 3382
}

3383
__always_inline void perf_output_copy(struct perf_output_handle *handle,
3384
		      const void *buf, unsigned int len)
3385
{
3386
	do {
3387
		unsigned long size = min_t(unsigned long, handle->size, len);
3388 3389 3390 3391 3392

		memcpy(handle->addr, buf, size);

		len -= size;
		handle->addr += size;
3393
		buf += size;
3394 3395
		handle->size -= size;
		if (!handle->size) {
3396
			struct perf_buffer *buffer = handle->buffer;
3397

3398
			handle->page++;
3399 3400 3401
			handle->page &= buffer->nr_pages - 1;
			handle->addr = buffer->data_pages[handle->page];
			handle->size = PAGE_SIZE << page_order(buffer);
3402 3403
		}
	} while (len);
3404 3405
}

3406
int perf_output_begin(struct perf_output_handle *handle,
3407
		      struct perf_event *event, unsigned int size,
3408
		      int nmi, int sample)
3409
{
3410
	struct perf_buffer *buffer;
3411
	unsigned long tail, offset, head;
3412 3413 3414 3415 3416 3417
	int have_lost;
	struct {
		struct perf_event_header header;
		u64			 id;
		u64			 lost;
	} lost_event;
3418

3419
	rcu_read_lock();
3420
	/*
3421
	 * For inherited events we send all the output towards the parent.
3422
	 */
3423 3424
	if (event->parent)
		event = event->parent;
3425

3426 3427
	buffer = rcu_dereference(event->buffer);
	if (!buffer)
3428 3429
		goto out;

3430
	handle->buffer	= buffer;
3431
	handle->event	= event;
3432 3433
	handle->nmi	= nmi;
	handle->sample	= sample;
3434

3435
	if (!buffer->nr_pages)
3436
		goto out;
3437

3438
	have_lost = local_read(&buffer->lost);
3439 3440 3441
	if (have_lost)
		size += sizeof(lost_event);

3442
	perf_output_get_handle(handle);
3443

3444
	do {
3445 3446 3447 3448 3449
		/*
		 * Userspace could choose to issue a mb() before updating the
		 * tail pointer. So that all reads will be completed before the
		 * write is issued.
		 */
3450
		tail = ACCESS_ONCE(buffer->user_page->data_tail);
3451
		smp_rmb();
3452
		offset = head = local_read(&buffer->head);
P
Peter Zijlstra 已提交
3453
		head += size;
3454
		if (unlikely(!perf_output_space(buffer, tail, offset, head)))
3455
			goto fail;
3456
	} while (local_cmpxchg(&buffer->head, offset, head) != offset);
3457

3458 3459
	if (head - local_read(&buffer->wakeup) > buffer->watermark)
		local_add(buffer->watermark, &buffer->wakeup);
3460

3461 3462 3463 3464
	handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
	handle->page &= buffer->nr_pages - 1;
	handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
	handle->addr = buffer->data_pages[handle->page];
3465
	handle->addr += handle->size;
3466
	handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
3467

3468
	if (have_lost) {
3469
		lost_event.header.type = PERF_RECORD_LOST;
3470 3471
		lost_event.header.misc = 0;
		lost_event.header.size = sizeof(lost_event);
3472
		lost_event.id          = event->id;
3473
		lost_event.lost        = local_xchg(&buffer->lost, 0);
3474 3475 3476 3477

		perf_output_put(handle, lost_event);
	}

3478
	return 0;
3479

3480
fail:
3481
	local_inc(&buffer->lost);
3482
	perf_output_put_handle(handle);
3483 3484
out:
	rcu_read_unlock();
3485

3486 3487
	return -ENOSPC;
}
3488

3489
void perf_output_end(struct perf_output_handle *handle)
3490
{
3491
	struct perf_event *event = handle->event;
3492
	struct perf_buffer *buffer = handle->buffer;
3493

3494
	int wakeup_events = event->attr.wakeup_events;
P
Peter Zijlstra 已提交
3495

3496
	if (handle->sample && wakeup_events) {
3497
		int events = local_inc_return(&buffer->events);
P
Peter Zijlstra 已提交
3498
		if (events >= wakeup_events) {
3499 3500
			local_sub(wakeup_events, &buffer->events);
			local_inc(&buffer->wakeup);
P
Peter Zijlstra 已提交
3501
		}
3502 3503
	}

3504
	perf_output_put_handle(handle);
3505
	rcu_read_unlock();
3506 3507
}

3508
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3509 3510
{
	/*
3511
	 * only top level events have the pid namespace they were created in
3512
	 */
3513 3514
	if (event->parent)
		event = event->parent;
3515

3516
	return task_tgid_nr_ns(p, event->ns);
3517 3518
}

3519
static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3520 3521
{
	/*
3522
	 * only top level events have the pid namespace they were created in
3523
	 */
3524 3525
	if (event->parent)
		event = event->parent;
3526

3527
	return task_pid_nr_ns(p, event->ns);
3528 3529
}

3530
static void perf_output_read_one(struct perf_output_handle *handle,
3531
				 struct perf_event *event)
3532
{
3533
	u64 read_format = event->attr.read_format;
3534 3535 3536
	u64 values[4];
	int n = 0;

P
Peter Zijlstra 已提交
3537
	values[n++] = perf_event_count(event);
3538
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3539 3540
		values[n++] = event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
3541 3542
	}
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3543 3544
		values[n++] = event->total_time_running +
			atomic64_read(&event->child_total_time_running);
3545 3546
	}
	if (read_format & PERF_FORMAT_ID)
3547
		values[n++] = primary_event_id(event);
3548 3549 3550 3551 3552

	perf_output_copy(handle, values, n * sizeof(u64));
}

/*
3553
 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3554 3555
 */
static void perf_output_read_group(struct perf_output_handle *handle,
3556
			    struct perf_event *event)
3557
{
3558 3559
	struct perf_event *leader = event->group_leader, *sub;
	u64 read_format = event->attr.read_format;
	u64 values[5];
	int n = 0;

	values[n++] = 1 + leader->nr_siblings;

	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		values[n++] = leader->total_time_enabled;

	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		values[n++] = leader->total_time_running;

3571
	if (leader != event)
3572 3573
		leader->pmu->read(leader);

P
Peter Zijlstra 已提交
3574
	values[n++] = perf_event_count(leader);
3575
	if (read_format & PERF_FORMAT_ID)
3576
		values[n++] = primary_event_id(leader);
3577 3578 3579

	perf_output_copy(handle, values, n * sizeof(u64));

3580
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3581 3582
		n = 0;

3583
		if (sub != event)
3584 3585
			sub->pmu->read(sub);

P
Peter Zijlstra 已提交
3586
		values[n++] = perf_event_count(sub);
3587
		if (read_format & PERF_FORMAT_ID)
3588
			values[n++] = primary_event_id(sub);
3589 3590 3591 3592 3593 3594

		perf_output_copy(handle, values, n * sizeof(u64));
	}
}

static void perf_output_read(struct perf_output_handle *handle,
3595
			     struct perf_event *event)
3596
{
3597 3598
	if (event->attr.read_format & PERF_FORMAT_GROUP)
		perf_output_read_group(handle, event);
3599
	else
3600
		perf_output_read_one(handle, event);
3601 3602
}

3603 3604 3605
void perf_output_sample(struct perf_output_handle *handle,
			struct perf_event_header *header,
			struct perf_sample_data *data,
3606
			struct perf_event *event)
{
	u64 sample_type = data->type;

	perf_output_put(handle, *header);

	if (sample_type & PERF_SAMPLE_IP)
		perf_output_put(handle, data->ip);

	if (sample_type & PERF_SAMPLE_TID)
		perf_output_put(handle, data->tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		perf_output_put(handle, data->time);

	if (sample_type & PERF_SAMPLE_ADDR)
		perf_output_put(handle, data->addr);

	if (sample_type & PERF_SAMPLE_ID)
		perf_output_put(handle, data->id);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		perf_output_put(handle, data->stream_id);

	if (sample_type & PERF_SAMPLE_CPU)
		perf_output_put(handle, data->cpu_entry);

	if (sample_type & PERF_SAMPLE_PERIOD)
		perf_output_put(handle, data->period);

	if (sample_type & PERF_SAMPLE_READ)
3637
		perf_output_read(handle, event);

	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
		if (data->callchain) {
			int size = 1;

			if (data->callchain)
				size += data->callchain->nr;

			size *= sizeof(u64);

			perf_output_copy(handle, data->callchain, size);
		} else {
			u64 nr = 0;
			perf_output_put(handle, nr);
		}
	}

	if (sample_type & PERF_SAMPLE_RAW) {
		if (data->raw) {
			perf_output_put(handle, data->raw->size);
			perf_output_copy(handle, data->raw->data,
					 data->raw->size);
		} else {
			struct {
				u32	size;
				u32	data;
			} raw = {
				.size = sizeof(u32),
				.data = 0,
			};
			perf_output_put(handle, raw);
		}
	}
}

void perf_prepare_sample(struct perf_event_header *header,
			 struct perf_sample_data *data,
3675
			 struct perf_event *event,
3676
			 struct pt_regs *regs)
3677
{
3678
	u64 sample_type = event->attr.sample_type;
3679

3680
	data->type = sample_type;
3681

3682
	header->type = PERF_RECORD_SAMPLE;
3683 3684 3685 3686
	header->size = sizeof(*header);

	header->misc = 0;
	header->misc |= perf_misc_flags(regs);
3687

3688
	if (sample_type & PERF_SAMPLE_IP) {
3689 3690 3691
		data->ip = perf_instruction_pointer(regs);

		header->size += sizeof(data->ip);
3692
	}
3693

3694
	if (sample_type & PERF_SAMPLE_TID) {
3695
		/* namespace issues */
3696 3697
		data->tid_entry.pid = perf_event_pid(event, current);
		data->tid_entry.tid = perf_event_tid(event, current);
3698

3699
		header->size += sizeof(data->tid_entry);
3700 3701
	}

3702
	if (sample_type & PERF_SAMPLE_TIME) {
P
Peter Zijlstra 已提交
3703
		data->time = perf_clock();
3704

3705
		header->size += sizeof(data->time);
3706 3707
	}

3708
	if (sample_type & PERF_SAMPLE_ADDR)
3709
		header->size += sizeof(data->addr);
3710

3711
	if (sample_type & PERF_SAMPLE_ID) {
3712
		data->id = primary_event_id(event);
3713

3714 3715 3716 3717
		header->size += sizeof(data->id);
	}

	if (sample_type & PERF_SAMPLE_STREAM_ID) {
3718
		data->stream_id = event->id;
3719 3720 3721

		header->size += sizeof(data->stream_id);
	}
3722

3723
	if (sample_type & PERF_SAMPLE_CPU) {
3724 3725
		data->cpu_entry.cpu		= raw_smp_processor_id();
		data->cpu_entry.reserved	= 0;
3726

3727
		header->size += sizeof(data->cpu_entry);
3728 3729
	}

3730
	if (sample_type & PERF_SAMPLE_PERIOD)
3731
		header->size += sizeof(data->period);
3732

3733
	if (sample_type & PERF_SAMPLE_READ)
3734
		header->size += perf_event_read_size(event);
3735

3736
	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3737
		int size = 1;
3738

3739 3740 3741 3742 3743 3744
		data->callchain = perf_callchain(regs);

		if (data->callchain)
			size += data->callchain->nr;

		header->size += size * sizeof(u64);
3745 3746
	}

3747
	if (sample_type & PERF_SAMPLE_RAW) {
3748 3749 3750 3751 3752 3753 3754 3755
		int size = sizeof(u32);

		if (data->raw)
			size += data->raw->size;
		else
			size += sizeof(u32);

		WARN_ON_ONCE(size & (sizeof(u64)-1));
3756
		header->size += size;
3757
	}
3758
}
3759

3760
static void perf_event_output(struct perf_event *event, int nmi,
3761 3762 3763 3764 3765
				struct perf_sample_data *data,
				struct pt_regs *regs)
{
	struct perf_output_handle handle;
	struct perf_event_header header;
3766

3767 3768 3769
	/* protect the callchain buffers */
	rcu_read_lock();

3770
	perf_prepare_sample(&header, data, event, regs);
P
Peter Zijlstra 已提交
3771

3772
	if (perf_output_begin(&handle, event, header.size, nmi, 1))
3773
		goto exit;
3774

3775
	perf_output_sample(&handle, &header, data, event);
3776

3777
	perf_output_end(&handle);
3778 3779 3780

exit:
	rcu_read_unlock();
3781 3782
}

3783
/*
3784
 * read event_id
3785 3786 3787 3788 3789 3790 3791 3792 3793 3794
 */

struct perf_read_event {
	struct perf_event_header	header;

	u32				pid;
	u32				tid;
};

static void
3795
perf_event_read_event(struct perf_event *event,
3796 3797 3798
			struct task_struct *task)
{
	struct perf_output_handle handle;
3799
	struct perf_read_event read_event = {
3800
		.header = {
3801
			.type = PERF_RECORD_READ,
3802
			.misc = 0,
3803
			.size = sizeof(read_event) + perf_event_read_size(event),
3804
		},
3805 3806
		.pid = perf_event_pid(event, task),
		.tid = perf_event_tid(event, task),
3807
	};
3808
	int ret;
3809

3810
	ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3811 3812 3813
	if (ret)
		return;

3814
	perf_output_put(&handle, read_event);
3815
	perf_output_read(&handle, event);
3816

3817 3818 3819
	perf_output_end(&handle);
}

/*
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
 */

struct perf_task_event {
	struct task_struct		*task;
	struct perf_event_context	*task_ctx;

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				ppid;
		u32				tid;
		u32				ptid;
		u64				time;
	} event_id;
};

3841
static void perf_event_task_output(struct perf_event *event,
P
Peter Zijlstra 已提交
3842
				     struct perf_task_event *task_event)
P
Peter Zijlstra 已提交
3843 3844
{
	struct perf_output_handle handle;
P
Peter Zijlstra 已提交
3845
	struct task_struct *task = task_event->task;
3846 3847
	int size, ret;

3848 3849
	size  = task_event->event_id.header.size;
	ret = perf_output_begin(&handle, event, size, 0, 0);
P
Peter Zijlstra 已提交
3850

3851
	if (ret)
P
Peter Zijlstra 已提交
3852 3853
		return;

3854 3855
	task_event->event_id.pid = perf_event_pid(event, task);
	task_event->event_id.ppid = perf_event_pid(event, current);
P
Peter Zijlstra 已提交
3856

3857 3858
	task_event->event_id.tid = perf_event_tid(event, task);
	task_event->event_id.ptid = perf_event_tid(event, current);
P
Peter Zijlstra 已提交
3859

3860
	perf_output_put(&handle, task_event->event_id);
3861

P
Peter Zijlstra 已提交
3862 3863 3864
	perf_output_end(&handle);
}

3865
static int perf_event_task_match(struct perf_event *event)
P
Peter Zijlstra 已提交
3866
{
P
Peter Zijlstra 已提交
3867
	if (event->state < PERF_EVENT_STATE_INACTIVE)
3868 3869
		return 0;

3870 3871 3872
	if (event->cpu != -1 && event->cpu != smp_processor_id())
		return 0;

3873 3874
	if (event->attr.comm || event->attr.mmap ||
	    event->attr.mmap_data || event->attr.task)
P
Peter Zijlstra 已提交
3875 3876 3877 3878 3879
		return 1;

	return 0;
}

3880
static void perf_event_task_ctx(struct perf_event_context *ctx,
P
Peter Zijlstra 已提交
3881
				  struct perf_task_event *task_event)
P
Peter Zijlstra 已提交
3882
{
3883
	struct perf_event *event;
P
Peter Zijlstra 已提交
3884

3885 3886 3887
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_task_match(event))
			perf_event_task_output(event, task_event);
P
Peter Zijlstra 已提交
3888 3889 3890
	}
}

3891
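/*
 * For every registered PMU, deliver the record to its per-cpu context
 * and then to a task context: either the one handed in or the current
 * task's context for that PMU.  The comm and mmap code below uses the
 * same iteration.
 */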
static void perf_event_task_event(struct perf_task_event *task_event)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	struct pmu *pmu;
	int ctxn;

	rcu_read_lock();
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
		perf_event_task_ctx(&cpuctx->ctx, task_event);

		ctx = task_event->task_ctx;
		if (!ctx) {
			ctxn = pmu->task_ctx_nr;
			if (ctxn < 0)
				goto next;
			ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
		}
		if (ctx)
			perf_event_task_ctx(ctx, task_event);
next:
		put_cpu_ptr(pmu->pmu_cpu_context);
	}
	rcu_read_unlock();
}

static void perf_event_task(struct task_struct *task,
			      struct perf_event_context *task_ctx,
			      int new)
{
	struct perf_task_event task_event;

	if (!atomic_read(&nr_comm_events) &&
	    !atomic_read(&nr_mmap_events) &&
	    !atomic_read(&nr_task_events))
		return;

	task_event = (struct perf_task_event){
		.task	  = task,
		.task_ctx = task_ctx,
		.event_id    = {
			.header = {
				.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
				.misc = 0,
				.size = sizeof(task_event.event_id),
			},
			/* .pid  */
			/* .ppid */
			/* .tid  */
			/* .ptid */
			.time = perf_clock(),
		},
	};

	perf_event_task_event(&task_event);
}

void perf_event_fork(struct task_struct *task)
{
	perf_event_task(task, NULL, 1);
}

/*
 * comm tracking
 */

struct perf_comm_event {
	struct task_struct	*task;
	char			*comm;
	int			comm_size;

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
	} event_id;
};

static void perf_event_comm_output(struct perf_event *event,
				     struct perf_comm_event *comm_event)
{
	struct perf_output_handle handle;
	int size = comm_event->event_id.header.size;
	int ret = perf_output_begin(&handle, event, size, 0, 0);

	if (ret)
		return;

	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);

	perf_output_put(&handle, comm_event->event_id);
	perf_output_copy(&handle, comm_event->comm,
				   comm_event->comm_size);
	perf_output_end(&handle);
}

static int perf_event_comm_match(struct perf_event *event)
{
	if (event->state < PERF_EVENT_STATE_INACTIVE)
		return 0;

	if (event->cpu != -1 && event->cpu != smp_processor_id())
		return 0;

	if (event->attr.comm)
		return 1;

	return 0;
}

static void perf_event_comm_ctx(struct perf_event_context *ctx,
				  struct perf_comm_event *comm_event)
{
	struct perf_event *event;

	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_comm_match(event))
			perf_event_comm_output(event, comm_event);
	}
}

static void perf_event_comm_event(struct perf_comm_event *comm_event)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	char comm[TASK_COMM_LEN];
	unsigned int size;
	struct pmu *pmu;
	int ctxn;

	memset(comm, 0, sizeof(comm));
	strlcpy(comm, comm_event->task->comm, sizeof(comm));
	size = ALIGN(strlen(comm)+1, sizeof(u64));

	comm_event->comm = comm;
	comm_event->comm_size = size;

	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;

	rcu_read_lock();
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
		perf_event_comm_ctx(&cpuctx->ctx, comm_event);

		ctxn = pmu->task_ctx_nr;
		if (ctxn < 0)
			goto next;

		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
		if (ctx)
			perf_event_comm_ctx(ctx, comm_event);
next:
		put_cpu_ptr(pmu->pmu_cpu_context);
	}
	rcu_read_unlock();
}

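/*
 * Invoked when the task name changes.  Give each of the task's
 * contexts a chance to enable events that asked for
 * attr.enable_on_exec, then emit a PERF_RECORD_COMM carrying the new
 * name.
 */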
void perf_event_comm(struct task_struct *task)
{
	struct perf_comm_event comm_event;
	struct perf_event_context *ctx;
	int ctxn;

	for_each_task_context_nr(ctxn) {
		ctx = task->perf_event_ctxp[ctxn];
		if (!ctx)
			continue;

		perf_event_enable_on_exec(ctx);
	}

	if (!atomic_read(&nr_comm_events))
		return;

	comm_event = (struct perf_comm_event){
		.task	= task,
		/* .comm      */
		/* .comm_size */
		.event_id  = {
			.header = {
				.type = PERF_RECORD_COMM,
				.misc = 0,
				/* .size */
			},
			/* .pid */
			/* .tid */
		},
	};

	perf_event_comm_event(&comm_event);
}

/*
 * mmap tracking
 */

struct perf_mmap_event {
	struct vm_area_struct	*vma;

	const char		*file_name;
	int			file_size;

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
		u64				start;
		u64				len;
		u64				pgoff;
	} event_id;
};

static void perf_event_mmap_output(struct perf_event *event,
				     struct perf_mmap_event *mmap_event)
{
	struct perf_output_handle handle;
	int size = mmap_event->event_id.header.size;
	int ret = perf_output_begin(&handle, event, size, 0, 0);

	if (ret)
		return;

	mmap_event->event_id.pid = perf_event_pid(event, current);
	mmap_event->event_id.tid = perf_event_tid(event, current);

	perf_output_put(&handle, mmap_event->event_id);
	perf_output_copy(&handle, mmap_event->file_name,
				   mmap_event->file_size);
	perf_output_end(&handle);
}

static int perf_event_mmap_match(struct perf_event *event,
				   struct perf_mmap_event *mmap_event,
				   int executable)
{
	if (event->state < PERF_EVENT_STATE_INACTIVE)
		return 0;

	if (event->cpu != -1 && event->cpu != smp_processor_id())
		return 0;

	if ((!executable && event->attr.mmap_data) ||
	    (executable && event->attr.mmap))
		return 1;

	return 0;
}

static void perf_event_mmap_ctx(struct perf_event_context *ctx,
				  struct perf_mmap_event *mmap_event,
				  int executable)
{
	struct perf_event *event;

	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_mmap_match(event, mmap_event, executable))
			perf_event_mmap_output(event, mmap_event);
	}
}

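/*
 * Resolve the vma to a printable name: file backed mappings go
 * through d_path() (the buffer is padded so the string can later be
 * 64-bit aligned), anonymous ones fall back to the arch specific
 * name, "[vdso]", "[heap]", "[stack]" or "//anon".
 */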
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	struct vm_area_struct *vma = mmap_event->vma;
	struct file *file = vma->vm_file;
	unsigned int size;
	char tmp[16];
	char *buf = NULL;
	const char *name;
	struct pmu *pmu;
	int ctxn;

	memset(tmp, 0, sizeof(tmp));

	if (file) {
		/*
		 * d_path works from the end of the buffer backwards, so we
		 * need to add enough zero bytes after the string to handle
		 * the 64bit alignment we do later.
		 */
		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
		if (!buf) {
			name = strncpy(tmp, "//enomem", sizeof(tmp));
			goto got_name;
		}
		name = d_path(&file->f_path, buf, PATH_MAX);
		if (IS_ERR(name)) {
			name = strncpy(tmp, "//toolong", sizeof(tmp));
			goto got_name;
		}
	} else {
		if (arch_vma_name(mmap_event->vma)) {
			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
				       sizeof(tmp));
			goto got_name;
		}

		if (!vma->vm_mm) {
			name = strncpy(tmp, "[vdso]", sizeof(tmp));
			goto got_name;
		} else if (vma->vm_start <= vma->vm_mm->start_brk &&
				vma->vm_end >= vma->vm_mm->brk) {
			name = strncpy(tmp, "[heap]", sizeof(tmp));
			goto got_name;
		} else if (vma->vm_start <= vma->vm_mm->start_stack &&
				vma->vm_end >= vma->vm_mm->start_stack) {
			name = strncpy(tmp, "[stack]", sizeof(tmp));
			goto got_name;
		}

		name = strncpy(tmp, "//anon", sizeof(tmp));
		goto got_name;
	}

got_name:
	size = ALIGN(strlen(name)+1, sizeof(u64));

	mmap_event->file_name = name;
	mmap_event->file_size = size;

	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;

	rcu_read_lock();
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
					vma->vm_flags & VM_EXEC);

		ctxn = pmu->task_ctx_nr;
		if (ctxn < 0)
			goto next;

		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
		if (ctx) {
			perf_event_mmap_ctx(ctx, mmap_event,
					vma->vm_flags & VM_EXEC);
		}
next:
		put_cpu_ptr(pmu->pmu_cpu_context);
	}
	rcu_read_unlock();

	kfree(buf);
}

void perf_event_mmap(struct vm_area_struct *vma)
{
	struct perf_mmap_event mmap_event;

	if (!atomic_read(&nr_mmap_events))
		return;

	mmap_event = (struct perf_mmap_event){
		.vma	= vma,
		/* .file_name */
		/* .file_size */
		.event_id  = {
			.header = {
				.type = PERF_RECORD_MMAP,
				.misc = PERF_RECORD_MISC_USER,
				/* .size */
			},
			/* .pid */
			/* .tid */
			.start  = vma->vm_start,
			.len    = vma->vm_end - vma->vm_start,
			.pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
		},
	};

	perf_event_mmap_event(&mmap_event);
}

/*
 * IRQ throttle logging
 */

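/*
 * Log a PERF_RECORD_THROTTLE record (rewritten to UNTHROTTLE when
 * enable is set) so user space can tell when the interrupt rate
 * limiter kicked in; it carries a timestamp plus the event and
 * stream ids.
 */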
static void perf_log_throttle(struct perf_event *event, int enable)
{
	struct perf_output_handle handle;
	int ret;

	struct {
		struct perf_event_header	header;
		u64				time;
		u64				id;
		u64				stream_id;
	} throttle_event = {
		.header = {
			.type = PERF_RECORD_THROTTLE,
			.misc = 0,
			.size = sizeof(throttle_event),
		},
		.time		= perf_clock(),
		.id		= primary_event_id(event),
		.stream_id	= event->id,
	};

	if (enable)
		throttle_event.header.type = PERF_RECORD_UNTHROTTLE;

	ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
	if (ret)
		return;

	perf_output_put(&handle, throttle_event);
	perf_output_end(&handle);
}

/*
 * Generic event overflow handling, sampling.
 */

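/*
 * Throttle bookkeeping: interrupts are counted in hwc->interrupts and
 * once HZ * hwc->interrupts exceeds sysctl_perf_event_sample_rate the
 * event is marked MAX_INTERRUPTS, a throttle record is logged and the
 * non-zero return value asks the caller to stop the event for now.
 * For attr.freq events the sample period is re-adjusted here as well.
 */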
static int __perf_event_overflow(struct perf_event *event, int nmi,
				   int throttle, struct perf_sample_data *data,
				   struct pt_regs *regs)
{
	int events = atomic_read(&event->event_limit);
	struct hw_perf_event *hwc = &event->hw;
	int ret = 0;

	if (!throttle) {
		hwc->interrupts++;
	} else {
		if (hwc->interrupts != MAX_INTERRUPTS) {
			hwc->interrupts++;
			if (HZ * hwc->interrupts >
					(u64)sysctl_perf_event_sample_rate) {
				hwc->interrupts = MAX_INTERRUPTS;
				perf_log_throttle(event, 0);
				ret = 1;
			}
		} else {
			/*
			 * Keep re-disabling events even though on the previous
			 * pass we disabled it - just in case we raced with a
			 * sched-in and the event got enabled again:
			 */
			ret = 1;
		}
	}

	if (event->attr.freq) {
		u64 now = perf_clock();
		s64 delta = now - hwc->freq_time_stamp;

		hwc->freq_time_stamp = now;

		if (delta > 0 && delta < 2*TICK_NSEC)
			perf_adjust_period(event, delta, hwc->last_period);
	}

	/*
	 * XXX event_limit might not quite work as expected on inherited
	 * events
	 */

	event->pending_kill = POLL_IN;
	if (events && atomic_dec_and_test(&event->event_limit)) {
		ret = 1;
		event->pending_kill = POLL_HUP;
		if (nmi) {
			event->pending_disable = 1;
			perf_pending_queue(&event->pending,
					   perf_pending_event);
		} else
			perf_event_disable(event);
	}

	if (event->overflow_handler)
		event->overflow_handler(event, nmi, data, regs);
	else
		perf_event_output(event, nmi, data, regs);

	return ret;
}

int perf_event_overflow(struct perf_event *event, int nmi,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	return __perf_event_overflow(event, nmi, 1, data, regs);
}

/*
 * Generic software event infrastructure
 */

struct swevent_htable {
	struct swevent_hlist		*swevent_hlist;
	struct mutex			hlist_mutex;
	int				hlist_refcount;

	/* Recursion avoidance in each context */
	int				recursion[PERF_NR_CONTEXTS];
};

static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);

/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period event
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 */

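/*
 * Example, assuming sample_period == 100: if period_left has climbed
 * to 250 by the time we get here, (100 + 250) / 100 == 3 overflows
 * are reported and period_left is reset to -50, i.e. 50 counts into
 * the next period.  A negative value means no period boundary was
 * crossed yet, hence the early return of 0.
 */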
static u64 perf_swevent_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 period = hwc->last_period;
	u64 nr, offset;
	s64 old, val;

	hwc->last_period = hwc->sample_period;

again:
	old = val = local64_read(&hwc->period_left);
	if (val < 0)
		return 0;

	nr = div64_u64(period + val, period);
	offset = nr * period;
	val -= offset;
	if (local64_cmpxchg(&hwc->period_left, old, val) != old)
		goto again;

	return nr;
}

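/*
 * Report one overflow per elapsed period; stop early once
 * __perf_event_overflow() asks for throttling so a single software
 * event burst cannot flood the buffer.
 */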
static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
				    int nmi, struct perf_sample_data *data,
				    struct pt_regs *regs)
{
	struct hw_perf_event *hwc = &event->hw;
	int throttle = 0;

	data->period = event->hw.last_period;
	if (!overflow)
		overflow = perf_swevent_set_period(event);

	if (hwc->interrupts == MAX_INTERRUPTS)
		return;

	for (; overflow; overflow--) {
		if (__perf_event_overflow(event, nmi, throttle,
					    data, regs)) {
			/*
			 * We inhibit the overflow from happening when
			 * hwc->interrupts == MAX_INTERRUPTS.
			 */
			break;
		}
		throttle = 1;
	}
}

P
Peter Zijlstra 已提交
4452
static void perf_swevent_event(struct perf_event *event, u64 nr,
4453 4454
			       int nmi, struct perf_sample_data *data,
			       struct pt_regs *regs)
4455
{
4456
	struct hw_perf_event *hwc = &event->hw;
4457

4458
	local64_add(nr, &event->count);
4459

4460 4461 4462
	if (!regs)
		return;

4463 4464
	if (!hwc->sample_period)
		return;
4465

4466 4467 4468
	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
		return perf_swevent_overflow(event, 1, nmi, data, regs);

4469
	if (local64_add_negative(nr, &hwc->period_left))
4470
		return;
4471

4472
	perf_swevent_overflow(event, 0, nmi, data, regs);
4473 4474
}

4475 4476 4477
static int perf_exclude_event(struct perf_event *event,
			      struct pt_regs *regs)
{
P
Peter Zijlstra 已提交
4478 4479 4480
	if (event->hw.state & PERF_HES_STOPPED)
		return 0;

4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491
	if (regs) {
		if (event->attr.exclude_user && user_mode(regs))
			return 1;

		if (event->attr.exclude_kernel && !user_mode(regs))
			return 1;
	}

	return 0;
}

4492
static int perf_swevent_match(struct perf_event *event,
P
Peter Zijlstra 已提交
4493
				enum perf_type_id type,
L
Li Zefan 已提交
4494 4495 4496
				u32 event_id,
				struct perf_sample_data *data,
				struct pt_regs *regs)
4497
{
4498
	if (event->attr.type != type)
4499
		return 0;
4500

4501
	if (event->attr.config != event_id)
4502 4503
		return 0;

4504 4505
	if (perf_exclude_event(event, regs))
		return 0;
4506 4507 4508 4509

	return 1;
}

4510 4511 4512 4513 4514 4515 4516
static inline u64 swevent_hash(u64 type, u32 event_id)
{
	u64 val = event_id | (type << 32);

	return hash_64(val, SWEVENT_HLIST_BITS);
}

4517 4518
static inline struct hlist_head *
__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4519
{
4520 4521 4522 4523
	u64 hash = swevent_hash(type, event_id);

	return &hlist->heads[hash];
}
4524

4525 4526
/* For the read side: events when they trigger */
static inline struct hlist_head *
4527
find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4528 4529
{
	struct swevent_hlist *hlist;
4530

4531
	hlist = rcu_dereference(swhash->swevent_hlist);
4532 4533 4534
	if (!hlist)
		return NULL;

4535 4536 4537 4538 4539
	return __find_swevent_head(hlist, type, event_id);
}

/* For the event head insertion and removal in the hlist */
static inline struct hlist_head *
4540
find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4541 4542 4543 4544 4545 4546 4547 4548 4549 4550
{
	struct swevent_hlist *hlist;
	u32 event_id = event->attr.config;
	u64 type = event->attr.type;

	/*
	 * Event scheduling is always serialized against hlist allocation
	 * and release. Which makes the protected version suitable here.
	 * The context lock guarantees that.
	 */
4551
	hlist = rcu_dereference_protected(swhash->swevent_hlist,
4552 4553 4554 4555 4556
					  lockdep_is_held(&event->ctx->lock));
	if (!hlist)
		return NULL;

	return __find_swevent_head(hlist, type, event_id);
4557 4558 4559 4560 4561 4562
}

static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
				    u64 nr, int nmi,
				    struct perf_sample_data *data,
				    struct pt_regs *regs)
4563
{
4564
	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4565
	struct perf_event *event;
4566 4567
	struct hlist_node *node;
	struct hlist_head *head;
4568

4569
	rcu_read_lock();
4570
	head = find_swevent_head_rcu(swhash, type, event_id);
4571 4572 4573 4574
	if (!head)
		goto end;

	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
L
Li Zefan 已提交
4575
		if (perf_swevent_match(event, type, event_id, data, regs))
P
Peter Zijlstra 已提交
4576
			perf_swevent_event(event, nr, nmi, data, regs);
4577
	}
4578 4579
end:
	rcu_read_unlock();
4580 4581
}

4582
int perf_swevent_get_recursion_context(void)
P
Peter Zijlstra 已提交
4583
{
4584
	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
P
Peter Zijlstra 已提交
4585

4586
	return get_recursion_context(swhash->recursion);
P
Peter Zijlstra 已提交
4587
}
I
Ingo Molnar 已提交
4588
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
P
Peter Zijlstra 已提交
4589

4590
void inline perf_swevent_put_recursion_context(int rctx)
4591
{
4592
	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4593

4594
	put_recursion_context(swhash->recursion, rctx);
4595
}
4596

4597
void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4598
			    struct pt_regs *regs, u64 addr)
4599
{
4600
	struct perf_sample_data data;
4601 4602
	int rctx;

4603
	preempt_disable_notrace();
4604 4605 4606
	rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		return;
4607

4608
	perf_sample_data_init(&data, addr);
4609

4610
	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4611 4612

	perf_swevent_put_recursion_context(rctx);
4613
	preempt_enable_notrace();
4614 4615
}

4616
static void perf_swevent_read(struct perf_event *event)
4617 4618 4619
{
}

P
Peter Zijlstra 已提交
4620
static int perf_swevent_add(struct perf_event *event, int flags)
4621
{
4622
	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4623
	struct hw_perf_event *hwc = &event->hw;
4624 4625
	struct hlist_head *head;

4626 4627
	if (hwc->sample_period) {
		hwc->last_period = hwc->sample_period;
4628
		perf_swevent_set_period(event);
4629
	}
4630

P
Peter Zijlstra 已提交
4631 4632
	hwc->state = !(flags & PERF_EF_START);

4633
	head = find_swevent_head(swhash, event);
4634 4635 4636 4637 4638
	if (WARN_ON_ONCE(!head))
		return -EINVAL;

	hlist_add_head_rcu(&event->hlist_entry, head);

4639 4640 4641
	return 0;
}

P
Peter Zijlstra 已提交
4642
static void perf_swevent_del(struct perf_event *event, int flags)
4643
{
4644
	hlist_del_rcu(&event->hlist_entry);
4645 4646
}

P
Peter Zijlstra 已提交
4647
static void perf_swevent_start(struct perf_event *event, int flags)
4648
{
P
Peter Zijlstra 已提交
4649
	event->hw.state = 0;
4650 4651
}

P
Peter Zijlstra 已提交
4652
static void perf_swevent_stop(struct perf_event *event, int flags)
4653
{
P
Peter Zijlstra 已提交
4654
	event->hw.state = PERF_HES_STOPPED;
4655 4656
}

4657 4658
/* Deref the hlist from the update side */
static inline struct swevent_hlist *
4659
swevent_hlist_deref(struct swevent_htable *swhash)
4660
{
4661 4662
	return rcu_dereference_protected(swhash->swevent_hlist,
					 lockdep_is_held(&swhash->hlist_mutex));
4663 4664
}

4665 4666 4667 4668 4669 4670 4671 4672
static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
{
	struct swevent_hlist *hlist;

	hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
	kfree(hlist);
}

4673
static void swevent_hlist_release(struct swevent_htable *swhash)
4674
{
4675
	struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
4676

4677
	if (!hlist)
4678 4679
		return;

4680
	rcu_assign_pointer(swhash->swevent_hlist, NULL);
4681 4682 4683 4684 4685
	call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
}

static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
{
4686
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4687

4688
	mutex_lock(&swhash->hlist_mutex);
4689

4690 4691
	if (!--swhash->hlist_refcount)
		swevent_hlist_release(swhash);
4692

4693
	mutex_unlock(&swhash->hlist_mutex);
4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710
}

static void swevent_hlist_put(struct perf_event *event)
{
	int cpu;

	if (event->cpu != -1) {
		swevent_hlist_put_cpu(event, event->cpu);
		return;
	}

	for_each_possible_cpu(cpu)
		swevent_hlist_put_cpu(event, cpu);
}

static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
{
4711
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4712 4713
	int err = 0;

4714
	mutex_lock(&swhash->hlist_mutex);
4715

4716
	if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
4717 4718 4719 4720 4721 4722 4723
		struct swevent_hlist *hlist;

		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
		if (!hlist) {
			err = -ENOMEM;
			goto exit;
		}
4724
		rcu_assign_pointer(swhash->swevent_hlist, hlist);
4725
	}
4726
	swhash->hlist_refcount++;
P
Peter Zijlstra 已提交
4727
exit:
4728
	mutex_unlock(&swhash->hlist_mutex);
4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751

	return err;
}

static int swevent_hlist_get(struct perf_event *event)
{
	int err;
	int cpu, failed_cpu;

	if (event->cpu != -1)
		return swevent_hlist_get_cpu(event, event->cpu);

	get_online_cpus();
	for_each_possible_cpu(cpu) {
		err = swevent_hlist_get_cpu(event, cpu);
		if (err) {
			failed_cpu = cpu;
			goto fail;
		}
	}
	put_online_cpus();

	return 0;
P
Peter Zijlstra 已提交
4752
fail:
4753 4754 4755 4756 4757 4758 4759 4760 4761 4762
	for_each_possible_cpu(cpu) {
		if (cpu == failed_cpu)
			break;
		swevent_hlist_put_cpu(event, cpu);
	}

	put_online_cpus();
	return err;
}

4763
atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4764

4765 4766 4767
static void sw_perf_event_destroy(struct perf_event *event)
{
	u64 event_id = event->attr.config;
4768

4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808
	WARN_ON(event->parent);

	atomic_dec(&perf_swevent_enabled[event_id]);
	swevent_hlist_put(event);
}

static int perf_swevent_init(struct perf_event *event)
{
	int event_id = event->attr.config;

	if (event->attr.type != PERF_TYPE_SOFTWARE)
		return -ENOENT;

	switch (event_id) {
	case PERF_COUNT_SW_CPU_CLOCK:
	case PERF_COUNT_SW_TASK_CLOCK:
		return -ENOENT;

	default:
		break;
	}

	if (event_id > PERF_COUNT_SW_MAX)
		return -ENOENT;

	if (!event->parent) {
		int err;

		err = swevent_hlist_get(event);
		if (err)
			return err;

		atomic_inc(&perf_swevent_enabled[event_id]);
		event->destroy = sw_perf_event_destroy;
	}

	return 0;
}

static struct pmu perf_swevent = {
4809 4810
	.task_ctx_nr	= perf_sw_context,

4811
	.event_init	= perf_swevent_init,
P
Peter Zijlstra 已提交
4812 4813 4814 4815
	.add		= perf_swevent_add,
	.del		= perf_swevent_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
4816 4817 4818
	.read		= perf_swevent_read,
};

4819 4820
#ifdef CONFIG_EVENT_TRACING

4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834
static int perf_tp_filter_match(struct perf_event *event,
				struct perf_sample_data *data)
{
	void *record = data->raw->data;

	if (likely(!event->filter) || filter_match_preds(event->filter, record))
		return 1;
	return 0;
}

static int perf_tp_event_match(struct perf_event *event,
				struct perf_sample_data *data,
				struct pt_regs *regs)
{
4835 4836 4837 4838
	/*
	 * All tracepoints are from kernel-space.
	 */
	if (event->attr.exclude_kernel)
4839 4840 4841 4842 4843 4844 4845 4846 4847
		return 0;

	if (!perf_tp_filter_match(event, data))
		return 0;

	return 1;
}

void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4848
		   struct pt_regs *regs, struct hlist_head *head, int rctx)
4849 4850
{
	struct perf_sample_data data;
4851 4852 4853
	struct perf_event *event;
	struct hlist_node *node;

4854 4855 4856 4857 4858 4859 4860 4861
	struct perf_raw_record raw = {
		.size = entry_size,
		.data = record,
	};

	perf_sample_data_init(&data, addr);
	data.raw = &raw;

4862 4863
	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
		if (perf_tp_event_match(event, &data, regs))
P
Peter Zijlstra 已提交
4864
			perf_swevent_event(event, count, 1, &data, regs);
4865
	}
4866 4867

	perf_swevent_put_recursion_context(rctx);
4868 4869 4870
}
EXPORT_SYMBOL_GPL(perf_tp_event);

4871
static void tp_perf_event_destroy(struct perf_event *event)
4872
{
4873
	perf_trace_destroy(event);
4874 4875
}

4876
static int perf_tp_event_init(struct perf_event *event)
4877
{
4878 4879
	int err;

4880 4881 4882
	if (event->attr.type != PERF_TYPE_TRACEPOINT)
		return -ENOENT;

4883 4884 4885 4886
	/*
	 * Raw tracepoint data is a severe data leak, only allow root to
	 * have these.
	 */
4887
	if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4888
			perf_paranoid_tracepoint_raw() &&
4889
			!capable(CAP_SYS_ADMIN))
4890
		return -EPERM;
4891

4892 4893
	err = perf_trace_init(event);
	if (err)
4894
		return err;
4895

4896
	event->destroy = tp_perf_event_destroy;
4897

4898 4899 4900 4901
	return 0;
}

static struct pmu perf_tracepoint = {
4902 4903
	.task_ctx_nr	= perf_sw_context,

4904
	.event_init	= perf_tp_event_init,
P
Peter Zijlstra 已提交
4905 4906 4907 4908
	.add		= perf_trace_add,
	.del		= perf_trace_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
4909 4910 4911 4912 4913 4914
	.read		= perf_swevent_read,
};

static inline void perf_tp_register(void)
{
	perf_pmu_register(&perf_tracepoint);
4915
}
L
Li Zefan 已提交
4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939

static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
	char *filter_str;
	int ret;

	if (event->attr.type != PERF_TYPE_TRACEPOINT)
		return -EINVAL;

	filter_str = strndup_user(arg, PAGE_SIZE);
	if (IS_ERR(filter_str))
		return PTR_ERR(filter_str);

	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);

	kfree(filter_str);
	return ret;
}

static void perf_event_free_filter(struct perf_event *event)
{
	ftrace_profile_free_filter(event);
}

4940
#else
L
Li Zefan 已提交
4941

4942
static inline void perf_tp_register(void)
4943 4944
{
}
L
Li Zefan 已提交
4945 4946 4947 4948 4949 4950 4951 4952 4953 4954

static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
	return -ENOENT;
}

static void perf_event_free_filter(struct perf_event *event)
{
}

4955
#endif /* CONFIG_EVENT_TRACING */
4956

4957
#ifdef CONFIG_HAVE_HW_BREAKPOINT
4958
void perf_bp_event(struct perf_event *bp, void *data)
4959
{
4960 4961 4962 4963 4964
	struct perf_sample_data sample;
	struct pt_regs *regs = data;

	perf_sample_data_init(&sample, bp->attr.bp_addr);

P
Peter Zijlstra 已提交
4965 4966
	if (!bp->hw.state && !perf_exclude_event(bp, regs))
		perf_swevent_event(bp, 1, 1, &sample, regs);
4967
}
4968 4969 4970 4971 4972
#endif

/*
 * hrtimer based swevent callback
 */
4973

4974
static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4975
{
4976 4977 4978 4979 4980
	enum hrtimer_restart ret = HRTIMER_RESTART;
	struct perf_sample_data data;
	struct pt_regs *regs;
	struct perf_event *event;
	u64 period;
4981

4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993
	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
	event->pmu->read(event);

	perf_sample_data_init(&data, 0);
	data.period = event->hw.last_period;
	regs = get_irq_regs();

	if (regs && !perf_exclude_event(event, regs)) {
		if (!(event->attr.exclude_idle && current->pid == 0))
			if (perf_event_overflow(event, 0, &data, regs))
				ret = HRTIMER_NORESTART;
	}
4994

4995 4996
	period = max_t(u64, 10000, event->hw.sample_period);
	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4997

4998
	return ret;
4999 5000
}

5001
static void perf_swevent_start_hrtimer(struct perf_event *event)
5002
{
5003
	struct hw_perf_event *hwc = &event->hw;
5004

5005 5006 5007
	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swevent_hrtimer;
	if (hwc->sample_period) {
P
Peter Zijlstra 已提交
5008
		s64 period = local64_read(&hwc->period_left);
5009

P
Peter Zijlstra 已提交
5010 5011
		if (period) {
			if (period < 0)
5012
				period = 10000;
P
Peter Zijlstra 已提交
5013 5014

			local64_set(&hwc->period_left, 0);
5015 5016 5017 5018 5019
		} else {
			period = max_t(u64, 10000, hwc->sample_period);
		}
		__hrtimer_start_range_ns(&hwc->hrtimer,
				ns_to_ktime(period), 0,
5020
				HRTIMER_MODE_REL_PINNED, 0);
5021
	}
5022
}
5023 5024

static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5025
{
5026 5027 5028 5029
	struct hw_perf_event *hwc = &event->hw;

	if (hwc->sample_period) {
		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
P
Peter Zijlstra 已提交
5030
		local64_set(&hwc->period_left, ktime_to_ns(remaining));
5031 5032 5033

		hrtimer_cancel(&hwc->hrtimer);
	}
5034 5035
}

5036 5037 5038 5039 5040
/*
 * Software event: cpu wall time clock
 */

static void cpu_clock_event_update(struct perf_event *event)
5041
{
5042 5043 5044
	s64 prev;
	u64 now;

P
Peter Zijlstra 已提交
5045
	now = local_clock();
5046 5047
	prev = local64_xchg(&event->hw.prev_count, now);
	local64_add(now - prev, &event->count);
5048 5049
}

P
Peter Zijlstra 已提交
5050
static void cpu_clock_event_start(struct perf_event *event, int flags)
5051
{
P
Peter Zijlstra 已提交
5052
	local64_set(&event->hw.prev_count, local_clock());
5053 5054 5055
	perf_swevent_start_hrtimer(event);
}

P
Peter Zijlstra 已提交
5056
static void cpu_clock_event_stop(struct perf_event *event, int flags)
5057
{
5058 5059 5060
	perf_swevent_cancel_hrtimer(event);
	cpu_clock_event_update(event);
}
5061

P
Peter Zijlstra 已提交
5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074
static int cpu_clock_event_add(struct perf_event *event, int flags)
{
	if (flags & PERF_EF_START)
		cpu_clock_event_start(event, flags);

	return 0;
}

static void cpu_clock_event_del(struct perf_event *event, int flags)
{
	cpu_clock_event_stop(event, flags);
}

5075 5076 5077 5078
static void cpu_clock_event_read(struct perf_event *event)
{
	cpu_clock_event_update(event);
}
5079

5080 5081 5082 5083 5084 5085 5086 5087 5088
static int cpu_clock_event_init(struct perf_event *event)
{
	if (event->attr.type != PERF_TYPE_SOFTWARE)
		return -ENOENT;

	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
		return -ENOENT;

	return 0;
5089 5090
}

5091
static struct pmu perf_cpu_clock = {
5092 5093
	.task_ctx_nr	= perf_sw_context,

5094
	.event_init	= cpu_clock_event_init,
P
Peter Zijlstra 已提交
5095 5096 5097 5098
	.add		= cpu_clock_event_add,
	.del		= cpu_clock_event_del,
	.start		= cpu_clock_event_start,
	.stop		= cpu_clock_event_stop,
5099 5100 5101 5102 5103 5104 5105 5106
	.read		= cpu_clock_event_read,
};

/*
 * Software event: task time clock
 */

static void task_clock_event_update(struct perf_event *event, u64 now)
5107
{
5108 5109
	u64 prev;
	s64 delta;
5110

5111 5112 5113 5114
	prev = local64_xchg(&event->hw.prev_count, now);
	delta = now - prev;
	local64_add(delta, &event->count);
}
5115

P
Peter Zijlstra 已提交
5116
static void task_clock_event_start(struct perf_event *event, int flags)
5117
{
P
Peter Zijlstra 已提交
5118
	local64_set(&event->hw.prev_count, event->ctx->time);
5119 5120 5121
	perf_swevent_start_hrtimer(event);
}

P
Peter Zijlstra 已提交
5122
static void task_clock_event_stop(struct perf_event *event, int flags)
5123 5124 5125
{
	perf_swevent_cancel_hrtimer(event);
	task_clock_event_update(event, event->ctx->time);
P
Peter Zijlstra 已提交
5126 5127 5128 5129 5130 5131
}

static int task_clock_event_add(struct perf_event *event, int flags)
{
	if (flags & PERF_EF_START)
		task_clock_event_start(event, flags);
5132

P
Peter Zijlstra 已提交
5133 5134 5135 5136 5137 5138
	return 0;
}

static void task_clock_event_del(struct perf_event *event, int flags)
{
	task_clock_event_stop(event, PERF_EF_UPDATE);
5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157
}

static void task_clock_event_read(struct perf_event *event)
{
	u64 time;

	if (!in_nmi()) {
		update_context_time(event->ctx);
		time = event->ctx->time;
	} else {
		u64 now = perf_clock();
		u64 delta = now - event->ctx->timestamp;
		time = event->ctx->time + delta;
	}

	task_clock_event_update(event, time);
}

static int task_clock_event_init(struct perf_event *event)
L
Li Zefan 已提交
5158
{
5159 5160 5161 5162 5163 5164 5165
	if (event->attr.type != PERF_TYPE_SOFTWARE)
		return -ENOENT;

	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
		return -ENOENT;

	return 0;
L
Li Zefan 已提交
5166 5167
}

5168
static struct pmu perf_task_clock = {
5169 5170
	.task_ctx_nr	= perf_sw_context,

5171
	.event_init	= task_clock_event_init,
P
Peter Zijlstra 已提交
5172 5173 5174 5175
	.add		= task_clock_event_add,
	.del		= task_clock_event_del,
	.start		= task_clock_event_start,
	.stop		= task_clock_event_stop,
5176 5177
	.read		= task_clock_event_read,
};
L
Li Zefan 已提交
5178

P
Peter Zijlstra 已提交
5179
static void perf_pmu_nop_void(struct pmu *pmu)
5180 5181
{
}
L
Li Zefan 已提交
5182

P
Peter Zijlstra 已提交
5183
static int perf_pmu_nop_int(struct pmu *pmu)
L
Li Zefan 已提交
5184
{
P
Peter Zijlstra 已提交
5185
	return 0;
L
Li Zefan 已提交
5186 5187
}

P
Peter Zijlstra 已提交
5188
static void perf_pmu_start_txn(struct pmu *pmu)
L
Li Zefan 已提交
5189
{
P
Peter Zijlstra 已提交
5190
	perf_pmu_disable(pmu);
L
Li Zefan 已提交
5191 5192
}

P
Peter Zijlstra 已提交
5193 5194 5195 5196 5197
static int perf_pmu_commit_txn(struct pmu *pmu)
{
	perf_pmu_enable(pmu);
	return 0;
}
5198

P
Peter Zijlstra 已提交
5199
static void perf_pmu_cancel_txn(struct pmu *pmu)
5200
{
P
Peter Zijlstra 已提交
5201
	perf_pmu_enable(pmu);
5202 5203
}

P
Peter Zijlstra 已提交
5204 5205 5206 5207 5208
/*
 * Ensures all contexts with the same task_ctx_nr have the same
 * pmu_cpu_context too.
 */
static void *find_pmu_context(int ctxn)
5209
{
P
Peter Zijlstra 已提交
5210
	struct pmu *pmu;
5211

P
Peter Zijlstra 已提交
5212 5213
	if (ctxn < 0)
		return NULL;
5214

P
Peter Zijlstra 已提交
5215 5216 5217 5218
	list_for_each_entry(pmu, &pmus, entry) {
		if (pmu->task_ctx_nr == ctxn)
			return pmu->pmu_cpu_context;
	}
5219

P
Peter Zijlstra 已提交
5220
	return NULL;
5221 5222
}

P
Peter Zijlstra 已提交
5223
static void free_pmu_context(void * __percpu cpu_context)
5224
{
P
Peter Zijlstra 已提交
5225
	struct pmu *pmu;
5226

P
Peter Zijlstra 已提交
5227 5228 5229 5230 5231 5232 5233 5234
	mutex_lock(&pmus_lock);
	/*
	 * Like a real lame refcount.
	 */
	list_for_each_entry(pmu, &pmus, entry) {
		if (pmu->pmu_cpu_context == cpu_context)
			goto out;
	}
5235

P
Peter Zijlstra 已提交
5236 5237 5238
	free_percpu(cpu_context);
out:
	mutex_unlock(&pmus_lock);
5239 5240
}

5241
int perf_pmu_register(struct pmu *pmu)
5242
{
P
Peter Zijlstra 已提交
5243
	int cpu, ret;
5244

5245
	mutex_lock(&pmus_lock);
P
Peter Zijlstra 已提交
5246 5247 5248 5249
	ret = -ENOMEM;
	pmu->pmu_disable_count = alloc_percpu(int);
	if (!pmu->pmu_disable_count)
		goto unlock;
5250

P
Peter Zijlstra 已提交
5251 5252 5253
	pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
	if (pmu->pmu_cpu_context)
		goto got_cpu_context;
5254

P
Peter Zijlstra 已提交
5255 5256 5257
	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
	if (!pmu->pmu_cpu_context)
		goto free_pdc;
5258

P
Peter Zijlstra 已提交
5259 5260 5261 5262
	for_each_possible_cpu(cpu) {
		struct perf_cpu_context *cpuctx;

		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5263
		__perf_event_init_context(&cpuctx->ctx);
5264
		cpuctx->ctx.type = cpu_context;
P
Peter Zijlstra 已提交
5265
		cpuctx->ctx.pmu = pmu;
5266 5267
		cpuctx->jiffies_interval = 1;
		INIT_LIST_HEAD(&cpuctx->rotation_list);
P
Peter Zijlstra 已提交
5268 5269
	}

P
Peter Zijlstra 已提交
5270
got_cpu_context:
P
Peter Zijlstra 已提交
5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292
	if (!pmu->start_txn) {
		if (pmu->pmu_enable) {
			/*
			 * If we have pmu_enable/pmu_disable calls, install
			 * transaction stubs that use that to try and batch
			 * hardware accesses.
			 */
			pmu->start_txn  = perf_pmu_start_txn;
			pmu->commit_txn = perf_pmu_commit_txn;
			pmu->cancel_txn = perf_pmu_cancel_txn;
		} else {
			pmu->start_txn  = perf_pmu_nop_void;
			pmu->commit_txn = perf_pmu_nop_int;
			pmu->cancel_txn = perf_pmu_nop_void;
		}
	}

	if (!pmu->pmu_enable) {
		pmu->pmu_enable  = perf_pmu_nop_void;
		pmu->pmu_disable = perf_pmu_nop_void;
	}

5293
	list_add_rcu(&pmu->entry, &pmus);
P
Peter Zijlstra 已提交
5294 5295
	ret = 0;
unlock:
5296 5297
	mutex_unlock(&pmus_lock);

P
Peter Zijlstra 已提交
5298
	return ret;
P
Peter Zijlstra 已提交
5299 5300 5301 5302

free_pdc:
	free_percpu(pmu->pmu_disable_count);
	goto unlock;
5303 5304
}

5305
void perf_pmu_unregister(struct pmu *pmu)
5306
{
5307 5308 5309
	mutex_lock(&pmus_lock);
	list_del_rcu(&pmu->entry);
	mutex_unlock(&pmus_lock);
5310

5311
	/*
P
Peter Zijlstra 已提交
5312 5313
	 * We dereference the pmu list under both SRCU and regular RCU, so
	 * synchronize against both of those.
5314
	 */
5315
	synchronize_srcu(&pmus_srcu);
P
Peter Zijlstra 已提交
5316
	synchronize_rcu();
5317

P
Peter Zijlstra 已提交
5318
	free_percpu(pmu->pmu_disable_count);
P
Peter Zijlstra 已提交
5319
	free_pmu_context(pmu->pmu_cpu_context);
5320
}
5321

5322 5323 5324 5325 5326 5327 5328 5329 5330
struct pmu *perf_init_event(struct perf_event *event)
{
	struct pmu *pmu = NULL;
	int idx;

	idx = srcu_read_lock(&pmus_srcu);
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		int ret = pmu->event_init(event);
		if (!ret)
P
Peter Zijlstra 已提交
5331
			goto unlock;
5332

5333 5334
		if (ret != -ENOENT) {
			pmu = ERR_PTR(ret);
P
Peter Zijlstra 已提交
5335
			goto unlock;
5336
		}
5337
	}
P
Peter Zijlstra 已提交
5338 5339
	pmu = ERR_PTR(-ENOENT);
unlock:
5340
	srcu_read_unlock(&pmus_srcu, idx);
5341

5342
	return pmu;
5343 5344
}

T
Thomas Gleixner 已提交
5345
/*
5346
 * Allocate and initialize a event structure
T
Thomas Gleixner 已提交
5347
 */
5348
static struct perf_event *
5349
perf_event_alloc(struct perf_event_attr *attr, int cpu,
5350 5351
		   struct perf_event *group_leader,
		   struct perf_event *parent_event,
5352
		   perf_overflow_handler_t overflow_handler)
T
Thomas Gleixner 已提交
5353
{
P
Peter Zijlstra 已提交
5354
	struct pmu *pmu;
5355 5356
	struct perf_event *event;
	struct hw_perf_event *hwc;
5357
	long err;
T
Thomas Gleixner 已提交
5358

5359
	event = kzalloc(sizeof(*event), GFP_KERNEL);
5360
	if (!event)
5361
		return ERR_PTR(-ENOMEM);
T
Thomas Gleixner 已提交
5362

5363
	/*
5364
	 * Single events are their own group leaders, with an
5365 5366 5367
	 * empty sibling list:
	 */
	if (!group_leader)
5368
		group_leader = event;
5369

5370 5371
	mutex_init(&event->child_mutex);
	INIT_LIST_HEAD(&event->child_list);
5372

5373 5374 5375 5376
	INIT_LIST_HEAD(&event->group_entry);
	INIT_LIST_HEAD(&event->event_entry);
	INIT_LIST_HEAD(&event->sibling_list);
	init_waitqueue_head(&event->waitq);
T
Thomas Gleixner 已提交
5377

5378
	mutex_init(&event->mmap_mutex);
5379

5380 5381 5382 5383 5384
	event->cpu		= cpu;
	event->attr		= *attr;
	event->group_leader	= group_leader;
	event->pmu		= NULL;
	event->oncpu		= -1;
5385

5386
	event->parent		= parent_event;
5387

5388 5389
	event->ns		= get_pid_ns(current->nsproxy->pid_ns);
	event->id		= atomic64_inc_return(&perf_event_id);
5390

5391
	event->state		= PERF_EVENT_STATE_INACTIVE;
5392

5393 5394
	if (!overflow_handler && parent_event)
		overflow_handler = parent_event->overflow_handler;
5395
	
5396
	event->overflow_handler	= overflow_handler;
5397

5398
	if (attr->disabled)
5399
		event->state = PERF_EVENT_STATE_OFF;
5400

5401
	pmu = NULL;
5402

5403
	hwc = &event->hw;
5404
	hwc->sample_period = attr->sample_period;
5405
	if (attr->freq && attr->sample_freq)
5406
		hwc->sample_period = 1;
5407
	hwc->last_period = hwc->sample_period;
5408

5409
	local64_set(&hwc->period_left, hwc->sample_period);
5410

5411
	/*
5412
	 * we currently do not support PERF_FORMAT_GROUP on inherited events
5413
	 */
5414
	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
5415 5416
		goto done;

5417
	pmu = perf_init_event(event);
5418

5419 5420
done:
	err = 0;
5421
	if (!pmu)
5422
		err = -EINVAL;
5423 5424
	else if (IS_ERR(pmu))
		err = PTR_ERR(pmu);
5425

5426
	if (err) {
5427 5428 5429
		if (event->ns)
			put_pid_ns(event->ns);
		kfree(event);
5430
		return ERR_PTR(err);
I
Ingo Molnar 已提交
5431
	}
5432

5433
	event->pmu = pmu;
T
Thomas Gleixner 已提交
5434

5435 5436
	if (!event->parent) {
		atomic_inc(&nr_events);
5437
		if (event->attr.mmap || event->attr.mmap_data)
5438 5439 5440 5441 5442
			atomic_inc(&nr_mmap_events);
		if (event->attr.comm)
			atomic_inc(&nr_comm_events);
		if (event->attr.task)
			atomic_inc(&nr_task_events);
5443 5444 5445 5446 5447 5448 5449
		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
			err = get_callchain_buffers();
			if (err) {
				free_event(event);
				return ERR_PTR(err);
			}
		}
5450
	}
5451

5452
	return event;
T
Thomas Gleixner 已提交
5453 5454
}

5455 5456
static int perf_copy_attr(struct perf_event_attr __user *uattr,
			  struct perf_event_attr *attr)
5457 5458
{
	u32 size;
5459
	int ret;
5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483

	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
		return -EFAULT;

	/*
	 * zero the full structure, so that a short copy will be nice.
	 */
	memset(attr, 0, sizeof(*attr));

	ret = get_user(size, &uattr->size);
	if (ret)
		return ret;

	if (size > PAGE_SIZE)	/* silly large */
		goto err_size;

	if (!size)		/* abi compat */
		size = PERF_ATTR_SIZE_VER0;

	if (size < PERF_ATTR_SIZE_VER0)
		goto err_size;

	/*
	 * If we're handed a bigger struct than we know of,
5484 5485 5486
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we dont know about yet.
5487 5488
	 */
	if (size > sizeof(*attr)) {
5489 5490 5491
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;
5492

5493 5494
		addr = (void __user *)uattr + sizeof(*attr);
		end  = (void __user *)uattr + size;
5495

5496
		for (; addr < end; addr++) {
5497 5498 5499 5500 5501 5502
			ret = get_user(val, addr);
			if (ret)
				return ret;
			if (val)
				goto err_size;
		}
5503
		size = sizeof(*attr);
5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516
	}

	ret = copy_from_user(attr, uattr, size);
	if (ret)
		return -EFAULT;

	/*
	 * If the type exists, the corresponding creation will verify
	 * the attr->config.
	 */
	if (attr->type >= PERF_TYPE_MAX)
		return -EINVAL;

5517
	if (attr->__reserved_1)
5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534
		return -EINVAL;

	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
		return -EINVAL;

	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
		return -EINVAL;

out:
	return ret;

err_size:
	put_user(sizeof(*attr), &uattr->size);
	ret = -E2BIG;
	goto out;
}

5535 5536
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
5537
{
5538
	struct perf_buffer *buffer = NULL, *old_buffer = NULL;
5539 5540
	int ret = -EINVAL;

5541
	if (!output_event)
5542 5543
		goto set;

5544 5545
	/* don't allow circular references */
	if (event == output_event)
5546 5547
		goto out;

5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559
	/*
	 * Don't allow cross-cpu buffers
	 */
	if (output_event->cpu != event->cpu)
		goto out;

	/*
	 * If its not a per-cpu buffer, it must be the same task.
	 */
	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
		goto out;

5560
set:
5561
	mutex_lock(&event->mmap_mutex);
5562 5563 5564
	/* Can't redirect output if we've got an active mmap() */
	if (atomic_read(&event->mmap_count))
		goto unlock;
5565

5566 5567
	if (output_event) {
		/* get the buffer we want to redirect to */
5568 5569
		buffer = perf_buffer_get(output_event);
		if (!buffer)
5570
			goto unlock;
5571 5572
	}

5573 5574
	old_buffer = event->buffer;
	rcu_assign_pointer(event->buffer, buffer);
5575
	ret = 0;
5576 5577 5578
unlock:
	mutex_unlock(&event->mmap_mutex);

5579 5580
	if (old_buffer)
		perf_buffer_put(old_buffer);
5581 5582 5583 5584
out:
	return ret;
}

T
Thomas Gleixner 已提交
5585
/**
5586
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
I
Ingo Molnar 已提交
5587
 *
5588
 * @attr_uptr:	event_id type attributes for monitoring/sampling
T
Thomas Gleixner 已提交
5589
 * @pid:		target pid
I
Ingo Molnar 已提交
5590
 * @cpu:		target cpu
5591
 * @group_fd:		group leader event fd
T
Thomas Gleixner 已提交
5592
 */
5593 5594
SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
5595
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
T
Thomas Gleixner 已提交
5596
{
5597 5598
	struct perf_event *group_leader = NULL, *output_event = NULL;
	struct perf_event *event, *sibling;
5599 5600 5601
	struct perf_event_attr attr;
	struct perf_event_context *ctx;
	struct file *event_file = NULL;
5602
	struct file *group_file = NULL;
M
Matt Helsley 已提交
5603
	struct task_struct *task = NULL;
5604
	struct pmu *pmu;
5605
	int event_fd;
5606
	int move_group = 0;
5607
	int fput_needed = 0;
5608
	int err;
T
Thomas Gleixner 已提交
5609

5610
	/* for future expandability... */
5611
	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
5612 5613
		return -EINVAL;

5614 5615 5616
	err = perf_copy_attr(attr_uptr, &attr);
	if (err)
		return err;
5617

5618 5619 5620 5621 5622
	if (!attr.exclude_kernel) {
		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

5623
	if (attr.freq) {
5624
		if (attr.sample_freq > sysctl_perf_event_sample_rate)
5625 5626 5627
			return -EINVAL;
	}

5628 5629 5630 5631
	event_fd = get_unused_fd_flags(O_RDWR);
	if (event_fd < 0)
		return event_fd;

5632 5633 5634 5635
	if (group_fd != -1) {
		group_leader = perf_fget_light(group_fd, &fput_needed);
		if (IS_ERR(group_leader)) {
			err = PTR_ERR(group_leader);
5636
			goto err_fd;
5637 5638 5639 5640 5641 5642 5643 5644
		}
		group_file = group_leader->filp;
		if (flags & PERF_FLAG_FD_OUTPUT)
			output_event = group_leader;
		if (flags & PERF_FLAG_FD_NO_GROUP)
			group_leader = NULL;
	}

5645 5646 5647 5648 5649 5650
	event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
	if (IS_ERR(event)) {
		err = PTR_ERR(event);
		goto err_fd;
	}

5651 5652 5653 5654 5655
	/*
	 * Special case software events and allow them to be part of
	 * any hardware group.
	 */
	pmu = event->pmu;
5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678

	if (group_leader &&
	    (is_software_event(event) != is_software_event(group_leader))) {
		if (is_software_event(event)) {
			/*
			 * If event and group_leader are not both a software
			 * event, and event is, then group leader is not.
			 *
			 * Allow the addition of software events to !software
			 * groups, this is safe because software events never
			 * fail to schedule.
			 */
			pmu = group_leader->pmu;
		} else if (is_software_event(group_leader) &&
			   (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
			/*
			 * In case the group is a pure software group, and we
			 * try to add a hardware event, move the whole group to
			 * the hardware context.
			 */
			move_group = 1;
		}
	}
5679

5680
	if (pid != -1) {
M
Matt Helsley 已提交
5681
		task = find_lively_task_by_vpid(pid);
5682 5683 5684 5685 5686
		if (IS_ERR(task)) {
			err = PTR_ERR(task);
			goto err_group_fd;
		}
	}
M
Matt Helsley 已提交
5687

5688 5689 5690
	/*
	 * Get the target context (task or percpu):
	 */
M
Matt Helsley 已提交
5691
	ctx = find_get_context(pmu, task, cpu);
5692 5693 5694 5695 5696
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto err_group_fd;
	}

I
Ingo Molnar 已提交
5697
	/*
5698
	 * Look up the group leader (we will attach this event to it):
5699
	 */
5700
	if (group_leader) {
5701
		err = -EINVAL;
5702 5703

		/*
I
Ingo Molnar 已提交
5704 5705 5706 5707
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
5708
			goto err_context;
I
Ingo Molnar 已提交
5709 5710 5711
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
5712
		 */
5713 5714 5715 5716 5717 5718 5719 5720
		if (move_group) {
			if (group_leader->ctx->type != ctx->type)
				goto err_context;
		} else {
			if (group_leader->ctx != ctx)
				goto err_context;
		}

5721 5722 5723
		/*
		 * Only a group leader can be exclusive or pinned
		 */
5724
		if (attr.exclusive || attr.pinned)
5725
			goto err_context;
5726 5727 5728 5729 5730
	}

	if (output_event) {
		err = perf_event_set_output(event, output_event);
		if (err)
5731
			goto err_context;
5732
	}
T
Thomas Gleixner 已提交
5733

5734 5735 5736
	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
	if (IS_ERR(event_file)) {
		err = PTR_ERR(event_file);
5737
		goto err_context;
5738
	}
5739

5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751
	if (move_group) {
		struct perf_event_context *gctx = group_leader->ctx;

		mutex_lock(&gctx->mutex);
		perf_event_remove_from_context(group_leader);
		list_for_each_entry(sibling, &group_leader->sibling_list,
				    group_entry) {
			perf_event_remove_from_context(sibling);
			put_ctx(gctx);
		}
		mutex_unlock(&gctx->mutex);
		put_ctx(gctx);
5752
	}
5753

5754
	event->filp = event_file;
5755
	WARN_ON_ONCE(ctx->parent_ctx);
5756
	mutex_lock(&ctx->mutex);
5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767

	if (move_group) {
		perf_install_in_context(ctx, group_leader, cpu);
		get_ctx(ctx);
		list_for_each_entry(sibling, &group_leader->sibling_list,
				    group_entry) {
			perf_install_in_context(ctx, sibling, cpu);
			get_ctx(ctx);
		}
	}

5768
	perf_install_in_context(ctx, event, cpu);
5769
	++ctx->generation;
5770
	mutex_unlock(&ctx->mutex);
5771

5772
	event->owner = current;
5773
	get_task_struct(current);
5774 5775 5776
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);
5777

5778 5779 5780 5781 5782 5783
	/*
	 * Drop the reference on the group_event after placing the
	 * new event on the sibling_list. This ensures destruction
	 * of the group leader will find the pointer to itself in
	 * perf_group_detach().
	 */
5784 5785 5786
	fput_light(group_file, fput_needed);
	fd_install(event_fd, event_file);
	return event_fd;
T
Thomas Gleixner 已提交
5787

5788
err_context:
5789
	put_ctx(ctx);
5790 5791
err_group_fd:
	fput_light(group_file, fput_needed);
5792
	free_event(event);
5793 5794
err_fd:
	put_unused_fd(event_fd);
5795
	return err;
T
Thomas Gleixner 已提交
5796 5797
}

/**
 * perf_event_create_kernel_counter - create and attach a counter from kernel code
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu to which the counter is bound
 * @task: task to profile (NULL for percpu)
 * @overflow_handler: callback invoked when the counter overflows
 */
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
				 struct task_struct *task,
				 perf_overflow_handler_t overflow_handler)
{
	struct perf_event_context *ctx;
	struct perf_event *event;
	int err;

	event = perf_event_alloc(attr, cpu, NULL, NULL, overflow_handler);
	if (IS_ERR(event)) {
		err = PTR_ERR(event);
		goto err;
	}

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(event->pmu, task, cpu);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto err_free;
	}

	event->filp = NULL;
	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, event, cpu);
	++ctx->generation;
	mutex_unlock(&ctx->mutex);

	event->owner = current;
	get_task_struct(current);
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);

	return event;

err_free:
	free_event(event);
err:
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
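
/*
 * Illustrative sketch (not part of this file's logic): a minimal in-kernel
 * user of perf_event_create_kernel_counter(). The example_* names, the
 * choice of event and the sample period are assumptions made purely for
 * demonstration; error handling is reduced to the essentials. Kept under
 * "#if 0" so it is never built.
 */
#if 0
static struct perf_event *example_event;

static void example_overflow(struct perf_event *event, int nmi,
			     struct perf_sample_data *data,
			     struct pt_regs *regs)
{
	/* Runs on every counter overflow; keep it short, it may run in NMI context. */
}

static int example_start_cycle_counter(int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(attr),
		.sample_period	= 1000000,	/* call the handler every ~1M cycles */
	};

	/* NULL task: bind the counter to @cpu rather than to a task. */
	example_event = perf_event_create_kernel_counter(&attr, cpu, NULL,
							 example_overflow);
	if (IS_ERR(example_event))
		return PTR_ERR(example_event);

	return 0;
}

static void example_stop_cycle_counter(void)
{
	/* Tear the counter down again when done. */
	perf_event_release_kernel(example_event);
}
#endif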

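/*
 * Fold an exiting child event's count and times back into its parent
 * event and unlink it from the parent's child list.
 */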
static void sync_child_event(struct perf_event *child_event,
			       struct task_struct *child)
{
	struct perf_event *parent_event = child_event->parent;
	u64 child_val;

	if (child_event->attr.inherit_stat)
		perf_event_read_event(child_event, child);

	child_val = perf_event_count(child_event);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_event->child_count);
	atomic64_add(child_event->total_time_enabled,
		     &parent_event->child_total_time_enabled);
	atomic64_add(child_event->total_time_running,
		     &parent_event->child_total_time_running);

	/*
	 * Remove this event from the parent's list
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_del_init(&child_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	/*
	 * Release the parent event, if this was the last
	 * reference to it.
	 */
	fput(parent_event->filp);
}

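/*
 * Detach one child event from its context at task exit. Inherited
 * events are synced back into their parent and freed; events without
 * a parent are only removed from the context here and linger until
 * they are released.
 */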
static void
__perf_event_exit_task(struct perf_event *child_event,
			 struct perf_event_context *child_ctx,
			 struct task_struct *child)
{
	struct perf_event *parent_event;

	perf_event_remove_from_context(child_event);

	parent_event = child_event->parent;
	/*
	 * It can happen that parent exits first, and has events
	 * that are still around due to the child reference. These
	 * events need to be zapped - but otherwise linger.
	 */
	if (parent_event) {
		sync_child_event(child_event, child);
		free_event(child_event);
	}
}

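/*
 * Tear down the ctxn'th per-task context of an exiting task: unschedule
 * it, detach it from the task and dispose of all events in it.
 */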
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
	struct perf_event *child_event, *tmp;
	struct perf_event_context *child_ctx;
	unsigned long flags;

	if (likely(!child->perf_event_ctxp[ctxn])) {
		perf_event_task(child, NULL, 0);
		return;
	}

	local_irq_save(flags);
	/*
	 * We can't reschedule here because interrupts are disabled,
	 * and either child is current or it is a task that can't be
	 * scheduled, so we are now safe from rescheduling changing
	 * our context.
	 */
	child_ctx = child->perf_event_ctxp[ctxn];
	__perf_event_task_sched_out(child_ctx);

	/*
	 * Take the context lock here so that if find_get_context is
	 * reading child->perf_event_ctxp, we wait until it has
	 * incremented the context's refcount before we do put_ctx below.
	 */
	raw_spin_lock(&child_ctx->lock);
	child->perf_event_ctxp[ctxn] = NULL;
	/*
	 * If this context is a clone, unclone it so it can't get
	 * swapped to another process while we're removing all
	 * the events from it.
	 */
	unclone_ctx(child_ctx);
	update_context_time(child_ctx);
	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can however still
	 * get a few PERF_RECORD_READ events.
	 */
	perf_event_task(child, child_ctx, 0);

	/*
	 * We can recurse on the same lock type through:
	 *
	 *   __perf_event_exit_task()
	 *     sync_child_event()
	 *       fput(parent_event->filp)
	 *         perf_release()
	 *           mutex_lock(&ctx->mutex)
	 *
	 * But since it's the parent context it won't be the same instance.
	 */
	mutex_lock(&child_ctx->mutex);

again:
	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
				 group_entry)
		__perf_event_exit_task(child_event, child_ctx, child);

	list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
				 group_entry)
		__perf_event_exit_task(child_event, child_ctx, child);

	/*
	 * If the last event was a group event, it will have appended all
	 * its siblings to the list, but we obtained 'tmp' before that which
	 * will still point to the list head terminating the iteration.
	 */
	if (!list_empty(&child_ctx->pinned_groups) ||
	    !list_empty(&child_ctx->flexible_groups))
		goto again;

	mutex_unlock(&child_ctx->mutex);

	put_ctx(child_ctx);
}

/*
 * When a child task exits, feed back event values to parent events.
 */
void perf_event_exit_task(struct task_struct *child)
{
	int ctxn;

	for_each_task_context_nr(ctxn)
		perf_event_exit_task_context(child, ctxn);
}

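/*
 * Undo inherit_event(): unlink an inherited event from its parent and
 * from the child context and free it. Only used on the fork() failure
 * path, via perf_event_free_task() below.
 */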
static void perf_free_event(struct perf_event *event,
			    struct perf_event_context *ctx)
{
	struct perf_event *parent = event->parent;

	if (WARN_ON_ONCE(!parent))
		return;

	mutex_lock(&parent->child_mutex);
	list_del_init(&event->child_list);
	mutex_unlock(&parent->child_mutex);

	fput(parent->filp);

	perf_group_detach(event);
	list_del_event(event, ctx);
	free_event(event);
}

/*
 * Free an unexposed, unused context, as created by inheritance in
 * perf_event_init_task() below. Used by fork() in case of failure.
 */
void perf_event_free_task(struct task_struct *task)
{
	struct perf_event_context *ctx;
	struct perf_event *event, *tmp;
	int ctxn;

	for_each_task_context_nr(ctxn) {
		ctx = task->perf_event_ctxp[ctxn];
		if (!ctx)
			continue;

		mutex_lock(&ctx->mutex);
again:
		list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
				group_entry)
			perf_free_event(event, ctx);

		list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
				group_entry)
			perf_free_event(event, ctx);

		if (!list_empty(&ctx->pinned_groups) ||
				!list_empty(&ctx->flexible_groups))
			goto again;

		mutex_unlock(&ctx->mutex);

		put_ctx(ctx);
	}
}

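/*
 * Late sanity check while the task struct is being torn down: every
 * per-task context should already have been released by
 * perf_event_exit_task() or perf_event_free_task().
 */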
void perf_event_delayed_put(struct task_struct *task)
{
	int ctxn;

	for_each_task_context_nr(ctxn)
		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}

/*
 * inherit an event from parent task to child task:
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event *group_leader,
	      struct perf_event_context *child_ctx)
{
	struct perf_event *child_event;
	unsigned long flags;

	/*
	 * Instead of creating recursive hierarchies of events,
	 * we link inherited events back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_event->parent)
		parent_event = parent_event->parent;

	child_event = perf_event_alloc(&parent_event->attr,
					   parent_event->cpu,
					   group_leader, parent_event,
					   NULL);
	if (IS_ERR(child_event))
		return child_event;
	get_ctx(child_ctx);

	/*
	 * Make the child state follow the state of the parent event,
	 * not its attr.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_event_{en, dis}able_family.
	 */
	if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
		child_event->state = PERF_EVENT_STATE_INACTIVE;
	else
		child_event->state = PERF_EVENT_STATE_OFF;

	if (parent_event->attr.freq) {
		u64 sample_period = parent_event->hw.sample_period;
		struct hw_perf_event *hwc = &child_event->hw;

		hwc->sample_period = sample_period;
		hwc->last_period   = sample_period;

		local64_set(&hwc->period_left, sample_period);
	}

	child_event->ctx = child_ctx;
	child_event->overflow_handler = parent_event->overflow_handler;

	/*
	 * Link it up in the child's context:
	 */
	raw_spin_lock_irqsave(&child_ctx->lock, flags);
	add_event_to_ctx(child_event, child_ctx);
	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Get a reference to the parent filp - we will fput it
	 * when the child event exits. This is safe to do because
	 * we are in the parent and we know that the filp still
	 * exists and has a nonzero count:
	 */
	atomic_long_inc(&parent_event->filp->f_count);

	/*
	 * Link this into the parent event's child list
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_add_tail(&child_event->child_list, &parent_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	return child_event;
}

static int inherit_group(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event_context *child_ctx)
{
	struct perf_event *leader;
	struct perf_event *sub;
	struct perf_event *child_ctr;

	leader = inherit_event(parent_event, parent, parent_ctx,
				 child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
		child_ctr = inherit_event(sub, parent, parent_ctx,
					    child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);
	}
	return 0;
}

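/*
 * Inherit one group from the parent's ctxn'th context into the child,
 * allocating the child's context on first use. Clears *inherited_all
 * when a group is not marked for inheritance or when inheriting fails.
 */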
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
		   struct perf_event_context *parent_ctx,
		   struct task_struct *child, int ctxn,
		   int *inherited_all)
{
	int ret;
	struct perf_event_context *child_ctx;

	if (!event->attr.inherit) {
		*inherited_all = 0;
		return 0;
	}

	child_ctx = child->perf_event_ctxp[ctxn];
	if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */

		child_ctx = alloc_perf_context(event->pmu, child);
		if (!child_ctx)
			return -ENOMEM;

		child->perf_event_ctxp[ctxn] = child_ctx;
	}

	ret = inherit_group(event, parent, parent_ctx,
			    child, child_ctx);

	if (ret)
		*inherited_all = 0;

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
int perf_event_init_context(struct task_struct *child, int ctxn)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	int ret = 0;

	child->perf_event_ctxp[ctxn] = NULL;

	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	if (likely(!parent->perf_event_ctxp[ctxn]))
		return 0;

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent, ctxn);

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, &inherited_all);
		if (ret)
			break;
	}

	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, &inherited_all);
		if (ret)
			break;
	}

	child_ctx = child->perf_event_ctxp[ctxn];

	if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 * Note that if the parent is a clone, it could get
		 * uncloned at any point, but that doesn't matter
		 * because the list of events and the generation
		 * count can't have changed since we took the mutex.
		 */
		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
int perf_event_init_task(struct task_struct *child)
{
	int ctxn, ret;

	for_each_task_context_nr(ctxn) {
		ret = perf_event_init_context(child, ctxn);
		if (ret)
			return ret;
	}

	return 0;
}

static void __init perf_event_init_all_cpus(void)
{
	struct swevent_htable *swhash;
	int cpu;

	for_each_possible_cpu(cpu) {
		swhash = &per_cpu(swevent_htable, cpu);
		mutex_init(&swhash->hlist_mutex);
		INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
	}
}

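/*
 * CPU bring-up: populate the software event hash list for the new CPU
 * if software events are currently in use.
 */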
static void __cpuinit perf_event_init_cpu(int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	if (swhash->hlist_refcount > 0) {
		struct swevent_hlist *hlist;

		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
		WARN_ON(!hlist);
		rcu_assign_pointer(swhash->swevent_hlist, hlist);
	}
	mutex_unlock(&swhash->hlist_mutex);
}

#ifdef CONFIG_HOTPLUG_CPU
static void perf_pmu_rotate_stop(struct pmu *pmu)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

	WARN_ON(!irqs_disabled());

	list_del_init(&cpuctx->rotation_list);
}

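/*
 * CPU hot-unplug: runs on the CPU being offlined to stop context
 * rotation and strip all events from one of its contexts.
 */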
static void __perf_event_exit_context(void *__info)
{
	struct perf_event_context *ctx = __info;
	struct perf_event *event, *tmp;

	perf_pmu_rotate_stop(ctx->pmu);

	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
		__perf_event_remove_from_context(event);
	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
		__perf_event_remove_from_context(event);
}

static void perf_event_exit_cpu_context(int cpu)
{
	struct perf_event_context *ctx;
	struct pmu *pmu;
	int idx;

	idx = srcu_read_lock(&pmus_srcu);
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;

		mutex_lock(&ctx->mutex);
		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
		mutex_unlock(&ctx->mutex);
	}
	srcu_read_unlock(&pmus_srcu, idx);
}

static void perf_event_exit_cpu(int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	swevent_hlist_release(swhash);
	mutex_unlock(&swhash->hlist_mutex);

	perf_event_exit_cpu_context(cpu);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
#endif

static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;


	switch (action & ~CPU_TASKS_FROZEN) {

	case CPU_UP_PREPARE:
	case CPU_DOWN_FAILED:
		perf_event_init_cpu(cpu);
		break;

	case CPU_UP_CANCELED:
	case CPU_DOWN_PREPARE:
		perf_event_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

void __init perf_event_init(void)
{
	perf_event_init_all_cpus();
	init_srcu_struct(&pmus_srcu);
	perf_pmu_register(&perf_swevent);
	perf_pmu_register(&perf_cpu_clock);
	perf_pmu_register(&perf_task_clock);
	perf_tp_register();
	perf_cpu_notifier(perf_cpu_notify);
}