/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright    2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/vmstat.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>

#include <asm/irq_regs.h>

/*
 * Each CPU has a list of per CPU events:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_events __read_mostly = 1;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

static atomic_t nr_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

static inline bool perf_paranoid_tracepoint_raw(void)
{
	return sysctl_perf_event_paranoid > -1;
}

static inline bool perf_paranoid_cpu(void)
{
	return sysctl_perf_event_paranoid > 0;
}

static inline bool perf_paranoid_kernel(void)
{
	return sysctl_perf_event_paranoid > 1;
}
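
/*
 * Usage sketch (added for illustration, not part of the original file):
 * these helpers gate unprivileged access.  For example, find_get_context()
 * later in this file rejects per-CPU events from non-root users with:
 *
 *	if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
 *		return ERR_PTR(-EACCES);
 */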

int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */

/*
 * max perf event sample rate
 */
int sysctl_perf_event_sample_rate __read_mostly = 100000;

static atomic64_t perf_event_id;

/*
 * Lock for (sysadmin-configurable) event reservations:
 */
static DEFINE_SPINLOCK(perf_resource_lock);

/*
 * Architecture provided APIs - weak aliases:
 */
extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
{
	return NULL;
}

void __weak hw_perf_disable(void)		{ barrier(); }
void __weak hw_perf_enable(void)		{ barrier(); }

void __weak hw_perf_event_setup(int cpu)	{ barrier(); }
void __weak hw_perf_event_setup_online(int cpu)	{ barrier(); }

int __weak
hw_perf_group_sched_in(struct perf_event *group_leader,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx, int cpu)
{
	return 0;
}

void __weak perf_event_print_debug(void)	{ }

static DEFINE_PER_CPU(int, perf_disable_count);

void __perf_disable(void)
{
	__get_cpu_var(perf_disable_count)++;
}

bool __perf_enable(void)
{
	return !--__get_cpu_var(perf_disable_count);
}

void perf_disable(void)
{
	__perf_disable();
	hw_perf_disable();
}

void perf_enable(void)
{
	if (__perf_enable())
		hw_perf_enable();
}
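
/*
 * Note (illustrative, not from the original source): the disable count
 * nests per CPU, so the PMU is only re-enabled when the outermost
 * perf_enable() drops perf_disable_count back to zero:
 *
 *	perf_disable();
 *	perf_disable();
 *	perf_enable();	// count 2 -> 1, hardware stays disabled
 *	perf_enable();	// count 1 -> 0, hw_perf_enable() runs
 */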

static void get_ctx(struct perf_event_context *ctx)
{
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

static void unclone_ctx(struct perf_event_context *ctx)
{
	if (ctx->parent_ctx) {
		put_ctx(ctx->parent_ctx);
		ctx->parent_ctx = NULL;
	}
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
	u64 id = event->id;

	if (event->parent)
		id = event->parent->id;

	return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
	struct perf_event_context *ctx;

	rcu_read_lock();
 retry:
	ctx = rcu_dereference(task->perf_event_ctxp);
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
		 * perf_event_task_sched_out, though the
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
		spin_lock_irqsave(&ctx->lock, *flags);
		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
			spin_unlock_irqrestore(&ctx->lock, *flags);
			goto retry;
		}

		if (!atomic_inc_not_zero(&ctx->refcount)) {
			spin_unlock_irqrestore(&ctx->lock, *flags);
			ctx = NULL;
		}
	}
	rcu_read_unlock();
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
{
	struct perf_event_context *ctx;
	unsigned long flags;

	ctx = perf_lock_task_context(task, &flags);
	if (ctx) {
		++ctx->pin_count;
		spin_unlock_irqrestore(&ctx->lock, flags);
	}
	return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->lock, flags);
	--ctx->pin_count;
	spin_unlock_irqrestore(&ctx->lock, flags);
	put_ctx(ctx);
}
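
/*
 * Pairing sketch (illustrative; the real callers live elsewhere in this
 * file): a caller that must keep a task's context from being swapped or
 * freed across a sleepable section would bracket it like
 *
 *	ctx = perf_pin_task_context(task);
 *	...
 *	perf_unpin_context(ctx);
 */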

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_event *group_leader = event->group_leader;

	/*
	 * Depending on whether it is a standalone or sibling event,
	 * add it straight to the context's event list, or to the group
	 * leader's sibling list:
	 */
	if (group_leader == event)
		list_add_tail(&event->group_entry, &ctx->group_list);
	else {
		list_add_tail(&event->group_entry, &group_leader->sibling_list);
		group_leader->nr_siblings++;
	}

	list_add_rcu(&event->event_entry, &ctx->event_list);
	ctx->nr_events++;
	if (event->attr.inherit_stat)
		ctx->nr_stat++;
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_event *sibling, *tmp;

	if (list_empty(&event->group_entry))
		return;
	ctx->nr_events--;
	if (event->attr.inherit_stat)
		ctx->nr_stat--;

	list_del_init(&event->group_entry);
	list_del_rcu(&event->event_entry);

	if (event->group_leader != event)
		event->group_leader->nr_siblings--;

	/*
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
	 * to the context list directly:
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {

		list_move_tail(&sibling->group_entry, &ctx->group_list);
		sibling->group_leader = sibling;
	}
}

static void
event_sched_out(struct perf_event *event,
		  struct perf_cpu_context *cpuctx,
		  struct perf_event_context *ctx)
{
	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	event->state = PERF_EVENT_STATE_INACTIVE;
	if (event->pending_disable) {
		event->pending_disable = 0;
		event->state = PERF_EVENT_STATE_OFF;
	}
	event->tstamp_stopped = ctx->time;
	event->pmu->disable(event);
	event->oncpu = -1;

	if (!is_software_event(event))
		cpuctx->active_oncpu--;
	ctx->nr_active--;
	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;
}

static void
group_sched_out(struct perf_event *group_event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	struct perf_event *event;

	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	event_sched_out(group_event, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry)
		event_sched_out(event, cpuctx, ctx);

	if (group_event->attr.exclusive)
		cpuctx->exclusive = 0;
}

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_event_remove_from_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock(&ctx->lock);
	/*
	 * Protect the list operation against NMI by disabling the
	 * events on a global level.
	 */
	perf_disable();

	event_sched_out(event, cpuctx, ctx);

	list_del_event(event, ctx);

	if (!ctx->task) {
		/*
		 * Allow more per task events with respect to the
		 * reservation:
		 */
		cpuctx->max_pertask =
			min(perf_max_events - ctx->nr_events,
			    perf_max_events - perf_reserved_percpu);
	}

	perf_enable();
	spin_unlock(&ctx->lock);
}


/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * Must be called with ctx->mutex held.
 *
 * CPU events are removed with a smp call. For task events we only
 * call when the task is on a CPU.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_event_remove_from_context(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu events are removed via an smp call and
		 * the removal is always successful.
		 */
		smp_call_function_single(event->cpu,
					 __perf_event_remove_from_context,
					 event, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_event_remove_from_context,
				 event);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 */
	if (ctx->nr_active && !list_empty(&event->group_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can remove the event safely if the call above did not
	 * succeed.
	 */
	if (!list_empty(&event->group_entry)) {
		list_del_event(event, ctx);
	}
	spin_unlock_irq(&ctx->lock);
}

static inline u64 perf_clock(void)
{
	return cpu_clock(smp_processor_id());
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

/*
 * Update the total_time_enabled and total_time_running fields for an event.
 */
static void update_event_times(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	u64 run_end;

	if (event->state < PERF_EVENT_STATE_INACTIVE ||
	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
		return;

	event->total_time_enabled = ctx->time - event->tstamp_enabled;

	if (event->state == PERF_EVENT_STATE_INACTIVE)
		run_end = event->tstamp_stopped;
	else
		run_end = ctx->time;

	event->total_time_running = run_end - event->tstamp_running;
}

/*
 * Update total_time_enabled and total_time_running for all events in a group.
 */
static void update_group_times(struct perf_event *leader)
{
	struct perf_event *event;

	update_event_times(leader);
	list_for_each_entry(event, &leader->sibling_list, group_entry)
		update_event_times(event);
}
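
/*
 * Worked example (illustrative, not from the original source): with
 * ctx->time = 100us, tstamp_enabled = 20us and tstamp_running = 30us,
 * an ACTIVE event gets total_time_enabled = 80us and
 * total_time_running = 70us; an INACTIVE event stops
 * total_time_running at tstamp_stopped instead of ctx->time.
 */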

/*
 * Cross CPU call to disable a performance event
 */
static void __perf_event_disable(void *info)
{
	struct perf_event *event = info;
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *ctx = event->ctx;

	/*
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock(&ctx->lock);

	/*
	 * If the event is on, turn it off.
	 * If it is in error state, leave it in error state.
	 */
	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
		update_context_time(ctx);
		update_group_times(event);
		if (event == event->group_leader)
			group_sched_out(event, cpuctx, ctx);
		else
			event_sched_out(event, cpuctx, ctx);
		event->state = PERF_EVENT_STATE_OFF;
	}

	spin_unlock(&ctx->lock);
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in sync_child_event.
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
static void perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Disable the event on the cpu that it's on
		 */
		smp_call_function_single(event->cpu, __perf_event_disable,
					 event, 1);
		return;
	}

 retry:
	task_oncpu_function_call(task, __perf_event_disable, event);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the event is still active, we need to retry the cross-call.
	 */
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
	if (event->state == PERF_EVENT_STATE_INACTIVE) {
		update_group_times(event);
		event->state = PERF_EVENT_STATE_OFF;
	}

	spin_unlock_irq(&ctx->lock);
}
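
/*
 * Note (illustrative): this retry loop is the pattern used for task
 * events throughout this file: fire a cross-call while the task may be
 * running, then re-check under ctx->lock and retry if the event was
 * still ACTIVE, because the context could have been scheduled back in
 * in the meantime.
 */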

static int
event_sched_in(struct perf_event *event,
		 struct perf_cpu_context *cpuctx,
		 struct perf_event_context *ctx,
		 int cpu)
{
	if (event->state <= PERF_EVENT_STATE_OFF)
		return 0;

	event->state = PERF_EVENT_STATE_ACTIVE;
	event->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
	/*
	 * The new state must be visible before we turn it on in the hardware:
	 */
	smp_wmb();

	if (event->pmu->enable(event)) {
		event->state = PERF_EVENT_STATE_INACTIVE;
		event->oncpu = -1;
		return -EAGAIN;
	}

	event->tstamp_running += ctx->time - event->tstamp_stopped;

	if (!is_software_event(event))
		cpuctx->active_oncpu++;
	ctx->nr_active++;

	if (event->attr.exclusive)
		cpuctx->exclusive = 1;

	return 0;
}

static int
group_sched_in(struct perf_event *group_event,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx,
	       int cpu)
{
	struct perf_event *event, *partial_group;
	int ret;

	if (group_event->state == PERF_EVENT_STATE_OFF)
		return 0;

	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
	if (ret)
		return ret < 0 ? ret : 0;

	if (event_sched_in(group_event, cpuctx, ctx, cpu))
		return -EAGAIN;

	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event_sched_in(event, cpuctx, ctx, cpu)) {
			partial_group = event;
			goto group_error;
		}
	}

	return 0;

group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event == partial_group)
			break;
		event_sched_out(event, cpuctx, ctx);
	}
	event_sched_out(group_event, cpuctx, ctx);

	return -EAGAIN;
}

/*
 * Return 1 for a group consisting entirely of software events,
 * 0 if the group contains any hardware events.
 */
static int is_software_only_group(struct perf_event *leader)
{
	struct perf_event *event;

	if (!is_software_event(leader))
		return 0;

	list_for_each_entry(event, &leader->sibling_list, group_entry)
		if (!is_software_event(event))
			return 0;

	return 1;
}

/*
 * Work out whether we can put this event group on the CPU now.
 */
static int group_can_go_on(struct perf_event *event,
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
	 * Groups consisting entirely of software events can always go on.
	 */
	if (is_software_only_group(event))
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
	 * events can go on.
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
	 * events on the CPU, it can't go on.
	 */
	if (event->attr.exclusive && cpuctx->active_oncpu)
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}

static void add_event_to_ctx(struct perf_event *event,
			       struct perf_event_context *ctx)
{
	list_add_event(event, ctx);
	event->tstamp_enabled = ctx->time;
	event->tstamp_running = ctx->time;
	event->tstamp_stopped = ctx->time;
}

/*
 * Cross CPU call to install and enable a performance event
 *
 * Must be called with ctx->mutex held
 */
static void __perf_install_in_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
	int cpu = smp_processor_id();
	int err;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 * Or possibly this is the right context but it isn't
	 * on this cpu because it had no events.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx) {
		if (cpuctx->task_ctx || ctx->task != current)
			return;
		cpuctx->task_ctx = ctx;
	}

	spin_lock(&ctx->lock);
	ctx->is_active = 1;
	update_context_time(ctx);

	/*
	 * Protect the list operation against NMI by disabling the
	 * events on a global level. NOP for non NMI based events.
	 */
	perf_disable();

	add_event_to_ctx(event, ctx);

	/*
	 * Don't put the event on if it is disabled or if
	 * it is in a group and the group isn't on.
	 */
	if (event->state != PERF_EVENT_STATE_INACTIVE ||
	    (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
		goto unlock;

	/*
	 * An exclusive event can't go on if there are already active
	 * hardware events, and no hardware event can go on if there
	 * is already an exclusive event on.
	 */
	if (!group_can_go_on(event, cpuctx, 1))
		err = -EEXIST;
	else
		err = event_sched_in(event, cpuctx, ctx, cpu);

	if (err) {
		/*
		 * This event couldn't go on.  If it is in a group
		 * then we have to pull the whole group off.
		 * If the event group is pinned then put it in error state.
		 */
		if (leader != event)
			group_sched_out(leader, cpuctx, ctx);
		if (leader->attr.pinned) {
			update_group_times(leader);
			leader->state = PERF_EVENT_STATE_ERROR;
		}
	}

	if (!err && !ctx->task && cpuctx->max_pertask)
		cpuctx->max_pertask--;

 unlock:
	perf_enable();

	spin_unlock(&ctx->lock);
}

/*
 * Attach a performance event to a context
 *
 * First we add the event to the list with the hardware enable bit
 * in event->hw_config cleared.
 *
 * If the event is attached to a task which is on a CPU we use a smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
 *
 * Must be called with ctx->mutex held.
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
			struct perf_event *event,
			int cpu)
{
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu events are installed via an smp call and
		 * the install is always successful.
		 */
		smp_call_function_single(cpu, __perf_install_in_context,
					 event, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_install_in_context,
				 event);

	spin_lock_irq(&ctx->lock);
	/*
	 * we need to retry the smp call.
	 */
	if (ctx->is_active && list_empty(&event->group_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can add the event safely if the call above did not
	 * succeed.
	 */
	if (list_empty(&event->group_entry))
		add_event_to_ctx(event, ctx);
	spin_unlock_irq(&ctx->lock);
}

/*
 * Put an event into inactive state and update time fields.
 * Enabling the leader of a group effectively enables all
 * the group members that aren't explicitly disabled, so we
 * have to update their ->tstamp_enabled also.
 * Note: this works for group members as well as group leaders
 * since the non-leader members' sibling_lists will be empty.
 */
static void __perf_event_mark_enabled(struct perf_event *event,
					struct perf_event_context *ctx)
{
	struct perf_event *sub;

	event->state = PERF_EVENT_STATE_INACTIVE;
	event->tstamp_enabled = ctx->time - event->total_time_enabled;
	list_for_each_entry(sub, &event->sibling_list, group_entry)
		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
			sub->tstamp_enabled =
				ctx->time - sub->total_time_enabled;
}

/*
 * Cross CPU call to enable a performance event
 */
static void __perf_event_enable(void *info)
{
	struct perf_event *event = info;
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
	int err;

	/*
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx) {
		if (cpuctx->task_ctx || ctx->task != current)
			return;
		cpuctx->task_ctx = ctx;
	}

	spin_lock(&ctx->lock);
	ctx->is_active = 1;
	update_context_time(ctx);

	if (event->state >= PERF_EVENT_STATE_INACTIVE)
		goto unlock;
	__perf_event_mark_enabled(event, ctx);

	/*
	 * If the event is in a group and isn't the group leader,
	 * then don't put it on unless the group is on.
	 */
	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
		goto unlock;

	if (!group_can_go_on(event, cpuctx, 1)) {
		err = -EEXIST;
	} else {
		perf_disable();
		if (event == leader)
			err = group_sched_in(event, cpuctx, ctx,
					     smp_processor_id());
		else
			err = event_sched_in(event, cpuctx, ctx,
					       smp_processor_id());
		perf_enable();
	}

	if (err) {
		/*
		 * If this event can't go on and it's part of a
		 * group, then the whole group has to come off.
		 */
		if (leader != event)
			group_sched_out(leader, cpuctx, ctx);
		if (leader->attr.pinned) {
			update_group_times(leader);
			leader->state = PERF_EVENT_STATE_ERROR;
		}
	}

 unlock:
	spin_unlock(&ctx->lock);
}

/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 */
static void perf_event_enable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Enable the event on the cpu that it's on
		 */
		smp_call_function_single(event->cpu, __perf_event_enable,
					 event, 1);
		return;
	}

	spin_lock_irq(&ctx->lock);
	if (event->state >= PERF_EVENT_STATE_INACTIVE)
		goto out;

	/*
	 * If the event is in error state, clear that first.
	 * That way, if we see the event in error state below, we
	 * know that it has gone back into error state, as distinct
	 * from the task having been scheduled away before the
	 * cross-call arrived.
	 */
	if (event->state == PERF_EVENT_STATE_ERROR)
		event->state = PERF_EVENT_STATE_OFF;

 retry:
	spin_unlock_irq(&ctx->lock);
	task_oncpu_function_call(task, __perf_event_enable, event);

	spin_lock_irq(&ctx->lock);

	/*
	 * If the context is active and the event is still off,
	 * we need to retry the cross-call.
	 */
	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
		goto retry;

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
	if (event->state == PERF_EVENT_STATE_OFF)
		__perf_event_mark_enabled(event, ctx);

 out:
	spin_unlock_irq(&ctx->lock);
}

static int perf_event_refresh(struct perf_event *event, int refresh)
{
	/*
	 * not supported on inherited events
	 */
	if (event->attr.inherit)
		return -EINVAL;

	atomic_add(refresh, &event->event_limit);
	perf_event_enable(event);

	return 0;
}

void __perf_event_sched_out(struct perf_event_context *ctx,
			      struct perf_cpu_context *cpuctx)
{
	struct perf_event *event;

	spin_lock(&ctx->lock);
	ctx->is_active = 0;
	if (likely(!ctx->nr_events))
		goto out;
	update_context_time(ctx);

	perf_disable();
	if (ctx->nr_active)
		list_for_each_entry(event, &ctx->group_list, group_entry)
			group_sched_out(event, cpuctx, ctx);

	perf_enable();
 out:
	spin_unlock(&ctx->lock);
}

/*
 * Test whether two contexts are equivalent, i.e. whether they
 * have both been cloned from the same version of the same context
 * and they both have the same number of enabled events.
 * If the number of enabled events is the same, then the set
 * of enabled events should be the same, because these are both
 * inherited contexts, therefore we can't access individual events
 * in them directly with an fd; we can only enable/disable all
 * events via prctl, or enable/disable all events in a family
 * via ioctl, which will have the same effect on both contexts.
 */
static int context_equiv(struct perf_event_context *ctx1,
			 struct perf_event_context *ctx2)
{
	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
		&& ctx1->parent_gen == ctx2->parent_gen
		&& !ctx1->pin_count && !ctx2->pin_count;
}

static void __perf_event_read(void *event);

static void __perf_event_sync_stat(struct perf_event *event,
				     struct perf_event *next_event)
{
	u64 value;

	if (!event->attr.inherit_stat)
		return;

	/*
	 * Update the event value, we cannot use perf_event_read()
	 * because we're in the middle of a context switch and have IRQs
	 * disabled, which upsets smp_call_function_single(), however
	 * we know the event must be on the current CPU, therefore we
	 * don't need to use it.
	 */
	switch (event->state) {
	case PERF_EVENT_STATE_ACTIVE:
		__perf_event_read(event);
		break;

	case PERF_EVENT_STATE_INACTIVE:
		update_event_times(event);
		break;

	default:
		break;
	}

	/*
	 * In order to keep per-task stats reliable we need to flip the event
	 * values when we flip the contexts.
	 */
	value = atomic64_read(&next_event->count);
	value = atomic64_xchg(&event->count, value);
	atomic64_set(&next_event->count, value);

	swap(event->total_time_enabled, next_event->total_time_enabled);
	swap(event->total_time_running, next_event->total_time_running);

	/*
	 * Since we swizzled the values, update the user visible data too.
	 */
	perf_event_update_userpage(event);
	perf_event_update_userpage(next_event);
}

#define list_next_entry(pos, member) \
	list_entry(pos->member.next, typeof(*pos), member)

static void perf_event_sync_stat(struct perf_event_context *ctx,
				   struct perf_event_context *next_ctx)
{
	struct perf_event *event, *next_event;

	if (!ctx->nr_stat)
		return;

	event = list_first_entry(&ctx->event_list,
				   struct perf_event, event_entry);

	next_event = list_first_entry(&next_ctx->event_list,
					struct perf_event, event_entry);

	while (&event->event_entry != &ctx->event_list &&
	       &next_event->event_entry != &next_ctx->event_list) {

		__perf_event_sync_stat(event, next_event);

		event = list_next_entry(event, event_entry);
		next_event = list_next_entry(next_event, event_entry);
	}
}

/*
 * Called from scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If an NMI hits, then it will
 * not restart the event.
 */
void perf_event_task_sched_out(struct task_struct *task,
				 struct task_struct *next, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_event_context *ctx = task->perf_event_ctxp;
	struct perf_event_context *next_ctx;
	struct perf_event_context *parent;
	struct pt_regs *regs;
	int do_switch = 1;

	regs = task_pt_regs(task);
	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);

	if (likely(!ctx || !cpuctx->task_ctx))
		return;

	update_context_time(ctx);

	rcu_read_lock();
	parent = rcu_dereference(ctx->parent_ctx);
	next_ctx = next->perf_event_ctxp;
	if (parent && next_ctx &&
	    rcu_dereference(next_ctx->parent_ctx) == parent) {
		/*
		 * Looks like the two contexts are clones, so we might be
		 * able to optimize the context switch.  We lock both
		 * contexts and check that they are clones under the
		 * lock (including re-checking that neither has been
		 * uncloned in the meantime).  It doesn't matter which
		 * order we take the locks because no other cpu could
		 * be trying to lock both of these tasks.
		 */
		spin_lock(&ctx->lock);
		spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
		if (context_equiv(ctx, next_ctx)) {
			/*
			 * XXX do we need a memory barrier of sorts
			 * wrt to rcu_dereference() of perf_event_ctxp
			 */
			task->perf_event_ctxp = next_ctx;
			next->perf_event_ctxp = ctx;
			ctx->task = next;
			next_ctx->task = task;
			do_switch = 0;

			perf_event_sync_stat(ctx, next_ctx);
		}
		spin_unlock(&next_ctx->lock);
		spin_unlock(&ctx->lock);
	}
	rcu_read_unlock();

	if (do_switch) {
		__perf_event_sched_out(ctx, cpuctx);
		cpuctx->task_ctx = NULL;
	}
}
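
/*
 * Sketch of the optimization above (illustrative): when the outgoing and
 * incoming tasks own clones of the same parent context, the two context
 * pointers are simply swapped instead of scheduling every event out and
 * back in, which keeps inherited counters cheap for fork()/exec() heavy
 * workloads.
 */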

/*
 * Called with IRQs disabled
 */
static void __perf_event_task_sched_out(struct perf_event_context *ctx)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);

	if (!cpuctx->task_ctx)
		return;

	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
		return;

	__perf_event_sched_out(ctx, cpuctx);
	cpuctx->task_ctx = NULL;
}

/*
 * Called with IRQs disabled
 */
static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
{
	__perf_event_sched_out(&cpuctx->ctx, cpuctx);
}

static void
__perf_event_sched_in(struct perf_event_context *ctx,
			struct perf_cpu_context *cpuctx, int cpu)
{
	struct perf_event *event;
	int can_add_hw = 1;

	spin_lock(&ctx->lock);
	ctx->is_active = 1;
	if (likely(!ctx->nr_events))
		goto out;

	ctx->timestamp = perf_clock();

	perf_disable();

	/*
	 * First go through the list and put on any pinned groups
	 * in order to give them the best chance of going on.
	 */
	list_for_each_entry(event, &ctx->group_list, group_entry) {
		if (event->state <= PERF_EVENT_STATE_OFF ||
		    !event->attr.pinned)
			continue;
		if (event->cpu != -1 && event->cpu != cpu)
			continue;

		if (group_can_go_on(event, cpuctx, 1))
			group_sched_in(event, cpuctx, ctx, cpu);

		/*
		 * If this pinned group hasn't been scheduled,
		 * put it in error state.
		 */
		if (event->state == PERF_EVENT_STATE_INACTIVE) {
			update_group_times(event);
			event->state = PERF_EVENT_STATE_ERROR;
		}
	}

	list_for_each_entry(event, &ctx->group_list, group_entry) {
		/*
		 * Ignore events in OFF or ERROR state, and
		 * ignore pinned events since we did them already.
		 */
		if (event->state <= PERF_EVENT_STATE_OFF ||
		    event->attr.pinned)
			continue;

		/*
		 * Listen to the 'cpu' scheduling filter constraint
		 * of events:
		 */
		if (event->cpu != -1 && event->cpu != cpu)
			continue;

		if (group_can_go_on(event, cpuctx, can_add_hw))
			if (group_sched_in(event, cpuctx, ctx, cpu))
				can_add_hw = 0;
	}
	perf_enable();
 out:
	spin_unlock(&ctx->lock);
}

/*
 * Called from scheduler to add the events of the current task
 * with interrupts disabled.
 *
 * We restore the event value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of event _before_
 * accessing the event control register. If an NMI hits, then it will
 * keep the event running.
 */
void perf_event_task_sched_in(struct task_struct *task, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_event_context *ctx = task->perf_event_ctxp;

	if (likely(!ctx))
		return;
	if (cpuctx->task_ctx == ctx)
		return;
	__perf_event_sched_in(ctx, cpuctx, cpu);
	cpuctx->task_ctx = ctx;
}

static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
{
	struct perf_event_context *ctx = &cpuctx->ctx;

	__perf_event_sched_in(ctx, cpuctx, cpu);
}

#define MAX_INTERRUPTS (~0ULL)

static void perf_log_throttle(struct perf_event *event, int enable);

static void perf_adjust_period(struct perf_event *event, u64 events)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 period, sample_period;
	s64 delta;

	events *= hwc->sample_period;
	period = div64_u64(events, event->attr.sample_freq);

	delta = (s64)(period - hwc->sample_period);
	delta = (delta + 7) / 8; /* low pass filter */

	sample_period = hwc->sample_period + delta;

	if (!sample_period)
		sample_period = 1;

	hwc->sample_period = sample_period;
}
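
/*
 * Worked example (illustrative, not from the original source): with
 * hwc->sample_period = 10000, attr.sample_freq = 1000 and events = 2000,
 * the target period is 2000 * 10000 / 1000 = 20000 and
 * delta = (20000 - 10000 + 7) / 8 = 1250, so the new sample_period
 * becomes 11250; the /8 low-pass filter moves the period only an eighth
 * of the way toward its target per adjustment.
 */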

static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
{
	struct perf_event *event;
	struct hw_perf_event *hwc;
	u64 interrupts, freq;

	spin_lock(&ctx->lock);
	list_for_each_entry(event, &ctx->group_list, group_entry) {
		if (event->state != PERF_EVENT_STATE_ACTIVE)
			continue;

		hwc = &event->hw;

		interrupts = hwc->interrupts;
		hwc->interrupts = 0;

		/*
		 * unthrottle events on the tick
		 */
		if (interrupts == MAX_INTERRUPTS) {
			perf_log_throttle(event, 1);
			event->pmu->unthrottle(event);
			interrupts = 2*sysctl_perf_event_sample_rate/HZ;
		}

		if (!event->attr.freq || !event->attr.sample_freq)
			continue;

		/*
		 * if the specified freq < HZ then we need to skip ticks
		 */
		if (event->attr.sample_freq < HZ) {
			freq = event->attr.sample_freq;

			hwc->freq_count += freq;
			hwc->freq_interrupts += interrupts;

			if (hwc->freq_count < HZ)
				continue;

			interrupts = hwc->freq_interrupts;
			hwc->freq_interrupts = 0;
			hwc->freq_count -= HZ;
		} else
			freq = HZ;

		perf_adjust_period(event, freq * interrupts);

		/*
		 * In order to avoid being stalled by an (accidental) huge
		 * sample period, force reset the sample period if we didn't
		 * get any events in this freq period.
		 */
		if (!interrupts) {
			perf_disable();
			event->pmu->disable(event);
			atomic64_set(&hwc->period_left, 0);
			event->pmu->enable(event);
			perf_enable();
		}
	}
	spin_unlock(&ctx->lock);
}

/*
 * Round-robin a context's events:
 */
static void rotate_ctx(struct perf_event_context *ctx)
{
	struct perf_event *event;

	if (!ctx->nr_events)
		return;

	spin_lock(&ctx->lock);
	/*
	 * Rotate the first entry last (works just fine for group events too):
	 */
	perf_disable();
	list_for_each_entry(event, &ctx->group_list, group_entry) {
		list_move_tail(&event->group_entry, &ctx->group_list);
		break;
	}
	perf_enable();

	spin_unlock(&ctx->lock);
}

void perf_event_task_tick(struct task_struct *curr, int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;

	if (!atomic_read(&nr_events))
		return;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
	ctx = curr->perf_event_ctxp;

	perf_ctx_adjust_freq(&cpuctx->ctx);
	if (ctx)
		perf_ctx_adjust_freq(ctx);

	perf_event_cpu_sched_out(cpuctx);
	if (ctx)
		__perf_event_task_sched_out(ctx);

	rotate_ctx(&cpuctx->ctx);
	if (ctx)
		rotate_ctx(ctx);

	perf_event_cpu_sched_in(cpuctx, cpu);
	if (ctx)
		perf_event_task_sched_in(curr, cpu);
}
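
/*
 * Tick flow sketch (illustrative): adjust sampling frequencies, schedule
 * the CPU and task contexts out, rotate both so a different group heads
 * the list, then schedule everything back in.  This is what gives each
 * group a share of the PMU when there are more groups than hardware
 * counters.
 */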

/*
 * Enable all of a task's events that have been marked enable-on-exec.
 * This expects task == current.
 */
static void perf_event_enable_on_exec(struct task_struct *task)
{
	struct perf_event_context *ctx;
	struct perf_event *event;
	unsigned long flags;
	int enabled = 0;

	local_irq_save(flags);
	ctx = task->perf_event_ctxp;
	if (!ctx || !ctx->nr_events)
		goto out;

	__perf_event_task_sched_out(ctx);

	spin_lock(&ctx->lock);

	list_for_each_entry(event, &ctx->group_list, group_entry) {
		if (!event->attr.enable_on_exec)
			continue;
		event->attr.enable_on_exec = 0;
		if (event->state >= PERF_EVENT_STATE_INACTIVE)
			continue;
		__perf_event_mark_enabled(event, ctx);
		enabled = 1;
	}

	/*
	 * Unclone this context if we enabled any event.
	 */
	if (enabled)
		unclone_ctx(ctx);

	spin_unlock(&ctx->lock);

	perf_event_task_sched_in(task, smp_processor_id());
 out:
	local_irq_restore(flags);
}

/*
 * Cross CPU call to read the hardware event
 */
static void __perf_event_read(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	unsigned long flags;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu.  If not it has been
	 * scheduled out before the smp call arrived.  In that case
	 * event->count would have been updated to a recent sample
	 * when the event was scheduled out.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	local_irq_save(flags);
	if (ctx->is_active)
		update_context_time(ctx);
	event->pmu->read(event);
	update_event_times(event);
	local_irq_restore(flags);
}

static u64 perf_event_read(struct perf_event *event)
{
	/*
	 * If event is enabled and currently active on a CPU, update the
	 * value in the event structure:
	 */
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
		smp_call_function_single(event->oncpu,
					 __perf_event_read, event, 1);
	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
		update_event_times(event);
	}

	return atomic64_read(&event->count);
}

/*
 * Initialize the perf_event context in a task_struct:
 */
static void
__perf_event_init_context(struct perf_event_context *ctx,
			    struct task_struct *task)
{
	memset(ctx, 0, sizeof(*ctx));
	spin_lock_init(&ctx->lock);
	mutex_init(&ctx->mutex);
	INIT_LIST_HEAD(&ctx->group_list);
	INIT_LIST_HEAD(&ctx->event_list);
	atomic_set(&ctx->refcount, 1);
	ctx->task = task;
}

static struct perf_event_context *find_get_context(pid_t pid, int cpu)
{
	struct perf_event_context *ctx;
	struct perf_cpu_context *cpuctx;
	struct task_struct *task;
	unsigned long flags;
	int err;

	/*
	 * If cpu is not a wildcard then this is a percpu event:
	 */
	if (cpu != -1) {
		/* Must be root to operate on a CPU event: */
		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
			return ERR_PTR(-EACCES);

		if (cpu < 0 || cpu > num_possible_cpus())
			return ERR_PTR(-EINVAL);

		/*
		 * We could be clever and allow attaching an event to an
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
		 */
		if (!cpu_isset(cpu, cpu_online_map))
			return ERR_PTR(-ENODEV);

		cpuctx = &per_cpu(perf_cpu_context, cpu);
		ctx = &cpuctx->ctx;
		get_ctx(ctx);

		return ctx;
	}

	rcu_read_lock();
	if (!pid)
		task = current;
	else
		task = find_task_by_vpid(pid);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	if (!task)
		return ERR_PTR(-ESRCH);

	/*
	 * Can't attach events to a dying task.
	 */
	err = -ESRCH;
	if (task->flags & PF_EXITING)
		goto errout;

	/* Reuse ptrace permission checks for now. */
	err = -EACCES;
	if (!ptrace_may_access(task, PTRACE_MODE_READ))
		goto errout;

 retry:
	ctx = perf_lock_task_context(task, &flags);
	if (ctx) {
		unclone_ctx(ctx);
		spin_unlock_irqrestore(&ctx->lock, flags);
	}

	if (!ctx) {
		ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
		err = -ENOMEM;
		if (!ctx)
			goto errout;
		__perf_event_init_context(ctx, task);
		get_ctx(ctx);
		if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
			/*
			 * We raced with some other task; use
			 * the context they set.
			 */
			kfree(ctx);
			goto retry;
		}
		get_task_struct(task);
	}

	put_task_struct(task);
	return ctx;

 errout:
	put_task_struct(task);
	return ERR_PTR(err);
}

static void perf_event_free_filter(struct perf_event *event);

static void free_event_rcu(struct rcu_head *head)
{
	struct perf_event *event;

	event = container_of(head, struct perf_event, rcu_head);
	if (event->ns)
		put_pid_ns(event->ns);
	perf_event_free_filter(event);
	kfree(event);
}

static void perf_pending_sync(struct perf_event *event);

static void free_event(struct perf_event *event)
{
	perf_pending_sync(event);

	if (!event->parent) {
		atomic_dec(&nr_events);
		if (event->attr.mmap)
			atomic_dec(&nr_mmap_events);
		if (event->attr.comm)
			atomic_dec(&nr_comm_events);
		if (event->attr.task)
			atomic_dec(&nr_task_events);
	}

	if (event->output) {
		fput(event->output->filp);
		event->output = NULL;
	}

	if (event->destroy)
		event->destroy(event);

	put_ctx(event->ctx);
	call_rcu(&event->rcu_head, free_event_rcu);
}

/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
{
	struct perf_event *event = file->private_data;
	struct perf_event_context *ctx = event->ctx;

	file->private_data = NULL;

	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	perf_event_remove_from_context(event);
	mutex_unlock(&ctx->mutex);

	mutex_lock(&event->owner->perf_event_mutex);
	list_del_init(&event->owner_entry);
	mutex_unlock(&event->owner->perf_event_mutex);
	put_task_struct(event->owner);

	free_event(event);

	return 0;
}

static int perf_event_read_size(struct perf_event *event)
{
	int entry = sizeof(u64); /* value */
	int size = 0;
	int nr = 1;

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_ID)
		entry += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_GROUP) {
		nr += event->group_leader->nr_siblings;
		size += sizeof(u64);
	}

	size += entry * nr;

	return size;
}
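
/*
 * Sizing example (illustrative): for a non-group read with read_format =
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_ID, entry is 16 bytes
 * (value + id), size starts at 8 (time_enabled) and nr is 1, so
 * perf_event_read_size() returns 8 + 16 * 1 = 24 bytes.
 */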

static u64 perf_event_read_value(struct perf_event *event)
{
	struct perf_event *child;
	u64 total = 0;

	total += perf_event_read(event);
	list_for_each_entry(child, &event->child_list, child_list)
		total += perf_event_read(child);

	return total;
}

static int perf_event_read_entry(struct perf_event *event,
				   u64 read_format, char __user *buf)
{
	int n = 0, count = 0;
	u64 values[2];

	values[n++] = perf_event_read_value(event);
	if (read_format & PERF_FORMAT_ID)
		values[n++] = primary_event_id(event);

	count = n * sizeof(u64);

	if (copy_to_user(buf, values, count))
		return -EFAULT;

	return count;
}

static int perf_event_read_group(struct perf_event *event,
				   u64 read_format, char __user *buf)
{
	struct perf_event *leader = event->group_leader, *sub;
	int n = 0, size = 0, err = -EFAULT;
	u64 values[3];

	values[n++] = 1 + leader->nr_siblings;
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
		values[n++] = leader->total_time_enabled +
			atomic64_read(&leader->child_total_time_enabled);
	}
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
		values[n++] = leader->total_time_running +
			atomic64_read(&leader->child_total_time_running);
	}

	size = n * sizeof(u64);

	if (copy_to_user(buf, values, size))
		return -EFAULT;

	err = perf_event_read_entry(leader, read_format, buf + size);
	if (err < 0)
		return err;

	size += err;

	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
		err = perf_event_read_entry(sub, read_format,
				buf + size);
		if (err < 0)
			return err;

		size += err;
	}

	return size;
}

static int perf_event_read_one(struct perf_event *event,
				 u64 read_format, char __user *buf)
{
	u64 values[4];
	int n = 0;

	values[n++] = perf_event_read_value(event);
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
		values[n++] = event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
	}
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
		values[n++] = event->total_time_running +
			atomic64_read(&event->child_total_time_running);
	}
	if (read_format & PERF_FORMAT_ID)
		values[n++] = primary_event_id(event);

	if (copy_to_user(buf, values, n * sizeof(u64)))
		return -EFAULT;

	return n * sizeof(u64);
}
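
/*
 * Layout sketch (illustrative) of what userspace sees, matching
 * perf_event_read_one()/perf_event_read_group() above:
 *
 *	one:   { value [, time_enabled] [, time_running] [, id] }
 *	group: { nr [, time_enabled] [, time_running],
 *		 { value [, id] } per group member }
 */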

/*
 * Read the performance event - simple non-blocking version for now
 */
static ssize_t
perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
{
	u64 read_format = event->attr.read_format;
	int ret;

	/*
	 * Return end-of-file for a read on an event that is in
	 * error state (i.e. because it was pinned but it couldn't be
	 * scheduled on to the CPU at some point).
	 */
	if (event->state == PERF_EVENT_STATE_ERROR)
		return 0;

	if (count < perf_event_read_size(event))
		return -ENOSPC;

	WARN_ON_ONCE(event->ctx->parent_ctx);
	mutex_lock(&event->child_mutex);
	if (read_format & PERF_FORMAT_GROUP)
		ret = perf_event_read_group(event, read_format, buf);
	else
		ret = perf_event_read_one(event, read_format, buf);
	mutex_unlock(&event->child_mutex);

	return ret;
}

static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	struct perf_event *event = file->private_data;

	return perf_read_hw(event, buf, count);
}

static unsigned int perf_poll(struct file *file, poll_table *wait)
{
1888
	struct perf_event *event = file->private_data;
P
Peter Zijlstra 已提交
1889
	struct perf_mmap_data *data;
1890
	unsigned int events = POLL_HUP;
P
Peter Zijlstra 已提交
1891 1892

	rcu_read_lock();
1893
	data = rcu_dereference(event->data);
P
Peter Zijlstra 已提交
1894
	if (data)
1895
		events = atomic_xchg(&data->poll, 0);
P
Peter Zijlstra 已提交
1896
	rcu_read_unlock();
T
Thomas Gleixner 已提交
1897

1898
	poll_wait(file, &event->waitq, wait);
T
Thomas Gleixner 已提交
1899 1900 1901 1902

	return events;
}

static void perf_event_reset(struct perf_event *event)
{
	(void)perf_event_read(event);
	atomic64_set(&event->count, 0);
	perf_event_update_userpage(event);
}

/*
 * Holding the top-level event's child_mutex means that any
 * descendant process that has inherited this event will block
 * in sync_child_event if it goes to exit, thus satisfying the
 * task existence requirements of perf_event_enable/disable.
 */
static void perf_event_for_each_child(struct perf_event *event,
					void (*func)(struct perf_event *))
{
	struct perf_event *child;

	WARN_ON_ONCE(event->ctx->parent_ctx);
	mutex_lock(&event->child_mutex);
	func(event);
	list_for_each_entry(child, &event->child_list, child_list)
		func(child);
	mutex_unlock(&event->child_mutex);
}

static void perf_event_for_each(struct perf_event *event,
				  void (*func)(struct perf_event *))
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *sibling;

	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	event = event->group_leader;

	perf_event_for_each_child(event, func);
	func(event);
	list_for_each_entry(sibling, &event->sibling_list, group_entry)
		perf_event_for_each_child(sibling, func);
	mutex_unlock(&ctx->mutex);
}

static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
	struct perf_event_context *ctx = event->ctx;
	unsigned long size;
	int ret = 0;
	u64 value;

	if (!event->attr.sample_period)
		return -EINVAL;

	size = copy_from_user(&value, arg, sizeof(value));
	if (size != sizeof(value))
		return -EFAULT;

	if (!value)
		return -EINVAL;

	spin_lock_irq(&ctx->lock);
	if (event->attr.freq) {
		if (value > sysctl_perf_event_sample_rate) {
			ret = -EINVAL;
			goto unlock;
		}

		event->attr.sample_freq = value;
	} else {
		event->attr.sample_period = value;
		event->hw.sample_period = value;
	}
unlock:
	spin_unlock_irq(&ctx->lock);

	return ret;
}
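
/*
 * Illustrative sketch (not part of this file): how user-space would drive
 * the PERF_EVENT_IOC_PERIOD path above. The fd is assumed to come from a
 * prior perf_event_open() call; error handling is mostly omitted.
 *
 *	u64 new_period = 200000;
 *
 *	if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period))
 *		perror("PERF_EVENT_IOC_PERIOD");
 *
 * For an event created with attr.freq = 1 the value is interpreted as a
 * sample frequency and checked against sysctl_perf_event_sample_rate;
 * otherwise it replaces attr.sample_period and hw.sample_period.
 */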

static int perf_event_set_output(struct perf_event *event, int output_fd);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);

static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct perf_event *event = file->private_data;
	void (*func)(struct perf_event *);
	u32 flags = arg;

	switch (cmd) {
	case PERF_EVENT_IOC_ENABLE:
		func = perf_event_enable;
		break;
	case PERF_EVENT_IOC_DISABLE:
		func = perf_event_disable;
		break;
	case PERF_EVENT_IOC_RESET:
		func = perf_event_reset;
		break;

	case PERF_EVENT_IOC_REFRESH:
		return perf_event_refresh(event, arg);

	case PERF_EVENT_IOC_PERIOD:
		return perf_event_period(event, (u64 __user *)arg);

	case PERF_EVENT_IOC_SET_OUTPUT:
		return perf_event_set_output(event, arg);

	case PERF_EVENT_IOC_SET_FILTER:
		return perf_event_set_filter(event, (void __user *)arg);

	default:
		return -ENOTTY;
	}

	if (flags & PERF_IOC_FLAG_GROUP)
		perf_event_for_each(event, func);
	else
		perf_event_for_each_child(event, func);

	return 0;
}

int perf_event_task_enable(void)
{
	struct perf_event *event;

	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry)
		perf_event_for_each_child(event, perf_event_enable);
	mutex_unlock(&current->perf_event_mutex);

	return 0;
}

int perf_event_task_disable(void)
{
	struct perf_event *event;

	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry)
		perf_event_for_each_child(event, perf_event_disable);
	mutex_unlock(&current->perf_event_mutex);

	return 0;
}

#ifndef PERF_EVENT_INDEX_OFFSET
# define PERF_EVENT_INDEX_OFFSET 0
#endif

static int perf_event_index(struct perf_event *event)
{
	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return 0;

	return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
}

/*
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
void perf_event_update_userpage(struct perf_event *event)
{
	struct perf_event_mmap_page *userpg;
	struct perf_mmap_data *data;

	rcu_read_lock();
	data = rcu_dereference(event->data);
	if (!data)
		goto unlock;

	userpg = data->user_page;

	/*
	 * Disable preemption so as to not let the corresponding user-space
	 * spin too long if we get preempted.
	 */
	preempt_disable();
	++userpg->lock;
	barrier();
	userpg->index = perf_event_index(event);
	userpg->offset = atomic64_read(&event->count);
	if (event->state == PERF_EVENT_STATE_ACTIVE)
		userpg->offset -= atomic64_read(&event->hw.prev_count);

	userpg->time_enabled = event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);

	userpg->time_running = event->total_time_running +
			atomic64_read(&event->child_total_time_running);

	barrier();
	++userpg->lock;
	preempt_enable();
unlock:
	rcu_read_unlock();
}
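
/*
 * Illustrative sketch (not part of this file): the matching user-space side
 * of the ->lock seqcount updated above. A reader of the mmap'ed control page
 * retries while the kernel is mid-update; the local variable names are made
 * up for the example.
 *
 *	struct perf_event_mmap_page *pc = mapped_first_page;
 *	u32 seq, idx;
 *	u64 offset;
 *
 *	do {
 *		seq = pc->lock;
 *		rmb();
 *		idx = pc->index;
 *		offset = pc->offset;
 *		rmb();
 *	} while (pc->lock != seq);
 *
 * A ->lock value that changed across the reads (or is odd) means the update
 * above was in progress and the snapshot must be retried.
 */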

static unsigned long perf_data_size(struct perf_mmap_data *data)
{
	return data->nr_pages << (PAGE_SHIFT + data->data_order);
}

#ifndef CONFIG_PERF_USE_VMALLOC

/*
 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
 */

static struct page *
perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
{
	if (pgoff > data->nr_pages)
		return NULL;

	if (pgoff == 0)
		return virt_to_page(data->user_page);

	return virt_to_page(data->data_pages[pgoff - 1]);
}

static struct perf_mmap_data *
perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
{
	struct perf_mmap_data *data;
	unsigned long size;
	int i;

	WARN_ON(atomic_read(&event->mmap_count));

	size = sizeof(struct perf_mmap_data);
	size += nr_pages * sizeof(void *);

	data = kzalloc(size, GFP_KERNEL);
	if (!data)
		goto fail;

	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
	if (!data->user_page)
		goto fail_user_page;

	for (i = 0; i < nr_pages; i++) {
		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
		if (!data->data_pages[i])
			goto fail_data_pages;
	}

	data->data_order = 0;
	data->nr_pages = nr_pages;

	return data;

fail_data_pages:
	for (i--; i >= 0; i--)
		free_page((unsigned long)data->data_pages[i]);

	free_page((unsigned long)data->user_page);

fail_user_page:
	kfree(data);

fail:
	return NULL;
}

static void perf_mmap_free_page(unsigned long addr)
{
	struct page *page = virt_to_page((void *)addr);

	page->mapping = NULL;
	__free_page(page);
}

static void perf_mmap_data_free(struct perf_mmap_data *data)
{
	int i;

	perf_mmap_free_page((unsigned long)data->user_page);
	for (i = 0; i < data->nr_pages; i++)
		perf_mmap_free_page((unsigned long)data->data_pages[i]);
}

#else

/*
 * Back perf_mmap() with vmalloc memory.
 *
 * Required for architectures that have d-cache aliasing issues.
 */

static struct page *
perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
{
	if (pgoff > (1UL << data->data_order))
		return NULL;

	return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
}

static void perf_mmap_unmark_page(void *addr)
{
	struct page *page = vmalloc_to_page(addr);

	page->mapping = NULL;
}

static void perf_mmap_data_free_work(struct work_struct *work)
{
	struct perf_mmap_data *data;
	void *base;
	int i, nr;

	data = container_of(work, struct perf_mmap_data, work);
	nr = 1 << data->data_order;

	base = data->user_page;
	for (i = 0; i < nr + 1; i++)
		perf_mmap_unmark_page(base + (i * PAGE_SIZE));

	vfree(base);
}

static void perf_mmap_data_free(struct perf_mmap_data *data)
{
	schedule_work(&data->work);
}

static struct perf_mmap_data *
perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
{
	struct perf_mmap_data *data;
	unsigned long size;
	void *all_buf;

	WARN_ON(atomic_read(&event->mmap_count));

	size = sizeof(struct perf_mmap_data);
	size += sizeof(void *);

	data = kzalloc(size, GFP_KERNEL);
	if (!data)
		goto fail;

	INIT_WORK(&data->work, perf_mmap_data_free_work);

	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
	if (!all_buf)
		goto fail_all_buf;

	data->user_page = all_buf;
	data->data_pages[0] = all_buf + PAGE_SIZE;
	data->data_order = ilog2(nr_pages);
	data->nr_pages = 1;

	return data;

fail_all_buf:
	kfree(data);

fail:
	return NULL;
}

#endif

static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct perf_event *event = vma->vm_file->private_data;
	struct perf_mmap_data *data;
	int ret = VM_FAULT_SIGBUS;

	if (vmf->flags & FAULT_FLAG_MKWRITE) {
		if (vmf->pgoff == 0)
			ret = 0;
		return ret;
	}

	rcu_read_lock();
	data = rcu_dereference(event->data);
	if (!data)
		goto unlock;

	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
		goto unlock;

	vmf->page = perf_mmap_to_page(data, vmf->pgoff);
	if (!vmf->page)
		goto unlock;

	get_page(vmf->page);
	vmf->page->mapping = vma->vm_file->f_mapping;
	vmf->page->index   = vmf->pgoff;

	ret = 0;
unlock:
	rcu_read_unlock();

	return ret;
}

static void
perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
{
	long max_size = perf_data_size(data);

	atomic_set(&data->lock, -1);

	if (event->attr.watermark) {
		data->watermark = min_t(long, max_size,
					event->attr.wakeup_watermark);
	}

	if (!data->watermark)
		data->watermark = max_t(long, PAGE_SIZE, max_size / 2);


	rcu_assign_pointer(event->data, data);
}

static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
{
	struct perf_mmap_data *data;

	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
	perf_mmap_data_free(data);
	kfree(data);
}

static void perf_mmap_data_release(struct perf_event *event)
{
	struct perf_mmap_data *data = event->data;

	WARN_ON(atomic_read(&event->mmap_count));

	rcu_assign_pointer(event->data, NULL);
	call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
}

static void perf_mmap_open(struct vm_area_struct *vma)
{
	struct perf_event *event = vma->vm_file->private_data;

	atomic_inc(&event->mmap_count);
}

static void perf_mmap_close(struct vm_area_struct *vma)
{
	struct perf_event *event = vma->vm_file->private_data;

	WARN_ON_ONCE(event->ctx->parent_ctx);
	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
		unsigned long size = perf_data_size(event->data);
		struct user_struct *user = current_user();

		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
		vma->vm_mm->locked_vm -= event->data->nr_locked;
		perf_mmap_data_release(event);
		mutex_unlock(&event->mmap_mutex);
	}
}

static const struct vm_operations_struct perf_mmap_vmops = {
	.open		= perf_mmap_open,
	.close		= perf_mmap_close,
	.fault		= perf_mmap_fault,
	.page_mkwrite	= perf_mmap_fault,
};

static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct perf_event *event = file->private_data;
	unsigned long user_locked, user_lock_limit;
	struct user_struct *user = current_user();
	unsigned long locked, lock_limit;
	struct perf_mmap_data *data;
	unsigned long vma_size;
	unsigned long nr_pages;
	long user_extra, extra;
	int ret = 0;

	if (!(vma->vm_flags & VM_SHARED))
		return -EINVAL;

	vma_size = vma->vm_end - vma->vm_start;
	nr_pages = (vma_size / PAGE_SIZE) - 1;

	/*
	 * If we have data pages ensure they're a power-of-two number, so we
	 * can do bitmasks instead of modulo.
	 */
	if (nr_pages != 0 && !is_power_of_2(nr_pages))
		return -EINVAL;

	if (vma_size != PAGE_SIZE * (1 + nr_pages))
		return -EINVAL;

	if (vma->vm_pgoff != 0)
		return -EINVAL;

	WARN_ON_ONCE(event->ctx->parent_ctx);
	mutex_lock(&event->mmap_mutex);
	if (event->output) {
		ret = -EINVAL;
		goto unlock;
	}

	if (atomic_inc_not_zero(&event->mmap_count)) {
		if (nr_pages != event->data->nr_pages)
			ret = -EINVAL;
		goto unlock;
	}

	user_extra = nr_pages + 1;
	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);

	/*
	 * Increase the limit linearly with more CPUs:
	 */
	user_lock_limit *= num_online_cpus();

	user_locked = atomic_long_read(&user->locked_vm) + user_extra;

	extra = 0;
	if (user_locked > user_lock_limit)
		extra = user_locked - user_lock_limit;

	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
	lock_limit >>= PAGE_SHIFT;
	locked = vma->vm_mm->locked_vm + extra;

	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
		!capable(CAP_IPC_LOCK)) {
		ret = -EPERM;
		goto unlock;
	}

	WARN_ON(event->data);

	data = perf_mmap_data_alloc(event, nr_pages);
	ret = -ENOMEM;
	if (!data)
		goto unlock;

	ret = 0;
	perf_mmap_data_init(event, data);

	atomic_set(&event->mmap_count, 1);
	atomic_long_add(user_extra, &user->locked_vm);
	vma->vm_mm->locked_vm += extra;
	event->data->nr_locked = extra;
	if (vma->vm_flags & VM_WRITE)
		event->data->writable = 1;

unlock:
	mutex_unlock(&event->mmap_mutex);

	vma->vm_flags |= VM_RESERVED;
	vma->vm_ops = &perf_mmap_vmops;

	return ret;
}
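
/*
 * Illustrative sketch (not part of this file): the mapping size expected by
 * perf_mmap() above is one metadata page plus a power-of-two number of data
 * pages. A user-space caller might do (error handling omitted, fd assumed to
 * come from perf_event_open()):
 *
 *	size_t pages = 1 + (1 << 4);
 *	size_t len   = pages * sysconf(_SC_PAGESIZE);
 *	void *base   = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			    MAP_SHARED, fd, 0);
 *
 * MAP_SHARED is required (the VM_SHARED check above), the file offset must
 * be 0, and a non-power-of-two data-page count is rejected with -EINVAL.
 */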

static int perf_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct perf_event *event = filp->private_data;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &event->fasync);
	mutex_unlock(&inode->i_mutex);

	if (retval < 0)
		return retval;

	return 0;
}

static const struct file_operations perf_fops = {
	.release		= perf_release,
	.read			= perf_read,
	.poll			= perf_poll,
	.unlocked_ioctl		= perf_ioctl,
	.compat_ioctl		= perf_ioctl,
	.mmap			= perf_mmap,
	.fasync			= perf_fasync,
};

/*
 * Perf event wakeup
 *
 * If there's data, ensure we set the poll() state and publish everything
 * to user-space before waking everybody up.
 */

void perf_event_wakeup(struct perf_event *event)
{
	wake_up_all(&event->waitq);

	if (event->pending_kill) {
		kill_fasync(&event->fasync, SIGIO, event->pending_kill);
		event->pending_kill = 0;
	}
}

/*
 * Pending wakeups
 *
 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
 *
 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
 * single linked list and use cmpxchg() to add entries lockless.
 */

2519
static void perf_pending_event(struct perf_pending_entry *entry)
2520
{
2521 2522
	struct perf_event *event = container_of(entry,
			struct perf_event, pending);
2523

2524 2525 2526
	if (event->pending_disable) {
		event->pending_disable = 0;
		__perf_event_disable(event);
2527 2528
	}

2529 2530 2531
	if (event->pending_wakeup) {
		event->pending_wakeup = 0;
		perf_event_wakeup(event);
2532 2533 2534
	}
}

2535
#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2536

2537
static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2538 2539 2540
	PENDING_TAIL,
};

2541 2542
static void perf_pending_queue(struct perf_pending_entry *entry,
			       void (*func)(struct perf_pending_entry *))
2543
{
2544
	struct perf_pending_entry **head;
2545

2546
	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2547 2548
		return;

2549 2550 2551
	entry->func = func;

	head = &get_cpu_var(perf_pending_head);
2552 2553

	do {
2554 2555
		entry->next = *head;
	} while (cmpxchg(head, entry->next, entry) != entry->next);
2556

2557
	set_perf_event_pending();
2558

2559
	put_cpu_var(perf_pending_head);
2560 2561 2562 2563
}

static int __perf_pending_run(void)
{
2564
	struct perf_pending_entry *list;
2565 2566
	int nr = 0;

2567
	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2568
	while (list != PENDING_TAIL) {
2569 2570
		void (*func)(struct perf_pending_entry *);
		struct perf_pending_entry *entry = list;
2571 2572 2573

		list = list->next;

2574 2575
		func = entry->func;
		entry->next = NULL;
2576 2577 2578 2579 2580 2581 2582
		/*
		 * Ensure we observe the unqueue before we issue the wakeup,
		 * so that we won't be waiting forever.
		 * -- see perf_not_pending().
		 */
		smp_wmb();

2583
		func(entry);
2584 2585 2586 2587 2588 2589
		nr++;
	}

	return nr;
}

2590
static inline int perf_not_pending(struct perf_event *event)
2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604
{
	/*
	 * If we flush on whatever cpu we run, there is a chance we don't
	 * need to wait.
	 */
	get_cpu();
	__perf_pending_run();
	put_cpu();

	/*
	 * Ensure we see the proper queue state before going to sleep
	 * so that we do not miss the wakeup. -- see perf_pending_handle()
	 */
	smp_rmb();
2605
	return event->pending.next == NULL;
2606 2607
}

2608
static void perf_pending_sync(struct perf_event *event)
2609
{
2610
	wait_event(event->waitq, perf_not_pending(event));
2611 2612
}

2613
void perf_event_do_pending(void)
2614 2615 2616 2617
{
	__perf_pending_run();
}

2618 2619 2620 2621
/*
 * Callchain support -- arch specific
 */

2622
__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2623 2624 2625 2626
{
	return NULL;
}

2627 2628 2629
/*
 * Output
 */
2630 2631
static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
			      unsigned long offset, unsigned long head)
2632 2633 2634 2635 2636 2637
{
	unsigned long mask;

	if (!data->writable)
		return true;

2638
	mask = perf_data_size(data) - 1;
2639 2640 2641 2642 2643 2644 2645 2646 2647 2648

	offset = (offset - tail) & mask;
	head   = (head   - tail) & mask;

	if ((int)(head - offset) < 0)
		return false;

	return true;
}

2649
static void perf_output_wakeup(struct perf_output_handle *handle)
2650
{
2651 2652
	atomic_set(&handle->data->poll, POLL_IN);

2653
	if (handle->nmi) {
2654 2655 2656
		handle->event->pending_wakeup = 1;
		perf_pending_queue(&handle->event->pending,
				   perf_pending_event);
2657
	} else
2658
		perf_event_wakeup(handle->event);
2659 2660
}

2661 2662 2663
/*
 * Curious locking construct.
 *
2664 2665
 * We need to ensure a later event_id doesn't publish a head when a former
 * event_id isn't done writing. However since we need to deal with NMIs we
2666 2667 2668 2669 2670 2671
 * cannot fully serialize things.
 *
 * What we do is serialize between CPUs so we only have to deal with NMI
 * nesting on a single CPU.
 *
 * We only publish the head (and generate a wakeup) when the outer-most
2672
 * event_id completes.
2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686
 */
static void perf_output_lock(struct perf_output_handle *handle)
{
	struct perf_mmap_data *data = handle->data;
	int cpu;

	handle->locked = 0;

	local_irq_save(handle->flags);
	cpu = smp_processor_id();

	if (in_nmi() && atomic_read(&data->lock) == cpu)
		return;

2687
	while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2688 2689 2690 2691 2692 2693 2694 2695
		cpu_relax();

	handle->locked = 1;
}

static void perf_output_unlock(struct perf_output_handle *handle)
{
	struct perf_mmap_data *data = handle->data;
2696 2697
	unsigned long head;
	int cpu;
2698

2699
	data->done_head = data->head;
2700 2701 2702 2703 2704 2705 2706 2707 2708 2709

	if (!handle->locked)
		goto out;

again:
	/*
	 * The xchg implies a full barrier that ensures all writes are done
	 * before we publish the new head, matched by a rmb() in userspace when
	 * reading this position.
	 */
2710
	while ((head = atomic_long_xchg(&data->done_head, 0)))
2711 2712 2713
		data->user_page->data_head = head;

	/*
2714
	 * NMI can happen here, which means we can miss a done_head update.
2715 2716
	 */

2717
	cpu = atomic_xchg(&data->lock, -1);
2718 2719 2720 2721 2722
	WARN_ON_ONCE(cpu != smp_processor_id());

	/*
	 * Therefore we have to validate we did not indeed do so.
	 */
2723
	if (unlikely(atomic_long_read(&data->done_head))) {
2724 2725 2726
		/*
		 * Since we had it locked, we can lock it again.
		 */
2727
		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2728 2729 2730 2731 2732
			cpu_relax();

		goto again;
	}

2733
	if (atomic_xchg(&data->wakeup, 0))
2734 2735 2736 2737 2738
		perf_output_wakeup(handle);
out:
	local_irq_restore(handle->flags);
}

2739 2740
void perf_output_copy(struct perf_output_handle *handle,
		      const void *buf, unsigned int len)
2741 2742
{
	unsigned int pages_mask;
2743
	unsigned long offset;
2744 2745 2746 2747 2748 2749 2750 2751
	unsigned int size;
	void **pages;

	offset		= handle->offset;
	pages_mask	= handle->data->nr_pages - 1;
	pages		= handle->data->data_pages;

	do {
2752 2753
		unsigned long page_offset;
		unsigned long page_size;
2754 2755 2756
		int nr;

		nr	    = (offset >> PAGE_SHIFT) & pages_mask;
2757 2758 2759
		page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT);
		page_offset = offset & (page_size - 1);
		size	    = min_t(unsigned int, page_size - page_offset, len);
2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776

		memcpy(pages[nr] + page_offset, buf, size);

		len	    -= size;
		buf	    += size;
		offset	    += size;
	} while (len);

	handle->offset = offset;

	/*
	 * Check we didn't copy past our reservation window, taking the
	 * possible unsigned int wrap into account.
	 */
	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
}

int perf_output_begin(struct perf_output_handle *handle,
		      struct perf_event *event, unsigned int size,
		      int nmi, int sample)
{
	struct perf_event *output_event;
	struct perf_mmap_data *data;
	unsigned long tail, offset, head;
	int have_lost;
	struct {
		struct perf_event_header header;
		u64			 id;
		u64			 lost;
	} lost_event;

	rcu_read_lock();
	/*
	 * For inherited events we send all the output towards the parent.
	 */
	if (event->parent)
		event = event->parent;

	output_event = rcu_dereference(event->output);
	if (output_event)
		event = output_event;

	data = rcu_dereference(event->data);
	if (!data)
		goto out;

	handle->data	= data;
	handle->event	= event;
	handle->nmi	= nmi;
	handle->sample	= sample;

	if (!data->nr_pages)
		goto fail;

	have_lost = atomic_read(&data->lost);
	if (have_lost)
		size += sizeof(lost_event);

	perf_output_lock(handle);

	do {
		/*
		 * Userspace could choose to issue a mb() before updating the
		 * tail pointer. So that all reads will be completed before the
		 * write is issued.
		 */
		tail = ACCESS_ONCE(data->user_page->data_tail);
		smp_rmb();
		offset = head = atomic_long_read(&data->head);
		head += size;
		if (unlikely(!perf_output_space(data, tail, offset, head)))
			goto fail;
	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);

	handle->offset	= offset;
	handle->head	= head;

	if (head - tail > data->watermark)
		atomic_set(&data->wakeup, 1);

	if (have_lost) {
		lost_event.header.type = PERF_RECORD_LOST;
		lost_event.header.misc = 0;
		lost_event.header.size = sizeof(lost_event);
		lost_event.id          = event->id;
		lost_event.lost        = atomic_xchg(&data->lost, 0);

		perf_output_put(handle, lost_event);
	}

	return 0;

fail:
	atomic_inc(&data->lost);
	perf_output_unlock(handle);
out:
	rcu_read_unlock();

	return -ENOSPC;
}

void perf_output_end(struct perf_output_handle *handle)
{
	struct perf_event *event = handle->event;
	struct perf_mmap_data *data = handle->data;

	int wakeup_events = event->attr.wakeup_events;

	if (handle->sample && wakeup_events) {
		int events = atomic_inc_return(&data->events);
		if (events >= wakeup_events) {
			atomic_sub(wakeup_events, &data->events);
			atomic_set(&data->wakeup, 1);
		}
	}

	perf_output_unlock(handle);
	rcu_read_unlock();
}

static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	return task_tgid_nr_ns(p, event->ns);
}

static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	return task_pid_nr_ns(p, event->ns);
}

2902
static void perf_output_read_one(struct perf_output_handle *handle,
2903
				 struct perf_event *event)
2904
{
2905
	u64 read_format = event->attr.read_format;
2906 2907 2908
	u64 values[4];
	int n = 0;

2909
	values[n++] = atomic64_read(&event->count);
2910
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2911 2912
		values[n++] = event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
2913 2914
	}
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2915 2916
		values[n++] = event->total_time_running +
			atomic64_read(&event->child_total_time_running);
2917 2918
	}
	if (read_format & PERF_FORMAT_ID)
2919
		values[n++] = primary_event_id(event);
2920 2921 2922 2923 2924

	perf_output_copy(handle, values, n * sizeof(u64));
}

/*
2925
 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2926 2927
 */
static void perf_output_read_group(struct perf_output_handle *handle,
2928
			    struct perf_event *event)
2929
{
2930 2931
	struct perf_event *leader = event->group_leader, *sub;
	u64 read_format = event->attr.read_format;
2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942
	u64 values[5];
	int n = 0;

	values[n++] = 1 + leader->nr_siblings;

	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		values[n++] = leader->total_time_enabled;

	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		values[n++] = leader->total_time_running;

2943
	if (leader != event)
2944 2945 2946 2947
		leader->pmu->read(leader);

	values[n++] = atomic64_read(&leader->count);
	if (read_format & PERF_FORMAT_ID)
2948
		values[n++] = primary_event_id(leader);
2949 2950 2951

	perf_output_copy(handle, values, n * sizeof(u64));

2952
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2953 2954
		n = 0;

2955
		if (sub != event)
2956 2957 2958 2959
			sub->pmu->read(sub);

		values[n++] = atomic64_read(&sub->count);
		if (read_format & PERF_FORMAT_ID)
2960
			values[n++] = primary_event_id(sub);
2961 2962 2963 2964 2965 2966

		perf_output_copy(handle, values, n * sizeof(u64));
	}
}

static void perf_output_read(struct perf_output_handle *handle,
2967
			     struct perf_event *event)
2968
{
2969 2970
	if (event->attr.read_format & PERF_FORMAT_GROUP)
		perf_output_read_group(handle, event);
2971
	else
2972
		perf_output_read_one(handle, event);
2973 2974
}

2975 2976 2977
void perf_output_sample(struct perf_output_handle *handle,
			struct perf_event_header *header,
			struct perf_sample_data *data,
2978
			struct perf_event *event)
2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008
{
	u64 sample_type = data->type;

	perf_output_put(handle, *header);

	if (sample_type & PERF_SAMPLE_IP)
		perf_output_put(handle, data->ip);

	if (sample_type & PERF_SAMPLE_TID)
		perf_output_put(handle, data->tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		perf_output_put(handle, data->time);

	if (sample_type & PERF_SAMPLE_ADDR)
		perf_output_put(handle, data->addr);

	if (sample_type & PERF_SAMPLE_ID)
		perf_output_put(handle, data->id);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		perf_output_put(handle, data->stream_id);

	if (sample_type & PERF_SAMPLE_CPU)
		perf_output_put(handle, data->cpu_entry);

	if (sample_type & PERF_SAMPLE_PERIOD)
		perf_output_put(handle, data->period);

	if (sample_type & PERF_SAMPLE_READ)
3009
		perf_output_read(handle, event);
3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046

	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
		if (data->callchain) {
			int size = 1;

			if (data->callchain)
				size += data->callchain->nr;

			size *= sizeof(u64);

			perf_output_copy(handle, data->callchain, size);
		} else {
			u64 nr = 0;
			perf_output_put(handle, nr);
		}
	}

	if (sample_type & PERF_SAMPLE_RAW) {
		if (data->raw) {
			perf_output_put(handle, data->raw->size);
			perf_output_copy(handle, data->raw->data,
					 data->raw->size);
		} else {
			struct {
				u32	size;
				u32	data;
			} raw = {
				.size = sizeof(u32),
				.data = 0,
			};
			perf_output_put(handle, raw);
		}
	}
}

void perf_prepare_sample(struct perf_event_header *header,
			 struct perf_sample_data *data,
3047
			 struct perf_event *event,
3048
			 struct pt_regs *regs)
3049
{
3050
	u64 sample_type = event->attr.sample_type;
3051

3052
	data->type = sample_type;
3053

3054
	header->type = PERF_RECORD_SAMPLE;
3055 3056 3057 3058
	header->size = sizeof(*header);

	header->misc = 0;
	header->misc |= perf_misc_flags(regs);
3059

3060
	if (sample_type & PERF_SAMPLE_IP) {
3061 3062 3063
		data->ip = perf_instruction_pointer(regs);

		header->size += sizeof(data->ip);
3064
	}
3065

3066
	if (sample_type & PERF_SAMPLE_TID) {
3067
		/* namespace issues */
3068 3069
		data->tid_entry.pid = perf_event_pid(event, current);
		data->tid_entry.tid = perf_event_tid(event, current);
3070

3071
		header->size += sizeof(data->tid_entry);
3072 3073
	}

3074
	if (sample_type & PERF_SAMPLE_TIME) {
P
Peter Zijlstra 已提交
3075
		data->time = perf_clock();
3076

3077
		header->size += sizeof(data->time);
3078 3079
	}

3080
	if (sample_type & PERF_SAMPLE_ADDR)
3081
		header->size += sizeof(data->addr);
3082

3083
	if (sample_type & PERF_SAMPLE_ID) {
3084
		data->id = primary_event_id(event);
3085

3086 3087 3088 3089
		header->size += sizeof(data->id);
	}

	if (sample_type & PERF_SAMPLE_STREAM_ID) {
3090
		data->stream_id = event->id;
3091 3092 3093

		header->size += sizeof(data->stream_id);
	}
3094

3095
	if (sample_type & PERF_SAMPLE_CPU) {
3096 3097
		data->cpu_entry.cpu		= raw_smp_processor_id();
		data->cpu_entry.reserved	= 0;
3098

3099
		header->size += sizeof(data->cpu_entry);
3100 3101
	}

3102
	if (sample_type & PERF_SAMPLE_PERIOD)
3103
		header->size += sizeof(data->period);
3104

3105
	if (sample_type & PERF_SAMPLE_READ)
3106
		header->size += perf_event_read_size(event);
3107

3108
	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3109
		int size = 1;
3110

3111 3112 3113 3114 3115 3116
		data->callchain = perf_callchain(regs);

		if (data->callchain)
			size += data->callchain->nr;

		header->size += size * sizeof(u64);
3117 3118
	}

3119
	if (sample_type & PERF_SAMPLE_RAW) {
3120 3121 3122 3123 3124 3125 3126 3127
		int size = sizeof(u32);

		if (data->raw)
			size += data->raw->size;
		else
			size += sizeof(u32);

		WARN_ON_ONCE(size & (sizeof(u64)-1));
3128
		header->size += size;
3129
	}
3130
}
3131

3132
static void perf_event_output(struct perf_event *event, int nmi,
3133 3134 3135 3136 3137
				struct perf_sample_data *data,
				struct pt_regs *regs)
{
	struct perf_output_handle handle;
	struct perf_event_header header;
3138

3139
	perf_prepare_sample(&header, data, event, regs);
P
Peter Zijlstra 已提交
3140

3141
	if (perf_output_begin(&handle, event, header.size, nmi, 1))
3142
		return;
3143

3144
	perf_output_sample(&handle, &header, data, event);
3145

3146
	perf_output_end(&handle);
3147 3148
}

3149
/*
3150
 * read event_id
3151 3152 3153 3154 3155 3156 3157 3158 3159 3160
 */

struct perf_read_event {
	struct perf_event_header	header;

	u32				pid;
	u32				tid;
};

static void
3161
perf_event_read_event(struct perf_event *event,
3162 3163 3164
			struct task_struct *task)
{
	struct perf_output_handle handle;
3165
	struct perf_read_event read_event = {
3166
		.header = {
3167
			.type = PERF_RECORD_READ,
3168
			.misc = 0,
3169
			.size = sizeof(read_event) + perf_event_read_size(event),
3170
		},
3171 3172
		.pid = perf_event_pid(event, task),
		.tid = perf_event_tid(event, task),
3173
	};
3174
	int ret;
3175

3176
	ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3177 3178 3179
	if (ret)
		return;

3180
	perf_output_put(&handle, read_event);
3181
	perf_output_read(&handle, event);
3182

3183 3184 3185
	perf_output_end(&handle);
}

P
Peter Zijlstra 已提交
3186
/*
P
Peter Zijlstra 已提交
3187 3188 3189
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.task
P
Peter Zijlstra 已提交
3190 3191
 */

P
Peter Zijlstra 已提交
3192
struct perf_task_event {
3193
	struct task_struct		*task;
3194
	struct perf_event_context	*task_ctx;
P
Peter Zijlstra 已提交
3195 3196 3197 3198 3199 3200

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				ppid;
P
Peter Zijlstra 已提交
3201 3202
		u32				tid;
		u32				ptid;
3203
		u64				time;
3204
	} event_id;
P
Peter Zijlstra 已提交
3205 3206
};

3207
static void perf_event_task_output(struct perf_event *event,
P
Peter Zijlstra 已提交
3208
				     struct perf_task_event *task_event)
P
Peter Zijlstra 已提交
3209 3210
{
	struct perf_output_handle handle;
3211
	int size;
P
Peter Zijlstra 已提交
3212
	struct task_struct *task = task_event->task;
3213 3214
	int ret;

3215 3216
	size  = task_event->event_id.header.size;
	ret = perf_output_begin(&handle, event, size, 0, 0);
P
Peter Zijlstra 已提交
3217 3218 3219 3220

	if (ret)
		return;

3221 3222
	task_event->event_id.pid = perf_event_pid(event, task);
	task_event->event_id.ppid = perf_event_pid(event, current);
P
Peter Zijlstra 已提交
3223

3224 3225
	task_event->event_id.tid = perf_event_tid(event, task);
	task_event->event_id.ptid = perf_event_tid(event, current);
P
Peter Zijlstra 已提交
3226

3227
	task_event->event_id.time = perf_clock();
3228

3229
	perf_output_put(&handle, task_event->event_id);
3230

P
Peter Zijlstra 已提交
3231 3232 3233
	perf_output_end(&handle);
}

3234
static int perf_event_task_match(struct perf_event *event)
P
Peter Zijlstra 已提交
3235
{
3236
	if (event->attr.comm || event->attr.mmap || event->attr.task)
P
Peter Zijlstra 已提交
3237 3238 3239 3240 3241
		return 1;

	return 0;
}

3242
static void perf_event_task_ctx(struct perf_event_context *ctx,
P
Peter Zijlstra 已提交
3243
				  struct perf_task_event *task_event)
P
Peter Zijlstra 已提交
3244
{
3245
	struct perf_event *event;
P
Peter Zijlstra 已提交
3246 3247 3248 3249 3250

	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
		return;

	rcu_read_lock();
3251 3252 3253
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_task_match(event))
			perf_event_task_output(event, task_event);
P
Peter Zijlstra 已提交
3254 3255 3256 3257
	}
	rcu_read_unlock();
}

3258
static void perf_event_task_event(struct perf_task_event *task_event)
P
Peter Zijlstra 已提交
3259 3260
{
	struct perf_cpu_context *cpuctx;
3261
	struct perf_event_context *ctx = task_event->task_ctx;
P
Peter Zijlstra 已提交
3262 3263

	cpuctx = &get_cpu_var(perf_cpu_context);
3264
	perf_event_task_ctx(&cpuctx->ctx, task_event);
P
Peter Zijlstra 已提交
3265 3266 3267
	put_cpu_var(perf_cpu_context);

	rcu_read_lock();
3268
	if (!ctx)
3269
		ctx = rcu_dereference(task_event->task->perf_event_ctxp);
P
Peter Zijlstra 已提交
3270
	if (ctx)
3271
		perf_event_task_ctx(ctx, task_event);
P
Peter Zijlstra 已提交
3272 3273 3274
	rcu_read_unlock();
}

3275 3276
static void perf_event_task(struct task_struct *task,
			      struct perf_event_context *task_ctx,
3277
			      int new)
P
Peter Zijlstra 已提交
3278
{
P
Peter Zijlstra 已提交
3279
	struct perf_task_event task_event;
P
Peter Zijlstra 已提交
3280

3281 3282 3283
	if (!atomic_read(&nr_comm_events) &&
	    !atomic_read(&nr_mmap_events) &&
	    !atomic_read(&nr_task_events))
P
Peter Zijlstra 已提交
3284 3285
		return;

P
Peter Zijlstra 已提交
3286
	task_event = (struct perf_task_event){
3287 3288
		.task	  = task,
		.task_ctx = task_ctx,
3289
		.event_id    = {
P
Peter Zijlstra 已提交
3290
			.header = {
3291
				.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3292
				.misc = 0,
3293
				.size = sizeof(task_event.event_id),
P
Peter Zijlstra 已提交
3294
			},
3295 3296
			/* .pid  */
			/* .ppid */
P
Peter Zijlstra 已提交
3297 3298
			/* .tid  */
			/* .ptid */
P
Peter Zijlstra 已提交
3299 3300 3301
		},
	};

3302
	perf_event_task_event(&task_event);
P
Peter Zijlstra 已提交
3303 3304
}

3305
void perf_event_fork(struct task_struct *task)
P
Peter Zijlstra 已提交
3306
{
3307
	perf_event_task(task, NULL, 1);
P
Peter Zijlstra 已提交
3308 3309
}

3310 3311 3312 3313 3314
/*
 * comm tracking
 */

struct perf_comm_event {
3315 3316
	struct task_struct	*task;
	char			*comm;
3317 3318 3319 3320 3321 3322 3323
	int			comm_size;

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
3324
	} event_id;
3325 3326
};

3327
static void perf_event_comm_output(struct perf_event *event,
3328 3329 3330
				     struct perf_comm_event *comm_event)
{
	struct perf_output_handle handle;
3331 3332
	int size = comm_event->event_id.header.size;
	int ret = perf_output_begin(&handle, event, size, 0, 0);
3333 3334 3335 3336

	if (ret)
		return;

3337 3338
	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3339

3340
	perf_output_put(&handle, comm_event->event_id);
3341 3342 3343 3344 3345
	perf_output_copy(&handle, comm_event->comm,
				   comm_event->comm_size);
	perf_output_end(&handle);
}

3346
static int perf_event_comm_match(struct perf_event *event)
3347
{
3348
	if (event->attr.comm)
3349 3350 3351 3352 3353
		return 1;

	return 0;
}

3354
static void perf_event_comm_ctx(struct perf_event_context *ctx,
3355 3356
				  struct perf_comm_event *comm_event)
{
3357
	struct perf_event *event;
3358 3359 3360 3361 3362

	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
		return;

	rcu_read_lock();
3363 3364 3365
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_comm_match(event))
			perf_event_comm_output(event, comm_event);
3366 3367 3368 3369
	}
	rcu_read_unlock();
}

3370
static void perf_event_comm_event(struct perf_comm_event *comm_event)
3371 3372
{
	struct perf_cpu_context *cpuctx;
3373
	struct perf_event_context *ctx;
3374
	unsigned int size;
3375
	char comm[TASK_COMM_LEN];
3376

3377 3378
	memset(comm, 0, sizeof(comm));
	strncpy(comm, comm_event->task->comm, sizeof(comm));
3379
	size = ALIGN(strlen(comm)+1, sizeof(u64));
3380 3381 3382 3383

	comm_event->comm = comm;
	comm_event->comm_size = size;

3384
	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3385 3386

	cpuctx = &get_cpu_var(perf_cpu_context);
3387
	perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3388
	put_cpu_var(perf_cpu_context);
3389 3390 3391 3392 3393 3394

	rcu_read_lock();
	/*
	 * doesn't really matter which of the child contexts the
	 * events ends up in.
	 */
3395
	ctx = rcu_dereference(current->perf_event_ctxp);
3396
	if (ctx)
3397
		perf_event_comm_ctx(ctx, comm_event);
3398
	rcu_read_unlock();
3399 3400
}

3401
void perf_event_comm(struct task_struct *task)
3402
{
3403 3404
	struct perf_comm_event comm_event;

3405 3406
	if (task->perf_event_ctxp)
		perf_event_enable_on_exec(task);
3407

3408
	if (!atomic_read(&nr_comm_events))
3409
		return;
3410

3411
	comm_event = (struct perf_comm_event){
3412
		.task	= task,
3413 3414
		/* .comm      */
		/* .comm_size */
3415
		.event_id  = {
3416
			.header = {
3417
				.type = PERF_RECORD_COMM,
3418 3419 3420 3421 3422
				.misc = 0,
				/* .size */
			},
			/* .pid */
			/* .tid */
3423 3424 3425
		},
	};

3426
	perf_event_comm_event(&comm_event);
3427 3428
}

3429 3430 3431 3432 3433
/*
 * mmap tracking
 */

struct perf_mmap_event {
3434 3435 3436 3437
	struct vm_area_struct	*vma;

	const char		*file_name;
	int			file_size;
3438 3439 3440 3441 3442 3443 3444 3445 3446

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
		u64				start;
		u64				len;
		u64				pgoff;
3447
	} event_id;
3448 3449
};

3450
static void perf_event_mmap_output(struct perf_event *event,
3451 3452 3453
				     struct perf_mmap_event *mmap_event)
{
	struct perf_output_handle handle;
3454 3455
	int size = mmap_event->event_id.header.size;
	int ret = perf_output_begin(&handle, event, size, 0, 0);
3456 3457 3458 3459

	if (ret)
		return;

3460 3461
	mmap_event->event_id.pid = perf_event_pid(event, current);
	mmap_event->event_id.tid = perf_event_tid(event, current);
3462

3463
	perf_output_put(&handle, mmap_event->event_id);
3464 3465
	perf_output_copy(&handle, mmap_event->file_name,
				   mmap_event->file_size);
3466
	perf_output_end(&handle);
3467 3468
}

3469
static int perf_event_mmap_match(struct perf_event *event,
3470 3471
				   struct perf_mmap_event *mmap_event)
{
3472
	if (event->attr.mmap)
3473 3474 3475 3476 3477
		return 1;

	return 0;
}

3478
static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3479 3480
				  struct perf_mmap_event *mmap_event)
{
3481
	struct perf_event *event;
3482 3483 3484 3485 3486

	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
		return;

	rcu_read_lock();
3487 3488 3489
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_mmap_match(event, mmap_event))
			perf_event_mmap_output(event, mmap_event);
3490 3491 3492 3493
	}
	rcu_read_unlock();
}

3494
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3495 3496
{
	struct perf_cpu_context *cpuctx;
3497
	struct perf_event_context *ctx;
3498 3499
	struct vm_area_struct *vma = mmap_event->vma;
	struct file *file = vma->vm_file;
3500 3501 3502
	unsigned int size;
	char tmp[16];
	char *buf = NULL;
3503
	const char *name;
3504

3505 3506
	memset(tmp, 0, sizeof(tmp));

3507
	if (file) {
3508 3509 3510 3511 3512 3513
		/*
		 * d_path works from the end of the buffer backwards, so we
		 * need to add enough zero bytes after the string to handle
		 * the 64bit alignment we do later.
		 */
		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3514 3515 3516 3517
		if (!buf) {
			name = strncpy(tmp, "//enomem", sizeof(tmp));
			goto got_name;
		}
3518
		name = d_path(&file->f_path, buf, PATH_MAX);
3519 3520 3521 3522 3523
		if (IS_ERR(name)) {
			name = strncpy(tmp, "//toolong", sizeof(tmp));
			goto got_name;
		}
	} else {
3524 3525 3526
		if (arch_vma_name(mmap_event->vma)) {
			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
				       sizeof(tmp));
3527
			goto got_name;
3528
		}
3529 3530 3531 3532 3533 3534

		if (!vma->vm_mm) {
			name = strncpy(tmp, "[vdso]", sizeof(tmp));
			goto got_name;
		}

3535 3536 3537 3538 3539
		name = strncpy(tmp, "//anon", sizeof(tmp));
		goto got_name;
	}

got_name:
3540
	size = ALIGN(strlen(name)+1, sizeof(u64));
3541 3542 3543 3544

	mmap_event->file_name = name;
	mmap_event->file_size = size;

3545
	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3546 3547

	cpuctx = &get_cpu_var(perf_cpu_context);
3548
	perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3549 3550
	put_cpu_var(perf_cpu_context);

3551 3552 3553 3554 3555
	rcu_read_lock();
	/*
	 * doesn't really matter which of the child contexts the
	 * events ends up in.
	 */
3556
	ctx = rcu_dereference(current->perf_event_ctxp);
3557
	if (ctx)
3558
		perf_event_mmap_ctx(ctx, mmap_event);
3559 3560
	rcu_read_unlock();

3561 3562 3563
	kfree(buf);
}

3564
void __perf_event_mmap(struct vm_area_struct *vma)
3565
{
3566 3567
	struct perf_mmap_event mmap_event;

3568
	if (!atomic_read(&nr_mmap_events))
3569 3570 3571
		return;

	mmap_event = (struct perf_mmap_event){
3572
		.vma	= vma,
3573 3574
		/* .file_name */
		/* .file_size */
3575
		.event_id  = {
3576
			.header = {
3577
				.type = PERF_RECORD_MMAP,
3578 3579 3580 3581 3582
				.misc = 0,
				/* .size */
			},
			/* .pid */
			/* .tid */
3583 3584 3585
			.start  = vma->vm_start,
			.len    = vma->vm_end - vma->vm_start,
			.pgoff  = vma->vm_pgoff,
3586 3587 3588
		},
	};

3589
	perf_event_mmap_event(&mmap_event);
3590 3591
}

3592 3593 3594 3595
/*
 * IRQ throttle logging
 */

3596
static void perf_log_throttle(struct perf_event *event, int enable)
3597 3598 3599 3600 3601 3602 3603
{
	struct perf_output_handle handle;
	int ret;

	struct {
		struct perf_event_header	header;
		u64				time;
3604
		u64				id;
3605
		u64				stream_id;
3606 3607
	} throttle_event = {
		.header = {
3608
			.type = PERF_RECORD_THROTTLE,
3609 3610 3611
			.misc = 0,
			.size = sizeof(throttle_event),
		},
P
Peter Zijlstra 已提交
3612
		.time		= perf_clock(),
3613 3614
		.id		= primary_event_id(event),
		.stream_id	= event->id,
3615 3616
	};

3617
	if (enable)
3618
		throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3619

3620
	ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3621 3622 3623 3624 3625 3626 3627
	if (ret)
		return;

	perf_output_put(&handle, throttle_event);
	perf_output_end(&handle);
}

3628
/*
3629
 * Generic event overflow handling, sampling.
3630 3631
 */

3632
static int __perf_event_overflow(struct perf_event *event, int nmi,
3633 3634
				   int throttle, struct perf_sample_data *data,
				   struct pt_regs *regs)
3635
{
3636 3637
	int events = atomic_read(&event->event_limit);
	struct hw_perf_event *hwc = &event->hw;
3638 3639
	int ret = 0;

3640
	throttle = (throttle && event->pmu->unthrottle != NULL);
3641

3642
	if (!throttle) {
3643
		hwc->interrupts++;
3644
	} else {
3645 3646
		if (hwc->interrupts != MAX_INTERRUPTS) {
			hwc->interrupts++;
3647
			if (HZ * hwc->interrupts >
3648
					(u64)sysctl_perf_event_sample_rate) {
3649
				hwc->interrupts = MAX_INTERRUPTS;
3650
				perf_log_throttle(event, 0);
3651 3652 3653 3654
				ret = 1;
			}
		} else {
			/*
3655
			 * Keep re-disabling events even though on the previous
3656
			 * pass we disabled it - just in case we raced with a
3657
			 * sched-in and the event got enabled again:
3658
			 */
3659 3660 3661
			ret = 1;
		}
	}
3662

3663
	if (event->attr.freq) {
P
Peter Zijlstra 已提交
3664
		u64 now = perf_clock();
3665 3666 3667 3668 3669
		s64 delta = now - hwc->freq_stamp;

		hwc->freq_stamp = now;

		if (delta > 0 && delta < TICK_NSEC)
3670
			perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3671 3672
	}

3673 3674
	/*
	 * XXX event_limit might not quite work as expected on inherited
3675
	 * events
3676 3677
	 */

3678 3679
	event->pending_kill = POLL_IN;
	if (events && atomic_dec_and_test(&event->event_limit)) {
3680
		ret = 1;
3681
		event->pending_kill = POLL_HUP;
3682
		if (nmi) {
3683 3684 3685
			event->pending_disable = 1;
			perf_pending_queue(&event->pending,
					   perf_pending_event);
3686
		} else
3687
			perf_event_disable(event);
3688 3689
	}

3690
	perf_event_output(event, nmi, data, regs);
3691
	return ret;
3692 3693
}

3694
int perf_event_overflow(struct perf_event *event, int nmi,
3695 3696
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
3697
{
3698
	return __perf_event_overflow(event, nmi, 1, data, regs);
3699 3700
}

3701
/*
3702
 * Generic software event infrastructure
3703 3704
 */

3705
/*
3706 3707
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period event
3708 3709 3710 3711
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 */

3712
static u64 perf_swevent_set_period(struct perf_event *event)
3713
{
3714
	struct hw_perf_event *hwc = &event->hw;
3715 3716 3717 3718 3719
	u64 period = hwc->last_period;
	u64 nr, offset;
	s64 old, val;

	hwc->last_period = hwc->sample_period;
3720 3721

again:
3722 3723 3724
	old = val = atomic64_read(&hwc->period_left);
	if (val < 0)
		return 0;
3725

3726 3727 3728 3729 3730
	nr = div64_u64(period + val, period);
	offset = nr * period;
	val -= offset;
	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
		goto again;
3731

3732
	return nr;
3733 3734
}

3735
static void perf_swevent_overflow(struct perf_event *event,
3736 3737
				    int nmi, struct perf_sample_data *data,
				    struct pt_regs *regs)
3738
{
3739
	struct hw_perf_event *hwc = &event->hw;
3740
	int throttle = 0;
3741
	u64 overflow;
3742

3743 3744
	data->period = event->hw.last_period;
	overflow = perf_swevent_set_period(event);
3745

3746 3747
	if (hwc->interrupts == MAX_INTERRUPTS)
		return;
3748

3749
	for (; overflow; overflow--) {
3750
		if (__perf_event_overflow(event, nmi, throttle,
3751
					    data, regs)) {
3752 3753 3754 3755 3756 3757
			/*
			 * We inhibit the overflow from happening when
			 * hwc->interrupts == MAX_INTERRUPTS.
			 */
			break;
		}
3758
		throttle = 1;
3759
	}
3760 3761
}

3762
static void perf_swevent_unthrottle(struct perf_event *event)
3763 3764
{
	/*
3765
	 * Nothing to do, we already reset hwc->interrupts.
3766
	 */
3767
}
3768

3769
static void perf_swevent_add(struct perf_event *event, u64 nr,
3770 3771
			       int nmi, struct perf_sample_data *data,
			       struct pt_regs *regs)
3772
{
3773
	struct hw_perf_event *hwc = &event->hw;
3774

3775
	atomic64_add(nr, &event->count);
3776

3777 3778
	if (!hwc->sample_period)
		return;
3779

3780
	if (!regs)
3781
		return;
3782

3783
	if (!atomic64_add_negative(nr, &hwc->period_left))
3784
		perf_swevent_overflow(event, nmi, data, regs);
3785 3786
}

3787
static int perf_swevent_is_counting(struct perf_event *event)
3788
{
3789
	/*
3790
	 * The event is active, we're good!
3791
	 */
3792
	if (event->state == PERF_EVENT_STATE_ACTIVE)
3793 3794
		return 1;

3795
	/*
3796
	 * The event is off/error, not counting.
3797
	 */
3798
	if (event->state != PERF_EVENT_STATE_INACTIVE)
3799 3800 3801
		return 0;

	/*
3802
	 * The event is inactive, if the context is active
3803 3804
	 * we're part of a group that didn't make it on the 'pmu',
	 * not counting.
3805
	 */
3806
	if (event->ctx->is_active)
3807 3808 3809 3810 3811 3812 3813 3814
		return 0;

	/*
	 * We're inactive and the context is too, this means the
	 * task is scheduled out, we're counting events that happen
	 * to us, like migration events.
	 */
	return 1;
3815 3816
}

L
Li Zefan 已提交
3817 3818 3819
static int perf_tp_event_match(struct perf_event *event,
				struct perf_sample_data *data);

3820
static int perf_swevent_match(struct perf_event *event,
P
Peter Zijlstra 已提交
3821
				enum perf_type_id type,
L
Li Zefan 已提交
3822 3823 3824
				u32 event_id,
				struct perf_sample_data *data,
				struct pt_regs *regs)
3825
{
3826
	if (!perf_swevent_is_counting(event))
3827 3828
		return 0;

3829
	if (event->attr.type != type)
3830
		return 0;
3831
	if (event->attr.config != event_id)
3832 3833
		return 0;

3834
	if (regs) {
3835
		if (event->attr.exclude_user && user_mode(regs))
3836
			return 0;
3837

3838
		if (event->attr.exclude_kernel && !user_mode(regs))
3839 3840
			return 0;
	}
3841

L
Li Zefan 已提交
3842 3843 3844 3845
	if (event->attr.type == PERF_TYPE_TRACEPOINT &&
	    !perf_tp_event_match(event, data))
		return 0;

3846 3847 3848
	return 1;
}

3849
static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3850
				     enum perf_type_id type,
3851
				     u32 event_id, u64 nr, int nmi,
3852 3853
				     struct perf_sample_data *data,
				     struct pt_regs *regs)
3854
{
3855
	struct perf_event *event;
3856

3857
	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3858 3859
		return;

P
Peter Zijlstra 已提交
3860
	rcu_read_lock();
3861
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
L
Li Zefan 已提交
3862
		if (perf_swevent_match(event, type, event_id, data, regs))
3863
			perf_swevent_add(event, nr, nmi, data, regs);
3864
	}
P
Peter Zijlstra 已提交
3865
	rcu_read_unlock();
3866 3867
}

3868
static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
P
Peter Zijlstra 已提交
3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881
{
	if (in_nmi())
		return &cpuctx->recursion[3];

	if (in_irq())
		return &cpuctx->recursion[2];

	if (in_softirq())
		return &cpuctx->recursion[1];

	return &cpuctx->recursion[0];
}

static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
				    u64 nr, int nmi,
				    struct perf_sample_data *data,
				    struct pt_regs *regs)
{
	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
	int *recursion = perf_swevent_recursion_context(cpuctx);
	struct perf_event_context *ctx;

	if (*recursion)
		goto out;

	(*recursion)++;
	barrier();

	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
				 nr, nmi, data, regs);
	rcu_read_lock();
	/*
	 * doesn't really matter which of the child contexts the
	 * events ends up in.
	 */
	ctx = rcu_dereference(current->perf_event_ctxp);
	if (ctx)
		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
	rcu_read_unlock();

	barrier();
	(*recursion)--;

out:
	put_cpu_var(perf_cpu_context);
}

void __perf_sw_event(u32 event_id, u64 nr, int nmi,
			    struct pt_regs *regs, u64 addr)
{
	struct perf_sample_data data = {
		.addr = addr,
	};

	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
				&data, regs);
}
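
/*
 * Callers normally go through the perf_sw_event() wrapper in the
 * perf_event header, which is expected to check perf_swevent_enabled[]
 * before ending up in __perf_sw_event(); a typical (illustrative) call
 * site is a fault handler doing:
 *
 *	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 */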

static void perf_swevent_read(struct perf_event *event)
{
}

static int perf_swevent_enable(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	if (hwc->sample_period) {
		hwc->last_period = hwc->sample_period;
		perf_swevent_set_period(event);
	}
	return 0;
}

static void perf_swevent_disable(struct perf_event *event)
{
}

static const struct pmu perf_ops_generic = {
	.enable		= perf_swevent_enable,
	.disable	= perf_swevent_disable,
	.read		= perf_swevent_read,
	.unthrottle	= perf_swevent_unthrottle,
};

/*
 * hrtimer based swevent callback
 */

static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
	enum hrtimer_restart ret = HRTIMER_RESTART;
	struct perf_sample_data data;
	struct pt_regs *regs;
	struct perf_event *event;
	u64 period;

	event	= container_of(hrtimer, struct perf_event, hw.hrtimer);
	event->pmu->read(event);

	data.addr = 0;
	regs = get_irq_regs();
	/*
	 * In case we exclude kernel IPs or are somehow not in interrupt
	 * context, provide the next best thing, the user IP.
	 */
	if ((event->attr.exclude_kernel || !regs) &&
			!event->attr.exclude_user)
		regs = task_pt_regs(current);

	if (regs) {
		if (perf_event_overflow(event, 0, &data, regs))
			ret = HRTIMER_NORESTART;
	}

	period = max_t(u64, 10000, event->hw.sample_period);
	hrtimer_forward_now(hrtimer, ns_to_ktime(period));

	return ret;
}
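
/*
 * Note the period used to re-arm the timer is clamped to at least
 * 10000ns (10us) here, and again in the cpu/task clock enable paths
 * below, so an absurdly high sample frequency on these software clocks
 * degrades gracefully instead of re-arming the hrtimer back to back.
 */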

/*
 * Software event: cpu wall time clock
 */

static void cpu_clock_perf_event_update(struct perf_event *event)
{
	int cpu = raw_smp_processor_id();
	s64 prev;
	u64 now;

	now = cpu_clock(cpu);
	prev = atomic64_read(&event->hw.prev_count);
	atomic64_set(&event->hw.prev_count, now);
	atomic64_add(now - prev, &event->count);
}

static int cpu_clock_perf_event_enable(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int cpu = raw_smp_processor_id();

	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swevent_hrtimer;
	if (hwc->sample_period) {
		u64 period = max_t(u64, 10000, hwc->sample_period);
		__hrtimer_start_range_ns(&hwc->hrtimer,
				ns_to_ktime(period), 0,
				HRTIMER_MODE_REL, 0);
	}

	return 0;
}

static void cpu_clock_perf_event_disable(struct perf_event *event)
{
	if (event->hw.sample_period)
		hrtimer_cancel(&event->hw.hrtimer);
	cpu_clock_perf_event_update(event);
}

static void cpu_clock_perf_event_read(struct perf_event *event)
{
	cpu_clock_perf_event_update(event);
}

static const struct pmu perf_ops_cpu_clock = {
	.enable		= cpu_clock_perf_event_enable,
	.disable	= cpu_clock_perf_event_disable,
	.read		= cpu_clock_perf_event_read,
};

/*
 * Software event: task time clock
 */

static void task_clock_perf_event_update(struct perf_event *event, u64 now)
{
	u64 prev;
	s64 delta;

	prev = atomic64_xchg(&event->hw.prev_count, now);
	delta = now - prev;
	atomic64_add(delta, &event->count);
}

static int task_clock_perf_event_enable(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 now;

	now = event->ctx->time;

	atomic64_set(&hwc->prev_count, now);
	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swevent_hrtimer;
	if (hwc->sample_period) {
		u64 period = max_t(u64, 10000, hwc->sample_period);
		__hrtimer_start_range_ns(&hwc->hrtimer,
				ns_to_ktime(period), 0,
				HRTIMER_MODE_REL, 0);
	}

	return 0;
}

static void task_clock_perf_event_disable(struct perf_event *event)
{
	if (event->hw.sample_period)
		hrtimer_cancel(&event->hw.hrtimer);
	task_clock_perf_event_update(event, event->ctx->time);
}

static void task_clock_perf_event_read(struct perf_event *event)
{
	u64 time;

	if (!in_nmi()) {
		update_context_time(event->ctx);
		time = event->ctx->time;
	} else {
		u64 now = perf_clock();
		u64 delta = now - event->ctx->timestamp;
		time = event->ctx->time + delta;
	}

	task_clock_perf_event_update(event, time);
}

static const struct pmu perf_ops_task_clock = {
	.enable		= task_clock_perf_event_enable,
	.disable	= task_clock_perf_event_disable,
	.read		= task_clock_perf_event_read,
};

#ifdef CONFIG_EVENT_PROFILE

void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
			  int entry_size)
{
	struct perf_raw_record raw = {
		.size = entry_size,
		.data = record,
	};

	struct perf_sample_data data = {
		.addr = addr,
		.raw = &raw,
	};

	struct pt_regs *regs = get_irq_regs();

	if (!regs)
		regs = task_pt_regs(current);

	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
				&data, regs);
}
EXPORT_SYMBOL_GPL(perf_tp_event);

static int perf_tp_event_match(struct perf_event *event,
				struct perf_sample_data *data)
{
	void *record = data->raw->data;

	if (likely(!event->filter) || filter_match_preds(event->filter, record))
		return 1;
	return 0;
}

static void tp_perf_event_destroy(struct perf_event *event)
{
	ftrace_profile_disable(event->attr.config);
}

static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
	/*
	 * Raw tracepoint data is a severe data leak, only allow root to
	 * have these.
	 */
	if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
			perf_paranoid_tracepoint_raw() &&
			!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	if (ftrace_profile_enable(event->attr.config))
		return NULL;

	event->destroy = tp_perf_event_destroy;

	return &perf_ops_generic;
}

static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
	char *filter_str;
	int ret;

	if (event->attr.type != PERF_TYPE_TRACEPOINT)
		return -EINVAL;

	filter_str = strndup_user(arg, PAGE_SIZE);
	if (IS_ERR(filter_str))
		return PTR_ERR(filter_str);

	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);

	kfree(filter_str);
	return ret;
}

static void perf_event_free_filter(struct perf_event *event)
{
	ftrace_profile_free_filter(event);
}

#else

static int perf_tp_event_match(struct perf_event *event,
				struct perf_sample_data *data)
{
	return 1;
}

static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
	return NULL;
}

static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
	return -ENOENT;
}

static void perf_event_free_filter(struct perf_event *event)
{
}

#endif /* CONFIG_EVENT_PROFILE */

atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];

static void sw_perf_event_destroy(struct perf_event *event)
{
	u64 event_id = event->attr.config;

	WARN_ON(event->parent);

	atomic_dec(&perf_swevent_enabled[event_id]);
}

static const struct pmu *sw_perf_event_init(struct perf_event *event)
{
	const struct pmu *pmu = NULL;
	u64 event_id = event->attr.config;

	/*
	 * Software events (currently) can't in general distinguish
	 * between user, kernel and hypervisor events.
	 * However, context switches and cpu migrations are considered
	 * to be kernel events, and page faults are never hypervisor
	 * events.
	 */
	switch (event_id) {
	case PERF_COUNT_SW_CPU_CLOCK:
		pmu = &perf_ops_cpu_clock;

		break;
	case PERF_COUNT_SW_TASK_CLOCK:
		/*
		 * If the user instantiates this as a per-cpu event,
		 * use the cpu_clock event instead.
		 */
		if (event->ctx->task)
			pmu = &perf_ops_task_clock;
		else
			pmu = &perf_ops_cpu_clock;

		break;
	case PERF_COUNT_SW_PAGE_FAULTS:
	case PERF_COUNT_SW_PAGE_FAULTS_MIN:
	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
	case PERF_COUNT_SW_CONTEXT_SWITCHES:
	case PERF_COUNT_SW_CPU_MIGRATIONS:
		if (!event->parent) {
			atomic_inc(&perf_swevent_enabled[event_id]);
			event->destroy = sw_perf_event_destroy;
		}
		pmu = &perf_ops_generic;
		break;
	}

	return pmu;
}
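
/*
 * The perf_swevent_enabled[] counts maintained above let the generic
 * software event hooks bail out with a single atomic_read() while no
 * event of that type exists anywhere in the system (see the
 * perf_sw_event() wrapper in the perf_event header).
 */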

/*
 * Allocate and initialize a event structure
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr,
		   int cpu,
		   struct perf_event_context *ctx,
		   struct perf_event *group_leader,
		   struct perf_event *parent_event,
		   gfp_t gfpflags)
{
	const struct pmu *pmu;
	struct perf_event *event;
	struct hw_perf_event *hwc;
	long err;

	event = kzalloc(sizeof(*event), gfpflags);
	if (!event)
		return ERR_PTR(-ENOMEM);

	/*
	 * Single events are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = event;

	mutex_init(&event->child_mutex);
	INIT_LIST_HEAD(&event->child_list);

	INIT_LIST_HEAD(&event->group_entry);
	INIT_LIST_HEAD(&event->event_entry);
	INIT_LIST_HEAD(&event->sibling_list);
	init_waitqueue_head(&event->waitq);

	mutex_init(&event->mmap_mutex);

	event->cpu		= cpu;
	event->attr		= *attr;
	event->group_leader	= group_leader;
	event->pmu		= NULL;
	event->ctx		= ctx;
	event->oncpu		= -1;

	event->parent		= parent_event;

	event->ns		= get_pid_ns(current->nsproxy->pid_ns);
	event->id		= atomic64_inc_return(&perf_event_id);

	event->state		= PERF_EVENT_STATE_INACTIVE;

	if (attr->disabled)
		event->state = PERF_EVENT_STATE_OFF;

	pmu = NULL;

	hwc = &event->hw;
	hwc->sample_period = attr->sample_period;
	if (attr->freq && attr->sample_freq)
		hwc->sample_period = 1;
	hwc->last_period = hwc->sample_period;

	atomic64_set(&hwc->period_left, hwc->sample_period);

	/*
	 * we currently do not support PERF_FORMAT_GROUP on inherited events
	 */
	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
		goto done;

	switch (attr->type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		pmu = hw_perf_event_init(event);
		break;

	case PERF_TYPE_SOFTWARE:
		pmu = sw_perf_event_init(event);
		break;

	case PERF_TYPE_TRACEPOINT:
		pmu = tp_perf_event_init(event);
		break;

	default:
		break;
	}
done:
	err = 0;
	if (!pmu)
		err = -EINVAL;
	else if (IS_ERR(pmu))
		err = PTR_ERR(pmu);

	if (err) {
		if (event->ns)
			put_pid_ns(event->ns);
		kfree(event);
		return ERR_PTR(err);
	}

	event->pmu = pmu;

	if (!event->parent) {
		atomic_inc(&nr_events);
		if (event->attr.mmap)
			atomic_inc(&nr_mmap_events);
		if (event->attr.comm)
			atomic_inc(&nr_comm_events);
		if (event->attr.task)
			atomic_inc(&nr_task_events);
	}

	return event;
}

static int perf_copy_attr(struct perf_event_attr __user *uattr,
			  struct perf_event_attr *attr)
{
	u32 size;
	int ret;

	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
		return -EFAULT;

	/*
	 * zero the full structure, so that a short copy will be nice.
	 */
	memset(attr, 0, sizeof(*attr));

	ret = get_user(size, &uattr->size);
	if (ret)
		return ret;

	if (size > PAGE_SIZE)	/* silly large */
		goto err_size;

	if (!size)		/* abi compat */
		size = PERF_ATTR_SIZE_VER0;

	if (size < PERF_ATTR_SIZE_VER0)
		goto err_size;

	/*
	 * If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we dont know about yet.
	 */
	if (size > sizeof(*attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		addr = (void __user *)uattr + sizeof(*attr);
		end  = (void __user *)uattr + size;

		for (; addr < end; addr++) {
			ret = get_user(val, addr);
			if (ret)
				return ret;
			if (val)
				goto err_size;
		}
		size = sizeof(*attr);
	}

	ret = copy_from_user(attr, uattr, size);
	if (ret)
		return -EFAULT;

	/*
	 * If the type exists, the corresponding creation will verify
	 * the attr->config.
	 */
	if (attr->type >= PERF_TYPE_MAX)
		return -EINVAL;

	if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
		return -EINVAL;

	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
		return -EINVAL;

	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
		return -EINVAL;

out:
	return ret;

err_size:
	put_user(sizeof(*attr), &uattr->size);
	ret = -E2BIG;
	goto out;
}
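
/*
 * Summarizing the ABI handling above: a smaller (older) attr is accepted
 * and its tail stays zeroed from the memset; an equal size is copied as
 * is; a larger (newer) attr is only accepted when every byte past
 * sizeof(*attr) is zero, otherwise we write back the size we do
 * understand and return -E2BIG.
 */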

static int perf_event_set_output(struct perf_event *event, int output_fd)
{
	struct perf_event *output_event = NULL;
	struct file *output_file = NULL;
	struct perf_event *old_output;
	int fput_needed = 0;
	int ret = -EINVAL;

	if (!output_fd)
		goto set;

	output_file = fget_light(output_fd, &fput_needed);
	if (!output_file)
		return -EBADF;

	if (output_file->f_op != &perf_fops)
		goto out;

	output_event = output_file->private_data;

	/* Don't chain output fds */
	if (output_event->output)
		goto out;

	/* Don't set an output fd when we already have an output channel */
	if (event->data)
		goto out;

	atomic_long_inc(&output_file->f_count);

set:
	mutex_lock(&event->mmap_mutex);
	old_output = event->output;
	rcu_assign_pointer(event->output, output_event);
	mutex_unlock(&event->mmap_mutex);

	if (old_output) {
		/*
		 * we need to make sure no existing perf_output_*()
		 * is still referencing this event.
		 */
		synchronize_rcu();
		fput(old_output->filp);
	}

	ret = 0;
out:
	fput_light(output_file, fput_needed);
	return ret;
}
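
/*
 * This is what PERF_FLAG_FD_OUTPUT in sys_perf_event_open() below ends
 * up calling, with output_fd == group_fd: the new event's output is
 * redirected into the other event's buffer so several events can share
 * one mmap()ed ring buffer.  Chaining (redirecting to an event that is
 * itself redirected) is rejected above.
 */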

/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader event fd
 */
SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_event *event, *group_leader;
	struct perf_event_attr attr;
	struct perf_event_context *ctx;
	struct file *event_file = NULL;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int fput_needed2 = 0;
	int err;

	/* for future expandability... */
	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
		return -EINVAL;

	err = perf_copy_attr(attr_uptr, &attr);
	if (err)
		return err;

	if (!attr.exclude_kernel) {
		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	if (attr.freq) {
		if (attr.sample_freq > sysctl_perf_event_sample_rate)
			return -EINVAL;
	}

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/*
	 * Look up the group leader (we will attach this event to it):
	 */
	group_leader = NULL;
	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
		err = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto err_put_context;
		if (group_file->f_op != &perf_fops)
			goto err_put_context;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_put_context;
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (group_leader->ctx != ctx)
			goto err_put_context;
		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (attr.exclusive || attr.pinned)
			goto err_put_context;
	}

	event = perf_event_alloc(&attr, cpu, ctx, group_leader,
				     NULL, GFP_KERNEL);
	err = PTR_ERR(event);
	if (IS_ERR(event))
		goto err_put_context;

	err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
	if (err < 0)
		goto err_free_put_context;

	event_file = fget_light(err, &fput_needed2);
	if (!event_file)
		goto err_free_put_context;

	if (flags & PERF_FLAG_FD_OUTPUT) {
		err = perf_event_set_output(event, group_fd);
		if (err)
			goto err_fput_free_put_context;
	}

	event->filp = event_file;
	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, event, cpu);
	++ctx->generation;
	mutex_unlock(&ctx->mutex);

	event->owner = current;
	get_task_struct(current);
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);

err_fput_free_put_context:
	fput_light(event_file, fput_needed2);

err_free_put_context:
	if (err < 0)
		kfree(event);

err_put_context:
	if (err < 0)
		put_ctx(ctx);

	fput_light(group_file, fput_needed);

	return err;
}
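
/*
 * A minimal user-space sketch of the syscall above (illustrative only;
 * counts user-space instructions of the calling task on any cpu):
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.size		= sizeof(attr),
 *		.config		= PERF_COUNT_HW_INSTRUCTIONS,
 *		.disabled	= 1,
 *		.exclude_kernel	= 1,
 *	};
 *	u64 count;
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	... run the workload ...
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *	read(fd, &count, sizeof(count));
 */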

/*
 * inherit a event from parent task to child task:
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event *group_leader,
	      struct perf_event_context *child_ctx)
{
	struct perf_event *child_event;

	/*
	 * Instead of creating recursive hierarchies of events,
	 * we link inherited events back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_event->parent)
		parent_event = parent_event->parent;

	child_event = perf_event_alloc(&parent_event->attr,
					   parent_event->cpu, child_ctx,
					   group_leader, parent_event,
					   GFP_KERNEL);
	if (IS_ERR(child_event))
		return child_event;
	get_ctx(child_ctx);

	/*
	 * Make the child state follow the state of the parent event,
	 * not its attr.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_event_{en, dis}able_family.
	 */
	if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
		child_event->state = PERF_EVENT_STATE_INACTIVE;
	else
		child_event->state = PERF_EVENT_STATE_OFF;

	if (parent_event->attr.freq)
		child_event->hw.sample_period = parent_event->hw.sample_period;

	/*
	 * Link it up in the child's context:
	 */
	add_event_to_ctx(child_event, child_ctx);

	/*
	 * Get a reference to the parent filp - we will fput it
	 * when the child event exits. This is safe to do because
	 * we are in the parent and we know that the filp still
	 * exists and has a nonzero count:
	 */
	atomic_long_inc(&parent_event->filp->f_count);

	/*
	 * Link this into the parent event's child list
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_add_tail(&child_event->child_list, &parent_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	return child_event;
}

static int inherit_group(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event_context *child_ctx)
{
	struct perf_event *leader;
	struct perf_event *sub;
	struct perf_event *child_ctr;

	leader = inherit_event(parent_event, parent, parent_ctx,
				 child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
		child_ctr = inherit_event(sub, parent, parent_ctx,
					    child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);
	}
	return 0;
}

static void sync_child_event(struct perf_event *child_event,
			       struct task_struct *child)
{
	struct perf_event *parent_event = child_event->parent;
	u64 child_val;

	if (child_event->attr.inherit_stat)
		perf_event_read_event(child_event, child);

	child_val = atomic64_read(&child_event->count);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_event->count);
	atomic64_add(child_event->total_time_enabled,
		     &parent_event->child_total_time_enabled);
	atomic64_add(child_event->total_time_running,
		     &parent_event->child_total_time_running);

	/*
	 * Remove this event from the parent's list
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_del_init(&child_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	/*
	 * Release the parent event, if this was the last
	 * reference to it.
	 */
	fput(parent_event->filp);
}

static void
__perf_event_exit_task(struct perf_event *child_event,
			 struct perf_event_context *child_ctx,
			 struct task_struct *child)
{
	struct perf_event *parent_event;

	update_event_times(child_event);
	perf_event_remove_from_context(child_event);

	parent_event = child_event->parent;
	/*
	 * It can happen that parent exits first, and has events
	 * that are still around due to the child reference. These
	 * events need to be zapped - but otherwise linger.
	 */
	if (parent_event) {
		sync_child_event(child_event, child);
		free_event(child_event);
	}
}

/*
 * When a child task exits, feed back event values to parent events.
 */
void perf_event_exit_task(struct task_struct *child)
{
	struct perf_event *child_event, *tmp;
	struct perf_event_context *child_ctx;
	unsigned long flags;

	if (likely(!child->perf_event_ctxp)) {
		perf_event_task(child, NULL, 0);
		return;
	}

	local_irq_save(flags);
	/*
	 * We can't reschedule here because interrupts are disabled,
	 * and either child is current or it is a task that can't be
	 * scheduled, so we are now safe from rescheduling changing
	 * our context.
	 */
	child_ctx = child->perf_event_ctxp;
	__perf_event_task_sched_out(child_ctx);

	/*
	 * Take the context lock here so that if find_get_context is
	 * reading child->perf_event_ctxp, we wait until it has
	 * incremented the context's refcount before we do put_ctx below.
	 */
	spin_lock(&child_ctx->lock);
	child->perf_event_ctxp = NULL;
	/*
	 * If this context is a clone; unclone it so it can't get
	 * swapped to another process while we're removing all
	 * the events from it.
	 */
	unclone_ctx(child_ctx);
	spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can however still
	 * get a few PERF_RECORD_READ events.
	 */
	perf_event_task(child, child_ctx, 0);

	/*
	 * We can recurse on the same lock type through:
	 *
	 *   __perf_event_exit_task()
	 *     sync_child_event()
	 *       fput(parent_event->filp)
	 *         perf_release()
	 *           mutex_lock(&ctx->mutex)
	 *
	 * But since its the parent context it won't be the same instance.
	 */
	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);

again:
	list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
				 group_entry)
		__perf_event_exit_task(child_event, child_ctx, child);

	/*
	 * If the last event was a group event, it will have appended all
	 * its siblings to the list, but we obtained 'tmp' before that which
	 * will still point to the list head terminating the iteration.
	 */
	if (!list_empty(&child_ctx->group_list))
		goto again;

	mutex_unlock(&child_ctx->mutex);

	put_ctx(child_ctx);
}

/*
 * free an unexposed, unused context as created by inheritance by
 * init_task below, used by fork() in case of fail.
 */
void perf_event_free_task(struct task_struct *task)
{
	struct perf_event_context *ctx = task->perf_event_ctxp;
	struct perf_event *event, *tmp;

	if (!ctx)
		return;

	mutex_lock(&ctx->mutex);
again:
	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
		struct perf_event *parent = event->parent;

		if (WARN_ON_ONCE(!parent))
			continue;

		mutex_lock(&parent->child_mutex);
		list_del_init(&event->child_list);
		mutex_unlock(&parent->child_mutex);

		fput(parent->filp);

		list_del_event(event, ctx);
		free_event(event);
	}

	if (!list_empty(&ctx->group_list))
		goto again;

	mutex_unlock(&ctx->mutex);

	put_ctx(ctx);
}

/*
 * Initialize the perf_event context in task_struct
 */
int perf_event_init_task(struct task_struct *child)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	int ret = 0;

	child->perf_event_ctxp = NULL;

	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	if (likely(!parent->perf_event_ctxp))
		return 0;

	/*
	 * This is executed from the parent task context, so inherit
	 * events that have been marked for cloning.
	 * First allocate and initialize a context for the child.
	 */

	child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
	if (!child_ctx)
		return -ENOMEM;

	__perf_event_init_context(child_ctx, child);
	child->perf_event_ctxp = child_ctx;
	get_task_struct(child);

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent);

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We dont have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(event, &parent_ctx->group_list, group_entry) {

		if (!event->attr.inherit) {
			inherited_all = 0;
			continue;
		}

		ret = inherit_group(event, parent, parent_ctx,
					     child, child_ctx);
		if (ret) {
			inherited_all = 0;
			break;
		}
	}

	if (inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 * Note that if the parent is a clone, it could get
		 * uncloned at any point, but that doesn't matter
		 * because the list of events and the generation
		 * count can't have changed since we took the mutex.
		 */
		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);

	return ret;
}
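
/*
 * Rough life cycle of inherited events: fork()/copy_process() calls
 * perf_event_init_task(), which walks the parent's group list and clones
 * each inheritable event into the child's context via inherit_group() /
 * inherit_event(); on exit, perf_event_exit_task() folds the child's
 * counts back into the parent through sync_child_event().
 */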

static void __cpuinit perf_event_init_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
	__perf_event_init_context(&cpuctx->ctx, NULL);

	spin_lock(&perf_resource_lock);
	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
	spin_unlock(&perf_resource_lock);

	hw_perf_event_setup(cpu);
}

#ifdef CONFIG_HOTPLUG_CPU
static void __perf_event_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *ctx = &cpuctx->ctx;
	struct perf_event *event, *tmp;

	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
		__perf_event_remove_from_context(event);
}
static void perf_event_exit_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_event_context *ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
#endif
static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
5048
		perf_event_init_cpu(cpu);
T
Thomas Gleixner 已提交
5049 5050
		break;

5051 5052
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
5053
		hw_perf_event_setup_online(cpu);
5054 5055
		break;

T
Thomas Gleixner 已提交
5056 5057
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
5058
		perf_event_exit_cpu(cpu);
T
Thomas Gleixner 已提交
5059 5060 5061 5062 5063 5064 5065 5066 5067
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

/*
 * This has to have a higher priority than migration_notifier in sched.c.
 */
static struct notifier_block __cpuinitdata perf_cpu_nb = {
	.notifier_call		= perf_cpu_notify,
	.priority		= 20,
};

void __init perf_event_init(void)
{
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
			(void *)(long)smp_processor_id());
	register_cpu_notifier(&perf_cpu_nb);
}

static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
			const char *buf,
			size_t count)
{
	struct perf_cpu_context *cpuctx;
	unsigned long val;
	int err, cpu, mpt;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > perf_max_events)
		return -EINVAL;

	spin_lock(&perf_resource_lock);
	perf_reserved_percpu = val;
	for_each_online_cpu(cpu) {
		cpuctx = &per_cpu(perf_cpu_context, cpu);
		spin_lock_irq(&cpuctx->ctx.lock);
		mpt = min(perf_max_events - cpuctx->ctx.nr_events,
			  perf_max_events - perf_reserved_percpu);
		cpuctx->max_pertask = mpt;
		spin_unlock_irq(&cpuctx->ctx.lock);
	}
	spin_unlock(&perf_resource_lock);

	return count;
}

static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
	unsigned long val;
	int err;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > 1)
		return -EINVAL;

	spin_lock(&perf_resource_lock);
	perf_overcommit = val;
	spin_unlock(&perf_resource_lock);

	return count;
}

static SYSDEV_CLASS_ATTR(
				reserve_percpu,
				0644,
				perf_show_reserve_percpu,
				perf_set_reserve_percpu
			);

static SYSDEV_CLASS_ATTR(
				overcommit,
				0644,
				perf_show_overcommit,
				perf_set_overcommit
			);

static struct attribute *perfclass_attrs[] = {
	&attr_reserve_percpu.attr,
	&attr_overcommit.attr,
	NULL
};

static struct attribute_group perfclass_attr_group = {
	.attrs			= perfclass_attrs,
	.name			= "perf_events",
};

static int __init perf_event_sysfs_init(void)
{
	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				  &perfclass_attr_group);
}
device_initcall(perf_event_sysfs_init);