perf_event.c 124.6 KB
Newer Older
T
Thomas Gleixner 已提交
1
/*
I
Ingo Molnar 已提交
2
 * Performance events core code:
T
Thomas Gleixner 已提交
3
 *
4 5 6
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7
 *  Copyright    2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8
 *
I
Ingo Molnar 已提交
9
 * For licensing details see kernel-base/COPYING
T
Thomas Gleixner 已提交
10 11 12
 */

#include <linux/fs.h>
13
#include <linux/mm.h>
T
Thomas Gleixner 已提交
14 15
#include <linux/cpu.h>
#include <linux/smp.h>
16
#include <linux/file.h>
T
Thomas Gleixner 已提交
17 18
#include <linux/poll.h>
#include <linux/sysfs.h>
19
#include <linux/dcache.h>
T
Thomas Gleixner 已提交
20
#include <linux/percpu.h>
21
#include <linux/ptrace.h>
22
#include <linux/vmstat.h>
23
#include <linux/vmalloc.h>
24 25
#include <linux/hardirq.h>
#include <linux/rculist.h>
T
Thomas Gleixner 已提交
26 27 28
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
I
Ingo Molnar 已提交
29
#include <linux/kernel_stat.h>
30
#include <linux/perf_event.h>
L
Li Zefan 已提交
31
#include <linux/ftrace_event.h>
32
#include <linux/hw_breakpoint.h>
T
Thomas Gleixner 已提交
33

34 35
#include <asm/irq_regs.h>

T
Thomas Gleixner 已提交
36
/*
37
 * Each CPU has a list of per CPU events:
T
Thomas Gleixner 已提交
38
 */
39
static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
T
Thomas Gleixner 已提交
40

41
int perf_max_events __read_mostly = 1;
T
Thomas Gleixner 已提交
42 43 44
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

45 46 47 48
static atomic_t nr_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
49

50
/*
51
 * perf event paranoia level:
52 53
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
54
 *   1 - disallow cpu events for unpriv
55
 *   2 - disallow kernel profiling for unpriv
56
 */
57
int sysctl_perf_event_paranoid __read_mostly = 1;
58

59 60
static inline bool perf_paranoid_tracepoint_raw(void)
{
61
	return sysctl_perf_event_paranoid > -1;
62 63
}

64 65
static inline bool perf_paranoid_cpu(void)
{
66
	return sysctl_perf_event_paranoid > 0;
67 68 69 70
}

static inline bool perf_paranoid_kernel(void)
{
71
	return sysctl_perf_event_paranoid > 1;
72 73
}

74
int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
75 76

/*
77
 * max perf event sample rate
78
 */
79
int sysctl_perf_event_sample_rate __read_mostly = 100000;
80

81
static atomic64_t perf_event_id;
82

T
Thomas Gleixner 已提交
83
/*
84
 * Lock for (sysadmin-configurable) event reservations:
T
Thomas Gleixner 已提交
85
 */
86
static DEFINE_SPINLOCK(perf_resource_lock);
T
Thomas Gleixner 已提交
87 88 89 90

/*
 * Architecture provided APIs - weak aliases:
 */
91
extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
T
Thomas Gleixner 已提交
92
{
93
	return NULL;
T
Thomas Gleixner 已提交
94 95
}

96 97 98
void __weak hw_perf_disable(void)		{ barrier(); }
void __weak hw_perf_enable(void)		{ barrier(); }

99 100
void __weak hw_perf_event_setup(int cpu)	{ barrier(); }
void __weak hw_perf_event_setup_online(int cpu)	{ barrier(); }
101 102

int __weak
103
hw_perf_group_sched_in(struct perf_event *group_leader,
104
	       struct perf_cpu_context *cpuctx,
105
	       struct perf_event_context *ctx, int cpu)
106 107 108
{
	return 0;
}
T
Thomas Gleixner 已提交
109

110
void __weak perf_event_print_debug(void)	{ }
111

112
static DEFINE_PER_CPU(int, perf_disable_count);
113 114 115

void __perf_disable(void)
{
116
	__get_cpu_var(perf_disable_count)++;
117 118 119 120
}

bool __perf_enable(void)
{
121
	return !--__get_cpu_var(perf_disable_count);
122 123 124 125 126 127 128 129 130 131 132 133 134 135
}

void perf_disable(void)
{
	__perf_disable();
	hw_perf_disable();
}

void perf_enable(void)
{
	if (__perf_enable())
		hw_perf_enable();
}

136
static void get_ctx(struct perf_event_context *ctx)
137
{
138
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
139 140
}

141 142
static void free_ctx(struct rcu_head *head)
{
143
	struct perf_event_context *ctx;
144

145
	ctx = container_of(head, struct perf_event_context, rcu_head);
146 147 148
	kfree(ctx);
}

149
static void put_ctx(struct perf_event_context *ctx)
150
{
151 152 153
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
154 155 156
		if (ctx->task)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
157
	}
158 159
}

160
static void unclone_ctx(struct perf_event_context *ctx)
161 162 163 164 165 166 167
{
	if (ctx->parent_ctx) {
		put_ctx(ctx->parent_ctx);
		ctx->parent_ctx = NULL;
	}
}

168
/*
169
 * If we inherit events we want to return the parent event id
170 171
 * to userspace.
 */
172
static u64 primary_event_id(struct perf_event *event)
173
{
174
	u64 id = event->id;
175

176 177
	if (event->parent)
		id = event->parent->id;
178 179 180 181

	return id;
}

182
/*
183
 * Get the perf_event_context for a task and lock it.
184 185 186
 * This has to cope with with the fact that until it is locked,
 * the context could get moved to another task.
 */
187
static struct perf_event_context *
188
perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189
{
190
	struct perf_event_context *ctx;
191 192 193

	rcu_read_lock();
 retry:
194
	ctx = rcu_dereference(task->perf_event_ctxp);
195 196 197 198
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
199
		 * perf_event_task_sched_out, though the
200 201 202 203 204 205
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
206
		raw_spin_lock_irqsave(&ctx->lock, *flags);
207
		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
208
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
209 210
			goto retry;
		}
211 212

		if (!atomic_inc_not_zero(&ctx->refcount)) {
213
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
214 215
			ctx = NULL;
		}
216 217 218 219 220 221 222 223 224 225
	}
	rcu_read_unlock();
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
226
static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
227
{
228
	struct perf_event_context *ctx;
229 230 231 232 233
	unsigned long flags;

	ctx = perf_lock_task_context(task, &flags);
	if (ctx) {
		++ctx->pin_count;
234
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
235 236 237 238
	}
	return ctx;
}

239
static void perf_unpin_context(struct perf_event_context *ctx)
240 241 242
{
	unsigned long flags;

243
	raw_spin_lock_irqsave(&ctx->lock, flags);
244
	--ctx->pin_count;
245
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
246 247 248
	put_ctx(ctx);
}

249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
static inline u64 perf_clock(void)
{
	return cpu_clock(smp_processor_id());
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

/*
 * Update the total_time_enabled and total_time_running fields for a event.
 */
static void update_event_times(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	u64 run_end;

	if (event->state < PERF_EVENT_STATE_INACTIVE ||
	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
		return;

277 278 279 280 281 282
	if (ctx->is_active)
		run_end = ctx->time;
	else
		run_end = event->tstamp_stopped;

	event->total_time_enabled = run_end - event->tstamp_enabled;
283 284 285 286 287 288 289 290 291

	if (event->state == PERF_EVENT_STATE_INACTIVE)
		run_end = event->tstamp_stopped;
	else
		run_end = ctx->time;

	event->total_time_running = run_end - event->tstamp_running;
}

292 293 294 295 296 297 298 299 300
static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
	if (event->attr.pinned)
		return &ctx->pinned_groups;
	else
		return &ctx->flexible_groups;
}

301
/*
302
 * Add a event from the lists for its context.
303 304
 * Must be called with ctx->mutex and ctx->lock held.
 */
305
static void
306
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
307
{
308
	struct perf_event *group_leader = event->group_leader;
309 310

	/*
311 312
	 * Depending on whether it is a standalone or sibling event,
	 * add it straight to the context's event list, or to the group
313 314
	 * leader's sibling list:
	 */
315 316 317
	if (group_leader == event) {
		struct list_head *list;

318 319 320
		if (is_software_event(event))
			event->group_flags |= PERF_GROUP_SOFTWARE;

321 322 323
		list = ctx_group_list(event, ctx);
		list_add_tail(&event->group_entry, list);
	} else {
324 325 326 327
		if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
		    !is_software_event(event))
			group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;

328
		list_add_tail(&event->group_entry, &group_leader->sibling_list);
P
Peter Zijlstra 已提交
329 330
		group_leader->nr_siblings++;
	}
P
Peter Zijlstra 已提交
331

332 333 334
	list_add_rcu(&event->event_entry, &ctx->event_list);
	ctx->nr_events++;
	if (event->attr.inherit_stat)
335
		ctx->nr_stat++;
336 337
}

338
/*
339
 * Remove a event from the lists for its context.
340
 * Must be called with ctx->mutex and ctx->lock held.
341
 */
342
static void
343
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
344
{
345
	struct perf_event *sibling, *tmp;
346

347
	if (list_empty(&event->group_entry))
348
		return;
349 350
	ctx->nr_events--;
	if (event->attr.inherit_stat)
351
		ctx->nr_stat--;
352

353 354
	list_del_init(&event->group_entry);
	list_del_rcu(&event->event_entry);
355

356 357
	if (event->group_leader != event)
		event->group_leader->nr_siblings--;
P
Peter Zijlstra 已提交
358

359
	update_event_times(event);
360 361 362 363 364 365 366 367 368 369

	/*
	 * If event was in error state, then keep it
	 * that way, otherwise bogus counts will be
	 * returned on read(). The only way to get out
	 * of error state is by explicit re-enabling
	 * of the event
	 */
	if (event->state > PERF_EVENT_STATE_OFF)
		event->state = PERF_EVENT_STATE_OFF;
370

371
	/*
372 373
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
374 375
	 * to the context list directly:
	 */
376
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
377
		struct list_head *list;
378

379 380
		list = ctx_group_list(event, ctx);
		list_move_tail(&sibling->group_entry, list);
381
		sibling->group_leader = sibling;
382 383 384

		/* Inherit group flags from the previous leader */
		sibling->group_flags = event->group_flags;
385 386 387
	}
}

388
static void
389
event_sched_out(struct perf_event *event,
390
		  struct perf_cpu_context *cpuctx,
391
		  struct perf_event_context *ctx)
392
{
393
	if (event->state != PERF_EVENT_STATE_ACTIVE)
394 395
		return;

396 397 398 399
	event->state = PERF_EVENT_STATE_INACTIVE;
	if (event->pending_disable) {
		event->pending_disable = 0;
		event->state = PERF_EVENT_STATE_OFF;
400
	}
401 402 403
	event->tstamp_stopped = ctx->time;
	event->pmu->disable(event);
	event->oncpu = -1;
404

405
	if (!is_software_event(event))
406 407
		cpuctx->active_oncpu--;
	ctx->nr_active--;
408
	if (event->attr.exclusive || !cpuctx->active_oncpu)
409 410 411
		cpuctx->exclusive = 0;
}

412
static void
413
group_sched_out(struct perf_event *group_event,
414
		struct perf_cpu_context *cpuctx,
415
		struct perf_event_context *ctx)
416
{
417
	struct perf_event *event;
418

419
	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
420 421
		return;

422
	event_sched_out(group_event, cpuctx, ctx);
423 424 425 426

	/*
	 * Schedule out siblings (if any):
	 */
427 428
	list_for_each_entry(event, &group_event->sibling_list, group_entry)
		event_sched_out(event, cpuctx, ctx);
429

430
	if (group_event->attr.exclusive)
431 432 433
		cpuctx->exclusive = 0;
}

T
Thomas Gleixner 已提交
434
/*
435
 * Cross CPU call to remove a performance event
T
Thomas Gleixner 已提交
436
 *
437
 * We disable the event on the hardware level first. After that we
T
Thomas Gleixner 已提交
438 439
 * remove it from the context list.
 */
440
static void __perf_event_remove_from_context(void *info)
T
Thomas Gleixner 已提交
441 442
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
443 444
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
T
Thomas Gleixner 已提交
445 446 447 448 449 450

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
451
	if (ctx->task && cpuctx->task_ctx != ctx)
T
Thomas Gleixner 已提交
452 453
		return;

454
	raw_spin_lock(&ctx->lock);
455 456
	/*
	 * Protect the list operation against NMI by disabling the
457
	 * events on a global level.
458 459
	 */
	perf_disable();
T
Thomas Gleixner 已提交
460

461
	event_sched_out(event, cpuctx, ctx);
462

463
	list_del_event(event, ctx);
T
Thomas Gleixner 已提交
464 465 466

	if (!ctx->task) {
		/*
467
		 * Allow more per task events with respect to the
T
Thomas Gleixner 已提交
468 469 470
		 * reservation:
		 */
		cpuctx->max_pertask =
471 472
			min(perf_max_events - ctx->nr_events,
			    perf_max_events - perf_reserved_percpu);
T
Thomas Gleixner 已提交
473 474
	}

475
	perf_enable();
476
	raw_spin_unlock(&ctx->lock);
T
Thomas Gleixner 已提交
477 478 479 480
}


/*
481
 * Remove the event from a task's (or a CPU's) list of events.
T
Thomas Gleixner 已提交
482
 *
483
 * Must be called with ctx->mutex held.
T
Thomas Gleixner 已提交
484
 *
485
 * CPU events are removed with a smp call. For task events we only
T
Thomas Gleixner 已提交
486
 * call when the task is on a CPU.
487
 *
488 489
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
490 491
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
492
 * When called from perf_event_exit_task, it's OK because the
493
 * context has been detached from its task.
T
Thomas Gleixner 已提交
494
 */
495
static void perf_event_remove_from_context(struct perf_event *event)
T
Thomas Gleixner 已提交
496
{
497
	struct perf_event_context *ctx = event->ctx;
T
Thomas Gleixner 已提交
498 499 500 501
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
502
		 * Per cpu events are removed via an smp call and
503
		 * the removal is always successful.
T
Thomas Gleixner 已提交
504
		 */
505 506 507
		smp_call_function_single(event->cpu,
					 __perf_event_remove_from_context,
					 event, 1);
T
Thomas Gleixner 已提交
508 509 510 511
		return;
	}

retry:
512 513
	task_oncpu_function_call(task, __perf_event_remove_from_context,
				 event);
T
Thomas Gleixner 已提交
514

515
	raw_spin_lock_irq(&ctx->lock);
T
Thomas Gleixner 已提交
516 517 518
	/*
	 * If the context is active we need to retry the smp call.
	 */
519
	if (ctx->nr_active && !list_empty(&event->group_entry)) {
520
		raw_spin_unlock_irq(&ctx->lock);
T
Thomas Gleixner 已提交
521 522 523 524 525
		goto retry;
	}

	/*
	 * The lock prevents that this context is scheduled in so we
526
	 * can remove the event safely, if the call above did not
T
Thomas Gleixner 已提交
527 528
	 * succeed.
	 */
P
Peter Zijlstra 已提交
529
	if (!list_empty(&event->group_entry))
530
		list_del_event(event, ctx);
531
	raw_spin_unlock_irq(&ctx->lock);
T
Thomas Gleixner 已提交
532 533
}

534
/*
535
 * Update total_time_enabled and total_time_running for all events in a group.
536
 */
537
static void update_group_times(struct perf_event *leader)
538
{
539
	struct perf_event *event;
540

541 542 543
	update_event_times(leader);
	list_for_each_entry(event, &leader->sibling_list, group_entry)
		update_event_times(event);
544 545
}

546
/*
547
 * Cross CPU call to disable a performance event
548
 */
549
static void __perf_event_disable(void *info)
550
{
551
	struct perf_event *event = info;
552
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
553
	struct perf_event_context *ctx = event->ctx;
554 555

	/*
556 557
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
558
	 */
559
	if (ctx->task && cpuctx->task_ctx != ctx)
560 561
		return;

562
	raw_spin_lock(&ctx->lock);
563 564

	/*
565
	 * If the event is on, turn it off.
566 567
	 * If it is in error state, leave it in error state.
	 */
568
	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
569
		update_context_time(ctx);
570 571 572
		update_group_times(event);
		if (event == event->group_leader)
			group_sched_out(event, cpuctx, ctx);
573
		else
574 575
			event_sched_out(event, cpuctx, ctx);
		event->state = PERF_EVENT_STATE_OFF;
576 577
	}

578
	raw_spin_unlock(&ctx->lock);
579 580 581
}

/*
582
 * Disable a event.
583
 *
584 585
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
586
 * remains valid.  This condition is satisifed when called through
587 588 589 590
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in sync_child_event.
 * When called from perf_pending_event it's OK because event->ctx
591
 * is the current context on this CPU and preemption is disabled,
592
 * hence we can't get into perf_event_task_sched_out for this context.
593
 */
594
void perf_event_disable(struct perf_event *event)
595
{
596
	struct perf_event_context *ctx = event->ctx;
597 598 599 600
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
601
		 * Disable the event on the cpu that it's on
602
		 */
603 604
		smp_call_function_single(event->cpu, __perf_event_disable,
					 event, 1);
605 606 607 608
		return;
	}

 retry:
609
	task_oncpu_function_call(task, __perf_event_disable, event);
610

611
	raw_spin_lock_irq(&ctx->lock);
612
	/*
613
	 * If the event is still active, we need to retry the cross-call.
614
	 */
615
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
616
		raw_spin_unlock_irq(&ctx->lock);
617 618 619 620 621 622 623
		goto retry;
	}

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
624 625 626
	if (event->state == PERF_EVENT_STATE_INACTIVE) {
		update_group_times(event);
		event->state = PERF_EVENT_STATE_OFF;
627
	}
628

629
	raw_spin_unlock_irq(&ctx->lock);
630 631
}

632
static int
633
event_sched_in(struct perf_event *event,
634
		 struct perf_cpu_context *cpuctx,
635
		 struct perf_event_context *ctx,
636 637
		 int cpu)
{
638
	if (event->state <= PERF_EVENT_STATE_OFF)
639 640
		return 0;

641 642
	event->state = PERF_EVENT_STATE_ACTIVE;
	event->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
643 644 645 646 647
	/*
	 * The new state must be visible before we turn it on in the hardware:
	 */
	smp_wmb();

648 649 650
	if (event->pmu->enable(event)) {
		event->state = PERF_EVENT_STATE_INACTIVE;
		event->oncpu = -1;
651 652 653
		return -EAGAIN;
	}

654
	event->tstamp_running += ctx->time - event->tstamp_stopped;
655

656
	if (!is_software_event(event))
657
		cpuctx->active_oncpu++;
658 659
	ctx->nr_active++;

660
	if (event->attr.exclusive)
661 662
		cpuctx->exclusive = 1;

663 664 665
	return 0;
}

666
static int
667
group_sched_in(struct perf_event *group_event,
668
	       struct perf_cpu_context *cpuctx,
669
	       struct perf_event_context *ctx,
670 671
	       int cpu)
{
672
	struct perf_event *event, *partial_group;
673 674
	int ret;

675
	if (group_event->state == PERF_EVENT_STATE_OFF)
676 677
		return 0;

678
	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
679 680 681
	if (ret)
		return ret < 0 ? ret : 0;

682
	if (event_sched_in(group_event, cpuctx, ctx, cpu))
683 684 685 686 687
		return -EAGAIN;

	/*
	 * Schedule in siblings as one group (if any):
	 */
688 689 690
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event_sched_in(event, cpuctx, ctx, cpu)) {
			partial_group = event;
691 692 693 694 695 696 697 698 699 700 701
			goto group_error;
		}
	}

	return 0;

group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
	 */
702 703
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event == partial_group)
704
			break;
705
		event_sched_out(event, cpuctx, ctx);
706
	}
707
	event_sched_out(group_event, cpuctx, ctx);
708 709 710 711

	return -EAGAIN;
}

712
/*
713
 * Work out whether we can put this event group on the CPU now.
714
 */
715
static int group_can_go_on(struct perf_event *event,
716 717 718 719
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
720
	 * Groups consisting entirely of software events can always go on.
721
	 */
722
	if (event->group_flags & PERF_GROUP_SOFTWARE)
723 724 725
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
726
	 * events can go on.
727 728 729 730 731
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
732
	 * events on the CPU, it can't go on.
733
	 */
734
	if (event->attr.exclusive && cpuctx->active_oncpu)
735 736 737 738 739 740 741 742
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}

743 744
static void add_event_to_ctx(struct perf_event *event,
			       struct perf_event_context *ctx)
745
{
746 747 748 749
	list_add_event(event, ctx);
	event->tstamp_enabled = ctx->time;
	event->tstamp_running = ctx->time;
	event->tstamp_stopped = ctx->time;
750 751
}

T
Thomas Gleixner 已提交
752
/*
753
 * Cross CPU call to install and enable a performance event
754 755
 *
 * Must be called with ctx->mutex held
T
Thomas Gleixner 已提交
756 757 758 759
 */
static void __perf_install_in_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
760 761 762
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
T
Thomas Gleixner 已提交
763
	int cpu = smp_processor_id();
764
	int err;
T
Thomas Gleixner 已提交
765 766 767 768 769

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
770
	 * Or possibly this is the right context but it isn't
771
	 * on this cpu because it had no events.
T
Thomas Gleixner 已提交
772
	 */
773
	if (ctx->task && cpuctx->task_ctx != ctx) {
774
		if (cpuctx->task_ctx || ctx->task != current)
775 776 777
			return;
		cpuctx->task_ctx = ctx;
	}
T
Thomas Gleixner 已提交
778

779
	raw_spin_lock(&ctx->lock);
780
	ctx->is_active = 1;
781
	update_context_time(ctx);
T
Thomas Gleixner 已提交
782 783 784

	/*
	 * Protect the list operation against NMI by disabling the
785
	 * events on a global level. NOP for non NMI based events.
T
Thomas Gleixner 已提交
786
	 */
787
	perf_disable();
T
Thomas Gleixner 已提交
788

789
	add_event_to_ctx(event, ctx);
T
Thomas Gleixner 已提交
790

791 792 793
	if (event->cpu != -1 && event->cpu != smp_processor_id())
		goto unlock;

794
	/*
795
	 * Don't put the event on if it is disabled or if
796 797
	 * it is in a group and the group isn't on.
	 */
798 799
	if (event->state != PERF_EVENT_STATE_INACTIVE ||
	    (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
800 801
		goto unlock;

802
	/*
803 804 805
	 * An exclusive event can't go on if there are already active
	 * hardware events, and no hardware event can go on if there
	 * is already an exclusive event on.
806
	 */
807
	if (!group_can_go_on(event, cpuctx, 1))
808 809
		err = -EEXIST;
	else
810
		err = event_sched_in(event, cpuctx, ctx, cpu);
811

812 813
	if (err) {
		/*
814
		 * This event couldn't go on.  If it is in a group
815
		 * then we have to pull the whole group off.
816
		 * If the event group is pinned then put it in error state.
817
		 */
818
		if (leader != event)
819
			group_sched_out(leader, cpuctx, ctx);
820
		if (leader->attr.pinned) {
821
			update_group_times(leader);
822
			leader->state = PERF_EVENT_STATE_ERROR;
823
		}
824
	}
T
Thomas Gleixner 已提交
825

826
	if (!err && !ctx->task && cpuctx->max_pertask)
T
Thomas Gleixner 已提交
827 828
		cpuctx->max_pertask--;

829
 unlock:
830
	perf_enable();
831

832
	raw_spin_unlock(&ctx->lock);
T
Thomas Gleixner 已提交
833 834 835
}

/*
836
 * Attach a performance event to a context
T
Thomas Gleixner 已提交
837
 *
838 839
 * First we add the event to the list with the hardware enable bit
 * in event->hw_config cleared.
T
Thomas Gleixner 已提交
840
 *
841
 * If the event is attached to a task which is on a CPU we use a smp
T
Thomas Gleixner 已提交
842 843
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
844 845
 *
 * Must be called with ctx->mutex held.
T
Thomas Gleixner 已提交
846 847
 */
static void
848 849
perf_install_in_context(struct perf_event_context *ctx,
			struct perf_event *event,
T
Thomas Gleixner 已提交
850 851 852 853 854 855
			int cpu)
{
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
856
		 * Per cpu events are installed via an smp call and
857
		 * the install is always successful.
T
Thomas Gleixner 已提交
858 859
		 */
		smp_call_function_single(cpu, __perf_install_in_context,
860
					 event, 1);
T
Thomas Gleixner 已提交
861 862 863 864 865
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_install_in_context,
866
				 event);
T
Thomas Gleixner 已提交
867

868
	raw_spin_lock_irq(&ctx->lock);
T
Thomas Gleixner 已提交
869 870 871
	/*
	 * we need to retry the smp call.
	 */
872
	if (ctx->is_active && list_empty(&event->group_entry)) {
873
		raw_spin_unlock_irq(&ctx->lock);
T
Thomas Gleixner 已提交
874 875 876 877 878
		goto retry;
	}

	/*
	 * The lock prevents that this context is scheduled in so we
879
	 * can add the event safely, if it the call above did not
T
Thomas Gleixner 已提交
880 881
	 * succeed.
	 */
882 883
	if (list_empty(&event->group_entry))
		add_event_to_ctx(event, ctx);
884
	raw_spin_unlock_irq(&ctx->lock);
T
Thomas Gleixner 已提交
885 886
}

887
/*
888
 * Put a event into inactive state and update time fields.
889 890 891 892 893 894
 * Enabling the leader of a group effectively enables all
 * the group members that aren't explicitly disabled, so we
 * have to update their ->tstamp_enabled also.
 * Note: this works for group members as well as group leaders
 * since the non-leader members' sibling_lists will be empty.
 */
895 896
static void __perf_event_mark_enabled(struct perf_event *event,
					struct perf_event_context *ctx)
897
{
898
	struct perf_event *sub;
899

900 901 902 903
	event->state = PERF_EVENT_STATE_INACTIVE;
	event->tstamp_enabled = ctx->time - event->total_time_enabled;
	list_for_each_entry(sub, &event->sibling_list, group_entry)
		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
904 905 906 907
			sub->tstamp_enabled =
				ctx->time - sub->total_time_enabled;
}

908
/*
909
 * Cross CPU call to enable a performance event
910
 */
911
static void __perf_event_enable(void *info)
912
{
913
	struct perf_event *event = info;
914
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
915 916
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
917
	int err;
918

919
	/*
920 921
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
922
	 */
923
	if (ctx->task && cpuctx->task_ctx != ctx) {
924
		if (cpuctx->task_ctx || ctx->task != current)
925 926 927
			return;
		cpuctx->task_ctx = ctx;
	}
928

929
	raw_spin_lock(&ctx->lock);
930
	ctx->is_active = 1;
931
	update_context_time(ctx);
932

933
	if (event->state >= PERF_EVENT_STATE_INACTIVE)
934
		goto unlock;
935
	__perf_event_mark_enabled(event, ctx);
936

937 938 939
	if (event->cpu != -1 && event->cpu != smp_processor_id())
		goto unlock;

940
	/*
941
	 * If the event is in a group and isn't the group leader,
942
	 * then don't put it on unless the group is on.
943
	 */
944
	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
945
		goto unlock;
946

947
	if (!group_can_go_on(event, cpuctx, 1)) {
948
		err = -EEXIST;
949
	} else {
950
		perf_disable();
951 952
		if (event == leader)
			err = group_sched_in(event, cpuctx, ctx,
953 954
					     smp_processor_id());
		else
955
			err = event_sched_in(event, cpuctx, ctx,
956
					       smp_processor_id());
957
		perf_enable();
958
	}
959 960 961

	if (err) {
		/*
962
		 * If this event can't go on and it's part of a
963 964
		 * group, then the whole group has to come off.
		 */
965
		if (leader != event)
966
			group_sched_out(leader, cpuctx, ctx);
967
		if (leader->attr.pinned) {
968
			update_group_times(leader);
969
			leader->state = PERF_EVENT_STATE_ERROR;
970
		}
971 972 973
	}

 unlock:
974
	raw_spin_unlock(&ctx->lock);
975 976 977
}

/*
978
 * Enable a event.
979
 *
980 981
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
982
 * remains valid.  This condition is satisfied when called through
983 984
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
985
 */
986
void perf_event_enable(struct perf_event *event)
987
{
988
	struct perf_event_context *ctx = event->ctx;
989 990 991 992
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
993
		 * Enable the event on the cpu that it's on
994
		 */
995 996
		smp_call_function_single(event->cpu, __perf_event_enable,
					 event, 1);
997 998 999
		return;
	}

1000
	raw_spin_lock_irq(&ctx->lock);
1001
	if (event->state >= PERF_EVENT_STATE_INACTIVE)
1002 1003 1004
		goto out;

	/*
1005 1006
	 * If the event is in error state, clear that first.
	 * That way, if we see the event in error state below, we
1007 1008 1009 1010
	 * know that it has gone back into error state, as distinct
	 * from the task having been scheduled away before the
	 * cross-call arrived.
	 */
1011 1012
	if (event->state == PERF_EVENT_STATE_ERROR)
		event->state = PERF_EVENT_STATE_OFF;
1013 1014

 retry:
1015
	raw_spin_unlock_irq(&ctx->lock);
1016
	task_oncpu_function_call(task, __perf_event_enable, event);
1017

1018
	raw_spin_lock_irq(&ctx->lock);
1019 1020

	/*
1021
	 * If the context is active and the event is still off,
1022 1023
	 * we need to retry the cross-call.
	 */
1024
	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
1025 1026 1027 1028 1029 1030
		goto retry;

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
1031 1032
	if (event->state == PERF_EVENT_STATE_OFF)
		__perf_event_mark_enabled(event, ctx);
1033

1034
 out:
1035
	raw_spin_unlock_irq(&ctx->lock);
1036 1037
}

1038
static int perf_event_refresh(struct perf_event *event, int refresh)
1039
{
1040
	/*
1041
	 * not supported on inherited events
1042
	 */
1043
	if (event->attr.inherit)
1044 1045
		return -EINVAL;

1046 1047
	atomic_add(refresh, &event->event_limit);
	perf_event_enable(event);
1048 1049

	return 0;
1050 1051
}

1052 1053 1054 1055 1056 1057 1058 1059 1060
enum event_type_t {
	EVENT_FLEXIBLE = 0x1,
	EVENT_PINNED = 0x2,
	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

static void ctx_sched_out(struct perf_event_context *ctx,
			  struct perf_cpu_context *cpuctx,
			  enum event_type_t event_type)
1061
{
1062
	struct perf_event *event;
1063

1064
	raw_spin_lock(&ctx->lock);
1065
	ctx->is_active = 0;
1066
	if (likely(!ctx->nr_events))
1067
		goto out;
1068
	update_context_time(ctx);
1069

1070
	perf_disable();
1071 1072 1073 1074
	if (!ctx->nr_active)
		goto out_enable;

	if (event_type & EVENT_PINNED)
1075 1076 1077
		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
			group_sched_out(event, cpuctx, ctx);

1078
	if (event_type & EVENT_FLEXIBLE)
1079
		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1080
			group_sched_out(event, cpuctx, ctx);
1081 1082

 out_enable:
1083
	perf_enable();
1084
 out:
1085
	raw_spin_unlock(&ctx->lock);
1086 1087
}

1088 1089 1090
/*
 * Test whether two contexts are equivalent, i.e. whether they
 * have both been cloned from the same version of the same context
1091 1092 1093 1094
 * and they both have the same number of enabled events.
 * If the number of enabled events is the same, then the set
 * of enabled events should be the same, because these are both
 * inherited contexts, therefore we can't access individual events
1095
 * in them directly with an fd; we can only enable/disable all
1096
 * events via prctl, or enable/disable all events in a family
1097 1098
 * via ioctl, which will have the same effect on both contexts.
 */
1099 1100
static int context_equiv(struct perf_event_context *ctx1,
			 struct perf_event_context *ctx2)
1101 1102
{
	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1103
		&& ctx1->parent_gen == ctx2->parent_gen
1104
		&& !ctx1->pin_count && !ctx2->pin_count;
1105 1106
}

1107 1108
static void __perf_event_sync_stat(struct perf_event *event,
				     struct perf_event *next_event)
1109 1110 1111
{
	u64 value;

1112
	if (!event->attr.inherit_stat)
1113 1114 1115
		return;

	/*
1116
	 * Update the event value, we cannot use perf_event_read()
1117 1118
	 * because we're in the middle of a context switch and have IRQs
	 * disabled, which upsets smp_call_function_single(), however
1119
	 * we know the event must be on the current CPU, therefore we
1120 1121
	 * don't need to use it.
	 */
1122 1123
	switch (event->state) {
	case PERF_EVENT_STATE_ACTIVE:
1124 1125
		event->pmu->read(event);
		/* fall-through */
1126

1127 1128
	case PERF_EVENT_STATE_INACTIVE:
		update_event_times(event);
1129 1130 1131 1132 1133 1134 1135
		break;

	default:
		break;
	}

	/*
1136
	 * In order to keep per-task stats reliable we need to flip the event
1137 1138
	 * values when we flip the contexts.
	 */
1139 1140 1141
	value = atomic64_read(&next_event->count);
	value = atomic64_xchg(&event->count, value);
	atomic64_set(&next_event->count, value);
1142

1143 1144
	swap(event->total_time_enabled, next_event->total_time_enabled);
	swap(event->total_time_running, next_event->total_time_running);
1145

1146
	/*
1147
	 * Since we swizzled the values, update the user visible data too.
1148
	 */
1149 1150
	perf_event_update_userpage(event);
	perf_event_update_userpage(next_event);
1151 1152 1153 1154 1155
}

#define list_next_entry(pos, member) \
	list_entry(pos->member.next, typeof(*pos), member)

1156 1157
static void perf_event_sync_stat(struct perf_event_context *ctx,
				   struct perf_event_context *next_ctx)
1158
{
1159
	struct perf_event *event, *next_event;
1160 1161 1162 1163

	if (!ctx->nr_stat)
		return;

1164 1165
	update_context_time(ctx);

1166 1167
	event = list_first_entry(&ctx->event_list,
				   struct perf_event, event_entry);
1168

1169 1170
	next_event = list_first_entry(&next_ctx->event_list,
					struct perf_event, event_entry);
1171

1172 1173
	while (&event->event_entry != &ctx->event_list &&
	       &next_event->event_entry != &next_ctx->event_list) {
1174

1175
		__perf_event_sync_stat(event, next_event);
1176

1177 1178
		event = list_next_entry(event, event_entry);
		next_event = list_next_entry(next_event, event_entry);
1179 1180 1181
	}
}

T
Thomas Gleixner 已提交
1182
/*
1183
 * Called from scheduler to remove the events of the current task,
T
Thomas Gleixner 已提交
1184 1185
 * with interrupts disabled.
 *
1186
 * We stop each event and update the event value in event->count.
T
Thomas Gleixner 已提交
1187
 *
I
Ingo Molnar 已提交
1188
 * This does not protect us against NMI, but disable()
1189 1190 1191
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * not restart the event.
T
Thomas Gleixner 已提交
1192
 */
1193
void perf_event_task_sched_out(struct task_struct *task,
1194
				 struct task_struct *next)
T
Thomas Gleixner 已提交
1195
{
1196
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1197 1198 1199
	struct perf_event_context *ctx = task->perf_event_ctxp;
	struct perf_event_context *next_ctx;
	struct perf_event_context *parent;
1200
	struct pt_regs *regs;
1201
	int do_switch = 1;
T
Thomas Gleixner 已提交
1202

1203
	regs = task_pt_regs(task);
1204
	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1205

1206
	if (likely(!ctx || !cpuctx->task_ctx))
T
Thomas Gleixner 已提交
1207 1208
		return;

1209 1210
	rcu_read_lock();
	parent = rcu_dereference(ctx->parent_ctx);
1211
	next_ctx = next->perf_event_ctxp;
1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222
	if (parent && next_ctx &&
	    rcu_dereference(next_ctx->parent_ctx) == parent) {
		/*
		 * Looks like the two contexts are clones, so we might be
		 * able to optimize the context switch.  We lock both
		 * contexts and check that they are clones under the
		 * lock (including re-checking that neither has been
		 * uncloned in the meantime).  It doesn't matter which
		 * order we take the locks because no other cpu could
		 * be trying to lock both of these tasks.
		 */
1223 1224
		raw_spin_lock(&ctx->lock);
		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1225
		if (context_equiv(ctx, next_ctx)) {
1226 1227
			/*
			 * XXX do we need a memory barrier of sorts
1228
			 * wrt to rcu_dereference() of perf_event_ctxp
1229
			 */
1230 1231
			task->perf_event_ctxp = next_ctx;
			next->perf_event_ctxp = ctx;
1232 1233 1234
			ctx->task = next;
			next_ctx->task = task;
			do_switch = 0;
1235

1236
			perf_event_sync_stat(ctx, next_ctx);
1237
		}
1238 1239
		raw_spin_unlock(&next_ctx->lock);
		raw_spin_unlock(&ctx->lock);
1240
	}
1241
	rcu_read_unlock();
1242

1243
	if (do_switch) {
1244
		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1245 1246
		cpuctx->task_ctx = NULL;
	}
T
Thomas Gleixner 已提交
1247 1248
}

1249 1250
static void task_ctx_sched_out(struct perf_event_context *ctx,
			       enum event_type_t event_type)
1251 1252 1253
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);

1254 1255
	if (!cpuctx->task_ctx)
		return;
1256 1257 1258 1259

	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
		return;

1260
	ctx_sched_out(ctx, cpuctx, event_type);
1261 1262 1263
	cpuctx->task_ctx = NULL;
}

1264 1265 1266
/*
 * Called with IRQs disabled
 */
1267
static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1268
{
1269 1270 1271 1272 1273 1274 1275 1276 1277 1278
	task_ctx_sched_out(ctx, EVENT_ALL);
}

/*
 * Called with IRQs disabled
 */
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type)
{
	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1279 1280
}

1281
static void
1282 1283 1284
ctx_pinned_sched_in(struct perf_event_context *ctx,
		    struct perf_cpu_context *cpuctx,
		    int cpu)
T
Thomas Gleixner 已提交
1285
{
1286
	struct perf_event *event;
T
Thomas Gleixner 已提交
1287

1288 1289
	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
		if (event->state <= PERF_EVENT_STATE_OFF)
1290
			continue;
1291
		if (event->cpu != -1 && event->cpu != cpu)
1292 1293
			continue;

1294 1295
		if (group_can_go_on(event, cpuctx, 1))
			group_sched_in(event, cpuctx, ctx, cpu);
1296 1297 1298 1299 1300

		/*
		 * If this pinned group hasn't been scheduled,
		 * put it in error state.
		 */
1301 1302 1303
		if (event->state == PERF_EVENT_STATE_INACTIVE) {
			update_group_times(event);
			event->state = PERF_EVENT_STATE_ERROR;
1304
		}
1305
	}
1306 1307 1308 1309 1310 1311 1312 1313 1314
}

static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
		      struct perf_cpu_context *cpuctx,
		      int cpu)
{
	struct perf_event *event;
	int can_add_hw = 1;
1315

1316 1317 1318
	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
		/* Ignore events in OFF or ERROR state */
		if (event->state <= PERF_EVENT_STATE_OFF)
1319
			continue;
1320 1321
		/*
		 * Listen to the 'cpu' scheduling filter constraint
1322
		 * of events:
1323
		 */
1324
		if (event->cpu != -1 && event->cpu != cpu)
T
Thomas Gleixner 已提交
1325 1326
			continue;

1327 1328
		if (group_can_go_on(event, cpuctx, can_add_hw))
			if (group_sched_in(event, cpuctx, ctx, cpu))
1329
				can_add_hw = 0;
T
Thomas Gleixner 已提交
1330
	}
1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359
}

static void
ctx_sched_in(struct perf_event_context *ctx,
	     struct perf_cpu_context *cpuctx,
	     enum event_type_t event_type)
{
	int cpu = smp_processor_id();

	raw_spin_lock(&ctx->lock);
	ctx->is_active = 1;
	if (likely(!ctx->nr_events))
		goto out;

	ctx->timestamp = perf_clock();

	perf_disable();

	/*
	 * First go through the list and put on any pinned groups
	 * in order to give them the best chance of going on.
	 */
	if (event_type & EVENT_PINNED)
		ctx_pinned_sched_in(ctx, cpuctx, cpu);

	/* Then walk through the lower prio flexible groups */
	if (event_type & EVENT_FLEXIBLE)
		ctx_flexible_sched_in(ctx, cpuctx, cpu);

1360
	perf_enable();
1361
 out:
1362
	raw_spin_unlock(&ctx->lock);
1363 1364
}

1365 1366 1367 1368 1369 1370 1371 1372
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
			     enum event_type_t event_type)
{
	struct perf_event_context *ctx = &cpuctx->ctx;

	ctx_sched_in(ctx, cpuctx, event_type);
}

1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385
static void task_ctx_sched_in(struct task_struct *task,
			      enum event_type_t event_type)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *ctx = task->perf_event_ctxp;

	if (likely(!ctx))
		return;
	if (cpuctx->task_ctx == ctx)
		return;
	ctx_sched_in(ctx, cpuctx, event_type);
	cpuctx->task_ctx = ctx;
}
1386
/*
1387
 * Called from scheduler to add the events of the current task
1388 1389
 * with interrupts disabled.
 *
1390
 * We restore the event value and then enable it.
1391 1392
 *
 * This does not protect us against NMI, but enable()
1393 1394 1395
 * sets the enabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * keep the event running.
1396
 */
1397
void perf_event_task_sched_in(struct task_struct *task)
1398
{
1399 1400
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *ctx = task->perf_event_ctxp;
T
Thomas Gleixner 已提交
1401

1402 1403
	if (likely(!ctx))
		return;
1404

1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419
	if (cpuctx->task_ctx == ctx)
		return;

	/*
	 * We want to keep the following priority order:
	 * cpu pinned (that don't need to move), task pinned,
	 * cpu flexible, task flexible.
	 */
	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);

	ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);

	cpuctx->task_ctx = ctx;
1420 1421
}

1422 1423
#define MAX_INTERRUPTS (~0ULL)

1424
static void perf_log_throttle(struct perf_event *event, int enable);
1425

1426
static void perf_adjust_period(struct perf_event *event, u64 events)
1427
{
1428
	struct hw_perf_event *hwc = &event->hw;
1429 1430 1431 1432
	u64 period, sample_period;
	s64 delta;

	events *= hwc->sample_period;
1433
	period = div64_u64(events, event->attr.sample_freq);
1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445

	delta = (s64)(period - hwc->sample_period);
	delta = (delta + 7) / 8; /* low pass filter */

	sample_period = hwc->sample_period + delta;

	if (!sample_period)
		sample_period = 1;

	hwc->sample_period = sample_period;
}

1446
static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1447
{
1448 1449
	struct perf_event *event;
	struct hw_perf_event *hwc;
1450
	u64 interrupts, freq;
1451

1452
	raw_spin_lock(&ctx->lock);
1453
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1454
		if (event->state != PERF_EVENT_STATE_ACTIVE)
1455 1456
			continue;

1457 1458 1459
		if (event->cpu != -1 && event->cpu != smp_processor_id())
			continue;

1460
		hwc = &event->hw;
1461 1462 1463

		interrupts = hwc->interrupts;
		hwc->interrupts = 0;
1464

1465
		/*
1466
		 * unthrottle events on the tick
1467
		 */
1468
		if (interrupts == MAX_INTERRUPTS) {
1469 1470 1471
			perf_log_throttle(event, 1);
			event->pmu->unthrottle(event);
			interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1472 1473
		}

1474
		if (!event->attr.freq || !event->attr.sample_freq)
1475 1476
			continue;

1477 1478 1479
		/*
		 * if the specified freq < HZ then we need to skip ticks
		 */
1480 1481
		if (event->attr.sample_freq < HZ) {
			freq = event->attr.sample_freq;
1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494

			hwc->freq_count += freq;
			hwc->freq_interrupts += interrupts;

			if (hwc->freq_count < HZ)
				continue;

			interrupts = hwc->freq_interrupts;
			hwc->freq_interrupts = 0;
			hwc->freq_count -= HZ;
		} else
			freq = HZ;

1495
		perf_adjust_period(event, freq * interrupts);
1496

1497 1498 1499 1500 1501 1502 1503
		/*
		 * In order to avoid being stalled by an (accidental) huge
		 * sample period, force reset the sample period if we didn't
		 * get any events in this freq period.
		 */
		if (!interrupts) {
			perf_disable();
1504
			event->pmu->disable(event);
1505
			atomic64_set(&hwc->period_left, 0);
1506
			event->pmu->enable(event);
1507 1508
			perf_enable();
		}
1509
	}
1510
	raw_spin_unlock(&ctx->lock);
1511 1512
}

1513
/*
1514
 * Round-robin a context's events:
1515
 */
1516
static void rotate_ctx(struct perf_event_context *ctx)
T
Thomas Gleixner 已提交
1517
{
1518
	if (!ctx->nr_events)
T
Thomas Gleixner 已提交
1519 1520
		return;

1521
	raw_spin_lock(&ctx->lock);
1522 1523

	/* Rotate the first entry last of non-pinned groups */
1524
	perf_disable();
1525

1526 1527
	list_rotate_left(&ctx->flexible_groups);

1528
	perf_enable();
T
Thomas Gleixner 已提交
1529

1530
	raw_spin_unlock(&ctx->lock);
1531 1532
}

1533
void perf_event_task_tick(struct task_struct *curr)
1534
{
1535
	struct perf_cpu_context *cpuctx;
1536
	struct perf_event_context *ctx;
1537

1538
	if (!atomic_read(&nr_events))
1539 1540
		return;

1541
	cpuctx = &__get_cpu_var(perf_cpu_context);
1542
	ctx = curr->perf_event_ctxp;
1543

1544
	perf_ctx_adjust_freq(&cpuctx->ctx);
1545
	if (ctx)
1546
		perf_ctx_adjust_freq(ctx);
1547

1548
	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1549
	if (ctx)
1550
		task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
T
Thomas Gleixner 已提交
1551

1552
	rotate_ctx(&cpuctx->ctx);
1553 1554
	if (ctx)
		rotate_ctx(ctx);
1555

1556
	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1557
	if (ctx)
1558
		task_ctx_sched_in(curr, EVENT_FLEXIBLE);
T
Thomas Gleixner 已提交
1559 1560
}

1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575
static int event_enable_on_exec(struct perf_event *event,
				struct perf_event_context *ctx)
{
	if (!event->attr.enable_on_exec)
		return 0;

	event->attr.enable_on_exec = 0;
	if (event->state >= PERF_EVENT_STATE_INACTIVE)
		return 0;

	__perf_event_mark_enabled(event, ctx);

	return 1;
}

1576
/*
1577
 * Enable all of a task's events that have been marked enable-on-exec.
1578 1579
 * This expects task == current.
 */
1580
static void perf_event_enable_on_exec(struct task_struct *task)
1581
{
1582 1583
	struct perf_event_context *ctx;
	struct perf_event *event;
1584 1585
	unsigned long flags;
	int enabled = 0;
1586
	int ret;
1587 1588

	local_irq_save(flags);
1589 1590
	ctx = task->perf_event_ctxp;
	if (!ctx || !ctx->nr_events)
1591 1592
		goto out;

1593
	__perf_event_task_sched_out(ctx);
1594

1595
	raw_spin_lock(&ctx->lock);
1596

1597 1598 1599 1600 1601 1602 1603 1604 1605 1606
	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
		ret = event_enable_on_exec(event, ctx);
		if (ret)
			enabled = 1;
	}

	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
		ret = event_enable_on_exec(event, ctx);
		if (ret)
			enabled = 1;
1607 1608 1609
	}

	/*
1610
	 * Unclone this context if we enabled any event.
1611
	 */
1612 1613
	if (enabled)
		unclone_ctx(ctx);
1614

1615
	raw_spin_unlock(&ctx->lock);
1616

1617
	perf_event_task_sched_in(task);
1618 1619 1620 1621
 out:
	local_irq_restore(flags);
}

T
Thomas Gleixner 已提交
1622
/*
1623
 * Cross CPU call to read the hardware event
T
Thomas Gleixner 已提交
1624
 */
1625
static void __perf_event_read(void *info)
T
Thomas Gleixner 已提交
1626
{
1627
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1628 1629
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
I
Ingo Molnar 已提交
1630

1631 1632 1633 1634
	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu.  If not it has been
	 * scheduled out before the smp call arrived.  In that case
1635 1636
	 * event->count would have been updated to a recent sample
	 * when the event was scheduled out.
1637 1638 1639 1640
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

1641
	raw_spin_lock(&ctx->lock);
P
Peter Zijlstra 已提交
1642
	update_context_time(ctx);
1643
	update_event_times(event);
1644
	raw_spin_unlock(&ctx->lock);
P
Peter Zijlstra 已提交
1645

P
Peter Zijlstra 已提交
1646
	event->pmu->read(event);
T
Thomas Gleixner 已提交
1647 1648
}

1649
static u64 perf_event_read(struct perf_event *event)
T
Thomas Gleixner 已提交
1650 1651
{
	/*
1652 1653
	 * If event is enabled and currently active on a CPU, update the
	 * value in the event structure:
T
Thomas Gleixner 已提交
1654
	 */
1655 1656 1657 1658
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
		smp_call_function_single(event->oncpu,
					 __perf_event_read, event, 1);
	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
P
Peter Zijlstra 已提交
1659 1660 1661
		struct perf_event_context *ctx = event->ctx;
		unsigned long flags;

1662
		raw_spin_lock_irqsave(&ctx->lock, flags);
P
Peter Zijlstra 已提交
1663
		update_context_time(ctx);
1664
		update_event_times(event);
1665
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
T
Thomas Gleixner 已提交
1666 1667
	}

1668
	return atomic64_read(&event->count);
T
Thomas Gleixner 已提交
1669 1670
}

1671
/*
1672
 * Initialize the perf_event context in a task_struct:
1673 1674
 */
static void
1675
__perf_event_init_context(struct perf_event_context *ctx,
1676 1677
			    struct task_struct *task)
{
1678
	raw_spin_lock_init(&ctx->lock);
1679
	mutex_init(&ctx->mutex);
1680 1681
	INIT_LIST_HEAD(&ctx->pinned_groups);
	INIT_LIST_HEAD(&ctx->flexible_groups);
1682 1683 1684 1685 1686
	INIT_LIST_HEAD(&ctx->event_list);
	atomic_set(&ctx->refcount, 1);
	ctx->task = task;
}

1687
static struct perf_event_context *find_get_context(pid_t pid, int cpu)
T
Thomas Gleixner 已提交
1688
{
1689
	struct perf_event_context *ctx;
1690
	struct perf_cpu_context *cpuctx;
T
Thomas Gleixner 已提交
1691
	struct task_struct *task;
1692
	unsigned long flags;
1693
	int err;
T
Thomas Gleixner 已提交
1694

1695
	if (pid == -1 && cpu != -1) {
1696
		/* Must be root to operate on a CPU event: */
1697
		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
T
Thomas Gleixner 已提交
1698 1699
			return ERR_PTR(-EACCES);

1700
		if (cpu < 0 || cpu >= nr_cpumask_bits)
T
Thomas Gleixner 已提交
1701 1702 1703
			return ERR_PTR(-EINVAL);

		/*
1704
		 * We could be clever and allow to attach a event to an
T
Thomas Gleixner 已提交
1705 1706 1707
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
		 */
1708
		if (!cpu_online(cpu))
T
Thomas Gleixner 已提交
1709 1710 1711 1712
			return ERR_PTR(-ENODEV);

		cpuctx = &per_cpu(perf_cpu_context, cpu);
		ctx = &cpuctx->ctx;
1713
		get_ctx(ctx);
T
Thomas Gleixner 已提交
1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729

		return ctx;
	}

	rcu_read_lock();
	if (!pid)
		task = current;
	else
		task = find_task_by_vpid(pid);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	if (!task)
		return ERR_PTR(-ESRCH);

1730
	/*
1731
	 * Can't attach events to a dying task.
1732 1733 1734 1735 1736
	 */
	err = -ESRCH;
	if (task->flags & PF_EXITING)
		goto errout;

T
Thomas Gleixner 已提交
1737
	/* Reuse ptrace permission checks for now. */
1738 1739 1740 1741 1742
	err = -EACCES;
	if (!ptrace_may_access(task, PTRACE_MODE_READ))
		goto errout;

 retry:
1743
	ctx = perf_lock_task_context(task, &flags);
1744
	if (ctx) {
1745
		unclone_ctx(ctx);
1746
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
T
Thomas Gleixner 已提交
1747 1748
	}

1749
	if (!ctx) {
1750
		ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1751 1752 1753
		err = -ENOMEM;
		if (!ctx)
			goto errout;
1754
		__perf_event_init_context(ctx, task);
1755
		get_ctx(ctx);
1756
		if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1757 1758 1759 1760 1761
			/*
			 * We raced with some other task; use
			 * the context they set.
			 */
			kfree(ctx);
1762
			goto retry;
1763
		}
1764
		get_task_struct(task);
1765 1766
	}

1767
	put_task_struct(task);
T
Thomas Gleixner 已提交
1768
	return ctx;
1769 1770 1771 1772

 errout:
	put_task_struct(task);
	return ERR_PTR(err);
T
Thomas Gleixner 已提交
1773 1774
}

L
Li Zefan 已提交
1775 1776
static void perf_event_free_filter(struct perf_event *event);

1777
static void free_event_rcu(struct rcu_head *head)
P
Peter Zijlstra 已提交
1778
{
1779
	struct perf_event *event;
P
Peter Zijlstra 已提交
1780

1781 1782 1783
	event = container_of(head, struct perf_event, rcu_head);
	if (event->ns)
		put_pid_ns(event->ns);
L
Li Zefan 已提交
1784
	perf_event_free_filter(event);
1785
	kfree(event);
P
Peter Zijlstra 已提交
1786 1787
}

1788
static void perf_pending_sync(struct perf_event *event);
1789

1790
static void free_event(struct perf_event *event)
1791
{
1792
	perf_pending_sync(event);
1793

1794 1795 1796 1797 1798 1799 1800 1801
	if (!event->parent) {
		atomic_dec(&nr_events);
		if (event->attr.mmap)
			atomic_dec(&nr_mmap_events);
		if (event->attr.comm)
			atomic_dec(&nr_comm_events);
		if (event->attr.task)
			atomic_dec(&nr_task_events);
1802
	}
1803

1804 1805 1806
	if (event->output) {
		fput(event->output->filp);
		event->output = NULL;
1807 1808
	}

1809 1810
	if (event->destroy)
		event->destroy(event);
1811

1812 1813
	put_ctx(event->ctx);
	call_rcu(&event->rcu_head, free_event_rcu);
1814 1815
}

1816
int perf_event_release_kernel(struct perf_event *event)
T
Thomas Gleixner 已提交
1817
{
1818
	struct perf_event_context *ctx = event->ctx;
T
Thomas Gleixner 已提交
1819

1820
	WARN_ON_ONCE(ctx->parent_ctx);
1821
	mutex_lock(&ctx->mutex);
1822
	perf_event_remove_from_context(event);
1823
	mutex_unlock(&ctx->mutex);
T
Thomas Gleixner 已提交
1824

1825 1826 1827 1828
	mutex_lock(&event->owner->perf_event_mutex);
	list_del_init(&event->owner_entry);
	mutex_unlock(&event->owner->perf_event_mutex);
	put_task_struct(event->owner);
1829

1830
	free_event(event);
T
Thomas Gleixner 已提交
1831 1832 1833

	return 0;
}
1834
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
T
Thomas Gleixner 已提交
1835

1836 1837 1838 1839
/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
1840
{
1841
	struct perf_event *event = file->private_data;
1842

1843
	file->private_data = NULL;
1844

1845
	return perf_event_release_kernel(event);
1846 1847
}

1848
static int perf_event_read_size(struct perf_event *event)
1849 1850 1851 1852 1853
{
	int entry = sizeof(u64); /* value */
	int size = 0;
	int nr = 1;

1854
	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1855 1856
		size += sizeof(u64);

1857
	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1858 1859
		size += sizeof(u64);

1860
	if (event->attr.read_format & PERF_FORMAT_ID)
1861 1862
		entry += sizeof(u64);

1863 1864
	if (event->attr.read_format & PERF_FORMAT_GROUP) {
		nr += event->group_leader->nr_siblings;
1865 1866 1867 1868 1869 1870 1871 1872
		size += sizeof(u64);
	}

	size += entry * nr;

	return size;
}

u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
	struct perf_event *child;
	u64 total = 0;

	*enabled = 0;
	*running = 0;

	mutex_lock(&event->child_mutex);
	total += perf_event_read(event);
	*enabled += event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
	*running += event->total_time_running +
			atomic64_read(&event->child_total_time_running);

	list_for_each_entry(child, &event->child_list, child_list) {
		total += perf_event_read(child);
		*enabled += child->total_time_enabled;
		*running += child->total_time_running;
	}
	mutex_unlock(&event->child_mutex);

	return total;
}
EXPORT_SYMBOL_GPL(perf_event_read_value);

static int perf_event_read_group(struct perf_event *event,
				   u64 read_format, char __user *buf)
{
	struct perf_event *leader = event->group_leader, *sub;
	int n = 0, size = 0, ret = -EFAULT;
	struct perf_event_context *ctx = leader->ctx;
	u64 values[5];
	u64 count, enabled, running;

	mutex_lock(&ctx->mutex);
	count = perf_event_read_value(leader, &enabled, &running);

	values[n++] = 1 + leader->nr_siblings;
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		values[n++] = enabled;
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		values[n++] = running;
	values[n++] = count;
	if (read_format & PERF_FORMAT_ID)
		values[n++] = primary_event_id(leader);

	size = n * sizeof(u64);

	if (copy_to_user(buf, values, size))
		goto unlock;

	ret = size;

	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
		n = 0;

		values[n++] = perf_event_read_value(sub, &enabled, &running);
		if (read_format & PERF_FORMAT_ID)
			values[n++] = primary_event_id(sub);

		size = n * sizeof(u64);

		if (copy_to_user(buf + ret, values, size)) {
			ret = -EFAULT;
			goto unlock;
		}

		ret += size;
	}
unlock:
	mutex_unlock(&ctx->mutex);

	return ret;
}

static int perf_event_read_one(struct perf_event *event,
				 u64 read_format, char __user *buf)
{
	u64 enabled, running;
	u64 values[4];
	int n = 0;

	values[n++] = perf_event_read_value(event, &enabled, &running);
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		values[n++] = enabled;
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		values[n++] = running;
	if (read_format & PERF_FORMAT_ID)
		values[n++] = primary_event_id(event);

	if (copy_to_user(buf, values, n * sizeof(u64)))
		return -EFAULT;

	return n * sizeof(u64);
}

/*
 * Read the performance event - simple non-blocking version for now
 */
static ssize_t
perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
{
	u64 read_format = event->attr.read_format;
	int ret;

	/*
	 * Return end-of-file for a read on an event that is in
	 * error state (i.e. because it was pinned but it couldn't be
	 * scheduled on to the CPU at some point).
	 */
	if (event->state == PERF_EVENT_STATE_ERROR)
		return 0;

	if (count < perf_event_read_size(event))
		return -ENOSPC;

	WARN_ON_ONCE(event->ctx->parent_ctx);
	if (read_format & PERF_FORMAT_GROUP)
		ret = perf_event_read_group(event, read_format, buf);
	else
		ret = perf_event_read_one(event, read_format, buf);

	return ret;
}

static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	struct perf_event *event = file->private_data;

	return perf_read_hw(event, buf, count);
}

static unsigned int perf_poll(struct file *file, poll_table *wait)
{
	struct perf_event *event = file->private_data;
	struct perf_mmap_data *data;
	unsigned int events = POLL_HUP;

	rcu_read_lock();
	data = rcu_dereference(event->data);
	if (data)
		events = atomic_xchg(&data->poll, 0);
	rcu_read_unlock();

	poll_wait(file, &event->waitq, wait);

	return events;
}

static void perf_event_reset(struct perf_event *event)
{
	(void)perf_event_read(event);
	atomic64_set(&event->count, 0);
	perf_event_update_userpage(event);
}

/*
 * Holding the top-level event's child_mutex means that any
 * descendant process that has inherited this event will block
 * in sync_child_event if it goes to exit, thus satisfying the
 * task existence requirements of perf_event_enable/disable.
 */
static void perf_event_for_each_child(struct perf_event *event,
					void (*func)(struct perf_event *))
{
	struct perf_event *child;

	WARN_ON_ONCE(event->ctx->parent_ctx);
	mutex_lock(&event->child_mutex);
	func(event);
	list_for_each_entry(child, &event->child_list, child_list)
		func(child);
	mutex_unlock(&event->child_mutex);
}

static void perf_event_for_each(struct perf_event *event,
				  void (*func)(struct perf_event *))
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *sibling;

	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	event = event->group_leader;

	perf_event_for_each_child(event, func);
	func(event);
	list_for_each_entry(sibling, &event->sibling_list, group_entry)
		perf_event_for_each_child(event, func);
	mutex_unlock(&ctx->mutex);
}

static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
	struct perf_event_context *ctx = event->ctx;
	unsigned long size;
	int ret = 0;
	u64 value;

	if (!event->attr.sample_period)
		return -EINVAL;

	size = copy_from_user(&value, arg, sizeof(value));
	if (size != sizeof(value))
		return -EFAULT;

	if (!value)
		return -EINVAL;

	raw_spin_lock_irq(&ctx->lock);
	if (event->attr.freq) {
		if (value > sysctl_perf_event_sample_rate) {
			ret = -EINVAL;
			goto unlock;
		}

		event->attr.sample_freq = value;
	} else {
		event->attr.sample_period = value;
		event->hw.sample_period = value;
	}
unlock:
	raw_spin_unlock_irq(&ctx->lock);

	return ret;
}
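/*
 * Illustrative user-space sketch (fd is a hypothetical descriptor obtained
 * from the perf_event_open() syscall): the new period is passed by pointer
 * to a u64, matching the copy_from_user() above.
 *
 *	u64 new_period = 200000;
 *
 *	if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period) < 0)
 *		perror("PERF_EVENT_IOC_PERIOD");
 */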

static int perf_event_set_output(struct perf_event *event, int output_fd);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);

static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct perf_event *event = file->private_data;
	void (*func)(struct perf_event *);
	u32 flags = arg;

	switch (cmd) {
	case PERF_EVENT_IOC_ENABLE:
		func = perf_event_enable;
		break;
	case PERF_EVENT_IOC_DISABLE:
		func = perf_event_disable;
		break;
	case PERF_EVENT_IOC_RESET:
		func = perf_event_reset;
		break;

	case PERF_EVENT_IOC_REFRESH:
		return perf_event_refresh(event, arg);

	case PERF_EVENT_IOC_PERIOD:
		return perf_event_period(event, (u64 __user *)arg);

	case PERF_EVENT_IOC_SET_OUTPUT:
		return perf_event_set_output(event, arg);

	case PERF_EVENT_IOC_SET_FILTER:
		return perf_event_set_filter(event, (void __user *)arg);

	default:
		return -ENOTTY;
	}

	if (flags & PERF_IOC_FLAG_GROUP)
		perf_event_for_each(event, func);
	else
		perf_event_for_each_child(event, func);

	return 0;
}

int perf_event_task_enable(void)
{
	struct perf_event *event;

	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry)
		perf_event_for_each_child(event, perf_event_enable);
	mutex_unlock(&current->perf_event_mutex);

	return 0;
}

int perf_event_task_disable(void)
{
	struct perf_event *event;

	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry)
		perf_event_for_each_child(event, perf_event_disable);
	mutex_unlock(&current->perf_event_mutex);

	return 0;
}

#ifndef PERF_EVENT_INDEX_OFFSET
# define PERF_EVENT_INDEX_OFFSET 0
#endif

static int perf_event_index(struct perf_event *event)
{
	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return 0;

	return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
}

/*
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
void perf_event_update_userpage(struct perf_event *event)
{
	struct perf_event_mmap_page *userpg;
	struct perf_mmap_data *data;

	rcu_read_lock();
	data = rcu_dereference(event->data);
	if (!data)
		goto unlock;

	userpg = data->user_page;

	/*
	 * Disable preemption so as to not let the corresponding user-space
	 * spin too long if we get preempted.
	 */
	preempt_disable();
	++userpg->lock;
	barrier();
	userpg->index = perf_event_index(event);
	userpg->offset = atomic64_read(&event->count);
	if (event->state == PERF_EVENT_STATE_ACTIVE)
		userpg->offset -= atomic64_read(&event->hw.prev_count);

	userpg->time_enabled = event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);

	userpg->time_running = event->total_time_running +
			atomic64_read(&event->child_total_time_running);

	barrier();
	++userpg->lock;
	preempt_enable();
unlock:
	rcu_read_unlock();
}
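/*
 * Illustrative user-space reader sketch (not kernel code): ->lock is bumped
 * once before and once after the update above, so a consistent snapshot can
 * be taken with a retry loop such as
 *
 *	struct perf_event_mmap_page *pc = mmapped_page;	/- placeholder -/
 *	u32 seq;
 *	u64 idx, offset;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		idx    = pc->index;
 *		offset = pc->offset;
 *		barrier();
 *	} while (pc->lock != seq);
 *
 * where barrier() stands for whatever compiler/read barrier the user-space
 * toolchain provides; mmapped_page is the first page of the mmap()ed area.
 */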

static unsigned long perf_data_size(struct perf_mmap_data *data)
{
	return data->nr_pages << (PAGE_SHIFT + data->data_order);
}

#ifndef CONFIG_PERF_USE_VMALLOC

/*
 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
 */

static struct page *
perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
{
	if (pgoff > data->nr_pages)
		return NULL;

	if (pgoff == 0)
		return virt_to_page(data->user_page);

	return virt_to_page(data->data_pages[pgoff - 1]);
}

static struct perf_mmap_data *
perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
{
	struct perf_mmap_data *data;
	unsigned long size;
	int i;

	WARN_ON(atomic_read(&event->mmap_count));

	size = sizeof(struct perf_mmap_data);
	size += nr_pages * sizeof(void *);

	data = kzalloc(size, GFP_KERNEL);
	if (!data)
		goto fail;

	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
	if (!data->user_page)
		goto fail_user_page;

	for (i = 0; i < nr_pages; i++) {
		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
		if (!data->data_pages[i])
			goto fail_data_pages;
	}

	data->data_order = 0;
	data->nr_pages = nr_pages;

	return data;

fail_data_pages:
	for (i--; i >= 0; i--)
		free_page((unsigned long)data->data_pages[i]);

	free_page((unsigned long)data->user_page);

fail_user_page:
	kfree(data);

fail:
	return NULL;
}

static void perf_mmap_free_page(unsigned long addr)
{
	struct page *page = virt_to_page((void *)addr);

	page->mapping = NULL;
	__free_page(page);
}

static void perf_mmap_data_free(struct perf_mmap_data *data)
{
	int i;

	perf_mmap_free_page((unsigned long)data->user_page);
	for (i = 0; i < data->nr_pages; i++)
		perf_mmap_free_page((unsigned long)data->data_pages[i]);
	kfree(data);
}

#else

/*
 * Back perf_mmap() with vmalloc memory.
 *
 * Required for architectures that have d-cache aliasing issues.
 */

static struct page *
perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
{
	if (pgoff > (1UL << data->data_order))
		return NULL;

	return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
}

static void perf_mmap_unmark_page(void *addr)
{
	struct page *page = vmalloc_to_page(addr);

	page->mapping = NULL;
}

static void perf_mmap_data_free_work(struct work_struct *work)
{
	struct perf_mmap_data *data;
	void *base;
	int i, nr;

	data = container_of(work, struct perf_mmap_data, work);
	nr = 1 << data->data_order;

	base = data->user_page;
	for (i = 0; i < nr + 1; i++)
		perf_mmap_unmark_page(base + (i * PAGE_SIZE));

	vfree(base);
	kfree(data);
}

static void perf_mmap_data_free(struct perf_mmap_data *data)
{
	schedule_work(&data->work);
}

static struct perf_mmap_data *
perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
{
	struct perf_mmap_data *data;
	unsigned long size;
	void *all_buf;

	WARN_ON(atomic_read(&event->mmap_count));

	size = sizeof(struct perf_mmap_data);
	size += sizeof(void *);

	data = kzalloc(size, GFP_KERNEL);
	if (!data)
		goto fail;

	INIT_WORK(&data->work, perf_mmap_data_free_work);

	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
	if (!all_buf)
		goto fail_all_buf;

	data->user_page = all_buf;
	data->data_pages[0] = all_buf + PAGE_SIZE;
	data->data_order = ilog2(nr_pages);
	data->nr_pages = 1;

	return data;

fail_all_buf:
	kfree(data);

fail:
	return NULL;
}

#endif

static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct perf_event *event = vma->vm_file->private_data;
	struct perf_mmap_data *data;
	int ret = VM_FAULT_SIGBUS;

	if (vmf->flags & FAULT_FLAG_MKWRITE) {
		if (vmf->pgoff == 0)
			ret = 0;
		return ret;
	}

	rcu_read_lock();
	data = rcu_dereference(event->data);
	if (!data)
		goto unlock;

	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
		goto unlock;

	vmf->page = perf_mmap_to_page(data, vmf->pgoff);
	if (!vmf->page)
		goto unlock;

	get_page(vmf->page);
	vmf->page->mapping = vma->vm_file->f_mapping;
	vmf->page->index   = vmf->pgoff;

	ret = 0;
unlock:
	rcu_read_unlock();

	return ret;
}

static void
perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
{
	long max_size = perf_data_size(data);

	atomic_set(&data->lock, -1);

	if (event->attr.watermark) {
		data->watermark = min_t(long, max_size,
					event->attr.wakeup_watermark);
	}

	if (!data->watermark)
		data->watermark = max_size / 2;


	rcu_assign_pointer(event->data, data);
}

static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
{
	struct perf_mmap_data *data;

	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
	perf_mmap_data_free(data);
}

2455
static void perf_mmap_data_release(struct perf_event *event)
2456
{
2457
	struct perf_mmap_data *data = event->data;
2458

2459
	WARN_ON(atomic_read(&event->mmap_count));
2460

2461
	rcu_assign_pointer(event->data, NULL);
2462
	call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2463 2464 2465 2466
}

static void perf_mmap_open(struct vm_area_struct *vma)
{
2467
	struct perf_event *event = vma->vm_file->private_data;
2468

2469
	atomic_inc(&event->mmap_count);
2470 2471 2472 2473
}

static void perf_mmap_close(struct vm_area_struct *vma)
{
2474
	struct perf_event *event = vma->vm_file->private_data;
2475

2476 2477
	WARN_ON_ONCE(event->ctx->parent_ctx);
	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2478
		unsigned long size = perf_data_size(event->data);
2479 2480
		struct user_struct *user = current_user();

2481
		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2482
		vma->vm_mm->locked_vm -= event->data->nr_locked;
2483
		perf_mmap_data_release(event);
2484
		mutex_unlock(&event->mmap_mutex);
2485
	}
2486 2487
}

2488
static const struct vm_operations_struct perf_mmap_vmops = {
2489 2490 2491 2492
	.open		= perf_mmap_open,
	.close		= perf_mmap_close,
	.fault		= perf_mmap_fault,
	.page_mkwrite	= perf_mmap_fault,
2493 2494 2495 2496
};

static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
2497
	struct perf_event *event = file->private_data;
2498
	unsigned long user_locked, user_lock_limit;
2499
	struct user_struct *user = current_user();
2500
	unsigned long locked, lock_limit;
2501
	struct perf_mmap_data *data;
2502 2503
	unsigned long vma_size;
	unsigned long nr_pages;
2504
	long user_extra, extra;
2505
	int ret = 0;
2506

2507
	if (!(vma->vm_flags & VM_SHARED))
2508
		return -EINVAL;
2509 2510 2511 2512

	vma_size = vma->vm_end - vma->vm_start;
	nr_pages = (vma_size / PAGE_SIZE) - 1;

2513 2514 2515 2516 2517
	/*
	 * If we have data pages ensure they're a power-of-two number, so we
	 * can do bitmasks instead of modulo.
	 */
	if (nr_pages != 0 && !is_power_of_2(nr_pages))
2518 2519
		return -EINVAL;

2520
	if (vma_size != PAGE_SIZE * (1 + nr_pages))
2521 2522
		return -EINVAL;

2523 2524
	if (vma->vm_pgoff != 0)
		return -EINVAL;
2525

2526 2527 2528
	WARN_ON_ONCE(event->ctx->parent_ctx);
	mutex_lock(&event->mmap_mutex);
	if (event->output) {
2529 2530 2531 2532
		ret = -EINVAL;
		goto unlock;
	}

2533 2534
	if (atomic_inc_not_zero(&event->mmap_count)) {
		if (nr_pages != event->data->nr_pages)
2535 2536 2537 2538
			ret = -EINVAL;
		goto unlock;
	}

2539
	user_extra = nr_pages + 1;
2540
	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);

	/*
	 * Increase the limit linearly with more CPUs:
	 */
	user_lock_limit *= num_online_cpus();

2547
	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2548

2549 2550 2551
	extra = 0;
	if (user_locked > user_lock_limit)
		extra = user_locked - user_lock_limit;
2552 2553 2554

	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
	lock_limit >>= PAGE_SHIFT;
2555
	locked = vma->vm_mm->locked_vm + extra;
2556

2557 2558
	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
		!capable(CAP_IPC_LOCK)) {
2559 2560 2561
		ret = -EPERM;
		goto unlock;
	}
2562

2563
	WARN_ON(event->data);
2564 2565 2566 2567

	data = perf_mmap_data_alloc(event, nr_pages);
	ret = -ENOMEM;
	if (!data)
2568 2569
		goto unlock;

2570 2571 2572
	ret = 0;
	perf_mmap_data_init(event, data);

2573
	atomic_set(&event->mmap_count, 1);
2574
	atomic_long_add(user_extra, &user->locked_vm);
2575
	vma->vm_mm->locked_vm += extra;
2576
	event->data->nr_locked = extra;
2577
	if (vma->vm_flags & VM_WRITE)
2578
		event->data->writable = 1;
2579

2580
unlock:
2581
	mutex_unlock(&event->mmap_mutex);
2582 2583 2584

	vma->vm_flags |= VM_RESERVED;
	vma->vm_ops = &perf_mmap_vmops;
2585 2586

	return ret;
2587 2588
}
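/*
 * Illustrative user-space sketch (fd is a hypothetical descriptor from
 * perf_event_open()): the mapping must be MAP_SHARED and cover one metadata
 * page plus a power-of-two number of data pages, as checked above.
 *
 *	size_t page = sysconf(_SC_PAGESIZE);
 *	size_t len  = (1 + 8) * page;		/- 8 data pages -/
 *	void *base  = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			   MAP_SHARED, fd, 0);
 *	if (base == MAP_FAILED)
 *		perror("mmap");
 */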

static int perf_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
2592
	struct perf_event *event = filp->private_data;
P
Peter Zijlstra 已提交
2593 2594 2595
	int retval;

	mutex_lock(&inode->i_mutex);
2596
	retval = fasync_helper(fd, filp, on, &event->fasync);
P
Peter Zijlstra 已提交
2597 2598 2599 2600 2601 2602 2603 2604
	mutex_unlock(&inode->i_mutex);

	if (retval < 0)
		return retval;

	return 0;
}

static const struct file_operations perf_fops = {
	.release		= perf_release,
	.read			= perf_read,
	.poll			= perf_poll,
2609 2610
	.unlocked_ioctl		= perf_ioctl,
	.compat_ioctl		= perf_ioctl,
2611
	.mmap			= perf_mmap,
P
Peter Zijlstra 已提交
2612
	.fasync			= perf_fasync,
T
Thomas Gleixner 已提交
2613 2614
};

2615
/*
2616
 * Perf event wakeup
2617 2618 2619 2620 2621
 *
 * If there's data, ensure we set the poll() state and publish everything
 * to user-space before waking everybody up.
 */

2622
void perf_event_wakeup(struct perf_event *event)
2623
{
2624
	wake_up_all(&event->waitq);
2625

2626 2627 2628
	if (event->pending_kill) {
		kill_fasync(&event->fasync, SIGIO, event->pending_kill);
		event->pending_kill = 0;
2629
	}
}

/*
 * Pending wakeups
 *
 * Handle the case where we need to wake up from NMI (or rq->lock) context.
 *
 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
 * single linked list and use cmpxchg() to add entries lockless.
 */

static void perf_pending_event(struct perf_pending_entry *entry)
2642
{
2643 2644
	struct perf_event *event = container_of(entry,
			struct perf_event, pending);
2645

2646 2647 2648
	if (event->pending_disable) {
		event->pending_disable = 0;
		__perf_event_disable(event);
2649 2650
	}

2651 2652 2653
	if (event->pending_wakeup) {
		event->pending_wakeup = 0;
		perf_event_wakeup(event);
2654 2655 2656
	}
}

2657
#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2658

2659
static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2660 2661 2662
	PENDING_TAIL,
};

2663 2664
static void perf_pending_queue(struct perf_pending_entry *entry,
			       void (*func)(struct perf_pending_entry *))
2665
{
2666
	struct perf_pending_entry **head;
2667

2668
	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2669 2670
		return;

2671 2672 2673
	entry->func = func;

	head = &get_cpu_var(perf_pending_head);
2674 2675

	do {
2676 2677
		entry->next = *head;
	} while (cmpxchg(head, entry->next, entry) != entry->next);
2678

2679
	set_perf_event_pending();
2680

2681
	put_cpu_var(perf_pending_head);
2682 2683 2684 2685
}

static int __perf_pending_run(void)
{
2686
	struct perf_pending_entry *list;
2687 2688
	int nr = 0;

2689
	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2690
	while (list != PENDING_TAIL) {
2691 2692
		void (*func)(struct perf_pending_entry *);
		struct perf_pending_entry *entry = list;
2693 2694 2695

		list = list->next;

2696 2697
		func = entry->func;
		entry->next = NULL;
2698 2699 2700 2701 2702 2703 2704
		/*
		 * Ensure we observe the unqueue before we issue the wakeup,
		 * so that we won't be waiting forever.
		 * -- see perf_not_pending().
		 */
		smp_wmb();

2705
		func(entry);
2706 2707 2708 2709 2710 2711
		nr++;
	}

	return nr;
}
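/*
 * Illustrative sketch of the lockless push used by perf_pending_queue()
 * above, written with GCC builtins as a hypothetical user-space helper
 * (not kernel code):
 *
 *	struct entry { struct entry *next; };
 *	struct entry *head = TAIL;	/- TAIL plays the role of PENDING_TAIL -/
 *
 *	void push(struct entry *e)
 *	{
 *		struct entry *old;
 *
 *		do {
 *			old = head;
 *			e->next = old;
 *		} while (!__sync_bool_compare_and_swap(&head, old, e));
 *	}
 *
 * The kernel code additionally uses the NULL -> PENDING_TAIL transition of
 * entry->next so that an entry can only ever be queued once.
 */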

static inline int perf_not_pending(struct perf_event *event)
{
	/*
	 * If we flush on whatever cpu we run, there is a chance we don't
	 * need to wait.
	 */
	get_cpu();
	__perf_pending_run();
	put_cpu();

	/*
	 * Ensure we see the proper queue state before going to sleep
	 * so that we do not miss the wakeup. -- see perf_pending_handle()
	 */
	smp_rmb();
	return event->pending.next == NULL;
}

static void perf_pending_sync(struct perf_event *event)
{
	wait_event(event->waitq, perf_not_pending(event));
}

2735
void perf_event_do_pending(void)
2736 2737 2738 2739
{
	__perf_pending_run();
}

2740 2741 2742 2743
/*
 * Callchain support -- arch specific
 */

2744
__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2745 2746 2747 2748
{
	return NULL;
}

2749 2750 2751
/*
 * Output
 */
2752 2753
static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
			      unsigned long offset, unsigned long head)
2754 2755 2756 2757 2758 2759
{
	unsigned long mask;

	if (!data->writable)
		return true;

2760
	mask = perf_data_size(data) - 1;
2761 2762 2763 2764 2765 2766 2767 2768 2769 2770

	offset = (offset - tail) & mask;
	head   = (head   - tail) & mask;

	if ((int)(head - offset) < 0)
		return false;

	return true;
}

2771
static void perf_output_wakeup(struct perf_output_handle *handle)
2772
{
2773 2774
	atomic_set(&handle->data->poll, POLL_IN);

2775
	if (handle->nmi) {
2776 2777 2778
		handle->event->pending_wakeup = 1;
		perf_pending_queue(&handle->event->pending,
				   perf_pending_event);
2779
	} else
2780
		perf_event_wakeup(handle->event);
2781 2782
}

2783 2784 2785
/*
 * Curious locking construct.
 *
2786 2787
 * We need to ensure a later event_id doesn't publish a head when a former
 * event_id isn't done writing. However since we need to deal with NMIs we
2788 2789 2790 2791 2792 2793
 * cannot fully serialize things.
 *
 * What we do is serialize between CPUs so we only have to deal with NMI
 * nesting on a single CPU.
 *
 * We only publish the head (and generate a wakeup) when the outer-most
2794
 * event_id completes.
2795 2796 2797 2798
 */
static void perf_output_lock(struct perf_output_handle *handle)
{
	struct perf_mmap_data *data = handle->data;
2799
	int cur, cpu = get_cpu();
2800 2801 2802

	handle->locked = 0;

2803 2804 2805 2806 2807 2808 2809 2810
	for (;;) {
		cur = atomic_cmpxchg(&data->lock, -1, cpu);
		if (cur == -1) {
			handle->locked = 1;
			break;
		}
		if (cur == cpu)
			break;
2811 2812

		cpu_relax();
2813
	}
2814 2815 2816 2817 2818
}

static void perf_output_unlock(struct perf_output_handle *handle)
{
	struct perf_mmap_data *data = handle->data;
2819 2820
	unsigned long head;
	int cpu;
2821

2822
	data->done_head = data->head;
2823 2824 2825 2826 2827 2828 2829 2830 2831 2832

	if (!handle->locked)
		goto out;

again:
	/*
	 * The xchg implies a full barrier that ensures all writes are done
	 * before we publish the new head, matched by a rmb() in userspace when
	 * reading this position.
	 */
2833
	while ((head = atomic_long_xchg(&data->done_head, 0)))
2834 2835 2836
		data->user_page->data_head = head;

	/*
2837
	 * NMI can happen here, which means we can miss a done_head update.
2838 2839
	 */

2840
	cpu = atomic_xchg(&data->lock, -1);
2841 2842 2843 2844 2845
	WARN_ON_ONCE(cpu != smp_processor_id());

	/*
	 * Therefore we have to validate we did not indeed do so.
	 */
2846
	if (unlikely(atomic_long_read(&data->done_head))) {
2847 2848 2849
		/*
		 * Since we had it locked, we can lock it again.
		 */
2850
		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2851 2852 2853 2854 2855
			cpu_relax();

		goto again;
	}

2856
	if (atomic_xchg(&data->wakeup, 0))
2857 2858
		perf_output_wakeup(handle);
out:
2859
	put_cpu();
2860 2861
}

2862 2863
void perf_output_copy(struct perf_output_handle *handle,
		      const void *buf, unsigned int len)
2864 2865
{
	unsigned int pages_mask;
2866
	unsigned long offset;
2867 2868 2869 2870 2871 2872 2873 2874
	unsigned int size;
	void **pages;

	offset		= handle->offset;
	pages_mask	= handle->data->nr_pages - 1;
	pages		= handle->data->data_pages;

	do {
2875 2876
		unsigned long page_offset;
		unsigned long page_size;
2877 2878 2879
		int nr;

		nr	    = (offset >> PAGE_SHIFT) & pages_mask;
2880 2881 2882
		page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT);
		page_offset = offset & (page_size - 1);
		size	    = min_t(unsigned int, page_size - page_offset, len);
2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899

		memcpy(pages[nr] + page_offset, buf, size);

		len	    -= size;
		buf	    += size;
		offset	    += size;
	} while (len);

	handle->offset = offset;

	/*
	 * Check we didn't copy past our reservation window, taking the
	 * possible unsigned int wrap into account.
	 */
	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
}

2900
int perf_output_begin(struct perf_output_handle *handle,
2901
		      struct perf_event *event, unsigned int size,
2902
		      int nmi, int sample)
2903
{
2904
	struct perf_event *output_event;
2905
	struct perf_mmap_data *data;
2906
	unsigned long tail, offset, head;
2907 2908 2909 2910 2911 2912
	int have_lost;
	struct {
		struct perf_event_header header;
		u64			 id;
		u64			 lost;
	} lost_event;
2913

2914
	rcu_read_lock();
2915
	/*
2916
	 * For inherited events we send all the output towards the parent.
2917
	 */
2918 2919
	if (event->parent)
		event = event->parent;
2920

2921 2922 2923
	output_event = rcu_dereference(event->output);
	if (output_event)
		event = output_event;
2924

2925
	data = rcu_dereference(event->data);
2926 2927 2928
	if (!data)
		goto out;

2929
	handle->data	= data;
2930
	handle->event	= event;
2931 2932
	handle->nmi	= nmi;
	handle->sample	= sample;
2933

2934
	if (!data->nr_pages)
2935
		goto fail;
2936

2937 2938 2939 2940
	have_lost = atomic_read(&data->lost);
	if (have_lost)
		size += sizeof(lost_event);

2941 2942
	perf_output_lock(handle);

2943
	do {
2944 2945 2946 2947 2948 2949 2950
		/*
		 * Userspace could choose to issue a mb() before updating the
		 * tail pointer. So that all reads will be completed before the
		 * write is issued.
		 */
		tail = ACCESS_ONCE(data->user_page->data_tail);
		smp_rmb();
2951
		offset = head = atomic_long_read(&data->head);
P
Peter Zijlstra 已提交
2952
		head += size;
2953
		if (unlikely(!perf_output_space(data, tail, offset, head)))
2954
			goto fail;
2955
	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2956

2957
	handle->offset	= offset;
2958
	handle->head	= head;
2959

2960
	if (head - tail > data->watermark)
2961
		atomic_set(&data->wakeup, 1);
2962

2963
	if (have_lost) {
2964
		lost_event.header.type = PERF_RECORD_LOST;
2965 2966
		lost_event.header.misc = 0;
		lost_event.header.size = sizeof(lost_event);
2967
		lost_event.id          = event->id;
2968 2969 2970 2971 2972
		lost_event.lost        = atomic_xchg(&data->lost, 0);

		perf_output_put(handle, lost_event);
	}

2973
	return 0;
2974

2975
fail:
2976 2977
	atomic_inc(&data->lost);
	perf_output_unlock(handle);
2978 2979
out:
	rcu_read_unlock();
2980

2981 2982
	return -ENOSPC;
}
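/*
 * Illustrative user-space consumer sketch (not kernel code), matching the
 * data_tail handling above: the reader consumes [data_tail, data_head) and
 * only then publishes the new tail, with the barriers the comment refers to.
 * base, page_size and consume() are placeholders for the application's own
 * mapping and record parser.
 *
 *	struct perf_event_mmap_page *pc = base;	/- first mmap()ed page -/
 *	u64 head = pc->data_head;
 *	rmb();					/- read head before the data -/
 *	consume(base + page_size, pc->data_tail, head);
 *	mb();					/- finish reads before tail store -/
 *	pc->data_tail = head;
 */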
2983

2984
void perf_output_end(struct perf_output_handle *handle)
2985
{
2986
	struct perf_event *event = handle->event;
2987 2988
	struct perf_mmap_data *data = handle->data;

2989
	int wakeup_events = event->attr.wakeup_events;
P
Peter Zijlstra 已提交
2990

2991
	if (handle->sample && wakeup_events) {
2992
		int events = atomic_inc_return(&data->events);
P
Peter Zijlstra 已提交
2993
		if (events >= wakeup_events) {
2994
			atomic_sub(wakeup_events, &data->events);
2995
			atomic_set(&data->wakeup, 1);
P
Peter Zijlstra 已提交
2996
		}
2997 2998 2999
	}

	perf_output_unlock(handle);
3000
	rcu_read_unlock();
3001 3002
}

3003
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3004 3005
{
	/*
3006
	 * only top level events have the pid namespace they were created in
3007
	 */
3008 3009
	if (event->parent)
		event = event->parent;
3010

3011
	return task_tgid_nr_ns(p, event->ns);
3012 3013
}

3014
static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3015 3016
{
	/*
3017
	 * only top level events have the pid namespace they were created in
3018
	 */
3019 3020
	if (event->parent)
		event = event->parent;
3021

3022
	return task_pid_nr_ns(p, event->ns);
3023 3024
}

3025
static void perf_output_read_one(struct perf_output_handle *handle,
3026
				 struct perf_event *event)
3027
{
3028
	u64 read_format = event->attr.read_format;
3029 3030 3031
	u64 values[4];
	int n = 0;

3032
	values[n++] = atomic64_read(&event->count);
3033
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3034 3035
		values[n++] = event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
3036 3037
	}
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3038 3039
		values[n++] = event->total_time_running +
			atomic64_read(&event->child_total_time_running);
3040 3041
	}
	if (read_format & PERF_FORMAT_ID)
3042
		values[n++] = primary_event_id(event);
3043 3044 3045 3046 3047

	perf_output_copy(handle, values, n * sizeof(u64));
}

/*
3048
 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3049 3050
 */
static void perf_output_read_group(struct perf_output_handle *handle,
3051
			    struct perf_event *event)
3052
{
3053 3054
	struct perf_event *leader = event->group_leader, *sub;
	u64 read_format = event->attr.read_format;
3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065
	u64 values[5];
	int n = 0;

	values[n++] = 1 + leader->nr_siblings;

	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		values[n++] = leader->total_time_enabled;

	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		values[n++] = leader->total_time_running;

3066
	if (leader != event)
3067 3068 3069 3070
		leader->pmu->read(leader);

	values[n++] = atomic64_read(&leader->count);
	if (read_format & PERF_FORMAT_ID)
3071
		values[n++] = primary_event_id(leader);
3072 3073 3074

	perf_output_copy(handle, values, n * sizeof(u64));

3075
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3076 3077
		n = 0;

3078
		if (sub != event)
3079 3080 3081 3082
			sub->pmu->read(sub);

		values[n++] = atomic64_read(&sub->count);
		if (read_format & PERF_FORMAT_ID)
3083
			values[n++] = primary_event_id(sub);
3084 3085 3086 3087 3088 3089

		perf_output_copy(handle, values, n * sizeof(u64));
	}
}

static void perf_output_read(struct perf_output_handle *handle,
3090
			     struct perf_event *event)
3091
{
3092 3093
	if (event->attr.read_format & PERF_FORMAT_GROUP)
		perf_output_read_group(handle, event);
3094
	else
3095
		perf_output_read_one(handle, event);
3096 3097
}

3098 3099 3100
void perf_output_sample(struct perf_output_handle *handle,
			struct perf_event_header *header,
			struct perf_sample_data *data,
3101
			struct perf_event *event)
{
	u64 sample_type = data->type;

	perf_output_put(handle, *header);

	if (sample_type & PERF_SAMPLE_IP)
		perf_output_put(handle, data->ip);

	if (sample_type & PERF_SAMPLE_TID)
		perf_output_put(handle, data->tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		perf_output_put(handle, data->time);

	if (sample_type & PERF_SAMPLE_ADDR)
		perf_output_put(handle, data->addr);

	if (sample_type & PERF_SAMPLE_ID)
		perf_output_put(handle, data->id);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		perf_output_put(handle, data->stream_id);

	if (sample_type & PERF_SAMPLE_CPU)
		perf_output_put(handle, data->cpu_entry);

	if (sample_type & PERF_SAMPLE_PERIOD)
		perf_output_put(handle, data->period);

	if (sample_type & PERF_SAMPLE_READ)
3132
		perf_output_read(handle, event);

	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
		if (data->callchain) {
			int size = 1;

			if (data->callchain)
				size += data->callchain->nr;

			size *= sizeof(u64);

			perf_output_copy(handle, data->callchain, size);
		} else {
			u64 nr = 0;
			perf_output_put(handle, nr);
		}
	}

	if (sample_type & PERF_SAMPLE_RAW) {
		if (data->raw) {
			perf_output_put(handle, data->raw->size);
			perf_output_copy(handle, data->raw->data,
					 data->raw->size);
		} else {
			struct {
				u32	size;
				u32	data;
			} raw = {
				.size = sizeof(u32),
				.data = 0,
			};
			perf_output_put(handle, raw);
		}
	}
}

void perf_prepare_sample(struct perf_event_header *header,
			 struct perf_sample_data *data,
3170
			 struct perf_event *event,
3171
			 struct pt_regs *regs)
3172
{
3173
	u64 sample_type = event->attr.sample_type;
3174

3175
	data->type = sample_type;
3176

3177
	header->type = PERF_RECORD_SAMPLE;
3178 3179 3180 3181
	header->size = sizeof(*header);

	header->misc = 0;
	header->misc |= perf_misc_flags(regs);
3182

3183
	if (sample_type & PERF_SAMPLE_IP) {
3184 3185 3186
		data->ip = perf_instruction_pointer(regs);

		header->size += sizeof(data->ip);
3187
	}
3188

3189
	if (sample_type & PERF_SAMPLE_TID) {
3190
		/* namespace issues */
3191 3192
		data->tid_entry.pid = perf_event_pid(event, current);
		data->tid_entry.tid = perf_event_tid(event, current);
3193

3194
		header->size += sizeof(data->tid_entry);
3195 3196
	}

3197
	if (sample_type & PERF_SAMPLE_TIME) {
P
Peter Zijlstra 已提交
3198
		data->time = perf_clock();
3199

3200
		header->size += sizeof(data->time);
3201 3202
	}

3203
	if (sample_type & PERF_SAMPLE_ADDR)
3204
		header->size += sizeof(data->addr);
3205

3206
	if (sample_type & PERF_SAMPLE_ID) {
3207
		data->id = primary_event_id(event);
3208

3209 3210 3211 3212
		header->size += sizeof(data->id);
	}

	if (sample_type & PERF_SAMPLE_STREAM_ID) {
3213
		data->stream_id = event->id;
3214 3215 3216

		header->size += sizeof(data->stream_id);
	}
3217

3218
	if (sample_type & PERF_SAMPLE_CPU) {
3219 3220
		data->cpu_entry.cpu		= raw_smp_processor_id();
		data->cpu_entry.reserved	= 0;
3221

3222
		header->size += sizeof(data->cpu_entry);
3223 3224
	}

3225
	if (sample_type & PERF_SAMPLE_PERIOD)
3226
		header->size += sizeof(data->period);
3227

3228
	if (sample_type & PERF_SAMPLE_READ)
3229
		header->size += perf_event_read_size(event);
3230

3231
	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3232
		int size = 1;
3233

3234 3235 3236 3237 3238 3239
		data->callchain = perf_callchain(regs);

		if (data->callchain)
			size += data->callchain->nr;

		header->size += size * sizeof(u64);
3240 3241
	}

3242
	if (sample_type & PERF_SAMPLE_RAW) {
3243 3244 3245 3246 3247 3248 3249 3250
		int size = sizeof(u32);

		if (data->raw)
			size += data->raw->size;
		else
			size += sizeof(u32);

		WARN_ON_ONCE(size & (sizeof(u64)-1));
3251
		header->size += size;
3252
	}
3253
}
3254

3255
static void perf_event_output(struct perf_event *event, int nmi,
3256 3257 3258 3259 3260
				struct perf_sample_data *data,
				struct pt_regs *regs)
{
	struct perf_output_handle handle;
	struct perf_event_header header;
3261

3262
	perf_prepare_sample(&header, data, event, regs);
P
Peter Zijlstra 已提交
3263

3264
	if (perf_output_begin(&handle, event, header.size, nmi, 1))
3265
		return;
3266

3267
	perf_output_sample(&handle, &header, data, event);
3268

3269
	perf_output_end(&handle);
3270 3271
}

3272
/*
3273
 * read event_id
3274 3275 3276 3277 3278 3279 3280 3281 3282 3283
 */

struct perf_read_event {
	struct perf_event_header	header;

	u32				pid;
	u32				tid;
};

static void
3284
perf_event_read_event(struct perf_event *event,
3285 3286 3287
			struct task_struct *task)
{
	struct perf_output_handle handle;
3288
	struct perf_read_event read_event = {
3289
		.header = {
3290
			.type = PERF_RECORD_READ,
3291
			.misc = 0,
3292
			.size = sizeof(read_event) + perf_event_read_size(event),
3293
		},
3294 3295
		.pid = perf_event_pid(event, task),
		.tid = perf_event_tid(event, task),
3296
	};
3297
	int ret;
3298

3299
	ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3300 3301 3302
	if (ret)
		return;

3303
	perf_output_put(&handle, read_event);
3304
	perf_output_read(&handle, event);
3305

3306 3307 3308
	perf_output_end(&handle);
}

P
Peter Zijlstra 已提交
3309
/*
P
Peter Zijlstra 已提交
3310 3311 3312
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.task
P
Peter Zijlstra 已提交
3313 3314
 */

P
Peter Zijlstra 已提交
3315
struct perf_task_event {
3316
	struct task_struct		*task;
3317
	struct perf_event_context	*task_ctx;
P
Peter Zijlstra 已提交
3318 3319 3320 3321 3322 3323

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				ppid;
P
Peter Zijlstra 已提交
3324 3325
		u32				tid;
		u32				ptid;
3326
		u64				time;
3327
	} event_id;
P
Peter Zijlstra 已提交
3328 3329
};

3330
static void perf_event_task_output(struct perf_event *event,
P
Peter Zijlstra 已提交
3331
				     struct perf_task_event *task_event)
P
Peter Zijlstra 已提交
3332 3333
{
	struct perf_output_handle handle;
3334
	int size;
P
Peter Zijlstra 已提交
3335
	struct task_struct *task = task_event->task;
3336 3337
	int ret;

3338 3339
	size  = task_event->event_id.header.size;
	ret = perf_output_begin(&handle, event, size, 0, 0);
P
Peter Zijlstra 已提交
3340 3341 3342 3343

	if (ret)
		return;

3344 3345
	task_event->event_id.pid = perf_event_pid(event, task);
	task_event->event_id.ppid = perf_event_pid(event, current);
P
Peter Zijlstra 已提交
3346

3347 3348
	task_event->event_id.tid = perf_event_tid(event, task);
	task_event->event_id.ptid = perf_event_tid(event, current);
P
Peter Zijlstra 已提交
3349

3350
	task_event->event_id.time = perf_clock();
3351

3352
	perf_output_put(&handle, task_event->event_id);
3353

P
Peter Zijlstra 已提交
3354 3355 3356
	perf_output_end(&handle);
}

3357
static int perf_event_task_match(struct perf_event *event)
P
Peter Zijlstra 已提交
3358
{
3359 3360 3361
	if (event->cpu != -1 && event->cpu != smp_processor_id())
		return 0;

3362
	if (event->attr.comm || event->attr.mmap || event->attr.task)
P
Peter Zijlstra 已提交
3363 3364 3365 3366 3367
		return 1;

	return 0;
}

3368
static void perf_event_task_ctx(struct perf_event_context *ctx,
P
Peter Zijlstra 已提交
3369
				  struct perf_task_event *task_event)
P
Peter Zijlstra 已提交
3370
{
3371
	struct perf_event *event;
P
Peter Zijlstra 已提交
3372

3373 3374 3375
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_task_match(event))
			perf_event_task_output(event, task_event);
P
Peter Zijlstra 已提交
3376 3377 3378
	}
}

3379
static void perf_event_task_event(struct perf_task_event *task_event)
P
Peter Zijlstra 已提交
3380 3381
{
	struct perf_cpu_context *cpuctx;
3382
	struct perf_event_context *ctx = task_event->task_ctx;
P
Peter Zijlstra 已提交
3383

3384
	rcu_read_lock();
P
Peter Zijlstra 已提交
3385
	cpuctx = &get_cpu_var(perf_cpu_context);
3386
	perf_event_task_ctx(&cpuctx->ctx, task_event);
3387
	if (!ctx)
3388
		ctx = rcu_dereference(task_event->task->perf_event_ctxp);
P
Peter Zijlstra 已提交
3389
	if (ctx)
3390
		perf_event_task_ctx(ctx, task_event);
3391
	put_cpu_var(perf_cpu_context);
P
Peter Zijlstra 已提交
3392 3393 3394
	rcu_read_unlock();
}

3395 3396
static void perf_event_task(struct task_struct *task,
			      struct perf_event_context *task_ctx,
3397
			      int new)
P
Peter Zijlstra 已提交
3398
{
P
Peter Zijlstra 已提交
3399
	struct perf_task_event task_event;
P
Peter Zijlstra 已提交
3400

3401 3402 3403
	if (!atomic_read(&nr_comm_events) &&
	    !atomic_read(&nr_mmap_events) &&
	    !atomic_read(&nr_task_events))
P
Peter Zijlstra 已提交
3404 3405
		return;

P
Peter Zijlstra 已提交
3406
	task_event = (struct perf_task_event){
3407 3408
		.task	  = task,
		.task_ctx = task_ctx,
3409
		.event_id    = {
P
Peter Zijlstra 已提交
3410
			.header = {
3411
				.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3412
				.misc = 0,
3413
				.size = sizeof(task_event.event_id),
P
Peter Zijlstra 已提交
3414
			},
3415 3416
			/* .pid  */
			/* .ppid */
P
Peter Zijlstra 已提交
3417 3418
			/* .tid  */
			/* .ptid */
P
Peter Zijlstra 已提交
3419 3420 3421
		},
	};

3422
	perf_event_task_event(&task_event);
P
Peter Zijlstra 已提交
3423 3424
}

3425
void perf_event_fork(struct task_struct *task)
{
	perf_event_task(task, NULL, 1);
}
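/*
 * For reference, the record emitted above appears in the ring buffer with
 * the layout of struct perf_task_event::event_id (sketch):
 *
 *	struct {
 *		struct perf_event_header header;  /- PERF_RECORD_FORK or _EXIT -/
 *		u32 pid, ppid;
 *		u32 tid, ptid;
 *		u64 time;
 *	};
 */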

3430 3431 3432 3433 3434
/*
 * comm tracking
 */

struct perf_comm_event {
3435 3436
	struct task_struct	*task;
	char			*comm;
3437 3438 3439 3440 3441 3442 3443
	int			comm_size;

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
3444
	} event_id;
3445 3446
};

3447
static void perf_event_comm_output(struct perf_event *event,
3448 3449 3450
				     struct perf_comm_event *comm_event)
{
	struct perf_output_handle handle;
3451 3452
	int size = comm_event->event_id.header.size;
	int ret = perf_output_begin(&handle, event, size, 0, 0);
3453 3454 3455 3456

	if (ret)
		return;

3457 3458
	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3459

3460
	perf_output_put(&handle, comm_event->event_id);
3461 3462 3463 3464 3465
	perf_output_copy(&handle, comm_event->comm,
				   comm_event->comm_size);
	perf_output_end(&handle);
}

3466
static int perf_event_comm_match(struct perf_event *event)
3467
{
3468 3469 3470
	if (event->cpu != -1 && event->cpu != smp_processor_id())
		return 0;

3471
	if (event->attr.comm)
3472 3473 3474 3475 3476
		return 1;

	return 0;
}

3477
static void perf_event_comm_ctx(struct perf_event_context *ctx,
3478 3479
				  struct perf_comm_event *comm_event)
{
3480
	struct perf_event *event;
3481

3482 3483 3484
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_comm_match(event))
			perf_event_comm_output(event, comm_event);
3485 3486 3487
	}
}

3488
static void perf_event_comm_event(struct perf_comm_event *comm_event)
3489 3490
{
	struct perf_cpu_context *cpuctx;
3491
	struct perf_event_context *ctx;
3492
	unsigned int size;
3493
	char comm[TASK_COMM_LEN];
3494

3495
	memset(comm, 0, sizeof(comm));
3496
	strlcpy(comm, comm_event->task->comm, sizeof(comm));
3497
	size = ALIGN(strlen(comm)+1, sizeof(u64));
3498 3499 3500 3501

	comm_event->comm = comm;
	comm_event->comm_size = size;

3502
	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3503

3504
	rcu_read_lock();
3505
	cpuctx = &get_cpu_var(perf_cpu_context);
3506 3507
	perf_event_comm_ctx(&cpuctx->ctx, comm_event);
	ctx = rcu_dereference(current->perf_event_ctxp);
3508
	if (ctx)
3509
		perf_event_comm_ctx(ctx, comm_event);
3510
	put_cpu_var(perf_cpu_context);
3511
	rcu_read_unlock();
3512 3513
}

3514
void perf_event_comm(struct task_struct *task)
3515
{
3516 3517
	struct perf_comm_event comm_event;

3518 3519
	if (task->perf_event_ctxp)
		perf_event_enable_on_exec(task);
3520

3521
	if (!atomic_read(&nr_comm_events))
3522
		return;
3523

3524
	comm_event = (struct perf_comm_event){
3525
		.task	= task,
3526 3527
		/* .comm      */
		/* .comm_size */
3528
		.event_id  = {
3529
			.header = {
3530
				.type = PERF_RECORD_COMM,
3531 3532 3533 3534 3535
				.misc = 0,
				/* .size */
			},
			/* .pid */
			/* .tid */
3536 3537 3538
		},
	};

3539
	perf_event_comm_event(&comm_event);
3540 3541
}

3542 3543 3544 3545 3546
/*
 * mmap tracking
 */

struct perf_mmap_event {
3547 3548 3549 3550
	struct vm_area_struct	*vma;

	const char		*file_name;
	int			file_size;
3551 3552 3553 3554 3555 3556 3557 3558 3559

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
		u64				start;
		u64				len;
		u64				pgoff;
3560
	} event_id;
3561 3562
};

3563
static void perf_event_mmap_output(struct perf_event *event,
3564 3565 3566
				     struct perf_mmap_event *mmap_event)
{
	struct perf_output_handle handle;
3567 3568
	int size = mmap_event->event_id.header.size;
	int ret = perf_output_begin(&handle, event, size, 0, 0);
3569 3570 3571 3572

	if (ret)
		return;

3573 3574
	mmap_event->event_id.pid = perf_event_pid(event, current);
	mmap_event->event_id.tid = perf_event_tid(event, current);
3575

3576
	perf_output_put(&handle, mmap_event->event_id);
3577 3578
	perf_output_copy(&handle, mmap_event->file_name,
				   mmap_event->file_size);
3579
	perf_output_end(&handle);
3580 3581
}

3582
static int perf_event_mmap_match(struct perf_event *event,
3583 3584
				   struct perf_mmap_event *mmap_event)
{
3585 3586 3587
	if (event->cpu != -1 && event->cpu != smp_processor_id())
		return 0;

3588
	if (event->attr.mmap)
3589 3590 3591 3592 3593
		return 1;

	return 0;
}

3594
static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3595 3596
				  struct perf_mmap_event *mmap_event)
{
3597
	struct perf_event *event;
3598

3599 3600 3601
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_mmap_match(event, mmap_event))
			perf_event_mmap_output(event, mmap_event);
3602 3603 3604
	}
}

3605
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3606 3607
{
	struct perf_cpu_context *cpuctx;
3608
	struct perf_event_context *ctx;
3609 3610
	struct vm_area_struct *vma = mmap_event->vma;
	struct file *file = vma->vm_file;
3611 3612 3613
	unsigned int size;
	char tmp[16];
	char *buf = NULL;
3614
	const char *name;
3615

3616 3617
	memset(tmp, 0, sizeof(tmp));

3618
	if (file) {
3619 3620 3621 3622 3623 3624
		/*
		 * d_path works from the end of the buffer backwards, so we
		 * need to add enough zero bytes after the string to handle
		 * the 64bit alignment we do later.
		 */
		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3625 3626 3627 3628
		if (!buf) {
			name = strncpy(tmp, "//enomem", sizeof(tmp));
			goto got_name;
		}
3629
		name = d_path(&file->f_path, buf, PATH_MAX);
3630 3631 3632 3633 3634
		if (IS_ERR(name)) {
			name = strncpy(tmp, "//toolong", sizeof(tmp));
			goto got_name;
		}
	} else {
3635 3636 3637
		if (arch_vma_name(mmap_event->vma)) {
			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
				       sizeof(tmp));
3638
			goto got_name;
3639
		}
3640 3641 3642 3643 3644 3645

		if (!vma->vm_mm) {
			name = strncpy(tmp, "[vdso]", sizeof(tmp));
			goto got_name;
		}

3646 3647 3648 3649 3650
		name = strncpy(tmp, "//anon", sizeof(tmp));
		goto got_name;
	}

got_name:
3651
	size = ALIGN(strlen(name)+1, sizeof(u64));
3652 3653 3654 3655

	mmap_event->file_name = name;
	mmap_event->file_size = size;

3656
	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3657

3658
	rcu_read_lock();
3659
	cpuctx = &get_cpu_var(perf_cpu_context);
3660 3661
	perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
	ctx = rcu_dereference(current->perf_event_ctxp);
3662
	if (ctx)
3663
		perf_event_mmap_ctx(ctx, mmap_event);
3664
	put_cpu_var(perf_cpu_context);
3665 3666
	rcu_read_unlock();

3667 3668 3669
	kfree(buf);
}

3670
void __perf_event_mmap(struct vm_area_struct *vma)
3671
{
3672 3673
	struct perf_mmap_event mmap_event;

3674
	if (!atomic_read(&nr_mmap_events))
3675 3676 3677
		return;

	mmap_event = (struct perf_mmap_event){
3678
		.vma	= vma,
3679 3680
		/* .file_name */
		/* .file_size */
3681
		.event_id  = {
3682
			.header = {
3683
				.type = PERF_RECORD_MMAP,
3684 3685 3686 3687 3688
				.misc = 0,
				/* .size */
			},
			/* .pid */
			/* .tid */
3689 3690 3691
			.start  = vma->vm_start,
			.len    = vma->vm_end - vma->vm_start,
			.pgoff  = vma->vm_pgoff,
3692 3693 3694
		},
	};

3695
	perf_event_mmap_event(&mmap_event);
3696 3697
}

3698 3699 3700 3701
/*
 * IRQ throttle logging
 */

3702
static void perf_log_throttle(struct perf_event *event, int enable)
3703 3704 3705 3706 3707 3708 3709
{
	struct perf_output_handle handle;
	int ret;

	struct {
		struct perf_event_header	header;
		u64				time;
3710
		u64				id;
3711
		u64				stream_id;
3712 3713
	} throttle_event = {
		.header = {
3714
			.type = PERF_RECORD_THROTTLE,
3715 3716 3717
			.misc = 0,
			.size = sizeof(throttle_event),
		},
P
Peter Zijlstra 已提交
3718
		.time		= perf_clock(),
3719 3720
		.id		= primary_event_id(event),
		.stream_id	= event->id,
3721 3722
	};

3723
	if (enable)
3724
		throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3725

3726
	ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3727 3728 3729 3730 3731 3732 3733
	if (ret)
		return;

	perf_output_put(&handle, throttle_event);
	perf_output_end(&handle);
}

3734
/*
3735
 * Generic event overflow handling, sampling.
3736 3737
 */

3738
static int __perf_event_overflow(struct perf_event *event, int nmi,
3739 3740
				   int throttle, struct perf_sample_data *data,
				   struct pt_regs *regs)
3741
{
3742 3743
	int events = atomic_read(&event->event_limit);
	struct hw_perf_event *hwc = &event->hw;
3744 3745
	int ret = 0;

3746
	throttle = (throttle && event->pmu->unthrottle != NULL);
3747

3748
	if (!throttle) {
3749
		hwc->interrupts++;
3750
	} else {
3751 3752
		if (hwc->interrupts != MAX_INTERRUPTS) {
			hwc->interrupts++;
3753
			if (HZ * hwc->interrupts >
3754
					(u64)sysctl_perf_event_sample_rate) {
3755
				hwc->interrupts = MAX_INTERRUPTS;
3756
				perf_log_throttle(event, 0);
3757 3758 3759 3760
				ret = 1;
			}
		} else {
			/*
3761
			 * Keep re-disabling events even though on the previous
3762
			 * pass we disabled it - just in case we raced with a
3763
			 * sched-in and the event got enabled again:
3764
			 */
3765 3766 3767
			ret = 1;
		}
	}
3768

3769
	if (event->attr.freq) {
P
Peter Zijlstra 已提交
3770
		u64 now = perf_clock();
3771 3772 3773 3774 3775
		s64 delta = now - hwc->freq_stamp;

		hwc->freq_stamp = now;

		if (delta > 0 && delta < TICK_NSEC)
3776
			perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3777 3778
	}

3779 3780
	/*
	 * XXX event_limit might not quite work as expected on inherited
3781
	 * events
3782 3783
	 */

3784 3785
	event->pending_kill = POLL_IN;
	if (events && atomic_dec_and_test(&event->event_limit)) {
3786
		ret = 1;
3787
		event->pending_kill = POLL_HUP;
3788
		if (nmi) {
3789 3790 3791
			event->pending_disable = 1;
			perf_pending_queue(&event->pending,
					   perf_pending_event);
3792
		} else
3793
			perf_event_disable(event);
3794 3795
	}

3796 3797 3798 3799 3800
	if (event->overflow_handler)
		event->overflow_handler(event, nmi, data, regs);
	else
		perf_event_output(event, nmi, data, regs);

3801
	return ret;
3802 3803
}

int perf_event_overflow(struct perf_event *event, int nmi,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	return __perf_event_overflow(event, nmi, 1, data, regs);
}
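/*
 * Throttling arithmetic, for illustration (example numbers, HZ is
 * configuration dependent): the check in __perf_event_overflow() fires once
 * HZ * hwc->interrupts exceeds sysctl_perf_event_sample_rate, so with
 * HZ == 1000 and a sample-rate limit of 100000 an event is throttled after
 * more than 100 interrupts within a single tick.
 */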

/*
 * Generic software event infrastructure
 */

/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period event
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 */

static u64 perf_swevent_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 period = hwc->last_period;
	u64 nr, offset;
	s64 old, val;

	hwc->last_period = hwc->sample_period;

again:
	old = val = atomic64_read(&hwc->period_left);
	if (val < 0)
		return 0;

	nr = div64_u64(period + val, period);
	offset = nr * period;
	val -= offset;
	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
		goto again;

	return nr;
}
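/*
 * Worked example (illustrative numbers): with sample_period == 100 and
 * period_left having climbed to 30, the code above computes
 * nr = (100 + 30) / 100 = 1 overflow and rewinds period_left to
 * 30 - 100 = -70, i.e. back into the [-sample_period, 0] range.
 */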

3845
static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3846 3847
				    int nmi, struct perf_sample_data *data,
				    struct pt_regs *regs)
3848
{
3849
	struct hw_perf_event *hwc = &event->hw;
3850
	int throttle = 0;
3851

3852
	data->period = event->hw.last_period;
3853 3854
	if (!overflow)
		overflow = perf_swevent_set_period(event);
3855

3856 3857
	if (hwc->interrupts == MAX_INTERRUPTS)
		return;
3858

3859
	for (; overflow; overflow--) {
3860
		if (__perf_event_overflow(event, nmi, throttle,
3861
					    data, regs)) {
3862 3863 3864 3865 3866 3867
			/*
			 * We inhibit the overflow from happening when
			 * hwc->interrupts == MAX_INTERRUPTS.
			 */
			break;
		}
3868
		throttle = 1;
3869
	}
3870 3871
}

3872
static void perf_swevent_unthrottle(struct perf_event *event)
3873 3874
{
	/*
3875
	 * Nothing to do, we already reset hwc->interrupts.
3876
	 */
3877
}
3878

3879
static void perf_swevent_add(struct perf_event *event, u64 nr,
3880 3881
			       int nmi, struct perf_sample_data *data,
			       struct pt_regs *regs)
3882
{
3883
	struct hw_perf_event *hwc = &event->hw;
3884

3885
	atomic64_add(nr, &event->count);
3886

3887 3888 3889
	if (!regs)
		return;

3890 3891
	if (!hwc->sample_period)
		return;
3892

3893 3894 3895 3896
	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
		return perf_swevent_overflow(event, 1, nmi, data, regs);

	if (atomic64_add_negative(nr, &hwc->period_left))
3897
		return;
3898

3899
	perf_swevent_overflow(event, 0, nmi, data, regs);
3900 3901
}

static int perf_swevent_is_counting(struct perf_event *event)
{
	/*
	 * The event is active, we're good!
	 */
	if (event->state == PERF_EVENT_STATE_ACTIVE)
		return 1;

	/*
	 * The event is off/error, not counting.
	 */
	if (event->state != PERF_EVENT_STATE_INACTIVE)
		return 0;

	/*
	 * The event is inactive, if the context is active
	 * we're part of a group that didn't make it on the 'pmu',
	 * not counting.
	 */
	if (event->ctx->is_active)
		return 0;

	/*
	 * We're inactive and the context is too, this means the
	 * task is scheduled out, we're counting events that happen
	 * to us, like migration events.
	 */
	return 1;
}

static int perf_tp_event_match(struct perf_event *event,
				struct perf_sample_data *data);

static int perf_exclude_event(struct perf_event *event,
			      struct pt_regs *regs)
{
	if (regs) {
		if (event->attr.exclude_user && user_mode(regs))
			return 1;

		if (event->attr.exclude_kernel && !user_mode(regs))
			return 1;
	}

	return 0;
}

static int perf_swevent_match(struct perf_event *event,
				enum perf_type_id type,
				u32 event_id,
				struct perf_sample_data *data,
				struct pt_regs *regs)
{
	if (event->cpu != -1 && event->cpu != smp_processor_id())
		return 0;

	if (!perf_swevent_is_counting(event))
		return 0;

	if (event->attr.type != type)
		return 0;

	if (event->attr.config != event_id)
		return 0;

	if (perf_exclude_event(event, regs))
		return 0;

	if (event->attr.type == PERF_TYPE_TRACEPOINT &&
	    !perf_tp_event_match(event, data))
		return 0;

	return 1;
}

static void perf_swevent_ctx_event(struct perf_event_context *ctx,
				     enum perf_type_id type,
				     u32 event_id, u64 nr, int nmi,
				     struct perf_sample_data *data,
				     struct pt_regs *regs)
{
	struct perf_event *event;

	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_swevent_match(event, type, event_id, data, regs))
			perf_swevent_add(event, nr, nmi, data, regs);
	}
}

int perf_swevent_get_recursion_context(void)
{
	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
	int rctx;

	if (in_nmi())
		rctx = 3;
	else if (in_irq())
		rctx = 2;
	else if (in_softirq())
		rctx = 1;
	else
		rctx = 0;

	if (cpuctx->recursion[rctx]) {
		put_cpu_var(perf_cpu_context);
		return -1;
	}

	cpuctx->recursion[rctx]++;
	barrier();

	return rctx;
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);

void perf_swevent_put_recursion_context(int rctx)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	barrier();
	cpuctx->recursion[rctx]--;
	put_cpu_var(perf_cpu_context);
}
EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);

static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
				    u64 nr, int nmi,
				    struct perf_sample_data *data,
				    struct pt_regs *regs)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;

	cpuctx = &__get_cpu_var(perf_cpu_context);
	rcu_read_lock();
	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
				 nr, nmi, data, regs);
	/*
	 * doesn't really matter which of the child contexts the
	 * event ends up in.
	 */
	ctx = rcu_dereference(current->perf_event_ctxp);
	if (ctx)
		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
	rcu_read_unlock();
}

void __perf_sw_event(u32 event_id, u64 nr, int nmi,
			    struct pt_regs *regs, u64 addr)
{
	struct perf_sample_data data;
	int rctx;

	rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		return;

	data.addr = addr;
	data.raw  = NULL;

	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);

	perf_swevent_put_recursion_context(rctx);
}
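
/*
 * Callers normally reach __perf_sw_event() through the perf_sw_event()
 * helper declared in <linux/perf_event.h>, which only calls in here
 * when the corresponding perf_swevent_enabled[] counter is non-zero.
 * A typical call site looks roughly like the sketch below (illustrative
 * only; regs may be NULL, in which case perf_swevent_add() just counts
 * and never samples):
 *
 *	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
 */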

static void perf_swevent_read(struct perf_event *event)
{
}

static int perf_swevent_enable(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	if (hwc->sample_period) {
		hwc->last_period = hwc->sample_period;
		perf_swevent_set_period(event);
	}
	return 0;
}

static void perf_swevent_disable(struct perf_event *event)
{
}

static const struct pmu perf_ops_generic = {
	.enable		= perf_swevent_enable,
	.disable	= perf_swevent_disable,
	.read		= perf_swevent_read,
	.unthrottle	= perf_swevent_unthrottle,
};

/*
 * hrtimer based swevent callback
 */

static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
	enum hrtimer_restart ret = HRTIMER_RESTART;
	struct perf_sample_data data;
	struct pt_regs *regs;
	struct perf_event *event;
	u64 period;

	event	= container_of(hrtimer, struct perf_event, hw.hrtimer);
	event->pmu->read(event);

	data.addr = 0;
	data.raw = NULL;
	data.period = event->hw.last_period;
	regs = get_irq_regs();
	/*
	 * In case we exclude kernel IPs or are somehow not in interrupt
	 * context, provide the next best thing, the user IP.
	 */
	if ((event->attr.exclude_kernel || !regs) &&
			!event->attr.exclude_user)
		regs = task_pt_regs(current);

	if (regs) {
		if (!(event->attr.exclude_idle && current->pid == 0))
			if (perf_event_overflow(event, 0, &data, regs))
				ret = HRTIMER_NORESTART;
	}

	period = max_t(u64, 10000, event->hw.sample_period);
	hrtimer_forward_now(hrtimer, ns_to_ktime(period));

	return ret;
}

static void perf_swevent_start_hrtimer(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swevent_hrtimer;
	if (hwc->sample_period) {
		u64 period;

		if (hwc->remaining) {
			if (hwc->remaining < 0)
				period = 10000;
			else
				period = hwc->remaining;
			hwc->remaining = 0;
		} else {
			period = max_t(u64, 10000, hwc->sample_period);
		}
		__hrtimer_start_range_ns(&hwc->hrtimer,
				ns_to_ktime(period), 0,
				HRTIMER_MODE_REL, 0);
	}
}

static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	if (hwc->sample_period) {
		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
		hwc->remaining = ktime_to_ns(remaining);

		hrtimer_cancel(&hwc->hrtimer);
	}
}

/*
 * Software event: cpu wall time clock
 */

static void cpu_clock_perf_event_update(struct perf_event *event)
{
	int cpu = raw_smp_processor_id();
	s64 prev;
	u64 now;

	now = cpu_clock(cpu);
	prev = atomic64_xchg(&event->hw.prev_count, now);
	atomic64_add(now - prev, &event->count);
}

static int cpu_clock_perf_event_enable(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int cpu = raw_smp_processor_id();

	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
	perf_swevent_start_hrtimer(event);

	return 0;
}

static void cpu_clock_perf_event_disable(struct perf_event *event)
{
	perf_swevent_cancel_hrtimer(event);
	cpu_clock_perf_event_update(event);
}

static void cpu_clock_perf_event_read(struct perf_event *event)
{
	cpu_clock_perf_event_update(event);
}

static const struct pmu perf_ops_cpu_clock = {
	.enable		= cpu_clock_perf_event_enable,
	.disable	= cpu_clock_perf_event_disable,
	.read		= cpu_clock_perf_event_read,
};

/*
 * Software event: task time clock
 */

static void task_clock_perf_event_update(struct perf_event *event, u64 now)
{
	u64 prev;
	s64 delta;

	prev = atomic64_xchg(&event->hw.prev_count, now);
	delta = now - prev;
	atomic64_add(delta, &event->count);
}

static int task_clock_perf_event_enable(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 now;

	now = event->ctx->time;

	atomic64_set(&hwc->prev_count, now);

	perf_swevent_start_hrtimer(event);

	return 0;
}

static void task_clock_perf_event_disable(struct perf_event *event)
{
	perf_swevent_cancel_hrtimer(event);
	task_clock_perf_event_update(event, event->ctx->time);
}

static void task_clock_perf_event_read(struct perf_event *event)
{
	u64 time;

	if (!in_nmi()) {
		update_context_time(event->ctx);
		time = event->ctx->time;
	} else {
		u64 now = perf_clock();
		u64 delta = now - event->ctx->timestamp;
		time = event->ctx->time + delta;
	}

	task_clock_perf_event_update(event, time);
}

static const struct pmu perf_ops_task_clock = {
	.enable		= task_clock_perf_event_enable,
	.disable	= task_clock_perf_event_disable,
	.read		= task_clock_perf_event_read,
};

#ifdef CONFIG_EVENT_TRACING

void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
			  int entry_size)
{
	struct perf_raw_record raw = {
		.size = entry_size,
		.data = record,
	};

	struct perf_sample_data data = {
		.addr = addr,
		.raw = &raw,
	};

	struct pt_regs *regs = get_irq_regs();

	if (!regs)
		regs = task_pt_regs(current);

	/* Trace events already protected against recursion */
	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
				&data, regs);
}
EXPORT_SYMBOL_GPL(perf_tp_event);

static int perf_tp_event_match(struct perf_event *event,
				struct perf_sample_data *data)
{
	void *record = data->raw->data;

	if (likely(!event->filter) || filter_match_preds(event->filter, record))
		return 1;
	return 0;
}

static void tp_perf_event_destroy(struct perf_event *event)
{
	ftrace_profile_disable(event->attr.config);
}

static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
	/*
	 * Raw tracepoint data is a severe data leak, only allow root to
	 * have these.
	 */
	if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
			perf_paranoid_tracepoint_raw() &&
			!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	if (ftrace_profile_enable(event->attr.config))
		return NULL;

	event->destroy = tp_perf_event_destroy;

	return &perf_ops_generic;
}

static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
	char *filter_str;
	int ret;

	if (event->attr.type != PERF_TYPE_TRACEPOINT)
		return -EINVAL;

	filter_str = strndup_user(arg, PAGE_SIZE);
	if (IS_ERR(filter_str))
		return PTR_ERR(filter_str);

	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);

	kfree(filter_str);
	return ret;
}

static void perf_event_free_filter(struct perf_event *event)
{
	ftrace_profile_free_filter(event);
}

#else

static int perf_tp_event_match(struct perf_event *event,
				struct perf_sample_data *data)
{
	return 1;
}

static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
	return NULL;
}

static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
	return -ENOENT;
}

static void perf_event_free_filter(struct perf_event *event)
{
}

#endif /* CONFIG_EVENT_TRACING */

#ifdef CONFIG_HAVE_HW_BREAKPOINT
static void bp_perf_event_destroy(struct perf_event *event)
{
	release_bp_slot(event);
}

static const struct pmu *bp_perf_event_init(struct perf_event *bp)
{
	int err;

	err = register_perf_hw_breakpoint(bp);
	if (err)
		return ERR_PTR(err);

	bp->destroy = bp_perf_event_destroy;

	return &perf_ops_bp;
}

void perf_bp_event(struct perf_event *bp, void *data)
{
	struct perf_sample_data sample;
	struct pt_regs *regs = data;

	sample.raw = NULL;
	sample.addr = bp->attr.bp_addr;

	if (!perf_exclude_event(bp, regs))
		perf_swevent_add(bp, 1, 1, &sample, regs);
}
#else
static const struct pmu *bp_perf_event_init(struct perf_event *bp)
{
	return NULL;
}

void perf_bp_event(struct perf_event *bp, void *regs)
{
}
#endif

atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];

static void sw_perf_event_destroy(struct perf_event *event)
{
	u64 event_id = event->attr.config;

	WARN_ON(event->parent);

	atomic_dec(&perf_swevent_enabled[event_id]);
}

static const struct pmu *sw_perf_event_init(struct perf_event *event)
{
	const struct pmu *pmu = NULL;
	u64 event_id = event->attr.config;

	/*
	 * Software events (currently) can't in general distinguish
	 * between user, kernel and hypervisor events.
	 * However, context switches and cpu migrations are considered
	 * to be kernel events, and page faults are never hypervisor
	 * events.
	 */
	switch (event_id) {
	case PERF_COUNT_SW_CPU_CLOCK:
		pmu = &perf_ops_cpu_clock;

		break;
	case PERF_COUNT_SW_TASK_CLOCK:
		/*
		 * If the user instantiates this as a per-cpu event,
		 * use the cpu_clock event instead.
		 */
		if (event->ctx->task)
			pmu = &perf_ops_task_clock;
		else
			pmu = &perf_ops_cpu_clock;

		break;
	case PERF_COUNT_SW_PAGE_FAULTS:
	case PERF_COUNT_SW_PAGE_FAULTS_MIN:
	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
	case PERF_COUNT_SW_CONTEXT_SWITCHES:
	case PERF_COUNT_SW_CPU_MIGRATIONS:
	case PERF_COUNT_SW_ALIGNMENT_FAULTS:
	case PERF_COUNT_SW_EMULATION_FAULTS:
		if (!event->parent) {
			atomic_inc(&perf_swevent_enabled[event_id]);
			event->destroy = sw_perf_event_destroy;
		}
		pmu = &perf_ops_generic;
		break;
	}

	return pmu;
}

/*
 * Allocate and initialize an event structure
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr,
		   int cpu,
		   struct perf_event_context *ctx,
		   struct perf_event *group_leader,
		   struct perf_event *parent_event,
		   perf_overflow_handler_t overflow_handler,
		   gfp_t gfpflags)
{
	const struct pmu *pmu;
	struct perf_event *event;
	struct hw_perf_event *hwc;
	long err;

	event = kzalloc(sizeof(*event), gfpflags);
	if (!event)
		return ERR_PTR(-ENOMEM);

	/*
	 * Single events are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = event;

	mutex_init(&event->child_mutex);
	INIT_LIST_HEAD(&event->child_list);

	INIT_LIST_HEAD(&event->group_entry);
	INIT_LIST_HEAD(&event->event_entry);
	INIT_LIST_HEAD(&event->sibling_list);
	init_waitqueue_head(&event->waitq);

	mutex_init(&event->mmap_mutex);

	event->cpu		= cpu;
	event->attr		= *attr;
	event->group_leader	= group_leader;
	event->pmu		= NULL;
	event->ctx		= ctx;
	event->oncpu		= -1;

	event->parent		= parent_event;

	event->ns		= get_pid_ns(current->nsproxy->pid_ns);
	event->id		= atomic64_inc_return(&perf_event_id);

	event->state		= PERF_EVENT_STATE_INACTIVE;

	if (!overflow_handler && parent_event)
		overflow_handler = parent_event->overflow_handler;

	event->overflow_handler	= overflow_handler;

	if (attr->disabled)
		event->state = PERF_EVENT_STATE_OFF;

	pmu = NULL;

	hwc = &event->hw;
	hwc->sample_period = attr->sample_period;
	if (attr->freq && attr->sample_freq)
		hwc->sample_period = 1;
	hwc->last_period = hwc->sample_period;

	atomic64_set(&hwc->period_left, hwc->sample_period);

	/*
	 * we currently do not support PERF_FORMAT_GROUP on inherited events
	 */
	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
		goto done;

	switch (attr->type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		pmu = hw_perf_event_init(event);
		break;

	case PERF_TYPE_SOFTWARE:
		pmu = sw_perf_event_init(event);
		break;

	case PERF_TYPE_TRACEPOINT:
		pmu = tp_perf_event_init(event);
		break;

	case PERF_TYPE_BREAKPOINT:
		pmu = bp_perf_event_init(event);
		break;

	default:
		break;
	}
done:
	err = 0;
	if (!pmu)
		err = -EINVAL;
	else if (IS_ERR(pmu))
		err = PTR_ERR(pmu);

	if (err) {
		if (event->ns)
			put_pid_ns(event->ns);
		kfree(event);
		return ERR_PTR(err);
	}

	event->pmu = pmu;

	if (!event->parent) {
		atomic_inc(&nr_events);
		if (event->attr.mmap)
			atomic_inc(&nr_mmap_events);
		if (event->attr.comm)
			atomic_inc(&nr_comm_events);
		if (event->attr.task)
			atomic_inc(&nr_task_events);
	}

	return event;
}

static int perf_copy_attr(struct perf_event_attr __user *uattr,
			  struct perf_event_attr *attr)
{
	u32 size;
	int ret;

	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
		return -EFAULT;

	/*
	 * zero the full structure, so that a short copy will be nice.
	 */
	memset(attr, 0, sizeof(*attr));

	ret = get_user(size, &uattr->size);
	if (ret)
		return ret;

	if (size > PAGE_SIZE)	/* silly large */
		goto err_size;

	if (!size)		/* abi compat */
		size = PERF_ATTR_SIZE_VER0;

	if (size < PERF_ATTR_SIZE_VER0)
		goto err_size;

	/*
	 * If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
	if (size > sizeof(*attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		addr = (void __user *)uattr + sizeof(*attr);
		end  = (void __user *)uattr + size;

		for (; addr < end; addr++) {
			ret = get_user(val, addr);
			if (ret)
				return ret;
			if (val)
				goto err_size;
		}
		size = sizeof(*attr);
	}

	ret = copy_from_user(attr, uattr, size);
	if (ret)
		return -EFAULT;

	/*
	 * If the type exists, the corresponding creation will verify
	 * the attr->config.
	 */
	if (attr->type >= PERF_TYPE_MAX)
		return -EINVAL;

	if (attr->__reserved_1 || attr->__reserved_2)
		return -EINVAL;

	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
		return -EINVAL;

	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
		return -EINVAL;

out:
	return ret;

err_size:
	put_user(sizeof(*attr), &uattr->size);
	ret = -E2BIG;
	goto out;
}
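
/*
 * Example of the ABI rule enforced above (the sizes are hypothetical):
 * if user space was built against a newer perf_event_attr of, say, 128
 * bytes while this kernel only knows a 72-byte layout, the copy succeeds
 * only when bytes 72..127 are all zero; otherwise the kernel writes its
 * own sizeof(*attr) back into uattr->size and returns -E2BIG so the
 * caller can retry with the smaller structure.
 */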

static int perf_event_set_output(struct perf_event *event, int output_fd)
{
	struct perf_event *output_event = NULL;
	struct file *output_file = NULL;
	struct perf_event *old_output;
	int fput_needed = 0;
	int ret = -EINVAL;

	if (!output_fd)
		goto set;

	output_file = fget_light(output_fd, &fput_needed);
	if (!output_file)
		return -EBADF;

	if (output_file->f_op != &perf_fops)
		goto out;

	output_event = output_file->private_data;

	/* Don't chain output fds */
	if (output_event->output)
		goto out;

	/* Don't set an output fd when we already have an output channel */
	if (event->data)
		goto out;

	atomic_long_inc(&output_file->f_count);

set:
	mutex_lock(&event->mmap_mutex);
	old_output = event->output;
	rcu_assign_pointer(event->output, output_event);
	mutex_unlock(&event->mmap_mutex);

	if (old_output) {
		/*
		 * we need to make sure no existing perf_output_*()
		 * is still referencing this event.
		 */
		synchronize_rcu();
		fput(old_output->filp);
	}

	ret = 0;
out:
	fput_light(output_file, fput_needed);
	return ret;
}
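
/*
 * Usage note (sketch): with PERF_FLAG_FD_OUTPUT, sys_perf_event_open()
 * below passes its group_fd argument into this function, so the newly
 * created event redirects its samples into the mmap buffer of the event
 * already open on that fd instead of allocating its own; chaining such
 * redirections is refused above.
 */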

/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader event fd
 */
SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_event *event, *group_leader;
	struct perf_event_attr attr;
	struct perf_event_context *ctx;
	struct file *event_file = NULL;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int fput_needed2 = 0;
	int err;

	/* for future expandability... */
	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
		return -EINVAL;

	err = perf_copy_attr(attr_uptr, &attr);
	if (err)
		return err;

	if (!attr.exclude_kernel) {
		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	if (attr.freq) {
		if (attr.sample_freq > sysctl_perf_event_sample_rate)
			return -EINVAL;
	}

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/*
	 * Look up the group leader (we will attach this event to it):
	 */
	group_leader = NULL;
	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
		err = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto err_put_context;
		if (group_file->f_op != &perf_fops)
			goto err_put_context;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_put_context;
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (group_leader->ctx != ctx)
			goto err_put_context;
		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (attr.exclusive || attr.pinned)
			goto err_put_context;
	}

	event = perf_event_alloc(&attr, cpu, ctx, group_leader,
				     NULL, NULL, GFP_KERNEL);
	err = PTR_ERR(event);
	if (IS_ERR(event))
		goto err_put_context;

	err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
	if (err < 0)
		goto err_free_put_context;

	event_file = fget_light(err, &fput_needed2);
	if (!event_file)
		goto err_free_put_context;

	if (flags & PERF_FLAG_FD_OUTPUT) {
		err = perf_event_set_output(event, group_fd);
		if (err)
			goto err_fput_free_put_context;
	}

	event->filp = event_file;
	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, event, cpu);
	++ctx->generation;
	mutex_unlock(&ctx->mutex);

	event->owner = current;
	get_task_struct(current);
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);

err_fput_free_put_context:
	fput_light(event_file, fput_needed2);

err_free_put_context:
	if (err < 0)
		kfree(event);

err_put_context:
	if (err < 0)
		put_ctx(ctx);

	fput_light(group_file, fput_needed);

	return err;
}
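
/*
 * Minimal user-space sketch of this system call (illustrative only; the
 * syscall wrapper and __NR_perf_event_open number depend on the C
 * library and architecture):
 *
 *	struct perf_event_attr attr;
 *	int fd;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.size = sizeof(attr);
 *	attr.type = PERF_TYPE_HARDWARE;
 *	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
 *	attr.disabled = 1;
 *
 *	// pid == 0: current task, cpu == -1: any cpu, no group leader
 *	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 */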

/**
 * perf_event_create_kernel_counter
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu in which the counter is bound
 * @pid: task to profile
 */
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
				 pid_t pid,
				 perf_overflow_handler_t overflow_handler)
{
	struct perf_event *event;
	struct perf_event_context *ctx;
	int err;

	/*
	 * Get the target context (task or percpu):
	 */

	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto err_exit;
	}

	event = perf_event_alloc(attr, cpu, ctx, NULL,
				 NULL, overflow_handler, GFP_KERNEL);
	if (IS_ERR(event)) {
		err = PTR_ERR(event);
		goto err_put_context;
	}

	event->filp = NULL;
	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, event, cpu);
	++ctx->generation;
	mutex_unlock(&ctx->mutex);

	event->owner = current;
	get_task_struct(current);
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);

	return event;

 err_put_context:
	put_ctx(ctx);
 err_exit:
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
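
/*
 * In-kernel usage sketch (illustrative only; in-tree users such as the
 * hw-breakpoint layer drive this interface in a similar way):
 *
 *	struct perf_event_attr attr;
 *	struct perf_event *event;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.size = sizeof(attr);
 *	attr.type = PERF_TYPE_SOFTWARE;
 *	attr.config = PERF_COUNT_SW_CPU_CLOCK;
 *	attr.sample_period = 0;		// pure counting, no sampling
 *
 *	// cpu-bound counter: pid == -1, cpu chosen by the caller
 *	event = perf_event_create_kernel_counter(&attr, cpu, -1, NULL);
 *	if (IS_ERR(event))
 *		return PTR_ERR(event);
 */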

/*
 * inherit an event from parent task to child task:
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event *group_leader,
	      struct perf_event_context *child_ctx)
{
	struct perf_event *child_event;

	/*
	 * Instead of creating recursive hierarchies of events,
	 * we link inherited events back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_event->parent)
		parent_event = parent_event->parent;

	child_event = perf_event_alloc(&parent_event->attr,
					   parent_event->cpu, child_ctx,
					   group_leader, parent_event,
					   NULL, GFP_KERNEL);
	if (IS_ERR(child_event))
		return child_event;
	get_ctx(child_ctx);

	/*
	 * Make the child state follow the state of the parent event,
	 * not its attr.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_event_{en, dis}able_family.
	 */
	if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
		child_event->state = PERF_EVENT_STATE_INACTIVE;
	else
		child_event->state = PERF_EVENT_STATE_OFF;

	if (parent_event->attr.freq)
		child_event->hw.sample_period = parent_event->hw.sample_period;

	child_event->overflow_handler = parent_event->overflow_handler;

	/*
	 * Link it up in the child's context:
	 */
	add_event_to_ctx(child_event, child_ctx);

	/*
	 * Get a reference to the parent filp - we will fput it
	 * when the child event exits. This is safe to do because
	 * we are in the parent and we know that the filp still
	 * exists and has a nonzero count:
	 */
	atomic_long_inc(&parent_event->filp->f_count);

	/*
	 * Link this into the parent event's child list
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_add_tail(&child_event->child_list, &parent_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	return child_event;
}

static int inherit_group(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event_context *child_ctx)
{
	struct perf_event *leader;
	struct perf_event *sub;
	struct perf_event *child_ctr;

	leader = inherit_event(parent_event, parent, parent_ctx,
				 child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
		child_ctr = inherit_event(sub, parent, parent_ctx,
					    child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);
	}
	return 0;
}

static void sync_child_event(struct perf_event *child_event,
			       struct task_struct *child)
{
	struct perf_event *parent_event = child_event->parent;
	u64 child_val;

	if (child_event->attr.inherit_stat)
		perf_event_read_event(child_event, child);

	child_val = atomic64_read(&child_event->count);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_event->count);
	atomic64_add(child_event->total_time_enabled,
		     &parent_event->child_total_time_enabled);
	atomic64_add(child_event->total_time_running,
		     &parent_event->child_total_time_running);

	/*
	 * Remove this event from the parent's list
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_del_init(&child_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	/*
	 * Release the parent event, if this was the last
	 * reference to it.
	 */
	fput(parent_event->filp);
}

static void
__perf_event_exit_task(struct perf_event *child_event,
			 struct perf_event_context *child_ctx,
			 struct task_struct *child)
{
	struct perf_event *parent_event;

	perf_event_remove_from_context(child_event);

	parent_event = child_event->parent;
	/*
	 * It can happen that parent exits first, and has events
	 * that are still around due to the child reference. These
	 * events need to be zapped - but otherwise linger.
	 */
	if (parent_event) {
		sync_child_event(child_event, child);
		free_event(child_event);
	}
}

/*
 * When a child task exits, feed back event values to parent events.
 */
void perf_event_exit_task(struct task_struct *child)
{
	struct perf_event *child_event, *tmp;
	struct perf_event_context *child_ctx;
	unsigned long flags;

	if (likely(!child->perf_event_ctxp)) {
		perf_event_task(child, NULL, 0);
		return;
	}

	local_irq_save(flags);
	/*
	 * We can't reschedule here because interrupts are disabled,
	 * and either child is current or it is a task that can't be
	 * scheduled, so we are now safe from rescheduling changing
	 * our context.
	 */
	child_ctx = child->perf_event_ctxp;
	__perf_event_task_sched_out(child_ctx);

	/*
	 * Take the context lock here so that if find_get_context is
	 * reading child->perf_event_ctxp, we wait until it has
	 * incremented the context's refcount before we do put_ctx below.
	 */
	raw_spin_lock(&child_ctx->lock);
	child->perf_event_ctxp = NULL;
	/*
	 * If this context is a clone; unclone it so it can't get
	 * swapped to another process while we're removing all
	 * the events from it.
	 */
	unclone_ctx(child_ctx);
	update_context_time(child_ctx);
	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can however still
	 * get a few PERF_RECORD_READ events.
	 */
	perf_event_task(child, child_ctx, 0);

	/*
	 * We can recurse on the same lock type through:
	 *
	 *   __perf_event_exit_task()
	 *     sync_child_event()
	 *       fput(parent_event->filp)
	 *         perf_release()
	 *           mutex_lock(&ctx->mutex)
	 *
	 * But since it's the parent context it won't be the same instance.
	 */
	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);

again:
	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
				 group_entry)
		__perf_event_exit_task(child_event, child_ctx, child);

	list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
				 group_entry)
		__perf_event_exit_task(child_event, child_ctx, child);

	/*
	 * If the last event was a group event, it will have appended all
	 * its siblings to the list, but we obtained 'tmp' before that which
	 * will still point to the list head terminating the iteration.
	 */
	if (!list_empty(&child_ctx->pinned_groups) ||
	    !list_empty(&child_ctx->flexible_groups))
		goto again;

	mutex_unlock(&child_ctx->mutex);

	put_ctx(child_ctx);
}

static void perf_free_event(struct perf_event *event,
			    struct perf_event_context *ctx)
{
	struct perf_event *parent = event->parent;

	if (WARN_ON_ONCE(!parent))
		return;

	mutex_lock(&parent->child_mutex);
	list_del_init(&event->child_list);
	mutex_unlock(&parent->child_mutex);

	fput(parent->filp);

	list_del_event(event, ctx);
	free_event(event);
}

/*
 * free an unexposed, unused context as created by inheritance by
 * init_task below, used by fork() in case of fail.
 */
void perf_event_free_task(struct task_struct *task)
{
	struct perf_event_context *ctx = task->perf_event_ctxp;
	struct perf_event *event, *tmp;

	if (!ctx)
		return;

	mutex_lock(&ctx->mutex);
again:
	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
		perf_free_event(event, ctx);

	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
				 group_entry)
		perf_free_event(event, ctx);

	if (!list_empty(&ctx->pinned_groups) ||
	    !list_empty(&ctx->flexible_groups))
		goto again;

	mutex_unlock(&ctx->mutex);

	put_ctx(ctx);
}

static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
		   struct perf_event_context *parent_ctx,
		   struct task_struct *child,
		   int *inherited_all)
{
	int ret;
	struct perf_event_context *child_ctx = child->perf_event_ctxp;

	if (!event->attr.inherit) {
		*inherited_all = 0;
		return 0;
	}

	if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */

		child_ctx = kzalloc(sizeof(struct perf_event_context),
				    GFP_KERNEL);
		if (!child_ctx)
			return -ENOMEM;

		__perf_event_init_context(child_ctx, child);
		child->perf_event_ctxp = child_ctx;
		get_task_struct(child);
	}

	ret = inherit_group(event, parent, parent_ctx,
			    child, child_ctx);

	if (ret)
		*inherited_all = 0;

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
int perf_event_init_task(struct task_struct *child)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	int ret = 0;

	child->perf_event_ctxp = NULL;

	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	if (likely(!parent->perf_event_ctxp))
		return 0;

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent);

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx, child,
					 &inherited_all);
		if (ret)
			break;
	}

	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx, child,
					 &inherited_all);
		if (ret)
			break;
	}

	child_ctx = child->perf_event_ctxp;

	if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 * Note that if the parent is a clone, it could get
		 * uncloned at any point, but that doesn't matter
		 * because the list of events and the generation
		 * count can't have changed since we took the mutex.
		 */
		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);

	return ret;
}

static void __cpuinit perf_event_init_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
	__perf_event_init_context(&cpuctx->ctx, NULL);

	spin_lock(&perf_resource_lock);
	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
	spin_unlock(&perf_resource_lock);

	hw_perf_event_setup(cpu);
}

#ifdef CONFIG_HOTPLUG_CPU
static void __perf_event_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *ctx = &cpuctx->ctx;
	struct perf_event *event, *tmp;

	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
		__perf_event_remove_from_context(event);
	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
		__perf_event_remove_from_context(event);
}
static void perf_event_exit_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_event_context *ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
#endif

static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		perf_event_init_cpu(cpu);
		break;

	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		hw_perf_event_setup_online(cpu);
		break;

	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		perf_event_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

/*
 * This has to have a higher priority than migration_notifier in sched.c.
 */
static struct notifier_block __cpuinitdata perf_cpu_nb = {
	.notifier_call		= perf_cpu_notify,
	.priority		= 20,
};

void __init perf_event_init(void)
{
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
			(void *)(long)smp_processor_id());
	register_cpu_notifier(&perf_cpu_nb);
}

static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
			const char *buf,
			size_t count)
{
	struct perf_cpu_context *cpuctx;
	unsigned long val;
	int err, cpu, mpt;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > perf_max_events)
		return -EINVAL;

	spin_lock(&perf_resource_lock);
	perf_reserved_percpu = val;
	for_each_online_cpu(cpu) {
		cpuctx = &per_cpu(perf_cpu_context, cpu);
		raw_spin_lock_irq(&cpuctx->ctx.lock);
		mpt = min(perf_max_events - cpuctx->ctx.nr_events,
			  perf_max_events - perf_reserved_percpu);
		cpuctx->max_pertask = mpt;
		raw_spin_unlock_irq(&cpuctx->ctx.lock);
	}
	spin_unlock(&perf_resource_lock);

	return count;
}

static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
	unsigned long val;
	int err;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > 1)
		return -EINVAL;

	spin_lock(&perf_resource_lock);
	perf_overcommit = val;
	spin_unlock(&perf_resource_lock);

	return count;
}

static SYSDEV_CLASS_ATTR(
				reserve_percpu,
				0644,
				perf_show_reserve_percpu,
				perf_set_reserve_percpu
			);

static SYSDEV_CLASS_ATTR(
				overcommit,
				0644,
				perf_show_overcommit,
				perf_set_overcommit
			);

static struct attribute *perfclass_attrs[] = {
	&attr_reserve_percpu.attr,
	&attr_overcommit.attr,
	NULL
};

static struct attribute_group perfclass_attr_group = {
	.attrs			= perfclass_attrs,
	.name			= "perf_events",
};

static int __init perf_event_sysfs_init(void)
{
	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				  &perfclass_attr_group);
}
device_initcall(perf_event_sysfs_init);
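
/*
 * The two attributes registered above are exposed through the cpu
 * sysdev class, which typically lands them at
 * /sys/devices/system/cpu/perf_events/{reserve_percpu,overcommit}
 * (the exact path depends on the sysfs layout of the running kernel),
 * e.g.:
 *
 *	# echo 2 > /sys/devices/system/cpu/perf_events/reserve_percpu
 *	# echo 0 > /sys/devices/system/cpu/perf_events/overcommit
 */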