/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright    2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/vmstat.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>

#include <asm/irq_regs.h>

/*
 * Each CPU has a list of per CPU events:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_events __read_mostly = 1;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

static atomic_t nr_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

static inline bool perf_paranoid_tracepoint_raw(void)
{
	return sysctl_perf_event_paranoid > -1;
}

static inline bool perf_paranoid_cpu(void)
{
	return sysctl_perf_event_paranoid > 0;
}

static inline bool perf_paranoid_kernel(void)
{
	return sysctl_perf_event_paranoid > 1;
}

int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */

/*
 * max perf event sample rate
 */
int sysctl_perf_event_sample_rate __read_mostly = 100000;

static atomic64_t perf_event_id;

/*
 * Lock for (sysadmin-configurable) event reservations:
 */
static DEFINE_SPINLOCK(perf_resource_lock);

/*
 * Architecture provided APIs - weak aliases:
 */
extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
{
	return NULL;
}

void __weak hw_perf_disable(void)		{ barrier(); }
void __weak hw_perf_enable(void)		{ barrier(); }

void __weak hw_perf_event_setup(int cpu)	{ barrier(); }
void __weak hw_perf_event_setup_online(int cpu)	{ barrier(); }

int __weak
hw_perf_group_sched_in(struct perf_event *group_leader,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx, int cpu)
{
	return 0;
}

void __weak perf_event_print_debug(void)	{ }

static DEFINE_PER_CPU(int, perf_disable_count);

void __perf_disable(void)
{
	__get_cpu_var(perf_disable_count)++;
}

bool __perf_enable(void)
{
	return !--__get_cpu_var(perf_disable_count);
}

void perf_disable(void)
{
	__perf_disable();
	hw_perf_disable();
}

void perf_enable(void)
{
	if (__perf_enable())
		hw_perf_enable();
}

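/*
 * Usage sketch (illustrative): perf_disable() and perf_enable() nest through
 * the per-CPU perf_disable_count, so the PMU is only re-enabled by the
 * outermost perf_enable().  Callers typically bracket event-list updates
 * like this:
 *
 *	perf_disable();
 *	... manipulate ctx->group_list / ctx->event_list under ctx->lock ...
 *	perf_enable();
 */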
static void get_ctx(struct perf_event_context *ctx)
{
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

static void unclone_ctx(struct perf_event_context *ctx)
{
	if (ctx->parent_ctx) {
		put_ctx(ctx->parent_ctx);
		ctx->parent_ctx = NULL;
	}
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
	u64 id = event->id;

	if (event->parent)
		id = event->parent->id;

	return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
	struct perf_event_context *ctx;

	rcu_read_lock();
 retry:
	ctx = rcu_dereference(task->perf_event_ctxp);
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
		 * perf_event_task_sched_out, though the
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
		spin_lock_irqsave(&ctx->lock, *flags);
		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
			spin_unlock_irqrestore(&ctx->lock, *flags);
			goto retry;
		}

		if (!atomic_inc_not_zero(&ctx->refcount)) {
			spin_unlock_irqrestore(&ctx->lock, *flags);
			ctx = NULL;
		}
	}
	rcu_read_unlock();
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
{
	struct perf_event_context *ctx;
	unsigned long flags;

	ctx = perf_lock_task_context(task, &flags);
	if (ctx) {
		++ctx->pin_count;
		spin_unlock_irqrestore(&ctx->lock, flags);
	}
	return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->lock, flags);
	--ctx->pin_count;
	spin_unlock_irqrestore(&ctx->lock, flags);
	put_ctx(ctx);
}

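/*
 * Usage sketch (illustrative, assuming the usual pin/unpin pairing): a
 * context obtained with perf_pin_task_context() must be released with
 * perf_unpin_context(), which also drops the reference taken by
 * perf_lock_task_context():
 *
 *	ctx = perf_pin_task_context(task);
 *	if (ctx) {
 *		... ctx->task cannot be switched underneath us here ...
 *		perf_unpin_context(ctx);
 *	}
 */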
/*
 * Add an event to the list for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_event *group_leader = event->group_leader;

	/*
	 * Depending on whether it is a standalone or sibling event,
	 * add it straight to the context's event list, or to the group
	 * leader's sibling list:
	 */
	if (group_leader == event)
		list_add_tail(&event->group_entry, &ctx->group_list);
	else {
		list_add_tail(&event->group_entry, &group_leader->sibling_list);
		group_leader->nr_siblings++;
	}

	list_add_rcu(&event->event_entry, &ctx->event_list);
	ctx->nr_events++;
	if (event->attr.inherit_stat)
		ctx->nr_stat++;
}

/*
 * Remove an event from the list for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_event *sibling, *tmp;

	if (list_empty(&event->group_entry))
		return;
	ctx->nr_events--;
	if (event->attr.inherit_stat)
		ctx->nr_stat--;

	list_del_init(&event->group_entry);
	list_del_rcu(&event->event_entry);

	if (event->group_leader != event)
		event->group_leader->nr_siblings--;

	/*
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
	 * to the context list directly:
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {

		list_move_tail(&sibling->group_entry, &ctx->group_list);
		sibling->group_leader = sibling;
	}
}

static void
event_sched_out(struct perf_event *event,
		  struct perf_cpu_context *cpuctx,
		  struct perf_event_context *ctx)
{
	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	event->state = PERF_EVENT_STATE_INACTIVE;
	if (event->pending_disable) {
		event->pending_disable = 0;
		event->state = PERF_EVENT_STATE_OFF;
	}
	event->tstamp_stopped = ctx->time;
	event->pmu->disable(event);
	event->oncpu = -1;

	if (!is_software_event(event))
		cpuctx->active_oncpu--;
	ctx->nr_active--;
	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;
}

static void
group_sched_out(struct perf_event *group_event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	struct perf_event *event;

	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	event_sched_out(group_event, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry)
		event_sched_out(event, cpuctx, ctx);

	if (group_event->attr.exclusive)
		cpuctx->exclusive = 0;
}

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_event_remove_from_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock(&ctx->lock);
	/*
	 * Protect the list operation against NMI by disabling the
	 * events on a global level.
	 */
	perf_disable();

	event_sched_out(event, cpuctx, ctx);

	list_del_event(event, ctx);

	if (!ctx->task) {
		/*
		 * Allow more per task events with respect to the
		 * reservation:
		 */
		cpuctx->max_pertask =
			min(perf_max_events - ctx->nr_events,
			    perf_max_events - perf_reserved_percpu);
	}

	perf_enable();
	spin_unlock(&ctx->lock);
}


/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * Must be called with ctx->mutex held.
 *
 * CPU events are removed with an smp call. For task events we only
 * call when the task is on a CPU.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_event_remove_from_context(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu events are removed via an smp call and
		 * the removal is always successful.
		 */
		smp_call_function_single(event->cpu,
					 __perf_event_remove_from_context,
					 event, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_event_remove_from_context,
				 event);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 */
	if (ctx->nr_active && !list_empty(&event->group_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can remove the event safely, if the call above did not
	 * succeed.
	 */
	if (!list_empty(&event->group_entry)) {
		list_del_event(event, ctx);
	}
	spin_unlock_irq(&ctx->lock);
}

453
static inline u64 perf_clock(void)
454
{
455
	return cpu_clock(smp_processor_id());
456 457 458 459 460
}

/*
 * Update the record of the current time in a context.
 */
461
static void update_context_time(struct perf_event_context *ctx)
462
{
463 464 465 466
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
467 468 469
}

/*
470
 * Update the total_time_enabled and total_time_running fields for an event.
471
 */
472
static void update_event_times(struct perf_event *event)
473
{
474
	struct perf_event_context *ctx = event->ctx;
475 476
	u64 run_end;

477 478
	if (event->state < PERF_EVENT_STATE_INACTIVE ||
	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
479 480
		return;

481
	event->total_time_enabled = ctx->time - event->tstamp_enabled;
482

483 484
	if (event->state == PERF_EVENT_STATE_INACTIVE)
		run_end = event->tstamp_stopped;
485 486 487
	else
		run_end = ctx->time;

488
	event->total_time_running = run_end - event->tstamp_running;
489 490 491
}

/*
492
 * Update total_time_enabled and total_time_running for all events in a group.
493
 */
494
static void update_group_times(struct perf_event *leader)
495
{
496
	struct perf_event *event;
497

498 499 500
	update_event_times(leader);
	list_for_each_entry(event, &leader->sibling_list, group_entry)
		update_event_times(event);
501 502
}

503
/*
504
 * Cross CPU call to disable a performance event
505
 */
506
static void __perf_event_disable(void *info)
507
{
508
	struct perf_event *event = info;
509
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
510
	struct perf_event_context *ctx = event->ctx;
511 512

	/*
513 514
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
515
	 */
516
	if (ctx->task && cpuctx->task_ctx != ctx)
517 518
		return;

519
	spin_lock(&ctx->lock);
520 521

	/*
522
	 * If the event is on, turn it off.
523 524
	 * If it is in error state, leave it in error state.
	 */
525
	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
526
		update_context_time(ctx);
527 528 529
		update_group_times(event);
		if (event == event->group_leader)
			group_sched_out(event, cpuctx, ctx);
530
		else
531 532
			event_sched_out(event, cpuctx, ctx);
		event->state = PERF_EVENT_STATE_OFF;
533 534
	}

535
	spin_unlock(&ctx->lock);
536 537 538
}

/*
539
 * Disable an event.
540
 *
541 542
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
543
 * remains valid.  This condition is satisfied when called through
544 545 546 547
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in sync_child_event.
 * When called from perf_pending_event it's OK because event->ctx
548
 * is the current context on this CPU and preemption is disabled,
549
 * hence we can't get into perf_event_task_sched_out for this context.
550
 */
551
static void perf_event_disable(struct perf_event *event)
552
{
553
	struct perf_event_context *ctx = event->ctx;
554 555 556 557
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
558
		 * Disable the event on the cpu that it's on
559
		 */
560 561
		smp_call_function_single(event->cpu, __perf_event_disable,
					 event, 1);
562 563 564 565
		return;
	}

 retry:
566
	task_oncpu_function_call(task, __perf_event_disable, event);
567 568 569

	spin_lock_irq(&ctx->lock);
	/*
570
	 * If the event is still active, we need to retry the cross-call.
571
	 */
572
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
573 574 575 576 577 578 579 580
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
581 582 583
	if (event->state == PERF_EVENT_STATE_INACTIVE) {
		update_group_times(event);
		event->state = PERF_EVENT_STATE_OFF;
584
	}
585 586 587 588

	spin_unlock_irq(&ctx->lock);
}

589
static int
590
event_sched_in(struct perf_event *event,
591
		 struct perf_cpu_context *cpuctx,
592
		 struct perf_event_context *ctx,
593 594
		 int cpu)
{
595
	if (event->state <= PERF_EVENT_STATE_OFF)
596 597
		return 0;

598 599
	event->state = PERF_EVENT_STATE_ACTIVE;
	event->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
600 601 602 603 604
	/*
	 * The new state must be visible before we turn it on in the hardware:
	 */
	smp_wmb();

605 606 607
	if (event->pmu->enable(event)) {
		event->state = PERF_EVENT_STATE_INACTIVE;
		event->oncpu = -1;
608 609 610
		return -EAGAIN;
	}

611
	event->tstamp_running += ctx->time - event->tstamp_stopped;
612

613
	if (!is_software_event(event))
614
		cpuctx->active_oncpu++;
615 616
	ctx->nr_active++;

617
	if (event->attr.exclusive)
618 619
		cpuctx->exclusive = 1;

620 621 622
	return 0;
}

623
static int
624
group_sched_in(struct perf_event *group_event,
625
	       struct perf_cpu_context *cpuctx,
626
	       struct perf_event_context *ctx,
627 628
	       int cpu)
{
629
	struct perf_event *event, *partial_group;
630 631
	int ret;

632
	if (group_event->state == PERF_EVENT_STATE_OFF)
633 634
		return 0;

635
	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
636 637 638
	if (ret)
		return ret < 0 ? ret : 0;

639
	if (event_sched_in(group_event, cpuctx, ctx, cpu))
640 641 642 643 644
		return -EAGAIN;

	/*
	 * Schedule in siblings as one group (if any):
	 */
645 646 647
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event_sched_in(event, cpuctx, ctx, cpu)) {
			partial_group = event;
648 649 650 651 652 653 654 655 656 657 658
			goto group_error;
		}
	}

	return 0;

group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
	 */
659 660
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event == partial_group)
661
			break;
662
		event_sched_out(event, cpuctx, ctx);
663
	}
664
	event_sched_out(group_event, cpuctx, ctx);
665 666 667 668

	return -EAGAIN;
}
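/*
 * Note: group scheduling is all-or-nothing.  If any sibling fails
 * event_sched_in(), every member that was already scheduled in, including
 * the group leader, is rolled back with event_sched_out() and the caller
 * sees -EAGAIN as if nothing had been scheduled.
 */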

669
/*
670 671
 * Return 1 for a group consisting entirely of software events,
 * 0 if the group contains any hardware events.
672
 */
673
static int is_software_only_group(struct perf_event *leader)
674
{
675
	struct perf_event *event;
676

677
	if (!is_software_event(leader))
678
		return 0;
679

680 681
	list_for_each_entry(event, &leader->sibling_list, group_entry)
		if (!is_software_event(event))
682
			return 0;
683

684 685 686 687
	return 1;
}

/*
688
 * Work out whether we can put this event group on the CPU now.
689
 */
690
static int group_can_go_on(struct perf_event *event,
691 692 693 694
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
695
	 * Groups consisting entirely of software events can always go on.
696
	 */
697
	if (is_software_only_group(event))
698 699 700
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
701
	 * events can go on.
702 703 704 705 706
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
707
	 * events on the CPU, it can't go on.
708
	 */
709
	if (event->attr.exclusive && cpuctx->active_oncpu)
710 711 712 713 714 715 716 717
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}
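/*
 * Summary of the rules above (illustrative):
 *
 *	software-only group			-> always goes on
 *	cpuctx->exclusive already set		-> nothing else goes on
 *	exclusive group, hw events active	-> does not go on
 *	otherwise				-> inherits can_add_hw
 */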

718 719
static void add_event_to_ctx(struct perf_event *event,
			       struct perf_event_context *ctx)
720
{
721 722 723 724
	list_add_event(event, ctx);
	event->tstamp_enabled = ctx->time;
	event->tstamp_running = ctx->time;
	event->tstamp_stopped = ctx->time;
725 726
}

727
/*
728
 * Cross CPU call to install and enable a performance event
729 730
 *
 * Must be called with ctx->mutex held
731 732 733 734
 */
static void __perf_install_in_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
735 736 737
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
738
	int cpu = smp_processor_id();
739
	int err;
740 741 742 743 744

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
745
	 * Or possibly this is the right context but it isn't
746
	 * on this cpu because it had no events.
747
	 */
748
	if (ctx->task && cpuctx->task_ctx != ctx) {
749
		if (cpuctx->task_ctx || ctx->task != current)
750 751 752
			return;
		cpuctx->task_ctx = ctx;
	}
753

754
	spin_lock(&ctx->lock);
755
	ctx->is_active = 1;
756
	update_context_time(ctx);
757 758 759

	/*
	 * Protect the list operation against NMI by disabling the
760
	 * events on a global level. NOP for non NMI based events.
761
	 */
762
	perf_disable();
763

764
	add_event_to_ctx(event, ctx);
765

766
	/*
767
	 * Don't put the event on if it is disabled or if
768 769
	 * it is in a group and the group isn't on.
	 */
770 771
	if (event->state != PERF_EVENT_STATE_INACTIVE ||
	    (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
772 773
		goto unlock;

774
	/*
775 776 777
	 * An exclusive event can't go on if there are already active
	 * hardware events, and no hardware event can go on if there
	 * is already an exclusive event on.
778
	 */
779
	if (!group_can_go_on(event, cpuctx, 1))
780 781
		err = -EEXIST;
	else
782
		err = event_sched_in(event, cpuctx, ctx, cpu);
783

784 785
	if (err) {
		/*
786
		 * This event couldn't go on.  If it is in a group
787
		 * then we have to pull the whole group off.
788
		 * If the event group is pinned then put it in error state.
789
		 */
790
		if (leader != event)
791
			group_sched_out(leader, cpuctx, ctx);
792
		if (leader->attr.pinned) {
793
			update_group_times(leader);
794
			leader->state = PERF_EVENT_STATE_ERROR;
795
		}
796
	}
797

798
	if (!err && !ctx->task && cpuctx->max_pertask)
799 800
		cpuctx->max_pertask--;

801
 unlock:
802
	perf_enable();
803

804
	spin_unlock(&ctx->lock);
805 806 807
}

/*
808
 * Attach a performance event to a context
809
 *
810 811
 * First we add the event to the list with the hardware enable bit
 * in event->hw_config cleared.
812
 *
813
 * If the event is attached to a task which is on a CPU we use a smp
814 815
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
816 817
 *
 * Must be called with ctx->mutex held.
818 819
 */
static void
820 821
perf_install_in_context(struct perf_event_context *ctx,
			struct perf_event *event,
822 823 824 825 826 827
			int cpu)
{
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
828
		 * Per cpu events are installed via an smp call and
829 830 831
		 * the install is always successful.
		 */
		smp_call_function_single(cpu, __perf_install_in_context,
832
					 event, 1);
833 834 835 836 837
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_install_in_context,
838
				 event);
839 840 841 842 843

	spin_lock_irq(&ctx->lock);
	/*
	 * we need to retry the smp call.
	 */
844
	if (ctx->is_active && list_empty(&event->group_entry)) {
845 846 847 848 849 850
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents that this context is scheduled in so we
851
	 * can add the event safely, if the call above did not
852 853
	 * succeed.
	 */
854 855
	if (list_empty(&event->group_entry))
		add_event_to_ctx(event, ctx);
856 857 858
	spin_unlock_irq(&ctx->lock);
}

859
/*
860
 * Put an event into inactive state and update time fields.
861 862 863 864 865 866
 * Enabling the leader of a group effectively enables all
 * the group members that aren't explicitly disabled, so we
 * have to update their ->tstamp_enabled also.
 * Note: this works for group members as well as group leaders
 * since the non-leader members' sibling_lists will be empty.
 */
867 868
static void __perf_event_mark_enabled(struct perf_event *event,
					struct perf_event_context *ctx)
869
{
870
	struct perf_event *sub;
871

872 873 874 875
	event->state = PERF_EVENT_STATE_INACTIVE;
	event->tstamp_enabled = ctx->time - event->total_time_enabled;
	list_for_each_entry(sub, &event->sibling_list, group_entry)
		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
876 877 878 879
			sub->tstamp_enabled =
				ctx->time - sub->total_time_enabled;
}

880
/*
881
 * Cross CPU call to enable a performance event
882
 */
883
static void __perf_event_enable(void *info)
884
{
885
	struct perf_event *event = info;
886
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
887 888
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
889
	int err;
890

891
	/*
892 893
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
894
	 */
895
	if (ctx->task && cpuctx->task_ctx != ctx) {
896
		if (cpuctx->task_ctx || ctx->task != current)
897 898 899
			return;
		cpuctx->task_ctx = ctx;
	}
900

901
	spin_lock(&ctx->lock);
902
	ctx->is_active = 1;
903
	update_context_time(ctx);
904

905
	if (event->state >= PERF_EVENT_STATE_INACTIVE)
906
		goto unlock;
907
	__perf_event_mark_enabled(event, ctx);
908 909

	/*
910
	 * If the event is in a group and isn't the group leader,
911
	 * then don't put it on unless the group is on.
912
	 */
913
	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
914
		goto unlock;
915

916
	if (!group_can_go_on(event, cpuctx, 1)) {
917
		err = -EEXIST;
918
	} else {
919
		perf_disable();
920 921
		if (event == leader)
			err = group_sched_in(event, cpuctx, ctx,
922 923
					     smp_processor_id());
		else
924
			err = event_sched_in(event, cpuctx, ctx,
925
					       smp_processor_id());
926
		perf_enable();
927
	}
928 929 930

	if (err) {
		/*
931
		 * If this event can't go on and it's part of a
932 933
		 * group, then the whole group has to come off.
		 */
934
		if (leader != event)
935
			group_sched_out(leader, cpuctx, ctx);
936
		if (leader->attr.pinned) {
937
			update_group_times(leader);
938
			leader->state = PERF_EVENT_STATE_ERROR;
939
		}
940 941 942
	}

 unlock:
943
	spin_unlock(&ctx->lock);
944 945 946
}

/*
947
 * Enable an event.
948
 *
949 950
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
951
 * remains valid.  This condition is satisfied when called through
952 953
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
954
 */
955
static void perf_event_enable(struct perf_event *event)
956
{
957
	struct perf_event_context *ctx = event->ctx;
958 959 960 961
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
962
		 * Enable the event on the cpu that it's on
963
		 */
964 965
		smp_call_function_single(event->cpu, __perf_event_enable,
					 event, 1);
966 967 968 969
		return;
	}

	spin_lock_irq(&ctx->lock);
970
	if (event->state >= PERF_EVENT_STATE_INACTIVE)
971 972 973
		goto out;

	/*
974 975
	 * If the event is in error state, clear that first.
	 * That way, if we see the event in error state below, we
976 977 978 979
	 * know that it has gone back into error state, as distinct
	 * from the task having been scheduled away before the
	 * cross-call arrived.
	 */
980 981
	if (event->state == PERF_EVENT_STATE_ERROR)
		event->state = PERF_EVENT_STATE_OFF;
982 983 984

 retry:
	spin_unlock_irq(&ctx->lock);
985
	task_oncpu_function_call(task, __perf_event_enable, event);
986 987 988 989

	spin_lock_irq(&ctx->lock);

	/*
990
	 * If the context is active and the event is still off,
991 992
	 * we need to retry the cross-call.
	 */
993
	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
994 995 996 997 998 999
		goto retry;

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
1000 1001
	if (event->state == PERF_EVENT_STATE_OFF)
		__perf_event_mark_enabled(event, ctx);
1002

1003 1004 1005 1006
 out:
	spin_unlock_irq(&ctx->lock);
}

1007
static int perf_event_refresh(struct perf_event *event, int refresh)
1008
{
1009
	/*
1010
	 * not supported on inherited events
1011
	 */
1012
	if (event->attr.inherit)
1013 1014
		return -EINVAL;

1015 1016
	atomic_add(refresh, &event->event_limit);
	perf_event_enable(event);
1017 1018

	return 0;
1019 1020
}
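/*
 * Usage note (assumption: this is the handler behind the
 * PERF_EVENT_IOC_REFRESH ioctl further down in this file): it raises
 * event->event_limit by 'refresh' and re-enables the event, so the event
 * disables itself again after that many overflow notifications.
 */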

1021
void __perf_event_sched_out(struct perf_event_context *ctx,
1022 1023
			      struct perf_cpu_context *cpuctx)
{
1024
	struct perf_event *event;
1025

1026 1027
	spin_lock(&ctx->lock);
	ctx->is_active = 0;
1028
	if (likely(!ctx->nr_events))
1029
		goto out;
1030
	update_context_time(ctx);
1031

1032
	perf_disable();
1033
	if (ctx->nr_active) {
1034 1035 1036
		list_for_each_entry(event, &ctx->group_list, group_entry) {
			if (event != event->group_leader)
				event_sched_out(event, cpuctx, ctx);
1037
			else
1038
				group_sched_out(event, cpuctx, ctx);
1039
		}
1040
	}
1041
	perf_enable();
1042
 out:
1043 1044 1045
	spin_unlock(&ctx->lock);
}

1046 1047 1048
/*
 * Test whether two contexts are equivalent, i.e. whether they
 * have both been cloned from the same version of the same context
1049 1050 1051 1052
 * and they both have the same number of enabled events.
 * If the number of enabled events is the same, then the set
 * of enabled events should be the same, because these are both
 * inherited contexts, therefore we can't access individual events
1053
 * in them directly with an fd; we can only enable/disable all
1054
 * events via prctl, or enable/disable all events in a family
1055 1056
 * via ioctl, which will have the same effect on both contexts.
 */
1057 1058
static int context_equiv(struct perf_event_context *ctx1,
			 struct perf_event_context *ctx2)
1059 1060
{
	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1061
		&& ctx1->parent_gen == ctx2->parent_gen
1062
		&& !ctx1->pin_count && !ctx2->pin_count;
1063 1064
}

1065
static void __perf_event_read(void *event);
1066

1067 1068
static void __perf_event_sync_stat(struct perf_event *event,
				     struct perf_event *next_event)
1069 1070 1071
{
	u64 value;

1072
	if (!event->attr.inherit_stat)
1073 1074 1075
		return;

	/*
1076
	 * Update the event value, we cannot use perf_event_read()
1077 1078
	 * because we're in the middle of a context switch and have IRQs
	 * disabled, which upsets smp_call_function_single(), however
1079
	 * we know the event must be on the current CPU, therefore we
1080 1081
	 * don't need to use it.
	 */
1082 1083 1084
	switch (event->state) {
	case PERF_EVENT_STATE_ACTIVE:
		__perf_event_read(event);
1085 1086
		break;

1087 1088
	case PERF_EVENT_STATE_INACTIVE:
		update_event_times(event);
1089 1090 1091 1092 1093 1094 1095
		break;

	default:
		break;
	}

	/*
1096
	 * In order to keep per-task stats reliable we need to flip the event
1097 1098
	 * values when we flip the contexts.
	 */
1099 1100 1101
	value = atomic64_read(&next_event->count);
	value = atomic64_xchg(&event->count, value);
	atomic64_set(&next_event->count, value);
1102

1103 1104
	swap(event->total_time_enabled, next_event->total_time_enabled);
	swap(event->total_time_running, next_event->total_time_running);
1105

1106
	/*
1107
	 * Since we swizzled the values, update the user visible data too.
1108
	 */
1109 1110
	perf_event_update_userpage(event);
	perf_event_update_userpage(next_event);
1111 1112 1113 1114 1115
}

#define list_next_entry(pos, member) \
	list_entry(pos->member.next, typeof(*pos), member)

1116 1117
static void perf_event_sync_stat(struct perf_event_context *ctx,
				   struct perf_event_context *next_ctx)
1118
{
1119
	struct perf_event *event, *next_event;
1120 1121 1122 1123

	if (!ctx->nr_stat)
		return;

1124 1125
	event = list_first_entry(&ctx->event_list,
				   struct perf_event, event_entry);
1126

1127 1128
	next_event = list_first_entry(&next_ctx->event_list,
					struct perf_event, event_entry);
1129

1130 1131
	while (&event->event_entry != &ctx->event_list &&
	       &next_event->event_entry != &next_ctx->event_list) {
1132

1133
		__perf_event_sync_stat(event, next_event);
1134

1135 1136
		event = list_next_entry(event, event_entry);
		next_event = list_next_entry(next_event, event_entry);
1137 1138 1139
	}
}

1140
/*
1141
 * Called from scheduler to remove the events of the current task,
1142 1143
 * with interrupts disabled.
 *
1144
 * We stop each event and update the event value in event->count.
1145
 *
1146
 * This does not protect us against NMI, but disable()
1147 1148 1149
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If an NMI hits, then it will
 * not restart the event.
1150
 */
1151
void perf_event_task_sched_out(struct task_struct *task,
1152
				 struct task_struct *next, int cpu)
1153 1154
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1155 1156 1157
	struct perf_event_context *ctx = task->perf_event_ctxp;
	struct perf_event_context *next_ctx;
	struct perf_event_context *parent;
1158
	struct pt_regs *regs;
1159
	int do_switch = 1;
1160

1161
	regs = task_pt_regs(task);
1162
	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1163

1164
	if (likely(!ctx || !cpuctx->task_ctx))
1165 1166
		return;

1167
	update_context_time(ctx);
1168 1169 1170

	rcu_read_lock();
	parent = rcu_dereference(ctx->parent_ctx);
1171
	next_ctx = next->perf_event_ctxp;
1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185
	if (parent && next_ctx &&
	    rcu_dereference(next_ctx->parent_ctx) == parent) {
		/*
		 * Looks like the two contexts are clones, so we might be
		 * able to optimize the context switch.  We lock both
		 * contexts and check that they are clones under the
		 * lock (including re-checking that neither has been
		 * uncloned in the meantime).  It doesn't matter which
		 * order we take the locks because no other cpu could
		 * be trying to lock both of these tasks.
		 */
		spin_lock(&ctx->lock);
		spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
		if (context_equiv(ctx, next_ctx)) {
1186 1187
			/*
			 * XXX do we need a memory barrier of sorts
1188
			 * wrt to rcu_dereference() of perf_event_ctxp
1189
			 */
1190 1191
			task->perf_event_ctxp = next_ctx;
			next->perf_event_ctxp = ctx;
1192 1193 1194
			ctx->task = next;
			next_ctx->task = task;
			do_switch = 0;
1195

1196
			perf_event_sync_stat(ctx, next_ctx);
1197 1198 1199
		}
		spin_unlock(&next_ctx->lock);
		spin_unlock(&ctx->lock);
1200
	}
1201
	rcu_read_unlock();
1202

1203
	if (do_switch) {
1204
		__perf_event_sched_out(ctx, cpuctx);
1205 1206
		cpuctx->task_ctx = NULL;
	}
1207 1208
}

1209 1210 1211
/*
 * Called with IRQs disabled
 */
1212
static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1213 1214 1215
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);

1216 1217
	if (!cpuctx->task_ctx)
		return;
1218 1219 1220 1221

	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
		return;

1222
	__perf_event_sched_out(ctx, cpuctx);
1223 1224 1225
	cpuctx->task_ctx = NULL;
}

1226 1227 1228
/*
 * Called with IRQs disabled
 */
1229
static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1230
{
1231
	__perf_event_sched_out(&cpuctx->ctx, cpuctx);
1232 1233
}

1234
static void
1235
__perf_event_sched_in(struct perf_event_context *ctx,
1236
			struct perf_cpu_context *cpuctx, int cpu)
1237
{
1238
	struct perf_event *event;
1239
	int can_add_hw = 1;
1240

1241 1242
	spin_lock(&ctx->lock);
	ctx->is_active = 1;
1243
	if (likely(!ctx->nr_events))
1244
		goto out;
1245

1246
	ctx->timestamp = perf_clock();
1247

1248
	perf_disable();
1249 1250 1251 1252 1253

	/*
	 * First go through the list and put on any pinned groups
	 * in order to give them the best chance of going on.
	 */
1254 1255 1256
	list_for_each_entry(event, &ctx->group_list, group_entry) {
		if (event->state <= PERF_EVENT_STATE_OFF ||
		    !event->attr.pinned)
1257
			continue;
1258
		if (event->cpu != -1 && event->cpu != cpu)
1259 1260
			continue;

1261 1262
		if (event != event->group_leader)
			event_sched_in(event, cpuctx, ctx, cpu);
1263
		else {
1264 1265
			if (group_can_go_on(event, cpuctx, 1))
				group_sched_in(event, cpuctx, ctx, cpu);
1266
		}
1267 1268 1269 1270 1271

		/*
		 * If this pinned group hasn't been scheduled,
		 * put it in error state.
		 */
1272 1273 1274
		if (event->state == PERF_EVENT_STATE_INACTIVE) {
			update_group_times(event);
			event->state = PERF_EVENT_STATE_ERROR;
1275
		}
1276 1277
	}

1278
	list_for_each_entry(event, &ctx->group_list, group_entry) {
1279
		/*
1280 1281
		 * Ignore events in OFF or ERROR state, and
		 * ignore pinned events since we did them already.
1282
		 */
1283 1284
		if (event->state <= PERF_EVENT_STATE_OFF ||
		    event->attr.pinned)
1285 1286
			continue;

1287 1288
		/*
		 * Listen to the 'cpu' scheduling filter constraint
1289
		 * of events:
1290
		 */
1291
		if (event->cpu != -1 && event->cpu != cpu)
1292 1293
			continue;

1294 1295
		if (event != event->group_leader) {
			if (event_sched_in(event, cpuctx, ctx, cpu))
1296
				can_add_hw = 0;
1297
		} else {
1298 1299
			if (group_can_go_on(event, cpuctx, can_add_hw)) {
				if (group_sched_in(event, cpuctx, ctx, cpu))
1300 1301
					can_add_hw = 0;
			}
1302
		}
1303
	}
1304
	perf_enable();
1305
 out:
1306
	spin_unlock(&ctx->lock);
1307 1308 1309
}

/*
1310
 * Called from scheduler to add the events of the current task
1311 1312
 * with interrupts disabled.
 *
1313
 * We restore the event value and then enable it.
1314 1315
 *
 * This does not protect us against NMI, but enable()
1316 1317 1318
 * sets the enabled bit in the control field of event _before_
 * accessing the event control register. If an NMI hits, then it will
 * keep the event running.
1319
 */
1320
void perf_event_task_sched_in(struct task_struct *task, int cpu)
1321 1322
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1323
	struct perf_event_context *ctx = task->perf_event_ctxp;
1324

1325 1326
	if (likely(!ctx))
		return;
1327 1328
	if (cpuctx->task_ctx == ctx)
		return;
1329
	__perf_event_sched_in(ctx, cpuctx, cpu);
1330 1331 1332
	cpuctx->task_ctx = ctx;
}

1333
static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1334
{
1335
	struct perf_event_context *ctx = &cpuctx->ctx;
1336

1337
	__perf_event_sched_in(ctx, cpuctx, cpu);
1338 1339
}

1340 1341
#define MAX_INTERRUPTS (~0ULL)

1342
static void perf_log_throttle(struct perf_event *event, int enable);
1343

1344
static void perf_adjust_period(struct perf_event *event, u64 events)
1345
{
1346
	struct hw_perf_event *hwc = &event->hw;
1347 1348 1349 1350
	u64 period, sample_period;
	s64 delta;

	events *= hwc->sample_period;
1351
	period = div64_u64(events, event->attr.sample_freq);
1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363

	delta = (s64)(period - hwc->sample_period);
	delta = (delta + 7) / 8; /* low pass filter */

	sample_period = hwc->sample_period + delta;

	if (!sample_period)
		sample_period = 1;

	hwc->sample_period = sample_period;
}
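/*
 * Worked example (illustrative numbers): with hwc->sample_period = 10000,
 * a measured rate of 2000 samples/sec and a requested attr.sample_freq of
 * 1000, the estimated count rate is 2000 * 10000 = 20,000,000 events/sec,
 * so the ideal period is 20000.  delta = 10000, the low-pass filter keeps
 * (10000 + 7) / 8 = 1250 of it, and sample_period becomes 11250, converging
 * over several ticks.
 */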

1364
static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1365
{
1366 1367
	struct perf_event *event;
	struct hw_perf_event *hwc;
1368
	u64 interrupts, freq;
1369 1370

	spin_lock(&ctx->lock);
1371 1372
	list_for_each_entry(event, &ctx->group_list, group_entry) {
		if (event->state != PERF_EVENT_STATE_ACTIVE)
1373 1374
			continue;

1375
		hwc = &event->hw;
1376 1377 1378

		interrupts = hwc->interrupts;
		hwc->interrupts = 0;
1379

1380
		/*
1381
		 * unthrottle events on the tick
1382
		 */
1383
		if (interrupts == MAX_INTERRUPTS) {
1384 1385 1386
			perf_log_throttle(event, 1);
			event->pmu->unthrottle(event);
			interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1387 1388
		}

1389
		if (!event->attr.freq || !event->attr.sample_freq)
1390 1391
			continue;

1392 1393 1394
		/*
		 * if the specified freq < HZ then we need to skip ticks
		 */
1395 1396
		if (event->attr.sample_freq < HZ) {
			freq = event->attr.sample_freq;
1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409

			hwc->freq_count += freq;
			hwc->freq_interrupts += interrupts;

			if (hwc->freq_count < HZ)
				continue;

			interrupts = hwc->freq_interrupts;
			hwc->freq_interrupts = 0;
			hwc->freq_count -= HZ;
		} else
			freq = HZ;

1410
		perf_adjust_period(event, freq * interrupts);
1411

1412 1413 1414 1415 1416 1417 1418
		/*
		 * In order to avoid being stalled by an (accidental) huge
		 * sample period, force reset the sample period if we didn't
		 * get any events in this freq period.
		 */
		if (!interrupts) {
			perf_disable();
1419
			event->pmu->disable(event);
1420
			atomic64_set(&hwc->period_left, 0);
1421
			event->pmu->enable(event);
1422 1423
			perf_enable();
		}
1424 1425 1426 1427
	}
	spin_unlock(&ctx->lock);
}

1428
/*
1429
 * Round-robin a context's events:
1430
 */
1431
static void rotate_ctx(struct perf_event_context *ctx)
1432
{
1433
	struct perf_event *event;
1434

1435
	if (!ctx->nr_events)
1436 1437 1438 1439
		return;

	spin_lock(&ctx->lock);
	/*
1440
	 * Rotate the first entry last (works just fine for group events too):
1441
	 */
1442
	perf_disable();
1443 1444
	list_for_each_entry(event, &ctx->group_list, group_entry) {
		list_move_tail(&event->group_entry, &ctx->group_list);
1445 1446
		break;
	}
1447
	perf_enable();
1448 1449

	spin_unlock(&ctx->lock);
1450 1451
}

1452
void perf_event_task_tick(struct task_struct *curr, int cpu)
1453
{
1454
	struct perf_cpu_context *cpuctx;
1455
	struct perf_event_context *ctx;
1456

1457
	if (!atomic_read(&nr_events))
1458 1459 1460
		return;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
1461
	ctx = curr->perf_event_ctxp;
1462

1463
	perf_ctx_adjust_freq(&cpuctx->ctx);
1464
	if (ctx)
1465
		perf_ctx_adjust_freq(ctx);
1466

1467
	perf_event_cpu_sched_out(cpuctx);
1468
	if (ctx)
1469
		__perf_event_task_sched_out(ctx);
1470

1471
	rotate_ctx(&cpuctx->ctx);
1472 1473
	if (ctx)
		rotate_ctx(ctx);
1474

1475
	perf_event_cpu_sched_in(cpuctx, cpu);
1476
	if (ctx)
1477
		perf_event_task_sched_in(curr, cpu);
1478 1479
}

1480
/*
1481
 * Enable all of a task's events that have been marked enable-on-exec.
1482 1483
 * This expects task == current.
 */
1484
static void perf_event_enable_on_exec(struct task_struct *task)
1485
{
1486 1487
	struct perf_event_context *ctx;
	struct perf_event *event;
1488 1489 1490 1491
	unsigned long flags;
	int enabled = 0;

	local_irq_save(flags);
1492 1493
	ctx = task->perf_event_ctxp;
	if (!ctx || !ctx->nr_events)
1494 1495
		goto out;

1496
	__perf_event_task_sched_out(ctx);
1497 1498 1499

	spin_lock(&ctx->lock);

1500 1501
	list_for_each_entry(event, &ctx->group_list, group_entry) {
		if (!event->attr.enable_on_exec)
1502
			continue;
1503 1504
		event->attr.enable_on_exec = 0;
		if (event->state >= PERF_EVENT_STATE_INACTIVE)
1505
			continue;
1506
		__perf_event_mark_enabled(event, ctx);
1507 1508 1509 1510
		enabled = 1;
	}

	/*
1511
	 * Unclone this context if we enabled any event.
1512
	 */
1513 1514
	if (enabled)
		unclone_ctx(ctx);
1515 1516 1517

	spin_unlock(&ctx->lock);

1518
	perf_event_task_sched_in(task, smp_processor_id());
1519 1520 1521 1522
 out:
	local_irq_restore(flags);
}

1523
/*
1524
 * Cross CPU call to read the hardware event
1525
 */
1526
static void __perf_event_read(void *info)
1527
{
1528
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1529 1530
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
1531
	unsigned long flags;
1532

1533 1534 1535 1536
	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu.  If not it has been
	 * scheduled out before the smp call arrived.  In that case
1537 1538
	 * event->count would have been updated to a recent sample
	 * when the event was scheduled out.
1539 1540 1541 1542
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

1543
	local_irq_save(flags);
1544
	if (ctx->is_active)
1545
		update_context_time(ctx);
1546 1547
	event->pmu->read(event);
	update_event_times(event);
1548
	local_irq_restore(flags);
1549 1550
}

1551
static u64 perf_event_read(struct perf_event *event)
1552 1553
{
	/*
1554 1555
	 * If event is enabled and currently active on a CPU, update the
	 * value in the event structure:
1556
	 */
1557 1558 1559 1560 1561
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
		smp_call_function_single(event->oncpu,
					 __perf_event_read, event, 1);
	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
		update_event_times(event);
1562 1563
	}

1564
	return atomic64_read(&event->count);
1565 1566
}

1567
/*
1568
 * Initialize the perf_event context in a task_struct:
1569 1570
 */
static void
1571
__perf_event_init_context(struct perf_event_context *ctx,
1572 1573 1574 1575 1576
			    struct task_struct *task)
{
	memset(ctx, 0, sizeof(*ctx));
	spin_lock_init(&ctx->lock);
	mutex_init(&ctx->mutex);
1577
	INIT_LIST_HEAD(&ctx->group_list);
1578 1579 1580 1581 1582
	INIT_LIST_HEAD(&ctx->event_list);
	atomic_set(&ctx->refcount, 1);
	ctx->task = task;
}

1583
static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1584
{
1585
	struct perf_event_context *ctx;
1586
	struct perf_cpu_context *cpuctx;
1587
	struct task_struct *task;
1588
	unsigned long flags;
1589
	int err;
1590 1591

	/*
1592
	 * If cpu is not a wildcard then this is a percpu event:
1593 1594
	 */
	if (cpu != -1) {
1595
		/* Must be root to operate on a CPU event: */
1596
		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1597 1598 1599 1600 1601 1602
			return ERR_PTR(-EACCES);

		if (cpu < 0 || cpu > num_possible_cpus())
			return ERR_PTR(-EINVAL);

		/*
1603
		 * We could be clever and allow attaching an event to an
1604 1605 1606 1607 1608 1609 1610 1611
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
		 */
		if (!cpu_isset(cpu, cpu_online_map))
			return ERR_PTR(-ENODEV);

		cpuctx = &per_cpu(perf_cpu_context, cpu);
		ctx = &cpuctx->ctx;
1612
		get_ctx(ctx);
1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628

		return ctx;
	}

	rcu_read_lock();
	if (!pid)
		task = current;
	else
		task = find_task_by_vpid(pid);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	if (!task)
		return ERR_PTR(-ESRCH);

1629
	/*
1630
	 * Can't attach events to a dying task.
1631 1632 1633 1634 1635
	 */
	err = -ESRCH;
	if (task->flags & PF_EXITING)
		goto errout;

1636
	/* Reuse ptrace permission checks for now. */
1637 1638 1639 1640 1641
	err = -EACCES;
	if (!ptrace_may_access(task, PTRACE_MODE_READ))
		goto errout;

 retry:
1642
	ctx = perf_lock_task_context(task, &flags);
1643
	if (ctx) {
1644
		unclone_ctx(ctx);
1645
		spin_unlock_irqrestore(&ctx->lock, flags);
1646 1647
	}

1648
	if (!ctx) {
1649
		ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1650 1651 1652
		err = -ENOMEM;
		if (!ctx)
			goto errout;
1653
		__perf_event_init_context(ctx, task);
1654
		get_ctx(ctx);
1655
		if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1656 1657 1658 1659 1660
			/*
			 * We raced with some other task; use
			 * the context they set.
			 */
			kfree(ctx);
1661
			goto retry;
1662
		}
1663
		get_task_struct(task);
1664 1665
	}

1666
	put_task_struct(task);
1667
	return ctx;
1668 1669 1670 1671

 errout:
	put_task_struct(task);
	return ERR_PTR(err);
1672 1673
}

1674
static void free_event_rcu(struct rcu_head *head)
1675
{
1676
	struct perf_event *event;
1677

1678 1679 1680 1681
	event = container_of(head, struct perf_event, rcu_head);
	if (event->ns)
		put_pid_ns(event->ns);
	kfree(event);
1682 1683
}

1684
static void perf_pending_sync(struct perf_event *event);
1685

1686
static void free_event(struct perf_event *event)
1687
{
1688
	perf_pending_sync(event);
1689

1690 1691 1692 1693 1694 1695 1696 1697
	if (!event->parent) {
		atomic_dec(&nr_events);
		if (event->attr.mmap)
			atomic_dec(&nr_mmap_events);
		if (event->attr.comm)
			atomic_dec(&nr_comm_events);
		if (event->attr.task)
			atomic_dec(&nr_task_events);
1698
	}
1699

1700 1701 1702
	if (event->output) {
		fput(event->output->filp);
		event->output = NULL;
1703 1704
	}

1705 1706
	if (event->destroy)
		event->destroy(event);
1707

1708 1709
	put_ctx(event->ctx);
	call_rcu(&event->rcu_head, free_event_rcu);
1710 1711
}

1712 1713 1714 1715 1716
/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
{
1717 1718
	struct perf_event *event = file->private_data;
	struct perf_event_context *ctx = event->ctx;
1719 1720 1721

	file->private_data = NULL;

1722
	WARN_ON_ONCE(ctx->parent_ctx);
1723
	mutex_lock(&ctx->mutex);
1724
	perf_event_remove_from_context(event);
1725
	mutex_unlock(&ctx->mutex);
1726

1727 1728 1729 1730
	mutex_lock(&event->owner->perf_event_mutex);
	list_del_init(&event->owner_entry);
	mutex_unlock(&event->owner->perf_event_mutex);
	put_task_struct(event->owner);
1731

1732
	free_event(event);
1733 1734 1735 1736

	return 0;
}

1737
static int perf_event_read_size(struct perf_event *event)
1738 1739 1740 1741 1742
{
	int entry = sizeof(u64); /* value */
	int size = 0;
	int nr = 1;

1743
	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1744 1745
		size += sizeof(u64);

1746
	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1747 1748
		size += sizeof(u64);

1749
	if (event->attr.read_format & PERF_FORMAT_ID)
1750 1751
		entry += sizeof(u64);

1752 1753
	if (event->attr.read_format & PERF_FORMAT_GROUP) {
		nr += event->group_leader->nr_siblings;
1754 1755 1756 1757 1758 1759 1760 1761
		size += sizeof(u64);
	}

	size += entry * nr;

	return size;
}
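/*
 * Worked example (illustrative): for a non-group event with read_format =
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING |
 * PERF_FORMAT_ID, entry = 8 (value) + 8 (id) = 16, size = 8 + 8 = 16 for
 * the two times, nr = 1, so the read buffer must hold 16 + 16 = 32 bytes.
 */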

1762
static u64 perf_event_read_value(struct perf_event *event)
1763
{
1764
	struct perf_event *child;
1765 1766
	u64 total = 0;

1767 1768 1769
	total += perf_event_read(event);
	list_for_each_entry(child, &event->child_list, child_list)
		total += perf_event_read(child);
1770 1771 1772 1773

	return total;
}

1774
static int perf_event_read_entry(struct perf_event *event,
1775 1776 1777 1778 1779
				   u64 read_format, char __user *buf)
{
	int n = 0, count = 0;
	u64 values[2];

1780
	values[n++] = perf_event_read_value(event);
1781
	if (read_format & PERF_FORMAT_ID)
1782
		values[n++] = primary_event_id(event);
1783 1784 1785 1786 1787 1788 1789 1790 1791

	count = n * sizeof(u64);

	if (copy_to_user(buf, values, count))
		return -EFAULT;

	return count;
}

1792
static int perf_event_read_group(struct perf_event *event,
1793 1794
				   u64 read_format, char __user *buf)
{
1795
	struct perf_event *leader = event->group_leader, *sub;
1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813
	int n = 0, size = 0, err = -EFAULT;
	u64 values[3];

	values[n++] = 1 + leader->nr_siblings;
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
		values[n++] = leader->total_time_enabled +
			atomic64_read(&leader->child_total_time_enabled);
	}
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
		values[n++] = leader->total_time_running +
			atomic64_read(&leader->child_total_time_running);
	}

	size = n * sizeof(u64);

	if (copy_to_user(buf, values, size))
		return -EFAULT;

1814
	err = perf_event_read_entry(leader, read_format, buf + size);
1815 1816 1817 1818 1819
	if (err < 0)
		return err;

	size += err;

1820
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1821
		err = perf_event_read_entry(sub, read_format,
1822 1823 1824 1825 1826 1827 1828 1829 1830 1831
				buf + size);
		if (err < 0)
			return err;

		size += err;
	}

	return size;
}

1832
static int perf_event_read_one(struct perf_event *event,
1833 1834 1835 1836 1837
				 u64 read_format, char __user *buf)
{
	u64 values[4];
	int n = 0;

1838
	values[n++] = perf_event_read_value(event);
1839
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1840 1841
		values[n++] = event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
1842 1843
	}
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1844 1845
		values[n++] = event->total_time_running +
			atomic64_read(&event->child_total_time_running);
1846 1847
	}
	if (read_format & PERF_FORMAT_ID)
1848
		values[n++] = primary_event_id(event);
1849 1850 1851 1852 1853 1854 1855

	if (copy_to_user(buf, values, n * sizeof(u64)))
		return -EFAULT;

	return n * sizeof(u64);
}

1856
/*
1857
 * Read the performance event - simple non blocking version for now
1858 1859
 */
static ssize_t
1860
perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1861
{
1862
	u64 read_format = event->attr.read_format;
1863
	int ret;
1864

1865
	/*
1866
	 * Return end-of-file for a read on an event that is in
1867 1868 1869
	 * error state (i.e. because it was pinned but it couldn't be
	 * scheduled on to the CPU at some point).
	 */
1870
	if (event->state == PERF_EVENT_STATE_ERROR)
1871 1872
		return 0;

1873
	if (count < perf_event_read_size(event))
1874 1875
		return -ENOSPC;

1876 1877
	WARN_ON_ONCE(event->ctx->parent_ctx);
	mutex_lock(&event->child_mutex);
1878
	if (read_format & PERF_FORMAT_GROUP)
1879
		ret = perf_event_read_group(event, read_format, buf);
1880
	else
1881 1882
		ret = perf_event_read_one(event, read_format, buf);
	mutex_unlock(&event->child_mutex);
1883

1884
	return ret;
1885 1886 1887 1888 1889
}

static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	struct perf_event *event = file->private_data;

	return perf_read_hw(event, buf, count);
}

static unsigned int perf_poll(struct file *file, poll_table *wait)
{
	struct perf_event *event = file->private_data;
	struct perf_mmap_data *data;
	unsigned int events = POLL_HUP;

	rcu_read_lock();
	data = rcu_dereference(event->data);
	if (data)
		events = atomic_xchg(&data->poll, 0);
	rcu_read_unlock();

	poll_wait(file, &event->waitq, wait);

	return events;
}

static void perf_event_reset(struct perf_event *event)
{
	(void)perf_event_read(event);
	atomic64_set(&event->count, 0);
	perf_event_update_userpage(event);
}

/*
 * Holding the top-level event's child_mutex means that any
 * descendant process that has inherited this event will block
 * in sync_child_event if it goes to exit, thus satisfying the
 * task existence requirements of perf_event_enable/disable.
 */
static void perf_event_for_each_child(struct perf_event *event,
					void (*func)(struct perf_event *))
{
	struct perf_event *child;

	WARN_ON_ONCE(event->ctx->parent_ctx);
	mutex_lock(&event->child_mutex);
	func(event);
	list_for_each_entry(child, &event->child_list, child_list)
		func(child);
	mutex_unlock(&event->child_mutex);
}

static void perf_event_for_each(struct perf_event *event,
				  void (*func)(struct perf_event *))
{
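	/*
	 * Apply func to the whole group: the leader, every sibling,
	 * and all of their inherited child events, under ctx->mutex.
	 */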
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *sibling;

	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	event = event->group_leader;

	perf_event_for_each_child(event, func);
	func(event);
	list_for_each_entry(sibling, &event->sibling_list, group_entry)
		perf_event_for_each_child(sibling, func);
	mutex_unlock(&ctx->mutex);
}

1955
static int perf_event_period(struct perf_event *event, u64 __user *arg)
1956
{
1957
	struct perf_event_context *ctx = event->ctx;
1958 1959 1960 1961
	unsigned long size;
	int ret = 0;
	u64 value;

1962
	if (!event->attr.sample_period)
1963 1964 1965 1966 1967 1968 1969 1970 1971 1972
		return -EINVAL;

	size = copy_from_user(&value, arg, sizeof(value));
	if (size != sizeof(value))
		return -EFAULT;

	if (!value)
		return -EINVAL;

	spin_lock_irq(&ctx->lock);
1973 1974
	if (event->attr.freq) {
		if (value > sysctl_perf_event_sample_rate) {
1975 1976 1977 1978
			ret = -EINVAL;
			goto unlock;
		}

1979
		event->attr.sample_freq = value;
1980
	} else {
1981 1982
		event->attr.sample_period = value;
		event->hw.sample_period = value;
1983 1984 1985 1986 1987 1988 1989
	}
unlock:
	spin_unlock_irq(&ctx->lock);

	return ret;
}

int perf_event_set_output(struct perf_event *event, int output_fd);

static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
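	/*
	 * Enable/disable/reset map to a function applied below; refresh,
	 * period and output redirection return directly.  With
	 * PERF_IOC_FLAG_GROUP the operation covers the whole group.
	 */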
	struct perf_event *event = file->private_data;
	void (*func)(struct perf_event *);
	u32 flags = arg;

	switch (cmd) {
	case PERF_EVENT_IOC_ENABLE:
		func = perf_event_enable;
		break;
	case PERF_EVENT_IOC_DISABLE:
		func = perf_event_disable;
		break;
	case PERF_EVENT_IOC_RESET:
		func = perf_event_reset;
		break;

	case PERF_EVENT_IOC_REFRESH:
		return perf_event_refresh(event, arg);

	case PERF_EVENT_IOC_PERIOD:
		return perf_event_period(event, (u64 __user *)arg);

	case PERF_EVENT_IOC_SET_OUTPUT:
		return perf_event_set_output(event, arg);

	default:
		return -ENOTTY;
	}

	if (flags & PERF_IOC_FLAG_GROUP)
		perf_event_for_each(event, func);
	else
		perf_event_for_each_child(event, func);

	return 0;
}

2030
int perf_event_task_enable(void)
2031
{
2032
	struct perf_event *event;
2033

2034 2035 2036 2037
	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry)
		perf_event_for_each_child(event, perf_event_enable);
	mutex_unlock(&current->perf_event_mutex);
2038 2039 2040 2041

	return 0;
}

2042
int perf_event_task_disable(void)
2043
{
2044
	struct perf_event *event;
2045

2046 2047 2048 2049
	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry)
		perf_event_for_each_child(event, perf_event_disable);
	mutex_unlock(&current->perf_event_mutex);
2050 2051 2052 2053

	return 0;
}

2054 2055
#ifndef PERF_EVENT_INDEX_OFFSET
# define PERF_EVENT_INDEX_OFFSET 0
#endif

2058
static int perf_event_index(struct perf_event *event)
2059
{
2060
	if (event->state != PERF_EVENT_STATE_ACTIVE)
2061 2062
		return 0;

2063
	return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2064 2065
}

/*
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We cannot serialize this because the arch
 * code calls this from NMI context.
 */
void perf_event_update_userpage(struct perf_event *event)
{
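	/*
	 * The ->lock counter is bumped before and after the update so
	 * user space can detect a concurrent writer seqcount-style.
	 * A reader would do something like (sketch, not a guaranteed
	 * ABI description):
	 *
	 *	do {
	 *		seq = pc->lock;
	 *		barrier();
	 *		... read pc->index, pc->offset, times ...
	 *		barrier();
	 *	} while (pc->lock != seq);
	 */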
	struct perf_event_mmap_page *userpg;
	struct perf_mmap_data *data;

	rcu_read_lock();
	data = rcu_dereference(event->data);
	if (!data)
		goto unlock;

	userpg = data->user_page;

	/*
	 * Disable preemption so as to not let the corresponding user-space
	 * spin too long if we get preempted.
	 */
	preempt_disable();
	++userpg->lock;
	barrier();
	userpg->index = perf_event_index(event);
	userpg->offset = atomic64_read(&event->count);
	if (event->state == PERF_EVENT_STATE_ACTIVE)
		userpg->offset -= atomic64_read(&event->hw.prev_count);

	userpg->time_enabled = event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);

	userpg->time_running = event->total_time_running +
			atomic64_read(&event->child_total_time_running);

	barrier();
	++userpg->lock;
	preempt_enable();
unlock:
	rcu_read_unlock();
}

static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
2110
	struct perf_event *event = vma->vm_file->private_data;
2111 2112 2113
	struct perf_mmap_data *data;
	int ret = VM_FAULT_SIGBUS;

2114 2115 2116 2117 2118 2119
	if (vmf->flags & FAULT_FLAG_MKWRITE) {
		if (vmf->pgoff == 0)
			ret = 0;
		return ret;
	}

2120
	rcu_read_lock();
2121
	data = rcu_dereference(event->data);
2122 2123 2124 2125 2126 2127 2128
	if (!data)
		goto unlock;

	if (vmf->pgoff == 0) {
		vmf->page = virt_to_page(data->user_page);
	} else {
		int nr = vmf->pgoff - 1;
2129

2130 2131
		if ((unsigned)nr > data->nr_pages)
			goto unlock;
2132

2133 2134 2135
		if (vmf->flags & FAULT_FLAG_WRITE)
			goto unlock;

2136 2137
		vmf->page = virt_to_page(data->data_pages[nr]);
	}
2138

2139
	get_page(vmf->page);
2140 2141 2142
	vmf->page->mapping = vma->vm_file->f_mapping;
	vmf->page->index   = vmf->pgoff;

2143 2144 2145 2146 2147 2148 2149
	ret = 0;
unlock:
	rcu_read_unlock();

	return ret;
}

static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
{
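	/*
	 * Allocate the user-visible control page plus nr_pages data
	 * pages, then publish the buffer through RCU (event->data).
	 */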
	struct perf_mmap_data *data;
	unsigned long size;
	int i;

	WARN_ON(atomic_read(&event->mmap_count));

	size = sizeof(struct perf_mmap_data);
	size += nr_pages * sizeof(void *);

	data = kzalloc(size, GFP_KERNEL);
	if (!data)
		goto fail;

	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
	if (!data->user_page)
		goto fail_user_page;

	for (i = 0; i < nr_pages; i++) {
		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
		if (!data->data_pages[i])
			goto fail_data_pages;
	}

	data->nr_pages = nr_pages;
	atomic_set(&data->lock, -1);

	if (event->attr.watermark) {
		data->watermark = min_t(long, PAGE_SIZE * nr_pages,
				      event->attr.wakeup_watermark);
	}
	if (!data->watermark)
		data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);

	rcu_assign_pointer(event->data, data);

	return 0;
2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201

fail_data_pages:
	for (i--; i >= 0; i--)
		free_page((unsigned long)data->data_pages[i]);

	free_page((unsigned long)data->user_page);

fail_user_page:
	kfree(data);

fail:
	return -ENOMEM;
}

2202 2203
static void perf_mmap_free_page(unsigned long addr)
{
	struct page *page = virt_to_page((void *)addr);
2205 2206 2207 2208 2209

	page->mapping = NULL;
	__free_page(page);
}

2210 2211
static void __perf_mmap_data_free(struct rcu_head *rcu_head)
{
2212
	struct perf_mmap_data *data;
2213 2214
	int i;

2215 2216
	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);

2217
	perf_mmap_free_page((unsigned long)data->user_page);
2218
	for (i = 0; i < data->nr_pages; i++)
2219 2220
		perf_mmap_free_page((unsigned long)data->data_pages[i]);

2221 2222 2223
	kfree(data);
}

2224
static void perf_mmap_data_free(struct perf_event *event)
2225
{
2226
	struct perf_mmap_data *data = event->data;
2227

2228
	WARN_ON(atomic_read(&event->mmap_count));
2229

2230
	rcu_assign_pointer(event->data, NULL);
2231 2232 2233 2234 2235
	call_rcu(&data->rcu_head, __perf_mmap_data_free);
}

static void perf_mmap_open(struct vm_area_struct *vma)
{
2236
	struct perf_event *event = vma->vm_file->private_data;
2237

2238
	atomic_inc(&event->mmap_count);
2239 2240 2241 2242
}

static void perf_mmap_close(struct vm_area_struct *vma)
{
2243
	struct perf_event *event = vma->vm_file->private_data;
2244

2245 2246
	WARN_ON_ONCE(event->ctx->parent_ctx);
	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2247 2248
		struct user_struct *user = current_user();

2249 2250 2251 2252
		atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm);
		vma->vm_mm->locked_vm -= event->data->nr_locked;
		perf_mmap_data_free(event);
		mutex_unlock(&event->mmap_mutex);
2253
	}
2254 2255 2256
}

static struct vm_operations_struct perf_mmap_vmops = {
2257 2258 2259 2260
	.open		= perf_mmap_open,
	.close		= perf_mmap_close,
	.fault		= perf_mmap_fault,
	.page_mkwrite	= perf_mmap_fault,
2261 2262 2263 2264
};

static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
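	/*
	 * The mapping is one control page followed by a power-of-two
	 * number of data pages; the locked-memory cost is charged
	 * against sysctl_perf_event_mlock and RLIMIT_MEMLOCK below.
	 */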
	struct perf_event *event = file->private_data;
	unsigned long user_locked, user_lock_limit;
	struct user_struct *user = current_user();
	unsigned long locked, lock_limit;
	unsigned long vma_size;
	unsigned long nr_pages;
	long user_extra, extra;
	int ret = 0;

	if (!(vma->vm_flags & VM_SHARED))
		return -EINVAL;

	vma_size = vma->vm_end - vma->vm_start;
	nr_pages = (vma_size / PAGE_SIZE) - 1;

2280 2281 2282 2283 2284
	/*
	 * If we have data pages ensure they're a power-of-two number, so we
	 * can do bitmasks instead of modulo.
	 */
	if (nr_pages != 0 && !is_power_of_2(nr_pages))
2285 2286
		return -EINVAL;

2287
	if (vma_size != PAGE_SIZE * (1 + nr_pages))
2288 2289
		return -EINVAL;

2290 2291
	if (vma->vm_pgoff != 0)
		return -EINVAL;
2292

2293 2294 2295
	WARN_ON_ONCE(event->ctx->parent_ctx);
	mutex_lock(&event->mmap_mutex);
	if (event->output) {
2296 2297 2298 2299
		ret = -EINVAL;
		goto unlock;
	}

2300 2301
	if (atomic_inc_not_zero(&event->mmap_count)) {
		if (nr_pages != event->data->nr_pages)
2302 2303 2304 2305
			ret = -EINVAL;
		goto unlock;
	}

2306
	user_extra = nr_pages + 1;
2307
	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);

	/*
	 * Increase the limit linearly with more CPUs:
	 */
	user_lock_limit *= num_online_cpus();

2314
	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2315

2316 2317 2318
	extra = 0;
	if (user_locked > user_lock_limit)
		extra = user_locked - user_lock_limit;
2319 2320 2321

	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
	lock_limit >>= PAGE_SHIFT;
2322
	locked = vma->vm_mm->locked_vm + extra;
2323

2324 2325
	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
		!capable(CAP_IPC_LOCK)) {
2326 2327 2328
		ret = -EPERM;
		goto unlock;
	}
2329

2330 2331
	WARN_ON(event->data);
	ret = perf_mmap_data_alloc(event, nr_pages);
2332 2333 2334
	if (ret)
		goto unlock;

2335
	atomic_set(&event->mmap_count, 1);
2336
	atomic_long_add(user_extra, &user->locked_vm);
2337
	vma->vm_mm->locked_vm += extra;
2338
	event->data->nr_locked = extra;
2339
	if (vma->vm_flags & VM_WRITE)
2340
		event->data->writable = 1;
2341

2342
unlock:
2343
	mutex_unlock(&event->mmap_mutex);
2344 2345 2346

	vma->vm_flags |= VM_RESERVED;
	vma->vm_ops = &perf_mmap_vmops;
2347 2348

	return ret;
2349 2350
}

static int perf_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
2354
	struct perf_event *event = filp->private_data;
	int retval;

	mutex_lock(&inode->i_mutex);
2358
	retval = fasync_helper(fd, filp, on, &event->fasync);
	mutex_unlock(&inode->i_mutex);

	if (retval < 0)
		return retval;

	return 0;
}

static const struct file_operations perf_fops = {
	.release		= perf_release,
	.read			= perf_read,
	.poll			= perf_poll,
2371 2372
	.unlocked_ioctl		= perf_ioctl,
	.compat_ioctl		= perf_ioctl,
2373
	.mmap			= perf_mmap,
	.fasync			= perf_fasync,
};

2377
/*
2378
 * Perf event wakeup
2379 2380 2381 2382 2383
 *
 * If there's data, ensure we set the poll() state and publish everything
 * to user-space before waking everybody up.
 */

2384
void perf_event_wakeup(struct perf_event *event)
2385
{
2386
	wake_up_all(&event->waitq);
2387

2388 2389 2390
	if (event->pending_kill) {
		kill_fasync(&event->fasync, SIGIO, event->pending_kill);
		event->pending_kill = 0;
2391
	}
2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402
}

/*
 * Pending wakeups
 *
 * Handle the case where we need to wake up from NMI (or rq->lock) context.
 *
 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
 * single linked list and use cmpxchg() to add entries lockless.
 */

2403
static void perf_pending_event(struct perf_pending_entry *entry)
2404
{
2405 2406
	struct perf_event *event = container_of(entry,
			struct perf_event, pending);
2407

2408 2409 2410
	if (event->pending_disable) {
		event->pending_disable = 0;
		__perf_event_disable(event);
2411 2412
	}

2413 2414 2415
	if (event->pending_wakeup) {
		event->pending_wakeup = 0;
		perf_event_wakeup(event);
2416 2417 2418
	}
}

2419
#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2420

2421
static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2422 2423 2424
	PENDING_TAIL,
};

2425 2426
static void perf_pending_queue(struct perf_pending_entry *entry,
			       void (*func)(struct perf_pending_entry *))
2427
{
2428
	struct perf_pending_entry **head;
2429

2430
	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2431 2432
		return;

2433 2434 2435
	entry->func = func;

	head = &get_cpu_var(perf_pending_head);
2436 2437

	do {
2438 2439
		entry->next = *head;
	} while (cmpxchg(head, entry->next, entry) != entry->next);
2440

2441
	set_perf_event_pending();
2442

2443
	put_cpu_var(perf_pending_head);
2444 2445 2446 2447
}

static int __perf_pending_run(void)
{
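	/*
	 * Detach the whole per-cpu list with one xchg() and run each
	 * queued callback; returns the number of entries processed.
	 */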
	struct perf_pending_entry *list;
	int nr = 0;

	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
	while (list != PENDING_TAIL) {
		void (*func)(struct perf_pending_entry *);
		struct perf_pending_entry *entry = list;

		list = list->next;

		func = entry->func;
		entry->next = NULL;
		/*
		 * Ensure we observe the unqueue before we issue the wakeup,
		 * so that we won't be waiting forever.
		 * -- see perf_not_pending().
		 */
		smp_wmb();

		func(entry);
		nr++;
	}

	return nr;
}

2474
static inline int perf_not_pending(struct perf_event *event)
2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488
{
	/*
	 * If we flush on whatever cpu we run, there is a chance we don't
	 * need to wait.
	 */
	get_cpu();
	__perf_pending_run();
	put_cpu();

	/*
	 * Ensure we see the proper queue state before going to sleep
	 * so that we do not miss the wakeup. -- see perf_pending_handle()
	 */
	smp_rmb();
2489
	return event->pending.next == NULL;
2490 2491
}

2492
static void perf_pending_sync(struct perf_event *event)
2493
{
2494
	wait_event(event->waitq, perf_not_pending(event));
2495 2496
}

2497
void perf_event_do_pending(void)
2498 2499 2500 2501
{
	__perf_pending_run();
}

2502 2503 2504 2505
/*
 * Callchain support -- arch specific
 */

2506
__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2507 2508 2509 2510
{
	return NULL;
}

2511 2512 2513
/*
 * Output
 */
2514 2515
static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
			      unsigned long offset, unsigned long head)
2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532
{
	unsigned long mask;

	if (!data->writable)
		return true;

	mask = (data->nr_pages << PAGE_SHIFT) - 1;

	offset = (offset - tail) & mask;
	head   = (head   - tail) & mask;

	if ((int)(head - offset) < 0)
		return false;

	return true;
}

2533
static void perf_output_wakeup(struct perf_output_handle *handle)
2534
{
2535 2536
	atomic_set(&handle->data->poll, POLL_IN);

2537
	if (handle->nmi) {
2538 2539 2540
		handle->event->pending_wakeup = 1;
		perf_pending_queue(&handle->event->pending,
				   perf_pending_event);
2541
	} else
2542
		perf_event_wakeup(handle->event);
2543 2544
}

2545 2546 2547
/*
 * Curious locking construct.
 *
2548 2549
 * We need to ensure a later event_id doesn't publish a head when a former
 * event_id isn't done writing. However since we need to deal with NMIs we
2550 2551 2552 2553 2554 2555
 * cannot fully serialize things.
 *
 * What we do is serialize between CPUs so we only have to deal with NMI
 * nesting on a single CPU.
 *
 * We only publish the head (and generate a wakeup) when the outer-most
2556
 * event_id completes.
2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570
 */
static void perf_output_lock(struct perf_output_handle *handle)
{
	struct perf_mmap_data *data = handle->data;
	int cpu;

	handle->locked = 0;

	local_irq_save(handle->flags);
	cpu = smp_processor_id();

	if (in_nmi() && atomic_read(&data->lock) == cpu)
		return;

2571
	while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2572 2573 2574 2575 2576 2577 2578 2579
		cpu_relax();

	handle->locked = 1;
}

static void perf_output_unlock(struct perf_output_handle *handle)
{
	struct perf_mmap_data *data = handle->data;
2580 2581
	unsigned long head;
	int cpu;
2582

2583
	data->done_head = data->head;
2584 2585 2586 2587 2588 2589 2590 2591 2592 2593

	if (!handle->locked)
		goto out;

again:
	/*
	 * The xchg implies a full barrier that ensures all writes are done
	 * before we publish the new head, matched by a rmb() in userspace when
	 * reading this position.
	 */
2594
	while ((head = atomic_long_xchg(&data->done_head, 0)))
2595 2596 2597
		data->user_page->data_head = head;

	/*
2598
	 * NMI can happen here, which means we can miss a done_head update.
2599 2600
	 */

2601
	cpu = atomic_xchg(&data->lock, -1);
2602 2603 2604 2605 2606
	WARN_ON_ONCE(cpu != smp_processor_id());

	/*
	 * Therefore we have to validate we did not indeed do so.
	 */
2607
	if (unlikely(atomic_long_read(&data->done_head))) {
2608 2609 2610
		/*
		 * Since we had it locked, we can lock it again.
		 */
2611
		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2612 2613 2614 2615 2616
			cpu_relax();

		goto again;
	}

2617
	if (atomic_xchg(&data->wakeup, 0))
2618 2619 2620 2621 2622
		perf_output_wakeup(handle);
out:
	local_irq_restore(handle->flags);
}

2623 2624
void perf_output_copy(struct perf_output_handle *handle,
		      const void *buf, unsigned int len)
2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658
{
	unsigned int pages_mask;
	unsigned int offset;
	unsigned int size;
	void **pages;

	offset		= handle->offset;
	pages_mask	= handle->data->nr_pages - 1;
	pages		= handle->data->data_pages;

	do {
		unsigned int page_offset;
		int nr;

		nr	    = (offset >> PAGE_SHIFT) & pages_mask;
		page_offset = offset & (PAGE_SIZE - 1);
		size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len);

		memcpy(pages[nr] + page_offset, buf, size);

		len	    -= size;
		buf	    += size;
		offset	    += size;
	} while (len);

	handle->offset = offset;

	/*
	 * Check we didn't copy past our reservation window, taking the
	 * possible unsigned int wrap into account.
	 */
	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
}

int perf_output_begin(struct perf_output_handle *handle,
		      struct perf_event *event, unsigned int size,
		      int nmi, int sample)
{
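	/*
	 * Reserve 'size' bytes in the mmap buffer: redirect inherited
	 * events to their parent (and honour event->output), claim
	 * space with a cmpxchg loop on data->head, and emit a
	 * PERF_RECORD_LOST record if earlier output was dropped.
	 */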
	struct perf_event *output_event;
	struct perf_mmap_data *data;
	unsigned long tail, offset, head;
	int have_lost;
	struct {
		struct perf_event_header header;
		u64			 id;
		u64			 lost;
	} lost_event;

	rcu_read_lock();
	/*
	 * For inherited events we send all the output towards the parent.
	 */
	if (event->parent)
		event = event->parent;

	output_event = rcu_dereference(event->output);
	if (output_event)
		event = output_event;

	data = rcu_dereference(event->data);
	if (!data)
		goto out;

2688
	handle->data	= data;
2689
	handle->event	= event;
2690 2691
	handle->nmi	= nmi;
	handle->sample	= sample;
2692

2693
	if (!data->nr_pages)
2694
		goto fail;
2695

2696 2697 2698 2699
	have_lost = atomic_read(&data->lost);
	if (have_lost)
		size += sizeof(lost_event);

2700 2701
	perf_output_lock(handle);

2702
	do {
		/*
		 * Userspace could choose to issue a mb() before updating the
		 * tail pointer, so that all reads will be completed before the
		 * write is issued.
		 */
		tail = ACCESS_ONCE(data->user_page->data_tail);
		smp_rmb();
2710
		offset = head = atomic_long_read(&data->head);
		head += size;
2712
		if (unlikely(!perf_output_space(data, tail, offset, head)))
2713
			goto fail;
2714
	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2715

2716
	handle->offset	= offset;
2717
	handle->head	= head;
2718

2719
	if (head - tail > data->watermark)
2720
		atomic_set(&data->wakeup, 1);
2721

2722
	if (have_lost) {
2723
		lost_event.header.type = PERF_RECORD_LOST;
2724 2725
		lost_event.header.misc = 0;
		lost_event.header.size = sizeof(lost_event);
2726
		lost_event.id          = event->id;
2727 2728 2729 2730 2731
		lost_event.lost        = atomic_xchg(&data->lost, 0);

		perf_output_put(handle, lost_event);
	}

2732
	return 0;
2733

2734
fail:
2735 2736
	atomic_inc(&data->lost);
	perf_output_unlock(handle);
2737 2738
out:
	rcu_read_unlock();
2739

2740 2741
	return -ENOSPC;
}
2742

2743
void perf_output_end(struct perf_output_handle *handle)
2744
{
2745
	struct perf_event *event = handle->event;
2746 2747
	struct perf_mmap_data *data = handle->data;

2748
	int wakeup_events = event->attr.wakeup_events;

2750
	if (handle->sample && wakeup_events) {
2751
		int events = atomic_inc_return(&data->events);
		if (events >= wakeup_events) {
2753
			atomic_sub(wakeup_events, &data->events);
2754
			atomic_set(&data->wakeup, 1);
		}
2756 2757 2758
	}

	perf_output_unlock(handle);
2759
	rcu_read_unlock();
2760 2761
}

2762
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2763 2764
{
	/*
2765
	 * only top level events have the pid namespace they were created in
2766
	 */
2767 2768
	if (event->parent)
		event = event->parent;
2769

2770
	return task_tgid_nr_ns(p, event->ns);
2771 2772
}

2773
static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2774 2775
{
	/*
2776
	 * only top level events have the pid namespace they were created in
2777
	 */
2778 2779
	if (event->parent)
		event = event->parent;
2780

2781
	return task_pid_nr_ns(p, event->ns);
2782 2783
}

2784
static void perf_output_read_one(struct perf_output_handle *handle,
2785
				 struct perf_event *event)
2786
{
2787
	u64 read_format = event->attr.read_format;
2788 2789 2790
	u64 values[4];
	int n = 0;

2791
	values[n++] = atomic64_read(&event->count);
2792
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2793 2794
		values[n++] = event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
2795 2796
	}
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2797 2798
		values[n++] = event->total_time_running +
			atomic64_read(&event->child_total_time_running);
2799 2800
	}
	if (read_format & PERF_FORMAT_ID)
2801
		values[n++] = primary_event_id(event);
2802 2803 2804 2805 2806

	perf_output_copy(handle, values, n * sizeof(u64));
}

/*
2807
 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2808 2809
 */
static void perf_output_read_group(struct perf_output_handle *handle,
2810
			    struct perf_event *event)
2811
{
2812 2813
	struct perf_event *leader = event->group_leader, *sub;
	u64 read_format = event->attr.read_format;
2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824
	u64 values[5];
	int n = 0;

	values[n++] = 1 + leader->nr_siblings;

	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		values[n++] = leader->total_time_enabled;

	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		values[n++] = leader->total_time_running;

2825
	if (leader != event)
2826 2827 2828 2829
		leader->pmu->read(leader);

	values[n++] = atomic64_read(&leader->count);
	if (read_format & PERF_FORMAT_ID)
2830
		values[n++] = primary_event_id(leader);
2831 2832 2833

	perf_output_copy(handle, values, n * sizeof(u64));

2834
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2835 2836
		n = 0;

2837
		if (sub != event)
2838 2839 2840 2841
			sub->pmu->read(sub);

		values[n++] = atomic64_read(&sub->count);
		if (read_format & PERF_FORMAT_ID)
2842
			values[n++] = primary_event_id(sub);
2843 2844 2845 2846 2847 2848

		perf_output_copy(handle, values, n * sizeof(u64));
	}
}

static void perf_output_read(struct perf_output_handle *handle,
2849
			     struct perf_event *event)
2850
{
2851 2852
	if (event->attr.read_format & PERF_FORMAT_GROUP)
		perf_output_read_group(handle, event);
2853
	else
2854
		perf_output_read_one(handle, event);
2855 2856
}

2857 2858 2859
void perf_output_sample(struct perf_output_handle *handle,
			struct perf_event_header *header,
			struct perf_sample_data *data,
2860
			struct perf_event *event)
2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890
{
	u64 sample_type = data->type;

	perf_output_put(handle, *header);

	if (sample_type & PERF_SAMPLE_IP)
		perf_output_put(handle, data->ip);

	if (sample_type & PERF_SAMPLE_TID)
		perf_output_put(handle, data->tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		perf_output_put(handle, data->time);

	if (sample_type & PERF_SAMPLE_ADDR)
		perf_output_put(handle, data->addr);

	if (sample_type & PERF_SAMPLE_ID)
		perf_output_put(handle, data->id);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		perf_output_put(handle, data->stream_id);

	if (sample_type & PERF_SAMPLE_CPU)
		perf_output_put(handle, data->cpu_entry);

	if (sample_type & PERF_SAMPLE_PERIOD)
		perf_output_put(handle, data->period);

	if (sample_type & PERF_SAMPLE_READ)
2891
		perf_output_read(handle, event);
2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928

	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
		if (data->callchain) {
			int size = 1;

			if (data->callchain)
				size += data->callchain->nr;

			size *= sizeof(u64);

			perf_output_copy(handle, data->callchain, size);
		} else {
			u64 nr = 0;
			perf_output_put(handle, nr);
		}
	}

	if (sample_type & PERF_SAMPLE_RAW) {
		if (data->raw) {
			perf_output_put(handle, data->raw->size);
			perf_output_copy(handle, data->raw->data,
					 data->raw->size);
		} else {
			struct {
				u32	size;
				u32	data;
			} raw = {
				.size = sizeof(u32),
				.data = 0,
			};
			perf_output_put(handle, raw);
		}
	}
}

void perf_prepare_sample(struct perf_event_header *header,
			 struct perf_sample_data *data,
2929
			 struct perf_event *event,
2930
			 struct pt_regs *regs)
2931
{
2932
	u64 sample_type = event->attr.sample_type;
2933

2934
	data->type = sample_type;
2935

2936
	header->type = PERF_RECORD_SAMPLE;
2937 2938 2939 2940
	header->size = sizeof(*header);

	header->misc = 0;
	header->misc |= perf_misc_flags(regs);
2941

2942
	if (sample_type & PERF_SAMPLE_IP) {
2943 2944 2945
		data->ip = perf_instruction_pointer(regs);

		header->size += sizeof(data->ip);
2946
	}
2947

2948
	if (sample_type & PERF_SAMPLE_TID) {
2949
		/* namespace issues */
2950 2951
		data->tid_entry.pid = perf_event_pid(event, current);
		data->tid_entry.tid = perf_event_tid(event, current);
2952

2953
		header->size += sizeof(data->tid_entry);
2954 2955
	}

2956
	if (sample_type & PERF_SAMPLE_TIME) {
P
Peter Zijlstra 已提交
2957
		data->time = perf_clock();
2958

2959
		header->size += sizeof(data->time);
2960 2961
	}

2962
	if (sample_type & PERF_SAMPLE_ADDR)
2963
		header->size += sizeof(data->addr);
2964

2965
	if (sample_type & PERF_SAMPLE_ID) {
2966
		data->id = primary_event_id(event);
2967

2968 2969 2970 2971
		header->size += sizeof(data->id);
	}

	if (sample_type & PERF_SAMPLE_STREAM_ID) {
2972
		data->stream_id = event->id;
2973 2974 2975

		header->size += sizeof(data->stream_id);
	}
2976

2977
	if (sample_type & PERF_SAMPLE_CPU) {
2978 2979
		data->cpu_entry.cpu		= raw_smp_processor_id();
		data->cpu_entry.reserved	= 0;
2980

2981
		header->size += sizeof(data->cpu_entry);
2982 2983
	}

2984
	if (sample_type & PERF_SAMPLE_PERIOD)
2985
		header->size += sizeof(data->period);
2986

2987
	if (sample_type & PERF_SAMPLE_READ)
2988
		header->size += perf_event_read_size(event);
2989

2990
	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2991
		int size = 1;
2992

2993 2994 2995 2996 2997 2998
		data->callchain = perf_callchain(regs);

		if (data->callchain)
			size += data->callchain->nr;

		header->size += size * sizeof(u64);
2999 3000
	}

3001
	if (sample_type & PERF_SAMPLE_RAW) {
3002 3003 3004 3005 3006 3007 3008 3009
		int size = sizeof(u32);

		if (data->raw)
			size += data->raw->size;
		else
			size += sizeof(u32);

		WARN_ON_ONCE(size & (sizeof(u64)-1));
3010
		header->size += size;
3011
	}
3012
}
3013

3014
static void perf_event_output(struct perf_event *event, int nmi,
3015 3016 3017 3018 3019
				struct perf_sample_data *data,
				struct pt_regs *regs)
{
	struct perf_output_handle handle;
	struct perf_event_header header;
3020

3021
	perf_prepare_sample(&header, data, event, regs);
P
Peter Zijlstra 已提交
3022

3023
	if (perf_output_begin(&handle, event, header.size, nmi, 1))
3024
		return;
3025

3026
	perf_output_sample(&handle, &header, data, event);
3027

3028
	perf_output_end(&handle);
3029 3030
}

3031
/*
3032
 * read event_id
3033 3034 3035 3036 3037 3038 3039 3040 3041 3042
 */

struct perf_read_event {
	struct perf_event_header	header;

	u32				pid;
	u32				tid;
};

static void
3043
perf_event_read_event(struct perf_event *event,
3044 3045 3046
			struct task_struct *task)
{
	struct perf_output_handle handle;
3047
	struct perf_read_event read_event = {
3048
		.header = {
3049
			.type = PERF_RECORD_READ,
3050
			.misc = 0,
3051
			.size = sizeof(read_event) + perf_event_read_size(event),
3052
		},
3053 3054
		.pid = perf_event_pid(event, task),
		.tid = perf_event_tid(event, task),
3055
	};
3056
	int ret;
3057

3058
	ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3059 3060 3061
	if (ret)
		return;

3062
	perf_output_put(&handle, read_event);
3063
	perf_output_read(&handle, event);
3064

3065 3066 3067
	perf_output_end(&handle);
}

/*
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.task
 */

struct perf_task_event {
3075
	struct task_struct		*task;
3076
	struct perf_event_context	*task_ctx;
P
Peter Zijlstra 已提交
3077 3078 3079 3080 3081 3082

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				ppid;
P
Peter Zijlstra 已提交
3083 3084
		u32				tid;
		u32				ptid;
3085
		u64				time;
3086
	} event_id;
P
Peter Zijlstra 已提交
3087 3088
};

3089
static void perf_event_task_output(struct perf_event *event,
P
Peter Zijlstra 已提交
3090
				     struct perf_task_event *task_event)
P
Peter Zijlstra 已提交
3091 3092
{
	struct perf_output_handle handle;
3093
	int size;
P
Peter Zijlstra 已提交
3094
	struct task_struct *task = task_event->task;
3095 3096
	int ret;

3097 3098
	size  = task_event->event_id.header.size;
	ret = perf_output_begin(&handle, event, size, 0, 0);
P
Peter Zijlstra 已提交
3099 3100 3101 3102

	if (ret)
		return;

3103 3104
	task_event->event_id.pid = perf_event_pid(event, task);
	task_event->event_id.ppid = perf_event_pid(event, current);
P
Peter Zijlstra 已提交
3105

3106 3107
	task_event->event_id.tid = perf_event_tid(event, task);
	task_event->event_id.ptid = perf_event_tid(event, current);
P
Peter Zijlstra 已提交
3108

3109
	task_event->event_id.time = perf_clock();
3110

3111
	perf_output_put(&handle, task_event->event_id);
3112

P
Peter Zijlstra 已提交
3113 3114 3115
	perf_output_end(&handle);
}

3116
static int perf_event_task_match(struct perf_event *event)
P
Peter Zijlstra 已提交
3117
{
3118
	if (event->attr.comm || event->attr.mmap || event->attr.task)
P
Peter Zijlstra 已提交
3119 3120 3121 3122 3123
		return 1;

	return 0;
}

3124
static void perf_event_task_ctx(struct perf_event_context *ctx,
P
Peter Zijlstra 已提交
3125
				  struct perf_task_event *task_event)
P
Peter Zijlstra 已提交
3126
{
3127
	struct perf_event *event;
P
Peter Zijlstra 已提交
3128 3129 3130 3131 3132

	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
		return;

	rcu_read_lock();
3133 3134 3135
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_task_match(event))
			perf_event_task_output(event, task_event);
P
Peter Zijlstra 已提交
3136 3137 3138 3139
	}
	rcu_read_unlock();
}

3140
static void perf_event_task_event(struct perf_task_event *task_event)
P
Peter Zijlstra 已提交
3141 3142
{
	struct perf_cpu_context *cpuctx;
3143
	struct perf_event_context *ctx = task_event->task_ctx;
P
Peter Zijlstra 已提交
3144 3145

	cpuctx = &get_cpu_var(perf_cpu_context);
3146
	perf_event_task_ctx(&cpuctx->ctx, task_event);
P
Peter Zijlstra 已提交
3147 3148 3149
	put_cpu_var(perf_cpu_context);

	rcu_read_lock();
3150
	if (!ctx)
3151
		ctx = rcu_dereference(task_event->task->perf_event_ctxp);
P
Peter Zijlstra 已提交
3152
	if (ctx)
3153
		perf_event_task_ctx(ctx, task_event);
P
Peter Zijlstra 已提交
3154 3155 3156
	rcu_read_unlock();
}

3157 3158
static void perf_event_task(struct task_struct *task,
			      struct perf_event_context *task_ctx,
3159
			      int new)
P
Peter Zijlstra 已提交
3160
{
P
Peter Zijlstra 已提交
3161
	struct perf_task_event task_event;
P
Peter Zijlstra 已提交
3162

3163 3164 3165
	if (!atomic_read(&nr_comm_events) &&
	    !atomic_read(&nr_mmap_events) &&
	    !atomic_read(&nr_task_events))
P
Peter Zijlstra 已提交
3166 3167
		return;

P
Peter Zijlstra 已提交
3168
	task_event = (struct perf_task_event){
3169 3170
		.task	  = task,
		.task_ctx = task_ctx,
3171
		.event_id    = {
P
Peter Zijlstra 已提交
3172
			.header = {
3173
				.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3174
				.misc = 0,
3175
				.size = sizeof(task_event.event_id),
P
Peter Zijlstra 已提交
3176
			},
3177 3178
			/* .pid  */
			/* .ppid */
P
Peter Zijlstra 已提交
3179 3180
			/* .tid  */
			/* .ptid */
P
Peter Zijlstra 已提交
3181 3182 3183
		},
	};

3184
	perf_event_task_event(&task_event);
P
Peter Zijlstra 已提交
3185 3186
}

3187
void perf_event_fork(struct task_struct *task)
P
Peter Zijlstra 已提交
3188
{
3189
	perf_event_task(task, NULL, 1);
P
Peter Zijlstra 已提交
3190 3191
}

3192 3193 3194 3195 3196
/*
 * comm tracking
 */

struct perf_comm_event {
3197 3198
	struct task_struct	*task;
	char			*comm;
3199 3200 3201 3202 3203 3204 3205
	int			comm_size;

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
3206
	} event_id;
3207 3208
};

3209
static void perf_event_comm_output(struct perf_event *event,
3210 3211 3212
				     struct perf_comm_event *comm_event)
{
	struct perf_output_handle handle;
3213 3214
	int size = comm_event->event_id.header.size;
	int ret = perf_output_begin(&handle, event, size, 0, 0);
3215 3216 3217 3218

	if (ret)
		return;

3219 3220
	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3221

3222
	perf_output_put(&handle, comm_event->event_id);
3223 3224 3225 3226 3227
	perf_output_copy(&handle, comm_event->comm,
				   comm_event->comm_size);
	perf_output_end(&handle);
}

3228
static int perf_event_comm_match(struct perf_event *event)
3229
{
3230
	if (event->attr.comm)
3231 3232 3233 3234 3235
		return 1;

	return 0;
}

3236
static void perf_event_comm_ctx(struct perf_event_context *ctx,
3237 3238
				  struct perf_comm_event *comm_event)
{
3239
	struct perf_event *event;
3240 3241 3242 3243 3244

	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
		return;

	rcu_read_lock();
3245 3246 3247
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_comm_match(event))
			perf_event_comm_output(event, comm_event);
3248 3249 3250 3251
	}
	rcu_read_unlock();
}

3252
static void perf_event_comm_event(struct perf_comm_event *comm_event)
3253 3254
{
	struct perf_cpu_context *cpuctx;
3255
	struct perf_event_context *ctx;
3256
	unsigned int size;
3257
	char comm[TASK_COMM_LEN];
3258

3259 3260
	memset(comm, 0, sizeof(comm));
	strncpy(comm, comm_event->task->comm, sizeof(comm));
3261
	size = ALIGN(strlen(comm)+1, sizeof(u64));
3262 3263 3264 3265

	comm_event->comm = comm;
	comm_event->comm_size = size;

3266
	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3267 3268

	cpuctx = &get_cpu_var(perf_cpu_context);
3269
	perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3270
	put_cpu_var(perf_cpu_context);
3271 3272 3273 3274 3275 3276

	rcu_read_lock();
	/*
	 * doesn't really matter which of the child contexts the
	 * event ends up in.
	 */
3277
	ctx = rcu_dereference(current->perf_event_ctxp);
3278
	if (ctx)
3279
		perf_event_comm_ctx(ctx, comm_event);
3280
	rcu_read_unlock();
3281 3282
}

3283
void perf_event_comm(struct task_struct *task)
3284
{
3285 3286
	struct perf_comm_event comm_event;

3287 3288
	if (task->perf_event_ctxp)
		perf_event_enable_on_exec(task);
3289

3290
	if (!atomic_read(&nr_comm_events))
3291
		return;
3292

3293
	comm_event = (struct perf_comm_event){
3294
		.task	= task,
3295 3296
		/* .comm      */
		/* .comm_size */
3297
		.event_id  = {
3298
			.header = {
3299
				.type = PERF_RECORD_COMM,
3300 3301 3302 3303 3304
				.misc = 0,
				/* .size */
			},
			/* .pid */
			/* .tid */
3305 3306 3307
		},
	};

3308
	perf_event_comm_event(&comm_event);
3309 3310
}

3311 3312 3313 3314 3315
/*
 * mmap tracking
 */

struct perf_mmap_event {
3316 3317 3318 3319
	struct vm_area_struct	*vma;

	const char		*file_name;
	int			file_size;
3320 3321 3322 3323 3324 3325 3326 3327 3328

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
		u64				start;
		u64				len;
		u64				pgoff;
3329
	} event_id;
3330 3331
};

3332
static void perf_event_mmap_output(struct perf_event *event,
3333 3334 3335
				     struct perf_mmap_event *mmap_event)
{
	struct perf_output_handle handle;
3336 3337
	int size = mmap_event->event_id.header.size;
	int ret = perf_output_begin(&handle, event, size, 0, 0);
3338 3339 3340 3341

	if (ret)
		return;

3342 3343
	mmap_event->event_id.pid = perf_event_pid(event, current);
	mmap_event->event_id.tid = perf_event_tid(event, current);
3344

3345
	perf_output_put(&handle, mmap_event->event_id);
3346 3347
	perf_output_copy(&handle, mmap_event->file_name,
				   mmap_event->file_size);
3348
	perf_output_end(&handle);
3349 3350
}

3351
static int perf_event_mmap_match(struct perf_event *event,
3352 3353
				   struct perf_mmap_event *mmap_event)
{
3354
	if (event->attr.mmap)
3355 3356 3357 3358 3359
		return 1;

	return 0;
}

3360
static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3361 3362
				  struct perf_mmap_event *mmap_event)
{
3363
	struct perf_event *event;
3364 3365 3366 3367 3368

	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
		return;

	rcu_read_lock();
3369 3370 3371
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_mmap_match(event, mmap_event))
			perf_event_mmap_output(event, mmap_event);
3372 3373 3374 3375
	}
	rcu_read_unlock();
}

3376
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3377 3378
{
	struct perf_cpu_context *cpuctx;
3379
	struct perf_event_context *ctx;
3380 3381
	struct vm_area_struct *vma = mmap_event->vma;
	struct file *file = vma->vm_file;
3382 3383 3384
	unsigned int size;
	char tmp[16];
	char *buf = NULL;
3385
	const char *name;
3386

3387 3388
	memset(tmp, 0, sizeof(tmp));

3389
	if (file) {
3390 3391 3392 3393 3394 3395
		/*
		 * d_path works from the end of the buffer backwards, so we
		 * need to add enough zero bytes after the string to handle
		 * the 64bit alignment we do later.
		 */
		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3396 3397 3398 3399
		if (!buf) {
			name = strncpy(tmp, "//enomem", sizeof(tmp));
			goto got_name;
		}
3400
		name = d_path(&file->f_path, buf, PATH_MAX);
3401 3402 3403 3404 3405
		if (IS_ERR(name)) {
			name = strncpy(tmp, "//toolong", sizeof(tmp));
			goto got_name;
		}
	} else {
3406 3407 3408
		if (arch_vma_name(mmap_event->vma)) {
			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
				       sizeof(tmp));
3409
			goto got_name;
3410
		}
3411 3412 3413 3414 3415 3416

		if (!vma->vm_mm) {
			name = strncpy(tmp, "[vdso]", sizeof(tmp));
			goto got_name;
		}

3417 3418 3419 3420 3421
		name = strncpy(tmp, "//anon", sizeof(tmp));
		goto got_name;
	}

got_name:
3422
	size = ALIGN(strlen(name)+1, sizeof(u64));
3423 3424 3425 3426

	mmap_event->file_name = name;
	mmap_event->file_size = size;

3427
	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3428 3429

	cpuctx = &get_cpu_var(perf_cpu_context);
3430
	perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3431 3432
	put_cpu_var(perf_cpu_context);

3433 3434 3435 3436 3437
	rcu_read_lock();
	/*
	 * doesn't really matter which of the child contexts the
	 * event ends up in.
	 */
3438
	ctx = rcu_dereference(current->perf_event_ctxp);
3439
	if (ctx)
3440
		perf_event_mmap_ctx(ctx, mmap_event);
3441 3442
	rcu_read_unlock();

3443 3444 3445
	kfree(buf);
}

3446
void __perf_event_mmap(struct vm_area_struct *vma)
3447
{
3448 3449
	struct perf_mmap_event mmap_event;

3450
	if (!atomic_read(&nr_mmap_events))
3451 3452 3453
		return;

	mmap_event = (struct perf_mmap_event){
3454
		.vma	= vma,
3455 3456
		/* .file_name */
		/* .file_size */
3457
		.event_id  = {
3458
			.header = {
3459
				.type = PERF_RECORD_MMAP,
3460 3461 3462 3463 3464
				.misc = 0,
				/* .size */
			},
			/* .pid */
			/* .tid */
3465 3466 3467
			.start  = vma->vm_start,
			.len    = vma->vm_end - vma->vm_start,
			.pgoff  = vma->vm_pgoff,
3468 3469 3470
		},
	};

3471
	perf_event_mmap_event(&mmap_event);
3472 3473
}

3474 3475 3476 3477
/*
 * IRQ throttle logging
 */

3478
static void perf_log_throttle(struct perf_event *event, int enable)
3479 3480 3481 3482 3483 3484 3485
{
	struct perf_output_handle handle;
	int ret;

	struct {
		struct perf_event_header	header;
		u64				time;
3486
		u64				id;
3487
		u64				stream_id;
3488 3489
	} throttle_event = {
		.header = {
3490
			.type = PERF_RECORD_THROTTLE,
3491 3492 3493
			.misc = 0,
			.size = sizeof(throttle_event),
		},
P
Peter Zijlstra 已提交
3494
		.time		= perf_clock(),
3495 3496
		.id		= primary_event_id(event),
		.stream_id	= event->id,
3497 3498
	};

3499
	if (enable)
3500
		throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3501

3502
	ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3503 3504 3505 3506 3507 3508 3509
	if (ret)
		return;

	perf_output_put(&handle, throttle_event);
	perf_output_end(&handle);
}

3510
/*
3511
 * Generic event overflow handling, sampling.
3512 3513
 */

3514
static int __perf_event_overflow(struct perf_event *event, int nmi,
3515 3516
				   int throttle, struct perf_sample_data *data,
				   struct pt_regs *regs)
3517
{
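	/*
	 * Generic overflow path: throttle events that interrupt too
	 * often, re-adjust the period for freq-based events, honour
	 * the user-set event_limit, then write out the sample.
	 */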
	int events = atomic_read(&event->event_limit);
	struct hw_perf_event *hwc = &event->hw;
	int ret = 0;

	throttle = (throttle && event->pmu->unthrottle != NULL);

	if (!throttle) {
3525
		hwc->interrupts++;
3526
	} else {
3527 3528
		if (hwc->interrupts != MAX_INTERRUPTS) {
			hwc->interrupts++;
3529
			if (HZ * hwc->interrupts >
3530
					(u64)sysctl_perf_event_sample_rate) {
3531
				hwc->interrupts = MAX_INTERRUPTS;
3532
				perf_log_throttle(event, 0);
3533 3534 3535 3536
				ret = 1;
			}
		} else {
			/*
3537
			 * Keep re-disabling events even though on the previous
3538
			 * pass we disabled it - just in case we raced with a
3539
			 * sched-in and the event got enabled again:
3540
			 */
3541 3542 3543
			ret = 1;
		}
	}
3544

3545
	if (event->attr.freq) {
P
Peter Zijlstra 已提交
3546
		u64 now = perf_clock();
3547 3548 3549 3550 3551
		s64 delta = now - hwc->freq_stamp;

		hwc->freq_stamp = now;

		if (delta > 0 && delta < TICK_NSEC)
3552
			perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3553 3554
	}

3555 3556
	/*
	 * XXX event_limit might not quite work as expected on inherited
3557
	 * events
3558 3559
	 */

3560 3561
	event->pending_kill = POLL_IN;
	if (events && atomic_dec_and_test(&event->event_limit)) {
3562
		ret = 1;
3563
		event->pending_kill = POLL_HUP;
3564
		if (nmi) {
3565 3566 3567
			event->pending_disable = 1;
			perf_pending_queue(&event->pending,
					   perf_pending_event);
3568
		} else
3569
			perf_event_disable(event);
3570 3571
	}

3572
	perf_event_output(event, nmi, data, regs);
3573
	return ret;
3574 3575
}

3576
int perf_event_overflow(struct perf_event *event, int nmi,
3577 3578
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
3579
{
3580
	return __perf_event_overflow(event, nmi, 1, data, regs);
3581 3582
}

3583
/*
3584
 * Generic software event infrastructure
3585 3586
 */

3587
/*
3588 3589
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period event
3590 3591 3592 3593
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 */

3594
static u64 perf_swevent_set_period(struct perf_event *event)
3595
{
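	/*
	 * Advance period_left by whole sample periods and return how
	 * many periods elapsed, i.e. how many overflows to report.
	 */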
	struct hw_perf_event *hwc = &event->hw;
	u64 period = hwc->last_period;
	u64 nr, offset;
	s64 old, val;

	hwc->last_period = hwc->sample_period;

again:
	old = val = atomic64_read(&hwc->period_left);
	if (val < 0)
		return 0;

	nr = div64_u64(period + val, period);
	offset = nr * period;
	val -= offset;
	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
		goto again;

	return nr;
}

3617
static void perf_swevent_overflow(struct perf_event *event,
3618 3619
				    int nmi, struct perf_sample_data *data,
				    struct pt_regs *regs)
3620
{
3621
	struct hw_perf_event *hwc = &event->hw;
3622
	int throttle = 0;
3623
	u64 overflow;
3624

3625 3626
	data->period = event->hw.last_period;
	overflow = perf_swevent_set_period(event);
3627

3628 3629
	if (hwc->interrupts == MAX_INTERRUPTS)
		return;
3630

3631
	for (; overflow; overflow--) {
3632
		if (__perf_event_overflow(event, nmi, throttle,
3633
					    data, regs)) {
3634 3635 3636 3637 3638 3639
			/*
			 * We inhibit the overflow from happening when
			 * hwc->interrupts == MAX_INTERRUPTS.
			 */
			break;
		}
3640
		throttle = 1;
3641
	}
3642 3643
}

3644
static void perf_swevent_unthrottle(struct perf_event *event)
3645 3646
{
	/*
3647
	 * Nothing to do, we already reset hwc->interrupts.
3648
	 */
3649
}
3650

3651
static void perf_swevent_add(struct perf_event *event, u64 nr,
3652 3653
			       int nmi, struct perf_sample_data *data,
			       struct pt_regs *regs)
3654
{
3655
	struct hw_perf_event *hwc = &event->hw;
3656

3657
	atomic64_add(nr, &event->count);
3658

3659 3660
	if (!hwc->sample_period)
		return;
3661

3662
	if (!regs)
3663
		return;
3664

3665
	if (!atomic64_add_negative(nr, &hwc->period_left))
3666
		perf_swevent_overflow(event, nmi, data, regs);
3667 3668
}

3669
static int perf_swevent_is_counting(struct perf_event *event)
3670
{
3671
	/*
3672
	 * The event is active, we're good!
3673
	 */
3674
	if (event->state == PERF_EVENT_STATE_ACTIVE)
3675 3676
		return 1;

3677
	/*
3678
	 * The event is off/error, not counting.
3679
	 */
3680
	if (event->state != PERF_EVENT_STATE_INACTIVE)
3681 3682 3683
		return 0;

	/*
3684
	 * The event is inactive, if the context is active
3685 3686
	 * we're part of a group that didn't make it on the 'pmu',
	 * not counting.
3687
	 */
3688
	if (event->ctx->is_active)
3689 3690 3691 3692 3693 3694 3695 3696
		return 0;

	/*
	 * We're inactive and the context is too, this means the
	 * task is scheduled out, we're counting events that happen
	 * to us, like migration events.
	 */
	return 1;
3697 3698
}

3699
static int perf_swevent_match(struct perf_event *event,
P
Peter Zijlstra 已提交
3700
				enum perf_type_id type,
3701
				u32 event_id, struct pt_regs *regs)
3702
{
3703
	if (!perf_swevent_is_counting(event))
3704 3705
		return 0;

3706
	if (event->attr.type != type)
3707
		return 0;
3708
	if (event->attr.config != event_id)
3709 3710
		return 0;

3711
	if (regs) {
3712
		if (event->attr.exclude_user && user_mode(regs))
3713
			return 0;
3714

3715
		if (event->attr.exclude_kernel && !user_mode(regs))
3716 3717
			return 0;
	}
3718 3719 3720 3721

	return 1;
}

3722
static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3723
				     enum perf_type_id type,
3724
				     u32 event_id, u64 nr, int nmi,
3725 3726
				     struct perf_sample_data *data,
				     struct pt_regs *regs)
3727
{
3728
	struct perf_event *event;
3729

3730
	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3731 3732
		return;

P
Peter Zijlstra 已提交
3733
	rcu_read_lock();
3734 3735 3736
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_swevent_match(event, type, event_id, regs))
			perf_swevent_add(event, nr, nmi, data, regs);
3737
	}
P
Peter Zijlstra 已提交
3738
	rcu_read_unlock();
3739 3740
}

3741
static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
P
Peter Zijlstra 已提交
3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754
{
	if (in_nmi())
		return &cpuctx->recursion[3];

	if (in_irq())
		return &cpuctx->recursion[2];

	if (in_softirq())
		return &cpuctx->recursion[1];

	return &cpuctx->recursion[0];
}

3755
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3756
				    u64 nr, int nmi,
3757 3758
				    struct perf_sample_data *data,
				    struct pt_regs *regs)
3759 3760
{
	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3761 3762
	int *recursion = perf_swevent_recursion_context(cpuctx);
	struct perf_event_context *ctx;
P
Peter Zijlstra 已提交
3763 3764 3765 3766 3767 3768

	if (*recursion)
		goto out;

	(*recursion)++;
	barrier();
3769

3770
	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3771
				 nr, nmi, data, regs);
3772 3773 3774 3775 3776
	rcu_read_lock();
	/*
	 * doesn't really matter which of the child contexts the
	 * event ends up in.
	 */
3777
	ctx = rcu_dereference(current->perf_event_ctxp);
3778
	if (ctx)
3779
		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3780
	rcu_read_unlock();
3781

P
Peter Zijlstra 已提交
3782 3783 3784 3785
	barrier();
	(*recursion)--;

out:
3786 3787 3788
	put_cpu_var(perf_cpu_context);
}

3789
void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3790
			    struct pt_regs *regs, u64 addr)
3791
{
3792 3793 3794 3795
	struct perf_sample_data data = {
		.addr = addr,
	};

3796
	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
3797
				&data, regs);
3798 3799
}

3800
static void perf_swevent_read(struct perf_event *event)
3801 3802 3803
{
}

3804
static int perf_swevent_enable(struct perf_event *event)
3805
{
3806
	struct hw_perf_event *hwc = &event->hw;
3807 3808 3809

	if (hwc->sample_period) {
		hwc->last_period = hwc->sample_period;
3810
		perf_swevent_set_period(event);
3811
	}
3812 3813 3814
	return 0;
}

3815
static void perf_swevent_disable(struct perf_event *event)
3816 3817 3818
{
}

3819
static const struct pmu perf_ops_generic = {
3820 3821 3822 3823
	.enable		= perf_swevent_enable,
	.disable	= perf_swevent_disable,
	.read		= perf_swevent_read,
	.unthrottle	= perf_swevent_unthrottle,
3824 3825
};

3826
/*
3827
 * hrtimer based swevent callback
3828 3829
 */

3830
static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3831 3832 3833
{
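	/*
	 * Runs once per sample period for the hrtimer-based software
	 * clocks: read the event, report an overflow and forward the
	 * timer by another period (at least 10us).
	 */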
	enum hrtimer_restart ret = HRTIMER_RESTART;
	struct perf_sample_data data;
	struct pt_regs *regs;
	struct perf_event *event;
	u64 period;

	event	= container_of(hrtimer, struct perf_event, hw.hrtimer);
	event->pmu->read(event);

	data.addr = 0;
	regs = get_irq_regs();
3843 3844 3845 3846
	/*
	 * In case we exclude kernel IPs or are somehow not in interrupt
	 * context, provide the next best thing, the user IP.
	 */
3847 3848
	if ((event->attr.exclude_kernel || !regs) &&
			!event->attr.exclude_user)
3849
		regs = task_pt_regs(current);
3850

3851
	if (regs) {
3852
		if (perf_event_overflow(event, 0, &data, regs))
3853 3854 3855
			ret = HRTIMER_NORESTART;
	}

3856
	period = max_t(u64, 10000, event->hw.sample_period);
3857 3858 3859 3860 3861
	hrtimer_forward_now(hrtimer, ns_to_ktime(period));

	return ret;
}

3862
/*
3863
 * Software event: cpu wall time clock
3864 3865
 */

static void cpu_clock_perf_event_update(struct perf_event *event)
{
	int cpu = raw_smp_processor_id();
	s64 prev;
	u64 now;

	now = cpu_clock(cpu);
	prev = atomic64_read(&event->hw.prev_count);
	atomic64_set(&event->hw.prev_count, now);
	atomic64_add(now - prev, &event->count);
}

static int cpu_clock_perf_event_enable(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int cpu = raw_smp_processor_id();

	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swevent_hrtimer;
	if (hwc->sample_period) {
		u64 period = max_t(u64, 10000, hwc->sample_period);
		__hrtimer_start_range_ns(&hwc->hrtimer,
				ns_to_ktime(period), 0,
				HRTIMER_MODE_REL, 0);
	}

	return 0;
}

static void cpu_clock_perf_event_disable(struct perf_event *event)
{
	if (event->hw.sample_period)
		hrtimer_cancel(&event->hw.hrtimer);
	cpu_clock_perf_event_update(event);
}

static void cpu_clock_perf_event_read(struct perf_event *event)
{
	cpu_clock_perf_event_update(event);
}

static const struct pmu perf_ops_cpu_clock = {
	.enable		= cpu_clock_perf_event_enable,
	.disable	= cpu_clock_perf_event_disable,
	.read		= cpu_clock_perf_event_read,
};

/*
 * Software event: task time clock
 */

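/*
 * Advance event->count by the context time elapsed since the last update.
 */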
static void task_clock_perf_event_update(struct perf_event *event, u64 now)
{
	u64 prev;
	s64 delta;

	prev = atomic64_xchg(&event->hw.prev_count, now);
	delta = now - prev;
	atomic64_add(delta, &event->count);
}

static int task_clock_perf_event_enable(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 now;

	now = event->ctx->time;

	atomic64_set(&hwc->prev_count, now);
	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swevent_hrtimer;
	if (hwc->sample_period) {
		u64 period = max_t(u64, 10000, hwc->sample_period);
		__hrtimer_start_range_ns(&hwc->hrtimer,
				ns_to_ktime(period), 0,
				HRTIMER_MODE_REL, 0);
	}

	return 0;
}

static void task_clock_perf_event_disable(struct perf_event *event)
{
	if (event->hw.sample_period)
		hrtimer_cancel(&event->hw.hrtimer);
	task_clock_perf_event_update(event, event->ctx->time);
}

static void task_clock_perf_event_read(struct perf_event *event)
{
	u64 time;

	if (!in_nmi()) {
		update_context_time(event->ctx);
		time = event->ctx->time;
	} else {
		u64 now = perf_clock();
		u64 delta = now - event->ctx->timestamp;
		time = event->ctx->time + delta;
	}

	task_clock_perf_event_update(event, time);
}

static const struct pmu perf_ops_task_clock = {
	.enable		= task_clock_perf_event_enable,
	.disable	= task_clock_perf_event_disable,
	.read		= task_clock_perf_event_read,
};

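/*
 * Tracepoint events: samples are injected from the tracing code via
 * perf_tp_event() and reuse the generic software pmu.
 */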
#ifdef CONFIG_EVENT_PROFILE
void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
			  int entry_size)
{
	struct perf_raw_record raw = {
		.size = entry_size,
		.data = record,
	};

	struct perf_sample_data data = {
		.addr = addr,
		.raw = &raw,
	};

	struct pt_regs *regs = get_irq_regs();

	if (!regs)
		regs = task_pt_regs(current);

	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
				&data, regs);
}
EXPORT_SYMBOL_GPL(perf_tp_event);

extern int ftrace_profile_enable(int);
extern void ftrace_profile_disable(int);

static void tp_perf_event_destroy(struct perf_event *event)
{
	ftrace_profile_disable(event->attr.config);
}

static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
	/*
	 * Raw tracepoint data is a severe data leak, only allow root to
	 * have these.
	 */
	if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
			perf_paranoid_tracepoint_raw() &&
			!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	if (ftrace_profile_enable(event->attr.config))
		return NULL;

	event->destroy = tp_perf_event_destroy;

	return &perf_ops_generic;
}
#else
static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
	return NULL;
}
#endif

atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];

static void sw_perf_event_destroy(struct perf_event *event)
{
	u64 event_id = event->attr.config;

	WARN_ON(event->parent);

	atomic_dec(&perf_swevent_enabled[event_id]);
}

static const struct pmu *sw_perf_event_init(struct perf_event *event)
{
	const struct pmu *pmu = NULL;
	u64 event_id = event->attr.config;

	/*
	 * Software events (currently) can't in general distinguish
	 * between user, kernel and hypervisor events.
	 * However, context switches and cpu migrations are considered
	 * to be kernel events, and page faults are never hypervisor
	 * events.
	 */
	switch (event_id) {
	case PERF_COUNT_SW_CPU_CLOCK:
		pmu = &perf_ops_cpu_clock;

		break;
	case PERF_COUNT_SW_TASK_CLOCK:
		/*
		 * If the user instantiates this as a per-cpu event,
		 * use the cpu_clock event instead.
		 */
		if (event->ctx->task)
			pmu = &perf_ops_task_clock;
		else
			pmu = &perf_ops_cpu_clock;

		break;
	case PERF_COUNT_SW_PAGE_FAULTS:
	case PERF_COUNT_SW_PAGE_FAULTS_MIN:
	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
	case PERF_COUNT_SW_CONTEXT_SWITCHES:
	case PERF_COUNT_SW_CPU_MIGRATIONS:
		if (!event->parent) {
			atomic_inc(&perf_swevent_enabled[event_id]);
			event->destroy = sw_perf_event_destroy;
		}
		pmu = &perf_ops_generic;
		break;
	}

	return pmu;
}

/*
 * Allocate and initialize an event structure
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr,
		   int cpu,
		   struct perf_event_context *ctx,
		   struct perf_event *group_leader,
		   struct perf_event *parent_event,
		   gfp_t gfpflags)
{
	const struct pmu *pmu;
	struct perf_event *event;
	struct hw_perf_event *hwc;
	long err;

	event = kzalloc(sizeof(*event), gfpflags);
	if (!event)
		return ERR_PTR(-ENOMEM);

	/*
	 * Single events are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = event;

	mutex_init(&event->child_mutex);
	INIT_LIST_HEAD(&event->child_list);

	INIT_LIST_HEAD(&event->group_entry);
	INIT_LIST_HEAD(&event->event_entry);
	INIT_LIST_HEAD(&event->sibling_list);
	init_waitqueue_head(&event->waitq);

	mutex_init(&event->mmap_mutex);

	event->cpu		= cpu;
	event->attr		= *attr;
	event->group_leader	= group_leader;
	event->pmu		= NULL;
	event->ctx		= ctx;
	event->oncpu		= -1;

	event->parent		= parent_event;

	event->ns		= get_pid_ns(current->nsproxy->pid_ns);
	event->id		= atomic64_inc_return(&perf_event_id);

	event->state		= PERF_EVENT_STATE_INACTIVE;

	if (attr->disabled)
		event->state = PERF_EVENT_STATE_OFF;

	pmu = NULL;

	hwc = &event->hw;
	hwc->sample_period = attr->sample_period;
	if (attr->freq && attr->sample_freq)
		hwc->sample_period = 1;
	hwc->last_period = hwc->sample_period;

	atomic64_set(&hwc->period_left, hwc->sample_period);

	/*
	 * we currently do not support PERF_FORMAT_GROUP on inherited events
	 */
	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
		goto done;

	switch (attr->type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		pmu = hw_perf_event_init(event);
		break;

	case PERF_TYPE_SOFTWARE:
		pmu = sw_perf_event_init(event);
		break;

	case PERF_TYPE_TRACEPOINT:
		pmu = tp_perf_event_init(event);
		break;

	default:
		break;
	}
done:
	err = 0;
	if (!pmu)
		err = -EINVAL;
	else if (IS_ERR(pmu))
		err = PTR_ERR(pmu);

	if (err) {
		if (event->ns)
			put_pid_ns(event->ns);
		kfree(event);
		return ERR_PTR(err);
	}

	event->pmu = pmu;

	if (!event->parent) {
		atomic_inc(&nr_events);
		if (event->attr.mmap)
			atomic_inc(&nr_mmap_events);
		if (event->attr.comm)
			atomic_inc(&nr_comm_events);
		if (event->attr.task)
			atomic_inc(&nr_task_events);
	}

	return event;
}

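/*
 * Copy a versioned perf_event_attr from user space: an older (smaller)
 * struct is zero-extended, and a larger struct is accepted only if every
 * byte the kernel does not know about is zero.
 */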
static int perf_copy_attr(struct perf_event_attr __user *uattr,
			  struct perf_event_attr *attr)
{
	u32 size;
	int ret;

	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
		return -EFAULT;

	/*
	 * zero the full structure, so that a short copy will be nice.
	 */
	memset(attr, 0, sizeof(*attr));

	ret = get_user(size, &uattr->size);
	if (ret)
		return ret;

	if (size > PAGE_SIZE)	/* silly large */
		goto err_size;

	if (!size)		/* abi compat */
		size = PERF_ATTR_SIZE_VER0;

	if (size < PERF_ATTR_SIZE_VER0)
		goto err_size;

	/*
	 * If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
	if (size > sizeof(*attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		addr = (void __user *)uattr + sizeof(*attr);
		end  = (void __user *)uattr + size;

		for (; addr < end; addr++) {
			ret = get_user(val, addr);
			if (ret)
				return ret;
			if (val)
				goto err_size;
		}
		size = sizeof(*attr);
	}

	ret = copy_from_user(attr, uattr, size);
	if (ret)
		return -EFAULT;

	/*
	 * If the type exists, the corresponding creation will verify
	 * the attr->config.
	 */
	if (attr->type >= PERF_TYPE_MAX)
		return -EINVAL;

	if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
		return -EINVAL;

	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
		return -EINVAL;

	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
		return -EINVAL;

out:
	return ret;

err_size:
	put_user(sizeof(*attr), &uattr->size);
	ret = -E2BIG;
	goto out;
}

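/*
 * Redirect this event's output into the ring buffer of the event behind
 * output_fd; an output_fd of 0 clears any existing redirection.
 */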
int perf_event_set_output(struct perf_event *event, int output_fd)
{
	struct perf_event *output_event = NULL;
	struct file *output_file = NULL;
	struct perf_event *old_output;
	int fput_needed = 0;
	int ret = -EINVAL;

	if (!output_fd)
		goto set;

	output_file = fget_light(output_fd, &fput_needed);
	if (!output_file)
		return -EBADF;

	if (output_file->f_op != &perf_fops)
		goto out;

	output_event = output_file->private_data;

	/* Don't chain output fds */
	if (output_event->output)
		goto out;

	/* Don't set an output fd when we already have an output channel */
	if (event->data)
		goto out;

	atomic_long_inc(&output_file->f_count);

set:
	mutex_lock(&event->mmap_mutex);
	old_output = event->output;
	rcu_assign_pointer(event->output, output_event);
	mutex_unlock(&event->mmap_mutex);

	if (old_output) {
		/*
		 * we need to make sure no existing perf_output_*()
		 * is still referencing this event.
		 */
		synchronize_rcu();
		fput(old_output->filp);
	}

	ret = 0;
out:
	fput_light(output_file, fput_needed);
	return ret;
}

/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader event fd
 */
SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_event *event, *group_leader;
	struct perf_event_attr attr;
	struct perf_event_context *ctx;
	struct file *event_file = NULL;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int fput_needed2 = 0;
	int err;

	/* for future expandability... */
	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
		return -EINVAL;

	err = perf_copy_attr(attr_uptr, &attr);
	if (err)
		return err;

	if (!attr.exclude_kernel) {
		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	if (attr.freq) {
		if (attr.sample_freq > sysctl_perf_event_sample_rate)
			return -EINVAL;
	}

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/*
	 * Look up the group leader (we will attach this event to it):
	 */
	group_leader = NULL;
	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
		err = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto err_put_context;
		if (group_file->f_op != &perf_fops)
			goto err_put_context;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_put_context;
		/*
		 * Do not allow attaching to a group in a different
		 * task or CPU context:
		 */
		if (group_leader->ctx != ctx)
			goto err_put_context;
		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (attr.exclusive || attr.pinned)
			goto err_put_context;
	}

	event = perf_event_alloc(&attr, cpu, ctx, group_leader,
				     NULL, GFP_KERNEL);
	err = PTR_ERR(event);
	if (IS_ERR(event))
		goto err_put_context;

	err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
	if (err < 0)
		goto err_free_put_context;

	event_file = fget_light(err, &fput_needed2);
	if (!event_file)
		goto err_free_put_context;

	if (flags & PERF_FLAG_FD_OUTPUT) {
		err = perf_event_set_output(event, group_fd);
		if (err)
			goto err_fput_free_put_context;
	}

	event->filp = event_file;
	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, event, cpu);
	++ctx->generation;
	mutex_unlock(&ctx->mutex);

	event->owner = current;
	get_task_struct(current);
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);

err_fput_free_put_context:
	fput_light(event_file, fput_needed2);

err_free_put_context:
	if (err < 0)
		kfree(event);

err_put_context:
	if (err < 0)
		put_ctx(ctx);

	fput_light(group_file, fput_needed);

	return err;
}

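/*
 * Illustrative user-space sketch of the syscall above (not part of this
 * file): count instructions for the calling task on any CPU and read the
 * result back over the returned fd:
 *
 *	struct perf_event_attr attr = {
 *		.type	= PERF_TYPE_HARDWARE,
 *		.size	= sizeof(attr),
 *		.config	= PERF_COUNT_HW_INSTRUCTIONS,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *	unsigned long long count;
 *	read(fd, &count, sizeof(count));
 */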
/*
 * inherit an event from parent task to child task:
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event *group_leader,
	      struct perf_event_context *child_ctx)
{
	struct perf_event *child_event;

	/*
	 * Instead of creating recursive hierarchies of events,
	 * we link inherited events back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_event->parent)
		parent_event = parent_event->parent;

	child_event = perf_event_alloc(&parent_event->attr,
					   parent_event->cpu, child_ctx,
					   group_leader, parent_event,
					   GFP_KERNEL);
	if (IS_ERR(child_event))
		return child_event;
	get_ctx(child_ctx);

	/*
	 * Make the child state follow the state of the parent event,
	 * not its attr.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_event_{en, dis}able_family.
	 */
	if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
		child_event->state = PERF_EVENT_STATE_INACTIVE;
	else
		child_event->state = PERF_EVENT_STATE_OFF;

	if (parent_event->attr.freq)
		child_event->hw.sample_period = parent_event->hw.sample_period;

	/*
	 * Link it up in the child's context:
	 */
	add_event_to_ctx(child_event, child_ctx);

	/*
	 * Get a reference to the parent filp - we will fput it
	 * when the child event exits. This is safe to do because
	 * we are in the parent and we know that the filp still
	 * exists and has a nonzero count:
	 */
	atomic_long_inc(&parent_event->filp->f_count);

	/*
	 * Link this into the parent event's child list
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_add_tail(&child_event->child_list, &parent_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	return child_event;
}

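/*
 * Inherit a whole group: clone the group leader first, then clone each
 * sibling into the new leader's group.
 */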
static int inherit_group(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event_context *child_ctx)
{
	struct perf_event *leader;
	struct perf_event *sub;
	struct perf_event *child_ctr;

	leader = inherit_event(parent_event, parent, parent_ctx,
				 child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
		child_ctr = inherit_event(sub, parent, parent_ctx,
					    child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);
	}
	return 0;
}

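/*
 * Fold the counts of an exiting child event back into its parent.
 */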
static void sync_child_event(struct perf_event *child_event,
			       struct task_struct *child)
{
	struct perf_event *parent_event = child_event->parent;
	u64 child_val;

	if (child_event->attr.inherit_stat)
		perf_event_read_event(child_event, child);

	child_val = atomic64_read(&child_event->count);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_event->count);
	atomic64_add(child_event->total_time_enabled,
		     &parent_event->child_total_time_enabled);
	atomic64_add(child_event->total_time_running,
		     &parent_event->child_total_time_running);

	/*
	 * Remove this event from the parent's list
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_del_init(&child_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	/*
	 * Release the parent event, if this was the last
	 * reference to it.
	 */
	fput(parent_event->filp);
}

static void
__perf_event_exit_task(struct perf_event *child_event,
			 struct perf_event_context *child_ctx,
			 struct task_struct *child)
{
	struct perf_event *parent_event;

	update_event_times(child_event);
	perf_event_remove_from_context(child_event);

	parent_event = child_event->parent;
	/*
	 * It can happen that the parent exits first, and has events
	 * that are still around due to the child reference. These
	 * events need to be zapped - but otherwise linger.
	 */
	if (parent_event) {
		sync_child_event(child_event, child);
		free_event(child_event);
	}
}

/*
 * When a child task exits, feed back event values to parent events.
 */
void perf_event_exit_task(struct task_struct *child)
{
	struct perf_event *child_event, *tmp;
	struct perf_event_context *child_ctx;
	unsigned long flags;

	if (likely(!child->perf_event_ctxp)) {
		perf_event_task(child, NULL, 0);
		return;
	}

	local_irq_save(flags);
	/*
	 * We can't reschedule here because interrupts are disabled,
	 * and either child is current or it is a task that can't be
	 * scheduled, so we are now safe from rescheduling changing
	 * our context.
	 */
	child_ctx = child->perf_event_ctxp;
	__perf_event_task_sched_out(child_ctx);

	/*
	 * Take the context lock here so that if find_get_context is
	 * reading child->perf_event_ctxp, we wait until it has
	 * incremented the context's refcount before we do put_ctx below.
	 */
	spin_lock(&child_ctx->lock);
	child->perf_event_ctxp = NULL;
	/*
	 * If this context is a clone, unclone it so it can't get
	 * swapped to another process while we're removing all
	 * the events from it.
	 */
	unclone_ctx(child_ctx);
	spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can however still
	 * get a few PERF_RECORD_READ events.
	 */
	perf_event_task(child, child_ctx, 0);

	/*
	 * We can recurse on the same lock type through:
	 *
	 *   __perf_event_exit_task()
	 *     sync_child_event()
	 *       fput(parent_event->filp)
	 *         perf_release()
	 *           mutex_lock(&ctx->mutex)
	 *
	 * But since it's the parent context it won't be the same instance.
	 */
	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);

again:
	list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
				 group_entry)
		__perf_event_exit_task(child_event, child_ctx, child);

	/*
	 * If the last event was a group event, it will have appended all
	 * its siblings to the list, but we obtained 'tmp' before that which
	 * will still point to the list head terminating the iteration.
	 */
	if (!list_empty(&child_ctx->group_list))
		goto again;

	mutex_unlock(&child_ctx->mutex);

	put_ctx(child_ctx);
}

/*
 * free an unexposed, unused context as created by inheritance by
 * perf_event_init_task() below, used by fork() in case of failure.
 */
void perf_event_free_task(struct task_struct *task)
{
	struct perf_event_context *ctx = task->perf_event_ctxp;
	struct perf_event *event, *tmp;

	if (!ctx)
		return;

	mutex_lock(&ctx->mutex);
again:
	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
		struct perf_event *parent = event->parent;

		if (WARN_ON_ONCE(!parent))
			continue;

		mutex_lock(&parent->child_mutex);
		list_del_init(&event->child_list);
		mutex_unlock(&parent->child_mutex);

		fput(parent->filp);

		list_del_event(event, ctx);
		free_event(event);
	}

	if (!list_empty(&ctx->group_list))
		goto again;

	mutex_unlock(&ctx->mutex);

	put_ctx(ctx);
}

/*
 * Initialize the perf_event context in task_struct
 */
int perf_event_init_task(struct task_struct *child)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	int ret = 0;

	child->perf_event_ctxp = NULL;

	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	if (likely(!parent->perf_event_ctxp))
		return 0;

	/*
	 * This is executed from the parent task context, so inherit
	 * events that have been marked for cloning.
	 * First allocate and initialize a context for the child.
	 */

	child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
	if (!child_ctx)
		return -ENOMEM;

	__perf_event_init_context(child_ctx, child);
	child->perf_event_ctxp = child_ctx;
	get_task_struct(child);

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent);

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) {
		if (event != event->group_leader)
			continue;

		if (!event->attr.inherit) {
			inherited_all = 0;
			continue;
		}

		ret = inherit_group(event, parent, parent_ctx,
					     child, child_ctx);
		if (ret) {
			inherited_all = 0;
			break;
		}
	}

	if (inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 * Note that if the parent is a clone, it could get
		 * uncloned at any point, but that doesn't matter
		 * because the list of events and the generation
		 * count can't have changed since we took the mutex.
		 */
		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);

	return ret;
}

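/*
 * CPU hotplug support: set up the per-CPU context when a CPU comes up and
 * remove its events when it is about to go offline.
 */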
static void __cpuinit perf_event_init_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
	__perf_event_init_context(&cpuctx->ctx, NULL);

	spin_lock(&perf_resource_lock);
	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
	spin_unlock(&perf_resource_lock);

	hw_perf_event_setup(cpu);
}

#ifdef CONFIG_HOTPLUG_CPU
static void __perf_event_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *ctx = &cpuctx->ctx;
	struct perf_event *event, *tmp;

	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
		__perf_event_remove_from_context(event);
}
static void perf_event_exit_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_event_context *ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
#endif

static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		perf_event_init_cpu(cpu);
		break;

	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		hw_perf_event_setup_online(cpu);
		break;

	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		perf_event_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

/*
 * This has to have a higher priority than migration_notifier in sched.c.
 */
static struct notifier_block __cpuinitdata perf_cpu_nb = {
	.notifier_call		= perf_cpu_notify,
	.priority		= 20,
};

void __init perf_event_init(void)
{
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
			(void *)(long)smp_processor_id());
	register_cpu_notifier(&perf_cpu_nb);
}

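/*
 * sysfs knobs, exposed in the "perf_events" group of the cpu sysdev class:
 * "reserve_percpu" reserves counters on each CPU (shrinking the per-task
 * maximum), "overcommit" is a 0/1 tunable.
 */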
static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
			const char *buf,
			size_t count)
{
	struct perf_cpu_context *cpuctx;
	unsigned long val;
	int err, cpu, mpt;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > perf_max_events)
		return -EINVAL;

	spin_lock(&perf_resource_lock);
	perf_reserved_percpu = val;
	for_each_online_cpu(cpu) {
		cpuctx = &per_cpu(perf_cpu_context, cpu);
		spin_lock_irq(&cpuctx->ctx.lock);
		mpt = min(perf_max_events - cpuctx->ctx.nr_events,
			  perf_max_events - perf_reserved_percpu);
		cpuctx->max_pertask = mpt;
		spin_unlock_irq(&cpuctx->ctx.lock);
	}
	spin_unlock(&perf_resource_lock);

	return count;
}

static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
	unsigned long val;
	int err;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > 1)
		return -EINVAL;

	spin_lock(&perf_resource_lock);
	perf_overcommit = val;
	spin_unlock(&perf_resource_lock);

	return count;
}

static SYSDEV_CLASS_ATTR(
				reserve_percpu,
				0644,
				perf_show_reserve_percpu,
				perf_set_reserve_percpu
			);

static SYSDEV_CLASS_ATTR(
				overcommit,
				0644,
				perf_show_overcommit,
				perf_set_overcommit
			);

static struct attribute *perfclass_attrs[] = {
	&attr_reserve_percpu.attr,
	&attr_overcommit.attr,
	NULL
};

static struct attribute_group perfclass_attr_group = {
	.attrs			= perfclass_attrs,
	.name			= "perf_events",
};

static int __init perf_event_sysfs_init(void)
{
	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				  &perfclass_attr_group);
}
device_initcall(perf_event_sysfs_init);