/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright    2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/vmstat.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>

#include <asm/irq_regs.h>

static atomic_t nr_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

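/*
 * For example, restricting unprivileged users to user-space-only
 * profiling can be done with:
 *	echo 2 > /proc/sys/kernel/perf_event_paranoid
 */
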
int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */

/*
 * max perf event sample rate
 */
int sysctl_perf_event_sample_rate __read_mostly = 100000;

static atomic64_t perf_event_id;

void __weak perf_event_print_debug(void)	{ }

extern __weak const char *perf_pmu_name(void)
{
	return "pmu";
}

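/*
 * perf_pmu_disable()/perf_pmu_enable() nest: the PMU is only really
 * disabled when the per-cpu disable count goes from 0 to 1, and only
 * re-enabled when it drops back to 0.
 */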
void perf_pmu_disable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!(*count)++)
		pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!--(*count))
		pmu->pmu_enable(pmu);
}

static DEFINE_PER_CPU(struct list_head, rotation_list);

/*
 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
 * because they're strictly cpu affine and rotate_start is called with IRQs
 * disabled, while rotate_context is called from IRQ context.
 */
static void perf_pmu_rotate_start(struct pmu *pmu)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
	struct list_head *head = &__get_cpu_var(rotation_list);

	WARN_ON(!irqs_disabled());

	if (list_empty(&cpuctx->rotation_list))
		list_add(&cpuctx->rotation_list, head);
}

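/*
 * A perf_event_context is reference counted; the final put_ctx() drops
 * the parent and task references and frees the context after an RCU
 * grace period.
 */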
static void get_ctx(struct perf_event_context *ctx)
{
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

static void unclone_ctx(struct perf_event_context *ctx)
{
	if (ctx->parent_ctx) {
		put_ctx(ctx->parent_ctx);
		ctx->parent_ctx = NULL;
	}
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
	u64 id = event->id;

	if (event->parent)
		id = event->parent->id;

	return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
	struct perf_event_context *ctx;

	rcu_read_lock();
retry:
	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
		 * perf_event_task_sched_out, though the
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
		raw_spin_lock_irqsave(&ctx->lock, *flags);
		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
			goto retry;
		}

		if (!atomic_inc_not_zero(&ctx->refcount)) {
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
			ctx = NULL;
		}
	}
	rcu_read_unlock();
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
	struct perf_event_context *ctx;
	unsigned long flags;

	ctx = perf_lock_task_context(task, ctxn, &flags);
	if (ctx) {
		++ctx->pin_count;
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}
	return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	--ctx->pin_count;
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
	put_ctx(ctx);
}

static inline u64 perf_clock(void)
{
	return local_clock();
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

/*
 * Update the total_time_enabled and total_time_running fields for an event.
 */
static void update_event_times(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	u64 run_end;

	if (event->state < PERF_EVENT_STATE_INACTIVE ||
	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
		return;

	if (ctx->is_active)
		run_end = ctx->time;
	else
		run_end = event->tstamp_stopped;

	event->total_time_enabled = run_end - event->tstamp_enabled;

	if (event->state == PERF_EVENT_STATE_INACTIVE)
		run_end = event->tstamp_stopped;
	else
		run_end = ctx->time;

	event->total_time_running = run_end - event->tstamp_running;
}

/*
 * Update total_time_enabled and total_time_running for all events in a group.
 */
static void update_group_times(struct perf_event *leader)
{
	struct perf_event *event;

	update_event_times(leader);
	list_for_each_entry(event, &leader->sibling_list, group_entry)
		update_event_times(event);
}

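/*
 * Pinned and flexible (round-robin) groups live on separate lists;
 * pick the right one for this event.
 */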
static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
	if (event->attr.pinned)
		return &ctx->pinned_groups;
	else
		return &ctx->flexible_groups;
}

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
	event->attach_state |= PERF_ATTACH_CONTEXT;

	/*
	 * If we're a stand alone event or group leader, we go to the context
	 * list, group events are kept attached to the group so that
	 * perf_group_detach can, at all times, locate all siblings.
	 */
	if (event->group_leader == event) {
		struct list_head *list;

		if (is_software_event(event))
			event->group_flags |= PERF_GROUP_SOFTWARE;

		list = ctx_group_list(event, ctx);
		list_add_tail(&event->group_entry, list);
	}

	list_add_rcu(&event->event_entry, &ctx->event_list);
	if (!ctx->nr_events)
		perf_pmu_rotate_start(ctx->pmu);
	ctx->nr_events++;
	if (event->attr.inherit_stat)
		ctx->nr_stat++;
}

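/*
 * Attach an event to its group leader's sibling list.  A software-only
 * group loses PERF_GROUP_SOFTWARE as soon as a hardware event joins it.
 */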
static void perf_group_attach(struct perf_event *event)
{
	struct perf_event *group_leader = event->group_leader;

	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
	event->attach_state |= PERF_ATTACH_GROUP;

	if (group_leader == event)
		return;

	if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
			!is_software_event(event))
		group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;

	list_add_tail(&event->group_entry, &group_leader->sibling_list);
	group_leader->nr_siblings++;
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
		return;

	event->attach_state &= ~PERF_ATTACH_CONTEXT;

	ctx->nr_events--;
	if (event->attr.inherit_stat)
		ctx->nr_stat--;

	list_del_rcu(&event->event_entry);

	if (event->group_leader == event)
		list_del_init(&event->group_entry);

	update_group_times(event);

	/*
	 * If event was in error state, then keep it
	 * that way, otherwise bogus counts will be
	 * returned on read(). The only way to get out
	 * of error state is by explicit re-enabling
	 * of the event
	 */
	if (event->state > PERF_EVENT_STATE_OFF)
		event->state = PERF_EVENT_STATE_OFF;
}

static void perf_group_detach(struct perf_event *event)
{
	struct perf_event *sibling, *tmp;
	struct list_head *list = NULL;

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_GROUP))
		return;

	event->attach_state &= ~PERF_ATTACH_GROUP;

	/*
	 * If this is a sibling, remove it from its group.
	 */
	if (event->group_leader != event) {
		list_del_init(&event->group_entry);
		event->group_leader->nr_siblings--;
		return;
	}

	if (!list_empty(&event->group_entry))
		list = &event->group_entry;

	/*
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
	 * to whatever list we are on.
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
		if (list)
			list_move_tail(&sibling->group_entry, list);
		sibling->group_leader = sibling;

		/* Inherit group flags from the previous leader */
		sibling->group_flags = event->group_flags;
	}
}

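/*
 * An event is eligible to run here if it is not bound to a CPU
 * (event->cpu == -1) or is bound to the current CPU.
 */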
static inline int
event_filter_match(struct perf_event *event)
{
	return event->cpu == -1 || event->cpu == smp_processor_id();
}

static int
__event_sched_out(struct perf_event *event,
		  struct perf_cpu_context *cpuctx,
		  struct perf_event_context *ctx)
{
	u64 delta;
	/*
	 * An event which could not be activated because of
	 * filter mismatch still needs to have its timings
	 * maintained, otherwise bogus information is returned
	 * via read() for time_enabled, time_running:
	 */
	if (event->state == PERF_EVENT_STATE_INACTIVE
	    && !event_filter_match(event)) {
		delta = ctx->time - event->tstamp_stopped;
		event->tstamp_running += delta;
		event->tstamp_stopped = ctx->time;
	}

	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return 0;

	event->state = PERF_EVENT_STATE_INACTIVE;
	if (event->pending_disable) {
		event->pending_disable = 0;
		event->state = PERF_EVENT_STATE_OFF;
	}
	event->pmu->del(event, 0);
	event->oncpu = -1;

	if (!is_software_event(event))
		cpuctx->active_oncpu--;
	ctx->nr_active--;
	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;
	return 1;
}

static void
event_sched_out(struct perf_event *event,
		  struct perf_cpu_context *cpuctx,
		  struct perf_event_context *ctx)
{
	int ret;

	ret = __event_sched_out(event, cpuctx, ctx);
	if (ret)
		event->tstamp_stopped = ctx->time;
}

static void
group_sched_out(struct perf_event *group_event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	struct perf_event *event;
	int state = group_event->state;

	event_sched_out(group_event, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry)
		event_sched_out(event, cpuctx, ctx);

	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
		cpuctx->exclusive = 0;
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_event_remove_from_context(void *info)
{
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	raw_spin_lock(&ctx->lock);

	event_sched_out(event, cpuctx, ctx);

	list_del_event(event, ctx);

	raw_spin_unlock(&ctx->lock);
}


/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * Must be called with ctx->mutex held.
 *
 * CPU events are removed with a smp call. For task events we only
 * call when the task is on a CPU.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_event_remove_from_context(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu events are removed via an smp call and
		 * the removal is always successful.
		 */
		smp_call_function_single(event->cpu,
					 __perf_event_remove_from_context,
					 event, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_event_remove_from_context,
				 event);

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 */
	if (ctx->nr_active && !list_empty(&event->group_entry)) {
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents that this context is scheduled in so we
	 * can remove the event safely, if the call above did not
	 * succeed.
	 */
	if (!list_empty(&event->group_entry))
		list_del_event(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to disable a performance event
 */
static void __perf_event_disable(void *info)
{
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/*
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	raw_spin_lock(&ctx->lock);

	/*
	 * If the event is on, turn it off.
	 * If it is in error state, leave it in error state.
	 */
	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
		update_context_time(ctx);
		update_group_times(event);
		if (event == event->group_leader)
			group_sched_out(event, cpuctx, ctx);
		else
			event_sched_out(event, cpuctx, ctx);
		event->state = PERF_EVENT_STATE_OFF;
	}

	raw_spin_unlock(&ctx->lock);
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in sync_child_event.
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
void perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Disable the event on the cpu that it's on
		 */
		smp_call_function_single(event->cpu, __perf_event_disable,
					 event, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_event_disable, event);

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * If the event is still active, we need to retry the cross-call.
	 */
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
	if (event->state == PERF_EVENT_STATE_INACTIVE) {
		update_group_times(event);
		event->state = PERF_EVENT_STATE_OFF;
	}

	raw_spin_unlock_irq(&ctx->lock);
}

static int
__event_sched_in(struct perf_event *event,
		 struct perf_cpu_context *cpuctx,
		 struct perf_event_context *ctx)
{
	if (event->state <= PERF_EVENT_STATE_OFF)
		return 0;

	event->state = PERF_EVENT_STATE_ACTIVE;
	event->oncpu = smp_processor_id();
	/*
	 * The new state must be visible before we turn it on in the hardware:
	 */
	smp_wmb();

	if (event->pmu->add(event, PERF_EF_START)) {
		event->state = PERF_EVENT_STATE_INACTIVE;
		event->oncpu = -1;
		return -EAGAIN;
	}

	if (!is_software_event(event))
		cpuctx->active_oncpu++;
	ctx->nr_active++;

	if (event->attr.exclusive)
		cpuctx->exclusive = 1;

	return 0;
}

static inline int
event_sched_in(struct perf_event *event,
		 struct perf_cpu_context *cpuctx,
		 struct perf_event_context *ctx)
{
	int ret = __event_sched_in(event, cpuctx, ctx);
	if (ret)
		return ret;
	event->tstamp_running += ctx->time - event->tstamp_stopped;
	return 0;
}

static void
group_commit_event_sched_in(struct perf_event *group_event,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx)
{
	struct perf_event *event;
	u64 now = ctx->time;

	group_event->tstamp_running += now - group_event->tstamp_stopped;
	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		event->tstamp_running += now - event->tstamp_stopped;
	}
}

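/*
 * Schedule a whole group onto the PMU as one transaction: either all
 * members go on together or the partially added group is torn down again.
 */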
static int
group_sched_in(struct perf_event *group_event,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx)
{
	struct perf_event *event, *partial_group = NULL;
	struct pmu *pmu = group_event->pmu;

	if (group_event->state == PERF_EVENT_STATE_OFF)
		return 0;

	pmu->start_txn(pmu);

	/*
	 * use __event_sched_in() to delay updating tstamp_running
	 * until the transaction is committed. In case of failure
	 * we will keep an unmodified tstamp_running which is a
	 * requirement to get correct timing information
	 */
	if (__event_sched_in(group_event, cpuctx, ctx)) {
		pmu->cancel_txn(pmu);
		return -EAGAIN;
	}

	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (__event_sched_in(event, cpuctx, ctx)) {
			partial_group = event;
			goto group_error;
		}
	}

	if (!pmu->commit_txn(pmu)) {
		/* commit tstamp_running */
		group_commit_event_sched_in(group_event, cpuctx, ctx);
		return 0;
	}
group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
	 *
	 * use __event_sched_out() to avoid updating tstamp_stopped
	 * because the event never actually ran
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event == partial_group)
			break;
		__event_sched_out(event, cpuctx, ctx);
	}
	__event_sched_out(group_event, cpuctx, ctx);

	pmu->cancel_txn(pmu);

	return -EAGAIN;
}

/*
 * Work out whether we can put this event group on the CPU now.
 */
static int group_can_go_on(struct perf_event *event,
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
	 * Groups consisting entirely of software events can always go on.
	 */
	if (event->group_flags & PERF_GROUP_SOFTWARE)
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
	 * events can go on.
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
	 * events on the CPU, it can't go on.
	 */
	if (event->attr.exclusive && cpuctx->active_oncpu)
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}

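/*
 * Attach the event to its context and initialise its timestamps to the
 * context's current time.
 */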
static void add_event_to_ctx(struct perf_event *event,
			       struct perf_event_context *ctx)
{
	list_add_event(event, ctx);
	perf_group_attach(event);
	event->tstamp_enabled = ctx->time;
	event->tstamp_running = ctx->time;
	event->tstamp_stopped = ctx->time;
}

/*
 * Cross CPU call to install and enable a performance event
 *
 * Must be called with ctx->mutex held
 */
static void __perf_install_in_context(void *info)
{
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	int err;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 * Or possibly this is the right context but it isn't
	 * on this cpu because it had no events.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx) {
		if (cpuctx->task_ctx || ctx->task != current)
			return;
		cpuctx->task_ctx = ctx;
	}

	raw_spin_lock(&ctx->lock);
	ctx->is_active = 1;
	update_context_time(ctx);

	add_event_to_ctx(event, ctx);

	if (event->cpu != -1 && event->cpu != smp_processor_id())
		goto unlock;

	/*
	 * Don't put the event on if it is disabled or if
	 * it is in a group and the group isn't on.
	 */
	if (event->state != PERF_EVENT_STATE_INACTIVE ||
	    (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
		goto unlock;

	/*
	 * An exclusive event can't go on if there are already active
	 * hardware events, and no hardware event can go on if there
	 * is already an exclusive event on.
	 */
	if (!group_can_go_on(event, cpuctx, 1))
		err = -EEXIST;
	else
		err = event_sched_in(event, cpuctx, ctx);

	if (err) {
		/*
		 * This event couldn't go on.  If it is in a group
		 * then we have to pull the whole group off.
		 * If the event group is pinned then put it in error state.
		 */
		if (leader != event)
			group_sched_out(leader, cpuctx, ctx);
		if (leader->attr.pinned) {
			update_group_times(leader);
			leader->state = PERF_EVENT_STATE_ERROR;
		}
	}

unlock:
	raw_spin_unlock(&ctx->lock);
}

/*
 * Attach a performance event to a context
 *
 * First we add the event to the list with the hardware enable bit
 * in event->hw_config cleared.
 *
 * If the event is attached to a task which is on a CPU we use a smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
 *
 * Must be called with ctx->mutex held.
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
			struct perf_event *event,
			int cpu)
{
	struct task_struct *task = ctx->task;

	event->ctx = ctx;

	if (!task) {
		/*
		 * Per cpu events are installed via an smp call and
		 * the install is always successful.
		 */
		smp_call_function_single(cpu, __perf_install_in_context,
					 event, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_install_in_context,
				 event);

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * we need to retry the smp call.
	 */
	if (ctx->is_active && list_empty(&event->group_entry)) {
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents that this context is scheduled in so we
	 * can add the event safely, if the call above did not
	 * succeed.
	 */
	if (list_empty(&event->group_entry))
		add_event_to_ctx(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Put an event into inactive state and update time fields.
 * Enabling the leader of a group effectively enables all
 * the group members that aren't explicitly disabled, so we
 * have to update their ->tstamp_enabled also.
 * Note: this works for group members as well as group leaders
 * since the non-leader members' sibling_lists will be empty.
 */
static void __perf_event_mark_enabled(struct perf_event *event,
					struct perf_event_context *ctx)
{
	struct perf_event *sub;

	event->state = PERF_EVENT_STATE_INACTIVE;
	event->tstamp_enabled = ctx->time - event->total_time_enabled;
	list_for_each_entry(sub, &event->sibling_list, group_entry) {
		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
			sub->tstamp_enabled =
				ctx->time - sub->total_time_enabled;
		}
	}
}

/*
 * Cross CPU call to enable a performance event
 */
static void __perf_event_enable(void *info)
{
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	int err;

	/*
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx) {
		if (cpuctx->task_ctx || ctx->task != current)
			return;
		cpuctx->task_ctx = ctx;
	}

	raw_spin_lock(&ctx->lock);
	ctx->is_active = 1;
	update_context_time(ctx);

	if (event->state >= PERF_EVENT_STATE_INACTIVE)
		goto unlock;
	__perf_event_mark_enabled(event, ctx);

	if (event->cpu != -1 && event->cpu != smp_processor_id())
		goto unlock;

	/*
	 * If the event is in a group and isn't the group leader,
	 * then don't put it on unless the group is on.
	 */
	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
		goto unlock;

	if (!group_can_go_on(event, cpuctx, 1)) {
		err = -EEXIST;
	} else {
		if (event == leader)
			err = group_sched_in(event, cpuctx, ctx);
		else
			err = event_sched_in(event, cpuctx, ctx);
	}

	if (err) {
		/*
		 * If this event can't go on and it's part of a
		 * group, then the whole group has to come off.
		 */
		if (leader != event)
			group_sched_out(leader, cpuctx, ctx);
		if (leader->attr.pinned) {
			update_group_times(leader);
			leader->state = PERF_EVENT_STATE_ERROR;
		}
	}

unlock:
	raw_spin_unlock(&ctx->lock);
}

/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 */
void perf_event_enable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Enable the event on the cpu that it's on
		 */
		smp_call_function_single(event->cpu, __perf_event_enable,
					 event, 1);
		return;
	}

	raw_spin_lock_irq(&ctx->lock);
	if (event->state >= PERF_EVENT_STATE_INACTIVE)
		goto out;

	/*
	 * If the event is in error state, clear that first.
	 * That way, if we see the event in error state below, we
	 * know that it has gone back into error state, as distinct
	 * from the task having been scheduled away before the
	 * cross-call arrived.
	 */
	if (event->state == PERF_EVENT_STATE_ERROR)
		event->state = PERF_EVENT_STATE_OFF;

retry:
	raw_spin_unlock_irq(&ctx->lock);
	task_oncpu_function_call(task, __perf_event_enable, event);

	raw_spin_lock_irq(&ctx->lock);

	/*
	 * If the context is active and the event is still off,
	 * we need to retry the cross-call.
	 */
	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
		goto retry;

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
	if (event->state == PERF_EVENT_STATE_OFF)
		__perf_event_mark_enabled(event, ctx);

out:
	raw_spin_unlock_irq(&ctx->lock);
}

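/*
 * Bump the event's overflow limit by @refresh and enable it; not
 * supported on inherited events.
 */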
static int perf_event_refresh(struct perf_event *event, int refresh)
{
	/*
	 * not supported on inherited events
	 */
	if (event->attr.inherit)
		return -EINVAL;

	atomic_add(refresh, &event->event_limit);
	perf_event_enable(event);

	return 0;
}

enum event_type_t {
	EVENT_FLEXIBLE = 0x1,
	EVENT_PINNED = 0x2,
	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

static void ctx_sched_out(struct perf_event_context *ctx,
			  struct perf_cpu_context *cpuctx,
			  enum event_type_t event_type)
{
	struct perf_event *event;

	raw_spin_lock(&ctx->lock);
	perf_pmu_disable(ctx->pmu);
	ctx->is_active = 0;
	if (likely(!ctx->nr_events))
		goto out;
	update_context_time(ctx);

	if (!ctx->nr_active)
		goto out;

	if (event_type & EVENT_PINNED) {
		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
			group_sched_out(event, cpuctx, ctx);
	}

	if (event_type & EVENT_FLEXIBLE) {
		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
			group_sched_out(event, cpuctx, ctx);
	}
out:
	perf_pmu_enable(ctx->pmu);
	raw_spin_unlock(&ctx->lock);
}

/*
 * Test whether two contexts are equivalent, i.e. whether they
 * have both been cloned from the same version of the same context
 * and they both have the same number of enabled events.
 * If the number of enabled events is the same, then the set
 * of enabled events should be the same, because these are both
 * inherited contexts, therefore we can't access individual events
 * in them directly with an fd; we can only enable/disable all
 * events via prctl, or enable/disable all events in a family
 * via ioctl, which will have the same effect on both contexts.
 */
static int context_equiv(struct perf_event_context *ctx1,
			 struct perf_event_context *ctx2)
{
	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
		&& ctx1->parent_gen == ctx2->parent_gen
		&& !ctx1->pin_count && !ctx2->pin_count;
}

static void __perf_event_sync_stat(struct perf_event *event,
				     struct perf_event *next_event)
{
	u64 value;

	if (!event->attr.inherit_stat)
		return;

	/*
	 * Update the event value, we cannot use perf_event_read()
	 * because we're in the middle of a context switch and have IRQs
	 * disabled, which upsets smp_call_function_single(), however
	 * we know the event must be on the current CPU, therefore we
	 * don't need to use it.
	 */
	switch (event->state) {
	case PERF_EVENT_STATE_ACTIVE:
		event->pmu->read(event);
		/* fall-through */

	case PERF_EVENT_STATE_INACTIVE:
		update_event_times(event);
		break;

	default:
		break;
	}

	/*
	 * In order to keep per-task stats reliable we need to flip the event
	 * values when we flip the contexts.
	 */
	value = local64_read(&next_event->count);
	value = local64_xchg(&event->count, value);
	local64_set(&next_event->count, value);

	swap(event->total_time_enabled, next_event->total_time_enabled);
	swap(event->total_time_running, next_event->total_time_running);

	/*
	 * Since we swizzled the values, update the user visible data too.
	 */
	perf_event_update_userpage(event);
	perf_event_update_userpage(next_event);
}

#define list_next_entry(pos, member) \
	list_entry(pos->member.next, typeof(*pos), member)

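/*
 * Swap the inherit_stat counts and times of two cloned contexts when the
 * contexts themselves are swapped at context-switch time, so that per-task
 * statistics stay with the task.
 */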
static void perf_event_sync_stat(struct perf_event_context *ctx,
				   struct perf_event_context *next_ctx)
{
	struct perf_event *event, *next_event;

	if (!ctx->nr_stat)
		return;

	update_context_time(ctx);

	event = list_first_entry(&ctx->event_list,
				   struct perf_event, event_entry);

	next_event = list_first_entry(&next_ctx->event_list,
					struct perf_event, event_entry);

	while (&event->event_entry != &ctx->event_list &&
	       &next_event->event_entry != &next_ctx->event_list) {

		__perf_event_sync_stat(event, next_event);

		event = list_next_entry(event, event_entry);
		next_event = list_next_entry(next_event, event_entry);
	}
}

void perf_event_context_sched_out(struct task_struct *task, int ctxn,
				  struct task_struct *next)
{
	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
	struct perf_event_context *next_ctx;
	struct perf_event_context *parent;
	struct perf_cpu_context *cpuctx;
	int do_switch = 1;

	if (likely(!ctx))
		return;

	cpuctx = __get_cpu_context(ctx);
	if (!cpuctx->task_ctx)
		return;

	rcu_read_lock();
	parent = rcu_dereference(ctx->parent_ctx);
	next_ctx = next->perf_event_ctxp[ctxn];
	if (parent && next_ctx &&
	    rcu_dereference(next_ctx->parent_ctx) == parent) {
		/*
		 * Looks like the two contexts are clones, so we might be
		 * able to optimize the context switch.  We lock both
		 * contexts and check that they are clones under the
		 * lock (including re-checking that neither has been
		 * uncloned in the meantime).  It doesn't matter which
		 * order we take the locks because no other cpu could
		 * be trying to lock both of these tasks.
		 */
		raw_spin_lock(&ctx->lock);
		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
		if (context_equiv(ctx, next_ctx)) {
			/*
			 * XXX do we need a memory barrier of sorts
			 * wrt to rcu_dereference() of perf_event_ctxp
			 */
			task->perf_event_ctxp[ctxn] = next_ctx;
			next->perf_event_ctxp[ctxn] = ctx;
			ctx->task = next;
			next_ctx->task = task;
			do_switch = 0;

			perf_event_sync_stat(ctx, next_ctx);
		}
		raw_spin_unlock(&next_ctx->lock);
		raw_spin_unlock(&ctx->lock);
	}
	rcu_read_unlock();

	if (do_switch) {
		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
		cpuctx->task_ctx = NULL;
	}
}

#define for_each_task_context_nr(ctxn)					\
	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)

/*
 * Called from scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * not restart the event.
 */
void perf_event_task_sched_out(struct task_struct *task,
			       struct task_struct *next)
{
	int ctxn;

	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);

	for_each_task_context_nr(ctxn)
		perf_event_context_sched_out(task, ctxn, next);
}

static void task_ctx_sched_out(struct perf_event_context *ctx,
			       enum event_type_t event_type)
{
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	if (!cpuctx->task_ctx)
		return;

	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
		return;

	ctx_sched_out(ctx, cpuctx, event_type);
	cpuctx->task_ctx = NULL;
}

/*
 * Called with IRQs disabled
 */
static void __perf_event_task_sched_out(struct perf_event_context *ctx)
{
	task_ctx_sched_out(ctx, EVENT_ALL);
}

/*
 * Called with IRQs disabled
 */
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type)
{
	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
}

static void
ctx_pinned_sched_in(struct perf_event_context *ctx,
		    struct perf_cpu_context *cpuctx)
{
	struct perf_event *event;

	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
		if (event->state <= PERF_EVENT_STATE_OFF)
			continue;
		if (event->cpu != -1 && event->cpu != smp_processor_id())
			continue;

		if (group_can_go_on(event, cpuctx, 1))
			group_sched_in(event, cpuctx, ctx);

		/*
		 * If this pinned group hasn't been scheduled,
		 * put it in error state.
		 */
		if (event->state == PERF_EVENT_STATE_INACTIVE) {
			update_group_times(event);
			event->state = PERF_EVENT_STATE_ERROR;
		}
	}
}

static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
		      struct perf_cpu_context *cpuctx)
{
	struct perf_event *event;
	int can_add_hw = 1;

	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
		/* Ignore events in OFF or ERROR state */
		if (event->state <= PERF_EVENT_STATE_OFF)
			continue;
		/*
		 * Listen to the 'cpu' scheduling filter constraint
		 * of events:
		 */
		if (event->cpu != -1 && event->cpu != smp_processor_id())
			continue;

		if (group_can_go_on(event, cpuctx, can_add_hw)) {
			if (group_sched_in(event, cpuctx, ctx))
				can_add_hw = 0;
		}
	}
}

static void
ctx_sched_in(struct perf_event_context *ctx,
	     struct perf_cpu_context *cpuctx,
	     enum event_type_t event_type)
{
	raw_spin_lock(&ctx->lock);
	ctx->is_active = 1;
	if (likely(!ctx->nr_events))
		goto out;

	ctx->timestamp = perf_clock();

	/*
	 * First go through the list and put on any pinned groups
	 * in order to give them the best chance of going on.
	 */
	if (event_type & EVENT_PINNED)
		ctx_pinned_sched_in(ctx, cpuctx);

	/* Then walk through the lower prio flexible groups */
	if (event_type & EVENT_FLEXIBLE)
		ctx_flexible_sched_in(ctx, cpuctx);

out:
	raw_spin_unlock(&ctx->lock);
}

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
			     enum event_type_t event_type)
{
	struct perf_event_context *ctx = &cpuctx->ctx;

	ctx_sched_in(ctx, cpuctx, event_type);
}

static void task_ctx_sched_in(struct perf_event_context *ctx,
			      enum event_type_t event_type)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = __get_cpu_context(ctx);
	if (cpuctx->task_ctx == ctx)
		return;

	ctx_sched_in(ctx, cpuctx, event_type);
	cpuctx->task_ctx = ctx;
}

void perf_event_context_sched_in(struct perf_event_context *ctx)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = __get_cpu_context(ctx);
	if (cpuctx->task_ctx == ctx)
		return;

	perf_pmu_disable(ctx->pmu);
	/*
	 * We want to keep the following priority order:
	 * cpu pinned (that don't need to move), task pinned,
	 * cpu flexible, task flexible.
	 */
	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);

	ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);

	cpuctx->task_ctx = ctx;

	/*
	 * Since these rotations are per-cpu, we need to ensure the
	 * cpu-context we got scheduled on is actually rotating.
	 */
	perf_pmu_rotate_start(ctx->pmu);
	perf_pmu_enable(ctx->pmu);
}

/*
 * Called from scheduler to add the events of the current task
 * with interrupts disabled.
 *
 * We restore the event value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * keep the event running.
 */
void perf_event_task_sched_in(struct task_struct *task)
{
	struct perf_event_context *ctx;
	int ctxn;

	for_each_task_context_nr(ctxn) {
		ctx = task->perf_event_ctxp[ctxn];
		if (likely(!ctx))
			continue;

		perf_event_context_sched_in(ctx);
	}
}

#define MAX_INTERRUPTS (~0ULL)

static void perf_log_throttle(struct perf_event *event, int enable);

static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
	u64 frequency = event->attr.sample_freq;
	u64 sec = NSEC_PER_SEC;
	u64 divisor, dividend;

	int count_fls, nsec_fls, frequency_fls, sec_fls;

	count_fls = fls64(count);
	nsec_fls = fls64(nsec);
	frequency_fls = fls64(frequency);
	sec_fls = 30;

	/*
	 * We got @count in @nsec, with a target of sample_freq HZ
	 * the target period becomes:
	 *
	 *             @count * 10^9
	 * period = -------------------
	 *          @nsec * sample_freq
	 *
	 */

	/*
	 * Reduce accuracy by one bit such that @a and @b converge
	 * to a similar magnitude.
	 */
#define REDUCE_FLS(a, b) 		\
do {					\
	if (a##_fls > b##_fls) {	\
		a >>= 1;		\
		a##_fls--;		\
	} else {			\
		b >>= 1;		\
		b##_fls--;		\
	}				\
} while (0)

	/*
	 * Reduce accuracy until either term fits in a u64, then proceed with
	 * the other, so that finally we can do a u64/u64 division.
	 */
	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
		REDUCE_FLS(nsec, frequency);
		REDUCE_FLS(sec, count);
	}

	if (count_fls + sec_fls > 64) {
		divisor = nsec * frequency;

		while (count_fls + sec_fls > 64) {
			REDUCE_FLS(count, sec);
			divisor >>= 1;
		}

		dividend = count * sec;
	} else {
		dividend = count * sec;

		while (nsec_fls + frequency_fls > 64) {
			REDUCE_FLS(nsec, frequency);
			dividend >>= 1;
		}

		divisor = nsec * frequency;
	}

	if (!divisor)
		return dividend;

	return div64_u64(dividend, divisor);
}

static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 period, sample_period;
	s64 delta;

	period = perf_calculate_period(event, nsec, count);

	delta = (s64)(period - hwc->sample_period);
	delta = (delta + 7) / 8; /* low pass filter */

	sample_period = hwc->sample_period + delta;

	if (!sample_period)
		sample_period = 1;

	hwc->sample_period = sample_period;

	if (local64_read(&hwc->period_left) > 8*sample_period) {
		event->pmu->stop(event, PERF_EF_UPDATE);
		local64_set(&hwc->period_left, 0);
		event->pmu->start(event, PERF_EF_RELOAD);
	}
}

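/*
 * Called on the rotation tick: unthrottle events that were throttled, and
 * for frequency-based events adjust the sampling period towards the
 * requested attr.sample_freq over the last @period nanoseconds.
 */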
static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1609
{
1610 1611
	struct perf_event *event;
	struct hw_perf_event *hwc;
1612 1613
	u64 interrupts, now;
	s64 delta;
1614

1615
	raw_spin_lock(&ctx->lock);
1616
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1617
		if (event->state != PERF_EVENT_STATE_ACTIVE)
1618 1619
			continue;

1620 1621 1622
		if (event->cpu != -1 && event->cpu != smp_processor_id())
			continue;

1623
		hwc = &event->hw;
1624 1625 1626

		interrupts = hwc->interrupts;
		hwc->interrupts = 0;
1627

1628
		/*
1629
		 * unthrottle events on the tick
1630
		 */
1631
		if (interrupts == MAX_INTERRUPTS) {
1632
			perf_log_throttle(event, 1);
P
Peter Zijlstra 已提交
1633
			event->pmu->start(event, 0);
1634 1635
		}

1636
		if (!event->attr.freq || !event->attr.sample_freq)
1637 1638
			continue;

1639
		event->pmu->read(event);
1640
		now = local64_read(&event->count);
1641 1642
		delta = now - hwc->freq_count_stamp;
		hwc->freq_count_stamp = now;
1643

1644
		if (delta > 0)
1645
			perf_adjust_period(event, period, delta);
1646
	}
1647
	raw_spin_unlock(&ctx->lock);
1648 1649
}

1650
/*
1651
 * Round-robin a context's events:
1652
 */
1653
static void rotate_ctx(struct perf_event_context *ctx)
T
Thomas Gleixner 已提交
1654
{
1655
	raw_spin_lock(&ctx->lock);
1656 1657 1658 1659

	/* Rotate the first entry last of non-pinned groups */
	list_rotate_left(&ctx->flexible_groups);

1660
	raw_spin_unlock(&ctx->lock);
1661 1662
}

1663
/*
1664 1665 1666
 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
 * because they're strictly cpu affine and rotate_start is called with IRQs
 * disabled, while rotate_context is called from IRQ context.
1667
 */
1668
static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1669
{
1670
	u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
P
Peter Zijlstra 已提交
1671
	struct perf_event_context *ctx = NULL;
1672
	int rotate = 0, remove = 1;
1673

1674
	if (cpuctx->ctx.nr_events) {
1675
		remove = 0;
1676 1677 1678
		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
			rotate = 1;
	}
1679

P
Peter Zijlstra 已提交
1680
	ctx = cpuctx->task_ctx;
1681
	if (ctx && ctx->nr_events) {
1682
		remove = 0;
1683 1684 1685
		if (ctx->nr_events != ctx->nr_active)
			rotate = 1;
	}
1686

P
Peter Zijlstra 已提交
1687
	perf_pmu_disable(cpuctx->ctx.pmu);
1688
	perf_ctx_adjust_freq(&cpuctx->ctx, interval);
1689
	if (ctx)
1690
		perf_ctx_adjust_freq(ctx, interval);
1691

1692
	if (!rotate)
1693
		goto done;
1694

1695
	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1696
	if (ctx)
1697
		task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
T
Thomas Gleixner 已提交
1698

1699
	rotate_ctx(&cpuctx->ctx);
1700 1701
	if (ctx)
		rotate_ctx(ctx);
1702

1703
	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1704
	if (ctx)
P
Peter Zijlstra 已提交
1705
		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1706 1707

done:
1708 1709 1710
	if (remove)
		list_del_init(&cpuctx->rotation_list);

P
Peter Zijlstra 已提交
1711
	perf_pmu_enable(cpuctx->ctx.pmu);
1712 1713 1714 1715 1716 1717
}

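/*
 * Per-cpu timer tick: rotate every context on this CPU's rotation list
 * whose jiffies_interval has elapsed.
 */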
void perf_event_task_tick(void)
{
	struct list_head *head = &__get_cpu_var(rotation_list);
	struct perf_cpu_context *cpuctx, *tmp;

	WARN_ON(!irqs_disabled());

	list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
		if (cpuctx->jiffies_interval == 1 ||
				!(jiffies % cpuctx->jiffies_interval))
			perf_rotate_context(cpuctx);
	}
}

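/*
 * Helper for perf_event_enable_on_exec(): mark a single enable_on_exec
 * event as enabled; returns 1 if the event state was actually changed.
 */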
static int event_enable_on_exec(struct perf_event *event,
				struct perf_event_context *ctx)
{
	if (!event->attr.enable_on_exec)
		return 0;

	event->attr.enable_on_exec = 0;
	if (event->state >= PERF_EVENT_STATE_INACTIVE)
		return 0;

	__perf_event_mark_enabled(event, ctx);

	return 1;
}

/*
 * Enable all of a task's events that have been marked enable-on-exec.
 * This expects task == current.
 */
static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
	struct perf_event *event;
	unsigned long flags;
	int enabled = 0;
	int ret;

	local_irq_save(flags);
	if (!ctx || !ctx->nr_events)
		goto out;

	task_ctx_sched_out(ctx, EVENT_ALL);

	raw_spin_lock(&ctx->lock);

	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
		ret = event_enable_on_exec(event, ctx);
		if (ret)
			enabled = 1;
	}

	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
		ret = event_enable_on_exec(event, ctx);
		if (ret)
			enabled = 1;
	}

	/*
	 * Unclone this context if we enabled any event.
	 */
	if (enabled)
		unclone_ctx(ctx);

	raw_spin_unlock(&ctx->lock);

	perf_event_context_sched_in(ctx);
out:
	local_irq_restore(flags);
}

/*
 * Cross CPU call to read the hardware event
 */
static void __perf_event_read(void *info)
{
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu.  If not it has been
	 * scheduled out before the smp call arrived.  In that case
	 * event->count would have been updated to a recent sample
	 * when the event was scheduled out.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	raw_spin_lock(&ctx->lock);
	update_context_time(ctx);
	update_event_times(event);
	raw_spin_unlock(&ctx->lock);

	event->pmu->read(event);
}

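/*
 * Total event count: the count accumulated locally plus whatever the
 * inherited child events have counted.
 */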
static inline u64 perf_event_count(struct perf_event *event)
{
	return local64_read(&event->count) + atomic64_read(&event->child_count);
}

static u64 perf_event_read(struct perf_event *event)
{
	/*
	 * If event is enabled and currently active on a CPU, update the
	 * value in the event structure:
	 */
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
		smp_call_function_single(event->oncpu,
					 __perf_event_read, event, 1);
	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
		struct perf_event_context *ctx = event->ctx;
		unsigned long flags;

		raw_spin_lock_irqsave(&ctx->lock, flags);
		/*
		 * May read while the context is not active (e.g. the thread
		 * is blocked); in that case we cannot update the context time.
		 */
		if (ctx->is_active)
			update_context_time(ctx);
		update_event_times(event);
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}

	return perf_event_count(event);
}

/*
 * Callchain support
 */

struct callchain_cpus_entries {
	struct rcu_head			rcu_head;
	struct perf_callchain_entry	*cpu_entries[0];
};

static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
struct callchain_cpus_entries *callchain_cpus_entries;


__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
				  struct pt_regs *regs)
{
}

__weak void perf_callchain_user(struct perf_callchain_entry *entry,
				struct pt_regs *regs)
{
}

static void release_callchain_buffers_rcu(struct rcu_head *head)
{
	struct callchain_cpus_entries *entries;
	int cpu;

	entries = container_of(head, struct callchain_cpus_entries, rcu_head);

	for_each_possible_cpu(cpu)
		kfree(entries->cpu_entries[cpu]);

	kfree(entries);
}

static void release_callchain_buffers(void)
{
	struct callchain_cpus_entries *entries;

	entries = callchain_cpus_entries;
	rcu_assign_pointer(callchain_cpus_entries, NULL);
	call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
}

static int alloc_callchain_buffers(void)
{
	int cpu;
	int size;
	struct callchain_cpus_entries *entries;

	/*
	 * We can't use the percpu allocation API for data that can be
	 * accessed from NMI. Use a temporary manual per cpu allocation
	 * until that gets sorted out.
	 */
	size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
		num_possible_cpus();

	entries = kzalloc(size, GFP_KERNEL);
	if (!entries)
		return -ENOMEM;

	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;

	for_each_possible_cpu(cpu) {
		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
							 cpu_to_node(cpu));
		if (!entries->cpu_entries[cpu])
			goto fail;
	}

	rcu_assign_pointer(callchain_cpus_entries, entries);

	return 0;

fail:
	for_each_possible_cpu(cpu)
		kfree(entries->cpu_entries[cpu]);
	kfree(entries);

	return -ENOMEM;
}

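/*
 * Callchain buffers are allocated lazily and refcounted: the first
 * callchain-sampling event triggers the per-cpu allocation, later
 * events only bump nr_callchain_events.
 */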
static int get_callchain_buffers(void)
{
	int err = 0;
	int count;

	mutex_lock(&callchain_mutex);

	count = atomic_inc_return(&nr_callchain_events);
	if (WARN_ON_ONCE(count < 1)) {
		err = -EINVAL;
		goto exit;
	}

	if (count > 1) {
		/* If the allocation failed, give up */
		if (!callchain_cpus_entries)
			err = -ENOMEM;
		goto exit;
	}

	err = alloc_callchain_buffers();
	if (err)
		release_callchain_buffers();
exit:
	mutex_unlock(&callchain_mutex);

	return err;
}

static void put_callchain_buffers(void)
{
	if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
		release_callchain_buffers();
		mutex_unlock(&callchain_mutex);
	}
}

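/*
 * Pick the recursion slot for the current execution context: 3 for NMI,
 * 2 for hardirq, 1 for softirq and 0 for task context.  Returns -1 if a
 * callchain is already being recorded in this context on this cpu.
 */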
static int get_recursion_context(int *recursion)
{
	int rctx;

	if (in_nmi())
		rctx = 3;
	else if (in_irq())
		rctx = 2;
	else if (in_softirq())
		rctx = 1;
	else
		rctx = 0;

	if (recursion[rctx])
		return -1;

	recursion[rctx]++;
	barrier();

	return rctx;
}

static inline void put_recursion_context(int *recursion, int rctx)
{
	barrier();
	recursion[rctx]--;
}

static struct perf_callchain_entry *get_callchain_entry(int *rctx)
{
	int cpu;
	struct callchain_cpus_entries *entries;

	*rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
	if (*rctx == -1)
		return NULL;

	entries = rcu_dereference(callchain_cpus_entries);
	if (!entries)
		return NULL;

	cpu = smp_processor_id();

	return &entries->cpu_entries[cpu][*rctx];
}

static void
put_callchain_entry(int rctx)
{
	put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
}

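/*
 * Capture a callchain for the current task: the kernel part first,
 * prefixed with PERF_CONTEXT_KERNEL, then the user part, prefixed with
 * PERF_CONTEXT_USER, when user registers are available.
 */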
static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
	int rctx;
	struct perf_callchain_entry *entry;


	entry = get_callchain_entry(&rctx);
	if (rctx == -1)
		return NULL;

	if (!entry)
		goto exit_put;

	entry->nr = 0;

	if (!user_mode(regs)) {
		perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
		perf_callchain_kernel(entry, regs);
		if (current->mm)
			regs = task_pt_regs(current);
		else
			regs = NULL;
	}

	if (regs) {
		perf_callchain_store(entry, PERF_CONTEXT_USER);
		perf_callchain_user(entry, regs);
	}

exit_put:
	put_callchain_entry(rctx);

	return entry;
}

2057
/*
2058
 * Initialize the perf_event context in a task_struct:
2059
 */
2060
static void __perf_event_init_context(struct perf_event_context *ctx)
2061
{
2062
	raw_spin_lock_init(&ctx->lock);
2063
	mutex_init(&ctx->mutex);
2064 2065
	INIT_LIST_HEAD(&ctx->pinned_groups);
	INIT_LIST_HEAD(&ctx->flexible_groups);
2066 2067
	INIT_LIST_HEAD(&ctx->event_list);
	atomic_set(&ctx->refcount, 1);
}

static struct perf_event_context *
alloc_perf_context(struct pmu *pmu, struct task_struct *task)
{
	struct perf_event_context *ctx;

	ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
	if (!ctx)
		return NULL;

	__perf_event_init_context(ctx);
	if (task) {
		ctx->task = task;
		get_task_struct(task);
	}
	ctx->pmu = pmu;

	return ctx;
2087 2088
}

static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
	struct task_struct *task;
	int err;

	rcu_read_lock();
	if (!vpid)
		task = current;
	else
		task = find_task_by_vpid(vpid);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	if (!task)
		return ERR_PTR(-ESRCH);

	/*
	 * Can't attach events to a dying task.
	 */
	err = -ESRCH;
	if (task->flags & PF_EXITING)
		goto errout;

	/* Reuse ptrace permission checks for now. */
	err = -EACCES;
	if (!ptrace_may_access(task, PTRACE_MODE_READ))
		goto errout;

	return task;
errout:
	put_task_struct(task);
	return ERR_PTR(err);

}

P
Peter Zijlstra 已提交
2126
static struct perf_event_context *
M
Matt Helsley 已提交
2127
find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
T
Thomas Gleixner 已提交
2128
{
2129
	struct perf_event_context *ctx;
2130
	struct perf_cpu_context *cpuctx;
2131
	unsigned long flags;
P
Peter Zijlstra 已提交
2132
	int ctxn, err;
T
Thomas Gleixner 已提交
2133

M
Matt Helsley 已提交
2134
	if (!task && cpu != -1) {
2135
		/* Must be root to operate on a CPU event: */
2136
		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
T
Thomas Gleixner 已提交
2137 2138
			return ERR_PTR(-EACCES);

2139
		if (cpu < 0 || cpu >= nr_cpumask_bits)
T
Thomas Gleixner 已提交
2140 2141 2142
			return ERR_PTR(-EINVAL);

		/*
2143
		 * We could be clever and allow attaching an event to an
T
Thomas Gleixner 已提交
2144 2145 2146
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
		 */
2147
		if (!cpu_online(cpu))
T
Thomas Gleixner 已提交
2148 2149
			return ERR_PTR(-ENODEV);

P
Peter Zijlstra 已提交
2150
		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
T
Thomas Gleixner 已提交
2151
		ctx = &cpuctx->ctx;
2152
		get_ctx(ctx);
T
Thomas Gleixner 已提交
2153 2154 2155 2156

		return ctx;
	}

P
Peter Zijlstra 已提交
2157 2158 2159 2160 2161
	err = -EINVAL;
	ctxn = pmu->task_ctx_nr;
	if (ctxn < 0)
		goto errout;

P
Peter Zijlstra 已提交
2162
retry:
P
Peter Zijlstra 已提交
2163
	ctx = perf_lock_task_context(task, ctxn, &flags);
2164
	if (ctx) {
2165
		unclone_ctx(ctx);
2166
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
T
Thomas Gleixner 已提交
2167 2168
	}

2169
	if (!ctx) {
2170
		ctx = alloc_perf_context(pmu, task);
2171 2172 2173
		err = -ENOMEM;
		if (!ctx)
			goto errout;
2174

2175
		get_ctx(ctx);
2176

P
Peter Zijlstra 已提交
2177
		if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
2178 2179 2180 2181
			/*
			 * We raced with some other task; use
			 * the context they set.
			 */
2182
			put_task_struct(task);
2183
			kfree(ctx);
2184
			goto retry;
2185 2186 2187
		}
	}

2188
	put_task_struct(task);
T
Thomas Gleixner 已提交
2189
	return ctx;
2190

P
Peter Zijlstra 已提交
2191
errout:
2192 2193
	put_task_struct(task);
	return ERR_PTR(err);
T
Thomas Gleixner 已提交
2194 2195
}

L
Li Zefan 已提交
2196 2197
static void perf_event_free_filter(struct perf_event *event);

2198
static void free_event_rcu(struct rcu_head *head)
P
Peter Zijlstra 已提交
2199
{
2200
	struct perf_event *event;
P
Peter Zijlstra 已提交
2201

2202 2203 2204
	event = container_of(head, struct perf_event, rcu_head);
	if (event->ns)
		put_pid_ns(event->ns);
L
Li Zefan 已提交
2205
	perf_event_free_filter(event);
2206
	kfree(event);
P
Peter Zijlstra 已提交
2207 2208
}

2209
static void perf_buffer_put(struct perf_buffer *buffer);
2210

2211
static void free_event(struct perf_event *event)
2212
{
2213
	irq_work_sync(&event->pending);
2214

2215 2216
	if (!event->parent) {
		atomic_dec(&nr_events);
2217
		if (event->attr.mmap || event->attr.mmap_data)
2218 2219 2220 2221 2222
			atomic_dec(&nr_mmap_events);
		if (event->attr.comm)
			atomic_dec(&nr_comm_events);
		if (event->attr.task)
			atomic_dec(&nr_task_events);
2223 2224
		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
			put_callchain_buffers();
2225
	}
2226

2227 2228 2229
	if (event->buffer) {
		perf_buffer_put(event->buffer);
		event->buffer = NULL;
2230 2231
	}

2232 2233
	if (event->destroy)
		event->destroy(event);
2234

P
Peter Zijlstra 已提交
2235 2236 2237
	if (event->ctx)
		put_ctx(event->ctx);

2238
	call_rcu(&event->rcu_head, free_event_rcu);
2239 2240
}

2241
int perf_event_release_kernel(struct perf_event *event)
T
Thomas Gleixner 已提交
2242
{
2243
	struct perf_event_context *ctx = event->ctx;
T
Thomas Gleixner 已提交
2244

2245 2246 2247 2248 2249 2250
	/*
	 * Remove from the PMU, can't get re-enabled since we got
	 * here because the last ref went.
	 */
	perf_event_disable(event);

2251
	WARN_ON_ONCE(ctx->parent_ctx);
2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264
	/*
	 * There are two ways this annotation is useful:
	 *
	 *  1) there is a lock recursion from perf_event_exit_task
	 *     see the comment there.
	 *
	 *  2) there is a lock-inversion with mmap_sem through
	 *     perf_event_read_group(), which takes faults while
	 *     holding ctx->mutex, however this is called after
	 *     the last filedesc died, so there is no possibility
	 *     to trigger the AB-BA case.
	 */
	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2265
	raw_spin_lock_irq(&ctx->lock);
2266
	perf_group_detach(event);
2267 2268
	list_del_event(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
2269
	mutex_unlock(&ctx->mutex);
T
Thomas Gleixner 已提交
2270

2271 2272 2273 2274
	mutex_lock(&event->owner->perf_event_mutex);
	list_del_init(&event->owner_entry);
	mutex_unlock(&event->owner->perf_event_mutex);
	put_task_struct(event->owner);
2275

2276
	free_event(event);
T
Thomas Gleixner 已提交
2277 2278 2279

	return 0;
}
2280
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
T
Thomas Gleixner 已提交
2281

2282 2283 2284 2285
/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
2286
{
2287
	struct perf_event *event = file->private_data;
2288

2289
	file->private_data = NULL;
2290

2291
	return perf_event_release_kernel(event);
2292 2293
}

2294
static int perf_event_read_size(struct perf_event *event)
2295 2296 2297 2298 2299
{
	int entry = sizeof(u64); /* value */
	int size = 0;
	int nr = 1;

2300
	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2301 2302
		size += sizeof(u64);

2303
	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2304 2305
		size += sizeof(u64);

2306
	if (event->attr.read_format & PERF_FORMAT_ID)
2307 2308
		entry += sizeof(u64);

2309 2310
	if (event->attr.read_format & PERF_FORMAT_GROUP) {
		nr += event->group_leader->nr_siblings;
2311 2312 2313 2314 2315 2316 2317 2318
		size += sizeof(u64);
	}

	size += entry * nr;

	return size;
}

2319
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
2320
{
2321
	struct perf_event *child;
2322 2323
	u64 total = 0;

2324 2325 2326
	*enabled = 0;
	*running = 0;

2327
	mutex_lock(&event->child_mutex);
2328
	total += perf_event_read(event);
2329 2330 2331 2332 2333 2334
	*enabled += event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
	*running += event->total_time_running +
			atomic64_read(&event->child_total_time_running);

	list_for_each_entry(child, &event->child_list, child_list) {
2335
		total += perf_event_read(child);
2336 2337 2338
		*enabled += child->total_time_enabled;
		*running += child->total_time_running;
	}
2339
	mutex_unlock(&event->child_mutex);
2340 2341 2342

	return total;
}
EXPORT_SYMBOL_GPL(perf_event_read_value);
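/*
 * Usage sketch (not taken from this file): an in-kernel user that owns
 * an event, e.g. one created with perf_event_create_kernel_counter(),
 * could read it as:
 *
 *	u64 enabled, running;
 *	u64 count = perf_event_read_value(event, &enabled, &running);
 *
 * 'enabled' and 'running' let the caller scale 'count' when the event
 * was time-multiplexed.
 */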

static int perf_event_read_group(struct perf_event *event,
2346 2347
				   u64 read_format, char __user *buf)
{
2348
	struct perf_event *leader = event->group_leader, *sub;
2349 2350
	int n = 0, size = 0, ret = -EFAULT;
	struct perf_event_context *ctx = leader->ctx;
2351
	u64 values[5];
2352
	u64 count, enabled, running;
2353

2354
	mutex_lock(&ctx->mutex);
2355
	count = perf_event_read_value(leader, &enabled, &running);
2356 2357

	values[n++] = 1 + leader->nr_siblings;
2358 2359 2360 2361
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		values[n++] = enabled;
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		values[n++] = running;
2362 2363 2364
	values[n++] = count;
	if (read_format & PERF_FORMAT_ID)
		values[n++] = primary_event_id(leader);
2365 2366 2367 2368

	size = n * sizeof(u64);

	if (copy_to_user(buf, values, size))
2369
		goto unlock;
2370

2371
	ret = size;
2372

2373
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2374
		n = 0;
2375

2376
		values[n++] = perf_event_read_value(sub, &enabled, &running);
2377 2378 2379 2380 2381
		if (read_format & PERF_FORMAT_ID)
			values[n++] = primary_event_id(sub);

		size = n * sizeof(u64);

2382
		if (copy_to_user(buf + ret, values, size)) {
2383 2384 2385
			ret = -EFAULT;
			goto unlock;
		}
2386 2387

		ret += size;
2388
	}
2389 2390
unlock:
	mutex_unlock(&ctx->mutex);
2391

2392
	return ret;
2393 2394
}

2395
static int perf_event_read_one(struct perf_event *event,
2396 2397
				 u64 read_format, char __user *buf)
{
2398
	u64 enabled, running;
2399 2400 2401
	u64 values[4];
	int n = 0;

2402 2403 2404 2405 2406
	values[n++] = perf_event_read_value(event, &enabled, &running);
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		values[n++] = enabled;
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		values[n++] = running;
2407
	if (read_format & PERF_FORMAT_ID)
2408
		values[n++] = primary_event_id(event);
2409 2410 2411 2412 2413 2414 2415

	if (copy_to_user(buf, values, n * sizeof(u64)))
		return -EFAULT;

	return n * sizeof(u64);
}

T
Thomas Gleixner 已提交
2416
/*
2417
 * Read the performance event - simple non-blocking version for now
T
Thomas Gleixner 已提交
2418 2419
 */
static ssize_t
2420
perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
T
Thomas Gleixner 已提交
2421
{
2422
	u64 read_format = event->attr.read_format;
2423
	int ret;
T
Thomas Gleixner 已提交
2424

2425
	/*
2426
	 * Return end-of-file for a read on an event that is in
2427 2428 2429
	 * error state (i.e. because it was pinned but it couldn't be
	 * scheduled on to the CPU at some point).
	 */
2430
	if (event->state == PERF_EVENT_STATE_ERROR)
2431 2432
		return 0;

2433
	if (count < perf_event_read_size(event))
2434 2435
		return -ENOSPC;

2436
	WARN_ON_ONCE(event->ctx->parent_ctx);
2437
	if (read_format & PERF_FORMAT_GROUP)
2438
		ret = perf_event_read_group(event, read_format, buf);
2439
	else
2440
		ret = perf_event_read_one(event, read_format, buf);
T
Thomas Gleixner 已提交
2441

2442
	return ret;
T
Thomas Gleixner 已提交
2443 2444 2445 2446 2447
}

static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
2448
	struct perf_event *event = file->private_data;
T
Thomas Gleixner 已提交
2449

2450
	return perf_read_hw(event, buf, count);
T
Thomas Gleixner 已提交
2451 2452 2453 2454
}

static unsigned int perf_poll(struct file *file, poll_table *wait)
{
2455
	struct perf_event *event = file->private_data;
2456
	struct perf_buffer *buffer;
2457
	unsigned int events = POLL_HUP;
P
Peter Zijlstra 已提交
2458 2459

	rcu_read_lock();
2460 2461 2462
	buffer = rcu_dereference(event->buffer);
	if (buffer)
		events = atomic_xchg(&buffer->poll, 0);
P
Peter Zijlstra 已提交
2463
	rcu_read_unlock();
T
Thomas Gleixner 已提交
2464

2465
	poll_wait(file, &event->waitq, wait);
T
Thomas Gleixner 已提交
2466 2467 2468 2469

	return events;
}

2470
static void perf_event_reset(struct perf_event *event)
2471
{
2472
	(void)perf_event_read(event);
2473
	local64_set(&event->count, 0);
2474
	perf_event_update_userpage(event);
P
Peter Zijlstra 已提交
2475 2476
}

2477
/*
2478 2479 2480 2481
 * Holding the top-level event's child_mutex means that any
 * descendant process that has inherited this event will block
 * in sync_child_event if it goes to exit, thus satisfying the
 * task existence requirements of perf_event_enable/disable.
2482
 */
2483 2484
static void perf_event_for_each_child(struct perf_event *event,
					void (*func)(struct perf_event *))
P
Peter Zijlstra 已提交
2485
{
2486
	struct perf_event *child;
P
Peter Zijlstra 已提交
2487

2488 2489 2490 2491
	WARN_ON_ONCE(event->ctx->parent_ctx);
	mutex_lock(&event->child_mutex);
	func(event);
	list_for_each_entry(child, &event->child_list, child_list)
P
Peter Zijlstra 已提交
2492
		func(child);
2493
	mutex_unlock(&event->child_mutex);
P
Peter Zijlstra 已提交
2494 2495
}

2496 2497
static void perf_event_for_each(struct perf_event *event,
				  void (*func)(struct perf_event *))
P
Peter Zijlstra 已提交
2498
{
2499 2500
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *sibling;
P
Peter Zijlstra 已提交
2501

2502 2503
	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
2504
	event = event->group_leader;
2505

2506 2507 2508 2509
	perf_event_for_each_child(event, func);
	func(event);
	list_for_each_entry(sibling, &event->sibling_list, group_entry)
		perf_event_for_each_child(event, func);
2510
	mutex_unlock(&ctx->mutex);
2511 2512
}

2513
static int perf_event_period(struct perf_event *event, u64 __user *arg)
2514
{
2515
	struct perf_event_context *ctx = event->ctx;
2516 2517 2518 2519
	unsigned long size;
	int ret = 0;
	u64 value;

2520
	if (!event->attr.sample_period)
2521 2522 2523 2524 2525 2526 2527 2528 2529
		return -EINVAL;

	size = copy_from_user(&value, arg, sizeof(value));
	if (size != sizeof(value))
		return -EFAULT;

	if (!value)
		return -EINVAL;

2530
	raw_spin_lock_irq(&ctx->lock);
2531 2532
	if (event->attr.freq) {
		if (value > sysctl_perf_event_sample_rate) {
2533 2534 2535 2536
			ret = -EINVAL;
			goto unlock;
		}

2537
		event->attr.sample_freq = value;
2538
	} else {
2539 2540
		event->attr.sample_period = value;
		event->hw.sample_period = value;
2541 2542
	}
unlock:
2543
	raw_spin_unlock_irq(&ctx->lock);
2544 2545 2546 2547

	return ret;
}

static const struct file_operations perf_fops;

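/*
 * Resolve a file descriptor to a perf_event, checking that the file
 * really is a perf event file (f_op == &perf_fops).  On success the
 * caller must drop the reference with fput_light(event->filp, fput_needed).
 */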
static struct perf_event *perf_fget_light(int fd, int *fput_needed)
{
	struct file *file;

	file = fget_light(fd, fput_needed);
	if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_op != &perf_fops) {
		fput_light(file, *fput_needed);
		*fput_needed = 0;
		return ERR_PTR(-EBADF);
	}

	return file->private_data;
}

static int perf_event_set_output(struct perf_event *event,
				 struct perf_event *output_event);
L
Li Zefan 已提交
2569
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2570

2571 2572
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
2573 2574
	struct perf_event *event = file->private_data;
	void (*func)(struct perf_event *);
P
Peter Zijlstra 已提交
2575
	u32 flags = arg;
2576 2577

	switch (cmd) {
2578 2579
	case PERF_EVENT_IOC_ENABLE:
		func = perf_event_enable;
2580
		break;
2581 2582
	case PERF_EVENT_IOC_DISABLE:
		func = perf_event_disable;
2583
		break;
2584 2585
	case PERF_EVENT_IOC_RESET:
		func = perf_event_reset;
2586
		break;
P
Peter Zijlstra 已提交
2587

2588 2589
	case PERF_EVENT_IOC_REFRESH:
		return perf_event_refresh(event, arg);
2590

2591 2592
	case PERF_EVENT_IOC_PERIOD:
		return perf_event_period(event, (u64 __user *)arg);
2593

2594
	case PERF_EVENT_IOC_SET_OUTPUT:
	{
		struct perf_event *output_event = NULL;
		int fput_needed = 0;
		int ret;

		if (arg != -1) {
			output_event = perf_fget_light(arg, &fput_needed);
			if (IS_ERR(output_event))
				return PTR_ERR(output_event);
		}

		ret = perf_event_set_output(event, output_event);
		if (output_event)
			fput_light(output_event->filp, fput_needed);

		return ret;
	}
2612

L
Li Zefan 已提交
2613 2614 2615
	case PERF_EVENT_IOC_SET_FILTER:
		return perf_event_set_filter(event, (void __user *)arg);

2616
	default:
P
Peter Zijlstra 已提交
2617
		return -ENOTTY;
2618
	}
P
Peter Zijlstra 已提交
2619 2620

	if (flags & PERF_IOC_FLAG_GROUP)
2621
		perf_event_for_each(event, func);
P
Peter Zijlstra 已提交
2622
	else
2623
		perf_event_for_each_child(event, func);
P
Peter Zijlstra 已提交
2624 2625

	return 0;
2626 2627
}

2628
int perf_event_task_enable(void)
2629
{
2630
	struct perf_event *event;
2631

2632 2633 2634 2635
	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry)
		perf_event_for_each_child(event, perf_event_enable);
	mutex_unlock(&current->perf_event_mutex);
2636 2637 2638 2639

	return 0;
}

2640
int perf_event_task_disable(void)
2641
{
2642
	struct perf_event *event;
2643

2644 2645 2646 2647
	mutex_lock(&current->perf_event_mutex);
	list_for_each_entry(event, &current->perf_event_list, owner_entry)
		perf_event_for_each_child(event, perf_event_disable);
	mutex_unlock(&current->perf_event_mutex);
2648 2649 2650 2651

	return 0;
}

2652 2653
#ifndef PERF_EVENT_INDEX_OFFSET
# define PERF_EVENT_INDEX_OFFSET 0
I
Ingo Molnar 已提交
2654 2655
#endif

2656
static int perf_event_index(struct perf_event *event)
2657
{
P
Peter Zijlstra 已提交
2658 2659 2660
	if (event->hw.state & PERF_HES_STOPPED)
		return 0;

2661
	if (event->state != PERF_EVENT_STATE_ACTIVE)
2662 2663
		return 0;

2664
	return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2665 2666
}

2667 2668 2669 2670 2671
/*
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
2672
void perf_event_update_userpage(struct perf_event *event)
2673
{
2674
	struct perf_event_mmap_page *userpg;
2675
	struct perf_buffer *buffer;
2676 2677

	rcu_read_lock();
2678 2679
	buffer = rcu_dereference(event->buffer);
	if (!buffer)
2680 2681
		goto unlock;

2682
	userpg = buffer->user_page;
2683

2684 2685 2686 2687 2688
	/*
	 * Disable preemption so as to not let the corresponding user-space
	 * spin too long if we get preempted.
	 */
	preempt_disable();
2689
	++userpg->lock;
2690
	barrier();
2691
	userpg->index = perf_event_index(event);
P
Peter Zijlstra 已提交
2692
	userpg->offset = perf_event_count(event);
2693
	if (event->state == PERF_EVENT_STATE_ACTIVE)
2694
		userpg->offset -= local64_read(&event->hw.prev_count);
2695

2696 2697
	userpg->time_enabled = event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
2698

2699 2700
	userpg->time_running = event->total_time_running +
			atomic64_read(&event->child_total_time_running);
2701

2702
	barrier();
2703
	++userpg->lock;
2704
	preempt_enable();
2705
unlock:
2706
	rcu_read_unlock();
2707 2708
}

static unsigned long perf_data_size(struct perf_buffer *buffer);

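/*
 * Common buffer setup: if no wakeup watermark was requested, default to
 * waking up the consumer once half of the buffer is full.
 */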
static void
perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
{
	long max_size = perf_data_size(buffer);

	if (watermark)
		buffer->watermark = min(max_size, watermark);

	if (!buffer->watermark)
		buffer->watermark = max_size / 2;

	if (flags & PERF_BUFFER_WRITABLE)
		buffer->writable = 1;

	atomic_set(&buffer->refcount, 1);
}

2728
#ifndef CONFIG_PERF_USE_VMALLOC
2729

2730 2731 2732
/*
 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
 */
2733

2734
static struct page *
2735
perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2736
{
2737
	if (pgoff > buffer->nr_pages)
2738
		return NULL;
2739

2740
	if (pgoff == 0)
2741
		return virt_to_page(buffer->user_page);
2742

2743
	return virt_to_page(buffer->data_pages[pgoff - 1]);
2744 2745
}

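/*
 * Allocate one zeroed buffer page, preferably on the node of the cpu the
 * event is bound to (cpu == -1 means no node preference).
 */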
static void *perf_mmap_alloc_page(int cpu)
{
	struct page *page;
	int node;

	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
	if (!page)
		return NULL;

	return page_address(page);
}

2759
static struct perf_buffer *
2760
perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2761
{
2762
	struct perf_buffer *buffer;
2763 2764 2765
	unsigned long size;
	int i;

2766
	size = sizeof(struct perf_buffer);
2767 2768
	size += nr_pages * sizeof(void *);

2769 2770
	buffer = kzalloc(size, GFP_KERNEL);
	if (!buffer)
2771 2772
		goto fail;

2773
	buffer->user_page = perf_mmap_alloc_page(cpu);
2774
	if (!buffer->user_page)
2775 2776 2777
		goto fail_user_page;

	for (i = 0; i < nr_pages; i++) {
2778
		buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
2779
		if (!buffer->data_pages[i])
2780 2781 2782
			goto fail_data_pages;
	}

2783
	buffer->nr_pages = nr_pages;
2784

2785 2786
	perf_buffer_init(buffer, watermark, flags);

2787
	return buffer;
2788 2789 2790

fail_data_pages:
	for (i--; i >= 0; i--)
2791
		free_page((unsigned long)buffer->data_pages[i]);
2792

2793
	free_page((unsigned long)buffer->user_page);
2794 2795

fail_user_page:
2796
	kfree(buffer);
2797 2798

fail:
2799
	return NULL;
2800 2801
}

2802 2803
static void perf_mmap_free_page(unsigned long addr)
{
K
Kevin Cernekee 已提交
2804
	struct page *page = virt_to_page((void *)addr);
2805 2806 2807 2808 2809

	page->mapping = NULL;
	__free_page(page);
}

2810
static void perf_buffer_free(struct perf_buffer *buffer)
2811 2812 2813
{
	int i;

2814 2815 2816 2817
	perf_mmap_free_page((unsigned long)buffer->user_page);
	for (i = 0; i < buffer->nr_pages; i++)
		perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
	kfree(buffer);
2818 2819
}

2820
static inline int page_order(struct perf_buffer *buffer)
2821 2822 2823 2824
{
	return 0;
}

2825 2826 2827 2828 2829 2830 2831 2832
#else

/*
 * Back perf_mmap() with vmalloc memory.
 *
 * Required for architectures that have d-cache aliasing issues.
 */

2833
static inline int page_order(struct perf_buffer *buffer)
2834
{
2835
	return buffer->page_order;
2836 2837
}

2838
static struct page *
2839
perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2840
{
2841
	if (pgoff > (1UL << page_order(buffer)))
2842 2843
		return NULL;

2844
	return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
2845 2846 2847 2848 2849 2850 2851 2852 2853
}

static void perf_mmap_unmark_page(void *addr)
{
	struct page *page = vmalloc_to_page(addr);

	page->mapping = NULL;
}

2854
static void perf_buffer_free_work(struct work_struct *work)
2855
{
2856
	struct perf_buffer *buffer;
2857 2858 2859
	void *base;
	int i, nr;

2860 2861
	buffer = container_of(work, struct perf_buffer, work);
	nr = 1 << page_order(buffer);
2862

2863
	base = buffer->user_page;
2864 2865 2866 2867
	for (i = 0; i < nr + 1; i++)
		perf_mmap_unmark_page(base + (i * PAGE_SIZE));

	vfree(base);
2868
	kfree(buffer);
2869 2870
}

2871
static void perf_buffer_free(struct perf_buffer *buffer)
2872
{
2873
	schedule_work(&buffer->work);
2874 2875
}

2876
static struct perf_buffer *
2877
perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2878
{
2879
	struct perf_buffer *buffer;
2880 2881 2882
	unsigned long size;
	void *all_buf;

2883
	size = sizeof(struct perf_buffer);
2884 2885
	size += sizeof(void *);

2886 2887
	buffer = kzalloc(size, GFP_KERNEL);
	if (!buffer)
2888 2889
		goto fail;

2890
	INIT_WORK(&buffer->work, perf_buffer_free_work);
2891 2892 2893 2894 2895

	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
	if (!all_buf)
		goto fail_all_buf;

2896 2897 2898 2899
	buffer->user_page = all_buf;
	buffer->data_pages[0] = all_buf + PAGE_SIZE;
	buffer->page_order = ilog2(nr_pages);
	buffer->nr_pages = 1;
2900

2901 2902
	perf_buffer_init(buffer, watermark, flags);

2903
	return buffer;
2904 2905

fail_all_buf:
2906
	kfree(buffer);
2907 2908 2909 2910 2911 2912 2913

fail:
	return NULL;
}

#endif

2914
static unsigned long perf_data_size(struct perf_buffer *buffer)
2915
{
2916
	return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
2917 2918
}

2919 2920 2921
static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct perf_event *event = vma->vm_file->private_data;
2922
	struct perf_buffer *buffer;
2923 2924 2925 2926 2927 2928 2929 2930 2931
	int ret = VM_FAULT_SIGBUS;

	if (vmf->flags & FAULT_FLAG_MKWRITE) {
		if (vmf->pgoff == 0)
			ret = 0;
		return ret;
	}

	rcu_read_lock();
2932 2933
	buffer = rcu_dereference(event->buffer);
	if (!buffer)
2934 2935 2936 2937 2938
		goto unlock;

	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
		goto unlock;

2939
	vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953
	if (!vmf->page)
		goto unlock;

	get_page(vmf->page);
	vmf->page->mapping = vma->vm_file->f_mapping;
	vmf->page->index   = vmf->pgoff;

	ret = 0;
unlock:
	rcu_read_unlock();

	return ret;
}

2954
static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
2955
{
2956
	struct perf_buffer *buffer;
2957

2958 2959
	buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
	perf_buffer_free(buffer);
2960 2961
}

2962
static struct perf_buffer *perf_buffer_get(struct perf_event *event)
2963
{
2964
	struct perf_buffer *buffer;
2965

2966
	rcu_read_lock();
2967 2968 2969 2970
	buffer = rcu_dereference(event->buffer);
	if (buffer) {
		if (!atomic_inc_not_zero(&buffer->refcount))
			buffer = NULL;
2971 2972 2973
	}
	rcu_read_unlock();

2974
	return buffer;
2975 2976
}

2977
static void perf_buffer_put(struct perf_buffer *buffer)
2978
{
2979
	if (!atomic_dec_and_test(&buffer->refcount))
2980
		return;
2981

2982
	call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
2983 2984 2985 2986
}

static void perf_mmap_open(struct vm_area_struct *vma)
{
2987
	struct perf_event *event = vma->vm_file->private_data;
2988

2989
	atomic_inc(&event->mmap_count);
2990 2991 2992 2993
}

static void perf_mmap_close(struct vm_area_struct *vma)
{
2994
	struct perf_event *event = vma->vm_file->private_data;
2995

2996
	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2997
		unsigned long size = perf_data_size(event->buffer);
2998
		struct user_struct *user = event->mmap_user;
2999
		struct perf_buffer *buffer = event->buffer;
3000

3001
		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3002
		vma->vm_mm->locked_vm -= event->mmap_locked;
3003
		rcu_assign_pointer(event->buffer, NULL);
3004
		mutex_unlock(&event->mmap_mutex);
3005

3006
		perf_buffer_put(buffer);
3007
		free_uid(user);
3008
	}
3009 3010
}

3011
static const struct vm_operations_struct perf_mmap_vmops = {
3012 3013 3014 3015
	.open		= perf_mmap_open,
	.close		= perf_mmap_close,
	.fault		= perf_mmap_fault,
	.page_mkwrite	= perf_mmap_fault,
3016 3017 3018 3019
};

static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
3020
	struct perf_event *event = file->private_data;
3021
	unsigned long user_locked, user_lock_limit;
3022
	struct user_struct *user = current_user();
3023
	unsigned long locked, lock_limit;
3024
	struct perf_buffer *buffer;
3025 3026
	unsigned long vma_size;
	unsigned long nr_pages;
3027
	long user_extra, extra;
3028
	int ret = 0, flags = 0;
3029

3030 3031 3032 3033 3034 3035 3036 3037
	/*
	 * Don't allow mmap() of inherited per-task counters. This would
	 * create a performance issue due to all children writing to the
	 * same buffer.
	 */
	if (event->cpu == -1 && event->attr.inherit)
		return -EINVAL;

3038
	if (!(vma->vm_flags & VM_SHARED))
3039
		return -EINVAL;
3040 3041 3042 3043

	vma_size = vma->vm_end - vma->vm_start;
	nr_pages = (vma_size / PAGE_SIZE) - 1;

3044
	/*
3045
	 * If we have buffer pages ensure they're a power-of-two number, so we
3046 3047 3048
	 * can do bitmasks instead of modulo.
	 */
	if (nr_pages != 0 && !is_power_of_2(nr_pages))
3049 3050
		return -EINVAL;

3051
	if (vma_size != PAGE_SIZE * (1 + nr_pages))
3052 3053
		return -EINVAL;

3054 3055
	if (vma->vm_pgoff != 0)
		return -EINVAL;
3056

3057 3058
	WARN_ON_ONCE(event->ctx->parent_ctx);
	mutex_lock(&event->mmap_mutex);
3059 3060 3061
	if (event->buffer) {
		if (event->buffer->nr_pages == nr_pages)
			atomic_inc(&event->buffer->refcount);
3062
		else
3063 3064 3065 3066
			ret = -EINVAL;
		goto unlock;
	}

3067
	user_extra = nr_pages + 1;
3068
	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
I
Ingo Molnar 已提交
3069 3070 3071 3072 3073 3074

	/*
	 * Increase the limit linearly with more CPUs:
	 */
	user_lock_limit *= num_online_cpus();

3075
	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
3076

3077 3078 3079
	extra = 0;
	if (user_locked > user_lock_limit)
		extra = user_locked - user_lock_limit;
3080

3081
	lock_limit = rlimit(RLIMIT_MEMLOCK);
3082
	lock_limit >>= PAGE_SHIFT;
3083
	locked = vma->vm_mm->locked_vm + extra;
3084

3085 3086
	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
		!capable(CAP_IPC_LOCK)) {
3087 3088 3089
		ret = -EPERM;
		goto unlock;
	}
3090

3091
	WARN_ON(event->buffer);
3092

3093 3094 3095 3096 3097
	if (vma->vm_flags & VM_WRITE)
		flags |= PERF_BUFFER_WRITABLE;

	buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
				   event->cpu, flags);
3098
	if (!buffer) {
3099
		ret = -ENOMEM;
3100
		goto unlock;
3101
	}
3102
	rcu_assign_pointer(event->buffer, buffer);
3103

3104 3105 3106 3107 3108
	atomic_long_add(user_extra, &user->locked_vm);
	event->mmap_locked = extra;
	event->mmap_user = get_current_user();
	vma->vm_mm->locked_vm += event->mmap_locked;

3109
unlock:
3110 3111
	if (!ret)
		atomic_inc(&event->mmap_count);
3112
	mutex_unlock(&event->mmap_mutex);
3113 3114 3115

	vma->vm_flags |= VM_RESERVED;
	vma->vm_ops = &perf_mmap_vmops;
3116 3117

	return ret;
3118 3119
}

P
Peter Zijlstra 已提交
3120 3121 3122
static int perf_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
3123
	struct perf_event *event = filp->private_data;
P
Peter Zijlstra 已提交
3124 3125 3126
	int retval;

	mutex_lock(&inode->i_mutex);
3127
	retval = fasync_helper(fd, filp, on, &event->fasync);
P
Peter Zijlstra 已提交
3128 3129 3130 3131 3132 3133 3134 3135
	mutex_unlock(&inode->i_mutex);

	if (retval < 0)
		return retval;

	return 0;
}

T
Thomas Gleixner 已提交
3136
static const struct file_operations perf_fops = {
3137
	.llseek			= no_llseek,
T
Thomas Gleixner 已提交
3138 3139 3140
	.release		= perf_release,
	.read			= perf_read,
	.poll			= perf_poll,
3141 3142
	.unlocked_ioctl		= perf_ioctl,
	.compat_ioctl		= perf_ioctl,
3143
	.mmap			= perf_mmap,
P
Peter Zijlstra 已提交
3144
	.fasync			= perf_fasync,
T
Thomas Gleixner 已提交
3145 3146
};

3147
/*
3148
 * Perf event wakeup
3149 3150 3151 3152 3153
 *
 * If there's data, ensure we set the poll() state and publish everything
 * to user-space before waking everybody up.
 */

3154
void perf_event_wakeup(struct perf_event *event)
3155
{
3156
	wake_up_all(&event->waitq);
3157

3158 3159 3160
	if (event->pending_kill) {
		kill_fasync(&event->fasync, SIGIO, event->pending_kill);
		event->pending_kill = 0;
3161
	}
3162 3163
}

3164
static void perf_pending_event(struct irq_work *entry)
3165
{
3166 3167
	struct perf_event *event = container_of(entry,
			struct perf_event, pending);
3168

3169 3170 3171
	if (event->pending_disable) {
		event->pending_disable = 0;
		__perf_event_disable(event);
3172 3173
	}

3174 3175 3176
	if (event->pending_wakeup) {
		event->pending_wakeup = 0;
		perf_event_wakeup(event);
3177 3178 3179
	}
}

/*
 * We assume there is only KVM supporting the callbacks.
 * Later on, we might change it to a list if there is
 * another virtualization implementation supporting the callbacks.
 */
struct perf_guest_info_callbacks *perf_guest_cbs;

int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
	perf_guest_cbs = cbs;
	return 0;
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);

int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
	perf_guest_cbs = NULL;
	return 0;
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);

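/*
 * Usage sketch (not taken from this file): a hypervisor module would
 * typically register a static callback table at init time and remove it
 * on exit:
 *
 *	static struct perf_guest_info_callbacks my_guest_cbs = {
 *		.is_in_guest	= my_is_in_guest,
 *		.is_user_mode	= my_is_user_mode,
 *		.get_guest_ip	= my_get_guest_ip,
 *	};
 *	perf_register_guest_info_callbacks(&my_guest_cbs);
 *
 * The my_* callbacks are hypothetical and the field names are an
 * assumption; check struct perf_guest_info_callbacks in
 * <linux/perf_event.h> for the actual layout.
 */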
/*
 * Output
 */
3204
static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
3205
			      unsigned long offset, unsigned long head)
3206 3207 3208
{
	unsigned long mask;

3209
	if (!buffer->writable)
3210 3211
		return true;

3212
	mask = perf_data_size(buffer) - 1;
3213 3214 3215 3216 3217 3218 3219 3220 3221 3222

	offset = (offset - tail) & mask;
	head   = (head   - tail) & mask;

	if ((int)(head - offset) < 0)
		return false;

	return true;
}

3223
static void perf_output_wakeup(struct perf_output_handle *handle)
3224
{
3225
	atomic_set(&handle->buffer->poll, POLL_IN);
3226

3227
	if (handle->nmi) {
3228
		handle->event->pending_wakeup = 1;
3229
		irq_work_queue(&handle->event->pending);
3230
	} else
3231
		perf_event_wakeup(handle->event);
3232 3233
}

3234
/*
3235
 * We need to ensure a later event_id doesn't publish a head when a former
3236
 * event isn't done writing. However since we need to deal with NMIs we
3237 3238 3239
 * cannot fully serialize things.
 *
 * We only publish the head (and generate a wakeup) when the outer-most
3240
 * event completes.
3241
 */
3242
static void perf_output_get_handle(struct perf_output_handle *handle)
3243
{
3244
	struct perf_buffer *buffer = handle->buffer;
3245

3246
	preempt_disable();
3247 3248
	local_inc(&buffer->nest);
	handle->wakeup = local_read(&buffer->wakeup);
3249 3250
}

3251
static void perf_output_put_handle(struct perf_output_handle *handle)
3252
{
3253
	struct perf_buffer *buffer = handle->buffer;
3254
	unsigned long head;
3255 3256

again:
3257
	head = local_read(&buffer->head);
3258 3259

	/*
3260
	 * IRQ/NMI can happen here, which means we can miss a head update.
3261 3262
	 */

3263
	if (!local_dec_and_test(&buffer->nest))
3264
		goto out;
3265 3266

	/*
	 * Publish the known good head. Rely on the full barrier implied
	 * by atomic_dec_and_test() to order the buffer->head read and this
	 * write.
	 */
3271
	buffer->user_page->data_head = head;
3272

3273 3274
	/*
	 * Now check if we missed an update, rely on the (compiler)
3275
	 * barrier in atomic_dec_and_test() to re-read buffer->head.
3276
	 */
3277 3278
	if (unlikely(head != local_read(&buffer->head))) {
		local_inc(&buffer->nest);
3279 3280 3281
		goto again;
	}

3282
	if (handle->wakeup != local_read(&buffer->wakeup))
3283
		perf_output_wakeup(handle);
3284

P
Peter Zijlstra 已提交
3285
out:
3286
	preempt_enable();
3287 3288
}

3289
__always_inline void perf_output_copy(struct perf_output_handle *handle,
3290
		      const void *buf, unsigned int len)
3291
{
3292
	do {
3293
		unsigned long size = min_t(unsigned long, handle->size, len);
3294 3295 3296 3297 3298

		memcpy(handle->addr, buf, size);

		len -= size;
		handle->addr += size;
3299
		buf += size;
3300 3301
		handle->size -= size;
		if (!handle->size) {
3302
			struct perf_buffer *buffer = handle->buffer;
3303

3304
			handle->page++;
3305 3306 3307
			handle->page &= buffer->nr_pages - 1;
			handle->addr = buffer->data_pages[handle->page];
			handle->size = PAGE_SIZE << page_order(buffer);
3308 3309
		}
	} while (len);
3310 3311
}

3312
int perf_output_begin(struct perf_output_handle *handle,
3313
		      struct perf_event *event, unsigned int size,
3314
		      int nmi, int sample)
3315
{
3316
	struct perf_buffer *buffer;
3317
	unsigned long tail, offset, head;
3318 3319 3320 3321 3322 3323
	int have_lost;
	struct {
		struct perf_event_header header;
		u64			 id;
		u64			 lost;
	} lost_event;
3324

3325
	rcu_read_lock();
3326
	/*
3327
	 * For inherited events we send all the output towards the parent.
3328
	 */
3329 3330
	if (event->parent)
		event = event->parent;
3331

3332 3333
	buffer = rcu_dereference(event->buffer);
	if (!buffer)
3334 3335
		goto out;

3336
	handle->buffer	= buffer;
3337
	handle->event	= event;
3338 3339
	handle->nmi	= nmi;
	handle->sample	= sample;
3340

3341
	if (!buffer->nr_pages)
3342
		goto out;
3343

3344
	have_lost = local_read(&buffer->lost);
3345 3346 3347
	if (have_lost)
		size += sizeof(lost_event);

3348
	perf_output_get_handle(handle);
3349

3350
	do {
3351 3352 3353 3354 3355
		/*
		 * Userspace could choose to issue a mb() before updating the
		 * tail pointer, so that all reads will be completed before the
		 * write is issued.
		 */
3356
		tail = ACCESS_ONCE(buffer->user_page->data_tail);
3357
		smp_rmb();
3358
		offset = head = local_read(&buffer->head);
P
Peter Zijlstra 已提交
3359
		head += size;
3360
		if (unlikely(!perf_output_space(buffer, tail, offset, head)))
3361
			goto fail;
3362
	} while (local_cmpxchg(&buffer->head, offset, head) != offset);
3363

3364 3365
	if (head - local_read(&buffer->wakeup) > buffer->watermark)
		local_add(buffer->watermark, &buffer->wakeup);
3366

3367 3368 3369 3370
	handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
	handle->page &= buffer->nr_pages - 1;
	handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
	handle->addr = buffer->data_pages[handle->page];
3371
	handle->addr += handle->size;
3372
	handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
3373

3374
	if (have_lost) {
3375
		lost_event.header.type = PERF_RECORD_LOST;
3376 3377
		lost_event.header.misc = 0;
		lost_event.header.size = sizeof(lost_event);
3378
		lost_event.id          = event->id;
3379
		lost_event.lost        = local_xchg(&buffer->lost, 0);
3380 3381 3382 3383

		perf_output_put(handle, lost_event);
	}

3384
	return 0;
3385

3386
fail:
3387
	local_inc(&buffer->lost);
3388
	perf_output_put_handle(handle);
3389 3390
out:
	rcu_read_unlock();
3391

3392 3393
	return -ENOSPC;
}
3394

3395
void perf_output_end(struct perf_output_handle *handle)
3396
{
3397
	struct perf_event *event = handle->event;
3398
	struct perf_buffer *buffer = handle->buffer;
3399

3400
	int wakeup_events = event->attr.wakeup_events;
P
Peter Zijlstra 已提交
3401

3402
	if (handle->sample && wakeup_events) {
3403
		int events = local_inc_return(&buffer->events);
P
Peter Zijlstra 已提交
3404
		if (events >= wakeup_events) {
3405 3406
			local_sub(wakeup_events, &buffer->events);
			local_inc(&buffer->wakeup);
P
Peter Zijlstra 已提交
3407
		}
3408 3409
	}

3410
	perf_output_put_handle(handle);
3411
	rcu_read_unlock();
3412 3413
}

3414
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3415 3416
{
	/*
3417
	 * only top level events have the pid namespace they were created in
3418
	 */
3419 3420
	if (event->parent)
		event = event->parent;
3421

3422
	return task_tgid_nr_ns(p, event->ns);
3423 3424
}

3425
static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3426 3427
{
	/*
3428
	 * only top level events have the pid namespace they were created in
3429
	 */
3430 3431
	if (event->parent)
		event = event->parent;
3432

3433
	return task_pid_nr_ns(p, event->ns);
3434 3435
}

3436
static void perf_output_read_one(struct perf_output_handle *handle,
3437
				 struct perf_event *event)
3438
{
3439
	u64 read_format = event->attr.read_format;
3440 3441 3442
	u64 values[4];
	int n = 0;

P
Peter Zijlstra 已提交
3443
	values[n++] = perf_event_count(event);
3444
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3445 3446
		values[n++] = event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
3447 3448
	}
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3449 3450
		values[n++] = event->total_time_running +
			atomic64_read(&event->child_total_time_running);
3451 3452
	}
	if (read_format & PERF_FORMAT_ID)
3453
		values[n++] = primary_event_id(event);
3454 3455 3456 3457 3458

	perf_output_copy(handle, values, n * sizeof(u64));
}

/*
3459
 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3460 3461
 */
static void perf_output_read_group(struct perf_output_handle *handle,
3462
			    struct perf_event *event)
3463
{
3464 3465
	struct perf_event *leader = event->group_leader, *sub;
	u64 read_format = event->attr.read_format;
3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476
	u64 values[5];
	int n = 0;

	values[n++] = 1 + leader->nr_siblings;

	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		values[n++] = leader->total_time_enabled;

	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		values[n++] = leader->total_time_running;

3477
	if (leader != event)
3478 3479
		leader->pmu->read(leader);

P
Peter Zijlstra 已提交
3480
	values[n++] = perf_event_count(leader);
3481
	if (read_format & PERF_FORMAT_ID)
3482
		values[n++] = primary_event_id(leader);
3483 3484 3485

	perf_output_copy(handle, values, n * sizeof(u64));

3486
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3487 3488
		n = 0;

3489
		if (sub != event)
3490 3491
			sub->pmu->read(sub);

P
Peter Zijlstra 已提交
3492
		values[n++] = perf_event_count(sub);
3493
		if (read_format & PERF_FORMAT_ID)
3494
			values[n++] = primary_event_id(sub);
3495 3496 3497 3498 3499 3500

		perf_output_copy(handle, values, n * sizeof(u64));
	}
}

static void perf_output_read(struct perf_output_handle *handle,
3501
			     struct perf_event *event)
3502
{
3503 3504
	if (event->attr.read_format & PERF_FORMAT_GROUP)
		perf_output_read_group(handle, event);
3505
	else
3506
		perf_output_read_one(handle, event);
3507 3508
}

3509 3510 3511
void perf_output_sample(struct perf_output_handle *handle,
			struct perf_event_header *header,
			struct perf_sample_data *data,
3512
			struct perf_event *event)
{
	u64 sample_type = data->type;

	perf_output_put(handle, *header);

	if (sample_type & PERF_SAMPLE_IP)
		perf_output_put(handle, data->ip);

	if (sample_type & PERF_SAMPLE_TID)
		perf_output_put(handle, data->tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		perf_output_put(handle, data->time);

	if (sample_type & PERF_SAMPLE_ADDR)
		perf_output_put(handle, data->addr);

	if (sample_type & PERF_SAMPLE_ID)
		perf_output_put(handle, data->id);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		perf_output_put(handle, data->stream_id);

	if (sample_type & PERF_SAMPLE_CPU)
		perf_output_put(handle, data->cpu_entry);

	if (sample_type & PERF_SAMPLE_PERIOD)
		perf_output_put(handle, data->period);

	if (sample_type & PERF_SAMPLE_READ)
3543
		perf_output_read(handle, event);

	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
		if (data->callchain) {
			int size = 1;

			if (data->callchain)
				size += data->callchain->nr;

			size *= sizeof(u64);

			perf_output_copy(handle, data->callchain, size);
		} else {
			u64 nr = 0;
			perf_output_put(handle, nr);
		}
	}

	if (sample_type & PERF_SAMPLE_RAW) {
		if (data->raw) {
			perf_output_put(handle, data->raw->size);
			perf_output_copy(handle, data->raw->data,
					 data->raw->size);
		} else {
			struct {
				u32	size;
				u32	data;
			} raw = {
				.size = sizeof(u32),
				.data = 0,
			};
			perf_output_put(handle, raw);
		}
	}
}

void perf_prepare_sample(struct perf_event_header *header,
			 struct perf_sample_data *data,
3581
			 struct perf_event *event,
3582
			 struct pt_regs *regs)
3583
{
3584
	u64 sample_type = event->attr.sample_type;
3585

3586
	data->type = sample_type;
3587

3588
	header->type = PERF_RECORD_SAMPLE;
3589 3590 3591 3592
	header->size = sizeof(*header);

	header->misc = 0;
	header->misc |= perf_misc_flags(regs);
3593

3594
	if (sample_type & PERF_SAMPLE_IP) {
3595 3596 3597
		data->ip = perf_instruction_pointer(regs);

		header->size += sizeof(data->ip);
3598
	}
3599

3600
	if (sample_type & PERF_SAMPLE_TID) {
3601
		/* namespace issues */
3602 3603
		data->tid_entry.pid = perf_event_pid(event, current);
		data->tid_entry.tid = perf_event_tid(event, current);
3604

3605
		header->size += sizeof(data->tid_entry);
3606 3607
	}

3608
	if (sample_type & PERF_SAMPLE_TIME) {
P
Peter Zijlstra 已提交
3609
		data->time = perf_clock();
3610

3611
		header->size += sizeof(data->time);
3612 3613
	}

3614
	if (sample_type & PERF_SAMPLE_ADDR)
3615
		header->size += sizeof(data->addr);
3616

3617
	if (sample_type & PERF_SAMPLE_ID) {
3618
		data->id = primary_event_id(event);
3619

3620 3621 3622 3623
		header->size += sizeof(data->id);
	}

	if (sample_type & PERF_SAMPLE_STREAM_ID) {
3624
		data->stream_id = event->id;
3625 3626 3627

		header->size += sizeof(data->stream_id);
	}
3628

3629
	if (sample_type & PERF_SAMPLE_CPU) {
3630 3631
		data->cpu_entry.cpu		= raw_smp_processor_id();
		data->cpu_entry.reserved	= 0;
3632

3633
		header->size += sizeof(data->cpu_entry);
3634 3635
	}

3636
	if (sample_type & PERF_SAMPLE_PERIOD)
3637
		header->size += sizeof(data->period);
3638

3639
	if (sample_type & PERF_SAMPLE_READ)
3640
		header->size += perf_event_read_size(event);
3641

3642
	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3643
		int size = 1;
3644

3645 3646 3647 3648 3649 3650
		data->callchain = perf_callchain(regs);

		if (data->callchain)
			size += data->callchain->nr;

		header->size += size * sizeof(u64);
3651 3652
	}

3653
	if (sample_type & PERF_SAMPLE_RAW) {
3654 3655 3656 3657 3658 3659 3660 3661
		int size = sizeof(u32);

		if (data->raw)
			size += data->raw->size;
		else
			size += sizeof(u32);

		WARN_ON_ONCE(size & (sizeof(u64)-1));
3662
		header->size += size;
3663
	}
3664
}
3665

3666
static void perf_event_output(struct perf_event *event, int nmi,
3667 3668 3669 3670 3671
				struct perf_sample_data *data,
				struct pt_regs *regs)
{
	struct perf_output_handle handle;
	struct perf_event_header header;
3672

3673 3674 3675
	/* protect the callchain buffers */
	rcu_read_lock();

3676
	perf_prepare_sample(&header, data, event, regs);
P
Peter Zijlstra 已提交
3677

3678
	if (perf_output_begin(&handle, event, header.size, nmi, 1))
3679
		goto exit;
3680

3681
	perf_output_sample(&handle, &header, data, event);
3682

3683
	perf_output_end(&handle);
3684 3685 3686

exit:
	rcu_read_unlock();
3687 3688
}

3689
/*
3690
 * read event_id
3691 3692 3693 3694 3695 3696 3697 3698 3699 3700
 */

struct perf_read_event {
	struct perf_event_header	header;

	u32				pid;
	u32				tid;
};

static void
3701
perf_event_read_event(struct perf_event *event,
3702 3703 3704
			struct task_struct *task)
{
	struct perf_output_handle handle;
3705
	struct perf_read_event read_event = {
3706
		.header = {
3707
			.type = PERF_RECORD_READ,
3708
			.misc = 0,
3709
			.size = sizeof(read_event) + perf_event_read_size(event),
3710
		},
3711 3712
		.pid = perf_event_pid(event, task),
		.tid = perf_event_tid(event, task),
3713
	};
3714
	int ret;
3715

3716
	ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3717 3718 3719
	if (ret)
		return;

3720
	perf_output_put(&handle, read_event);
3721
	perf_output_read(&handle, event);
3722

3723 3724 3725
	perf_output_end(&handle);
}

P
Peter Zijlstra 已提交
3726
/*
P
Peter Zijlstra 已提交
3727 3728
 * task tracking -- fork/exit
 *
3729
 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
P
Peter Zijlstra 已提交
3730 3731
 */

P
Peter Zijlstra 已提交
3732
struct perf_task_event {
3733
	struct task_struct		*task;
3734
	struct perf_event_context	*task_ctx;
P
Peter Zijlstra 已提交
3735 3736 3737 3738 3739 3740

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				ppid;
P
Peter Zijlstra 已提交
3741 3742
		u32				tid;
		u32				ptid;
3743
		u64				time;
3744
	} event_id;
P
Peter Zijlstra 已提交
3745 3746
};

3747
static void perf_event_task_output(struct perf_event *event,
P
Peter Zijlstra 已提交
3748
				     struct perf_task_event *task_event)
P
Peter Zijlstra 已提交
3749 3750
{
	struct perf_output_handle handle;
P
Peter Zijlstra 已提交
3751
	struct task_struct *task = task_event->task;
3752 3753
	int size, ret;

3754 3755
	size  = task_event->event_id.header.size;
	ret = perf_output_begin(&handle, event, size, 0, 0);
P
Peter Zijlstra 已提交
3756

3757
	if (ret)
P
Peter Zijlstra 已提交
3758 3759
		return;

3760 3761
	task_event->event_id.pid = perf_event_pid(event, task);
	task_event->event_id.ppid = perf_event_pid(event, current);
P
Peter Zijlstra 已提交
3762

3763 3764
	task_event->event_id.tid = perf_event_tid(event, task);
	task_event->event_id.ptid = perf_event_tid(event, current);
P
Peter Zijlstra 已提交
3765

3766
	perf_output_put(&handle, task_event->event_id);
3767

P
Peter Zijlstra 已提交
3768 3769 3770
	perf_output_end(&handle);
}

3771
static int perf_event_task_match(struct perf_event *event)
P
Peter Zijlstra 已提交
3772
{
P
Peter Zijlstra 已提交
3773
	if (event->state < PERF_EVENT_STATE_INACTIVE)
3774 3775
		return 0;

3776 3777 3778
	if (event->cpu != -1 && event->cpu != smp_processor_id())
		return 0;

3779 3780
	if (event->attr.comm || event->attr.mmap ||
	    event->attr.mmap_data || event->attr.task)
P
Peter Zijlstra 已提交
3781 3782 3783 3784 3785
		return 1;

	return 0;
}

3786
static void perf_event_task_ctx(struct perf_event_context *ctx,
P
Peter Zijlstra 已提交
3787
				  struct perf_task_event *task_event)
P
Peter Zijlstra 已提交
3788
{
3789
	struct perf_event *event;
P
Peter Zijlstra 已提交
3790

3791 3792 3793
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_task_match(event))
			perf_event_task_output(event, task_event);
P
Peter Zijlstra 已提交
3794 3795 3796
	}
}

static void perf_event_task_event(struct perf_task_event *task_event)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	struct pmu *pmu;
	int ctxn;

	rcu_read_lock();
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
		perf_event_task_ctx(&cpuctx->ctx, task_event);

		ctx = task_event->task_ctx;
		if (!ctx) {
			ctxn = pmu->task_ctx_nr;
			if (ctxn < 0)
				goto next;
			ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
		}
		if (ctx)
			perf_event_task_ctx(ctx, task_event);
next:
		put_cpu_ptr(pmu->pmu_cpu_context);
	}
	rcu_read_unlock();
}

static void perf_event_task(struct task_struct *task,
			      struct perf_event_context *task_ctx,
			      int new)
{
	struct perf_task_event task_event;

	if (!atomic_read(&nr_comm_events) &&
	    !atomic_read(&nr_mmap_events) &&
	    !atomic_read(&nr_task_events))
		return;

	task_event = (struct perf_task_event){
		.task	  = task,
		.task_ctx = task_ctx,
		.event_id    = {
			.header = {
				.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
				.misc = 0,
				.size = sizeof(task_event.event_id),
			},
			/* .pid  */
			/* .ppid */
			/* .tid  */
			/* .ptid */
			.time = perf_clock(),
		},
	};

	perf_event_task_event(&task_event);
}

void perf_event_fork(struct task_struct *task)
{
	perf_event_task(task, NULL, 1);
}

3860 3861 3862 3863 3864
/*
 * comm tracking
 */

struct perf_comm_event {
3865 3866
	struct task_struct	*task;
	char			*comm;
3867 3868 3869 3870 3871 3872 3873
	int			comm_size;

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
3874
	} event_id;
3875 3876
};

3877
static void perf_event_comm_output(struct perf_event *event,
3878 3879 3880
				     struct perf_comm_event *comm_event)
{
	struct perf_output_handle handle;
3881 3882
	int size = comm_event->event_id.header.size;
	int ret = perf_output_begin(&handle, event, size, 0, 0);
3883 3884 3885 3886

	if (ret)
		return;

3887 3888
	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3889

3890
	perf_output_put(&handle, comm_event->event_id);
3891 3892 3893 3894 3895
	perf_output_copy(&handle, comm_event->comm,
				   comm_event->comm_size);
	perf_output_end(&handle);
}

3896
static int perf_event_comm_match(struct perf_event *event)
3897
{
P
Peter Zijlstra 已提交
3898
	if (event->state < PERF_EVENT_STATE_INACTIVE)
3899 3900
		return 0;

3901 3902 3903
	if (event->cpu != -1 && event->cpu != smp_processor_id())
		return 0;

3904
	if (event->attr.comm)
3905 3906 3907 3908 3909
		return 1;

	return 0;
}

3910
static void perf_event_comm_ctx(struct perf_event_context *ctx,
3911 3912
				  struct perf_comm_event *comm_event)
{
3913
	struct perf_event *event;
3914

3915 3916 3917
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (perf_event_comm_match(event))
			perf_event_comm_output(event, comm_event);
3918 3919 3920
	}
}

3921
static void perf_event_comm_event(struct perf_comm_event *comm_event)
3922 3923
{
	struct perf_cpu_context *cpuctx;
3924
	struct perf_event_context *ctx;
3925
	char comm[TASK_COMM_LEN];
3926
	unsigned int size;
P
Peter Zijlstra 已提交
3927
	struct pmu *pmu;
P
Peter Zijlstra 已提交
3928
	int ctxn;
3929

3930
	memset(comm, 0, sizeof(comm));
3931
	strlcpy(comm, comm_event->task->comm, sizeof(comm));
3932
	size = ALIGN(strlen(comm)+1, sizeof(u64));
3933 3934 3935 3936

	comm_event->comm = comm;
	comm_event->comm_size = size;

3937
	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3938

3939
	rcu_read_lock();
P
Peter Zijlstra 已提交
3940
	list_for_each_entry_rcu(pmu, &pmus, entry) {
3941
		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
P
Peter Zijlstra 已提交
3942
		perf_event_comm_ctx(&cpuctx->ctx, comm_event);
P
Peter Zijlstra 已提交
3943 3944 3945

		ctxn = pmu->task_ctx_nr;
		if (ctxn < 0)
3946
			goto next;
P
Peter Zijlstra 已提交
3947 3948 3949 3950

		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
		if (ctx)
			perf_event_comm_ctx(ctx, comm_event);
3951 3952
next:
		put_cpu_ptr(pmu->pmu_cpu_context);
P
Peter Zijlstra 已提交
3953
	}
3954
	rcu_read_unlock();
3955 3956
}

3957
void perf_event_comm(struct task_struct *task)
3958
{
3959
	struct perf_comm_event comm_event;
P
Peter Zijlstra 已提交
3960 3961
	struct perf_event_context *ctx;
	int ctxn;
3962

P
Peter Zijlstra 已提交
3963 3964 3965 3966
	for_each_task_context_nr(ctxn) {
		ctx = task->perf_event_ctxp[ctxn];
		if (!ctx)
			continue;
3967

P
Peter Zijlstra 已提交
3968 3969
		perf_event_enable_on_exec(ctx);
	}
3970

3971
	if (!atomic_read(&nr_comm_events))
3972
		return;
3973

3974
	comm_event = (struct perf_comm_event){
3975
		.task	= task,
3976 3977
		/* .comm      */
		/* .comm_size */
3978
		.event_id  = {
3979
			.header = {
3980
				.type = PERF_RECORD_COMM,
3981 3982 3983 3984 3985
				.misc = 0,
				/* .size */
			},
			/* .pid */
			/* .tid */
3986 3987 3988
		},
	};

3989
	perf_event_comm_event(&comm_event);
3990 3991
}

3992 3993 3994 3995 3996
/*
 * mmap tracking
 */

struct perf_mmap_event {
3997 3998 3999 4000
	struct vm_area_struct	*vma;

	const char		*file_name;
	int			file_size;
4001 4002 4003 4004 4005 4006 4007 4008 4009

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
		u64				start;
		u64				len;
		u64				pgoff;
4010
	} event_id;
4011 4012
};

4013
static void perf_event_mmap_output(struct perf_event *event,
4014 4015 4016
				     struct perf_mmap_event *mmap_event)
{
	struct perf_output_handle handle;
4017 4018
	int size = mmap_event->event_id.header.size;
	int ret = perf_output_begin(&handle, event, size, 0, 0);
4019 4020 4021 4022

	if (ret)
		return;

4023 4024
	mmap_event->event_id.pid = perf_event_pid(event, current);
	mmap_event->event_id.tid = perf_event_tid(event, current);
4025

4026
	perf_output_put(&handle, mmap_event->event_id);
4027 4028
	perf_output_copy(&handle, mmap_event->file_name,
				   mmap_event->file_size);
4029
	perf_output_end(&handle);
4030 4031
}

4032
static int perf_event_mmap_match(struct perf_event *event,
4033 4034
				   struct perf_mmap_event *mmap_event,
				   int executable)
4035
{
P
Peter Zijlstra 已提交
4036
	if (event->state < PERF_EVENT_STATE_INACTIVE)
4037 4038
		return 0;

4039 4040 4041
	if (event->cpu != -1 && event->cpu != smp_processor_id())
		return 0;

4042 4043
	if ((!executable && event->attr.mmap_data) ||
	    (executable && event->attr.mmap))
4044 4045 4046 4047 4048
		return 1;

	return 0;
}

4049
static void perf_event_mmap_ctx(struct perf_event_context *ctx,
4050 4051
				  struct perf_mmap_event *mmap_event,
				  int executable)
4052
{
4053
	struct perf_event *event;
4054

4055
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4056
		if (perf_event_mmap_match(event, mmap_event, executable))
4057
			perf_event_mmap_output(event, mmap_event);
4058 4059 4060
	}
}

4061
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4062 4063
{
	struct perf_cpu_context *cpuctx;
4064
	struct perf_event_context *ctx;
4065 4066
	struct vm_area_struct *vma = mmap_event->vma;
	struct file *file = vma->vm_file;
4067 4068 4069
	unsigned int size;
	char tmp[16];
	char *buf = NULL;
4070
	const char *name;
P
Peter Zijlstra 已提交
4071
	struct pmu *pmu;
P
Peter Zijlstra 已提交
4072
	int ctxn;
4073

4074 4075
	memset(tmp, 0, sizeof(tmp));

4076
	if (file) {
4077 4078 4079 4080 4081 4082
		/*
		 * d_path works from the end of the buffer backwards, so we
		 * need to add enough zero bytes after the string to handle
		 * the 64bit alignment we do later.
		 */
		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
4083 4084 4085 4086
		if (!buf) {
			name = strncpy(tmp, "//enomem", sizeof(tmp));
			goto got_name;
		}
4087
		name = d_path(&file->f_path, buf, PATH_MAX);
4088 4089 4090 4091 4092
		if (IS_ERR(name)) {
			name = strncpy(tmp, "//toolong", sizeof(tmp));
			goto got_name;
		}
	} else {
4093 4094 4095
		if (arch_vma_name(mmap_event->vma)) {
			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
				       sizeof(tmp));
4096
			goto got_name;
4097
		}
4098 4099 4100 4101

		if (!vma->vm_mm) {
			name = strncpy(tmp, "[vdso]", sizeof(tmp));
			goto got_name;
4102 4103 4104 4105 4106 4107 4108 4109
		} else if (vma->vm_start <= vma->vm_mm->start_brk &&
				vma->vm_end >= vma->vm_mm->brk) {
			name = strncpy(tmp, "[heap]", sizeof(tmp));
			goto got_name;
		} else if (vma->vm_start <= vma->vm_mm->start_stack &&
				vma->vm_end >= vma->vm_mm->start_stack) {
			name = strncpy(tmp, "[stack]", sizeof(tmp));
			goto got_name;
4110 4111
		}

4112 4113 4114 4115 4116
		name = strncpy(tmp, "//anon", sizeof(tmp));
		goto got_name;
	}

got_name:
4117
	size = ALIGN(strlen(name)+1, sizeof(u64));
4118 4119 4120 4121

	mmap_event->file_name = name;
	mmap_event->file_size = size;

4122
	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4123

4124
	rcu_read_lock();
P
Peter Zijlstra 已提交
4125
	list_for_each_entry_rcu(pmu, &pmus, entry) {
4126
		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
P
Peter Zijlstra 已提交
4127 4128
		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
					vma->vm_flags & VM_EXEC);
P
Peter Zijlstra 已提交
4129 4130 4131

		ctxn = pmu->task_ctx_nr;
		if (ctxn < 0)
4132
			goto next;
P
Peter Zijlstra 已提交
4133 4134 4135 4136 4137 4138

		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
		if (ctx) {
			perf_event_mmap_ctx(ctx, mmap_event,
					vma->vm_flags & VM_EXEC);
		}
4139 4140
next:
		put_cpu_ptr(pmu->pmu_cpu_context);
P
Peter Zijlstra 已提交
4141
	}
4142 4143
	rcu_read_unlock();

4144 4145 4146
	kfree(buf);
}

4147
void perf_event_mmap(struct vm_area_struct *vma)
4148
{
4149 4150
	struct perf_mmap_event mmap_event;

4151
	if (!atomic_read(&nr_mmap_events))
4152 4153 4154
		return;

	mmap_event = (struct perf_mmap_event){
4155
		.vma	= vma,
4156 4157
		/* .file_name */
		/* .file_size */
4158
		.event_id  = {
4159
			.header = {
4160
				.type = PERF_RECORD_MMAP,
4161
				.misc = PERF_RECORD_MISC_USER,
4162 4163 4164 4165
				/* .size */
			},
			/* .pid */
			/* .tid */
4166 4167
			.start  = vma->vm_start,
			.len    = vma->vm_end - vma->vm_start,
4168
			.pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
4169 4170 4171
		},
	};

4172
	perf_event_mmap_event(&mmap_event);
4173 4174
}

4175 4176 4177 4178
/*
 * IRQ throttle logging
 */

4179
static void perf_log_throttle(struct perf_event *event, int enable)
4180 4181 4182 4183 4184 4185 4186
{
	struct perf_output_handle handle;
	int ret;

	struct {
		struct perf_event_header	header;
		u64				time;
4187
		u64				id;
4188
		u64				stream_id;
4189 4190
	} throttle_event = {
		.header = {
4191
			.type = PERF_RECORD_THROTTLE,
4192 4193 4194
			.misc = 0,
			.size = sizeof(throttle_event),
		},
P
Peter Zijlstra 已提交
4195
		.time		= perf_clock(),
4196 4197
		.id		= primary_event_id(event),
		.stream_id	= event->id,
4198 4199
	};

4200
	if (enable)
4201
		throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4202

4203
	ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
4204 4205 4206 4207 4208 4209 4210
	if (ret)
		return;

	perf_output_put(&handle, throttle_event);
	perf_output_end(&handle);
}

/*
 * Generic event overflow handling, sampling.
 */

static int __perf_event_overflow(struct perf_event *event, int nmi,
				   int throttle, struct perf_sample_data *data,
				   struct pt_regs *regs)
{
	int events = atomic_read(&event->event_limit);
	struct hw_perf_event *hwc = &event->hw;
	int ret = 0;

	if (!throttle) {
		hwc->interrupts++;
	} else {
		if (hwc->interrupts != MAX_INTERRUPTS) {
			hwc->interrupts++;
			if (HZ * hwc->interrupts >
					(u64)sysctl_perf_event_sample_rate) {
				hwc->interrupts = MAX_INTERRUPTS;
				perf_log_throttle(event, 0);
				ret = 1;
			}
		} else {
			/*
			 * Keep re-disabling events even though on the previous
			 * pass we disabled it - just in case we raced with a
			 * sched-in and the event got enabled again:
			 */
			ret = 1;
		}
	}

	if (event->attr.freq) {
		u64 now = perf_clock();
		s64 delta = now - hwc->freq_time_stamp;

		hwc->freq_time_stamp = now;

		if (delta > 0 && delta < 2*TICK_NSEC)
			perf_adjust_period(event, delta, hwc->last_period);
	}

	/*
	 * XXX event_limit might not quite work as expected on inherited
	 * events
	 */

	event->pending_kill = POLL_IN;
	if (events && atomic_dec_and_test(&event->event_limit)) {
		ret = 1;
		event->pending_kill = POLL_HUP;
		if (nmi) {
			event->pending_disable = 1;
			irq_work_queue(&event->pending);
		} else
			perf_event_disable(event);
	}

	if (event->overflow_handler)
		event->overflow_handler(event, nmi, data, regs);
	else
		perf_event_output(event, nmi, data, regs);

	return ret;
}
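
/*
 * Illustrative note (added for clarity, not in the original source):
 * the throttle check above compares the interrupt rate against
 * sysctl_perf_event_sample_rate.  With HZ == 1000 and the default
 * sample rate of 100000, an event gets throttled once it takes more
 * than 100 interrupts within one tick:
 *
 *	HZ * hwc->interrupts > (u64)sysctl_perf_event_sample_rate
 *	1000 * 101           > 100000
 *
 * at which point hwc->interrupts is pinned to MAX_INTERRUPTS and a
 * PERF_RECORD_THROTTLE record is emitted via perf_log_throttle().
 */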

int perf_event_overflow(struct perf_event *event, int nmi,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	return __perf_event_overflow(event, nmi, 1, data, regs);
}

/*
 * Generic software event infrastructure
 */

struct swevent_htable {
	struct swevent_hlist		*swevent_hlist;
	struct mutex			hlist_mutex;
	int				hlist_refcount;

	/* Recursion avoidance in each context */
	int				recursion[PERF_NR_CONTEXTS];
};

static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);

/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period event
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 */

static u64 perf_swevent_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 period = hwc->last_period;
	u64 nr, offset;
	s64 old, val;

	hwc->last_period = hwc->sample_period;

again:
	old = val = local64_read(&hwc->period_left);
	if (val < 0)
		return 0;

	nr = div64_u64(period + val, period);
	offset = nr * period;
	val -= offset;
	if (local64_cmpxchg(&hwc->period_left, old, val) != old)
		goto again;

	return nr;
}
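
/*
 * Worked example (added for illustration): assume hwc->sample_period
 * == 100 and period_left has climbed to +150 by the time
 * perf_swevent_overflow() calls the function above.  Then
 *
 *	nr     = div64_u64(100 + 150, 100) = 2
 *	offset = 2 * 100                   = 200
 *	val    = 150 - 200                 = -50
 *
 * so two overflows are reported and period_left is put back into the
 * [-sample_period, 0) range, with the next overflow due after another
 * 50 increments.
 */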

static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
				    int nmi, struct perf_sample_data *data,
				    struct pt_regs *regs)
{
	struct hw_perf_event *hwc = &event->hw;
	int throttle = 0;

	data->period = event->hw.last_period;
	if (!overflow)
		overflow = perf_swevent_set_period(event);

	if (hwc->interrupts == MAX_INTERRUPTS)
		return;

	for (; overflow; overflow--) {
		if (__perf_event_overflow(event, nmi, throttle,
					    data, regs)) {
			/*
			 * We inhibit the overflow from happening when
			 * hwc->interrupts == MAX_INTERRUPTS.
			 */
			break;
		}
		throttle = 1;
	}
}

static void perf_swevent_event(struct perf_event *event, u64 nr,
			       int nmi, struct perf_sample_data *data,
			       struct pt_regs *regs)
{
	struct hw_perf_event *hwc = &event->hw;

	local64_add(nr, &event->count);

	if (!regs)
		return;

	if (!hwc->sample_period)
		return;

	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
		return perf_swevent_overflow(event, 1, nmi, data, regs);

	if (local64_add_negative(nr, &hwc->period_left))
		return;

	perf_swevent_overflow(event, 0, nmi, data, regs);
}

4380 4381 4382
static int perf_exclude_event(struct perf_event *event,
			      struct pt_regs *regs)
{
P
Peter Zijlstra 已提交
4383 4384 4385
	if (event->hw.state & PERF_HES_STOPPED)
		return 0;

4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396
	if (regs) {
		if (event->attr.exclude_user && user_mode(regs))
			return 1;

		if (event->attr.exclude_kernel && !user_mode(regs))
			return 1;
	}

	return 0;
}

4397
static int perf_swevent_match(struct perf_event *event,
P
Peter Zijlstra 已提交
4398
				enum perf_type_id type,
L
Li Zefan 已提交
4399 4400 4401
				u32 event_id,
				struct perf_sample_data *data,
				struct pt_regs *regs)
4402
{
4403
	if (event->attr.type != type)
4404
		return 0;
4405

4406
	if (event->attr.config != event_id)
4407 4408
		return 0;

4409 4410
	if (perf_exclude_event(event, regs))
		return 0;
4411 4412 4413 4414

	return 1;
}

4415 4416 4417 4418 4419 4420 4421
static inline u64 swevent_hash(u64 type, u32 event_id)
{
	u64 val = event_id | (type << 32);

	return hash_64(val, SWEVENT_HLIST_BITS);
}

4422 4423
static inline struct hlist_head *
__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4424
{
4425 4426 4427 4428
	u64 hash = swevent_hash(type, event_id);

	return &hlist->heads[hash];
}
4429

4430 4431
/* For the read side: events when they trigger */
static inline struct hlist_head *
4432
find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4433 4434
{
	struct swevent_hlist *hlist;
4435

4436
	hlist = rcu_dereference(swhash->swevent_hlist);
4437 4438 4439
	if (!hlist)
		return NULL;

4440 4441 4442 4443 4444
	return __find_swevent_head(hlist, type, event_id);
}

/* For the event head insertion and removal in the hlist */
static inline struct hlist_head *
4445
find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4446 4447 4448 4449 4450 4451 4452 4453 4454 4455
{
	struct swevent_hlist *hlist;
	u32 event_id = event->attr.config;
	u64 type = event->attr.type;

	/*
	 * Event scheduling is always serialized against hlist allocation
	 * and release. Which makes the protected version suitable here.
	 * The context lock guarantees that.
	 */
4456
	hlist = rcu_dereference_protected(swhash->swevent_hlist,
4457 4458 4459 4460 4461
					  lockdep_is_held(&event->ctx->lock));
	if (!hlist)
		return NULL;

	return __find_swevent_head(hlist, type, event_id);
4462 4463 4464 4465 4466 4467
}

static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
				    u64 nr, int nmi,
				    struct perf_sample_data *data,
				    struct pt_regs *regs)
4468
{
4469
	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4470
	struct perf_event *event;
4471 4472
	struct hlist_node *node;
	struct hlist_head *head;
4473

4474
	rcu_read_lock();
4475
	head = find_swevent_head_rcu(swhash, type, event_id);
4476 4477 4478 4479
	if (!head)
		goto end;

	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
L
Li Zefan 已提交
4480
		if (perf_swevent_match(event, type, event_id, data, regs))
P
Peter Zijlstra 已提交
4481
			perf_swevent_event(event, nr, nmi, data, regs);
4482
	}
4483 4484
end:
	rcu_read_unlock();
4485 4486
}

int perf_swevent_get_recursion_context(void)
{
	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);

	return get_recursion_context(swhash->recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);

inline void perf_swevent_put_recursion_context(int rctx)
{
	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);

	put_recursion_context(swhash->recursion, rctx);
}

void __perf_sw_event(u32 event_id, u64 nr, int nmi,
			    struct pt_regs *regs, u64 addr)
{
	struct perf_sample_data data;
	int rctx;

	preempt_disable_notrace();
	rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		return;

	perf_sample_data_init(&data, addr);

	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);

	perf_swevent_put_recursion_context(rctx);
	preempt_enable_notrace();
}
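
/*
 * Usage sketch (illustrative; the perf_sw_event() wrapper lives in
 * <linux/perf_event.h> and is assumed here): a caller such as the page
 * fault path emits a software event with
 *
 *	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 *
 * which funnels into __perf_sw_event() above with nmi == 0 and bumps
 * every matching PERF_TYPE_SOFTWARE event found in the per-cpu hash.
 */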

4521
static void perf_swevent_read(struct perf_event *event)
4522 4523 4524
{
}

P
Peter Zijlstra 已提交
4525
static int perf_swevent_add(struct perf_event *event, int flags)
4526
{
4527
	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4528
	struct hw_perf_event *hwc = &event->hw;
4529 4530
	struct hlist_head *head;

4531 4532
	if (hwc->sample_period) {
		hwc->last_period = hwc->sample_period;
4533
		perf_swevent_set_period(event);
4534
	}
4535

P
Peter Zijlstra 已提交
4536 4537
	hwc->state = !(flags & PERF_EF_START);

4538
	head = find_swevent_head(swhash, event);
4539 4540 4541 4542 4543
	if (WARN_ON_ONCE(!head))
		return -EINVAL;

	hlist_add_head_rcu(&event->hlist_entry, head);

4544 4545 4546
	return 0;
}

P
Peter Zijlstra 已提交
4547
static void perf_swevent_del(struct perf_event *event, int flags)
4548
{
4549
	hlist_del_rcu(&event->hlist_entry);
4550 4551
}

P
Peter Zijlstra 已提交
4552
static void perf_swevent_start(struct perf_event *event, int flags)
4553
{
P
Peter Zijlstra 已提交
4554
	event->hw.state = 0;
4555 4556
}

P
Peter Zijlstra 已提交
4557
static void perf_swevent_stop(struct perf_event *event, int flags)
4558
{
P
Peter Zijlstra 已提交
4559
	event->hw.state = PERF_HES_STOPPED;
4560 4561
}

4562 4563
/* Deref the hlist from the update side */
static inline struct swevent_hlist *
4564
swevent_hlist_deref(struct swevent_htable *swhash)
4565
{
4566 4567
	return rcu_dereference_protected(swhash->swevent_hlist,
					 lockdep_is_held(&swhash->hlist_mutex));
4568 4569
}

4570 4571 4572 4573 4574 4575 4576 4577
static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
{
	struct swevent_hlist *hlist;

	hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
	kfree(hlist);
}

4578
static void swevent_hlist_release(struct swevent_htable *swhash)
4579
{
4580
	struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
4581

4582
	if (!hlist)
4583 4584
		return;

4585
	rcu_assign_pointer(swhash->swevent_hlist, NULL);
4586 4587 4588 4589 4590
	call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
}

static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
{
4591
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4592

4593
	mutex_lock(&swhash->hlist_mutex);
4594

4595 4596
	if (!--swhash->hlist_refcount)
		swevent_hlist_release(swhash);
4597

4598
	mutex_unlock(&swhash->hlist_mutex);
4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615
}

static void swevent_hlist_put(struct perf_event *event)
{
	int cpu;

	if (event->cpu != -1) {
		swevent_hlist_put_cpu(event, event->cpu);
		return;
	}

	for_each_possible_cpu(cpu)
		swevent_hlist_put_cpu(event, cpu);
}

static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
{
4616
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4617 4618
	int err = 0;

4619
	mutex_lock(&swhash->hlist_mutex);
4620

4621
	if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
4622 4623 4624 4625 4626 4627 4628
		struct swevent_hlist *hlist;

		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
		if (!hlist) {
			err = -ENOMEM;
			goto exit;
		}
4629
		rcu_assign_pointer(swhash->swevent_hlist, hlist);
4630
	}
4631
	swhash->hlist_refcount++;
P
Peter Zijlstra 已提交
4632
exit:
4633
	mutex_unlock(&swhash->hlist_mutex);
4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656

	return err;
}

static int swevent_hlist_get(struct perf_event *event)
{
	int err;
	int cpu, failed_cpu;

	if (event->cpu != -1)
		return swevent_hlist_get_cpu(event, event->cpu);

	get_online_cpus();
	for_each_possible_cpu(cpu) {
		err = swevent_hlist_get_cpu(event, cpu);
		if (err) {
			failed_cpu = cpu;
			goto fail;
		}
	}
	put_online_cpus();

	return 0;
P
Peter Zijlstra 已提交
4657
fail:
4658 4659 4660 4661 4662 4663 4664 4665 4666 4667
	for_each_possible_cpu(cpu) {
		if (cpu == failed_cpu)
			break;
		swevent_hlist_put_cpu(event, cpu);
	}

	put_online_cpus();
	return err;
}

4668
atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4669

4670 4671 4672
static void sw_perf_event_destroy(struct perf_event *event)
{
	u64 event_id = event->attr.config;
4673

4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713
	WARN_ON(event->parent);

	atomic_dec(&perf_swevent_enabled[event_id]);
	swevent_hlist_put(event);
}

static int perf_swevent_init(struct perf_event *event)
{
	int event_id = event->attr.config;

	if (event->attr.type != PERF_TYPE_SOFTWARE)
		return -ENOENT;

	switch (event_id) {
	case PERF_COUNT_SW_CPU_CLOCK:
	case PERF_COUNT_SW_TASK_CLOCK:
		return -ENOENT;

	default:
		break;
	}

	if (event_id > PERF_COUNT_SW_MAX)
		return -ENOENT;

	if (!event->parent) {
		int err;

		err = swevent_hlist_get(event);
		if (err)
			return err;

		atomic_inc(&perf_swevent_enabled[event_id]);
		event->destroy = sw_perf_event_destroy;
	}

	return 0;
}

static struct pmu perf_swevent = {
4714 4715
	.task_ctx_nr	= perf_sw_context,

4716
	.event_init	= perf_swevent_init,
P
Peter Zijlstra 已提交
4717 4718 4719 4720
	.add		= perf_swevent_add,
	.del		= perf_swevent_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
4721 4722 4723
	.read		= perf_swevent_read,
};

4724 4725
#ifdef CONFIG_EVENT_TRACING

4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739
static int perf_tp_filter_match(struct perf_event *event,
				struct perf_sample_data *data)
{
	void *record = data->raw->data;

	if (likely(!event->filter) || filter_match_preds(event->filter, record))
		return 1;
	return 0;
}

static int perf_tp_event_match(struct perf_event *event,
				struct perf_sample_data *data,
				struct pt_regs *regs)
{
4740 4741 4742 4743
	/*
	 * All tracepoints are from kernel-space.
	 */
	if (event->attr.exclude_kernel)
4744 4745 4746 4747 4748 4749 4750 4751 4752
		return 0;

	if (!perf_tp_filter_match(event, data))
		return 0;

	return 1;
}

void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4753
		   struct pt_regs *regs, struct hlist_head *head, int rctx)
4754 4755
{
	struct perf_sample_data data;
4756 4757 4758
	struct perf_event *event;
	struct hlist_node *node;

4759 4760 4761 4762 4763 4764 4765 4766
	struct perf_raw_record raw = {
		.size = entry_size,
		.data = record,
	};

	perf_sample_data_init(&data, addr);
	data.raw = &raw;

4767 4768
	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
		if (perf_tp_event_match(event, &data, regs))
P
Peter Zijlstra 已提交
4769
			perf_swevent_event(event, count, 1, &data, regs);
4770
	}
4771 4772

	perf_swevent_put_recursion_context(rctx);
4773 4774 4775
}
EXPORT_SYMBOL_GPL(perf_tp_event);
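
/*
 * Call-path sketch (illustrative, paraphrasing the generated
 * perf_trace_##call glue rather than quoting it): a tracepoint probe
 * builds its raw record, grabs a recursion context and then hands the
 * data to the hlist of interested events:
 *
 *	rctx = perf_swevent_get_recursion_context();
 *	if (rctx < 0)
 *		return;
 *	...fill in the raw record and regs...
 *	perf_tp_event(addr, count, record, size, regs, head, rctx);
 *
 * Note that perf_tp_event() releases the recursion context itself, as
 * seen above.
 */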

4776
static void tp_perf_event_destroy(struct perf_event *event)
4777
{
4778
	perf_trace_destroy(event);
4779 4780
}

4781
static int perf_tp_event_init(struct perf_event *event)
4782
{
4783 4784
	int err;

4785 4786 4787
	if (event->attr.type != PERF_TYPE_TRACEPOINT)
		return -ENOENT;

4788 4789 4790 4791
	/*
	 * Raw tracepoint data is a severe data leak, only allow root to
	 * have these.
	 */
4792
	if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4793
			perf_paranoid_tracepoint_raw() &&
4794
			!capable(CAP_SYS_ADMIN))
4795
		return -EPERM;
4796

4797 4798
	err = perf_trace_init(event);
	if (err)
4799
		return err;
4800

4801
	event->destroy = tp_perf_event_destroy;
4802

4803 4804 4805 4806
	return 0;
}

static struct pmu perf_tracepoint = {
4807 4808
	.task_ctx_nr	= perf_sw_context,

4809
	.event_init	= perf_tp_event_init,
P
Peter Zijlstra 已提交
4810 4811 4812 4813
	.add		= perf_trace_add,
	.del		= perf_trace_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
4814 4815 4816 4817 4818 4819
	.read		= perf_swevent_read,
};

static inline void perf_tp_register(void)
{
	perf_pmu_register(&perf_tracepoint);
4820
}
L
Li Zefan 已提交
4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844

static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
	char *filter_str;
	int ret;

	if (event->attr.type != PERF_TYPE_TRACEPOINT)
		return -EINVAL;

	filter_str = strndup_user(arg, PAGE_SIZE);
	if (IS_ERR(filter_str))
		return PTR_ERR(filter_str);

	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);

	kfree(filter_str);
	return ret;
}

static void perf_event_free_filter(struct perf_event *event)
{
	ftrace_profile_free_filter(event);
}

4845
#else
L
Li Zefan 已提交
4846

4847
static inline void perf_tp_register(void)
4848 4849
{
}
L
Li Zefan 已提交
4850 4851 4852 4853 4854 4855 4856 4857 4858 4859

static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
	return -ENOENT;
}

static void perf_event_free_filter(struct perf_event *event)
{
}

4860
#endif /* CONFIG_EVENT_TRACING */
4861

4862
#ifdef CONFIG_HAVE_HW_BREAKPOINT
4863
void perf_bp_event(struct perf_event *bp, void *data)
4864
{
4865 4866 4867 4868 4869
	struct perf_sample_data sample;
	struct pt_regs *regs = data;

	perf_sample_data_init(&sample, bp->attr.bp_addr);

P
Peter Zijlstra 已提交
4870 4871
	if (!bp->hw.state && !perf_exclude_event(bp, regs))
		perf_swevent_event(bp, 1, 1, &sample, regs);
4872
}
4873 4874 4875 4876 4877
#endif

/*
 * hrtimer based swevent callback
 */
4878

4879
static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4880
{
4881 4882 4883 4884 4885
	enum hrtimer_restart ret = HRTIMER_RESTART;
	struct perf_sample_data data;
	struct pt_regs *regs;
	struct perf_event *event;
	u64 period;
4886

4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898
	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
	event->pmu->read(event);

	perf_sample_data_init(&data, 0);
	data.period = event->hw.last_period;
	regs = get_irq_regs();

	if (regs && !perf_exclude_event(event, regs)) {
		if (!(event->attr.exclude_idle && current->pid == 0))
			if (perf_event_overflow(event, 0, &data, regs))
				ret = HRTIMER_NORESTART;
	}
4899

4900 4901
	period = max_t(u64, 10000, event->hw.sample_period);
	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4902

4903
	return ret;
4904 4905
}

4906
static void perf_swevent_start_hrtimer(struct perf_event *event)
4907
{
4908
	struct hw_perf_event *hwc = &event->hw;
4909

4910 4911 4912
	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swevent_hrtimer;
	if (hwc->sample_period) {
P
Peter Zijlstra 已提交
4913
		s64 period = local64_read(&hwc->period_left);
4914

P
Peter Zijlstra 已提交
4915 4916
		if (period) {
			if (period < 0)
4917
				period = 10000;
P
Peter Zijlstra 已提交
4918 4919

			local64_set(&hwc->period_left, 0);
4920 4921 4922 4923 4924
		} else {
			period = max_t(u64, 10000, hwc->sample_period);
		}
		__hrtimer_start_range_ns(&hwc->hrtimer,
				ns_to_ktime(period), 0,
4925
				HRTIMER_MODE_REL_PINNED, 0);
4926
	}
4927
}
4928 4929

static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4930
{
4931 4932 4933 4934
	struct hw_perf_event *hwc = &event->hw;

	if (hwc->sample_period) {
		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
P
Peter Zijlstra 已提交
4935
		local64_set(&hwc->period_left, ktime_to_ns(remaining));
4936 4937 4938

		hrtimer_cancel(&hwc->hrtimer);
	}
4939 4940
}

4941 4942 4943 4944 4945
/*
 * Software event: cpu wall time clock
 */

static void cpu_clock_event_update(struct perf_event *event)
4946
{
4947 4948 4949
	s64 prev;
	u64 now;

P
Peter Zijlstra 已提交
4950
	now = local_clock();
4951 4952
	prev = local64_xchg(&event->hw.prev_count, now);
	local64_add(now - prev, &event->count);
4953 4954
}

P
Peter Zijlstra 已提交
4955
static void cpu_clock_event_start(struct perf_event *event, int flags)
4956
{
P
Peter Zijlstra 已提交
4957
	local64_set(&event->hw.prev_count, local_clock());
4958 4959 4960
	perf_swevent_start_hrtimer(event);
}

P
Peter Zijlstra 已提交
4961
static void cpu_clock_event_stop(struct perf_event *event, int flags)
4962
{
4963 4964 4965
	perf_swevent_cancel_hrtimer(event);
	cpu_clock_event_update(event);
}
4966

P
Peter Zijlstra 已提交
4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979
static int cpu_clock_event_add(struct perf_event *event, int flags)
{
	if (flags & PERF_EF_START)
		cpu_clock_event_start(event, flags);

	return 0;
}

static void cpu_clock_event_del(struct perf_event *event, int flags)
{
	cpu_clock_event_stop(event, flags);
}

4980 4981 4982 4983
static void cpu_clock_event_read(struct perf_event *event)
{
	cpu_clock_event_update(event);
}
4984

4985 4986 4987 4988 4989 4990 4991 4992 4993
static int cpu_clock_event_init(struct perf_event *event)
{
	if (event->attr.type != PERF_TYPE_SOFTWARE)
		return -ENOENT;

	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
		return -ENOENT;

	return 0;
4994 4995
}

4996
static struct pmu perf_cpu_clock = {
4997 4998
	.task_ctx_nr	= perf_sw_context,

4999
	.event_init	= cpu_clock_event_init,
P
Peter Zijlstra 已提交
5000 5001 5002 5003
	.add		= cpu_clock_event_add,
	.del		= cpu_clock_event_del,
	.start		= cpu_clock_event_start,
	.stop		= cpu_clock_event_stop,
5004 5005 5006 5007 5008 5009 5010 5011
	.read		= cpu_clock_event_read,
};

/*
 * Software event: task time clock
 */

static void task_clock_event_update(struct perf_event *event, u64 now)
5012
{
5013 5014
	u64 prev;
	s64 delta;
5015

5016 5017 5018 5019
	prev = local64_xchg(&event->hw.prev_count, now);
	delta = now - prev;
	local64_add(delta, &event->count);
}
5020

P
Peter Zijlstra 已提交
5021
static void task_clock_event_start(struct perf_event *event, int flags)
5022
{
P
Peter Zijlstra 已提交
5023
	local64_set(&event->hw.prev_count, event->ctx->time);
5024 5025 5026
	perf_swevent_start_hrtimer(event);
}

P
Peter Zijlstra 已提交
5027
static void task_clock_event_stop(struct perf_event *event, int flags)
5028 5029 5030
{
	perf_swevent_cancel_hrtimer(event);
	task_clock_event_update(event, event->ctx->time);
P
Peter Zijlstra 已提交
5031 5032 5033 5034 5035 5036
}

static int task_clock_event_add(struct perf_event *event, int flags)
{
	if (flags & PERF_EF_START)
		task_clock_event_start(event, flags);
5037

P
Peter Zijlstra 已提交
5038 5039 5040 5041 5042 5043
	return 0;
}

static void task_clock_event_del(struct perf_event *event, int flags)
{
	task_clock_event_stop(event, PERF_EF_UPDATE);
5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062
}

static void task_clock_event_read(struct perf_event *event)
{
	u64 time;

	if (!in_nmi()) {
		update_context_time(event->ctx);
		time = event->ctx->time;
	} else {
		u64 now = perf_clock();
		u64 delta = now - event->ctx->timestamp;
		time = event->ctx->time + delta;
	}

	task_clock_event_update(event, time);
}

static int task_clock_event_init(struct perf_event *event)
L
Li Zefan 已提交
5063
{
5064 5065 5066 5067 5068 5069 5070
	if (event->attr.type != PERF_TYPE_SOFTWARE)
		return -ENOENT;

	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
		return -ENOENT;

	return 0;
L
Li Zefan 已提交
5071 5072
}

5073
static struct pmu perf_task_clock = {
5074 5075
	.task_ctx_nr	= perf_sw_context,

5076
	.event_init	= task_clock_event_init,
P
Peter Zijlstra 已提交
5077 5078 5079 5080
	.add		= task_clock_event_add,
	.del		= task_clock_event_del,
	.start		= task_clock_event_start,
	.stop		= task_clock_event_stop,
5081 5082
	.read		= task_clock_event_read,
};
L
Li Zefan 已提交
5083

P
Peter Zijlstra 已提交
5084
static void perf_pmu_nop_void(struct pmu *pmu)
5085 5086
{
}
L
Li Zefan 已提交
5087

P
Peter Zijlstra 已提交
5088
static int perf_pmu_nop_int(struct pmu *pmu)
L
Li Zefan 已提交
5089
{
P
Peter Zijlstra 已提交
5090
	return 0;
L
Li Zefan 已提交
5091 5092
}

P
Peter Zijlstra 已提交
5093
static void perf_pmu_start_txn(struct pmu *pmu)
L
Li Zefan 已提交
5094
{
P
Peter Zijlstra 已提交
5095
	perf_pmu_disable(pmu);
L
Li Zefan 已提交
5096 5097
}

P
Peter Zijlstra 已提交
5098 5099 5100 5101 5102
static int perf_pmu_commit_txn(struct pmu *pmu)
{
	perf_pmu_enable(pmu);
	return 0;
}
5103

P
Peter Zijlstra 已提交
5104
static void perf_pmu_cancel_txn(struct pmu *pmu)
5105
{
P
Peter Zijlstra 已提交
5106
	perf_pmu_enable(pmu);
5107 5108
}

P
Peter Zijlstra 已提交
5109 5110 5111 5112 5113
/*
 * Ensures all contexts with the same task_ctx_nr have the same
 * pmu_cpu_context too.
 */
static void *find_pmu_context(int ctxn)
5114
{
P
Peter Zijlstra 已提交
5115
	struct pmu *pmu;
5116

P
Peter Zijlstra 已提交
5117 5118
	if (ctxn < 0)
		return NULL;
5119

P
Peter Zijlstra 已提交
5120 5121 5122 5123
	list_for_each_entry(pmu, &pmus, entry) {
		if (pmu->task_ctx_nr == ctxn)
			return pmu->pmu_cpu_context;
	}
5124

P
Peter Zijlstra 已提交
5125
	return NULL;
5126 5127
}

P
Peter Zijlstra 已提交
5128
static void free_pmu_context(void * __percpu cpu_context)
5129
{
P
Peter Zijlstra 已提交
5130
	struct pmu *pmu;
5131

P
Peter Zijlstra 已提交
5132 5133 5134 5135 5136 5137 5138 5139
	mutex_lock(&pmus_lock);
	/*
	 * Like a real lame refcount.
	 */
	list_for_each_entry(pmu, &pmus, entry) {
		if (pmu->pmu_cpu_context == cpu_context)
			goto out;
	}
5140

P
Peter Zijlstra 已提交
5141 5142 5143
	free_percpu(cpu_context);
out:
	mutex_unlock(&pmus_lock);
5144 5145
}

int perf_pmu_register(struct pmu *pmu)
{
	int cpu, ret;

	mutex_lock(&pmus_lock);
	ret = -ENOMEM;
	pmu->pmu_disable_count = alloc_percpu(int);
	if (!pmu->pmu_disable_count)
		goto unlock;

	pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
	if (pmu->pmu_cpu_context)
		goto got_cpu_context;

	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
	if (!pmu->pmu_cpu_context)
		goto free_pdc;

	for_each_possible_cpu(cpu) {
		struct perf_cpu_context *cpuctx;

		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
		__perf_event_init_context(&cpuctx->ctx);
		cpuctx->ctx.type = cpu_context;
		cpuctx->ctx.pmu = pmu;
		cpuctx->jiffies_interval = 1;
		INIT_LIST_HEAD(&cpuctx->rotation_list);
	}

got_cpu_context:
	if (!pmu->start_txn) {
		if (pmu->pmu_enable) {
			/*
			 * If we have pmu_enable/pmu_disable calls, install
			 * transaction stubs that use that to try and batch
			 * hardware accesses.
			 */
			pmu->start_txn  = perf_pmu_start_txn;
			pmu->commit_txn = perf_pmu_commit_txn;
			pmu->cancel_txn = perf_pmu_cancel_txn;
		} else {
			pmu->start_txn  = perf_pmu_nop_void;
			pmu->commit_txn = perf_pmu_nop_int;
			pmu->cancel_txn = perf_pmu_nop_void;
		}
	}

	if (!pmu->pmu_enable) {
		pmu->pmu_enable  = perf_pmu_nop_void;
		pmu->pmu_disable = perf_pmu_nop_void;
	}

	list_add_rcu(&pmu->entry, &pmus);
	ret = 0;
unlock:
	mutex_unlock(&pmus_lock);

	return ret;

free_pdc:
	free_percpu(pmu->pmu_disable_count);
	goto unlock;
}
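
/*
 * Registration sketch (illustrative only; "my_pmu" and its callbacks
 * are hypothetical): a minimal software-style PMU only needs to supply
 * event_init plus the add/del/start/stop/read state machine; the
 * remaining hooks are filled in with the nop/txn stubs above.
 *
 *	static struct pmu my_pmu = {
 *		.task_ctx_nr	= perf_sw_context,
 *		.event_init	= my_event_init,
 *		.add		= my_add,
 *		.del		= my_del,
 *		.start		= my_start,
 *		.stop		= my_stop,
 *		.read		= my_read,
 *	};
 *
 *	ret = perf_pmu_register(&my_pmu);
 *
 * event_init must return -ENOENT for attr types it does not own, so
 * that perf_init_event() (below) keeps walking the pmus list.
 */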

5210
void perf_pmu_unregister(struct pmu *pmu)
5211
{
5212 5213 5214
	mutex_lock(&pmus_lock);
	list_del_rcu(&pmu->entry);
	mutex_unlock(&pmus_lock);
5215

5216
	/*
P
Peter Zijlstra 已提交
5217 5218
	 * We dereference the pmu list under both SRCU and regular RCU, so
	 * synchronize against both of those.
5219
	 */
5220
	synchronize_srcu(&pmus_srcu);
P
Peter Zijlstra 已提交
5221
	synchronize_rcu();
5222

P
Peter Zijlstra 已提交
5223
	free_percpu(pmu->pmu_disable_count);
P
Peter Zijlstra 已提交
5224
	free_pmu_context(pmu->pmu_cpu_context);
5225
}
5226

5227 5228 5229 5230 5231 5232 5233 5234 5235
struct pmu *perf_init_event(struct perf_event *event)
{
	struct pmu *pmu = NULL;
	int idx;

	idx = srcu_read_lock(&pmus_srcu);
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		int ret = pmu->event_init(event);
		if (!ret)
P
Peter Zijlstra 已提交
5236
			goto unlock;
5237

5238 5239
		if (ret != -ENOENT) {
			pmu = ERR_PTR(ret);
P
Peter Zijlstra 已提交
5240
			goto unlock;
5241
		}
5242
	}
P
Peter Zijlstra 已提交
5243 5244
	pmu = ERR_PTR(-ENOENT);
unlock:
5245
	srcu_read_unlock(&pmus_srcu, idx);
5246

5247
	return pmu;
5248 5249
}

/*
 * Allocate and initialize an event structure
 */
5253
static struct perf_event *
5254
perf_event_alloc(struct perf_event_attr *attr, int cpu,
5255 5256
		   struct perf_event *group_leader,
		   struct perf_event *parent_event,
5257
		   perf_overflow_handler_t overflow_handler)
T
Thomas Gleixner 已提交
5258
{
P
Peter Zijlstra 已提交
5259
	struct pmu *pmu;
5260 5261
	struct perf_event *event;
	struct hw_perf_event *hwc;
5262
	long err;
T
Thomas Gleixner 已提交
5263

5264
	event = kzalloc(sizeof(*event), GFP_KERNEL);
5265
	if (!event)
5266
		return ERR_PTR(-ENOMEM);
T
Thomas Gleixner 已提交
5267

5268
	/*
5269
	 * Single events are their own group leaders, with an
5270 5271 5272
	 * empty sibling list:
	 */
	if (!group_leader)
5273
		group_leader = event;
5274

5275 5276
	mutex_init(&event->child_mutex);
	INIT_LIST_HEAD(&event->child_list);
5277

5278 5279 5280 5281
	INIT_LIST_HEAD(&event->group_entry);
	INIT_LIST_HEAD(&event->event_entry);
	INIT_LIST_HEAD(&event->sibling_list);
	init_waitqueue_head(&event->waitq);
5282
	init_irq_work(&event->pending, perf_pending_event);
T
Thomas Gleixner 已提交
5283

5284
	mutex_init(&event->mmap_mutex);
5285

5286 5287 5288 5289 5290
	event->cpu		= cpu;
	event->attr		= *attr;
	event->group_leader	= group_leader;
	event->pmu		= NULL;
	event->oncpu		= -1;
5291

5292
	event->parent		= parent_event;
5293

5294 5295
	event->ns		= get_pid_ns(current->nsproxy->pid_ns);
	event->id		= atomic64_inc_return(&perf_event_id);
5296

5297
	event->state		= PERF_EVENT_STATE_INACTIVE;
5298

5299 5300
	if (!overflow_handler && parent_event)
		overflow_handler = parent_event->overflow_handler;
5301
	
5302
	event->overflow_handler	= overflow_handler;
5303

5304
	if (attr->disabled)
5305
		event->state = PERF_EVENT_STATE_OFF;
5306

5307
	pmu = NULL;
5308

5309
	hwc = &event->hw;
5310
	hwc->sample_period = attr->sample_period;
5311
	if (attr->freq && attr->sample_freq)
5312
		hwc->sample_period = 1;
5313
	hwc->last_period = hwc->sample_period;
5314

5315
	local64_set(&hwc->period_left, hwc->sample_period);
5316

5317
	/*
5318
	 * we currently do not support PERF_FORMAT_GROUP on inherited events
5319
	 */
5320
	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
5321 5322
		goto done;

5323
	pmu = perf_init_event(event);
5324

5325 5326
done:
	err = 0;
5327
	if (!pmu)
5328
		err = -EINVAL;
5329 5330
	else if (IS_ERR(pmu))
		err = PTR_ERR(pmu);
5331

5332
	if (err) {
5333 5334 5335
		if (event->ns)
			put_pid_ns(event->ns);
		kfree(event);
5336
		return ERR_PTR(err);
I
Ingo Molnar 已提交
5337
	}
5338

5339
	event->pmu = pmu;
T
Thomas Gleixner 已提交
5340

5341 5342
	if (!event->parent) {
		atomic_inc(&nr_events);
5343
		if (event->attr.mmap || event->attr.mmap_data)
5344 5345 5346 5347 5348
			atomic_inc(&nr_mmap_events);
		if (event->attr.comm)
			atomic_inc(&nr_comm_events);
		if (event->attr.task)
			atomic_inc(&nr_task_events);
5349 5350 5351 5352 5353 5354 5355
		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
			err = get_callchain_buffers();
			if (err) {
				free_event(event);
				return ERR_PTR(err);
			}
		}
5356
	}
5357

5358
	return event;
T
Thomas Gleixner 已提交
5359 5360
}

5361 5362
static int perf_copy_attr(struct perf_event_attr __user *uattr,
			  struct perf_event_attr *attr)
5363 5364
{
	u32 size;
5365
	int ret;
5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389

	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
		return -EFAULT;

	/*
	 * zero the full structure, so that a short copy will be nice.
	 */
	memset(attr, 0, sizeof(*attr));

	ret = get_user(size, &uattr->size);
	if (ret)
		return ret;

	if (size > PAGE_SIZE)	/* silly large */
		goto err_size;

	if (!size)		/* abi compat */
		size = PERF_ATTR_SIZE_VER0;

	if (size < PERF_ATTR_SIZE_VER0)
		goto err_size;

	/*
	 * If we're handed a bigger struct than we know of,
5390 5391 5392
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we dont know about yet.
5393 5394
	 */
	if (size > sizeof(*attr)) {
5395 5396 5397
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;
5398

5399 5400
		addr = (void __user *)uattr + sizeof(*attr);
		end  = (void __user *)uattr + size;
5401

5402
		for (; addr < end; addr++) {
5403 5404 5405 5406 5407 5408
			ret = get_user(val, addr);
			if (ret)
				return ret;
			if (val)
				goto err_size;
		}
5409
		size = sizeof(*attr);
5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422
	}

	ret = copy_from_user(attr, uattr, size);
	if (ret)
		return -EFAULT;

	/*
	 * If the type exists, the corresponding creation will verify
	 * the attr->config.
	 */
	if (attr->type >= PERF_TYPE_MAX)
		return -EINVAL;

5423
	if (attr->__reserved_1)
5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440
		return -EINVAL;

	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
		return -EINVAL;

	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
		return -EINVAL;

out:
	return ret;

err_size:
	put_user(sizeof(*attr), &uattr->size);
	ret = -E2BIG;
	goto out;
}
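
/*
 * ABI note with a worked example (added for clarity, not from the
 * original source): an old userspace passing a smaller attr gets the
 * tail zero-filled by the memset() above, and size == 0 is treated as
 * PERF_ATTR_SIZE_VER0.  A newer userspace passing a larger attr is
 * accepted only if every byte beyond sizeof(*attr) reads back as zero;
 * otherwise the kernel stores its own sizeof(*attr) through
 * uattr->size and returns -E2BIG so the caller can retry with the
 * smaller layout.
 */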

static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
5443
{
5444
	struct perf_buffer *buffer = NULL, *old_buffer = NULL;
5445 5446
	int ret = -EINVAL;

5447
	if (!output_event)
5448 5449
		goto set;

5450 5451
	/* don't allow circular references */
	if (event == output_event)
5452 5453
		goto out;

5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465
	/*
	 * Don't allow cross-cpu buffers
	 */
	if (output_event->cpu != event->cpu)
		goto out;

	/*
	 * If its not a per-cpu buffer, it must be the same task.
	 */
	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
		goto out;

5466
set:
5467
	mutex_lock(&event->mmap_mutex);
5468 5469 5470
	/* Can't redirect output if we've got an active mmap() */
	if (atomic_read(&event->mmap_count))
		goto unlock;
5471

5472 5473
	if (output_event) {
		/* get the buffer we want to redirect to */
5474 5475
		buffer = perf_buffer_get(output_event);
		if (!buffer)
5476
			goto unlock;
5477 5478
	}

5479 5480
	old_buffer = event->buffer;
	rcu_assign_pointer(event->buffer, buffer);
5481
	ret = 0;
5482 5483 5484
unlock:
	mutex_unlock(&event->mmap_mutex);

5485 5486
	if (old_buffer)
		perf_buffer_put(old_buffer);
5487 5488 5489 5490
out:
	return ret;
}

T
Thomas Gleixner 已提交
5491
/**
5492
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
I
Ingo Molnar 已提交
5493
 *
5494
 * @attr_uptr:	event_id type attributes for monitoring/sampling
T
Thomas Gleixner 已提交
5495
 * @pid:		target pid
I
Ingo Molnar 已提交
5496
 * @cpu:		target cpu
5497
 * @group_fd:		group leader event fd
T
Thomas Gleixner 已提交
5498
 */
5499 5500
SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
5501
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
T
Thomas Gleixner 已提交
5502
{
5503 5504
	struct perf_event *group_leader = NULL, *output_event = NULL;
	struct perf_event *event, *sibling;
5505 5506 5507
	struct perf_event_attr attr;
	struct perf_event_context *ctx;
	struct file *event_file = NULL;
5508
	struct file *group_file = NULL;
M
Matt Helsley 已提交
5509
	struct task_struct *task = NULL;
5510
	struct pmu *pmu;
5511
	int event_fd;
5512
	int move_group = 0;
5513
	int fput_needed = 0;
5514
	int err;
T
Thomas Gleixner 已提交
5515

5516
	/* for future expandability... */
5517
	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
5518 5519
		return -EINVAL;

5520 5521 5522
	err = perf_copy_attr(attr_uptr, &attr);
	if (err)
		return err;
5523

5524 5525 5526 5527 5528
	if (!attr.exclude_kernel) {
		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

5529
	if (attr.freq) {
5530
		if (attr.sample_freq > sysctl_perf_event_sample_rate)
5531 5532 5533
			return -EINVAL;
	}

5534 5535 5536 5537
	event_fd = get_unused_fd_flags(O_RDWR);
	if (event_fd < 0)
		return event_fd;

5538 5539 5540 5541
	if (group_fd != -1) {
		group_leader = perf_fget_light(group_fd, &fput_needed);
		if (IS_ERR(group_leader)) {
			err = PTR_ERR(group_leader);
5542
			goto err_fd;
5543 5544 5545 5546 5547 5548 5549 5550
		}
		group_file = group_leader->filp;
		if (flags & PERF_FLAG_FD_OUTPUT)
			output_event = group_leader;
		if (flags & PERF_FLAG_FD_NO_GROUP)
			group_leader = NULL;
	}

5551 5552 5553 5554 5555 5556
	event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
	if (IS_ERR(event)) {
		err = PTR_ERR(event);
		goto err_fd;
	}

5557 5558 5559 5560 5561
	/*
	 * Special case software events and allow them to be part of
	 * any hardware group.
	 */
	pmu = event->pmu;
5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584

	if (group_leader &&
	    (is_software_event(event) != is_software_event(group_leader))) {
		if (is_software_event(event)) {
			/*
			 * If event and group_leader are not both a software
			 * event, and event is, then group leader is not.
			 *
			 * Allow the addition of software events to !software
			 * groups, this is safe because software events never
			 * fail to schedule.
			 */
			pmu = group_leader->pmu;
		} else if (is_software_event(group_leader) &&
			   (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
			/*
			 * In case the group is a pure software group, and we
			 * try to add a hardware event, move the whole group to
			 * the hardware context.
			 */
			move_group = 1;
		}
	}
5585

5586
	if (pid != -1) {
M
Matt Helsley 已提交
5587
		task = find_lively_task_by_vpid(pid);
5588 5589 5590 5591 5592
		if (IS_ERR(task)) {
			err = PTR_ERR(task);
			goto err_group_fd;
		}
	}
M
Matt Helsley 已提交
5593

5594 5595 5596
	/*
	 * Get the target context (task or percpu):
	 */
M
Matt Helsley 已提交
5597
	ctx = find_get_context(pmu, task, cpu);
5598 5599 5600 5601 5602
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto err_group_fd;
	}

I
Ingo Molnar 已提交
5603
	/*
5604
	 * Look up the group leader (we will attach this event to it):
5605
	 */
5606
	if (group_leader) {
5607
		err = -EINVAL;
5608 5609

		/*
I
Ingo Molnar 已提交
5610 5611 5612 5613
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
5614
			goto err_context;
I
Ingo Molnar 已提交
5615 5616 5617
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
5618
		 */
5619 5620 5621 5622 5623 5624 5625 5626
		if (move_group) {
			if (group_leader->ctx->type != ctx->type)
				goto err_context;
		} else {
			if (group_leader->ctx != ctx)
				goto err_context;
		}

5627 5628 5629
		/*
		 * Only a group leader can be exclusive or pinned
		 */
5630
		if (attr.exclusive || attr.pinned)
5631
			goto err_context;
5632 5633 5634 5635 5636
	}

	if (output_event) {
		err = perf_event_set_output(event, output_event);
		if (err)
5637
			goto err_context;
5638
	}
T
Thomas Gleixner 已提交
5639

5640 5641 5642
	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
	if (IS_ERR(event_file)) {
		err = PTR_ERR(event_file);
5643
		goto err_context;
5644
	}
5645

5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657
	if (move_group) {
		struct perf_event_context *gctx = group_leader->ctx;

		mutex_lock(&gctx->mutex);
		perf_event_remove_from_context(group_leader);
		list_for_each_entry(sibling, &group_leader->sibling_list,
				    group_entry) {
			perf_event_remove_from_context(sibling);
			put_ctx(gctx);
		}
		mutex_unlock(&gctx->mutex);
		put_ctx(gctx);
5658
	}
5659

5660
	event->filp = event_file;
5661
	WARN_ON_ONCE(ctx->parent_ctx);
5662
	mutex_lock(&ctx->mutex);
5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673

	if (move_group) {
		perf_install_in_context(ctx, group_leader, cpu);
		get_ctx(ctx);
		list_for_each_entry(sibling, &group_leader->sibling_list,
				    group_entry) {
			perf_install_in_context(ctx, sibling, cpu);
			get_ctx(ctx);
		}
	}

5674
	perf_install_in_context(ctx, event, cpu);
5675
	++ctx->generation;
5676
	mutex_unlock(&ctx->mutex);
5677

5678
	event->owner = current;
5679
	get_task_struct(current);
5680 5681 5682
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);
5683

5684 5685 5686 5687 5688 5689
	/*
	 * Drop the reference on the group_event after placing the
	 * new event on the sibling_list. This ensures destruction
	 * of the group leader will find the pointer to itself in
	 * perf_group_detach().
	 */
5690 5691 5692
	fput_light(group_file, fput_needed);
	fd_install(event_fd, event_file);
	return event_fd;
T
Thomas Gleixner 已提交
5693

5694
err_context:
5695
	put_ctx(ctx);
5696 5697
err_group_fd:
	fput_light(group_file, fput_needed);
5698
	free_event(event);
5699 5700
err_fd:
	put_unused_fd(event_fd);
5701
	return err;
T
Thomas Gleixner 已提交
5702 5703
}
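
/*
 * Userspace usage sketch (illustrative; glibc provides no wrapper for
 * this syscall, so callers typically go through syscall(2) directly):
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.size		= sizeof(attr),
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.disabled	= 1,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr,
 *			 0,	// pid 0: the calling task
 *			 -1,	// cpu -1: on any cpu
 *			 -1,	// no group leader
 *			 0);	// flags
 *
 * The returned fd supports read(), mmap() of the sampling buffer and
 * the PERF_EVENT_IOC_* ioctls; passing a group fd instead of -1 places
 * the new event into that leader's group, subject to the checks above.
 */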

/**
 * perf_event_create_kernel_counter
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu in which the counter is bound
 * @task: task to profile (NULL for percpu)
 * @overflow_handler: callback to run on overflow (NULL for the default output path)
 */
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
				 struct task_struct *task,
				 perf_overflow_handler_t overflow_handler)
{
	struct perf_event_context *ctx;
5717
	struct perf_event *event;
5718
	int err;
5719

5720 5721 5722
	/*
	 * Get the target context (task or percpu):
	 */
5723

5724 5725 5726 5727 5728
	event = perf_event_alloc(attr, cpu, NULL, NULL, overflow_handler);
	if (IS_ERR(event)) {
		err = PTR_ERR(event);
		goto err;
	}
5729

M
Matt Helsley 已提交
5730
	ctx = find_get_context(event->pmu, task, cpu);
5731 5732
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
5733
		goto err_free;
5734
	}
5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750

	event->filp = NULL;
	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, event, cpu);
	++ctx->generation;
	mutex_unlock(&ctx->mutex);

	event->owner = current;
	get_task_struct(current);
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);

	return event;

5751 5752 5753
err_free:
	free_event(event);
err:
5754
	return ERR_PTR(err);
5755
}
5756
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
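
/*
 * In-kernel usage sketch (illustrative; "my_overflow" is a
 * hypothetical callback): a subsystem can bind a counter to a CPU
 * without any file descriptor being involved:
 *
 *	static void my_overflow(struct perf_event *event, int nmi,
 *				struct perf_sample_data *data,
 *				struct pt_regs *regs)
 *	{
 *		// react to the sample
 *	}
 *
 *	event = perf_event_create_kernel_counter(&attr, cpu, NULL,
 *						 my_overflow);
 *	if (IS_ERR(event))
 *		return PTR_ERR(event);
 *
 * Passing a task instead of NULL creates a per-task counter; a NULL
 * overflow_handler falls back to the default perf_event_output() path.
 */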

static void sync_child_event(struct perf_event *child_event,
5759
			       struct task_struct *child)
5760
{
5761
	struct perf_event *parent_event = child_event->parent;
5762
	u64 child_val;
5763

5764 5765
	if (child_event->attr.inherit_stat)
		perf_event_read_event(child_event, child);
5766

P
Peter Zijlstra 已提交
5767
	child_val = perf_event_count(child_event);
5768 5769 5770 5771

	/*
	 * Add back the child's count to the parent's count:
	 */
5772
	atomic64_add(child_val, &parent_event->child_count);
5773 5774 5775 5776
	atomic64_add(child_event->total_time_enabled,
		     &parent_event->child_total_time_enabled);
	atomic64_add(child_event->total_time_running,
		     &parent_event->child_total_time_running);
5777 5778

	/*
5779
	 * Remove this event from the parent's list
5780
	 */
5781 5782 5783 5784
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_del_init(&child_event->child_list);
	mutex_unlock(&parent_event->child_mutex);
5785 5786

	/*
5787
	 * Release the parent event, if this was the last
5788 5789
	 * reference to it.
	 */
5790
	fput(parent_event->filp);
5791 5792
}

5793
static void
5794 5795
__perf_event_exit_task(struct perf_event *child_event,
			 struct perf_event_context *child_ctx,
5796
			 struct task_struct *child)
5797
{
5798
	struct perf_event *parent_event;
5799

5800
	perf_event_remove_from_context(child_event);
5801

5802
	parent_event = child_event->parent;
5803
	/*
5804
	 * It can happen that parent exits first, and has events
5805
	 * that are still around due to the child reference. These
5806
	 * events need to be zapped - but otherwise linger.
5807
	 */
5808 5809 5810
	if (parent_event) {
		sync_child_event(child_event, child);
		free_event(child_event);
5811
	}
5812 5813
}

static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
	struct perf_event *child_event, *tmp;
	struct perf_event_context *child_ctx;
	unsigned long flags;

	if (likely(!child->perf_event_ctxp[ctxn])) {
		perf_event_task(child, NULL, 0);
		return;
	}

	local_irq_save(flags);
	/*
	 * We can't reschedule here because interrupts are disabled,
	 * and either child is current or it is a task that can't be
	 * scheduled, so we are now safe from rescheduling changing
	 * our context.
	 */
	child_ctx = child->perf_event_ctxp[ctxn];
	__perf_event_task_sched_out(child_ctx);

	/*
	 * Take the context lock here so that if find_get_context is
	 * reading child->perf_event_ctxp, we wait until it has
	 * incremented the context's refcount before we do put_ctx below.
	 */
	raw_spin_lock(&child_ctx->lock);
	child->perf_event_ctxp[ctxn] = NULL;
	/*
	 * If this context is a clone, unclone it so it can't get
	 * swapped to another process while we're removing all
	 * the events from it.
	 */
	unclone_ctx(child_ctx);
	update_context_time(child_ctx);
	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can however still
	 * get a few PERF_RECORD_READ events.
	 */
	perf_event_task(child, child_ctx, 0);

	/*
	 * We can recurse on the same lock type through:
	 *
	 *   __perf_event_exit_task()
	 *     sync_child_event()
	 *       fput(parent_event->filp)
	 *         perf_release()
	 *           mutex_lock(&ctx->mutex)
	 *
	 * But since it's the parent context it won't be the same instance.
	 */
	mutex_lock(&child_ctx->mutex);

again:
	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
				 group_entry)
		__perf_event_exit_task(child_event, child_ctx, child);

	list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
				 group_entry)
		__perf_event_exit_task(child_event, child_ctx, child);

	/*
	 * If the last event was a group event, it will have appended all
	 * its siblings to the list, but we obtained 'tmp' before that which
	 * will still point to the list head terminating the iteration.
	 */
	if (!list_empty(&child_ctx->pinned_groups) ||
	    !list_empty(&child_ctx->flexible_groups))
		goto again;

	mutex_unlock(&child_ctx->mutex);

	put_ctx(child_ctx);
}

/*
 * When a child task exits, feed back event values to parent events.
 */
void perf_event_exit_task(struct task_struct *child)
{
	int ctxn;

	for_each_task_context_nr(ctxn)
		perf_event_exit_task_context(child, ctxn);
}

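/*
 * Free an inherited event that was never exposed to userspace: unlink it
 * from its parent and from its context, then release it.
 */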
static void perf_free_event(struct perf_event *event,
			    struct perf_event_context *ctx)
{
	struct perf_event *parent = event->parent;

	if (WARN_ON_ONCE(!parent))
		return;

	mutex_lock(&parent->child_mutex);
	list_del_init(&event->child_list);
	mutex_unlock(&parent->child_mutex);

	fput(parent->filp);

	perf_group_detach(event);
	list_del_event(event, ctx);
	free_event(event);
}

/*
 * Free an unexposed, unused context as created by inheritance by
 * perf_event_init_task below, used by fork() in case of failure.
 */
void perf_event_free_task(struct task_struct *task)
{
	struct perf_event_context *ctx;
	struct perf_event *event, *tmp;
	int ctxn;

	for_each_task_context_nr(ctxn) {
		ctx = task->perf_event_ctxp[ctxn];
		if (!ctx)
			continue;

		mutex_lock(&ctx->mutex);
again:
		list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
				group_entry)
			perf_free_event(event, ctx);

		list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
				group_entry)
			perf_free_event(event, ctx);

		if (!list_empty(&ctx->pinned_groups) ||
				!list_empty(&ctx->flexible_groups))
			goto again;

		mutex_unlock(&ctx->mutex);

		put_ctx(ctx);
	}
}

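/*
 * Sanity check once the task is gone: warn if any per-task event context
 * is still attached at this point.
 */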
void perf_event_delayed_put(struct task_struct *task)
{
	int ctxn;

	for_each_task_context_nr(ctxn)
		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}

/*
 * inherit an event from parent task to child task:
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event *group_leader,
	      struct perf_event_context *child_ctx)
{
	struct perf_event *child_event;
	unsigned long flags;

	/*
	 * Instead of creating recursive hierarchies of events,
	 * we link inherited events back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_event->parent)
		parent_event = parent_event->parent;

	child_event = perf_event_alloc(&parent_event->attr,
					   parent_event->cpu,
					   group_leader, parent_event,
					   NULL);
	if (IS_ERR(child_event))
		return child_event;
	get_ctx(child_ctx);

	/*
	 * Make the child state follow the state of the parent event,
	 * not its attr.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_event_{en, dis}able_family.
	 */
	if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
		child_event->state = PERF_EVENT_STATE_INACTIVE;
	else
		child_event->state = PERF_EVENT_STATE_OFF;

	if (parent_event->attr.freq) {
		u64 sample_period = parent_event->hw.sample_period;
		struct hw_perf_event *hwc = &child_event->hw;

		hwc->sample_period = sample_period;
		hwc->last_period   = sample_period;

		local64_set(&hwc->period_left, sample_period);
	}

	child_event->ctx = child_ctx;
	child_event->overflow_handler = parent_event->overflow_handler;

	/*
	 * Link it up in the child's context:
	 */
	raw_spin_lock_irqsave(&child_ctx->lock, flags);
	add_event_to_ctx(child_event, child_ctx);
	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Get a reference to the parent filp - we will fput it
	 * when the child event exits. This is safe to do because
	 * we are in the parent and we know that the filp still
	 * exists and has a nonzero count:
	 */
	atomic_long_inc(&parent_event->filp->f_count);

	/*
	 * Link this into the parent event's child list
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_add_tail(&child_event->child_list, &parent_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	return child_event;
}

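/*
 * Inherit an entire event group: clone the group leader first, then each
 * of its siblings into the child context.
 */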
static int inherit_group(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event_context *child_ctx)
{
	struct perf_event *leader;
	struct perf_event *sub;
	struct perf_event *child_ctr;

	leader = inherit_event(parent_event, parent, parent_ctx,
				 child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
		child_ctr = inherit_event(sub, parent, parent_ctx,
					    child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);
	}
	return 0;
}

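/*
 * Inherit one of the parent's event groups into the child context for
 * context number ctxn, allocating that child context on first use.
 * Clears *inherited_all when the event is not inheritable or cloning fails.
 */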
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
		   struct perf_event_context *parent_ctx,
		   struct task_struct *child, int ctxn,
		   int *inherited_all)
{
	int ret;
	struct perf_event_context *child_ctx;

	if (!event->attr.inherit) {
		*inherited_all = 0;
		return 0;
	}

	child_ctx = child->perf_event_ctxp[ctxn];
	if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */

		child_ctx = alloc_perf_context(event->pmu, child);
		if (!child_ctx)
			return -ENOMEM;

		child->perf_event_ctxp[ctxn] = child_ctx;
	}

	ret = inherit_group(event, parent, parent_ctx,
			    child, child_ctx);

	if (ret)
		*inherited_all = 0;

	return ret;
}

/*
 * Initialize the ctxn-th perf_event context in task_struct
 */
int perf_event_init_context(struct task_struct *child, int ctxn)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	int ret = 0;

	child->perf_event_ctxp[ctxn] = NULL;

	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	if (likely(!parent->perf_event_ctxp[ctxn]))
		return 0;

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent, ctxn);

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, &inherited_all);
		if (ret)
			break;
	}

	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, &inherited_all);
		if (ret)
			break;
	}

	child_ctx = child->perf_event_ctxp[ctxn];

	if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 * Note that if the parent is a clone, it could get
		 * uncloned at any point, but that doesn't matter
		 * because the list of events and the generation
		 * count can't have changed since we took the mutex.
		 */
		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
int perf_event_init_task(struct task_struct *child)
{
	int ctxn, ret;

	for_each_task_context_nr(ctxn) {
		ret = perf_event_init_context(child, ctxn);
		if (ret)
			return ret;
	}

	return 0;
}

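/*
 * Boot-time setup: initialize each possible CPU's software event hash
 * table lock and rotation list.
 */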
static void __init perf_event_init_all_cpus(void)
{
	struct swevent_htable *swhash;
	int cpu;

	for_each_possible_cpu(cpu) {
		swhash = &per_cpu(swevent_htable, cpu);
		mutex_init(&swhash->hlist_mutex);
		INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
	}
}

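/*
 * Per-cpu bring-up: allocate the software event hlist for this CPU if
 * somebody already holds a reference on it.
 */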
static void __cpuinit perf_event_init_cpu(int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	if (swhash->hlist_refcount > 0) {
		struct swevent_hlist *hlist;

		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
		WARN_ON(!hlist);
		rcu_assign_pointer(swhash->swevent_hlist, hlist);
	}
	mutex_unlock(&swhash->hlist_mutex);
}

#ifdef CONFIG_HOTPLUG_CPU
static void perf_pmu_rotate_stop(struct pmu *pmu)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

	WARN_ON(!irqs_disabled());

	list_del_init(&cpuctx->rotation_list);
}

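/*
 * Runs on the CPU going down (via smp_call_function_single): stop context
 * rotation and detach every event still attached to this context.
 */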
static void __perf_event_exit_context(void *__info)
{
	struct perf_event_context *ctx = __info;
	struct perf_event *event, *tmp;

	perf_pmu_rotate_stop(ctx->pmu);

	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
		__perf_event_remove_from_context(event);
	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
		__perf_event_remove_from_context(event);
}

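/*
 * For each registered pmu, remove all events from its per-cpu context on
 * the CPU that is going away.
 */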
static void perf_event_exit_cpu_context(int cpu)
{
	struct perf_event_context *ctx;
	struct pmu *pmu;
	int idx;

	idx = srcu_read_lock(&pmus_srcu);
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;

		mutex_lock(&ctx->mutex);
		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
		mutex_unlock(&ctx->mutex);
	}
	srcu_read_unlock(&pmus_srcu, idx);
}

static void perf_event_exit_cpu(int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	swevent_hlist_release(swhash);
	mutex_unlock(&swhash->hlist_mutex);

	perf_event_exit_cpu_context(cpu);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
#endif

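/*
 * CPU hotplug notifier: bring per-cpu perf state up on CPU_UP_PREPARE /
 * CPU_DOWN_FAILED and tear it down on CPU_UP_CANCELED / CPU_DOWN_PREPARE.
 */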
static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {

	case CPU_UP_PREPARE:
	case CPU_DOWN_FAILED:
		perf_event_init_cpu(cpu);
		break;

	case CPU_UP_CANCELED:
	case CPU_DOWN_PREPARE:
		perf_event_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

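/*
 * Called once at boot: set up per-cpu state, register the built-in
 * software pmus and the tracepoint pmu, and hook into CPU hotplug
 * notifications.
 */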
void __init perf_event_init(void)
{
	perf_event_init_all_cpus();
	init_srcu_struct(&pmus_srcu);
	perf_pmu_register(&perf_swevent);
	perf_pmu_register(&perf_cpu_clock);
	perf_pmu_register(&perf_task_clock);
	perf_tp_register();
	perf_cpu_notifier(perf_cpu_notify);
}