/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/cpu.h>
#include <linux/bitops.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>

#if 0
#undef wrmsrl
#define wrmsrl(msr, val) 					\
do {								\
	trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
			(unsigned long)(val));			\
	native_write_msr((msr), (u32)((u64)(val)), 		\
			(u32)((u64)(val) >> 32));		\
} while (0)
#endif

/*
 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
 */
static unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
{
	unsigned long offset, addr = (unsigned long)from;
	int type = in_nmi() ? KM_NMI : KM_IRQ0;
	unsigned long size, len = 0;
	struct page *page;
	void *map;
	int ret;

	do {
		ret = __get_user_pages_fast(addr, 1, 0, &page);
		if (!ret)
			break;

		offset = addr & (PAGE_SIZE - 1);
		size = min(PAGE_SIZE - offset, n - len);

		map = kmap_atomic(page, type);
		memcpy(to, map+offset, size);
		kunmap_atomic(map, type);
		put_page(page);

		len  += size;
		to   += size;
		addr += size;

	} while (len < n);

	return len;
}

static u64 perf_event_mask __read_mostly;

struct event_constraint {
	union {
		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
		u64		idxmsk64;
	};
	u64	code;
	u64	cmask;
	int	weight;
};

struct amd_nb {
	int nb_id;  /* NorthBridge id */
	int refcnt; /* reference count */
	struct perf_event *owners[X86_PMC_IDX_MAX];
	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};

#define MAX_LBR_ENTRIES		16

struct cpu_hw_events {
	/*
	 * Generic x86 PMC bits
	 */
	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		interrupts;
	int			enabled;

	int			n_events;
	int			n_added;
	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
	u64			tags[X86_PMC_IDX_MAX];
	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */

	/*
	 * Intel DebugStore bits
	 */
	struct debug_store	*ds;
	u64			pebs_enabled;

	/*
	 * Intel LBR bits
	 */
	int				lbr_users;
	void				*lbr_context;
	struct perf_branch_stack	lbr_stack;
	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];

	/*
	 * AMD specific bits
	 */
	struct amd_nb		*amd_nb;
};

#define __EVENT_CONSTRAINT(c, n, m, w) {\
	{ .idxmsk64 = (n) },		\
	.code = (c),			\
	.cmask = (m),			\
	.weight = (w),			\
}

#define EVENT_CONSTRAINT(c, n, m)	\
	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))

/*
 * Constraint on the Event code.
 */
#define INTEL_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)

/*
 * Constraint on the Event code + UMask + fixed-mask
 */
#define FIXED_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)

/*
 * Constraint on the Event code + UMask
 */
#define PEBS_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)

#define EVENT_CONSTRAINT_END		\
	EVENT_CONSTRAINT(0, 0, 0)

#define for_each_event_constraint(e, c)	\
	for ((e) = (c); (e)->cmask; (e)++)
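
/*
 * Illustration (example values, not an entry from a real constraint
 * table): EVENT_CONSTRAINT(0xc0, 0x3, INTEL_ARCH_EVTSEL_MASK) matches any
 * event whose masked config equals 0xc0 and restricts it to generic
 * counters 0 and 1 (idxmsk64 = 0x3); HWEIGHT(0x3) gives it weight 2,
 * which the scheduler below uses to place it.
 */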

union perf_capabilities {
	struct {
		u64	lbr_format    : 6;
		u64	pebs_trap     : 1;
		u64	pebs_arch_reg : 1;
		u64	pebs_format   : 4;
		u64	smm_freeze    : 1;
	};
	u64	capabilities;
};

/*
 * struct x86_pmu - generic x86 pmu
 */
struct x86_pmu {
	/*
	 * Generic x86 PMC bits
	 */
	const char	*name;
	int		version;
	int		(*handle_irq)(struct pt_regs *);
	void		(*disable_all)(void);
	void		(*enable_all)(void);
	void		(*enable)(struct perf_event *);
	void		(*disable)(struct perf_event *);
	int		(*hw_config)(struct perf_event_attr *attr, struct hw_perf_event *hwc);
	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
	unsigned	eventsel;
	unsigned	perfctr;
	u64		(*event_map)(int);
	u64		(*raw_event)(u64);
	int		max_events;
	int		num_events;
	int		num_events_fixed;
	int		event_bits;
	u64		event_mask;
	int		apic;
	u64		max_period;
	struct event_constraint *
			(*get_event_constraints)(struct cpu_hw_events *cpuc,
						 struct perf_event *event);

	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
						 struct perf_event *event);
	struct event_constraint *event_constraints;
	void		(*quirks)(void);

	void		(*cpu_prepare)(int cpu);
	void		(*cpu_starting)(int cpu);
	void		(*cpu_dying)(int cpu);
	void		(*cpu_dead)(int cpu);

	/*
	 * Intel Arch Perfmon v2+
	 */
	u64			intel_ctrl;
	union perf_capabilities intel_cap;

	/*
	 * Intel DebugStore bits
	 */
	int		bts, pebs;
	int		pebs_record_size;
	void		(*drain_pebs)(struct pt_regs *regs);
	struct event_constraint *pebs_constraints;

	/*
	 * Intel LBR
	 */
	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
	int		lbr_nr;			   /* hardware stack size */
};

static struct x86_pmu x86_pmu __read_mostly;

static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

static int x86_perf_event_set_period(struct perf_event *event);

/*
 * Generalized hw caching related hw_event table, filled
 * in on a per model basis. A value of 0 means
 * 'not supported', -1 means 'hw_event makes no sense on
 * this CPU', any other value means the raw hw_event
 * ID.
 */

#define C(x) PERF_COUNT_HW_CACHE_##x
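
/*
 * For PERF_TYPE_HW_CACHE events, attr->config packs the three table
 * indices: bits 0-7 select the cache, bits 8-15 the operation and
 * bits 16-23 the result; set_ext_hw_attr() decodes it this way.
 */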

static u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
static u64
x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.event_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = atomic64_read(&hwc->prev_count);
	rdmsrl(hwc->event_base + idx, new_raw_count);

	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	atomic64_add(delta, &event->count);
	atomic64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);

static bool reserve_pmc_hardware(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	int i;

	if (nmi_watchdog == NMI_LOCAL_APIC)
		disable_lapic_nmi_watchdog();

	for (i = 0; i < x86_pmu.num_events; i++) {
		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_events; i++) {
		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
			goto eventsel_fail;
	}
#endif

	return true;

#ifdef CONFIG_X86_LOCAL_APIC
eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu.eventsel + i);

	i = x86_pmu.num_events;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu.perfctr + i);

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();

	return false;
#endif
}

static void release_pmc_hardware(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	int i;

	for (i = 0; i < x86_pmu.num_events; i++) {
		release_perfctr_nmi(x86_pmu.perfctr + i);
		release_evntsel_nmi(x86_pmu.eventsel + i);
	}

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();
#endif
}

static int reserve_ds_buffers(void);
static void release_ds_buffers(void);

static void hw_perf_event_destroy(struct perf_event *event)
{
	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
{
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;

	return 0;
}

static int x86_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
{
	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	hwc->config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!attr->exclude_user)
		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!attr->exclude_kernel)
		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;

	return 0;
}

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __hw_perf_event_init(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = 0;
	if (!atomic_inc_not_zero(&active_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_events) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				err = reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&active_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

	event->destroy = hw_perf_event_destroy;

	hwc->idx = -1;
	hwc->last_cpu = -1;
	hwc->last_tag = ~0ULL;

	/* Processor specifics */
	if (x86_pmu.hw_config(attr, hwc))
		return -EOPNOTSUPP;

	if (!hwc->sample_period) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		atomic64_set(&hwc->period_left, hwc->sample_period);
	} else {
		/*
		 * If we have a PMU initialized but no APIC
		 * interrupts, we cannot sample hardware
		 * events (user-space has to fall back and
		 * sample via a hrtimer based software event):
		 */
		if (!x86_pmu.apic)
			return -EOPNOTSUPP;
	}

	/*
	 * Raw hw_event type provides the config in the hw_event structure
	 */
	if (attr->type == PERF_TYPE_RAW) {
		hwc->config |= x86_pmu.raw_event(attr->config);
		if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
		    perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
		return 0;
	}

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, attr);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
	    (hwc->sample_period == 1)) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;
	}

	hwc->config |= config;

	return 0;
}

static void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_events; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu.eventsel + idx, val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu.eventsel + idx, val);
	}
}

void hw_perf_disable(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

static void x86_pmu_enable_all(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_events; idx++) {
		struct perf_event *event = cpuc->events[idx];
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		val = event->hw.config;
		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu.eventsel + idx, val);
	}
}

static const struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	int i, j, w, wmax, num = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	for (i = 0; i < n; i++) {
		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
		constraints[i] = c;
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = constraints[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}
	if (i == n)
		goto done;

	/*
	 * begin slow path
	 */

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	/*
	 * weight = number of possible counters
	 *
	 * 1    = most constrained, only works on one counter
	 * wmax = least constrained, works on any counter
	 *
	 * assign events to counters starting with most
	 * constrained events.
	 */
	wmax = x86_pmu.num_events;

	/*
	 * when fixed event counters are present,
	 * wmax is incremented by 1 to account
	 * for one more choice
	 */
	if (x86_pmu.num_events_fixed)
		wmax++;
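
	/*
	 * Illustration: with four generic counters and no fixed counters,
	 * wmax is 4.  An event whose constraint allows only counter 2 has
	 * weight 1 and is placed in the first (w == 1) pass, before the
	 * unconstrained events (weight 4) get a chance to occupy counter 2.
	 */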

	for (w = 1, num = n; num && w <= wmax; w++) {
		/* for each event */
		for (i = 0; num && i < n; i++) {
			c = constraints[i];
			hwc = &cpuc->event_list[i]->hw;

			if (c->weight != w)
				continue;

			for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
				if (!test_bit(j, used_mask))
					break;
			}

			if (j == X86_PMC_IDX_MAX)
				break;

			__set_bit(j, used_mask);

			if (assign)
				assign[i] = j;
			num--;
		}
	}
done:
	/*
	 * scheduling failed or is just a simulation,
	 * free resources if necessary
	 */
	if (!assign || num) {
		for (i = 0; i < n; i++) {
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
		}
	}
	return num ? -ENOSPC : 0;
}

/*
 * dogrp: true if must collect siblings events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -ENOSPC;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -ENOSPC;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		/*
		 * We set it so that event_base + idx in wrmsr/rdmsr maps to
		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
		 */
		hwc->event_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
	} else {
		hwc->config_base = x86_pmu.eventsel;
		hwc->event_base  = x86_pmu.perfctr;
	}
}

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}
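
/*
 * Note: event scheduling is done lazily.  x86_pmu_enable() and
 * hw_perf_group_sched_in() only collect events and compute an assignment;
 * hw_perf_enable() below does the actual counter programming and uses
 * match_prev_assignment() to skip events that kept their counter.
 */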

static int x86_pmu_start(struct perf_event *event);
static void x86_pmu_stop(struct perf_event *event);

void hw_perf_enable(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < n_running; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			x86_pmu_stop(event);
		}

		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (!match_prev_assignment(hwc, cpuc, i))
				x86_assign_hw_event(event, cpuc, i);
			else if (i < n_running)
				continue;

			x86_pmu_start(event);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all();
}

static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
{
	wrmsrl(hwc->config_base + hwc->idx,
			      hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
}

static inline void x86_pmu_disable_event(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	wrmsrl(hwc->config_base + hwc->idx, hwc->config);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
static int
x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = atomic64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0, idx = hwc->idx;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		atomic64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		atomic64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
	atomic64_set(&hwc->prev_count, (u64)-left);

	wrmsrl(hwc->event_base + idx,
			(u64)(-left) & x86_pmu.event_mask);

	perf_event_update_userpage(event);

	return ret;
}

static void x86_pmu_enable_event(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	if (cpuc->enabled)
		__x86_pmu_enable_event(&event->hw);
}

/*
 * activate a single event
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 *
 * Called with PMU disabled. If successful and return value 1,
 * then guaranteed to call perf_enable() and hw_perf_enable()
 */
static int x86_pmu_enable(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	n0 = cpuc->n_events;
	n = collect_events(cpuc, event, false);
	if (n < 0)
		return n;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		return ret;
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->n_events = n;
	cpuc->n_added += n - n0;

	return 0;
}

static int x86_pmu_start(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx = event->hw.idx;

	if (idx == -1)
		return -EAGAIN;

	x86_perf_event_set_period(event);
	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	x86_pmu.enable(event);
	perf_event_update_userpage(event);

	return 0;
}

static void x86_pmu_unthrottle(struct perf_event *event)
{
	int ret = x86_pmu_start(event);
	WARN_ON_ONCE(ret);
}

void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	u64 pebs;
	struct cpu_hw_events *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_events)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
	}
	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for (idx = 0; idx < x86_pmu.num_events; idx++) {
		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

static void x86_pmu_stop(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;
	int idx = hwc->idx;

	if (!__test_and_clear_bit(idx, cpuc->active_mask))
		return;

	x86_pmu.disable(event);

	/*
	 * Drain the remaining delta count out of an event
	 * that we are disabling:
	 */
	x86_perf_event_update(event);

	cpuc->events[idx] = NULL;
}

static void x86_pmu_disable(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int i;

	x86_pmu_stop(event);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i]) {

			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, event);

			while (++i < cpuc->n_events)
				cpuc->event_list[i-1] = cpuc->event_list[i];

			--cpuc->n_events;
			break;
		}
	}
	perf_event_update_userpage(event);
}

static int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int idx, handled = 0;
	u64 val;

	perf_sample_data_init(&data, 0);

	cpuc = &__get_cpu_var(cpu_hw_events);

	for (idx = 0; idx < x86_pmu.num_events; idx++) {
		if (!test_bit(idx, cpuc->active_mask))
			continue;

		event = cpuc->events[idx];
		hwc = &event->hw;

		val = x86_perf_event_update(event);
		if (val & (1ULL << (x86_pmu.event_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled		= 1;
		data.period	= event->hw.last_period;

		if (!x86_perf_event_set_period(event))
			continue;

		if (perf_event_overflow(event, 1, &data, regs))
			x86_pmu_stop(event);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}

void smp_perf_pending_interrupt(struct pt_regs *regs)
{
	irq_enter();
	ack_APIC_irq();
	inc_irq_stat(apic_pending_irqs);
	perf_event_do_pending();
	irq_exit();
}

void set_perf_event_pending(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
#endif
}

void perf_events_lapic_init(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
#endif
}

static int __kprobes
perf_event_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	struct pt_regs *regs;

	if (!atomic_read(&active_events))
		return NOTIFY_DONE;

	switch (cmd) {
	case DIE_NMI:
	case DIE_NMI_IPI:
		break;

	default:
		return NOTIFY_DONE;
	}

	regs = args->regs;

#ifdef CONFIG_X86_LOCAL_APIC
	apic_write(APIC_LVTPC, APIC_DM_NMI);
#endif
	/*
	 * Can't rely on the handled return value to say it was our NMI, two
	 * events could trigger 'simultaneously' raising two back-to-back NMIs.
	 *
	 * If the first NMI handles both, the latter will be empty and daze
	 * the CPU.
	 */
	x86_pmu.handle_irq(regs);

	return NOTIFY_STOP;
}

static __read_mostly struct notifier_block perf_event_nmi_notifier = {
	.notifier_call		= perf_event_nmi_handler,
	.next			= NULL,
	.priority		= 1
};

static struct event_constraint unconstrained;
static struct event_constraint emptyconstraint;

static struct event_constraint *
x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
{
	struct event_constraint *c;

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if ((event->hw.config & c->cmask) == c->code)
				return c;
		}
	}

	return &unconstrained;
}

static int x86_event_sched_in(struct perf_event *event,
			  struct perf_cpu_context *cpuctx)
{
	int ret = 0;

	event->state = PERF_EVENT_STATE_ACTIVE;
	event->oncpu = smp_processor_id();
	event->tstamp_running += event->ctx->time - event->tstamp_stopped;

	if (!is_x86_event(event))
		ret = event->pmu->enable(event);

	if (!ret && !is_software_event(event))
		cpuctx->active_oncpu++;

	if (!ret && event->attr.exclusive)
		cpuctx->exclusive = 1;

	return ret;
}

static void x86_event_sched_out(struct perf_event *event,
			    struct perf_cpu_context *cpuctx)
{
	event->state = PERF_EVENT_STATE_INACTIVE;
	event->oncpu = -1;

	if (!is_x86_event(event))
		event->pmu->disable(event);

	event->tstamp_running -= event->ctx->time - event->tstamp_stopped;

	if (!is_software_event(event))
		cpuctx->active_oncpu--;

	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;
}

/*
 * Called to enable a whole group of events.
 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
 * Assumes the caller has disabled interrupts and has
 * frozen the PMU with hw_perf_save_disable.
 *
 * called with PMU disabled. If successful and return value 1,
 * then guaranteed to call perf_enable() and hw_perf_enable()
 */
int hw_perf_group_sched_in(struct perf_event *leader,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *sub;
	int assign[X86_PMC_IDX_MAX];
	int n0, n1, ret;

	if (!x86_pmu_initialized())
		return 0;

	/* n0 = total number of events */
	n0 = collect_events(cpuc, leader, true);
	if (n0 < 0)
		return n0;

	ret = x86_pmu.schedule_events(cpuc, n0, assign);
	if (ret)
		return ret;

	ret = x86_event_sched_in(leader, cpuctx);
	if (ret)
		return ret;

	n1 = 1;
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
		if (sub->state > PERF_EVENT_STATE_OFF) {
			ret = x86_event_sched_in(sub, cpuctx);
			if (ret)
				goto undo;
			++n1;
		}
	}
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n0*sizeof(int));

	cpuc->n_events  = n0;
	cpuc->n_added  += n1;
	ctx->nr_active += n1;

	/*
	 * 1 means successful and events are active
	 * This is not quite true because we defer
	 * actual activation until hw_perf_enable() but
	 * this way we ensure caller won't try to enable
	 * individual events
	 */
	return 1;
undo:
	x86_event_sched_out(leader, cpuctx);
	n0  = 1;
	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
			x86_event_sched_out(sub, cpuctx);
			if (++n0 == n1)
				break;
		}
	}
	return ret;
}
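
/*
 * Model-specific PMU drivers are built into this translation unit; they
 * provide the intel_pmu_init()/amd_pmu_init() entry points used by
 * init_hw_perf_events() below.
 */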

#include "perf_event_amd.c"
#include "perf_event_p6.c"
#include "perf_event_p4.c"
#include "perf_event_intel_lbr.c"
#include "perf_event_intel_ds.c"
#include "perf_event_intel.c"

static int __cpuinit
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		if (x86_pmu.cpu_prepare)
			x86_pmu.cpu_prepare(cpu);
		break;

	case CPU_STARTING:
		if (x86_pmu.cpu_starting)
			x86_pmu.cpu_starting(cpu);
		break;

	case CPU_DYING:
		if (x86_pmu.cpu_dying)
			x86_pmu.cpu_dying(cpu);
		break;

	case CPU_DEAD:
		if (x86_pmu.cpu_dead)
			x86_pmu.cpu_dead(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static void __init pmu_check_apic(void)
{
	if (cpu_has_apic)
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");
}

void __init init_hw_perf_events(void)
{
	struct event_constraint *c;
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return;
	}

	pmu_check_apic();

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	if (x86_pmu.quirks)
		x86_pmu.quirks();

	if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
		x86_pmu.num_events = X86_PMC_MAX_GENERIC;
	}
	perf_event_mask = (1 << x86_pmu.num_events) - 1;
	perf_max_events = x86_pmu.num_events;

	if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
		     x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
		x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
	}

	perf_event_mask |=
		((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
	x86_pmu.intel_ctrl = perf_event_mask;

	perf_events_lapic_init();
	register_die_notifier(&perf_event_nmi_notifier);

	unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
				   0, x86_pmu.num_events);

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if (c->cmask != INTEL_ARCH_FIXED_MASK)
				continue;

			c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
			c->weight += x86_pmu.num_events;
		}
	}

	pr_info("... version:                %d\n",     x86_pmu.version);
	pr_info("... bit width:              %d\n",     x86_pmu.event_bits);
	pr_info("... generic registers:      %d\n",     x86_pmu.num_events);
	pr_info("... value mask:             %016Lx\n", x86_pmu.event_mask);
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);
	pr_info("... event mask:             %016Lx\n", perf_event_mask);

	perf_cpu_notifier(x86_pmu_notifier);
}

static inline void x86_pmu_read(struct perf_event *event)
{
	x86_perf_event_update(event);
}

static const struct pmu pmu = {
	.enable		= x86_pmu_enable,
	.disable	= x86_pmu_disable,
	.start		= x86_pmu_start,
	.stop		= x86_pmu_stop,
	.read		= x86_pmu_read,
	.unthrottle	= x86_pmu_unthrottle,
};

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
	if (!fake_cpuc)
		return -ENOMEM;

	c = x86_pmu.get_event_constraints(fake_cpuc, event);

	if (!c || !c->weight)
		ret = -ENOSPC;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	kfree(fake_cpuc);

	return ret;
}

/*
 * validate a single event group
 *
 * validation includes:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret, n;

	ret = -ENOMEM;
	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
	if (!fake_cpuc)
		goto out;

	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	ret = -ENOSPC;
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out_free;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out_free;

	fake_cpuc->n_events = n;

	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

out_free:
	kfree(fake_cpuc);
out:
	return ret;
}

const struct pmu *hw_perf_event_init(struct perf_event *event)
{
	const struct pmu *tmp;
	int err;

	err = __hw_perf_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
		return ERR_PTR(err);
	}

	return &pmu;
}

/*
 * callchain support
 */

static inline
void callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
	if (entry->nr < PERF_MAX_STACK_DEPTH)
		entry->ip[entry->nr++] = ip;
}

static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);


static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	/* Ignore warnings */
}

static void backtrace_warning(void *data, char *msg)
{
	/* Ignore warnings */
}

static int backtrace_stack(void *data, char *name)
{
	return 0;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	if (reliable)
		callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.warning		= backtrace_warning,
	.warning_symbol		= backtrace_warning_symbol,
	.stack			= backtrace_stack,
	.address		= backtrace_address,
	.walk_stack		= print_context_stack_bp,
};

#include "../dumpstack.h"

static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	callchain_store(entry, PERF_CONTEXT_KERNEL);
	callchain_store(entry, regs->ip);

	dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
}

static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
{
	unsigned long bytes;

	bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));

	return bytes == sizeof(*frame);
}

static void
perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	struct stack_frame frame;
	const void __user *fp;

	if (!user_mode(regs))
		regs = task_pt_regs(current);

	fp = (void __user *)regs->bp;

	callchain_store(entry, PERF_CONTEXT_USER);
	callchain_store(entry, regs->ip);

	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		frame.next_frame	     = NULL;
		frame.return_address = 0;

		if (!copy_stack_frame(fp, &frame))
			break;

		if ((unsigned long)fp < regs->sp)
			break;

		callchain_store(entry, frame.return_address);
		fp = frame.next_frame;
	}
}

static void
perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	int is_user;

	if (!regs)
		return;

	is_user = user_mode(regs);

	if (is_user && current->state != TASK_RUNNING)
		return;

	if (!is_user)
		perf_callchain_kernel(regs, entry);

	if (current->mm)
		perf_callchain_user(regs, entry);
}

struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
	struct perf_callchain_entry *entry;

	if (in_nmi())
		entry = &__get_cpu_var(pmc_nmi_entry);
	else
		entry = &__get_cpu_var(pmc_irq_entry);

	entry->nr = 0;

	perf_do_callchain(regs, entry);

	return entry;
}

void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
{
	regs->ip = ip;
	/*
	 * perf_arch_fetch_caller_regs adds another call, we need to increment
	 * the skip level
	 */
	regs->bp = rewind_frame_pointer(skip + 1);
	regs->cs = __KERNEL_CS;
	local_save_flags(regs->flags);
}
EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);