/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/cpu.h>
#include <linux/bitops.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/compat.h>

#if 0
#undef wrmsrl
#define wrmsrl(msr, val)					\
do {								\
	trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
			(unsigned long)(val));			\
	native_write_msr((msr), (u32)((u64)(val)),		\
			(u32)((u64)(val) >> 32));		\
} while (0)
#endif

/*
 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
 */
static unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
{
	unsigned long offset, addr = (unsigned long)from;
	unsigned long size, len = 0;
	struct page *page;
	void *map;
	int ret;

	do {
		ret = __get_user_pages_fast(addr, 1, 0, &page);
		if (!ret)
			break;

		offset = addr & (PAGE_SIZE - 1);
		size = min(PAGE_SIZE - offset, n - len);

		map = kmap_atomic(page);
		memcpy(to, map+offset, size);
		kunmap_atomic(map);
		put_page(page);

		len  += size;
		to   += size;
		addr += size;

	} while (len < n);

	return len;
}
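
/*
 * Illustrative use (added note, not part of the original file): the
 * user-space callchain walker at the bottom of this file reads stack
 * frames with this helper, because a regular copy_from_user() may fault
 * and sleep, which is not allowed in NMI context:
 *
 *	struct stack_frame frame;
 *	unsigned long bytes;
 *
 *	bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
 *	if (bytes != sizeof(frame))
 *		break;		// partial copy: stop walking
 */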

struct event_constraint {
	union {
		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
		u64		idxmsk64;
	};
	u64	code;
	u64	cmask;
	int	weight;
};
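
/*
 * Added note (not in the original file): a constraint matches an event
 * when (hwc->config & cmask) == code; idxmsk is the bitmask of counters
 * the event may be scheduled on and weight is its popcount, which
 * x86_schedule_events() uses to place the most constrained events first.
 */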

struct amd_nb {
	int nb_id;  /* NorthBridge id */
	int refcnt; /* reference count */
	struct perf_event *owners[X86_PMC_IDX_MAX];
	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};

#define MAX_LBR_ENTRIES		16

struct cpu_hw_events {
	/*
	 * Generic x86 PMC bits
	 */
	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	int			enabled;

	int			n_events;
	int			n_added;
	int			n_txn;
	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
	u64			tags[X86_PMC_IDX_MAX];
	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */

	unsigned int		group_flag;

	/*
	 * Intel DebugStore bits
	 */
	struct debug_store	*ds;
	u64			pebs_enabled;

	/*
	 * Intel LBR bits
	 */
	int				lbr_users;
	void				*lbr_context;
	struct perf_branch_stack	lbr_stack;
	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];

	/*
	 * AMD specific bits
	 */
	struct amd_nb		*amd_nb;
};

#define __EVENT_CONSTRAINT(c, n, m, w) {\
	{ .idxmsk64 = (n) },		\
	.code = (c),			\
	.cmask = (m),			\
	.weight = (w),			\
}

#define EVENT_CONSTRAINT(c, n, m)	\
	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))

/*
 * Constraint on the Event code.
 */
#define INTEL_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)

/*
 * Constraint on the Event code + UMask + fixed-mask
 *
 * filter mask to validate fixed counter events.
 * the following filters disqualify for fixed counters:
 *  - inv
 *  - edge
 *  - cnt-mask
 *  The other filters are supported by fixed counters.
 *  The any-thread option is supported starting with v3.
 */
#define FIXED_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
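
/*
 * Added note (not in the original file): fixed-purpose counters start at
 * index X86_PMC_IDX_FIXED (32) in the generic index space, so bit
 * (32 + n) in idxmsk64 selects fixed counter n and nothing else.
 */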

/*
 * Constraint on the Event code + UMask
 */
#define PEBS_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)

#define EVENT_CONSTRAINT_END		\
	EVENT_CONSTRAINT(0, 0, 0)

#define for_each_event_constraint(e, c)	\
	for ((e) = (c); (e)->weight; (e)++)

union perf_capabilities {
	struct {
		u64	lbr_format    : 6;
		u64	pebs_trap     : 1;
		u64	pebs_arch_reg : 1;
		u64	pebs_format   : 4;
		u64	smm_freeze    : 1;
	};
	u64	capabilities;
};

/*
 * struct x86_pmu - generic x86 pmu
 */
struct x86_pmu {
	/*
	 * Generic x86 PMC bits
	 */
	const char	*name;
	int		version;
	int		(*handle_irq)(struct pt_regs *);
	void		(*disable_all)(void);
	void		(*enable_all)(int added);
	void		(*enable)(struct perf_event *);
	void		(*disable)(struct perf_event *);
	int		(*hw_config)(struct perf_event *event);
	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
	unsigned	eventsel;
	unsigned	perfctr;
	u64		(*event_map)(int);
	int		max_events;
	int		num_counters;
	int		num_counters_fixed;
	int		cntval_bits;
	u64		cntval_mask;
	int		apic;
	u64		max_period;
	struct event_constraint *
			(*get_event_constraints)(struct cpu_hw_events *cpuc,
						 struct perf_event *event);

	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
						 struct perf_event *event);
	struct event_constraint *event_constraints;
	void		(*quirks)(void);
	int		perfctr_second_write;

	int		(*cpu_prepare)(int cpu);
	void		(*cpu_starting)(int cpu);
	void		(*cpu_dying)(int cpu);
	void		(*cpu_dead)(int cpu);

	/*
	 * Intel Arch Perfmon v2+
	 */
	u64			intel_ctrl;
	union perf_capabilities intel_cap;

	/*
	 * Intel DebugStore bits
	 */
	int		bts, pebs;
	int		bts_active, pebs_active;
	int		pebs_record_size;
	void		(*drain_pebs)(struct pt_regs *regs);
	struct event_constraint *pebs_constraints;

	/*
	 * Intel LBR
	 */
	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
	int		lbr_nr;			   /* hardware stack size */
};
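
/*
 * Added note (not in the original file): this ops/feature table is all
 * the generic code below touches; it is filled in at boot by
 * intel_pmu_init() or amd_pmu_init() (see init_hw_perf_events()), so
 * everything model specific stays in the included vendor files.
 */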

static struct x86_pmu x86_pmu __read_mostly;

static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

static int x86_perf_event_set_period(struct perf_event *event);

/*
 * Generalized hw caching related hw_event table, filled
 * in on a per model basis. A value of 0 means
 * 'not supported', -1 means 'hw_event makes no sense on
 * this CPU', any other value means the raw hw_event
 * ID.
 */

#define C(x) PERF_COUNT_HW_CACHE_##x

static u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
static u64
x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(hwc->event_base + idx, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}
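
/*
 * Added worked example (not in the original file): with 48-bit counters,
 * cntval_bits = 48 and shift = 16.  Shifting both raw values up by 16
 * and the difference back down sign-extends the 48-bit delta, so
 * hardware that does not sign-extend above the counter width still
 * yields a correct signed delta.
 */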

static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu.eventsel + i);

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu.perfctr + i);

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu.perfctr + i);
		release_evntsel_nmi(x86_pmu.eventsel + i);
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

static bool check_hw_exists(void)
{
	u64 val, val_new = 0;
	int ret = 0;

	val = 0xabcdUL;
	ret |= checking_wrmsrl(x86_pmu.perfctr, val);
	ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new);
	if (ret || val != val_new)
		return false;

	return true;
}
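
/*
 * Added note (not in the original file): writing a dummy value to the
 * first counter MSR and reading it back catches machines (typically
 * virtualized ones) that advertise a PMU but do not actually implement
 * the counter MSRs; init_hw_perf_events() bails out in that case.
 */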

static void reserve_ds_buffers(void);
static void release_ds_buffers(void);

static void hw_perf_event_destroy(struct perf_event *event)
{
	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
{
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;

	return 0;
}
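
/*
 * Added example (not in the original file): a PERF_TYPE_HW_CACHE config
 * packs three selector bytes, e.g. L1D read misses are encoded as
 *
 *	PERF_COUNT_HW_CACHE_L1D |
 *	(PERF_COUNT_HW_CACHE_OP_READ << 8) |
 *	(PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
 *
 * which the function above maps through hw_cache_event_ids[][][].
 */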

static int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!hwc->sample_period) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	} else {
		/*
		 * If we have a PMU initialized but no APIC
		 * interrupts, we cannot sample hardware
		 * events (user-space has to fall back and
		 * sample via a hrtimer based software event):
		 */
		if (!x86_pmu.apic)
			return -EOPNOTSUPP;
	}

	if (attr->type == PERF_TYPE_RAW)
		return 0;

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, attr);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
	    (hwc->sample_period == 1)) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts_active)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;
	}

	hwc->config |= config;

	return 0;
}

static int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = 0;

		/* Support for constant skid */
		if (x86_pmu.pebs_active) {
			precise++;

			/* Support for IP fixup */
			if (x86_pmu.lbr_nr)
				precise++;
		}

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;
	}

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	return x86_setup_perfctr(event);
}
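
/*
 * Added note (not in the original file): the precise_ip check above
 * means precise_ip == 1 needs PEBS ("constant skid") and precise_ip == 2
 * additionally needs the LBR stack so the exact instruction pointer can
 * be fixed up; anything more precise is rejected with -EOPNOTSUPP.
 */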

/*
 * Set up the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = 0;
	if (!atomic_inc_not_zero(&active_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_events) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&active_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	return x86_pmu.hw_config(event);
}

static void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu.eventsel + idx, val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu.eventsel + idx, val);
	}
}

static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

static void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct perf_event *event = cpuc->events[idx];
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		val = event->hw.config;
		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu.eventsel + idx, val);
	}
}

static struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	int i, j, w, wmax, num = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	for (i = 0; i < n; i++) {
		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
		constraints[i] = c;
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = constraints[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}
	if (i == n)
		goto done;

	/*
	 * begin slow path
	 */

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	/*
	 * weight = number of possible counters
	 *
	 * 1    = most constrained, only works on one counter
	 * wmax = least constrained, works on any counter
	 *
	 * assign events to counters starting with most
	 * constrained events.
	 */
	wmax = x86_pmu.num_counters;

	/*
	 * when fixed event counters are present,
	 * wmax is incremented by 1 to account
	 * for one more choice
	 */
	if (x86_pmu.num_counters_fixed)
		wmax++;

	for (w = 1, num = n; num && w <= wmax; w++) {
		/* for each event */
		for (i = 0; num && i < n; i++) {
			c = constraints[i];
			hwc = &cpuc->event_list[i]->hw;

			if (c->weight != w)
				continue;

			for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
				if (!test_bit(j, used_mask))
					break;
			}

			if (j == X86_PMC_IDX_MAX)
				break;

			__set_bit(j, used_mask);

			if (assign)
				assign[i] = j;
			num--;
		}
	}
done:
	/*
	 * scheduling failed or is just a simulation,
	 * free resources if necessary
	 */
	if (!assign || num) {
		for (i = 0; i < n; i++) {
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
		}
	}
	return num ? -ENOSPC : 0;
}
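
/*
 * Added example (not in the original file): with two generic counters,
 * an event whose constraint allows only counter 0 (weight 1) is placed
 * in the w == 1 pass before an unconstrained event (weight 2), so the
 * unconstrained one falls back to counter 1 and both can be scheduled.
 */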

/*
 * dogrp: true if we must collect sibling events (group)
 * returns the total number of events or an error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -ENOSPC;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -ENOSPC;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		/*
		 * We set it so that event_base + idx in wrmsr/rdmsr maps to
		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
		 */
		hwc->event_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
	} else {
		hwc->config_base = x86_pmu.eventsel;
		hwc->event_base  = x86_pmu.perfctr;
	}
}

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}

static void x86_pmu_start(struct perf_event *event, int flags);
static void x86_pmu_stop(struct perf_event *event, int flags);

static void x86_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i, added = cpuc->n_added;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < n_running; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			/*
			 * Ensure we don't accidentally enable a stopped
			 * counter simply because we rescheduled.
			 */
			if (hwc->state & PERF_HES_STOPPED)
				hwc->state |= PERF_HES_ARCH;

			x86_pmu_stop(event, PERF_EF_UPDATE);
		}

		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (!match_prev_assignment(hwc, cpuc, i))
				x86_assign_hw_event(event, cpuc, i);
			else if (i < n_running)
				continue;

			if (hwc->state & PERF_HES_ARCH)
				continue;

			x86_pmu_start(event, PERF_EF_RELOAD);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all(added);
}

static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
					  u64 enable_mask)
{
	wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
}

static inline void x86_pmu_disable_event(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	wrmsrl(hwc->config_base + hwc->idx, hwc->config);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
static int
x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = local64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0, idx = hwc->idx;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
	local64_set(&hwc->prev_count, (u64)-left);

	wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);

	/*
	 * Due to an erratum on certain CPUs we need
	 * a second write to be sure the register
	 * is updated properly.
	 */
	if (x86_pmu.perfctr_second_write) {
		wrmsrl(hwc->event_base + idx,
			(u64)(-left) & x86_pmu.cntval_mask);
	}

	perf_event_update_userpage(event);

	return ret;
}
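
/*
 * Added example (not in the original file): for a sample_period of
 * 100000 the counter is programmed with (u64)(-100000) & cntval_mask,
 * so it counts up, overflows after 100000 events and raises the PMI
 * that x86_pmu_handle_irq() services below.
 */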

static void x86_pmu_enable_event(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	if (cpuc->enabled)
		__x86_pmu_enable_event(&event->hw,
				       ARCH_PERFMON_EVENTSEL_ENABLE);
}

/*
 * Add a single event to the PMU.
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 */
static int x86_pmu_add(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	perf_pmu_disable(event->pmu);
	n0 = cpuc->n_events;
	ret = n = collect_events(cpuc, event, false);
	if (ret < 0)
		goto out;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (!(flags & PERF_EF_START))
		hwc->state |= PERF_HES_ARCH;

	/*
	 * If a group event scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole.
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		goto done_collect;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		goto out;
	/*
	 * copy new assignment, now we know it is possible;
	 * it will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

done_collect:
	cpuc->n_events = n;
	cpuc->n_added += n - n0;
	cpuc->n_txn += n - n0;

	ret = 0;
out:
	perf_pmu_enable(event->pmu);
	return ret;
}

static void x86_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx = event->hw.idx;

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (flags & PERF_EF_RELOAD) {
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
		x86_perf_event_set_period(event);
	}

	event->hw.state = 0;

	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	__set_bit(idx, cpuc->running);
	x86_pmu.enable(event);
	perf_event_update_userpage(event);
}

void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	u64 pebs;
	struct cpu_hw_events *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
	}
	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

static void x86_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
		x86_pmu.disable(event);
		cpuc->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		x86_perf_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}

static void x86_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int i;

	/*
	 * If we're called during a txn, we don't need to do anything.
	 * The events never got scheduled and ->cancel_txn will truncate
	 * the event_list.
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		return;

	x86_pmu_stop(event, PERF_EF_UPDATE);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i]) {

			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, event);

			while (++i < cpuc->n_events)
				cpuc->event_list[i-1] = cpuc->event_list[i];

			--cpuc->n_events;
			break;
		}
	}
	perf_event_update_userpage(event);
}

static int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	int idx, handled = 0;
	u64 val;

	perf_sample_data_init(&data, 0);

	cpuc = &__get_cpu_var(cpu_hw_events);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask)) {
			/*
			 * Though we deactivated the counter, some CPUs
			 * might still deliver spurious interrupts that
			 * are already in flight. Catch them:
			 */
			if (__test_and_clear_bit(idx, cpuc->running))
				handled++;
			continue;
		}

		event = cpuc->events[idx];

		val = x86_perf_event_update(event);
		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled++;
		data.period	= event->hw.last_period;

		if (!x86_perf_event_set_period(event))
			continue;

		if (perf_event_overflow(event, 1, &data, regs))
			x86_pmu_stop(event, 0);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}
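
/*
 * Added note (not in the original file): the sign-bit test above works
 * because x86_perf_event_set_period() programs the counter with -left,
 * i.e. a "negative" value; while the top bit is still set the counter
 * has not yet overflowed, so the interrupt was not caused by this event.
 */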

void perf_events_lapic_init(void)
{
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}

struct pmu_nmi_state {
	unsigned int	marked;
	int		handled;
};

static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);

static int __kprobes
perf_event_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	unsigned int this_nmi;
	int handled;

	if (!atomic_read(&active_events))
		return NOTIFY_DONE;

	switch (cmd) {
	case DIE_NMI:
	case DIE_NMI_IPI:
		break;
	case DIE_NMIUNKNOWN:
		this_nmi = percpu_read(irq_stat.__nmi_count);
		if (this_nmi != __get_cpu_var(pmu_nmi).marked)
			/* let the kernel handle the unknown nmi */
			return NOTIFY_DONE;
		/*
		 * This one is a PMU back-to-back nmi. Two events
		 * trigger 'simultaneously' raising two back-to-back
		 * NMIs. If the first NMI handles both, the latter
		 * will be empty and daze the CPU. So, we drop it to
		 * avoid false-positive 'unknown nmi' messages.
		 */
		return NOTIFY_STOP;
	default:
		return NOTIFY_DONE;
	}

	apic_write(APIC_LVTPC, APIC_DM_NMI);

	handled = x86_pmu.handle_irq(args->regs);
	if (!handled)
		return NOTIFY_DONE;

	this_nmi = percpu_read(irq_stat.__nmi_count);
	if ((handled > 1) ||
		/* the next nmi could be a back-to-back nmi */
	    ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
	     (__get_cpu_var(pmu_nmi).handled > 1))) {
		/*
		 * We could have two subsequent back-to-back nmis: The
		 * first handles more than one counter, the 2nd
		 * handles only one counter and the 3rd handles no
		 * counter.
		 *
		 * This is the 2nd nmi because the previous was
		 * handling more than one counter. We will mark the
		 * next (3rd) and then drop it if unhandled.
		 */
		__get_cpu_var(pmu_nmi).marked	= this_nmi + 1;
		__get_cpu_var(pmu_nmi).handled	= handled;
	}

	return NOTIFY_STOP;
}

static __read_mostly struct notifier_block perf_event_nmi_notifier = {
	.notifier_call		= perf_event_nmi_handler,
	.next			= NULL,
	.priority		= 1
};

static struct event_constraint unconstrained;
static struct event_constraint emptyconstraint;

static struct event_constraint *
x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
{
	struct event_constraint *c;

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if ((event->hw.config & c->cmask) == c->code)
				return c;
		}
	}

	return &unconstrained;
}

#include "perf_event_amd.c"
#include "perf_event_p6.c"
#include "perf_event_p4.c"
#include "perf_event_intel_lbr.c"
#include "perf_event_intel_ds.c"
#include "perf_event_intel.c"

static int __cpuinit
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;
	int ret = NOTIFY_OK;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		if (x86_pmu.cpu_prepare)
			ret = x86_pmu.cpu_prepare(cpu);
		break;

	case CPU_STARTING:
		if (x86_pmu.cpu_starting)
			x86_pmu.cpu_starting(cpu);
		break;

	case CPU_DYING:
		if (x86_pmu.cpu_dying)
			x86_pmu.cpu_dying(cpu);
		break;

	case CPU_UP_CANCELED:
	case CPU_DEAD:
		if (x86_pmu.cpu_dead)
			x86_pmu.cpu_dead(cpu);
		break;

	default:
		break;
	}

	return ret;
}

static void __init pmu_check_apic(void)
{
	if (cpu_has_apic)
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");
}

void __init init_hw_perf_events(void)
{
	struct event_constraint *c;
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return;
	}

	pmu_check_apic();

	/* sanity check that the hardware exists or is emulated */
	if (!check_hw_exists()) {
		pr_cont("Broken PMU hardware detected, software events only.\n");
		return;
	}

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	if (x86_pmu.quirks)
		x86_pmu.quirks();

	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
	}
	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;

	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
	}

	x86_pmu.intel_ctrl |=
		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;

	perf_events_lapic_init();
	register_die_notifier(&perf_event_nmi_notifier);

	unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
				   0, x86_pmu.num_counters);

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if (c->cmask != X86_RAW_EVENT_MASK)
				continue;

			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
			c->weight += x86_pmu.num_counters;
		}
	}

	pr_info("... version:                %d\n",     x86_pmu.version);
	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);

	perf_pmu_register(&pmu);
	perf_cpu_notifier(x86_pmu_notifier);
}

static inline void x86_pmu_read(struct perf_event *event)
{
	x86_perf_event_update(event);
}

/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 */
static void x86_pmu_start_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	perf_pmu_disable(pmu);
	cpuc->group_flag |= PERF_EVENT_TXN;
	cpuc->n_txn = 0;
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	cpuc->group_flag &= ~PERF_EVENT_TXN;
	/*
	 * Truncate the collected events.
	 */
	cpuc->n_added -= cpuc->n_txn;
	cpuc->n_events -= cpuc->n_txn;
	perf_pmu_enable(pmu);
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 */
static int x86_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int assign[X86_PMC_IDX_MAX];
	int n, ret;

	n = cpuc->n_events;

	if (!x86_pmu_initialized())
		return -EAGAIN;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		return ret;

	/*
	 * copy new assignment, now we know it is possible;
	 * it will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->group_flag &= ~PERF_EVENT_TXN;
	perf_pmu_enable(pmu);
	return 0;
}
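
/*
 * Added usage sketch (not in the original file): the core perf code
 * drives these hooks roughly as
 *
 *	pmu->start_txn(pmu);
 *	pmu->add(event1, 0);	// members collected, not yet scheduled
 *	pmu->add(event2, 0);
 *	if (pmu->commit_txn(pmu))
 *		pmu->cancel_txn(pmu);	// group does not fit, roll back
 *
 * so the schedulability of a whole group is tested exactly once.
 */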

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
	if (!fake_cpuc)
		return -ENOMEM;

	c = x86_pmu.get_event_constraints(fake_cpuc, event);

	if (!c || !c->weight)
		ret = -ENOSPC;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	kfree(fake_cpuc);

	return ret;
}

/*
 * validate a single event group
 *
 * validation includes:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret, n;

	ret = -ENOMEM;
	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
	if (!fake_cpuc)
		goto out;

	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	ret = -ENOSPC;
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out_free;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out_free;

	fake_cpuc->n_events = n;

	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

out_free:
	kfree(fake_cpuc);
out:
	return ret;
}

int x86_pmu_event_init(struct perf_event *event)
{
	struct pmu *tmp;
	int err;

	switch (event->attr.type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		break;

	default:
		return -ENOENT;
	}

	err = __x86_pmu_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
	}

	return err;
}

static struct pmu pmu = {
	.pmu_enable	= x86_pmu_enable,
	.pmu_disable	= x86_pmu_disable,

	.event_init	= x86_pmu_event_init,

	.add		= x86_pmu_add,
	.del		= x86_pmu_del,
	.start		= x86_pmu_start,
	.stop		= x86_pmu_stop,
	.read		= x86_pmu_read,

	.start_txn	= x86_pmu_start_txn,
	.cancel_txn	= x86_pmu_cancel_txn,
	.commit_txn	= x86_pmu_commit_txn,
};

/*
 * callchain support
 */

static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	/* Ignore warnings */
}

static void backtrace_warning(void *data, char *msg)
{
	/* Ignore warnings */
}

static int backtrace_stack(void *data, char *name)
{
	return 0;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	perf_callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.warning		= backtrace_warning,
	.warning_symbol		= backtrace_warning_symbol,
	.stack			= backtrace_stack,
	.address		= backtrace_address,
	.walk_stack		= print_context_stack_bp,
};

void
perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	perf_callchain_store(entry, regs->ip);

	dump_trace(NULL, regs, NULL, &backtrace_ops, entry);
}

#ifdef CONFIG_COMPAT
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	/* 32-bit process in 64-bit kernel. */
	struct stack_frame_ia32 frame;
	const void __user *fp;

1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702
	if (!test_thread_flag(TIF_IA32))
		return 0;

	fp = compat_ptr(regs->bp);
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame     = 0;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
			break;

		if (fp < compat_ptr(regs->sp))
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = compat_ptr(frame.next_frame);
	}
	return 1;
}
#else
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	return 0;
}
#endif

void
perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	struct stack_frame frame;
	const void __user *fp;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	fp = (void __user *)regs->bp;

	perf_callchain_store(entry, regs->ip);

	if (perf_callchain_user32(regs, entry))
		return;

	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame	     = NULL;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
			break;

		if ((unsigned long)fp < regs->sp)
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = frame.next_frame;
	}
}

unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	unsigned long ip;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
		ip = perf_guest_cbs->get_guest_ip();
	else
		ip = instruction_pointer(regs);

	return ip;
}

unsigned long perf_misc_flags(struct pt_regs *regs)
{
	int misc = 0;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		if (perf_guest_cbs->is_user_mode())
			misc |= PERF_RECORD_MISC_GUEST_USER;
		else
			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
	} else {
		if (user_mode(regs))
			misc |= PERF_RECORD_MISC_USER;
		else
			misc |= PERF_RECORD_MISC_KERNEL;
	}

	if (regs->flags & PERF_EFLAGS_EXACT)
		misc |= PERF_RECORD_MISC_EXACT_IP;

	return misc;
}