/*
 * Performance counter x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *
 *  For licensing details see kernel-base/COPYING
 */

#include <linux/perf_counter.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>

static u64 perf_counter_mask __read_mostly;

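/*
 * Per-CPU bookkeeping for the counters currently scheduled on this CPU:
 * which counter object sits in each hardware slot, which slots are
 * allocated (used_mask) and which are actually counting (active_mask).
 */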
struct cpu_hw_counters {
	struct perf_counter	*counters[X86_PMC_IDX_MAX];
	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		interrupts;
	int			enabled;
};

/*
 * struct x86_pmu - generic x86 pmu
 */
struct x86_pmu {
	const char	*name;
	int		version;
	int		(*handle_irq)(struct pt_regs *, int);
	void		(*disable_all)(void);
	void		(*enable_all)(void);
	void		(*enable)(struct hw_perf_counter *, int);
	void		(*disable)(struct hw_perf_counter *, int);
	unsigned	eventsel;
	unsigned	perfctr;
	u64		(*event_map)(int);
	u64		(*raw_event)(u64);
	int		max_events;
	int		num_counters;
	int		num_counters_fixed;
	int		counter_bits;
	u64		counter_mask;
	u64		max_period;
	u64		intel_ctrl;
};

static struct x86_pmu x86_pmu __read_mostly;

static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
	.enabled = 1,
};

/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
static const u64 intel_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]		= 0x003c,
  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
  [PERF_COUNT_CACHE_MISSES]		= 0x412e,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
  [PERF_COUNT_BUS_CYCLES]		= 0x013c,
};

static u64 intel_pmu_event_map(int event)
{
	return intel_perfmon_event_map[event];
}

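/*
 * Raw events bypass the generic event map: the user-supplied EVNTSEL
 * bits are passed through, restricted to the event-select, unit-mask,
 * edge, invert and counter-mask fields whitelisted below.
 */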
static u64 intel_pmu_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL

#define CORE_EVNTSEL_MASK 		\
	(CORE_EVNTSEL_EVENT_MASK |	\
	 CORE_EVNTSEL_UNIT_MASK  |	\
	 CORE_EVNTSEL_EDGE_MASK  |	\
	 CORE_EVNTSEL_INV_MASK  |	\
	 CORE_EVNTSEL_COUNTER_MASK)

	return event & CORE_EVNTSEL_MASK;
}

/*
 * AMD Performance Monitor K7 and later.
 */
static const u64 amd_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]		= 0x0076,
  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]		= 0x0080,
  [PERF_COUNT_CACHE_MISSES]		= 0x0081,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
};

static u64 amd_pmu_event_map(int event)
{
	return amd_perfmon_event_map[event];
}

static u64 amd_pmu_raw_event(u64 event)
{
#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
#define K7_EVNTSEL_INV_MASK	0x000800000ULL
#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL

#define K7_EVNTSEL_MASK			\
	(K7_EVNTSEL_EVENT_MASK |	\
	 K7_EVNTSEL_UNIT_MASK  |	\
	 K7_EVNTSEL_EDGE_MASK  |	\
	 K7_EVNTSEL_INV_MASK   |	\
	 K7_EVNTSEL_COUNTER_MASK)

	return event & K7_EVNTSEL_MASK;
}
/*
 * Propagate counter elapsed time into the generic counter.
 * Can only be executed on the CPU where the counter is active.
 * Returns the delta events processed.
 */
static u64
x86_perf_counter_update(struct perf_counter *counter,
			struct hw_perf_counter *hwc, int idx)
{
	int shift = 64 - x86_pmu.counter_bits;
	u64 prev_raw_count, new_raw_count;
	s64 delta;

	/*
	 * Careful: an NMI might modify the previous counter value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic counter atomically:
	 */
again:
	prev_raw_count = atomic64_read(&hwc->prev_count);
	rdmsrl(hwc->counter_base + idx, new_raw_count);

	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (counter-)time and add that to the generic counter.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	atomic64_add(delta, &counter->count);
	atomic64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

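/*
 * The counter MSRs are shared with the lapic NMI watchdog, so ownership
 * is arbitrated via the perfctr/evntsel reservation calls below;
 * active_counters makes sure the hardware is reserved on first use and
 * released when the last counter is destroyed.
 */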
static atomic_t active_counters;
static DEFINE_MUTEX(pmc_reserve_mutex);

static bool reserve_pmc_hardware(void)
{
	int i;

	if (nmi_watchdog == NMI_LOCAL_APIC)
		disable_lapic_nmi_watchdog();

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu.eventsel + i);

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu.perfctr + i);

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu.perfctr + i);
		release_evntsel_nmi(x86_pmu.eventsel + i);
	}

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();
}

static void hw_perf_counter_destroy(struct perf_counter *counter)
{
	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

/*
 * Setup the hardware configuration for a given hw_event_type
 */
static int __hw_perf_counter_init(struct perf_counter *counter)
{
	struct perf_counter_hw_event *hw_event = &counter->hw_event;
	struct hw_perf_counter *hwc = &counter->hw;
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = 0;
	if (!atomic_inc_not_zero(&active_counters)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
			err = -EBUSY;
		else
			atomic_inc(&active_counters);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	hwc->config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to.
	 */
	if (!hw_event->exclude_user)
		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!hw_event->exclude_kernel)
		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;

	/*
	 * If privileged enough, allow NMI events:
	 */
	hwc->nmi = 0;
	if (hw_event->nmi) {
		if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
			return -EACCES;
		hwc->nmi = 1;
	}

	if (!hwc->irq_period)
		hwc->irq_period = x86_pmu.max_period;

	atomic64_set(&hwc->period_left,
			min(x86_pmu.max_period, hwc->irq_period));

	/*
	 * Raw event types provide the config in the event structure
	 */
	if (perf_event_raw(hw_event)) {
		hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event));
	} else {
		if (perf_event_id(hw_event) >= x86_pmu.max_events)
			return -EINVAL;
		/*
		 * The generic map:
		 */
		hwc->config |= x86_pmu.event_map(perf_event_id(hw_event));
	}

	counter->destroy = hw_perf_counter_destroy;

	return 0;
}

static void intel_pmu_disable_all(void)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
}

static void amd_pmu_disable_all(void)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	int idx;

	if (!cpuc->enabled)
		return;

	cpuc->enabled = 0;
	/*
	 * ensure we write the disable before we start disabling the
	 * counters proper, so that amd_pmu_enable_counter() does the
	 * right thing.
	 */
	barrier();

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
	}
}

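/*
 * hw_perf_disable()/hw_perf_enable() are called by the generic perf
 * code to stop and restart all counters on the current CPU.
 */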
void hw_perf_disable(void)
{
	if (!x86_pmu_initialized())
		return;
	return x86_pmu.disable_all();
}

static void intel_pmu_enable_all(void)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
}

static void amd_pmu_enable_all(void)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	int idx;

	if (cpuc->enabled)
		return;

	cpuc->enabled = 1;
	barrier();

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
			continue;
		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
	}
}

void hw_perf_enable(void)
{
	if (!x86_pmu_initialized())
		return;
	x86_pmu.enable_all();
}

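/*
 * The global status MSR has one overflow bit per counter; writing the
 * same bit pattern to MSR_CORE_PERF_GLOBAL_OVF_CTRL acknowledges them.
 */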
static inline u64 intel_pmu_get_status(void)
{
	u64 status;

	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);

	return status;
}

static inline void intel_pmu_ack_status(u64 ack)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}

static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
	int err;
	err = checking_wrmsrl(hwc->config_base + idx,
			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
}

static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	int err;
	err = checking_wrmsrl(hwc->config_base + idx,
			      hwc->config);
}

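/*
 * Fixed-purpose counters are programmed via MSR_ARCH_PERFMON_FIXED_CTR_CTRL,
 * which packs one 4-bit control field per counter (ring-0/ring-3 enable
 * bits plus the PMI bit).
 */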
static inline void
intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, mask;
	int err;

	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static inline void
intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
		intel_pmu_disable_fixed(hwc, idx);
		return;
	}

	x86_pmu_disable_counter(hwc, idx);
}

static inline void
amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	x86_pmu_disable_counter(hwc, idx);
}

static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the counter disabled in hw:
 */
static void
x86_perf_counter_set_period(struct perf_counter *counter,
			     struct hw_perf_counter *hwc, int idx)
{
	s64 left = atomic64_read(&hwc->period_left);
	s64 period = min(x86_pmu.max_period, hwc->irq_period);
	int err;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		atomic64_set(&hwc->period_left, left);
	}

	if (unlikely(left <= 0)) {
		left += period;
		atomic64_set(&hwc->period_left, left);
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	per_cpu(prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw counter starts counting from this counter offset,
	 * mark it to be able to extract future deltas:
	 */
	atomic64_set(&hwc->prev_count, (u64)-left);

	err = checking_wrmsrl(hwc->counter_base + idx,
			     (u64)(-left) & x86_pmu.counter_mask);
}

static inline void
intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, bits, mask;
	int err;

	/*
	 * Enable IRQ generation (0x8),
	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
	 * if requested:
	 */
	bits = 0x8ULL;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
		bits |= 0x2;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
		bits |= 0x1;
	bits <<= (idx * 4);
	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	ctrl_val |= bits;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
		intel_pmu_enable_fixed(hwc, idx);
		return;
	}

	x86_pmu_enable_counter(hwc, idx);
}

static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

	if (cpuc->enabled)
		x86_pmu_enable_counter(hwc, idx);
	else
		x86_pmu_disable_counter(hwc, idx);
}

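/*
 * Map a generic counter onto one of the fixed-purpose counters, or
 * return -1 if it has to be scheduled on a generic counter.
 */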
static int
fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
	unsigned int event;

	if (!x86_pmu.num_counters_fixed)
		return -1;

	if (unlikely(hwc->nmi))
		return -1;

	event = hwc->config & ARCH_PERFMON_EVENT_MASK;

	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
		return X86_PMC_IDX_FIXED_CPU_CYCLES;
	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
		return X86_PMC_IDX_FIXED_BUS_CYCLES;

	return -1;
}

/*
 * Find a PMC slot for the freshly enabled / scheduled in counter:
 */
static int x86_pmu_enable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	int idx;

	idx = fixed_mode_idx(counter, hwc);
	if (idx >= 0) {
		/*
		 * Try to get the fixed counter, if that is already taken
		 * then try to get a generic counter:
		 */
		if (test_and_set_bit(idx, cpuc->used_mask))
			goto try_generic;

		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		/*
		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
		 */
		hwc->counter_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
		hwc->idx = idx;
	} else {
		idx = hwc->idx;
		/* Try to get the previous generic counter again */
		if (test_and_set_bit(idx, cpuc->used_mask)) {
try_generic:
			idx = find_first_zero_bit(cpuc->used_mask,
						  x86_pmu.num_counters);
			if (idx == x86_pmu.num_counters)
				return -EAGAIN;

			set_bit(idx, cpuc->used_mask);
			hwc->idx = idx;
		}
		hwc->config_base  = x86_pmu.eventsel;
		hwc->counter_base = x86_pmu.perfctr;
	}

	perf_counters_lapic_init(hwc->nmi);

	x86_pmu.disable(hwc, idx);

	cpuc->counters[idx] = counter;
	set_bit(idx, cpuc->active_mask);

	x86_perf_counter_set_period(counter, hwc, idx);
	x86_pmu.enable(hwc, idx);

	return 0;
}

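/*
 * Called by the generic code to re-enable a counter that was throttled
 * for raising interrupts too quickly.
 */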
static void x86_pmu_unthrottle(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;

	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
				cpuc->counters[hwc->idx] != counter))
		return;

	x86_pmu.enable(hwc, hwc->idx);
}

void perf_counter_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	struct cpu_hw_counters *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
	}
	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);

		prev_left = per_cpu(prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

static void x86_pmu_disable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	int idx = hwc->idx;

	/*
	 * Must be done before we disable, otherwise the nmi handler
	 * could reenable again:
	 */
	clear_bit(idx, cpuc->active_mask);
	x86_pmu.disable(hwc, idx);

	/*
	 * Make sure the cleared pointer becomes visible before we
	 * (potentially) free the counter:
	 */
	barrier();

	/*
	 * Drain the remaining delta count out of a counter
	 * that we are disabling:
	 */
	x86_perf_counter_update(counter, hwc, idx);
	cpuc->counters[idx] = NULL;
	clear_bit(idx, cpuc->used_mask);
}

/*
 * Save and restart an expired counter. Called by NMI contexts,
 * so it has to be careful about preempting normal counter ops:
 */
static void intel_pmu_save_and_restart(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;
	int idx = hwc->idx;

	x86_perf_counter_update(counter, hwc, idx);
	x86_perf_counter_set_period(counter, hwc, idx);

	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
		intel_pmu_enable_counter(hwc, idx);
}

/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
{
	struct cpu_hw_counters *cpuc;
	int bit, cpu, loops;
	u64 ack, status;

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	perf_disable();
	status = intel_pmu_get_status();
	if (!status) {
		perf_enable();
		return 0;
	}

	loops = 0;
again:
	if (++loops > 100) {
		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
		perf_counter_print_debug();
		return 1;
	}

	inc_irq_stat(apic_perf_irqs);
	ack = status;
	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
		struct perf_counter *counter = cpuc->counters[bit];

		clear_bit(bit, (unsigned long *) &status);
		if (!test_bit(bit, cpuc->active_mask))
			continue;

		intel_pmu_save_and_restart(counter);
		if (perf_counter_overflow(counter, nmi, regs, 0))
			intel_pmu_disable_counter(&counter->hw, bit);
	}

	intel_pmu_ack_status(ack);

	/*
	 * Repeat if there is more work to be done:
	 */
	status = intel_pmu_get_status();
	if (status)
		goto again;

	perf_enable();

	return 1;
}

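/*
 * AMD has no global overflow status register, so scan every active
 * counter: a raw value with the top (counter_bits - 1) bit still set
 * has not overflowed yet, while a clear top bit means the counter
 * wrapped and its period needs to be restarted.
 */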
static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
{
	int cpu, idx, handled = 0;
	struct cpu_hw_counters *cpuc;
	struct perf_counter *counter;
	struct hw_perf_counter *hwc;
	u64 val;

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask))
			continue;

		counter = cpuc->counters[idx];
		hwc = &counter->hw;

		if (counter->hw_event.nmi != nmi)
			continue;

		val = x86_perf_counter_update(counter, hwc, idx);
		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
			continue;

		/* counter overflow */
		x86_perf_counter_set_period(counter, hwc, idx);
		handled = 1;
		inc_irq_stat(apic_perf_irqs);
		if (perf_counter_overflow(counter, nmi, regs, 0))
			amd_pmu_disable_counter(hwc, idx);
	}

	return handled;
}

void smp_perf_counter_interrupt(struct pt_regs *regs)
{
	irq_enter();
	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
	ack_APIC_irq();
	x86_pmu.handle_irq(regs, 0);
	irq_exit();
}

void smp_perf_pending_interrupt(struct pt_regs *regs)
{
	irq_enter();
	ack_APIC_irq();
	inc_irq_stat(apic_pending_irqs);
	perf_counter_do_pending();
	irq_exit();
}

void set_perf_counter_pending(void)
{
	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
}

void perf_counters_lapic_init(int nmi)
{
	u32 apic_val;

	if (!x86_pmu_initialized())
		return;

	/*
	 * Enable the performance counter vector in the APIC LVT:
	 */
	apic_val = apic_read(APIC_LVTERR);

	apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
	if (nmi)
		apic_write(APIC_LVTPC, APIC_DM_NMI);
	else
		apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
	apic_write(APIC_LVTERR, apic_val);
}

static int __kprobes
perf_counter_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	struct pt_regs *regs;

	if (!atomic_read(&active_counters))
		return NOTIFY_DONE;

	switch (cmd) {
	case DIE_NMI:
	case DIE_NMI_IPI:
		break;

	default:
		return NOTIFY_DONE;
	}

	regs = args->regs;

	apic_write(APIC_LVTPC, APIC_DM_NMI);
	/*
	 * Can't rely on the handled return value to say it was our NMI, two
	 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
	 *
	 * If the first NMI handles both, the latter will be empty and daze
	 * the CPU.
	 */
	x86_pmu.handle_irq(regs, 1);

	return NOTIFY_STOP;
}

static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
	.notifier_call		= perf_counter_nmi_handler,
	.next			= NULL,
	.priority		= 1
};

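/*
 * Boot-time templates for the two supported PMU flavours; the
 * *_pmu_init() routines below copy one of these into x86_pmu (and, for
 * Intel, fill in the CPUID-discovered counter geometry).
 */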
static struct x86_pmu intel_pmu = {
	.name			= "Intel",
	.handle_irq		= intel_pmu_handle_irq,
	.disable_all		= intel_pmu_disable_all,
	.enable_all		= intel_pmu_enable_all,
	.enable			= intel_pmu_enable_counter,
	.disable		= intel_pmu_disable_counter,
	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
	.event_map		= intel_pmu_event_map,
	.raw_event		= intel_pmu_raw_event,
	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
	/*
	 * Intel PMCs cannot be accessed sanely above 32 bit width,
	 * so we install an artificial 1<<31 period regardless of
	 * the generic counter period:
	 */
	.max_period		= (1ULL << 31) - 1,
};

static struct x86_pmu amd_pmu = {
	.name			= "AMD",
	.handle_irq		= amd_pmu_handle_irq,
	.disable_all		= amd_pmu_disable_all,
	.enable_all		= amd_pmu_enable_all,
	.enable			= amd_pmu_enable_counter,
	.disable		= amd_pmu_disable_counter,
	.eventsel		= MSR_K7_EVNTSEL0,
	.perfctr		= MSR_K7_PERFCTR0,
	.event_map		= amd_pmu_event_map,
	.raw_event		= amd_pmu_raw_event,
	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
	.num_counters		= 4,
	.counter_bits		= 48,
	.counter_mask		= (1ULL << 48) - 1,
	/* use highest bit to detect overflow */
	.max_period		= (1ULL << 47) - 1,
};

static int intel_pmu_init(void)
{
	union cpuid10_edx edx;
	union cpuid10_eax eax;
	unsigned int unused;
	unsigned int ebx;
	int version;

	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
		return -ENODEV;

	/*
	 * Check whether the Architectural PerfMon supports
	 * Branch Misses Retired Event or not.
	 */
	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
		return -ENODEV;

	version = eax.split.version_id;
	if (version < 2)
		return -ENODEV;

	x86_pmu = intel_pmu;
	x86_pmu.version = version;
	x86_pmu.num_counters = eax.split.num_counters;

	/*
	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
	 * assume at least 3 counters:
	 */
	x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);

	x86_pmu.counter_bits = eax.split.bit_width;
	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;

	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);

	return 0;
}

static int amd_pmu_init(void)
{
	x86_pmu = amd_pmu;
	return 0;
}

void __init init_hw_perf_counters(void)
{
	int err;

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return;
	}
	if (err != 0)
		return;

	pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
	pr_info("... version:         %d\n", x86_pmu.version);
	pr_info("... bit width:       %d\n", x86_pmu.counter_bits);

	pr_info("... num counters:    %d\n", x86_pmu.num_counters);
	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
	}
	perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
	perf_max_counters = x86_pmu.num_counters;

	pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
	pr_info("... max period:      %016Lx\n", x86_pmu.max_period);

	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
	}
	pr_info("... fixed counters:  %d\n", x86_pmu.num_counters_fixed);

	perf_counter_mask |=
		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;

	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);

	perf_counters_lapic_init(0);
	register_die_notifier(&perf_counter_nmi_notifier);
}

static inline void x86_pmu_read(struct perf_counter *counter)
{
	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
}

static const struct pmu pmu = {
	.enable		= x86_pmu_enable,
	.disable	= x86_pmu_disable,
	.read		= x86_pmu_read,
	.unthrottle	= x86_pmu_unthrottle,
};

const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
{
	int err;

	err = __hw_perf_counter_init(counter);
	if (err)
		return ERR_PTR(err);

	return &pmu;
}

/*
 * callchain support
 */

static inline
void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
{
	if (entry->nr < MAX_STACK_DEPTH)
		entry->ip[entry->nr++] = ip;
}

static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);


static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	/* Ignore warnings */
}

static void backtrace_warning(void *data, char *msg)
{
	/* Ignore warnings */
}

static int backtrace_stack(void *data, char *name)
{
	/* Don't bother with IRQ stacks for now */
	return -1;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	if (reliable)
		callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.warning		= backtrace_warning,
	.warning_symbol		= backtrace_warning_symbol,
	.stack			= backtrace_stack,
	.address		= backtrace_address,
};

static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	unsigned long bp;
	char *stack;
	int nr = entry->nr;

	callchain_store(entry, instruction_pointer(regs));

	stack = ((char *)regs + sizeof(struct pt_regs));
#ifdef CONFIG_FRAME_POINTER
	bp = frame_pointer(regs);
#else
	bp = 0;
#endif

	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);

	entry->kernel = entry->nr - nr;
}


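/*
 * User-space callchains are sampled by walking the frame-pointer chain
 * on the user stack; each frame is copied with pagefaults disabled
 * since this can run from NMI context.
 */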
struct stack_frame {
	const void __user	*next_fp;
	unsigned long		return_address;
};

static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
{
	int ret;

	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
		return 0;

	ret = 1;
	pagefault_disable();
	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
		ret = 0;
	pagefault_enable();

	return ret;
}

static void
perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	struct stack_frame frame;
	const void __user *fp;
1163
	int nr = entry->nr;
1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182

	regs = (struct pt_regs *)current->thread.sp0 - 1;
	fp   = (void __user *)regs->bp;

	callchain_store(entry, regs->ip);

	while (entry->nr < MAX_STACK_DEPTH) {
		frame.next_fp	     = NULL;
		frame.return_address = 0;

		if (!copy_stack_frame(fp, &frame))
			break;

		if ((unsigned long)fp < user_stack_pointer(regs))
			break;

		callchain_store(entry, frame.return_address);
		fp = frame.next_fp;
	}
1183 1184

	entry->user = entry->nr - nr;
1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219
}

static void
perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	int is_user;

	if (!regs)
		return;

	is_user = user_mode(regs);

	if (!current || current->pid == 0)
		return;

	if (is_user && current->state != TASK_RUNNING)
		return;

	if (!is_user)
		perf_callchain_kernel(regs, entry);

	if (current->mm)
		perf_callchain_user(regs, entry);
}

struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
	struct perf_callchain_entry *entry;

	if (in_nmi())
		entry = &__get_cpu_var(nmi_entry);
	else
		entry = &__get_cpu_var(irq_entry);

	entry->nr = 0;
1220 1221 1222
	entry->hv = 0;
	entry->kernel = 0;
	entry->user = 0;
1223 1224 1225 1226 1227

	perf_do_callchain(regs, entry);

	return entry;
}