/*
 * Performance counter x86 architecture code
 *
 *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2009 Jaswinder Singh Rajput
 *  Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_counter.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
17
#include <linux/module.h>
I
Ingo Molnar 已提交
18 19
#include <linux/kdebug.h>
#include <linux/sched.h>
20
#include <linux/uaccess.h>
I
Ingo Molnar 已提交
21 22

#include <asm/apic.h>
23
#include <asm/stacktrace.h>
P
Peter Zijlstra 已提交
24
#include <asm/nmi.h>
I
Ingo Molnar 已提交
25

26
static u64 perf_counter_mask __read_mostly;
27

I
Ingo Molnar 已提交
28
struct cpu_hw_counters {
29 30
	struct perf_counter	*counters[X86_PMC_IDX_MAX];
	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
31
	unsigned long		active[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
32
	unsigned long		interrupts;
33 34
	u64			throttle_ctrl;
	int			enabled;
I
Ingo Molnar 已提交
35 36 37
};

/*
38
 * struct x86_pmu - generic x86 pmu
I
Ingo Molnar 已提交
39
 */
40
struct x86_pmu {
41 42
	const char	*name;
	int		version;
43
	int		(*handle_irq)(struct pt_regs *, int);
44
	u64		(*save_disable_all)(void);
45
	void		(*restore_all)(u64);
46
	void		(*enable)(struct hw_perf_counter *, int);
47
	void		(*disable)(struct hw_perf_counter *, int);
48 49
	unsigned	eventsel;
	unsigned	perfctr;
50 51
	u64		(*event_map)(int);
	u64		(*raw_event)(u64);
52
	int		max_events;
53 54 55 56
	int		num_counters;
	int		num_counters_fixed;
	int		counter_bits;
	u64		counter_mask;
57
	u64		max_period;
58 59
};

60
/* Active PMU description; assigned from intel_pmu or amd_pmu by the vendor init routine. */
static struct x86_pmu x86_pmu __read_mostly;
61

62 63 64
/* Per-CPU counter state; the software enable flag starts out set. */
static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
	.enabled = 1,
};
I
Ingo Molnar 已提交
65

66 67 68
/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
69
static const u64 intel_perfmon_event_map[] =
I
Ingo Molnar 已提交
70
{
71
  [PERF_COUNT_CPU_CYCLES]		= 0x003c,
I
Ingo Molnar 已提交
72 73 74 75 76
  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
  [PERF_COUNT_CACHE_MISSES]		= 0x412e,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
77
  [PERF_COUNT_BUS_CYCLES]		= 0x013c,
I
Ingo Molnar 已提交
78 79
};

80
static u64 intel_pmu_event_map(int event)
81 82 83
{
	return intel_perfmon_event_map[event];
}
I
Ingo Molnar 已提交
84

85
/*
 * Reduce a raw event config to the event, unit-mask and counter-mask
 * fields; every other bit is dropped.
 */
static u64 intel_pmu_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL

#define CORE_EVNTSEL_MASK 		\
	(CORE_EVNTSEL_EVENT_MASK |	\
	 CORE_EVNTSEL_UNIT_MASK  |	\
	 CORE_EVNTSEL_COUNTER_MASK)

	return event & CORE_EVNTSEL_MASK;
}

99 100 101
/*
 * AMD Performance Monitor K7 and later.
 */
102
static const u64 amd_perfmon_event_map[] =
103 104 105 106 107 108 109 110 111
{
  [PERF_COUNT_CPU_CYCLES]		= 0x0076,
  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]		= 0x0080,
  [PERF_COUNT_CACHE_MISSES]		= 0x0081,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
};

112
static u64 amd_pmu_event_map(int event)
113 114 115 116
{
	return amd_perfmon_event_map[event];
}

117
/*
 * Reduce a raw event config to the event, unit-mask and counter-mask
 * fields; every other bit is dropped.
 */
static u64 amd_pmu_raw_event(u64 event)
{
#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL

#define K7_EVNTSEL_MASK			\
	(K7_EVNTSEL_EVENT_MASK |	\
	 K7_EVNTSEL_UNIT_MASK  |	\
	 K7_EVNTSEL_COUNTER_MASK)

	return event & K7_EVNTSEL_MASK;
}

131 132 133 134 135
/*
 * Propagate counter elapsed time into the generic counter.
 * Can only be executed on the CPU where the counter is active.
 * Returns the delta events processed.
 */
136
static u64
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
x86_perf_counter_update(struct perf_counter *counter,
			struct hw_perf_counter *hwc, int idx)
{
	u64 prev_raw_count, new_raw_count, delta;

	/*
	 * Careful: an NMI might modify the previous counter value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic counter atomically:
	 */
again:
	prev_raw_count = atomic64_read(&hwc->prev_count);
	rdmsrl(hwc->counter_base + idx, new_raw_count);

	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (counter-)time and add that to the generic counter.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count, so we do that by clipping the delta to 32 bits:
	 */
	delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);

	atomic64_add(delta, &counter->count);
	atomic64_sub(delta, &hwc->period_left);
169 170

	return new_raw_count;
171 172
}

P
Peter Zijlstra 已提交
173 174 175 176 177 178 179 180 181 182
/* Reference count consulted when reserving and releasing the PMC hardware. */
static atomic_t num_counters;
/* Held while the PMC hardware is being reserved or released. */
static DEFINE_MUTEX(pmc_reserve_mutex);

static bool reserve_pmc_hardware(void)
{
	int i;

	if (nmi_watchdog == NMI_LOCAL_APIC)
		disable_lapic_nmi_watchdog();

183
	for (i = 0; i < x86_pmu.num_counters; i++) {
184
		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
P
Peter Zijlstra 已提交
185 186 187
			goto perfctr_fail;
	}

188
	for (i = 0; i < x86_pmu.num_counters; i++) {
189
		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
P
Peter Zijlstra 已提交
190 191 192 193 194 195 196
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
197
		release_evntsel_nmi(x86_pmu.eventsel + i);
P
Peter Zijlstra 已提交
198

199
	i = x86_pmu.num_counters;
P
Peter Zijlstra 已提交
200 201 202

perfctr_fail:
	for (i--; i >= 0; i--)
203
		release_perfctr_nmi(x86_pmu.perfctr + i);
P
Peter Zijlstra 已提交
204 205 206 207 208 209 210 211 212 213 214

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

215
	for (i = 0; i < x86_pmu.num_counters; i++) {
216 217
		release_perfctr_nmi(x86_pmu.perfctr + i);
		release_evntsel_nmi(x86_pmu.eventsel + i);
P
Peter Zijlstra 已提交
218 219 220 221 222 223 224 225 226 227 228 229 230 231
	}

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();
}

/*
 * Destroy callback of a hardware counter: drop one reference on the PMC
 * hardware and release the hardware when that was the last one.
 */
static void hw_perf_counter_destroy(struct perf_counter *counter)
{
	/* Nothing more to do unless the count reached zero (mutex then held). */
	if (!atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex))
		return;

	release_pmc_hardware();
	mutex_unlock(&pmc_reserve_mutex);
}

232 233 234 235 236
/*
 * Non-zero once a vendor PMU description has been installed, which is
 * detected by the interrupt handler pointer being set.
 */
static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq ? 1 : 0;
}

I
Ingo Molnar 已提交
237 238 239
/*
 * Setup the hardware configuration for a given hw_event_type
 */
I
Ingo Molnar 已提交
240
static int __hw_perf_counter_init(struct perf_counter *counter)
I
Ingo Molnar 已提交
241
{
I
Ingo Molnar 已提交
242
	struct perf_counter_hw_event *hw_event = &counter->hw_event;
I
Ingo Molnar 已提交
243
	struct hw_perf_counter *hwc = &counter->hw;
P
Peter Zijlstra 已提交
244
	int err;
I
Ingo Molnar 已提交
245

246 247
	if (!x86_pmu_initialized())
		return -ENODEV;
I
Ingo Molnar 已提交
248

P
Peter Zijlstra 已提交
249 250 251 252 253 254 255 256 257 258 259 260
	err = 0;
	if (atomic_inc_not_zero(&num_counters)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware())
			err = -EBUSY;
		else
			atomic_inc(&num_counters);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

I
Ingo Molnar 已提交
261
	/*
262
	 * Generate PMC IRQs:
I
Ingo Molnar 已提交
263 264
	 * (keep 'enabled' bit clear for now)
	 */
265
	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
I
Ingo Molnar 已提交
266 267

	/*
268
	 * Count user and OS events unless requested not to.
I
Ingo Molnar 已提交
269
	 */
270 271 272
	if (!hw_event->exclude_user)
		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!hw_event->exclude_kernel)
I
Ingo Molnar 已提交
273
		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
274 275 276 277 278 279 280

	/*
	 * If privileged enough, allow NMI events:
	 */
	hwc->nmi = 0;
	if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
		hwc->nmi = 1;
I
Ingo Molnar 已提交
281

I
Ingo Molnar 已提交
282
	hwc->irq_period		= hw_event->irq_period;
283 284
	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
		hwc->irq_period = x86_pmu.max_period;
I
Ingo Molnar 已提交
285

286
	atomic64_set(&hwc->period_left, hwc->irq_period);
I
Ingo Molnar 已提交
287 288

	/*
289
	 * Raw event type provide the config in the event structure
I
Ingo Molnar 已提交
290
	 */
291
	if (perf_event_raw(hw_event)) {
292
		hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event));
I
Ingo Molnar 已提交
293
	} else {
294
		if (perf_event_id(hw_event) >= x86_pmu.max_events)
I
Ingo Molnar 已提交
295 296 297 298
			return -EINVAL;
		/*
		 * The generic map:
		 */
299
		hwc->config |= x86_pmu.event_map(perf_event_id(hw_event));
I
Ingo Molnar 已提交
300 301
	}

P
Peter Zijlstra 已提交
302 303
	counter->destroy = hw_perf_counter_destroy;

I
Ingo Molnar 已提交
304 305 306
	return 0;
}

307
static u64 intel_pmu_save_disable_all(void)
308 309 310 311
{
	u64 ctrl;

	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
312
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
313

314
	return ctrl;
I
Ingo Molnar 已提交
315
}
316

317
static u64 amd_pmu_save_disable_all(void)
318
{
319 320 321 322 323
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	int enabled, idx;

	enabled = cpuc->enabled;
	cpuc->enabled = 0;
324 325
	/*
	 * ensure we write the disable before we start disabling the
326 327
	 * counters proper, so that amd_pmu_enable_counter() does the
	 * right thing.
328
	 */
329
	barrier();
330

331
	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
332 333
		u64 val;

334
		if (!test_bit(idx, cpuc->active))
335
			continue;
336
		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
337 338 339 340
		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
341 342
	}

343
	return enabled;
344 345
}

346 347
u64 hw_perf_save_disable(void)
{
348
	if (!x86_pmu_initialized())
349
		return 0;
350
	return x86_pmu.save_disable_all();
351
}
352 353 354
/*
 * Exported because of ACPI idle
 */
355
EXPORT_SYMBOL_GPL(hw_perf_save_disable);
I
Ingo Molnar 已提交
356

357
/*
 * Write a previously saved value back into the global control MSR.
 */
static void intel_pmu_restore_all(u64 ctrl)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
}

362
/*
 * Restore this CPU's software enable flag to @ctrl and, when it is
 * non-zero, set the enable bit in the event select of every active
 * counter.
 */
static void amd_pmu_restore_all(u64 ctrl)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	int idx;

	cpuc->enabled = ctrl;
	barrier();
	if (!ctrl)
		return;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active))
			continue;
		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
		/* already enabled, skip the write */
		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
			continue;
		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
	}
}

385 386
/*
 * Restore the counter enable state saved by hw_perf_save_disable().
 * Does nothing when no PMU has been initialized.
 */
void hw_perf_restore(u64 ctrl)
{
	if (!x86_pmu_initialized())
		return;
	x86_pmu.restore_all(ctrl);
}
/*
 * Exported because of ACPI idle
 */
EXPORT_SYMBOL_GPL(hw_perf_restore);

396
/*
 * Read the global status MSR.
 * @mask is accepted but not used.
 */
static inline u64 intel_pmu_get_status(u64 mask)
{
	u64 status;

	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);

	return status;
}

405
/*
 * Write @ack to the global overflow control MSR.
 */
static inline void intel_pmu_ack_status(u64 ack)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}

410
static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
411
{
412 413 414
	int err;
	err = checking_wrmsrl(hwc->config_base + idx,
			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
415 416
}

417
static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
418
{
419 420 421
	int err;
	err = checking_wrmsrl(hwc->config_base + idx,
			      hwc->config);
422 423
}

424
static inline void
425
intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
426 427 428 429 430 431 432 433 434 435 436 437
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, mask;
	int err;

	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

438
static inline void
439
intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
440
{
441 442 443 444 445 446 447 448 449 450 451 452
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
		intel_pmu_disable_fixed(hwc, idx);
		return;
	}

	x86_pmu_disable_counter(hwc, idx);
}

/*
 * Disable one AMD counter; this is just the generic event-select write.
 */
static inline void
amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	x86_pmu_disable_counter(hwc, idx);
}

455
/* Last period programmed into each slot; printed by perf_counter_print_debug(). */
static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
I
Ingo Molnar 已提交
456

457 458 459 460 461
/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the counter disabled in hw:
 */
static void
462
x86_perf_counter_set_period(struct perf_counter *counter,
463
			     struct hw_perf_counter *hwc, int idx)
I
Ingo Molnar 已提交
464
{
465
	s64 left = atomic64_read(&hwc->period_left);
466
	s64 period = hwc->irq_period;
467
	int err;
468 469 470 471 472 473 474 475 476 477 478 479 480

	/*
	 * If we are way outside a reasoable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		atomic64_set(&hwc->period_left, left);
	}

	if (unlikely(left <= 0)) {
		left += period;
		atomic64_set(&hwc->period_left, left);
	}
I
Ingo Molnar 已提交
481

482 483 484 485 486 487
	per_cpu(prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw counter starts counting from this counter offset,
	 * mark it to be able to extra future deltas:
	 */
488
	atomic64_set(&hwc->prev_count, (u64)-left);
489

490
	err = checking_wrmsrl(hwc->counter_base + idx,
491
			     (u64)(-left) & x86_pmu.counter_mask);
492 493 494
}

static inline void
495
intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
496 497 498 499 500 501
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, bits, mask;
	int err;

	/*
502 503 504
	 * Enable IRQ generation (0x8),
	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
	 * if requested:
505
	 */
506 507 508
	bits = 0x8ULL;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
		bits |= 0x2;
509 510 511 512 513 514 515 516 517
	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
		bits |= 0x1;
	bits <<= (idx * 4);
	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	ctrl_val |= bits;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
518 519
}

520
static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
521
{
522 523 524 525 526 527 528 529 530 531 532 533 534 535
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
		intel_pmu_enable_fixed(hwc, idx);
		return;
	}

	x86_pmu_enable_counter(hwc, idx);
}

/*
 * Enable one AMD counter, honouring the per-CPU software enable flag:
 * while the flag is clear the config is written without the enable bit.
 */
static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

	if (cpuc->enabled)
		x86_pmu_enable_counter(hwc, idx);
	else
		x86_pmu_disable_counter(hwc, idx);
}

540 541
static int
fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
542
{
543 544
	unsigned int event;

545 546 547
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		return -1;

548 549 550 551 552
	if (unlikely(hwc->nmi))
		return -1;

	event = hwc->config & ARCH_PERFMON_EVENT_MASK;

553
	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
554
		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
555
	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
556
		return X86_PMC_IDX_FIXED_CPU_CYCLES;
557
	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
558 559
		return X86_PMC_IDX_FIXED_BUS_CYCLES;

560 561 562
	return -1;
}

563 564 565
/*
 * Find a PMC slot for the freshly enabled / scheduled in counter:
 */
566
static int x86_pmu_enable(struct perf_counter *counter)
I
Ingo Molnar 已提交
567 568 569
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
570
	int idx;
I
Ingo Molnar 已提交
571

572 573 574 575 576 577 578 579
	idx = fixed_mode_idx(counter, hwc);
	if (idx >= 0) {
		/*
		 * Try to get the fixed counter, if that is already taken
		 * then try to get a generic counter:
		 */
		if (test_and_set_bit(idx, cpuc->used))
			goto try_generic;
580

581 582 583 584 585 586 587
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		/*
		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
		 */
		hwc->counter_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
I
Ingo Molnar 已提交
588
		hwc->idx = idx;
589 590 591 592 593
	} else {
		idx = hwc->idx;
		/* Try to get the previous generic counter again */
		if (test_and_set_bit(idx, cpuc->used)) {
try_generic:
594 595 596
			idx = find_first_zero_bit(cpuc->used,
						  x86_pmu.num_counters);
			if (idx == x86_pmu.num_counters)
597 598 599 600 601
				return -EAGAIN;

			set_bit(idx, cpuc->used);
			hwc->idx = idx;
		}
602 603
		hwc->config_base  = x86_pmu.eventsel;
		hwc->counter_base = x86_pmu.perfctr;
I
Ingo Molnar 已提交
604 605 606 607
	}

	perf_counters_lapic_init(hwc->nmi);

608
	x86_pmu.disable(hwc, idx);
I
Ingo Molnar 已提交
609

610
	cpuc->counters[idx] = counter;
611
	set_bit(idx, cpuc->active);
612

613
	x86_perf_counter_set_period(counter, hwc, idx);
614
	x86_pmu.enable(hwc, idx);
615 616

	return 0;
I
Ingo Molnar 已提交
617 618 619 620
}

void perf_counter_print_debug(void)
{
621
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
622
	struct cpu_hw_counters *cpuc;
623 624
	int cpu, idx;

625
	if (!x86_pmu.num_counters)
626
		return;
I
Ingo Molnar 已提交
627 628 629 630

	local_irq_disable();

	cpu = smp_processor_id();
631
	cpuc = &per_cpu(cpu_hw_counters, cpu);
I
Ingo Molnar 已提交
632

633
	if (x86_pmu.version >= 2) {
634 635 636 637 638 639 640 641 642 643
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
644
	}
645
	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
I
Ingo Molnar 已提交
646

647
	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
648 649
		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
I
Ingo Molnar 已提交
650

651
		prev_left = per_cpu(prev_left[idx], cpu);
I
Ingo Molnar 已提交
652

653
		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
I
Ingo Molnar 已提交
654
			cpu, idx, pmc_ctrl);
655
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
I
Ingo Molnar 已提交
656
			cpu, idx, pmc_count);
657
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
658
			cpu, idx, prev_left);
I
Ingo Molnar 已提交
659
	}
660
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
661 662
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

663
		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
664 665
			cpu, idx, pmc_count);
	}
I
Ingo Molnar 已提交
666 667 668
	local_irq_enable();
}

669
static void x86_pmu_disable(struct perf_counter *counter)
I
Ingo Molnar 已提交
670 671 672
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
673
	int idx = hwc->idx;
I
Ingo Molnar 已提交
674

675 676 677 678 679
	/*
	 * Must be done before we disable, otherwise the nmi handler
	 * could reenable again:
	 */
	clear_bit(idx, cpuc->active);
680
	x86_pmu.disable(hwc, idx);
I
Ingo Molnar 已提交
681

682 683 684 685
	/*
	 * Make sure the cleared pointer becomes visible before we
	 * (potentially) free the counter:
	 */
686
	barrier();
I
Ingo Molnar 已提交
687

688 689 690 691 692
	/*
	 * Drain the remaining delta count out of a counter
	 * that we are disabling:
	 */
	x86_perf_counter_update(counter, hwc, idx);
693 694
	cpuc->counters[idx] = NULL;
	clear_bit(idx, cpuc->used);
I
Ingo Molnar 已提交
695 696
}

697
/*
698 699
 * Save and restart an expired counter. Called by NMI contexts,
 * so it has to be careful about preempting normal counter ops:
700
 */
701
static void intel_pmu_save_and_restart(struct perf_counter *counter)
I
Ingo Molnar 已提交
702 703 704 705
{
	struct hw_perf_counter *hwc = &counter->hw;
	int idx = hwc->idx;

706
	x86_perf_counter_update(counter, hwc, idx);
707
	x86_perf_counter_set_period(counter, hwc, idx);
708

709
	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
710
		intel_pmu_enable_counter(hwc, idx);
I
Ingo Molnar 已提交
711 712
}

713 714 715
/*
 * Maximum interrupt frequency of 100KHz per CPU
 *
 * The per-CPU interrupt count is compared against this limit by the
 * interrupt handlers and reset by perf_counter_unthrottle().
 */
#define PERFMON_MAX_INTERRUPTS (100000/HZ)
717

I
Ingo Molnar 已提交
718 719 720 721
/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
722
static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
I
Ingo Molnar 已提交
723 724
{
	int bit, cpu = smp_processor_id();
725
	u64 ack, status;
726
	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
727
	int ret = 0;
728

729
	cpuc->throttle_ctrl = intel_pmu_save_disable_all();
I
Ingo Molnar 已提交
730

731
	status = intel_pmu_get_status(cpuc->throttle_ctrl);
732 733 734
	if (!status)
		goto out;

735
	ret = 1;
I
Ingo Molnar 已提交
736
again:
737
	inc_irq_stat(apic_perf_irqs);
I
Ingo Molnar 已提交
738
	ack = status;
739
	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
740
		struct perf_counter *counter = cpuc->counters[bit];
I
Ingo Molnar 已提交
741 742

		clear_bit(bit, (unsigned long *) &status);
743
		if (!test_bit(bit, cpuc->active))
I
Ingo Molnar 已提交
744 745
			continue;

746
		intel_pmu_save_and_restart(counter);
747
		if (perf_counter_overflow(counter, nmi, regs, 0))
748
			intel_pmu_disable_counter(&counter->hw, bit);
I
Ingo Molnar 已提交
749 750
	}

751
	intel_pmu_ack_status(ack);
I
Ingo Molnar 已提交
752 753 754 755

	/*
	 * Repeat if there is more work to be done:
	 */
756
	status = intel_pmu_get_status(cpuc->throttle_ctrl);
I
Ingo Molnar 已提交
757 758
	if (status)
		goto again;
759
out:
I
Ingo Molnar 已提交
760
	/*
761
	 * Restore - do not reenable when global enable is off or throttled:
I
Ingo Molnar 已提交
762
	 */
763
	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
764
		intel_pmu_restore_all(cpuc->throttle_ctrl);
765 766

	return ret;
767 768
}

769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784
/*
 * AMD overflow handler. Polls every active counter: a counter whose top
 * implemented bit is clear after the update is treated as overflowed,
 * gets a new period and is reported through perf_counter_overflow().
 *
 * Returns 1 when at least one counter overflowed, 0 otherwise.
 */
static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
{
	int cpu = smp_processor_id();
	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
	u64 val;
	int handled = 0;
	struct perf_counter *counter;
	struct hw_perf_counter *hwc;
	int idx;

	++cpuc->interrupts;
	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active))
			continue;
		counter = cpuc->counters[idx];
		hwc = &counter->hw;
		val = x86_perf_counter_update(counter, hwc, idx);
		/* top bit still set: this counter has not overflowed */
		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
			continue;
		/* counter overflow */
		x86_perf_counter_set_period(counter, hwc, idx);
		handled = 1;
		inc_irq_stat(apic_perf_irqs);
		if (perf_counter_overflow(counter, nmi, regs, 0))
			amd_pmu_disable_counter(hwc, idx);
		else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
			/*
			 * do not reenable when throttled, but reload
			 * the register
			 */
			amd_pmu_disable_counter(hwc, idx);
		else if (counter->state == PERF_COUNTER_STATE_ACTIVE)
			amd_pmu_enable_counter(hwc, idx);
	}
	return handled;
}
805

806 807 808 809
void perf_counter_unthrottle(void)
{
	struct cpu_hw_counters *cpuc;

810
	if (!x86_pmu_initialized())
811 812
		return;

813
	cpuc = &__get_cpu_var(cpu_hw_counters);
814
	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
815
		if (printk_ratelimit())
816
			printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
817
		hw_perf_restore(cpuc->throttle_ctrl);
818
	}
819
	cpuc->interrupts = 0;
I
Ingo Molnar 已提交
820 821 822 823 824 825
}

void smp_perf_counter_interrupt(struct pt_regs *regs)
{
	irq_enter();
	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
826
	ack_APIC_irq();
827
	x86_pmu.handle_irq(regs, 0);
I
Ingo Molnar 已提交
828 829 830
	irq_exit();
}

831 832 833 834 835 836 837 838 839 840 841 842 843 844
/*
 * Entry point for the "pending work" vector raised by
 * set_perf_counter_pending(): acknowledge the APIC, account the
 * interrupt and run perf_counter_do_pending().
 */
void smp_perf_pending_interrupt(struct pt_regs *regs)
{
	irq_enter();
	ack_APIC_irq();
	inc_irq_stat(apic_pending_irqs);
	perf_counter_do_pending();
	irq_exit();
}

/*
 * Send the pending-work vector to the current CPU as a self-IPI.
 */
void set_perf_counter_pending(void)
{
	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
}

845
void perf_counters_lapic_init(int nmi)
I
Ingo Molnar 已提交
846 847 848
{
	u32 apic_val;

849
	if (!x86_pmu_initialized())
I
Ingo Molnar 已提交
850
		return;
851

I
Ingo Molnar 已提交
852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870
	/*
	 * Enable the performance counter vector in the APIC LVT:
	 */
	apic_val = apic_read(APIC_LVTERR);

	apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
	if (nmi)
		apic_write(APIC_LVTPC, APIC_DM_NMI);
	else
		apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
	apic_write(APIC_LVTERR, apic_val);
}

/*
 * Die-notifier callback that routes NMIs to the PMU overflow handler.
 *
 * Events other than DIE_NMI and DIE_NMI_IPI are passed on with
 * NOTIFY_DONE. Otherwise returns NOTIFY_STOP when the PMU handler
 * reported that it handled an overflow, NOTIFY_OK if not.
 */
static int __kprobes
perf_counter_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	struct pt_regs *regs;
	int ret;

	switch (cmd) {
	case DIE_NMI:
	case DIE_NMI_IPI:
		break;

	default:
		return NOTIFY_DONE;
	}

	regs = args->regs;

	apic_write(APIC_LVTPC, APIC_DM_NMI);
	ret = x86_pmu.handle_irq(regs, 1);

	return ret ? NOTIFY_STOP : NOTIFY_OK;
}

static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
891 892 893
	.notifier_call		= perf_counter_nmi_handler,
	.next			= NULL,
	.priority		= 1
I
Ingo Molnar 已提交
894 895
};

896
static struct x86_pmu intel_pmu = {
897
	.name			= "Intel",
898
	.handle_irq		= intel_pmu_handle_irq,
899 900 901 902
	.save_disable_all	= intel_pmu_save_disable_all,
	.restore_all		= intel_pmu_restore_all,
	.enable			= intel_pmu_enable_counter,
	.disable		= intel_pmu_disable_counter,
903 904
	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
905 906
	.event_map		= intel_pmu_event_map,
	.raw_event		= intel_pmu_raw_event,
907
	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
908 909 910 911 912 913
	/*
	 * Intel PMCs cannot be accessed sanely above 32 bit width,
	 * so we install an artificial 1<<31 period regardless of
	 * the generic counter period:
	 */
	.max_period		= (1ULL << 31) - 1,
914 915
};

916
static struct x86_pmu amd_pmu = {
917
	.name			= "AMD",
918
	.handle_irq		= amd_pmu_handle_irq,
919 920 921 922
	.save_disable_all	= amd_pmu_save_disable_all,
	.restore_all		= amd_pmu_restore_all,
	.enable			= amd_pmu_enable_counter,
	.disable		= amd_pmu_disable_counter,
923 924
	.eventsel		= MSR_K7_EVNTSEL0,
	.perfctr		= MSR_K7_PERFCTR0,
925 926
	.event_map		= amd_pmu_event_map,
	.raw_event		= amd_pmu_raw_event,
927
	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
928 929 930
	.num_counters		= 4,
	.counter_bits		= 48,
	.counter_mask		= (1ULL << 48) - 1,
931 932
	/* use highest bit to detect overflow */
	.max_period		= (1ULL << 47) - 1,
933 934
};

935
static int intel_pmu_init(void)
I
Ingo Molnar 已提交
936
{
937
	union cpuid10_edx edx;
I
Ingo Molnar 已提交
938
	union cpuid10_eax eax;
939
	unsigned int unused;
940
	unsigned int ebx;
941
	int version;
I
Ingo Molnar 已提交
942

943
	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
944
		return -ENODEV;
945

I
Ingo Molnar 已提交
946 947 948 949
	/*
	 * Check whether the Architectural PerfMon supports
	 * Branch Misses Retired Event or not.
	 */
950
	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
I
Ingo Molnar 已提交
951
	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
952
		return -ENODEV;
I
Ingo Molnar 已提交
953

954 955
	version = eax.split.version_id;
	if (version < 2)
956
		return -ENODEV;
957

958
	x86_pmu = intel_pmu;
959
	x86_pmu.version = version;
960 961 962 963
	x86_pmu.num_counters = eax.split.num_counters;
	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
	x86_pmu.counter_bits = eax.split.bit_width;
	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
964

965
	return 0;
966 967
}

968
static int amd_pmu_init(void)
969
{
970
	x86_pmu = amd_pmu;
971
	return 0;
972 973
}

974 975
/*
 * Boot-time PMU setup: run the vendor init routine, clip the counter
 * counts to the supported maxima, build the counter bitmask, log the
 * configuration, program the local APIC and register the NMI notifier.
 * Returns silently on unsupported vendors or when vendor init fails.
 */
void __init init_hw_perf_counters(void)
{
	int err;

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return;
	}
	if (err != 0)
		return;

	pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
	pr_info("... version:         %d\n", x86_pmu.version);
	pr_info("... bit width:       %d\n", x86_pmu.counter_bits);

	pr_info("... num counters:    %d\n", x86_pmu.num_counters);
	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
		/* warn before clipping so the message shows the real count */
		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
	}
	/* 64-bit shift: the mask variable is a u64 */
	perf_counter_mask = (1ULL << x86_pmu.num_counters) - 1;
	perf_max_counters = x86_pmu.num_counters;

	pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
	pr_info("... max period:      %016Lx\n", x86_pmu.max_period);

	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
		/* warn before clipping so the message shows the real count */
		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
	}
	pr_info("... fixed counters:  %d\n", x86_pmu.num_counters_fixed);

	/* fixed counters occupy the bits starting at X86_PMC_IDX_FIXED */
	perf_counter_mask |=
		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;

	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);

	perf_counters_lapic_init(0);
	register_die_notifier(&perf_counter_nmi_notifier);
}
I
Ingo Molnar 已提交
1022

1023
static inline void x86_pmu_read(struct perf_counter *counter)
1024 1025 1026 1027
{
	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
}

1028 1029 1030 1031
/* Operations table returned by hw_perf_counter_init(). */
static const struct pmu pmu = {
	.enable		= x86_pmu_enable,
	.disable	= x86_pmu_disable,
	.read		= x86_pmu_read,
};

1034
const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
I
Ingo Molnar 已提交
1035 1036 1037 1038 1039
{
	int err;

	err = __hw_perf_counter_init(counter);
	if (err)
1040
		return ERR_PTR(err);
I
Ingo Molnar 已提交
1041

1042
	return &pmu;
I
Ingo Molnar 已提交
1043
}
1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096

/*
 * callchain support
 */

/*
 * Append @ip to the callchain; silently dropped once the entry already
 * holds MAX_STACK_DEPTH addresses.
 */
static inline
void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
{
	if (entry->nr >= MAX_STACK_DEPTH)
		return;

	entry->ip[entry->nr] = ip;
	entry->nr++;
}

/* Per-CPU callchain buffers: perf_callchain() uses nmi_entry in NMI context, irq_entry otherwise. */
static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);


/* stacktrace_ops 'warning_symbol' callback; intentionally a no-op. */
static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	/* Ignore warnings */
}

/* stacktrace_ops 'warning' callback; intentionally a no-op. */
static void backtrace_warning(void *data, char *msg)
{
	/* Ignore warnings */
}

/* stacktrace_ops 'stack' callback; always returns -1. */
static int backtrace_stack(void *data, char *name)
{
	/* Don't bother with IRQ stacks for now */
	return -1;
}

/*
 * stacktrace_ops 'address' callback: record @addr in the callchain
 * passed as @data, but only when the unwinder marks it reliable.
 */
static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *chain = data;

	if (!reliable)
		return;

	callchain_store(chain, addr);
}

/* Callbacks handed to dump_trace() by perf_callchain_kernel(). */
static const struct stacktrace_ops backtrace_ops = {
	.warning		= backtrace_warning,
	.warning_symbol		= backtrace_warning_symbol,
	.stack			= backtrace_stack,
	.address		= backtrace_address,
};

static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	unsigned long bp;
	char *stack;
1097
	int nr = entry->nr;
1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108

	callchain_store(entry, instruction_pointer(regs));

	stack = ((char *)regs + sizeof(struct pt_regs));
#ifdef CONFIG_FRAME_POINTER
	bp = frame_pointer(regs);
#else
	bp = 0;
#endif

	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
1109 1110

	entry->kernel = entry->nr - nr;
1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139
}


/* Layout copied from user memory at each frame pointer by copy_stack_frame(). */
struct stack_frame {
	const void __user	*next_fp;
	unsigned long		return_address;
};

/*
 * Copy one stack frame record from user address @fp into @frame with
 * page faults disabled.
 * Returns 1 when the frame was copied, 0 when the address range fails
 * access_ok() or the copy did not complete.
 */
static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
{
	unsigned long not_copied;

	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
		return 0;

	pagefault_disable();
	not_copied = __copy_from_user_inatomic(frame, fp, sizeof(*frame));
	pagefault_enable();

	return not_copied ? 0 : 1;
}

static void
perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	struct stack_frame frame;
	const void __user *fp;
1140
	int nr = entry->nr;
1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159

	regs = (struct pt_regs *)current->thread.sp0 - 1;
	fp   = (void __user *)regs->bp;

	callchain_store(entry, regs->ip);

	while (entry->nr < MAX_STACK_DEPTH) {
		frame.next_fp	     = NULL;
		frame.return_address = 0;

		if (!copy_stack_frame(fp, &frame))
			break;

		if ((unsigned long)fp < user_stack_pointer(regs))
			break;

		callchain_store(entry, frame.return_address);
		fp = frame.next_fp;
	}
1160 1161

	entry->user = entry->nr - nr;
1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196
}

/*
 * Fill @entry with the callchain for @regs: the kernel part when the
 * sample was taken in kernel mode, then the user part when the current
 * task has an mm. Nothing is recorded for a NULL @regs, for pid 0, or
 * for a user-mode sample of a task that is not TASK_RUNNING.
 */
static void
perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	int from_user;

	if (!regs)
		return;

	from_user = user_mode(regs);

	if (!current || !current->pid)
		return;

	if (from_user) {
		if (current->state != TASK_RUNNING)
			return;
	} else {
		perf_callchain_kernel(regs, entry);
	}

	if (current->mm)
		perf_callchain_user(regs, entry);
}

/*
 * Build the callchain for @regs in a per-CPU buffer and return it.
 * A separate buffer is used in NMI context. The returned entry is
 * overwritten by the next call on the same CPU in the same context
 * class.
 */
struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
	struct perf_callchain_entry *entry;

	if (in_nmi())
		entry = &__get_cpu_var(nmi_entry);
	else
		entry = &__get_cpu_var(irq_entry);

	entry->nr = 0;
	entry->hv = 0;
	entry->kernel = 0;
	entry->user = 0;

	perf_do_callchain(regs, entry);

	return entry;
}