/*
 * Performance counter x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *
 *  For licensing details see kernel-base/COPYING
 */

#include <linux/perf_counter.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>

static u64 perf_counter_mask __read_mostly;

struct cpu_hw_counters {
	struct perf_counter	*counters[X86_PMC_IDX_MAX];
	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		interrupts;
	int			enabled;
};

/*
 * struct x86_pmu - generic x86 pmu
 */
struct x86_pmu {
	const char	*name;
	int		version;
	int		(*handle_irq)(struct pt_regs *);
	void		(*disable_all)(void);
	void		(*enable_all)(void);
	void		(*enable)(struct hw_perf_counter *, int);
	void		(*disable)(struct hw_perf_counter *, int);
	unsigned	eventsel;
	unsigned	perfctr;
	u64		(*event_map)(int);
	u64		(*raw_event)(u64);
	int		max_events;
	int		num_counters;
	int		num_counters_fixed;
	int		counter_bits;
	u64		counter_mask;
	u64		max_period;
	u64		intel_ctrl;
};

static struct x86_pmu x86_pmu __read_mostly;

static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
	.enabled = 1,
};

/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
static const u64 intel_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]		= 0x003c,
  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
  [PERF_COUNT_CACHE_MISSES]		= 0x412e,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
  [PERF_COUNT_BUS_CYCLES]		= 0x013c,
};

static u64 intel_pmu_event_map(int event)
{
	return intel_perfmon_event_map[event];
}

/*
 * Generalized hw caching related event table, filled
 * in on a per model basis. A value of 0 means
 * 'not supported', -1 means 'event makes no sense on
 * this CPU', any other value means the raw event
 * ID.
 */
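/*
 * The config value packs the selector bytes decoded by set_ext_hw_attr()
 * below: byte 0 is the cache id, byte 1 the operation, byte 2 the result.
 * For example, C(L1D) | (C(OP_READ) << 8) | (C(RESULT_MISS) << 16) selects
 * the model-specific raw event for L1 data-cache read misses.
 */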

#define C(x) PERF_COUNT_HW_CACHE_##x

static u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

static const u64 nehalem_hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
 [ C(L1D) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */
		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */
		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
		[ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
	},
 },
 [ C(L1I ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS                    */
		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x0,
		[ C(RESULT_MISS)   ] = 0x0,
	},
 },
 [ C(L2  ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES          */
		[ C(RESULT_MISS)   ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS       */
	},
 },
 [ C(DTLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
		[ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
		[ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x0,
		[ C(RESULT_MISS)   ] = 0x0,
	},
 },
 [ C(ITLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
		[ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISS_RETIRED            */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
 [ C(BPU ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
		[ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
};

static const u64 core2_hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
	/* To be filled in */
};

static const u64 atom_hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
	/* To be filled in */
};

static u64 intel_pmu_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL

#define CORE_EVNTSEL_MASK		\
	(CORE_EVNTSEL_EVENT_MASK |	\
	 CORE_EVNTSEL_UNIT_MASK  |	\
	 CORE_EVNTSEL_EDGE_MASK  |	\
	 CORE_EVNTSEL_INV_MASK  |	\
	 CORE_EVNTSEL_COUNTER_MASK)

	return event & CORE_EVNTSEL_MASK;
}

/*
 * AMD Performance Monitor K7 and later.
 */
static const u64 amd_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]		= 0x0076,
  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]		= 0x0080,
  [PERF_COUNT_CACHE_MISSES]		= 0x0081,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
};

static u64 amd_pmu_event_map(int event)
{
	return amd_perfmon_event_map[event];
}

static u64 amd_pmu_raw_event(u64 event)
{
#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
#define K7_EVNTSEL_INV_MASK	0x000800000ULL
#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL

#define K7_EVNTSEL_MASK			\
	(K7_EVNTSEL_EVENT_MASK |	\
	 K7_EVNTSEL_UNIT_MASK  |	\
	 K7_EVNTSEL_EDGE_MASK  |	\
	 K7_EVNTSEL_INV_MASK   |	\
	 K7_EVNTSEL_COUNTER_MASK)

	return event & K7_EVNTSEL_MASK;
}

/*
 * Propagate counter elapsed time into the generic counter.
 * Can only be executed on the CPU where the counter is active.
 * Returns the delta events processed.
 */
static u64
x86_perf_counter_update(struct perf_counter *counter,
			struct hw_perf_counter *hwc, int idx)
{
	int shift = 64 - x86_pmu.counter_bits;
	u64 prev_raw_count, new_raw_count;
	s64 delta;

	/*
	 * Careful: an NMI might modify the previous counter value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic counter atomically:
	 */
again:
	prev_raw_count = atomic64_read(&hwc->prev_count);
	rdmsrl(hwc->counter_base + idx, new_raw_count);

	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (counter-)time and add that to the generic counter.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;
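	/*
	 * Worked example: with 48-bit counters shift is 16, so a raw count
	 * that wrapped from 0xffffffffffff to 5 gives
	 * (5 << 16) - (0xffffffffffff << 16) = 0x60000 as a u64, and the
	 * arithmetic shift right by 16 turns that into a delta of 6.
	 */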

	atomic64_add(delta, &counter->count);
	atomic64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

static atomic_t active_counters;
static DEFINE_MUTEX(pmc_reserve_mutex);

static bool reserve_pmc_hardware(void)
{
	int i;

	if (nmi_watchdog == NMI_LOCAL_APIC)
		disable_lapic_nmi_watchdog();

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu.eventsel + i);

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu.perfctr + i);

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu.perfctr + i);
		release_evntsel_nmi(x86_pmu.eventsel + i);
	}

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();
}

static void hw_perf_counter_destroy(struct perf_counter *counter)
{
	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
{
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;

	return 0;
}

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __hw_perf_counter_init(struct perf_counter *counter)
{
	struct perf_counter_attr *attr = &counter->attr;
	struct hw_perf_counter *hwc = &counter->hw;
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = 0;
	if (!atomic_inc_not_zero(&active_counters)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
			err = -EBUSY;
		else
			atomic_inc(&active_counters);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	hwc->config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to.
	 */
	if (!attr->exclude_user)
		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!attr->exclude_kernel)
		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;

	if (!hwc->sample_period)
		hwc->sample_period = x86_pmu.max_period;

	atomic64_set(&hwc->period_left, hwc->sample_period);
	counter->destroy = hw_perf_counter_destroy;

	/*
	 * Raw event type provides the config in the event structure
	 */
	if (attr->type == PERF_TYPE_RAW) {
		hwc->config |= x86_pmu.raw_event(attr->config);
		return 0;
	}

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, attr);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;
	/*
	 * The generic map:
	 */
	hwc->config |= x86_pmu.event_map(attr->config);

	return 0;
}

static void intel_pmu_disable_all(void)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
}

static void amd_pmu_disable_all(void)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	int idx;

	if (!cpuc->enabled)
		return;

	cpuc->enabled = 0;
	/*
	 * ensure we write the disable before we start disabling the
	 * counters proper, so that amd_pmu_enable_counter() does the
	 * right thing.
	 */
	barrier();

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
	}
}

void hw_perf_disable(void)
{
	if (!x86_pmu_initialized())
		return;
	return x86_pmu.disable_all();
}

static void intel_pmu_enable_all(void)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
}

static void amd_pmu_enable_all(void)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	int idx;

	if (cpuc->enabled)
		return;

	cpuc->enabled = 1;
	barrier();

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
			continue;
		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
	}
}

void hw_perf_enable(void)
{
	if (!x86_pmu_initialized())
		return;
	x86_pmu.enable_all();
}

static inline u64 intel_pmu_get_status(void)
{
	u64 status;

	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);

	return status;
}

static inline void intel_pmu_ack_status(u64 ack)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}

static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
	int err;
	err = checking_wrmsrl(hwc->config_base + idx,
			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
}

static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	int err;
	err = checking_wrmsrl(hwc->config_base + idx,
			      hwc->config);
}

static inline void
intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, mask;
	int err;

	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static inline void
intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
		intel_pmu_disable_fixed(hwc, idx);
		return;
	}

	x86_pmu_disable_counter(hwc, idx);
}

static inline void
amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	x86_pmu_disable_counter(hwc, idx);
}

static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the counter disabled in hw:
 */
static int
x86_perf_counter_set_period(struct perf_counter *counter,
			     struct hw_perf_counter *hwc, int idx)
{
	s64 left = atomic64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int err, ret = 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		atomic64_set(&hwc->period_left, left);
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		atomic64_set(&hwc->period_left, left);
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	per_cpu(prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw counter starts counting from this counter offset,
	 * mark it to be able to extract future deltas:
	 */
	atomic64_set(&hwc->prev_count, (u64)-left);

	err = checking_wrmsrl(hwc->counter_base + idx,
			     (u64)(-left) & x86_pmu.counter_mask);
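	/*
	 * The counter is programmed to -left (truncated to the counter
	 * width), so it counts upwards and overflows after 'left' more
	 * events, which is what eventually raises the overflow interrupt.
	 */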

	return ret;
}

static inline void
intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, bits, mask;
	int err;

	/*
	 * Enable IRQ generation (0x8),
	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
	 * if requested:
	 */
	bits = 0x8ULL;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
		bits |= 0x2;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
		bits |= 0x1;
	bits <<= (idx * 4);
	mask = 0xfULL << (idx * 4);
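	/*
	 * Each fixed counter owns a 4-bit control field in FIXED_CTR_CTRL;
	 * the read-modify-write below replaces only this counter's field
	 * and leaves the other fixed counters alone.
	 */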

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	ctrl_val |= bits;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
		intel_pmu_enable_fixed(hwc, idx);
		return;
	}

	x86_pmu_enable_counter(hwc, idx);
}

static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

	if (cpuc->enabled)
		x86_pmu_enable_counter(hwc, idx);
	else
		x86_pmu_disable_counter(hwc, idx);
}

static int
fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
	unsigned int event;

	if (!x86_pmu.num_counters_fixed)
		return -1;

	event = hwc->config & ARCH_PERFMON_EVENT_MASK;

	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
		return X86_PMC_IDX_FIXED_CPU_CYCLES;
	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
		return X86_PMC_IDX_FIXED_BUS_CYCLES;

	return -1;
}

/*
 * Find a PMC slot for the freshly enabled / scheduled in counter:
 */
static int x86_pmu_enable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	int idx;

	idx = fixed_mode_idx(counter, hwc);
	if (idx >= 0) {
		/*
		 * Try to get the fixed counter, if that is already taken
		 * then try to get a generic counter:
		 */
		if (test_and_set_bit(idx, cpuc->used_mask))
			goto try_generic;

		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		/*
		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
		 */
		hwc->counter_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
		hwc->idx = idx;
	} else {
		idx = hwc->idx;
		/* Try to get the previous generic counter again */
		if (test_and_set_bit(idx, cpuc->used_mask)) {
try_generic:
			idx = find_first_zero_bit(cpuc->used_mask,
						  x86_pmu.num_counters);
			if (idx == x86_pmu.num_counters)
				return -EAGAIN;

			set_bit(idx, cpuc->used_mask);
			hwc->idx = idx;
		}
		hwc->config_base  = x86_pmu.eventsel;
		hwc->counter_base = x86_pmu.perfctr;
	}

	perf_counters_lapic_init();

	x86_pmu.disable(hwc, idx);

	cpuc->counters[idx] = counter;
	set_bit(idx, cpuc->active_mask);

	x86_perf_counter_set_period(counter, hwc, idx);
	x86_pmu.enable(hwc, idx);

	return 0;
}

static void x86_pmu_unthrottle(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;

	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
				cpuc->counters[hwc->idx] != counter))
		return;

	x86_pmu.enable(hwc, hwc->idx);
}

void perf_counter_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	struct cpu_hw_counters *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
	}
	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);

		prev_left = per_cpu(prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

static void x86_pmu_disable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	int idx = hwc->idx;

	/*
	 * Must be done before we disable, otherwise the nmi handler
	 * could reenable again:
	 */
	clear_bit(idx, cpuc->active_mask);
	x86_pmu.disable(hwc, idx);

	/*
	 * Make sure the cleared pointer becomes visible before we
	 * (potentially) free the counter:
	 */
	barrier();

	/*
	 * Drain the remaining delta count out of a counter
	 * that we are disabling:
	 */
	x86_perf_counter_update(counter, hwc, idx);
	cpuc->counters[idx] = NULL;
	clear_bit(idx, cpuc->used_mask);
}

/*
 * Save and restart an expired counter. Called by NMI contexts,
 * so it has to be careful about preempting normal counter ops:
 */
static int intel_pmu_save_and_restart(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;
	int idx = hwc->idx;
	int ret;

	x86_perf_counter_update(counter, hwc, idx);
	ret = x86_perf_counter_set_period(counter, hwc, idx);

	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
		intel_pmu_enable_counter(hwc, idx);

	return ret;
}

static void intel_pmu_reset(void)
{
	unsigned long flags;
	int idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	printk("clearing PMU state on CPU#%d\n", smp_processor_id());

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
	}

	local_irq_restore(flags);
}


/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
static int intel_pmu_handle_irq(struct pt_regs *regs)
{
	struct cpu_hw_counters *cpuc;
	int bit, cpu, loops;
	u64 ack, status;

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	perf_disable();
	status = intel_pmu_get_status();
	if (!status) {
		perf_enable();
		return 0;
	}

	loops = 0;
again:
	if (++loops > 100) {
		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
		perf_counter_print_debug();
		intel_pmu_reset();
		perf_enable();
		return 1;
	}

	inc_irq_stat(apic_perf_irqs);
	ack = status;
	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
		struct perf_counter *counter = cpuc->counters[bit];

		clear_bit(bit, (unsigned long *) &status);
		if (!test_bit(bit, cpuc->active_mask))
			continue;

		if (!intel_pmu_save_and_restart(counter))
			continue;

		if (perf_counter_overflow(counter, 1, regs, 0))
			intel_pmu_disable_counter(&counter->hw, bit);
	}

	intel_pmu_ack_status(ack);

	/*
	 * Repeat if there is more work to be done:
	 */
	status = intel_pmu_get_status();
	if (status)
		goto again;

	perf_enable();

	return 1;
}

static int amd_pmu_handle_irq(struct pt_regs *regs)
{
	int cpu, idx, handled = 0;
	struct cpu_hw_counters *cpuc;
	struct perf_counter *counter;
	struct hw_perf_counter *hwc;
	u64 val;

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask))
			continue;

		counter = cpuc->counters[idx];
		hwc = &counter->hw;

		val = x86_perf_counter_update(counter, hwc, idx);
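		/*
		 * Counters are programmed with -(sample period), so as long
		 * as the sign bit of the value is still set the counter has
		 * not yet wrapped through zero and there is no overflow to
		 * handle.
		 */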
		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
			continue;

		/* counter overflow */
		handled = 1;
		inc_irq_stat(apic_perf_irqs);
		if (!x86_perf_counter_set_period(counter, hwc, idx))
			continue;

		if (perf_counter_overflow(counter, 1, regs, 0))
			amd_pmu_disable_counter(hwc, idx);
	}

	return handled;
}

void smp_perf_pending_interrupt(struct pt_regs *regs)
{
	irq_enter();
	ack_APIC_irq();
	inc_irq_stat(apic_pending_irqs);
	perf_counter_do_pending();
	irq_exit();
}

void set_perf_counter_pending(void)
{
	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
}

void perf_counters_lapic_init(void)
{
	if (!x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}

static int __kprobes
perf_counter_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	struct pt_regs *regs;

	if (!atomic_read(&active_counters))
		return NOTIFY_DONE;

	switch (cmd) {
	case DIE_NMI:
	case DIE_NMI_IPI:
		break;

	default:
		return NOTIFY_DONE;
	}

	regs = args->regs;

	apic_write(APIC_LVTPC, APIC_DM_NMI);
	/*
	 * Can't rely on the handled return value to say it was our NMI, two
	 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
	 *
	 * If the first NMI handles both, the latter will be empty and daze
	 * the CPU.
	 */
	x86_pmu.handle_irq(regs);

	return NOTIFY_STOP;
}

static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
	.notifier_call		= perf_counter_nmi_handler,
	.next			= NULL,
	.priority		= 1
};

static struct x86_pmu intel_pmu = {
	.name			= "Intel",
	.handle_irq		= intel_pmu_handle_irq,
	.disable_all		= intel_pmu_disable_all,
	.enable_all		= intel_pmu_enable_all,
	.enable			= intel_pmu_enable_counter,
	.disable		= intel_pmu_disable_counter,
	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
	.event_map		= intel_pmu_event_map,
	.raw_event		= intel_pmu_raw_event,
	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
	/*
	 * Intel PMCs cannot be accessed sanely above 32 bit width,
	 * so we install an artificial 1<<31 period regardless of
	 * the generic counter period:
	 */
	.max_period		= (1ULL << 31) - 1,
};

static struct x86_pmu amd_pmu = {
	.name			= "AMD",
	.handle_irq		= amd_pmu_handle_irq,
	.disable_all		= amd_pmu_disable_all,
	.enable_all		= amd_pmu_enable_all,
	.enable			= amd_pmu_enable_counter,
	.disable		= amd_pmu_disable_counter,
	.eventsel		= MSR_K7_EVNTSEL0,
	.perfctr		= MSR_K7_PERFCTR0,
	.event_map		= amd_pmu_event_map,
	.raw_event		= amd_pmu_raw_event,
	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
	.num_counters		= 4,
	.counter_bits		= 48,
	.counter_mask		= (1ULL << 48) - 1,
	/* use highest bit to detect overflow */
	.max_period		= (1ULL << 47) - 1,
};

static int intel_pmu_init(void)
{
	union cpuid10_edx edx;
	union cpuid10_eax eax;
	unsigned int unused;
	unsigned int ebx;
	int version;

	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
		return -ENODEV;

	/*
	 * Check whether the Architectural PerfMon supports
	 * Branch Misses Retired Event or not.
	 */
	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
		return -ENODEV;

	version = eax.split.version_id;
	if (version < 2)
		return -ENODEV;

	x86_pmu = intel_pmu;
	x86_pmu.version = version;
	x86_pmu.num_counters = eax.split.num_counters;

	/*
	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
	 * assume at least 3 counters:
	 */
	x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);

	x86_pmu.counter_bits = eax.split.bit_width;
	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;

	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
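	/*
	 * intel_ctrl now holds the boot-time GLOBAL_CTRL value;
	 * intel_pmu_enable_all() writes it back as-is to re-enable the PMU.
	 */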

	/*
	 * Install the model-specific hw-cache event table:
	 */
	switch (boot_cpu_data.x86_model) {
	case 17:
		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);

		pr_info("... installed Core2 event tables\n");
		break;
	default:
	case 26:
		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);

		pr_info("... installed Nehalem/Corei7 event tables\n");
		break;
	case 28:
		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);

		pr_info("... installed Atom event tables\n");
		break;
	}
	return 0;
}

static int amd_pmu_init(void)
{
	x86_pmu = amd_pmu;
	return 0;
}

void __init init_hw_perf_counters(void)
{
	int err;

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return;
	}
	if (err != 0)
		return;

	pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
	pr_info("... version:         %d\n", x86_pmu.version);
	pr_info("... bit width:       %d\n", x86_pmu.counter_bits);

	pr_info("... num counters:    %d\n", x86_pmu.num_counters);
	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
	}
	perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
	perf_max_counters = x86_pmu.num_counters;

	pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
	pr_info("... max period:      %016Lx\n", x86_pmu.max_period);

	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
	}
	pr_info("... fixed counters:  %d\n", x86_pmu.num_counters_fixed);

	perf_counter_mask |=
		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;

	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);

	perf_counters_lapic_init();
	register_die_notifier(&perf_counter_nmi_notifier);
}

static inline void x86_pmu_read(struct perf_counter *counter)
{
	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
}

static const struct pmu pmu = {
	.enable		= x86_pmu_enable,
	.disable	= x86_pmu_disable,
	.read		= x86_pmu_read,
	.unthrottle	= x86_pmu_unthrottle,
};

const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
{
	int err;

	err = __hw_perf_counter_init(counter);
	if (err)
		return ERR_PTR(err);

	return &pmu;
}

/*
 * callchain support
 */

static inline
void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
{
	if (entry->nr < MAX_STACK_DEPTH)
		entry->ip[entry->nr++] = ip;
}

static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);


static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	/* Ignore warnings */
}

static void backtrace_warning(void *data, char *msg)
{
	/* Ignore warnings */
}

static int backtrace_stack(void *data, char *name)
{
	/* Don't bother with IRQ stacks for now */
	return -1;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	if (reliable)
		callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.warning		= backtrace_warning,
	.warning_symbol		= backtrace_warning_symbol,
	.stack			= backtrace_stack,
	.address		= backtrace_address,
};

static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	unsigned long bp;
	char *stack;
	int nr = entry->nr;

	callchain_store(entry, instruction_pointer(regs));

	stack = ((char *)regs + sizeof(struct pt_regs));
#ifdef CONFIG_FRAME_POINTER
	bp = frame_pointer(regs);
#else
	bp = 0;
#endif

	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);

	entry->kernel = entry->nr - nr;
}


struct stack_frame {
	const void __user	*next_fp;
	unsigned long		return_address;
};

static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
{
	int ret;

	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
		return 0;

	ret = 1;
	pagefault_disable();
	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
		ret = 0;
	pagefault_enable();

	return ret;
}

static void
perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	struct stack_frame frame;
	const void __user *fp;
	int nr = entry->nr;

	regs = (struct pt_regs *)current->thread.sp0 - 1;
	fp   = (void __user *)regs->bp;

	callchain_store(entry, regs->ip);

	while (entry->nr < MAX_STACK_DEPTH) {
		frame.next_fp	     = NULL;
		frame.return_address = 0;

		if (!copy_stack_frame(fp, &frame))
			break;

		if ((unsigned long)fp < user_stack_pointer(regs))
			break;

		callchain_store(entry, frame.return_address);
		fp = frame.next_fp;
	}

	entry->user = entry->nr - nr;
}

static void
perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	int is_user;

	if (!regs)
		return;

	is_user = user_mode(regs);

	if (!current || current->pid == 0)
		return;

	if (is_user && current->state != TASK_RUNNING)
		return;

	if (!is_user)
		perf_callchain_kernel(regs, entry);

	if (current->mm)
		perf_callchain_user(regs, entry);
}

struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
	struct perf_callchain_entry *entry;

	if (in_nmi())
		entry = &__get_cpu_var(nmi_entry);
	else
		entry = &__get_cpu_var(irq_entry);

	entry->nr = 0;
	entry->hv = 0;
	entry->kernel = 0;
	entry->user = 0;

	perf_do_callchain(regs, entry);

	return entry;
}