/*
 * Performance counter x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_counter.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>

static u64 perf_counter_mask __read_mostly;

struct cpu_hw_counters {
	struct perf_counter	*counters[X86_PMC_IDX_MAX];
	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		interrupts;
	int			enabled;
};

/*
 * struct x86_pmu - generic x86 pmu
 */
struct x86_pmu {
	const char	*name;
	int		version;
	int		(*handle_irq)(struct pt_regs *);
	void		(*disable_all)(void);
	void		(*enable_all)(void);
	void		(*enable)(struct hw_perf_counter *, int);
	void		(*disable)(struct hw_perf_counter *, int);
	unsigned	eventsel;
	unsigned	perfctr;
	u64		(*event_map)(int);
	u64		(*raw_event)(u64);
	int		max_events;
	int		num_counters;
	int		num_counters_fixed;
	int		counter_bits;
	u64		counter_mask;
	u64		max_period;
	u64		intel_ctrl;
};

static struct x86_pmu x86_pmu __read_mostly;

static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
	.enabled = 1,
};

/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
static const u64 intel_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]		= 0x003c,
  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
  [PERF_COUNT_CACHE_MISSES]		= 0x412e,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
  [PERF_COUNT_BUS_CYCLES]		= 0x013c,
};

static u64 intel_pmu_event_map(int event)
{
	return intel_perfmon_event_map[event];
}

/*
 * Generalized hw caching related event table, filled
 * in on a per model basis. A value of 0 means
 * 'not supported', -1 means 'event makes no sense on
 * this CPU', any other value means the raw event
 * ID.
 */

#define C(x) PERF_COUNT_HW_CACHE_##x

static u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
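/*
 * A PERF_TYPE_HW_CACHE config packs the three indices into one value:
 * bits 0-7 select the cache, bits 8-15 the operation and bits 16-23
 * the result, matching the decoding in set_ext_hw_attr() below.
 */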

static const u64 nehalem_hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
 [ C(L1D) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */
		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */
		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
		[ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
	},
 },
 [ C(L1I ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS                    */
		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x0,
		[ C(RESULT_MISS)   ] = 0x0,
	},
 },
 [ C(L2  ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES          */
		[ C(RESULT_MISS)   ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS       */
	},
 },
 [ C(DTLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
		[ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
		[ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x0,
		[ C(RESULT_MISS)   ] = 0x0,
	},
 },
 [ C(ITLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
		[ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISS_RETIRED            */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
 [ C(BPU ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
		[ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
};

static const u64 core2_hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
 [ C(L1D) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(L1I ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(L2  ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(DTLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
		[ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
		[ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(ITLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
		[ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
 [ C(BPU ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
};

static const u64 atom_hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
	/* To be filled in */
};

static u64 intel_pmu_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL

#define CORE_EVNTSEL_MASK		\
	(CORE_EVNTSEL_EVENT_MASK |	\
	 CORE_EVNTSEL_UNIT_MASK  |	\
	 CORE_EVNTSEL_EDGE_MASK  |	\
	 CORE_EVNTSEL_INV_MASK  |	\
	 CORE_EVNTSEL_COUNTER_MASK)

	return event & CORE_EVNTSEL_MASK;
}

/*
 * AMD Performance Monitor K7 and later.
 */
static const u64 amd_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]		= 0x0076,
  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]		= 0x0080,
  [PERF_COUNT_CACHE_MISSES]		= 0x0081,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
};

static u64 amd_pmu_event_map(int event)
{
	return amd_perfmon_event_map[event];
}

static u64 amd_pmu_raw_event(u64 event)
{
#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
#define K7_EVNTSEL_INV_MASK	0x000800000ULL
#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL

#define K7_EVNTSEL_MASK			\
	(K7_EVNTSEL_EVENT_MASK |	\
	 K7_EVNTSEL_UNIT_MASK  |	\
	 K7_EVNTSEL_EDGE_MASK  |	\
	 K7_EVNTSEL_INV_MASK   |	\
	 K7_EVNTSEL_COUNTER_MASK)

	return event & K7_EVNTSEL_MASK;
}

/*
 * Propagate counter elapsed time into the generic counter.
 * Can only be executed on the CPU where the counter is active.
 * Returns the delta events processed.
 */
static u64
x86_perf_counter_update(struct perf_counter *counter,
			struct hw_perf_counter *hwc, int idx)
{
	int shift = 64 - x86_pmu.counter_bits;
	u64 prev_raw_count, new_raw_count;
	s64 delta;

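	/*
	 * Note on the shift arithmetic below: with e.g. 48-bit counters,
	 * shift is 16. Shifting both raw values up by 16 and the signed
	 * difference back down by 16 sign-extends the delta, so a counter
	 * that wrapped around its physical width still yields the correct
	 * (small, positive) number of elapsed events.
	 */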
	/*
	 * Careful: an NMI might modify the previous counter value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic counter atomically:
	 */
again:
	prev_raw_count = atomic64_read(&hwc->prev_count);
	rdmsrl(hwc->counter_base + idx, new_raw_count);

	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (counter-)time and add that to the generic counter.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	atomic64_add(delta, &counter->count);
	atomic64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

static atomic_t active_counters;
static DEFINE_MUTEX(pmc_reserve_mutex);

static bool reserve_pmc_hardware(void)
{
	int i;

	if (nmi_watchdog == NMI_LOCAL_APIC)
		disable_lapic_nmi_watchdog();

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu.eventsel + i);

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu.perfctr + i);

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu.perfctr + i);
		release_evntsel_nmi(x86_pmu.eventsel + i);
	}

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();
}

static void hw_perf_counter_destroy(struct perf_counter *counter)
{
	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
{
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;

	return 0;
}

/*
 * Set up the hardware configuration for a given attr_type
 */
static int __hw_perf_counter_init(struct perf_counter *counter)
{
	struct perf_counter_attr *attr = &counter->attr;
	struct hw_perf_counter *hwc = &counter->hw;
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = 0;
	if (!atomic_inc_not_zero(&active_counters)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
			err = -EBUSY;
		else
			atomic_inc(&active_counters);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	hwc->config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to.
	 */
	if (!attr->exclude_user)
		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!attr->exclude_kernel)
		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;

	if (!hwc->sample_period)
		hwc->sample_period = x86_pmu.max_period;

	atomic64_set(&hwc->period_left, hwc->sample_period);
	counter->destroy = hw_perf_counter_destroy;

	/*
	 * Raw event types provide the config in the event structure
	 */
	if (attr->type == PERF_TYPE_RAW) {
		hwc->config |= x86_pmu.raw_event(attr->config);
		return 0;
	}

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, attr);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;
	/*
	 * The generic map:
	 */
	hwc->config |= x86_pmu.event_map(attr->config);

	return 0;
}

static void intel_pmu_disable_all(void)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
}

static void amd_pmu_disable_all(void)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	int idx;

	if (!cpuc->enabled)
		return;

	cpuc->enabled = 0;
	/*
	 * ensure we write the disable before we start disabling the
	 * counters proper, so that amd_pmu_enable_counter() does the
	 * right thing.
	 */
	barrier();

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
	}
}

void hw_perf_disable(void)
{
	if (!x86_pmu_initialized())
		return;
	return x86_pmu.disable_all();
}

static void intel_pmu_enable_all(void)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
}

static void amd_pmu_enable_all(void)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	int idx;

	if (cpuc->enabled)
		return;

	cpuc->enabled = 1;
	barrier();

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
			continue;
		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
	}
}

void hw_perf_enable(void)
{
	if (!x86_pmu_initialized())
		return;
	x86_pmu.enable_all();
}

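/*
 * Each bit in MSR_CORE_PERF_GLOBAL_STATUS flags an overflowed counter;
 * writing those bits back via MSR_CORE_PERF_GLOBAL_OVF_CTRL acknowledges
 * (clears) them, which is what the IRQ handler below relies on.
 */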
static inline u64 intel_pmu_get_status(void)
{
	u64 status;

	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);

	return status;
}

static inline void intel_pmu_ack_status(u64 ack)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}

static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
	int err;
	err = checking_wrmsrl(hwc->config_base + idx,
			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
}

static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	int err;
	err = checking_wrmsrl(hwc->config_base + idx,
			      hwc->config);
}

static inline void
intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, mask;
	int err;

	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static inline void
intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
		intel_pmu_disable_fixed(hwc, idx);
		return;
	}

	x86_pmu_disable_counter(hwc, idx);
}

static inline void
amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	x86_pmu_disable_counter(hwc, idx);
}

static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the counter disabled in hw:
 */
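/*
 * For example, a sample_period of 100000 programs the hardware counter
 * to -100000 (masked to the counter width), so it overflows and raises
 * the PMI after roughly 100000 further events.
 */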
static int
x86_perf_counter_set_period(struct perf_counter *counter,
			     struct hw_perf_counter *hwc, int idx)
{
	s64 left = atomic64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int err, ret = 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		atomic64_set(&hwc->period_left, left);
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		atomic64_set(&hwc->period_left, left);
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	per_cpu(prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw counter starts counting from this counter offset,
	 * mark it so we can extract future deltas:
	 */
	atomic64_set(&hwc->prev_count, (u64)-left);

	err = checking_wrmsrl(hwc->counter_base + idx,
			     (u64)(-left) & x86_pmu.counter_mask);

	return ret;
}

static inline void
intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, bits, mask;
	int err;

	/*
	 * Enable IRQ generation (0x8),
	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
	 * if requested:
	 */
	bits = 0x8ULL;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
		bits |= 0x2;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
		bits |= 0x1;
	bits <<= (idx * 4);
	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	ctrl_val |= bits;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
		intel_pmu_enable_fixed(hwc, idx);
		return;
	}

	x86_pmu_enable_counter(hwc, idx);
}

static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

	if (cpuc->enabled)
		x86_pmu_enable_counter(hwc, idx);
	else
		x86_pmu_disable_counter(hwc, idx);
}

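/*
 * Return the fixed-purpose counter index for events the fixed counters
 * can count (instructions, cpu cycles, bus cycles), or -1 so that a
 * generic counter gets used instead.
 */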
static int
fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
	unsigned int event;

	if (!x86_pmu.num_counters_fixed)
		return -1;

	event = hwc->config & ARCH_PERFMON_EVENT_MASK;

	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
		return X86_PMC_IDX_FIXED_CPU_CYCLES;
	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
		return X86_PMC_IDX_FIXED_BUS_CYCLES;

	return -1;
}

/*
 * Find a PMC slot for the freshly enabled / scheduled in counter:
 */
static int x86_pmu_enable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	int idx;

	idx = fixed_mode_idx(counter, hwc);
	if (idx >= 0) {
		/*
		 * Try to get the fixed counter, if that is already taken
		 * then try to get a generic counter:
		 */
		if (test_and_set_bit(idx, cpuc->used_mask))
			goto try_generic;

		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		/*
		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
		 */
		hwc->counter_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
		hwc->idx = idx;
	} else {
		idx = hwc->idx;
		/* Try to get the previous generic counter again */
		if (test_and_set_bit(idx, cpuc->used_mask)) {
try_generic:
			idx = find_first_zero_bit(cpuc->used_mask,
						  x86_pmu.num_counters);
			if (idx == x86_pmu.num_counters)
				return -EAGAIN;

			set_bit(idx, cpuc->used_mask);
			hwc->idx = idx;
		}
		hwc->config_base  = x86_pmu.eventsel;
		hwc->counter_base = x86_pmu.perfctr;
	}

	perf_counters_lapic_init();

	x86_pmu.disable(hwc, idx);

	cpuc->counters[idx] = counter;
	set_bit(idx, cpuc->active_mask);

	x86_perf_counter_set_period(counter, hwc, idx);
	x86_pmu.enable(hwc, idx);

	return 0;
}

static void x86_pmu_unthrottle(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;

	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
				cpuc->counters[hwc->idx] != counter))
		return;

	x86_pmu.enable(hwc, hwc->idx);
}

void perf_counter_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	struct cpu_hw_counters *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
	}
	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);

		prev_left = per_cpu(prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

static void x86_pmu_disable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	int idx = hwc->idx;

	/*
	 * Must be done before we disable, otherwise the nmi handler
	 * could reenable again:
	 */
	clear_bit(idx, cpuc->active_mask);
	x86_pmu.disable(hwc, idx);

	/*
	 * Make sure the cleared pointer becomes visible before we
	 * (potentially) free the counter:
	 */
	barrier();

	/*
	 * Drain the remaining delta count out of a counter
	 * that we are disabling:
	 */
	x86_perf_counter_update(counter, hwc, idx);
	cpuc->counters[idx] = NULL;
	clear_bit(idx, cpuc->used_mask);
}

/*
 * Save and restart an expired counter. Called by NMI contexts,
 * so it has to be careful about preempting normal counter ops:
 */
static int intel_pmu_save_and_restart(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;
	int idx = hwc->idx;
	int ret;

	x86_perf_counter_update(counter, hwc, idx);
	ret = x86_perf_counter_set_period(counter, hwc, idx);

	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
		intel_pmu_enable_counter(hwc, idx);

	return ret;
}

static void intel_pmu_reset(void)
{
	unsigned long flags;
	int idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	printk("clearing PMU state on CPU#%d\n", smp_processor_id());

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
	}

	local_irq_restore(flags);
}


/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
static int intel_pmu_handle_irq(struct pt_regs *regs)
{
	struct cpu_hw_counters *cpuc;
	int bit, cpu, loops;
	u64 ack, status;

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	perf_disable();
	status = intel_pmu_get_status();
	if (!status) {
		perf_enable();
		return 0;
	}

	loops = 0;
again:
	if (++loops > 100) {
		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
		perf_counter_print_debug();
		intel_pmu_reset();
		perf_enable();
		return 1;
	}

	inc_irq_stat(apic_perf_irqs);
	ack = status;
	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
		struct perf_counter *counter = cpuc->counters[bit];

		clear_bit(bit, (unsigned long *) &status);
		if (!test_bit(bit, cpuc->active_mask))
			continue;

		if (!intel_pmu_save_and_restart(counter))
			continue;

		if (perf_counter_overflow(counter, 1, regs, 0))
			intel_pmu_disable_counter(&counter->hw, bit);
	}

	intel_pmu_ack_status(ack);

	/*
	 * Repeat if there is more work to be done:
	 */
	status = intel_pmu_get_status();
	if (status)
		goto again;

	perf_enable();

	return 1;
}

static int amd_pmu_handle_irq(struct pt_regs *regs)
{
	int cpu, idx, handled = 0;
	struct cpu_hw_counters *cpuc;
	struct perf_counter *counter;
	struct hw_perf_counter *hwc;
	u64 val;

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask))
			continue;

		counter = cpuc->counters[idx];
		hwc = &counter->hw;

		val = x86_perf_counter_update(counter, hwc, idx);
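		/*
		 * Counters are programmed to start at -sample_period and
		 * count up; while the top bit is still set the counter has
		 * not yet overflowed, so there is nothing to do for it.
		 */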
		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
			continue;

		/* counter overflow */
		handled = 1;
		inc_irq_stat(apic_perf_irqs);
		if (!x86_perf_counter_set_period(counter, hwc, idx))
			continue;

		if (perf_counter_overflow(counter, 1, regs, 0))
			amd_pmu_disable_counter(hwc, idx);
	}

	return handled;
}

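/*
 * Deferred perf work: set_perf_counter_pending() sends a self-IPI on
 * LOCAL_PENDING_VECTOR and the handler below runs the pending work
 * outside of NMI context.
 */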
void smp_perf_pending_interrupt(struct pt_regs *regs)
{
	irq_enter();
	ack_APIC_irq();
	inc_irq_stat(apic_pending_irqs);
	perf_counter_do_pending();
	irq_exit();
}

void set_perf_counter_pending(void)
{
	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
}

void perf_counters_lapic_init(void)
{
	if (!x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}

static int __kprobes
perf_counter_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	struct pt_regs *regs;

	if (!atomic_read(&active_counters))
		return NOTIFY_DONE;

	switch (cmd) {
	case DIE_NMI:
	case DIE_NMI_IPI:
		break;

	default:
		return NOTIFY_DONE;
	}

	regs = args->regs;

	apic_write(APIC_LVTPC, APIC_DM_NMI);
	/*
	 * Can't rely on the handled return value to say it was our NMI, two
	 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
	 *
	 * If the first NMI handles both, the latter will be empty and daze
	 * the CPU.
	 */
	x86_pmu.handle_irq(regs);

	return NOTIFY_STOP;
}

static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
	.notifier_call		= perf_counter_nmi_handler,
	.next			= NULL,
	.priority		= 1
};

static struct x86_pmu intel_pmu = {
	.name			= "Intel",
	.handle_irq		= intel_pmu_handle_irq,
	.disable_all		= intel_pmu_disable_all,
	.enable_all		= intel_pmu_enable_all,
	.enable			= intel_pmu_enable_counter,
	.disable		= intel_pmu_disable_counter,
	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
	.event_map		= intel_pmu_event_map,
	.raw_event		= intel_pmu_raw_event,
	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
	/*
	 * Intel PMCs cannot be accessed sanely above 32 bit width,
	 * so we install an artificial 1<<31 period regardless of
	 * the generic counter period:
	 */
	.max_period		= (1ULL << 31) - 1,
};

static struct x86_pmu amd_pmu = {
	.name			= "AMD",
	.handle_irq		= amd_pmu_handle_irq,
	.disable_all		= amd_pmu_disable_all,
	.enable_all		= amd_pmu_enable_all,
	.enable			= amd_pmu_enable_counter,
	.disable		= amd_pmu_disable_counter,
	.eventsel		= MSR_K7_EVNTSEL0,
	.perfctr		= MSR_K7_PERFCTR0,
	.event_map		= amd_pmu_event_map,
	.raw_event		= amd_pmu_raw_event,
	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
	.num_counters		= 4,
	.counter_bits		= 48,
	.counter_mask		= (1ULL << 48) - 1,
	/* use highest bit to detect overflow */
	.max_period		= (1ULL << 47) - 1,
};

static int intel_pmu_init(void)
{
	union cpuid10_edx edx;
	union cpuid10_eax eax;
	unsigned int unused;
	unsigned int ebx;
	int version;

	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
		return -ENODEV;

	/*
	 * Check whether the Architectural PerfMon supports
	 * Branch Misses Retired Event or not.
	 */
	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
		return -ENODEV;

	version = eax.split.version_id;
	if (version < 2)
		return -ENODEV;

	x86_pmu = intel_pmu;
	x86_pmu.version = version;
	x86_pmu.num_counters = eax.split.num_counters;

	/*
	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
	 * assume at least 3 counters:
	 */
	x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);

	x86_pmu.counter_bits = eax.split.bit_width;
	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;

	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);

	/*
	 * Install the model-specific hw-cache event tables:
	 */
	switch (boot_cpu_data.x86_model) {
	case 17:
		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);

		pr_info("... installed Core2 event tables\n");
		break;
	default:
	case 26:
		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);

		pr_info("... installed Nehalem/Corei7 event tables\n");
		break;
	case 28:
		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);

		pr_info("... installed Atom event tables\n");
		break;
	}
	return 0;
}

static int amd_pmu_init(void)
{
	x86_pmu = amd_pmu;
	return 0;
}

void __init init_hw_perf_counters(void)
{
	int err;

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return;
	}
	if (err != 0)
		return;

	pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
	pr_info("... version:         %d\n", x86_pmu.version);
	pr_info("... bit width:       %d\n", x86_pmu.counter_bits);

	pr_info("... num counters:    %d\n", x86_pmu.num_counters);
	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
	}
	perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
	perf_max_counters = x86_pmu.num_counters;

	pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
	pr_info("... max period:      %016Lx\n", x86_pmu.max_period);

	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
	}
	pr_info("... fixed counters:  %d\n", x86_pmu.num_counters_fixed);

	perf_counter_mask |=
		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;

	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);

	perf_counters_lapic_init();
	register_die_notifier(&perf_counter_nmi_notifier);
}

static inline void x86_pmu_read(struct perf_counter *counter)
{
	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
}

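/*
 * Arch-level hooks handed back to the generic perf_counter code by
 * hw_perf_counter_init() below.
 */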
static const struct pmu pmu = {
	.enable		= x86_pmu_enable,
	.disable	= x86_pmu_disable,
	.read		= x86_pmu_read,
	.unthrottle	= x86_pmu_unthrottle,
};

const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
{
	int err;

	err = __hw_perf_counter_init(counter);
	if (err)
		return ERR_PTR(err);

	return &pmu;
}

/*
 * callchain support
 */

static inline
void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
{
	if (entry->nr < MAX_STACK_DEPTH)
		entry->ip[entry->nr++] = ip;
}

static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);


static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	/* Ignore warnings */
}

static void backtrace_warning(void *data, char *msg)
{
	/* Ignore warnings */
}

static int backtrace_stack(void *data, char *name)
{
	/* Don't bother with IRQ stacks for now */
	return -1;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	if (reliable)
		callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.warning		= backtrace_warning,
	.warning_symbol		= backtrace_warning_symbol,
	.stack			= backtrace_stack,
	.address		= backtrace_address,
};

static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	unsigned long bp;
	char *stack;
	int nr = entry->nr;

	callchain_store(entry, instruction_pointer(regs));

	stack = ((char *)regs + sizeof(struct pt_regs));
#ifdef CONFIG_FRAME_POINTER
	bp = frame_pointer(regs);
#else
	bp = 0;
#endif

	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);

	entry->kernel = entry->nr - nr;
}


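/*
 * Mirrors the layout of a user-space stack frame when frame pointers
 * are in use: the saved frame pointer followed by the return address.
 */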
struct stack_frame {
	const void __user	*next_fp;
	unsigned long		return_address;
};

static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
{
	int ret;

	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
		return 0;

	ret = 1;
	pagefault_disable();
	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
		ret = 0;
	pagefault_enable();

	return ret;
}

static void
perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	struct stack_frame frame;
	const void __user *fp;
	int nr = entry->nr;

	regs = (struct pt_regs *)current->thread.sp0 - 1;
	fp   = (void __user *)regs->bp;

	callchain_store(entry, regs->ip);

	while (entry->nr < MAX_STACK_DEPTH) {
		frame.next_fp	     = NULL;
		frame.return_address = 0;

		if (!copy_stack_frame(fp, &frame))
			break;

		if ((unsigned long)fp < user_stack_pointer(regs))
			break;

		callchain_store(entry, frame.return_address);
		fp = frame.next_fp;
	}

	entry->user = entry->nr - nr;
}

static void
perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	int is_user;

	if (!regs)
		return;

	is_user = user_mode(regs);

	if (!current || current->pid == 0)
		return;

	if (is_user && current->state != TASK_RUNNING)
		return;

	if (!is_user)
		perf_callchain_kernel(regs, entry);

	if (current->mm)
		perf_callchain_user(regs, entry);
}

struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
	struct perf_callchain_entry *entry;

	if (in_nmi())
		entry = &__get_cpu_var(nmi_entry);
	else
		entry = &__get_cpu_var(irq_entry);

	entry->nr = 0;
	entry->hv = 0;
	entry->kernel = 0;
	entry->user = 0;

	perf_do_callchain(regs, entry);

	return entry;
}